04 01 Preparing data - HannaAA17/Data-Scientist-With-Python-datacamp GitHub Wiki
pd.read_csv()
- _excel(); _html(); _json()
# Import pandas
import pandas as pd
# Create the list of file names: filenames
filenames = ['Gold.csv', 'Silver.csv', 'Bronze.csv']
# Create the list of three DataFrames: dataframes
dataframes = []
for filename in filenames:
dataframes.append(pd.read_csv(filename))
# Print top 5 rows of 1st DataFrame in dataframes
print(dataframes[0].head())
two more ways
# Import pandas
import pandas as pd
# Make a copy of gold: medals
medals = gold.copy()
# Create list of new column labels: new_labels
new_labels = ['NOC', 'Country', 'Gold']
# Rename the columns of medals using new_labels
medals.columns = new_labels
# Add columns 'Silver' & 'Bronze' to medals
medals['Silver'] = silver['Total']
medals['Bronze'] = bronze['Total']
# Print the head of medals
print(medals.head())
# Import pandas
import pandas as pd
# Read 'monthly_max_temp.csv' into a DataFrame: weather1
weather1 = pd.read_csv('monthly_max_temp.csv',index_col='Month')
# Print the head of weather1
print(weather1.head())
# Sort the index of weather1 in alphabetical order: weather2
weather2 = weather1.sort_index()
# Print the head of weather2
print(weather2.head())
# Sort the index of weather1 in reverse alphabetical order: weather3
weather3 = weather1.sort_index(ascending=False)
# Print the head of weather3
print(weather3.head())
# Sort weather1 numerically using the values of 'Max TemperatureF': weather4
weather4 = weather1.sort_values('Max TemperatureF')
# Print the head of weather4
print(weather4.head())
# Import pandas
import pandas as pd
# Reindex weather1 using the list year: weather2
weather2 = weather1.reindex(year)
# Print weather2
print(weather2)
# Reindex weather1 using the list year with forward-fill: weather3
weather3 = weather1.reindex(year).ffill()
# Print weather3
print(weather3)
# Import pandas
import pandas as pd
# Reindex names_1981 with index of names_1881: common_names
common_names = names_1981.reindex(names_1881.index)
# Print shape of common_names
print(common_names.shape)
# Drop rows with null counts: common_names
common_names = common_names.dropna()
# Print shape of new common_names
print(common_names.shape)
# Extract selected columns from weather as new DataFrame: temps_f
temps_f = weather[['Min TemperatureF','Mean TemperatureF','Max TemperatureF']]
# Convert temps_f to celsius: temps_c
temps_c = (temps_f - 32) * 5/9
# Rename 'F' in column names with 'C': temps_c.columns
temps_c.columns = temps_c.columns.str.replace('F','C')
# Print first 5 rows of temps_c
print(temps_c.head())
import pandas as pd
# Read 'GDP.csv' into a DataFrame: gdp
gdp = pd.read_csv('GDP.csv',parse_dates=True,index_col='DATE')
# Slice all the gdp data from 2008 onward: post2008
post2008 = gdp.loc['2008-01-01':]
# Print the last 8 rows of post2008
print(post2008.tail(8))
# Resample post2008 by year, keeping last(): yearly
yearly = post2008.resample('A').last()
# Print yearly
print(yearly)
# Compute percentage growth of yearly: yearly['growth']
yearly['growth'] = yearly.pct_change()*100
# Print yearly again
print(yearly)
<script.py> output:
VALUE
DATE
2014-07-01 17569.4
2014-10-01 17692.2
2015-01-01 17783.6
2015-04-01 17998.3
2015-07-01 18141.9
2015-10-01 18222.8
2016-01-01 18281.6
2016-04-01 18436.5
VALUE
DATE
2008-12-31 14549.9
2009-12-31 14566.5
2010-12-31 15230.2
2011-12-31 15785.3
2012-12-31 16297.3
2013-12-31 16999.9
2014-12-31 17692.2
2015-12-31 18222.8
2016-12-31 18436.5
VALUE growth
DATE
2008-12-31 14549.9 NaN
2009-12-31 14566.5 0.114090
2010-12-31 15230.2 4.556345
2011-12-31 15785.3 3.644732
2012-12-31 16297.3 3.243524
2013-12-31 16999.9 4.311144
2014-12-31 17692.2 4.072377
2015-12-31 18222.8 2.999062
2016-12-31 18436.5 1.172707
Converting curryency of stocks
# Import pandas
import pandas as pd
# Read 'sp500.csv' into a DataFrame: sp500
sp500 = pd.read_csv('sp500.csv', parse_dates=True, index_col='Date')
# Read 'exchange.csv' into a DataFrame: exchange
exchange =pd.read_csv('exchange.csv', parse_dates=True, index_col='Date')
# Subset 'Open' & 'Close' columns from sp500: dollars
dollars = sp500[['Open','Close']]
# Print the head of dollars
print(dollars.head())
# Convert dollars to pounds: pounds
pounds = dollars.multiply(exchange['GBP/USD'],axis='rows')
# Print the head of pounds
print(pounds.head())
In [3]: dollars.head()
Out[3]:
Open Close
Date
2015-01-02 2058.899902 2058.199951
2015-01-05 2054.439941 2020.579956
2015-01-06 2022.150024 2002.609985
2015-01-07 2005.550049 2025.900024
2015-01-08 2030.609985 2062.139893
In [4]: exchange.head()
Out[4]:
GBP/USD
Date
2015-01-02 0.65101
2015-01-05 0.65644
2015-01-06 0.65896
2015-01-07 0.66344
2015-01-08 0.66151
In [5]: pounds.head()
Out[5]:
Open Close
Date
2015-01-02 1340.364425 1339.908750
2015-01-05 1348.616555 1326.389506
2015-01-06 1332.515980 1319.639876
2015-01-07 1330.562125 1344.063112
2015-01-08 1343.268811 1364.126161