Resource: Intro to pandas data structures
import pandas as pd
data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
'Lions', 'Lions'],
'wins': [11, 8, 10, 15, 11, 6, 10, 4],
'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
football = pd.DataFrame(data)
print football, "\n"
data2 = [data['year'], data['team'], data['wins'], data['losses']]
football2 = pd.DataFrame(data2)
print football2, "\n"
print football2.T
from pandas import DataFrame, Series
def create_dataframe():
'''
Create a pandas dataframe called 'olympic_medal_counts_df' containing
the data from the table of 2014 Sochi winter olympics medal counts.
The columns for this dataframe should be called
'country_name', 'gold', 'silver', and 'bronze'.
There is no need to specify row indexes for this dataframe
(in this case, the rows will automatically be assigned numbered indexes).
'''
countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
'Netherlands', 'Germany', 'Switzerland', 'Belarus',
'Austria', 'France', 'Poland', 'China', 'Korea',
'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']
gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
# your code here
olympic_medal_counts_df = DataFrame({'country_name':countries, 'gold':gold, 'silver':silver, 'bronze':bronze},
columns=['country_name','gold','silver','bronze'])
return olympic_medal_counts_df
# the medal count data frame
df = create_dataframe()
# create a data frame indexed by country
df2 = df.set_index('country_name')
# select the first 4 rows
print df[0:4] # slicing
print ""
print df.iloc[0:4] # .iloc (integer location)
print ""
print df.loc[0:3] # .loc (label location)
print ""
# select the first 4 rows ==> .loc doesn't work using an integer slice
print df2[0:4]
print ""
print df2.iloc[0:4] # .iloc (integer location)
print ""
# select the first 4 rows for the "country_name" and "silver" columns
print df[[0,2]][0:4] # slicing
print ""
print df[["country_name", "silver"]].iloc[0:4] # .iloc (integer location)
print ""
# countries with 10 or more gold medals
print df[['country_name','gold']][df.gold >= 10]
from pandas import DataFrame, Series
import numpy as np
def avg_medal_count():
'''
Compute the average number of bronze medals earned by countries who
earned at least one gold medal.
Save this to a variable named avg_bronze_at_least_one_gold.
HINT-1:
You can retrieve all of the values of a Pandas column from a
data frame, "df", as follows:
df['column_name']
HINT-2:
The numpy.mean function can accept as an argument a single
Pandas column.
For example, numpy.mean(df["col_name"]) would return the
mean of the values located in "col_name" of a dataframe df.
'''
countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
'Netherlands', 'Germany', 'Switzerland', 'Belarus',
'Austria', 'France', 'Poland', 'China', 'Korea',
'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']
gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
olympic_medal_counts = {'country_name':Series(countries),
'gold': Series(gold),
'silver': Series(silver),
'bronze': Series(bronze)}
df = DataFrame(olympic_medal_counts)
# YOUR CODE HERE
avg_bronze_at_least_one_gold = np.mean(df.bronze[df.gold > 0])
return avg_bronze_at_least_one_gold
import numpy as np
from pandas import DataFrame, Series
def avg_medal_count():
'''
Using the dataframe's apply method, create a new Series called
avg_medal_count that indicates the average number of gold, silver,
and bronze medals earned amongst countries who earned at
least one medal of any kind at the 2014 Sochi olympics. Note that
the countries list already only includes countries that have earned
at least one medal. No additional filtering is necessary.
'''
countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
'Netherlands', 'Germany', 'Switzerland', 'Belarus',
'Austria', 'France', 'Poland', 'China', 'Korea',
'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']
gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
olympic_medal_counts = {'country_name':countries,
'gold': Series(gold),
'silver': Series(silver),
'bronze': Series(bronze)}
df = DataFrame(olympic_medal_counts)
# YOUR CODE HERE
avg_medal_count = df[['gold','silver','bronze']].mean()
# using apply: avg_medal_count = df[['gold','silver','bronze']].apply(np.mean)
return avg_medal_count
import numpy as np
from pandas import DataFrame, Series
def numpy_dot():
'''
Imagine a point system in which each country is awarded 4 points for each
gold medal, 2 points for each silver medal, and one point for each
bronze medal.
Using the numpy.dot function, create a new dataframe called
'olympic_points_df' that includes:
a) a column called 'country_name' with the country name
b) a column called 'points' with the total number of points the country
earned at the Sochi olympics.
'''
countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
'Netherlands', 'Germany', 'Switzerland', 'Belarus',
'Austria', 'France', 'Poland', 'China', 'Korea',
'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']
gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
# YOUR CODE HERE
olympic_points_df = DataFrame({'country_name':countries, 'points':np.dot([4,2,1], [gold, silver, bronze])})
return olympic_points_df