Numpy¶

Resource: Intro to pandas data structures

import pandas as pd

data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
    'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
    'Lions', 'Lions'],
    'wins': [11, 8, 10, 15, 11, 6, 10, 4],
    'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
football = pd.DataFrame(data)
print football, "\n"

data2 = [data['year'], data['team'], data['wins'], data['losses']]
football2 = pd.DataFrame(data2)
print football2, "\n"
print football2.T

   losses     team  wins  year
0       5    Bears    11  2010
1       8    Bears     8  2011
2       6    Bears    10  2012
3       1  Packers    15  2011
4       5  Packers    11  2012
5      10    Lions     6  2010
6       6    Lions    10  2011
7      12    Lions     4  2012 

       0      1      2        3        4      5      6      7
0   2010   2011   2012     2011     2012   2010   2011   2012
1  Bears  Bears  Bears  Packers  Packers  Lions  Lions  Lions
2     11      8     10       15       11      6     10      4
3      5      8      6        1        5     10      6     12 

      0        1   2   3
0  2010    Bears  11   5
1  2011    Bears   8   8
2  2012    Bears  10   6
3  2011  Packers  15   1
4  2012  Packers  11   5
5  2010    Lions   6  10
6  2011    Lions  10   6
7  2012    Lions   4  12

Dataframe -- Olympic Medal Counts¶

from pandas import DataFrame, Series

def create_dataframe():
    '''
    Create a pandas dataframe called 'olympic_medal_counts_df' containing
    the data from the table of 2014 Sochi winter olympics medal counts.  

    The columns for this dataframe should be called 
    'country_name', 'gold', 'silver', and 'bronze'.  

    There is no need to  specify row indexes for this dataframe 
    (in this case, the rows will automatically be assigned numbered indexes).
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    # your code here
    olympic_medal_counts_df = DataFrame({'country_name':countries, 'gold':gold, 'silver':silver, 'bronze':bronze},
        columns=['country_name','gold','silver','bronze'])

    return olympic_medal_counts_df

Selecting from a DataFrame¶

# the medal count data frame
df = create_dataframe()

# create a data frame indexed by country
df2 = df.set_index('country_name')

# select the first 4 rows
print df[0:4] # slicing
print ""
print df.iloc[0:4] # .iloc (integer location)
print ""
print df.loc[0:3] # .loc (label location)
print ""

# select the first 4 rows ==> .loc doesn't work using an integer slice
print df2[0:4]
print ""
print df2.iloc[0:4] # .iloc (integer location)
print ""

# select the first 4 rows for the "country_name" and "silver" columns
print df[[0,2]][0:4] # slicing
print ""
print df[["country_name", "silver"]].iloc[0:4] # .iloc (integer location)
print ""

# countries with 10 or more gold medals
print df[['country_name','gold']][df.gold >= 10]

    country_name  gold  silver  bronze
0   Russian Fed.    13      11       9
1         Norway    11       5      10
2         Canada    10      10       5
3  United States     9       7      12

    country_name  gold  silver  bronze
0   Russian Fed.    13      11       9
1         Norway    11       5      10
2         Canada    10      10       5
3  United States     9       7      12

    country_name  gold  silver  bronze
0   Russian Fed.    13      11       9
1         Norway    11       5      10
2         Canada    10      10       5
3  United States     9       7      12

               gold  silver  bronze
country_name                       
Russian Fed.     13      11       9
Norway           11       5      10
Canada           10      10       5
United States     9       7      12

               gold  silver  bronze
country_name                       
Russian Fed.     13      11       9
Norway           11       5      10
Canada           10      10       5
United States     9       7      12

    country_name  silver
0   Russian Fed.      11
1         Norway       5
2         Canada      10
3  United States       7

    country_name  silver
0   Russian Fed.      11
1         Norway       5
2         Canada      10
3  United States       7

   country_name  gold
0  Russian Fed.    13
1        Norway    11
2        Canada    10

Average Bronze Medals¶

from pandas import DataFrame, Series
import numpy as np


def avg_medal_count():
    '''
    Compute the average number of bronze medals earned by countries who 
    earned at least one gold medal.  
    
    Save this to a variable named avg_bronze_at_least_one_gold. 
    
    HINT-1:
    You can retrieve all of the values of a Pandas column from a 
    data frame, "df", as follows:
    df['column_name']
    
    HINT-2:
    The numpy.mean function can accept as an argument a single
    Pandas column. 
    
    For example, numpy.mean(df["col_name"]) would return the 
    mean of the values located in "col_name" of a dataframe df.
    '''


    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
    
    olympic_medal_counts = {'country_name':Series(countries),
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}
    df = DataFrame(olympic_medal_counts)
    
    # YOUR CODE HERE
    avg_bronze_at_least_one_gold = np.mean(df.bronze[df.gold > 0])

    return avg_bronze_at_least_one_gold

Average Gold, Silver, and Bronze Medals¶

import numpy as np
from pandas import DataFrame, Series


def avg_medal_count():
    '''
    Using the dataframe's apply method, create a new Series called 
    avg_medal_count that indicates the average number of gold, silver,
    and bronze medals earned amongst countries who earned at 
    least one medal of any kind at the 2014 Sochi olympics.  Note that
    the countries list already only includes countries that have earned
    at least one medal. No additional filtering is necessary.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
    
    olympic_medal_counts = {'country_name':countries,
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}    
    df = DataFrame(olympic_medal_counts)
    
    # YOUR CODE HERE
    avg_medal_count = df[['gold','silver','bronze']].mean()
    # using apply: avg_medal_count = df[['gold','silver','bronze']].apply(np.mean)
    
    return avg_medal_count

Olympic Medal Points¶

import numpy as np
from pandas import DataFrame, Series

def numpy_dot():
    '''
    Imagine a point system in which each country is awarded 4 points for each
    gold medal,  2 points for each silver medal, and one point for each 
    bronze medal.  

    Using the numpy.dot function, create a new dataframe called 
    'olympic_points_df' that includes:
        a) a column called 'country_name' with the country name
        b) a column called 'points' with the total number of points the country
           earned at the Sochi olympics.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
 
    # YOUR CODE HERE
    olympic_points_df = DataFrame({'country_name':countries, 'points':np.dot([4,2,1], [gold, silver, bronze])})
    
    return olympic_points_df