import pandas as pd
def add_full_name(path_to_csv, path_to_new_csv):
#Assume you will be reading in a csv file with the same columns that the
#Lahman baseball data set has -- most importantly, there are columns
#called 'nameFirst' and 'nameLast'.
#1) Write a function that reads a csv
#located at "path_to_csv" into a pandas dataframe and adds a new column
#called 'nameFull' with a player's full name.
#
#For example:
# for Hank Aaron, nameFull would be 'Hank Aaron',
#
#2) Write the data in the pandas dataFrame to a new csv file located at
#path_to_new_csv
#WRITE YOUR CODE HERE
baseball = pd.read_csv(path_to_csv)
baseball['nameFull'] = baseball['nameFirst'] + ' ' + baseball['nameLast']
baseball.to_csv(path_to_new_csv)
if __name__ == "__main__":
# For local use only
# If you are running this on your own machine add the path to the
# Lahman baseball csv and a path for the new csv.
# The dataset can be downloaded from this website: http://www.seanlahman.com/baseball-archive/statistics
# We are using the file Master.csv
path_to_csv = ""
path_to_new_csv = ""
#add_full_name(path_to_csv, path_to_new_csv)
import pandasql
def select_first_50(filename):
# Read in our aadhaar_data csv to a pandas dataframe. Afterwards, we rename the columns
# by replacing spaces with underscores and setting all characters to lowercase, so the
# column names more closely resemble columns names one might find in a table.
aadhaar_data = pd.read_csv(filename)
aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True)
# Select out the first 50 values for "registrar" and "enrolment_agency"
# in the aadhaar_data table using SQL syntax.
#
# Note that "enrolment_agency" is spelled with one l. Also, the order
# of the select does matter. Make sure you select registrar then enrolment agency
# in your query.
q = """
SELECT registrar, enrolment_agency FROM aadhaar_data LIMIT 50;
"""
#Execute your SQL command against the pandas frame
aadhaar_solution = pandasql.sqldf(q.lower(), locals())
return aadhaar_solution
def aggregate_query(filename):
# Read in our aadhaar_data csv to a pandas dataframe. Afterwards, we rename the columns
# by replacing spaces with underscores and setting all characters to lowercase, so the
# column names more closely resemble columns names one might find in a table.
aadhaar_data = pd.read_csv(filename)
aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True)
# Write a query that will select from the aadhaar_data table how many men and how
# many women over the age of 50 have had aadhaar generated for them in each district.
# aadhaar_generated is a column in the Aadhaar Data that denotes the number who have had
# aadhaar generated in each row of the table.
#
# Note that in this quiz, the SQL query keywords are case sensitive.
# For example, if you want to do a sum make sure you type 'sum' rather than 'SUM'.
#
# The possible columns to select from aadhaar data are:
# 1) registrar
# 2) enrolment_agency
# 3) state
# 4) district
# 5) sub_district
# 6) pin_code
# 7) gender
# 8) age
# 9) aadhaar_generated
# 10) enrolment_rejected
# 11) residents_providing_email,
# 12) residents_providing_mobile_number
q = '''
SELECT gender, district, SUM(aadhaar_generated) FROM aadhaar_data WHERE age > 50 GROUP BY gender, district
'''
# Execute your SQL command against the pandas frame
aadhaar_solution = pandasql.sqldf(q.lower(), locals())
return aadhaar_solution
import json
import requests
import pprint
url = 'http://ws.audioscrobbler.com/2.0/?method=geo.gettopartists&country=spain&api_key=4beab33cc6d65b05800d51f5e83bde1b&format=json'
def api_get_request(url):
# In this exercise, you want to call the last.fm API to get a list of the
# top artists in Spain. The grader will supply the URL as an argument to
# the function; you do not need to construct the address or call this
# function in your grader submission.
#
# Once you've done this, return the name of the number 1 top artist in
# Spain.
data = requests.get(url).text
data = json.loads(data)
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(data['topartists']['artist'][0]['name'])
return data['topartists']['artist'][0]['name']
api_get_request(url)
import pandas as pd
import numpy as np
filename = './data/Master.csv'
def imputation(filename):
# Pandas dataframes have a method called 'fillna(value)', such that you can
# pass in a single value to replace any NAs in a dataframe or series. You
# can call it like this:
# dataframe['column'] = dataframe['column'].fillna(value)
#
# Using the numpy.mean function, which calculates the mean of a numpy
# array, impute any missing values in our Lahman baseball
# data sets 'weight' column by setting them equal to the average weight.
#
# You can access the 'weight' colum in the baseball data frame by
# calling baseball['weight']
baseball = pd.read_csv(filename)
baseball['weight'].fillna(baseball['weight'].mean(), inplace=True)
return baseball