Welch's t-Test

In [1]:
import numpy
import scipy.stats
import pandas

def compare_averages(filename, alpha=0.05):
    """
    Run Welch's t-test on the batting averages of left- and right-handed hitters.

    The csv file is read into a pandas data frame and must contain a
    'handedness' column ('L' for left-handed, 'R' for right-handed) and an
    'avg' column (career batting average). Rows whose handedness is neither
    'L' nor 'R' are not part of either cohort.

    Parameters
    ----------
    filename : str
        Path of the csv file, handed to pandas.read_csv.
    alpha : float, optional
        Threshold the p-value is compared against. The default of 0.05
        corresponds to a 95% confidence level.

    Returns
    -------
    tuple
        (no_difference, (t, p)). (t, p) are the statistic and p-value from
        scipy.stats.ttest_ind with the left-handed cohort as the first sample.
        no_difference is True when p > alpha (the test does not detect a
        difference between the cohorts) and False otherwise.

    For example, the returned tuple may look like:
    (False, (9.93570222, 3.81e-23))
    """
    baseball = pandas.read_csv(filename)

    # Select the 'avg' values of each cohort by handedness.
    handedness = baseball['handedness']
    left_avg = baseball.loc[handedness == 'L', 'avg']
    right_avg = baseball.loc[handedness == 'R', 'avg']

    # equal_var=False makes ttest_ind perform Welch's t-test, which does not
    # assume the two cohorts share a variance.
    t, p = scipy.stats.ttest_ind(left_avg, right_avg, equal_var=False)
    return (p > alpha, (t, p))
    
# Relative path to the batting-statistics csv used for this exercise.
filename = './data/baseball_stats.csv'
# Bare call as the cell's last expression, so the notebook displays the returned tuple.
compare_averages(filename)
Out[1]:
(False, (9.9357022262420944, 3.8102742258887383e-23))

Gradient Descent in Python

In [2]:
def compute_cost(features, values, theta):
    """
    Return the cost of the parameters theta for the given data.

    The prediction is numpy.dot(features, theta); the cost is the sum of the
    squared differences between predictions and values, divided by twice the
    number of values.
    """
    residuals = numpy.dot(features, theta) - values
    squared_error_total = numpy.square(residuals).sum()
    return squared_error_total / (2 * len(values))

def gradient_descent(features, values, theta, alpha, num_iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.

    Parameters
    ----------
    features : array-like with a .transpose() method
        Input data points; combined with theta via numpy.dot(features, theta).
    values : array-like
        Observed outputs, one per data point.
    theta : array-like
        Starting parameters. The argument itself is never modified.
    alpha : float
        Learning rate.
    num_iterations : int
        Number of updates applied to theta.

    Returns
    -------
    (theta, cost_history)
        The updated parameters as a float numpy array, and a pandas Series
        holding the cost computed before each update.
    """
    # Work on a float copy. An in-place `theta += ...` would overwrite the
    # caller's array, extend (rather than add to) a plain list, and fail on
    # an integer-typed array.
    theta = numpy.array(theta, dtype=float)
    m = len(values)
    cost_history = []

    for _ in range(num_iterations):
        # Record the cost of the current theta, then take one step.
        cost_history.append(compute_cost(features, values, theta))
        residuals = values - numpy.dot(features, theta)
        theta = theta + alpha / m * numpy.dot(features.transpose(), residuals)

    return theta, pandas.Series(cost_history) # leave this line for the grader

Calculating R^2

In [ ]:
def compute_r_squared(data, predictions):
    """
    Return the coefficient of determination, R^2, of a model's predictions.

    data and predictions are numpy arrays of observed and predicted values.
    R^2 is computed as one minus the ratio of the squared norm of
    (data - predictions) to the squared norm of (data - mean(data)).
    """
    residual_norm = numpy.linalg.norm(data - predictions)
    total_norm = numpy.linalg.norm(data - numpy.mean(data))
    return 1 - residual_norm**2 / total_norm**2