Size of the Enron Dataset

In [1]:
import os
import pickle
import sys

# Root of the local ud120-projects checkout. Every path below derives from it,
# so relocating the checkout only requires editing this one line.
PROJECT_ROOT = "C:/Users/Jeff/udacity/Intro_to_Machine_Learning/ud120-projects"

# Make the course helper folders importable.
for subdir in ("tools", "choose_your_own", "datasets_questions"):
    sys.path.append(PROJECT_ROOT + "/" + subdir)

# The relative dataset path used below is resolved from this working directory.
os.chdir(PROJECT_ROOT + "/datasets_questions")

# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
# The "r" mode is kept exactly as before (Python 2).
# NOTE(review): Python 3's pickle needs a binary handle ("rb") -- confirm before porting.
# The `with` block guarantees the file handle is closed once loading finishes.
with open("../final_project/final_project_dataset.pkl", "r") as dataset_file:
    enron_data = pickle.load(dataset_file)

print('Number of people in the Enron dataset: {0}'.format(len(enron_data)))
Number of people in the Enron dataset: 146

Features in the Enron Dataset

In [2]:
# Count the features of the first record (presumably all records share the same keys).
first_record = list(enron_data.values())[0]
print('Number of features for each person in the Enron dataset: {0}'.format(len(first_record)))
Number of features for each person in the Enron dataset: 21

Finding POIs in the Enron Data

In [3]:
# Names of everyone whose record carries a truthy 'poi' flag.
pois = [name for name in enron_data if enron_data[name]['poi']]
print('Number of POI\'s: {0}'.format(len(pois)))
Number of POI's: 18

Query the Dataset 1

In [4]:
# Show the complete feature record stored for one person.
person = 'PRENTICE JAMES'
enron_data[person]
Out[4]:
{'bonus': 'NaN',
 'deferral_payments': 564348,
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'james.prentice@enron.com',
 'exercised_stock_options': 886231,
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 'NaN',
 'poi': False,
 'restricted_stock': 208809,
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 564348,
 'total_stock_value': 1095040}
In [5]:
# Look up the 'total_stock_value' feature of James Prentice.
prentice = enron_data['PRENTICE JAMES']
prentice['total_stock_value']
Out[5]:
1095040

Query the Dataset 2

In [6]:
# Look up the 'from_this_person_to_poi' feature of Wesley Colwell.
colwell = enron_data['COLWELL WESLEY']
colwell['from_this_person_to_poi']
Out[6]:
11

Query the Dataset 3

In [7]:
# Look up the 'exercised_stock_options' feature of Jeffrey K Skilling.
skilling = enron_data['SKILLING JEFFREY K']
skilling['exercised_stock_options']
Out[7]:
19250000

Follow the Money

In [8]:
# Compare 'total_payments' across three people, largest first.
names = ['SKILLING JEFFREY K', 'FASTOW ANDREW S', 'LAY KENNETH L']
names_payments = dict((name, enron_data[name]['total_payments']) for name in names)

# Sort the (name, total_payments) pairs by the payment amount, descending.
ranked = list(names_payments.items())
ranked.sort(key=lambda pair: pair[1], reverse=True)
print(ranked)
[('LAY KENNETH L', 103559793), ('SKILLING JEFFREY K', 8682716), ('FASTOW ANDREW S', 2424083)]

Dealing with Unfilled Features

In [9]:
import pandas as pd

# Build a frame from the dataset; features are addressed as rows below.
df = pd.DataFrame(enron_data)

# Missing values are stored as the literal string 'NaN', so compare against that string.
salary_known = df.loc['salary'] != 'NaN'
email_known = df.loc['email_address'] != 'NaN'
print('Has salary data: {0}'.format(sum(salary_known)))
print('Has email: {0}'.format(sum(email_known)))
Has salary data: 95
Has email: 111

Missing POIs 1

In [10]:
# How many people in the E+F dataset (as it currently exists) have "NaN" for their total payments?
# What percentage of people in the dataset as a whole is this?

# Missing values are stored as the literal string 'NaN'.
isnan = sum(df.loc['total_payments',:]=='NaN')
# Read the column count directly instead of unpacking the shape into `_`:
# in IPython `_` is the last-output variable, and assigning to it clobbers that value.
cols = df.shape[1]
print('total_payments == \'NaN\': {0} people = {1:.2f}%'.format(isnan, 100.*isnan/cols))
total_payments == 'NaN': 21 people = 14.38%

Missing POIs 2

In [11]:
# Same missing-value check, restricted to the columns named in `pois`.
poi_missing = df.loc['total_payments', pois] == 'NaN'
isnan = sum(poi_missing)
share = 100. * isnan / len(pois)
print('POI total_payments == \'NaN\': {0} people = {1:.2f}%'.format(isnan, share))
POI total_payments == 'NaN': 0 people = 0.00%