import os
import pickle
import sys

# Make the course helper packages importable. The locations are absolute,
# machine-specific paths, so this script only runs unmodified on a checkout
# that lives at exactly this place.
for extra_path in (
    "C:/Users/Jeff/udacity/Intro_to_Machine_Learning/ud120-projects/tools/",
    'C:/Users/Jeff/udacity/Intro_to_Machine_Learning/ud120-projects/choose_your_own',
    'C:/Users/Jeff/udacity/Intro_to_Machine_Learning/ud120-projects/datasets_questions',
):
    sys.path.append(extra_path)

# Relative paths used further down (e.g. "../final_project/...") are resolved
# against this working directory.
os.chdir('C:/Users/Jeff/udacity/Intro_to_Machine_Learning/ud120-projects/datasets_questions')
# Load the Enron dataset; the rest of the script indexes it as a dict keyed by
# person name whose values are per-person feature dicts.
# The context manager closes the file handle as soon as unpickling finishes
# (the bare open() call used previously left it open).
# NOTE(review): the file is still opened in text mode ("r"), exactly as before.
# The pickle documentation asks for a binary file object ("rb"), which Python 3
# requires -- confirm the .pkl's line endings on this checkout before switching.
# WARNING: unpickling can execute arbitrary code; only load trusted files.
with open("../final_project/final_project_dataset.pkl", "r") as dataset_file:
    enron_data = pickle.load(dataset_file)
# Headline counts: number of people, number of features per person, and
# number of persons of interest (records whose 'poi' value is truthy).
person_count = len(enron_data)
print('Number of people in the Enron dataset: {0}'.format(person_count))

# The feature count is read from the first record only.
first_record = enron_data.values()[0]
print('Number of features for each person in the Enron dataset: {0}'.format(len(first_record)))

# Names of every person flagged as a POI, in the dict's iteration order.
pois = [person for person in enron_data if enron_data[person]['poi']]
print('Number of POI\'s: {0}'.format(len(pois)))
# Spot checks of single features for individual people.
# These used to be bare expression statements, which only display a value in
# an interactive session; when the file runs as a script the results were
# silently discarded. Print them so the script reports the values itself.
for person, feature in (
    ('PRENTICE JAMES', 'total_stock_value'),
    ('COLWELL WESLEY', 'from_this_person_to_poi'),
    ('SKILLING JEFFREY K', 'exercised_stock_options'),
):
    print('{0} {1}: {2}'.format(person, feature, enron_data[person][feature]))
# Compare 'total_payments' for three named people and print the
# (name, total_payments) pairs from largest to smallest.
names = ['SKILLING JEFFREY K', 'FASTOW ANDREW S', 'LAY KENNETH L']
names_payments = {name: enron_data[name]['total_payments'] for name in names}

# Rank the names by their payment value, then rebuild the pairs in that order.
ranked_names = sorted(names_payments, key=names_payments.get, reverse=True)
print([(ranked, names_payments[ranked]) for ranked in ranked_names])
import pandas as pd

# Tabular view of the dataset. Features are addressed as row labels below
# (e.g. .loc['salary']), with one column per person.
df = pd.DataFrame(enron_data)

# Missing values are stored as the literal string 'NaN', so "has data" means
# "differs from that string". Summing the boolean mask counts the matches.
has_salary = (df.loc['salary'] != 'NaN').sum()
print('Has salary data: {0}'.format(has_salary))

has_email = (df.loc['email_address'] != 'NaN').sum()
print('Has email: {0}'.format(has_email))
# How many people in the E+F dataset (as it currently exists) have "NaN" for their total payments?
# What percentage of people in the dataset as a whole is this?
# Count the people whose 'total_payments' is the placeholder string 'NaN',
# first across the whole dataset and then among the POIs only, and report
# each count together with its share of the respective group.
payments_missing = df.loc['total_payments'] == 'NaN'

# One column per person, so the column count is the number of people.
cols = df.shape[1]
isnan = payments_missing.sum()
print('total_payments == \'NaN\': {0} people = {1:.2f}%'.format(isnan, 100.*isnan/cols))

# Restrict the same mask to the POI columns.
isnan = payments_missing.loc[pois].sum()
print('POI total_payments == \'NaN\': {0} people = {1:.2f}%'.format(isnan, 100.*isnan/len(pois)))