import sys
import os
import pickle
import numpy
import matplotlib.pyplot as plt
from feature_format import featureFormat, targetFeatureSplit
%matplotlib inline
def Draw(pred, features, poi, mark_poi=True, name="image.png", f1_name="feature 1", f2_name="feature 2"):
    """Scatter-plot the first two features of every point, colored by cluster.

    Args:
        pred: sequence of integer cluster labels, one per point.
        features: sequence indexed in parallel with ``pred``; only elements
            0 and 1 of each entry are plotted (x and y respectively).
        poi: sequence of truthy/falsy flags indexed in parallel with ``pred``;
            only read when ``mark_poi`` is true.
        mark_poi: when true, overlay a red star on every point whose ``poi``
            flag is truthy.
        name: accepted for backward compatibility; this function does not
            use it (nothing is written to disk here).
        f1_name: label for the x axis.
        f2_name: label for the y axis.

    The points are drawn on the current matplotlib figure; the function
    returns nothing.
    """
    ### plot each cluster with a different color; the palette is cycled so
    ### that more than five clusters no longer raise an IndexError
    colors = ["b", "c", "k", "m", "g"]
    for ii, label in enumerate(pred):
        plt.scatter(features[ii][0], features[ii][1],
                    color=colors[label % len(colors)])

    ### if you like, place red stars over points that are POIs (just for funsies)
    if mark_poi:
        for ii, _ in enumerate(pred):
            if poi[ii]:
                plt.scatter(features[ii][0], features[ii][1], color="r", marker="*")

    ### label the axes with the feature names supplied by the caller
    plt.xlabel(f1_name)
    plt.ylabel(f2_name)
### load in the dict of dicts containing all the data on each person in the dataset
### (pickle data must be read in binary mode; the with-block closes the file)
### NOTE: unpickling can execute arbitrary code -- only load a trusted dataset file
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)
### there's an outlier--remove it!
data_dict.pop("TOTAL", 0)
# the features to be used
features_list = ['poi', 'salary', 'exercised_stock_options']
def finance_kmeans(data_dict, features_list):
    """Cluster the dataset into two groups with k-means and plot the result.

    Args:
        data_dict: dataset handed to ``featureFormat`` together with
            ``features_list``.
        features_list: feature names handed to ``featureFormat``; entries 1
            and 2 are also passed to ``Draw`` as the axis names.
            (Presumably entry 0 is the label that ``targetFeatureSplit``
            separates out -- confirm against ``feature_format``.)

    Returns:
        None. The function only draws on the current matplotlib figure.
    """
    from sklearn.cluster import KMeans

    data = featureFormat(data_dict, features_list)
    poi, finance_features = targetFeatureSplit(data)

    # plot the first 2 features
    for f in finance_features:
        plt.scatter(f[0], f[1])

    # k-means clustering: the estimator has to be fitted before it can
    # assign labels, so fit and predict in one step
    clf = KMeans(n_clusters=2)
    pred = clf.fit_predict(finance_features)

    # show the clustering
    Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf",
         f1_name=features_list[1], f2_name=features_list[2])
    return None
# run the clustering and plotting on the loaded dataset with the chosen features
finance_kmeans(data_dict=data_dict, features_list=features_list)