Notes:
Notes:
# Attach the plotting packages used by the rest of this lesson.
library("ggplot2")
library("gridExtra")

# NOTE(review): this absolute, user-specific directory only resolves on the
# original author's machine -- confirm or adjust before running elsewhere.
setwd("C:/Users/Jeff/udacity/Data_Analysis_with_R")

# Load the tab-separated data set into `pf`.
# read.delim() has the same defaults as read.csv() except that sep is "\t".
pf <- read.delim("pseudo_facebook.tsv")
Notes:
library(ggplot2)
# Histogram of `dob_day` using one-unit bins, with an x-axis tick at every
# integer from 1 to 31.
ggplot(pf, aes(dob_day)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31)
Response:
Notes:
Notes:
Response:
Response:
Notes:
Notes:
# Histogram of `dob_day` (one-unit bins, ticks at 1..31), drawn as one panel
# per `dob_month` value and laid out three panels per row.
ggplot(pf, aes(dob_day)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)
Response:
Notes:
Notes:

#### Which case do you think applies to Moira’s outlier?

Response:
Notes:
# Histogram of `friend_count` in 30 bins with the x scale limited to
# [0, 1000]. xlim() is shorthand for scale_x_continuous(limits = ...).
ggplot(pf, aes(friend_count)) +
  geom_histogram(bins = 30) +
  xlim(0, 1000)
Response:
Notes:
Notes:
Notes:
# What code would you add to facet the histogram by gender?
# Add it to the code below.
#
# Histogram of `friend_count` (bin width 10, x scale limited to [0, 1000],
# ticks every 50) with one panel per `gender` value, including missing ones.
# Built with ggplot() + geom_histogram() instead of the deprecated qplot()
# shortcut.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
Notes:
# Histogram of `friend_count` faceted by `gender`, after dropping the rows
# whose `gender` is NA so that no NA panel is drawn.
# Built with ggplot() + geom_histogram() instead of the deprecated qplot()
# shortcut.
ggplot(subset(pf, !is.na(gender)), aes(x = friend_count)) +
  geom_histogram(binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
Notes:
# Count the rows for each `gender` value (table() omits NA by default).
table(pf[["gender"]])
##
## female male
## 40254 58574
# Apply summary() to `friend_count` separately for each `gender` value.
# The printed group header is derived from the `pf$gender` expression itself.
by(pf$friend_count, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Response:
Response:
Response:
Notes:
# Histogram of `tenure` in bins of width 30, with black bar outlines and
# fill colour #099DD9. Built with ggplot() + geom_histogram() instead of the
# deprecated qplot() shortcut.
ggplot(pf, aes(x = tenure)) +
  geom_histogram(binwidth = 30, color = "black", fill = "#099DD9")
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# Histogram of `tenure / 365` (presumably days converted to years -- confirm
# the unit of `tenure`) in bins of width 0.25, x scale limited to [0, 7] with
# ticks at 1..7. Built with ggplot() instead of the deprecated qplot().
ggplot(pf, aes(x = tenure / 365)) +
  geom_histogram(binwidth = 0.25, color = "black", fill = "#F79240") +
  scale_x_continuous(breaks = seq(1, 7), limits = c(0, 7))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
Notes:
# Histogram of `tenure / 365` in bins of width 0.25 with explicit axis
# labels; x scale limited to [0, 7] with ticks at 1..7.
# Built with ggplot() instead of the deprecated qplot(), which also keeps
# each line within a readable width.
ggplot(pf, aes(x = tenure / 365)) +
  geom_histogram(binwidth = 0.25, color = "black", fill = "#F79240") +
  xlab("Number of years using Facebook") +
  ylab("Number of users in sample") +
  scale_x_continuous(breaks = seq(1, 7), limits = c(0, 7))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
Notes:
# Histogram of `age` in one-unit bins. The x axis is ticked only at the
# smallest and largest `age` in the data (range() returns c(min, max)).
# Built with ggplot() instead of the deprecated qplot().
ggplot(pf, aes(x = age)) +
  geom_histogram(binwidth = 1, color = "black", fill = "#5760AB") +
  xlab("Age") +
  ylab("Frequency") +
  scale_x_continuous(breaks = range(pf$age))
Response:
Notes:
Notes:
Notes:
Notes:
library(gridExtra)
# Three histograms of `friend_count`, stacked in one column, that differ only
# in the x scale: untransformed, log10 and square root.
#
# p1 holds the shared specification; p2 and p3 add a transformed x scale and
# replace the title. No binwidth is given, so stat_bin() falls back to its
# default of 30 bins for each plot.
p1 <- ggplot(pf, aes(x = friend_count)) +
  geom_histogram(color = "black", fill = "#5760AB") +
  xlab("Friend Count") +
  ylab("Frequency") +
  ggtitle("Friend Count Histogram")
p2 <- p1 + scale_x_log10() + ggtitle("Friend Count Histogram (log10)")
p3 <- p1 + scale_x_sqrt() + ggtitle("Friend Count Histogram (sqrt)")

grid.arrange(p1, p2, p3, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Frequency polygon of `friend_count` (bin width 10), one coloured line per
# `gender`, for rows whose `gender` is not NA. The y value is each bin's
# count divided by the sum of the counts, i.e. a proportion rather than a
# raw count. The x scale is limited to [0, 1000] with ticks every 50.
#
# Built with ggplot() instead of the deprecated qplot(). The `..count..`
# notation is kept so the code still runs on older ggplot2 releases.
ggplot(
  subset(pf, !is.na(gender)),
  aes(x = friend_count, y = ..count.. / sum(..count..), color = gender)
) +
  geom_freqpoly(binwidth = 10) +
  ylab("Proportion of Users with Friend Count") +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
Notes:
# Frequency polygon of `www_likes` on a log10 x scale, one coloured line per
# `gender`, for rows whose `gender` is not NA. No binwidth is given, so
# stat_bin() uses its default of 30 bins.
#
# Built with ggplot() instead of the deprecated qplot(); the stray empty
# argument and the commented-out scale from the qplot() call are gone.
ggplot(subset(pf, !is.na(gender)), aes(x = www_likes, color = gender)) +
  geom_freqpoly() +
  xlab("Likes on the WWW") +
  ylab("Frequency") +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 60935 rows containing non-finite values (stat_bin).
# Total `www_likes` for each `gender` value.
by(pf$www_likes, pf$gender, sum)
## pf$gender: female
## [1] 3507665
## --------------------------------------------------------
## pf$gender: male
## [1] 1430175
Notes:
# Box plot of `friend_count` for each `gender`, rows with NA `gender` removed.
# Built with ggplot() + geom_boxplot() instead of the deprecated qplot().
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot()
# Box plot of `friend_count` for each `gender` (NA `gender` removed), zoomed
# to y in [0, 1000]. coord_cartesian() only changes the visible window, so
# the box statistics are still computed from all rows.
# Built with ggplot() + geom_boxplot() instead of the deprecated qplot().
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000))
Notes:
Response:

#### Write about some ways that you can verify your answer.

Response:
# Box plot of `friendships_initiated` for each `gender` (NA `gender`
# removed), zoomed to y in [0, 250] without discarding any rows.
# Built with ggplot() + geom_boxplot() instead of the deprecated qplot().
ggplot(
  subset(pf, !is.na(gender)),
  aes(x = gender, y = friendships_initiated)
) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))
# Apply summary() to `friendships_initiated` separately for each `gender`.
by(pf$friendships_initiated, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 49.0 113.9 124.8 3654.0
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 15.0 44.0 103.1 111.0 4144.0
Response:
Notes:
# Add a two-level factor column `mobile_check_in`: "1" when `mobile_likes`
# is greater than zero, "0" otherwise. The column is assigned once; no
# placeholder initialisation is needed because the assignment creates it.
pf$mobile_check_in <- factor(ifelse(pf$mobile_likes > 0, 1, 0))

# Number of rows in each level.
summary(pf$mobile_check_in)
## 0 1
## 35056 63947
# Show the factor levels, then print the percentage of rows whose
# `mobile_check_in` level is "1".
levels(pf[["mobile_check_in"]])
## [1] "0" "1"
print(sum(pf[["mobile_check_in"]] == "1") / nrow(pf) * 100)
## [1] 64.59097
Response:
Reflection:
Click **Knit HTML** to see all of your hard work and to get an HTML page containing this lesson, your answers, and your notes!