Notes:
Notes:
Notes:
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the tab-separated pseudo-Facebook user data.
pf <- read.csv(file = 'pseudo_facebook.tsv', sep = '\t')

# Age distribution per gender (rows with a missing gender removed); each
# group's mean is overlaid as a point drawn with shape 4.
# NOTE(review): newer ggplot2 releases reportedly deprecate `fun.y` in favour
# of `fun` -- confirm against the installed version before changing it.
pf %>%
  subset(!is.na(gender)) %>%
  ggplot(aes(x = gender, y = age)) +
  geom_boxplot() +
  stat_summary(geom = 'point', fun.y = mean, shape = 4)

# Histogram of friend counts for the same subset.
pf %>%
  subset(!is.na(gender)) %>%
  ggplot(aes(x = friend_count)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Median friend count at each age, one coloured line per gender.
pf %>%
  subset(!is.na(gender)) %>%
  ggplot(aes(x = age, y = friend_count)) +
  geom_line(aes(color = gender), stat = 'summary', fun.y = median)
# Write code to create a new data frame,
# called 'pf.fc_by_age_gender', that contains
# information on each age AND gender group.
# The data frame should contain the following variables:
# mean_friend_count,
# median_friend_count,
# n (the number of users in each age and gender grouping)
# Here is an example of the structure of your data frame. Your
# data values will be different. Note that if you are grouping by
# more than one variable, you will probably need to call the
# ungroup() function.
# age gender mean_friend_count median_friend_count n
# 1 13 female 247.2953 150 207
# 2 13 male 184.2342 61 265
# 3 14 female 329.1938 245 834
# 4 14 male 157.1204 88 1201
# See the Instructor Note for two hints.
# ENTER YOUR CODE BELOW THIS LINE.
# ==============================================================
# Friend-count statistics for every age/gender combination.
# Rows with a missing gender are dropped before grouping. summarise() removes
# only the innermost grouping variable (gender), so the explicit ungroup() is
# needed to return a plain, ungrouped data frame instead of one that is still
# grouped by age.
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(mean_friend_count = mean(friend_count),
            median_friend_count = median(friend_count),
            n = n()) %>%
  ungroup()
Notes:
# Create a line graph showing the
# median friend count over the ages
# for each gender. Be sure to use
# the data frame you just created,
# pf.fc_by_age_gender.
# See the Instructor Notes for a hint.
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# =================================================
# One median-friend-count line per gender, drawn from the pre-aggregated table.
ggplot(data = pf.fc_by_age_gender,
       mapping = aes(x = age, y = median_friend_count)) +
  geom_line(mapping = aes(color = gender))
Notes:
Notes:
Notes:
library(reshape2)
library(tidyr)
# Reshape to one row per age with one median-friend-count column per gender
# (reshape2 version).
pf.fc_by_age_gender.wide <- dcast(
  data = pf.fc_by_age_gender,
  formula = age ~ gender,
  value.var = 'median_friend_count'
)

# The same reshape done with dplyr and tidyr; this result overwrites the one
# above. Note that `ratio` is male / female.
pf.fc_by_age_gender.wide <-
  pf.fc_by_age_gender[c('age', 'gender', 'median_friend_count')] %>%
  subset(!is.na(gender)) %>%
  spread(key = gender, value = median_friend_count) %>%
  mutate(ratio = male / female)

head(pf.fc_by_age_gender.wide)
## Source: local data frame [6 x 4]
## Groups: age [6]
##
## age female male ratio
## (int) (dbl) (dbl) (dbl)
## 1 13 148.0 55.0 0.3716216
## 2 14 224.0 92.5 0.4129464
## 3 15 276.0 106.5 0.3858696
## 4 16 258.5 136.0 0.5261122
## 5 17 245.5 125.0 0.5091650
## 6 18 243.0 122.0 0.5020576
Notes:
# Plot the ratio of the female to male median
# friend counts using the data frame
# pf.fc_by_age_gender.wide.
# Think about what geom you should use.
# Add a horizontal line to the plot with
# a y intercept of 1, which will be the
# base line. Look up the documentation
# for geom_hline to do that. Use the parameter
# linetype in geom_hline to make the
# line dashed.
# The linetype parameter can take the values 0-6:
# 0 = blank, 1 = solid, 2 = dashed
# 3 = dotted, 4 = dotdash, 5 = longdash
# 6 = twodash
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# =================================================
# Female-to-male ratio of median friend counts by age, with a dashed
# (linetype 2), semi-transparent reference line at y = 1.
ggplot(data = pf.fc_by_age_gender.wide,
       mapping = aes(x = age, y = female / male)) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = 2, alpha = 0.3) +
  labs(x = 'Age',
       y = 'Median Female Friend Count / Median Male Friend Count')
Notes:
# Create a variable called year_joined
# in the pf data frame using the variable
# tenure and 2014 as the reference year.
# The variable year_joined should contain the year
# that a user joined facebook.
# See the Instructor Notes for three hints if you get
# stuck. Scroll down slowly to see one hint at a time
# if you would like some guidance.
# This programming exercise WILL BE automatically graded.
# ENTER YOUR CODE BELOW THIS LINE.
# ========================================================
# Year each user joined: 2014 minus the tenure in days expressed as whole
# years (tenure / 365, rounded up).
pf[['year_joined']] <- 2014 - ceiling(pf[['tenure']] / 365)
Notes:
# Five-number summary (plus mean and NA count) of the join year.
summary(pf[['year_joined']])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2005 2012 2012 2012 2013 2014 2
# Number of users per join year.
table(pf[['year_joined']])
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
# Create a new variable in the data frame
# called year_joined.bucket by using
# the cut function on the variable year_joined.
# You need to create the following buckets for the
# new variable, year_joined.bucket
# (2004, 2009]
# (2009, 2011]
# (2011, 2012]
# (2012, 2014]
# Note that a parenthesis means exclude the year and a
# bracket means include the year.
# Look up the documentation for cut or try the link
# in the Instructor Notes to accomplish this task.
# ENTER YOUR CODE BELOW THIS LINE
# ========================================================================
# Bin the join year into four right-closed intervals:
# (2004,2009], (2009,2011], (2011,2012], (2012,2014].
pf[['year_joined.bucket']] <- cut(
  x = pf[['year_joined']],
  breaks = c(2004, 2009, 2011, 2012, 2014),
  right = TRUE
)
Notes:
# Users per join-year bucket; an NA column is shown only if NAs are present.
table(pf[['year_joined.bucket']], useNA = 'ifany')
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] <NA>
## 6669 15308 33366 43658 2
# Median friend count by age, one line per join-year bucket (rows with a
# missing bucket removed).
pf %>%
  subset(!is.na(year_joined.bucket)) %>%
  ggplot(aes(x = age, y = friend_count)) +
  geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median)
Notes:
# Write code to do the following:
# (1) Add another geom_line to code below
# to plot the grand mean of the friend count vs age.
# (2) Exclude any users whose year_joined.bucket is NA.
# (3) Use a different line type for the grand mean.
# As a reminder, the parameter linetype can take the values 0-6:
# 0 = blank, 1 = solid, 2 = dashed
# 3 = dotted, 4 = dotdash, 5 = longdash
# 6 = twodash
# This assignment is not graded and
# will be marked as correct when you submit.
# The code from the last programming exercise should
# be your starter code!
# ENTER YOUR CODE BELOW THIS LINE
# ==================================================================
# Mean friend count by age for each join-year bucket, plus the grand mean
# over all buckets as a dashed (linetype 2) line. Rows with a missing bucket
# are excluded from both layers.
pf %>%
  subset(!is.na(year_joined.bucket)) %>%
  ggplot(aes(x = age, y = friend_count)) +
  geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = mean, linetype = 2)
Notes:
# Friend rate = friend_count per day of tenure, for users with positive tenure.
pf %>%
  subset(tenure > 0) %>%
  with(friend_count / tenure) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0775 0.2205 0.6096 0.5658 417.0000
What is the median friend rate?
What is the maximum friend rate?
Notes:
# Create a line graph of mean of friendships_initiated per day (of tenure)
# vs. tenure colored by year_joined.bucket.
# You need to make use of the variables tenure,
# friendships_initiated, and year_joined.bucket.
# You also need to subset the data to only consider users with at least
# one day of tenure.
# This assignment is not graded and
# will be marked as correct when you submit.
# ENTER YOUR CODE BELOW THIS LINE
# ========================================================================
# Mean friendships initiated per day of tenure, plotted against tenure with
# one line per join-year bucket (users with positive tenure only).
pf %>%
  subset(tenure > 0) %>%
  ggplot(aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean)
Notes:
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Four versions of the same plot (mean friendships initiated per day of
# tenure, coloured by join-year bucket) with progressively coarser x binning.

# p1: raw tenure in days.
p1 <- pf %>%
  subset(tenure >= 1) %>%
  ggplot(aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean)

# p2: tenure rounded to the nearest multiple of 7.
p2 <- pf %>%
  subset(tenure > 0) %>%
  ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun.y = mean)

# p3: tenure rounded to the nearest multiple of 30.
p3 <- pf %>%
  subset(tenure > 0) %>%
  ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun.y = mean)

# p4: tenure rounded to the nearest multiple of 90.
p4 <- pf %>%
  subset(tenure > 0) %>%
  ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun.y = mean)

# Stack the four plots in a single column.
grid.arrange(p1, p2, p3, p4, ncol = 1)
# Instead of geom_line(), use geom_smooth() to add a smoother to the plot.
# You can use the defaults for geom_smooth() but do color the line
# by year_joined.bucket
# ALTER THE CODE BELOW THIS LINE
# ==============================================================================
# Smoothed friendships initiated per day against tenure rounded to the
# nearest multiple of 7, one smoother per join-year bucket (geom_smooth
# defaults).
pf %>%
  subset(tenure > 0) %>%
  ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)) +
  geom_smooth(aes(color = year_joined.bucket))
Notes:
Notes:
Notes:
# Load the yogurt purchase data and inspect its structure.
yo <- read.csv(file = 'yogurt.csv')
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
# Convert id from an integer to a factor, then re-check the structure.
yo[['id']] <- factor(yo[['id']])
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
# Quick histogram of yogurt prices.
qplot(x = price, data = yo)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes:
# Create a new variable called all.purchases,
# which gives the total counts of yogurt for
# each observation or household.
# One way to do this is using the transform
# function. You can look up the function transform
# and run the examples of code at the bottom of the
# documentation to figure out what it does.
# The transform function produces a data frame
# so if you use it then save the result to 'yo'!
# OR you can figure out another way to create the
# variable.
# ENTER YOUR CODE BELOW THIS LINE
# ========================================================
# Total number of yogurts per observation: the sum of the five flavour counts.
yo <- transform(
  yo,
  all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry
)
# alternate method: compute the same column with with() and assign it directly
yo[['all.purchases']] <- with(
  yo,
  strawberry + blueberry + pina.colada + plain + mixed.berry
)
Notes:
# Create a scatterplot of price vs time.
# This will be an example of a time series plot.
# Resolve overplotting issues by using
# techniques you learned in Lesson 4.
# What are some things that you notice?
# ENTER YOUR CODE BELOW THIS LINE
# ================================================
# Price over time; jitter and a low alpha are used to reduce overplotting.
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_point(position = position_jitter(), color = 'blue', alpha = 0.1)
Notes:
# Draw 16 household ids at random (seeded for reproducibility) and plot each
# household's price over time in its own panel; point size shows how many
# yogurts were bought in that observation.
set.seed(4230)
sample.ids <- sample(x = levels(yo[['id']]), size = 16)
yo %>%
  subset(id %in% sample.ids) %>%
  ggplot(aes(x = time, y = price)) +
  geom_line() +
  geom_point(aes(size = all.purchases), pch = 1) +
  facet_wrap(~id)

# Same plot for a different random sample of 16 households.
set.seed(1)
sample.ids <- sample(x = levels(yo[['id']]), size = 16)
yo %>%
  subset(id %in% sample.ids) %>%
  ggplot(aes(x = time, y = price)) +
  geom_line() +
  geom_point(aes(size = all.purchases), pch = 1) +
  facet_wrap(~id)
Notes:
Notes:
Notes:
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
# Use the minimal theme with base size 20 for all subsequent plots.
theme_set(theme_minimal(base_size = 20))
# Seed the RNG so the row sample below is reproducible.
set.seed(1836)
# Keep columns 2 through 15 of pf (listed below).
pf_subset <- pf[, 2:15]
names(pf_subset)
## [1] "age" "dob_day"
## [3] "dob_year" "dob_month"
## [5] "gender" "tenure"
## [7] "friend_count" "friendships_initiated"
## [9] "likes" "likes_received"
## [11] "mobile_likes" "mobile_likes_received"
## [13] "www_likes" "www_likes_received"
# Scatterplot matrix on a random sample of 1000 rows.
ggpairs(pf_subset[sample.int(n = nrow(pf_subset), size = 1000), ])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes:
Notes:
# Read the nci table and label its 64 columns 1..64.
nci <- read.table(file = "nci.tsv")
colnames(nci) <- 1:64
# Melt the first 200 rows into long format: one row per (gene, case) pair.
nci.long.samp <- nci[1:200, ] %>%
  as.matrix() %>%
  melt()
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
## gene case value
## 1 1 1 0.300
## 2 2 1 1.180
## 3 3 1 0.550
## 4 4 1 1.140
## 5 5 1 -0.265
## 6 6 1 -0.070
# Heat map of value by case (x) and gene (y) on a 100-step blue-to-red scale.
ggplot(data = nci.long.samp,
       mapping = aes(x = case, y = gene, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
Reflection:
Click Knit HTML to see all of your hard work and to get an HTML page of this lesson, your answers, and your notes!