Lesson 4


Scatterplots and Perceived Audience Size

Notes:


Scatterplots

Notes:

library(ggplot2)
# Read the tab-separated data file into `pf`.
pf <- read.csv(file = 'pseudo_facebook.tsv', sep = '\t')

# Quick scatterplot of friend_count against age.
qplot(x = age, y = friend_count, data = pf)


What are some things that you notice right away?

Response:


ggplot Syntax

Notes:

# Same scatterplot built with ggplot(), x axis limited to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Five-number summary (plus mean) of the age column.
with(pf, summary(age))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00

Overplotting

Notes:

# Jittered, semi-transparent points (alpha = 1/20), x axis limited to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5178 rows containing missing values (geom_point).

What do you notice in the plot?

Response:


Coord_trans()

Notes:

Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

# Friend count vs. age with a square-root transformed y axis.
# `height = 0` is spelled out in full (the original `h = 0` relied on
# partial argument matching) so that only horizontal jitter is applied.
ggplot(pf, aes(x = age, y = friend_count)) +
  xlab('Age') +
  ylab('Friend Count') +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
## Warning: Removed 5201 rows containing missing values (geom_point).

What do you notice?


Alpha and Jitter

Notes:

# Friendships initiated vs. age with a square-root transformed y axis.
# `height = 0` is spelled out in full (the original `h = 0` relied on
# partial argument matching) so that only horizontal jitter is applied.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  xlab('Age') +
  ylab('Friendships Initiated') +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
## Warning: Removed 5183 rows containing missing values (geom_point).


Overplotting and Domain Knowledge

Notes:


Conditional Means

Notes:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# method 1: step-by-step function calls, no pipe
age_groups <- group_by(pf, age)
# Per-age mean, median and user count, sorted by increasing age.
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)

# method 2: the same summary written as a single pipeline
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

head(pf.fc_by_age)
## Source: local data frame [6 x 4]
## 
##     age friend_count_mean friend_count_median     n
##   (int)             (dbl)               (dbl) (int)
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196

Create your plot!

# Line graph of mean friend count against age.
# Uses the summarised data frame 'pf.fc_by_age' built with the
# dplyr functions, not the raw 'pf' data, so the variable names
# are those of the summary columns.
ggplot(data = pf.fc_by_age,
       mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()


Overlaying Summaries with Raw Data

Notes:

# Raw points (orange) overlaid with per-age summary lines on a
# square-root transformed y axis:
#   solid black  - mean
#   dashed blue  - 10th and 90th percentiles
#   solid blue   - median
# `fun` replaces the deprecated `fun.y` argument of the summary stat, and
# `height = 0` is spelled out instead of the partially matched `h = 0`.
ggplot(pf, aes(x = age, y = friend_count)) +
  xlab('Age') +
  ylab('Friend Count') +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0),
             color = 'orange') +
  xlim(13, 90) +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun = mean) +
  geom_line(stat = 'summary', fun = quantile, fun.args = list(probs = 0.1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun = quantile, fun.args = list(probs = 0.9),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun = median, color = 'blue')
## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5185 rows containing missing values (geom_point).

# Same overlay as the previous plot, but zoomed with coord_cartesian()
# to ages 13-70 and friend counts 0-1000 instead of transforming y:
#   solid black  - mean
#   dashed blue  - 10th and 90th percentiles
#   solid blue   - median
# `fun` replaces the deprecated `fun.y` argument of the summary stat, and
# `height = 0` is spelled out instead of the partially matched `h = 0`.
ggplot(pf, aes(x = age, y = friend_count)) +
  xlab('Age') +
  ylab('Friend Count') +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0),
             color = 'orange') +
  xlim(13, 90) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  geom_line(stat = 'summary', fun = mean) +
  geom_line(stat = 'summary', fun = quantile, fun.args = list(probs = 0.1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun = quantile, fun.args = list(probs = 0.9),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun = median, color = 'blue')
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5177 rows containing missing values (geom_point).

What are some of your observations of the plot?

Response:


Moira: Histogram Summary and Scatterplot

See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.

Notes:


Correlation

Notes:

# Rule of thumb for reading the correlation coefficient:
#   |correlation| > 0.3  meaningful but small
#   |correlation| > 0.5  moderate
#   |correlation| > 0.7  pretty large
cor.test(x = pf$age, y = pf$friend_count, method = 'pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Equivalent call: with() evaluates cor.test() inside pf, so the
# columns can be named without the `pf$` prefix.
with(
  pf,
  cor.test(age, friend_count, method = 'pearson')
)
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response:


Correlation on Subsets

Notes:

# Correlation of age and friend count restricted to users aged 70 or under.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count)
)
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

Correlation Methods

Notes:


Create Scatterplots

Notes:

# likes_received against www_likes_received with both axes on a log10 scale.
# `height = 0` is spelled out in full (the original `h = 0` relied on
# partial argument matching) so that only horizontal jitter is applied.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = 0.1, position = position_jitter(height = 0)) +
  scale_x_log10() +
  scale_y_log10()


Strong Correlations

Notes:

# likes_received against www_likes_received, with each axis cut off at that
# variable's 95th percentile and a red linear-model fit line on top.
# `height = 0` is spelled out in full (the original `h = 0` relied on
# partial argument matching) so that only horizontal jitter is applied.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha = 0.1, position = position_jitter(height = 0)) +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = 'lm', color = 'red')
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 24476 rows containing missing values (geom_point).

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

# Correlation over the full data (no percentile cut-off applied).
cor.test(x = pf$www_likes_received, y = pf$likes_received)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

Response:


Moira on Correlation

Notes:


More Caution with Correlation

Notes:

library(alr3)
## Loading required package: car
# Load the Mitchell data set into the workspace
# (presumably shipped with the alr3 package attached above - confirm).
data(Mitchell)

Create your plot!

# Scatterplot of Temp against Month.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()


Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot.

  2. What is the actual correlation of the two variables? (Round to the thousandths place)

# Correlation between Temp and Month in the Mitchell data.
with(
  Mitchell,
  cor.test(Temp, Month)
)
## 
##  Pearson's product-moment correlation
## 
## data:  Temp and Month
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Making Sense of Data

Notes:

# Temp against Month with an x-axis break every 12 months.
# Month is numeric (it is passed to cor.test() elsewhere in this lesson),
# so the breaks are set on a continuous scale; a discrete scale does not
# fit a numeric variable.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))


A New Perspective

What do you notice? Response:

Watch the solution video and check out the Instructor Notes! Notes:


Understanding Noise: Age to Age Months

Notes:

# List the column names of pf.
colnames(pf)
##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"
# Show the first six rows of pf.
head(pf, n = 6)
##    userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382  14      19     1999        11   male    266            0
## 2 1192601  14       2     1999        11 female      6            0
## 3 2083884  14      16     1999        11   male     13            0
## 4 1203168  14      25     1999        12 female     93            0
## 5 1733186  14       4     1999        12   male     82            0
## 6 1524765  14       1     1999        12   male     15            0
##   friendships_initiated likes likes_received mobile_likes
## 1                     0     0              0            0
## 2                     0     0              0            0
## 3                     0     0              0            0
## 4                     0     0              0            0
## 5                     0     0              0            0
## 6                     0     0              0            0
##   mobile_likes_received www_likes www_likes_received
## 1                     0         0                  0
## 2                     0         0                  0
## 3                     0         0                  0
## 4                     0         0                  0
## 5                     0         0                  0
## 6                     0         0                  0
# Age measured in fractional years. A later birth month means the user
# is younger within the same age cohort, so the fraction added is the share
# of the year remaining after the birth month: December (12) adds 0 and
# January (1) adds 11/12. The previous `age + dob_month / 12` reversed that
# ordering and could never produce a whole-number value such as 13.
pf$age_with_months <- pf$age + (12 - pf$dob_month) / 12

Age with Months Means

Programming Assignment

# Create a new data frame called
# pf.fc_by_age_months that contains
# the mean friend count, the median friend
# count, and the number of users in each
# group of age_with_months. The rows of the
# data frame should be arranged in increasing
# order by the age_with_months variable.

# For example, the first two rows of the resulting
# data frame would look something like...

# age_with_months  friend_count_mean    friend_count_median n
#              13            275.0000                   275 2
#        13.25000            133.2000                   101 11

# Mean friend count, median friend count and number of users for each
# distinct age_with_months value, sorted by increasing age_with_months.
# The stray interactive `?arrange` help lookup has been dropped: it only
# opened the help browser and played no part in building the data frame.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(friend_count_mean = mean(friend_count),
            friend_count_median = median(friend_count),
            n = n()) %>%
  arrange(age_with_months)

Noise in Conditional Means

# Mean friend count by age_with_months, for values below 71.
ggplot(data = subset(pf.fc_by_age_months, age_with_months < 71),
       mapping = aes(x = age_with_months, y = friend_count_mean)) +
  geom_line()


Smoothing Conditional Means

Notes:

# p1: mean friend count by whole-year age (< 71) with a smoother on top.
p1 <- ggplot(data = subset(pf.fc_by_age, age < 71),
             mapping = aes(x = age, y = friend_count_mean)) +
  geom_line() +
  geom_smooth()

# p2: the same plot using age_with_months (< 71).
p2 <- ggplot(data = subset(pf.fc_by_age_months, age_with_months < 71),
             mapping = aes(x = age_with_months, y = friend_count_mean)) +
  geom_line() +
  geom_smooth()

# p3: ages (< 71) rounded to the nearest multiple of 5, with the mean of
# friend_count_mean drawn for each rounded age.
# `fun` replaces the deprecated `fun.y` argument of the summary stat.
p3 <- ggplot(subset(pf.fc_by_age, age < 71),
             aes(x = round(age / 5) * 5, y = friend_count_mean)) +
  geom_line(stat = 'summary', fun = mean)

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Stack the three plots in a single column, in the order p2, p1, p3.
grid.arrange(p2, p1, p3, ncol = 1)


Which Plot to Choose?

Notes:


Analyzing Two Variables

Reflection:


Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!