Lesson 4

Scatterplots and Perceived Audience Size




# Load the pseudo-Facebook data set from a tab-separated file.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")

# Scatterplot of friend count against age. Written with ggplot() instead of
# qplot(), which ggplot2 has deprecated; with x and y supplied, qplot() drew
# points, so geom_point() produces the same plot.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point()

What are some things that you notice right away?


ggplot Syntax


# Age vs. friend count as plain points, with the x axis limited to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00



# Same axes as before, but the points are jittered and drawn at 1/20 opacity.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5178 rows containing missing values (geom_point).

What do you notice in the plot?




Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

# Friend count vs. age on a square-root y scale.
# position_jitter(height = 0) jitters the points horizontally only, leaving
# the friend_count values themselves untouched.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0)
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  xlab("Age") +
  ylab("Friend Count")
## Warning: Removed 5201 rows containing missing values (geom_point).

What do you notice?

Alpha and Jitter


# Friendships initiated vs. age on a square-root y scale, with semi-transparent
# points jittered horizontally only (height = 0).
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0)
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  xlab("Age") +
  ylab("Friendships Initiated")
## Warning: Removed 5183 rows containing missing values (geom_point).

Overplotting and Domain Knowledge


Conditional Means


## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##     filter, lag
## The following objects are masked from 'package:base':
##     intersect, setdiff, setequal, union
# method 1
# Group the users by age, then compute the mean and median friend count and
# the number of users (n) within each age group.
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(
  age_groups,
  friend_count_mean = mean(friend_count),
  friend_count_median = median(friend_count),
  n = n()
)
# Order the summary rows by increasing age.
pf.fc_by_age <- arrange(pf.fc_by_age, age)

# method 2
# Build the per-age summary as a single pipeline: group by age, compute the
# mean, median and group size, then sort by increasing age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

# Preview the first rows of the summary.
head(pf.fc_by_age)

## Source: local data frame [6 x 4]
##     age friend_count_mean friend_count_median     n
##   (int)             (dbl)               (dbl) (int)
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196

Create your plot!

# Plot mean friend count vs. age using a line graph.
# Be sure you use the correct variable names
# and the correct data frame. You should be working
# with the new data frame created from the dplyr
# functions. The data frame is called 'pf.fc_by_age'.
# Line graph of the mean friend count for each age, from the summary frame.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()

Overlaying Summaries with Raw Data


# Raw data (orange points, jittered horizontally only, sqrt-scaled y axis)
# overlaid with summary lines of friend_count computed at each age:
# mean (default colour), median (solid blue) and the 10th / 90th percentiles
# (dashed blue).
ggplot(pf, aes(x = age, y = friend_count)) +
  xlab("Age") +
  ylab("Friend Count") +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  # `fun` is the current name of the summary-function argument; the older
  # `fun.y` spelling is deprecated in ggplot2.
  geom_line(stat = "summary", fun = mean) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(stat = "summary", fun = median, color = "blue")
## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5185 rows containing missing values (geom_point).

# Same overlay of raw points and per-age summary lines, but viewed through
# coord_cartesian() limited to ages 13-70 and friend counts 0-1000 instead of
# a square-root y scale.
ggplot(pf, aes(x = age, y = friend_count)) +
  xlab("Age") +
  ylab("Friend Count") +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  xlim(13, 90) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  # `fun` is the current name of the summary-function argument; the older
  # `fun.y` spelling is deprecated in ggplot2.
  geom_line(stat = "summary", fun = mean) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(stat = "summary", fun = median, color = "blue")
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5177 rows containing missing values (geom_point).

What are some of your observations of the plot?


Moira: Histogram Summary and Scatterplot

See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.




# rule of thumb:
#   |correlation| > 0.3 is meaningful but small
#   |correlation| > 0.5 is moderate
#   |correlation| > 0.7 is pretty large
# Pearson correlation test between age and friend count.
cor.test(
  x = pf$age,
  y = pf$friend_count,
  method = "pearson"
)
##  Pearson's product-moment correlation
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# equivalent code: evaluate the test inside pf so the columns can be
# referred to by name
with(
  pf,
  cor.test(x = age, y = friend_count, method = "pearson")
)
##  Pearson's product-moment correlation
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response:

Correlation on Subsets


# Correlation test between age and friend count, restricted to users whose
# age is at most 70.
with(
  subset(pf, age <= 70),
  cor.test(x = age, y = friend_count)
)
##  Pearson's product-moment correlation
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

Correlation Methods


Create Scatterplots


# Likes received overall vs. likes received on the web, both axes on a log10
# scale, with semi-transparent points jittered horizontally only.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point(alpha = 0.1, position = position_jitter(height = 0)) +
  scale_x_log10() +
  scale_y_log10()

Strong Correlations


# Likes received overall vs. likes received on the web, with each axis capped
# at that variable's 95th percentile and a red linear-model fit drawn on top.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point(alpha = 0.1, position = position_jitter(height = 0)) +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 24476 rows containing missing values (geom_point).

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

# Correlation test between web likes received and total likes received,
# computed on the full data (no percentile cut-off).
cor.test(
  x = pf$www_likes_received,
  y = pf$likes_received
)
##  Pearson's product-moment correlation
## data:  pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902


Moira on Correlation


More Caution with Correlation


## Loading required package: car

Create your plot!

# Scatterplot of temperature against month for the Mitchell data.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()

Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot.

  2. What is the actual correlation of the two variables? (Round to the thousandths place)

# Correlation test between temperature and month in the Mitchell data.
with(
  Mitchell,
  cor.test(x = Temp, y = Month)
)
##  Pearson's product-moment correlation
## data:  Temp and Month
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Making Sense of Data


# Temperature against month, with x-axis breaks every 12 months so that each
# year of observations is marked off.
# NOTE(review): the original chunk ended in a dangling `+`; the scale layer is
# a reconstruction. The upper bound 203 assumes Month runs 0..203 (the
# correlation test on this data reports df = 202, i.e. 204 rows) - confirm.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))

A New Perspective

What do you notice? Response:

Watch the solution video and check out the Instructor Notes! Notes:

Understanding Noise: Age to Age Months


##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"
##    userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382  14      19     1999        11   male    266            0
## 2 1192601  14       2     1999        11 female      6            0
## 3 2083884  14      16     1999        11   male     13            0
## 4 1203168  14      25     1999        12 female     93            0
## 5 1733186  14       4     1999        12   male     82            0
## 6 1524765  14       1     1999        12   male     15            0
##   friendships_initiated likes likes_received mobile_likes
## 1                     0     0              0            0
## 2                     0     0              0            0
## 3                     0     0              0            0
## 4                     0     0              0            0
## 5                     0     0              0            0
## 6                     0     0              0            0
##   mobile_likes_received www_likes www_likes_received
## 1                     0         0                  0
## 2                     0         0                  0
## 3                     0         0                  0
## 4                     0         0                  0
## 5                     0         0                  0
## 6                     0         0                  0
# Fractional age: completed years plus the fraction of a year elapsed since
# the most recent birthday. A later birth month means fewer months have passed
# since that birthday, so the fraction is (12 - dob_month) / 12 rather than
# dob_month / 12. This gives whole-number ages for December birthdays, which
# matches the example rows (13, 13.25, ...) in the assignment text.
# NOTE(review): assumes ages were recorded at the end of a calendar year (the
# sample rows pair age 14 with dob_year 1999 for Nov/Dec births) - confirm.
pf$age_with_months <- pf$age + (12 - pf$dob_month) / 12

Age with Months Means

Programming Assignment

# Create a new data frame called
# pf.fc_by_age_months that contains
# the mean friend count, the median friend
# count, and the number of users in each
# group of age_with_months. The rows of the
# data frame should be arranged in increasing
# order by the age_with_months variable.

# For example, the first two rows of the resulting
# data frame would look something like...

# age_with_months  friend_count_mean    friend_count_median n
#              13            275.0000                   275 2
#        13.25000            133.2000                   101 11

# Summarise friend counts for each value of age_with_months: mean, median and
# number of users (n), with rows sorted by increasing age_with_months.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
## starting httpd help server ...
##  done

Noise in Conditional Means

# Mean friend count by age measured in months, for ages below 71.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

Smoothing Conditional Means


# Three views of mean friend count for ages below 71.

# p1: one point per year of age, with a smoother drawn over the line.
p1 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: one point per age_with_months value, with a smoother drawn over the line.
p2 <- ggplot(
  subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p3: ages rounded to the nearest multiple of 5, averaging the yearly means
# that fall on each rounded age. `fun` is the current name of the
# summary-function argument; the older `fun.y` spelling is deprecated.
p3 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count_mean)
) +
  geom_line(stat = "summary", fun = mean)

## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##     combine

Which Plot to Choose?


Analyzing Two Variables


Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!