Data Analysis with R: Problem Set 4

1. Price vs. x
3. Correlations
4. Price vs. Depth
5. Adjustments – Price vs. Depth
7. Correlation – Price and Depth
8. Price vs. Carat
9. Price vs. Volume
10. Correlations on Subsets
11. Adjustments – Price vs. Volume
12. Mean Price by Clarity
13. Bar Charts of Mean Price
14. Gapminder Revisited

1. Price vs. x

library(ggplot2)
data(diamonds)

# Task: create a scatterplot of price vs. x using the ggplot syntax.
ggplot(data = diamonds, mapping = aes(x = x, y = price)) +
  geom_point()

3. Correlations

# Pearson correlation test between price and the x column of diamonds.
# The printed result includes the t statistic, the p-value and a
# 95 percent confidence interval for the correlation.
with(diamonds, cor.test(x, price))

## 
##  Pearson's product-moment correlation
## 
## data:  x and price
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8825835 0.8862594
## sample estimates:
##       cor 
## 0.8844352

# Pearson correlation test between price and the y column of diamonds.
with(diamonds, cor.test(y, price))

## 
##  Pearson's product-moment correlation
## 
## data:  y and price
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8632867 0.8675241
## sample estimates:
##       cor 
## 0.8654209

# Pearson correlation test between price and the z column of diamonds.
with(diamonds, cor.test(z, price))

## 
##  Pearson's product-moment correlation
## 
## data:  z and price
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8590541 0.8634131
## sample estimates:
##       cor 
## 0.8612494

4. Price vs. Depth

# Simple scatter plot with depth on the x-axis and price on the y-axis.
ggplot(data = diamonds, mapping = aes(x = depth, y = price)) +
  geom_point()

5. Adjustments – Price vs. Depth

# Price vs. depth again, with the point transparency set to 1/100
# (alpha = 0.01) so dense regions stand out, and an x-axis tick
# every 2 units from 42 to 80.
ggplot(diamonds, aes(depth, price)) +
  geom_point(alpha = 1 / 100) +
  scale_x_continuous(breaks = seq(from = 42, to = 80, by = 2))

7. Correlation – Price and Depth

# Pearson correlation test between price and depth.
cor.test(diamonds$price, diamonds$depth)

## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$price and diamonds$depth
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.019084756 -0.002208537
## sample estimates:
##        cor 
## -0.0106474

8. Price vs. Carat

# Create a scatterplot of price vs carat
# and omit the top 1% of price and carat
# values.
# Cut-offs: the 99th percentile of price and of carat. Only diamonds
# strictly below both cut-offs are plotted.
price99 <- quantile(diamonds$price, probs = 0.99)
carat99 <- quantile(diamonds$carat, probs = 0.99)
ggplot(
  data = subset(diamonds, carat < carat99 & price < price99),
  mapping = aes(x = carat, y = price)
) +
  geom_point()

9. Price vs. Volume

# Create a scatterplot of price vs. volume (x * y * z).
# This is a very rough approximation for a diamond's volume.

# Create a new variable for volume in the diamonds data frame.
# This will be useful in a later exercise.
# Store the x * y * z product as a new volume column on diamonds,
# then plot price against it.
diamonds$volume <- with(diamonds, x * y * z)
ggplot(data = diamonds, mapping = aes(x = volume, y = price)) +
  geom_point()

10. Correlations on Subsets

# What's the correlation of price and volume?
# Exclude diamonds that have a volume of zero or
# that are >= 800.  
# Correlation test of price and volume, restricted to rows with
# a volume strictly between 0 and 800.
with(
  data = subset(diamonds, volume > 0 & volume < 800),
  expr = cor.test(price, volume)
)

## 
##  Pearson's product-moment correlation
## 
## data:  price and volume
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9222944 0.9247772
## sample estimates:
##       cor 
## 0.9235455

11. Adjustments – Price vs. Volume

# Subset the data to exclude diamonds with a volume
# greater than or equal to 800. Also, exclude diamonds
# with a volume of 0. Adjust the transparency of the
# points and add a linear model to the plot.
# Price vs. volume for rows with 0 < volume < 800, drawn with faint
# points (alpha = 0.05) and overlaid with a red linear-model fit.
ggplot(
  data = subset(diamonds, volume > 0 & volume < 800),
  mapping = aes(x = volume, y = price)
) +
  geom_point(alpha = 0.05) +
  geom_smooth(method = "lm", color = "red")

12. Mean Price by Clarity

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Use the functions in the dplyr package
# to create a new data frame containing
# info on diamonds by clarity.

# Name the data frame diamondsByClarity

# The data frame should contain the following
# variables in this order.

#       (1) mean_price
#       (2) median_price
#       (3) min_price
#       (4) max_price
#       (5) n

# where n is the number of diamonds in each
# level of clarity.

# One row per clarity level: mean, median, minimum and maximum price,
# followed by n, the number of diamonds at that level.
diamondsByClarity <- diamonds %>%
  group_by(clarity) %>%
  summarise(
    mean_price = mean(price),
    median_price = median(price),
    min_price = min(price),
    max_price = max(price),
    n = n()
  )

13. Bar Charts of Mean Price

# Mean price for each clarity level.
diamonds_by_clarity <- diamonds %>% group_by(clarity)
diamonds_mp_by_clarity <- diamonds_by_clarity %>%
  summarise(mean_price = mean(price))

# Mean price for each color level.
diamonds_by_color <- diamonds %>% group_by(color)
diamonds_mp_by_color <- diamonds_by_color %>%
  summarise(mean_price = mean(price))
# We've created summary data frames with the mean price
# by clarity and color. You can run the code in R to
# verify what data is in the variables diamonds_mp_by_clarity
# and diamonds_mp_by_color.

# Your task is to write additional code to create two bar plots
# on one output image using the grid.arrange() function from the package
# gridExtra.

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

# Bar chart of mean price by clarity (p1) and by color (p2); the bar
# heights are the precomputed mean_price values.
p1 <- ggplot(
  data = diamonds_mp_by_clarity,
  mapping = aes(x = clarity, y = mean_price)
) +
  geom_bar(stat = "identity")
p2 <- ggplot(
  data = diamonds_mp_by_color,
  mapping = aes(x = color, y = mean_price)
) +
  geom_bar(stat = "identity")

# Place both plots in a single column on one output image.
grid.arrange(p1, p2, ncol = 1)

14. Gapminder Revisited

# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to continue the investigation you did at the
# end of Problem Set 3 or you can start fresh and choose a different
# data set from Gapminder.

# If you're feeling adventurous or want to try some data munging see if you can
# find a data set or scrape one from the web.

# In your investigation, examine pairs of variables and create 2-5 plots that make
# use of the techniques from Lesson 4.

# You can find a link to the Gapminder website in the Instructor Notes.

# Once you've completed your investigation, create a post in the discussions that includes:
#       1. the variable(s) you investigated, your observations, and any summary statistics
#       2. snippets of code that created the plots
#       3. links to the images of your plots

# Copy and paste all of the code that you used for
# your investigation, and submit it when you are ready.
# ====================================================================
library('xlsx', quietly=TRUE)
library(reshape2)

# Read the first sheet of the sugar-consumption workbook.
sugar <- read.xlsx("indicator_sugar_consumption.xlsx", sheetIndex = 1)

# Drop the last column, then keep only the rows whose first cell is
# not NA (presumably these are empty filler cells in the sheet).
sugar <- sugar[-ncol(sugar)]
sugar <- sugar[!is.na(sugar[, 1]), ]

# The first column holds the country names; replace spaces with
# underscores so the names can later be used as column names.
names(sugar)[1] <- "country"
sugar$country <- gsub(pattern = " ", replacement = "_", x = sugar$country)

# Country labels and the years 1961 through 2004.
countries <- as.character(sugar$country)
years <- 1961:2004

# Transpose so that each row is a year and each column is a country,
# then attach the year as its own column.
sugar2 <- as.data.frame(t(sugar[, -1]))
names(sugar2) <- countries
sugar2$year <- years


# Sugar per person vs. year in the United States.
ggplot(sugar2, aes(x = year, y = United_States)) +
  geom_line() +
  xlab('Year') +
  ylab('Sugar per Person (g per day)') +
  ggtitle('Sugar Consumption in the United States')

# Sugar per person vs. year for all countries.
# Reshape to long format with one row per (year, country) pair, dropping
# missing values; value.name names the measurement column directly.
sugar3 <- melt(
  sugar2,
  id.vars = 'year',
  na.rm = TRUE,
  variable.name = 'country',
  value.name = 'amount'
)

# Orange points: one per country and year, jittered horizontally only
# (height = 0 keeps the plotted amounts exact).
# Black line: yearly mean. Solid blue line: yearly median.
# Dashed blue lines: yearly 10th and 90th percentiles.
#
# The summary function is passed as `fun`. The older `fun.y` spelling
# was deprecated in ggplot2 3.3.0, so this requires ggplot2 >= 3.3.0.
ggplot(sugar3, aes(x = year, y = amount)) +
  geom_point(
    alpha = 1/3,
    position = position_jitter(height = 0),
    color = 'orange'
  ) +
  geom_line(stat = 'summary', fun = mean) +
  geom_line(
    stat = 'summary', fun = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = 'blue'
  ) +
  geom_line(
    stat = 'summary', fun = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = 'blue'
  ) +
  geom_line(stat = 'summary', fun = median, color = 'blue') +
  xlab('Year') +
  ylab('Sugar per Person (g per day)') +
  ggtitle('Sugar Consumption Worldwide from 1961-2004')

Data Analysis with R: Problem Set 4

Jeff Irion

March 25, 2016

1. Price vs. x

3. Correlations

4. Price vs. Depth

5. Adjustments – Price vs. Depth

7. Correlation – Price and Depth

8. Price vs. Carat

9. Price vs. Volume

10. Correlations on Subsets

11. Adjustments – Price vs. Volume

12. Mean Price by Clarity

13. Bar Charts of Mean Price

14. Gapminder Revisited