1. Diamonds
library(ggplot2)
data(diamonds)
# Structure overview: number of observations and variables, plus each
# column's type (ordered factors are reported as "Ord.factor").
str(object = diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Levels of the ordered color factor, in factor order.
levels(diamonds[["color"]])
## [1] "D" "E" "F" "G" "H" "I" "J"
# also see:
#?diamonds
2. Price Histogram
# Create a histogram of the price of
# all the diamonds in the diamond data set.
# TYPE YOUR CODE BELOW THE LINE
# =======================================
# Base-graphics histogram of price over every row of diamonds.
hist(x = diamonds$price)
3. Price Histogram Summary
# Min, quartiles, median, mean and max of price.
with(diamonds, summary(price))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18820
4. Diamond Counts
# Number of diamonds priced below $500
with(diamonds, sum(price < 500))
## [1] 1729
# Number of diamonds priced below $250
with(diamonds, sum(price < 250))
## [1] 0
# Number of diamonds priced at $15,000 or above
with(diamonds, sum(price >= 15000))
## [1] 1656
5. Cheaper Diamonds
# Explore the largest peak in the
# price histogram you created earlier.
# Try limiting the x-axis, altering the bin width,
# and setting different breaks on the x-axis.
# There won't be a solution video for this
# question so go to the discussions to
# share your thoughts and discover
# what other people find.
# You can save images by using the ggsave() command.
# ggsave() will save the last plot created.
# For example...
# qplot(x = price, data = diamonds)
# ggsave('priceHistogram.png')
# ggsave currently recognises the extensions eps/ps, tex (pictex),
# pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).
# Submit your final code when you are ready.
# TYPE YOUR CODE BELOW THE LINE
# ======================================================================
# Zoom in on the largest peak of the price histogram. The price summary shows
# half of all diamonds cost less than about $2,400, so the peak sits at the
# low end of the range rather than in the $5,250-$14,750 band.
# coord_cartesian() zooms the view without discarding rows, so stat_bin emits
# no "removed rows" warning; a narrow binwidth and explicit breaks make the
# shape of the peak readable.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 1500, 100)) +
  coord_cartesian(xlim = c(0, 1500))
6. Price by Cut Histograms
# Break out the histogram of diamond prices by cut.
# You should have five histograms in separate
# panels on your resulting plot.
# TYPE YOUR CODE BELOW THE LINE
# ======================================================
# One price histogram per cut, drawn as separate facet panels.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram() +
  facet_wrap(facets = ~cut)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
7. Price by Cut
# Per-cut price summary, printed using the session's "digits" option.
by(
  data = diamonds$price,
  INDICES = diamonds$cut,
  FUN = summary,
  digits = max(getOption("digits"))
)
## diamonds$cut: Fair
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337.000 2050.250 3282.000 4358.758 5205.500 18574.000
## --------------------------------------------------------
## diamonds$cut: Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 327.000 1145.000 3050.500 3928.864 5028.000 18788.000
## --------------------------------------------------------
## diamonds$cut: Very Good
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 336.00 912.00 2648.00 3981.76 5372.75 18818.00
## --------------------------------------------------------
## diamonds$cut: Premium
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326.000 1046.000 3185.000 4584.258 6296.000 18823.000
## --------------------------------------------------------
## diamonds$cut: Ideal
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326.000 878.000 1810.000 3457.542 4678.500 18806.000
8. Scales and Multiple Histograms
# Price histograms per cut, with each panel given its own axis scales.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram() +
  facet_wrap(facets = ~cut, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
9. Price per Carat by Cut
# Create a histogram of price per carat
# and facet it by cut. You can make adjustments
# to the code from the previous exercise to get
# started.
# Adjust the bin width and transform the scale
# of the x-axis using log10.
# Submit your final code when you are ready.
# ENTER YOUR CODE BELOW THIS LINE.
# ===========================================================================
# Histogram of price per carat (price / carat, not raw price), one panel per
# cut, on a log10 x-axis.
# binwidth is given explicitly, in log10 units, instead of relying on
# stat_bin's default of 30 bins.
ggplot(diamonds, aes(x = price / carat)) +
  geom_histogram(binwidth = 0.05) +
  facet_wrap(~cut, scales = "free") +
  scale_x_log10() +
  xlab("Price per Carat (log10 scale)")
10. Price Box Plots
# Investigate the price of diamonds using box plots,
# numerical summaries, and one of the following categorical
# variables: cut, clarity, or color.
# Box plot and numerical summary of price, both grouped by color so that the
# plot and the printed statistics describe the same categorical variable.
ggplot(diamonds, aes(x = color, y = price)) +
  geom_boxplot() +
  xlab("Color") +
  ylab("Price")
by(diamonds$price, diamonds$color, summary)
## diamonds$color: D
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 357 911 1838 3170 4214 18690
## --------------------------------------------------------
## diamonds$color: E
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 882 1739 3077 4003 18730
## --------------------------------------------------------
## diamonds$color: F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 342 982 2344 3725 4868 18790
## --------------------------------------------------------
## diamonds$color: G
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 354 931 2242 3999 6048 18820
## --------------------------------------------------------
## diamonds$color: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 984 3460 4487 5980 18800
## --------------------------------------------------------
## diamonds$color: I
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1120 3730 5092 7202 18820
## --------------------------------------------------------
## diamonds$color: J
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 335 1860 4234 5324 7695 18710
11. Interquartile Range (IQR)
# Interquartile range of price for color D (the best color)
with(diamonds, IQR(price[color == "D"]))
## [1] 3302.5
# Interquartile range of price for color J (the worst color)
with(diamonds, IQR(price[color == "J"]))
## [1] 5834.5
12. Price per Carat Box Plots by Color
# Box plots of price per carat (price / carat) for each color.
ggplot(diamonds, aes(x = color, y = price / carat)) +
  geom_boxplot() +
  xlab("Color") +
  ylab("Price per Carat")
# Numerical summary of raw price by color, for comparison with the
# per-carat plot.
by(diamonds$price, diamonds$color, summary)
## diamonds$color: D
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 357 911 1838 3170 4214 18690
## --------------------------------------------------------
## diamonds$color: E
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 882 1739 3077 4003 18730
## --------------------------------------------------------
## diamonds$color: F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 342 982 2344 3725 4868 18790
## --------------------------------------------------------
## diamonds$color: G
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 354 931 2242 3999 6048 18820
## --------------------------------------------------------
## diamonds$color: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 984 3460 4487 5980 18800
## --------------------------------------------------------
## diamonds$color: I
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1120 3730 5092 7202 18820
## --------------------------------------------------------
## diamonds$color: J
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 335 1860 4234 5324 7695 18710
13. Carat Frequency Polygon
# Frequency polygon of carat weight using 0.1-carat bins.
# Written with ggplot() + geom_freqpoly() rather than the deprecated qplot()
# wrapper; it builds the same layers and matches the other plots in this file.
ggplot(diamonds, aes(x = carat)) +
  geom_freqpoly(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(0, 5, 0.2)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000)) +
  xlab("Carat") +
  ylab("Frequency")
14. Gapminder Data
# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to download a data set of your choice
# and create 2-5 plots that make use of the techniques from Lesson 3.
# You might use a simple histogram, a boxplot split over a categorical variable,
# or a frequency polygon. The choice is yours!
# You can find a link to the Gapminder website in the Instructor Notes.
# Once you've completed your investigation, create a post in the discussions that includes:
# 1. any questions you answered, your observations, and summary statistics
# 2. snippets of code that created the plots
# 3. links to the images of your plots
# You can save images by using the ggsave() command.
# ggsave() will save the last plot created.
# For example...
# qplot(x = price, data = diamonds)
# ggsave('priceHistogram.png')
# ggsave currently recognises the extensions eps/ps, tex (pictex),
# pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).
# Copy and paste all of the code that you used for
# your investigation, and submit it when you are ready.
# ====================================================================================
library('xlsx', quietly=TRUE)
library(reshape2)
# Read the first worksheet of the Gapminder sugar-consumption workbook.
sugar <- read.xlsx("indicator_sugar_consumption.xlsx", sheetIndex = 1)
# Discard the trailing (NA) column, name the first column, and drop rows
# that have no country.
sugar <- sugar[-ncol(sugar)]
names(sugar)[1] <- "country"
sugar <- sugar[!is.na(sugar$country), ]
# Replace spaces in country names with underscores.
sugar$country <- gsub(" ", "_", sugar$country)
# Country names and the years 1961 through 2004 as standalone vectors.
countries <- as.character(sugar$country)
years <- 1961:2004
# Histogram of per-person sugar consumption across countries for 2004.
ggplot(data = sugar, mapping = aes(x = X2004)) +
  geom_histogram() +
  labs(
    x = "Sugar per Person (g per day)",
    y = "Count",
    title = "Sugar Consumption per Person by Country in 2004"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 84 rows containing non-finite values (stat_bin).
# Box plots comparing sugar consumption in 1970, 1980, 1990 and 2000.
# Reshape the four year columns to long form (missing values dropped), then
# strip the "X" prefix so the axis shows plain years.
sugar2 <- melt(
  sugar,
  id.vars = "country",
  measure.vars = c("X1970", "X1980", "X1990", "X2000"),
  na.rm = TRUE,
  variable.name = "year"
)
sugar2$year <- gsub("X", "", sugar2$year)
ggplot(data = sugar2, mapping = aes(x = year, y = value)) +
  geom_boxplot() +
  labs(
    x = "Year",
    y = "Sugar per Person (g per day)",
    title = "Sugar Consumption Worldwide by Decade"
  )
15. Exploring Your Friends’ Birthdays
# Your task is to investigate the distribution of your friends'
# birth months and days.
# Here are some questions you could answer, and we hope you think of others.
# **********************************************************************
# How many people share your birthday? Do you know them?
# (Reserve time with them or save money to buy them a gift!)
# Which month contains the largest number of birthdays?
# How many birthdays are in each month?
# Which day of the year has the largest number of birthdays?
# Do you have at least 365 friends that have birthdays on every day
# of the year?
# **********************************************************************
# You will need to do some data munging and additional research to
# complete this task. This task won't be easy, and you may encounter some
# unexpected challenges along the way. We hope you learn a lot from it though.
# You can expect to spend 30 min or more on this task depending on whether you
# use the provided data or obtain your personal data. We also encourage you
# to use the lubridate package for working with dates. Read over the documentation
# in RStudio and search for examples online if you need help.
# You'll need to export your Facebook friends' birthdays to a csv file.
# You may need to create a calendar of your Facebook friends' birthdays
# in a program like Outlook or Gmail and then export the calendar as a
# csv file.
# Once you load the data into R Studio, you can use the strptime() function
# to extract the birth months and birth days. We recommend looking up the
# documentation for the function and finding examples online.
# We've included some links in the Instructor Notes to help get you started.
# Once you've completed your investigation, create a post in the discussions that includes:
# 1. any questions you answered, your observations, and summary statistics
# 2. snippets of code that created the plots
# 3. links to the images of your plots
# You can save images by using the ggsave() command.
# ggsave() will save the last plot created.
# For example...
# qplot(x = price, data = diamonds)
# ggsave('priceHistogram.png')
# ggsave currently recognises the extensions eps/ps, tex (pictex),
# pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).
# Copy and paste all of the code that you used for
# your investigation below the line. Submit it when you are ready.
# ===============================================================================
library(lubridate)
library(scales)
# Load the example birthdays and parse the month/day/two-digit-year strings.
bdays <- read.csv("birthdaysExample.csv")
bdays$dates <- as.Date(bdays$dates, format = "%m/%d/%y")
# Move every birthday into 2016, a leap year, so that February 29th
# birthdays remain valid dates.
year(bdays$dates) <- 2016
# Histogram of birthdays with one-day bins over calendar year 2016; the
# x-axis has one break per month, labelled with the abbreviated month name.
ggplot(data = bdays, mapping = aes(x = dates)) +
  geom_histogram(binwidth = 1, color = "gray", fill = "blue") +
  scale_x_date(
    labels = date_format("%b"),
    breaks = date_breaks("months"),
    limits = as.Date(c("2016-01-01", "2016-12-31"))
  ) +
  labs(x = "Birthday", y = "Count", title = "Histogram of Birthdays")
# Bar chart of the number of birthdays in each month.
# The aesthetic refers to the `dates` column directly: aes() evaluates inside
# the plot data, so no `bdays$` prefix is needed, and the exact column name
# avoids depending on `$` partial matching.
ggplot(bdays, aes(x = month(dates))) +
  geom_bar() +
  scale_x_continuous(breaks = seq(1, 12), labels = month.abb) +
  xlab("Month") +
  ylab("Number of Birthdays") +
  ggtitle("Birthdays by Month")