1. Diamonds

library("ggplot2")
data("diamonds", package = "ggplot2")

# Structure dump: number of observations, number of variables, and which
# columns are stored as ordered factors.
utils::str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Color grades, listed in their stored level order.
levels(diamonds[["color"]])
## [1] "D" "E" "F" "G" "H" "I" "J"
# also see:
#?diamonds

2. Price Histogram

# Create a histogram of the price of
# all the diamonds in the diamond data set.

# TYPE YOUR CODE BELOW THE LINE
# =======================================
# Base-graphics histogram of the price column across all diamonds,
# using hist()'s default breaks.
hist(diamonds$price)

3. Price Histogram Summary

# Min, quartiles, median, mean and max of price over the whole data set.
summary(diamonds[["price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18820

4. Diamond Counts

# diamonds under $500
with(diamonds, sum(price < 500))
## [1] 1729
# diamonds under $250
with(diamonds, sum(price < 250))
## [1] 0
# diamonds that cost $15,000 or more
with(diamonds, sum(price >= 15000))
## [1] 1656
## [1] 1656

5. Cheaper Diamonds

# Explore the largest peak in the
# price histogram you created earlier.

# Try limiting the x-axis, altering the bin width,
# and setting different breaks on the x-axis.

# There won't be a solution video for this
# question so go to the discussions to
# share your thoughts and discover
# what other people find.

# You can save images by using the ggsave() command.
# ggsave() will save the last plot created.
# For example...
#                  qplot(x = price, data = diamonds)
#                  ggsave('priceHistogram.png')

# ggsave currently recognises the extensions eps/ps, tex (pictex),
# pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).

# Submit your final code when you are ready.

# TYPE YOUR CODE BELOW THE LINE
# ======================================================================
# Price histogram in $500 bins, restricted to prices from 5250 to 14750.
# NOTE(review): this window drops every price outside 5250-14750 (see the
# warning below); confirm it really covers the "largest peak" the exercise
# asks about, since the first quartile of price is only 950.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 500) +
  scale_x_continuous(limits = c(5250, 14750))
## Warning: Removed 41944 rows containing non-finite values (stat_bin).

6. Price by Cut Histograms

# Break out the histogram of diamond prices by cut.

# You should have five histograms in separate
# panels on your resulting plot.

# TYPE YOUR CODE BELOW THE LINE
# ======================================================
# One price histogram per cut, one facet panel each (default binning).
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram() +
  facet_wrap(~cut)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

7. Price by Cut

# Per-cut price summary, printed with the session's full `digits` setting
# instead of summary()'s shorter default.
by(diamonds$price, diamonds$cut, summary, digits = getOption("digits"))
## diamonds$cut: Fair
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   337.000  2050.250  3282.000  4358.758  5205.500 18574.000 
## -------------------------------------------------------- 
## diamonds$cut: Good
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   327.000  1145.000  3050.500  3928.864  5028.000 18788.000 
## -------------------------------------------------------- 
## diamonds$cut: Very Good
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   336.00   912.00  2648.00  3981.76  5372.75 18818.00 
## -------------------------------------------------------- 
## diamonds$cut: Premium
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   326.000  1046.000  3185.000  4584.258  6296.000 18823.000 
## -------------------------------------------------------- 
## diamonds$cut: Ideal
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   326.000   878.000  1810.000  3457.542  4678.500 18806.000

8. Scales and Multiple Histograms

# Same faceted price histogram, but each panel gets its own x and y ranges
# so the shape of every cut's distribution is visible.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram() +
  facet_wrap(~cut, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

9. Price per Carat by Cut

# Create a histogram of price per carat
# and facet it by cut. You can make adjustments
# to the code from the previous exercise to get
# started.

# Adjust the bin width and transform the scale
# of the x-axis using log10.

# Submit your final code when you are ready.

# ENTER YOUR CODE BELOW THIS LINE.
# ===========================================================================
# Histogram of price per carat (price / carat), faceted by cut, with a log10
# x-axis. The exercise asks for price per carat, so the x aesthetic is the
# ratio rather than raw price. Each facet has free axis ranges.
ggplot(diamonds, aes(x = price / carat)) +
  geom_histogram() +
  facet_wrap(~cut, scales = "free") +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

10. Price Box Plots

# Investigate the price of diamonds using box plots,
# numerical summaries, and one of the following categorical
# variables: cut, clarity, or color.
# Box plot of price grouped by color. Color is used (rather than clarity) so
# the plot matches the per-color numerical summary that follows it.
ggplot(diamonds, aes(x = color, y = price)) +
  geom_boxplot() +
  xlab("Color") +
  ylab("Price")

# Numerical summary (min, quartiles, median, mean, max) of price within
# each color grade.
by(diamonds$price, diamonds$color, summary)
## diamonds$color: D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     357     911    1838    3170    4214   18690 
## -------------------------------------------------------- 
## diamonds$color: E
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     882    1739    3077    4003   18730 
## -------------------------------------------------------- 
## diamonds$color: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     342     982    2344    3725    4868   18790 
## -------------------------------------------------------- 
## diamonds$color: G
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     354     931    2242    3999    6048   18820 
## -------------------------------------------------------- 
## diamonds$color: H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337     984    3460    4487    5980   18800 
## -------------------------------------------------------- 
## diamonds$color: I
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1120    3730    5092    7202   18820 
## -------------------------------------------------------- 
## diamonds$color: J
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     335    1860    4234    5324    7695   18710

11. Interquartile Range (IQR)

# IQR for the best color
with(diamonds, IQR(price[color == "D"]))
## [1] 3302.5
# IQR for the worst color
with(diamonds, IQR(price[color == "J"]))
## [1] 5834.5

12. Price per Carat Box Plots by Color

# Box plot of price per carat (price / carat) grouped by color. The section
# asks for price per carat, so the y aesthetic is the ratio and the axis
# label says so.
ggplot(diamonds, aes(x = color, y = price / carat)) +
  geom_boxplot() +
  xlab("Color") +
  ylab("Price per Carat")

# Numerical summary of raw price (not price per carat) within each color
# grade.
by(diamonds$price, diamonds$color, summary)
## diamonds$color: D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     357     911    1838    3170    4214   18690 
## -------------------------------------------------------- 
## diamonds$color: E
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     882    1739    3077    4003   18730 
## -------------------------------------------------------- 
## diamonds$color: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     342     982    2344    3725    4868   18790 
## -------------------------------------------------------- 
## diamonds$color: G
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     354     931    2242    3999    6048   18820 
## -------------------------------------------------------- 
## diamonds$color: H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337     984    3460    4487    5980   18800 
## -------------------------------------------------------- 
## diamonds$color: I
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1120    3730    5092    7202   18820 
## -------------------------------------------------------- 
## diamonds$color: J
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     335    1860    4234    5324    7695   18710

13. Carat Frequency Polygon

# Frequency polygon of carat weight in 0.1-carat bins. Written with ggplot()
# instead of the deprecated qplot() so it matches the other plots in this
# file. Axis breaks every 0.2 carat (x) and every 2000 diamonds (y).
ggplot(diamonds, aes(x = carat)) +
  geom_freqpoly(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(0, 5, 0.2)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000)) +
  labs(x = "Carat", y = "Frequency")

14. Gapminder Data

# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to download a data set of your choice
# and create 2-5 plots that make use of the techniques from Lesson 3.

# You might use a simple histogram, a boxplot split over a categorical variable,
# or a frequency polygon. The choice is yours!

# You can find a link to the Gapminder website in the Instructor Notes.

# Once you've completed your investigation, create a post in the discussions that includes:
#       1. any questions you answered, your observations, and summary statistics
#       2. snippets of code that created the plots
#       3. links to the images of your plots

# You can save images by using the ggsave() command.
# ggsave() will save the last plot created.
# For example...
#                  qplot(x = price, data = diamonds)
#                  ggsave('priceHistogram.png')

# ggsave currently recognises the extensions eps/ps, tex (pictex),
# pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).

# Copy and paste all of the code that you used for
# your investigation, and submit it when you are ready.
# ====================================================================================
library("xlsx", quietly = TRUE)
library("reshape2")

# First worksheet of the Gapminder sugar-consumption workbook.
sugar <- read.xlsx("indicator_sugar_consumption.xlsx", sheetIndex = 1)

# Drop the last column, then every row whose first column is NA.
sugar[ncol(sugar)] <- NULL
has_country <- !is.na(sugar[[1]])
sugar <- sugar[has_country, ]

# Name the first column `country` and replace spaces with underscores.
names(sugar)[1] <- "country"
sugar$country <- gsub(" ", "_", sugar$country)

# Country names and the 1961-2004 year range, kept for later use.
countries <- as.character(sugar$country)
years <- 1961:2004


# Histogram of the 2004 column (X2004) across countries, default binning.
ggplot(data = sugar, mapping = aes(x = X2004)) +
  geom_histogram() +
  xlab("Sugar per Person (g per day)") +
  ylab("Count") +
  ggtitle("Sugar Consumption per Person by Country in 2004")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 84 rows containing non-finite values (stat_bin).

# Box plot comparing 1970, 1980, 1990 and 2000.
# Reshape those four year columns to long form (one row per country-year),
# dropping missing values, then strip the "X" prefix from the year labels.
sugar2 <- melt(
  sugar,
  id.vars = "country",
  measure.vars = c("X1970", "X1980", "X1990", "X2000"),
  variable.name = "year",
  na.rm = TRUE
)
sugar2$year <- sub("X", "", sugar2$year, fixed = TRUE)
ggplot(data = sugar2, mapping = aes(x = year, y = value)) +
  geom_boxplot() +
  xlab("Year") +
  ylab("Sugar per Person (g per day)") +
  ggtitle("Sugar Consumption Worldwide by Decade")

15. Exploring Your Friends’ Birthdays

# Your task is to investigate the distribution of your friends'
# birth months and days.

# Here are some questions you could answer, and we hope you think of others.

# **********************************************************************

# How many people share your birthday? Do you know them?
# (Reserve time with them or save money to buy them a gift!)

# Which month contains the most number of birthdays?

# How many birthdays are in each month?

# Which day of the year has the most number of birthdays?

# Do you have at least 365 friends that have birthdays on every day
# of the year?

# **********************************************************************

# You will need to do some data munging and additional research to
# complete this task. This task won't be easy, and you may encounter some
# unexpected challenges along the way. We hope you learn a lot from it though.

# You can expect to spend 30 min or more on this task depending on whether you
# use the provided data or obtain your personal data. We also encourage you
# to use the lubridate package for working with dates. Read over the documentation
# in RStudio and search for examples online if you need help.

# You'll need to export your Facebook friends' birthdays to a csv file.
# You may need to create a calendar of your Facebook friends' birthdays
# in a program like Outlook or Gmail and then export the calendar as a
# csv file.

# Once you load the data into R Studio, you can use the strptime() function
# to extract the birth months and birth days. We recommend looking up the
# documentation for the function and finding examples online.

# We've included some links in the Instructor Notes to help get you started.

# Once you've completed your investigation, create a post in the discussions that includes:
#       1. any questions you answered, your observations, and summary statistics
#       2. snippets of code that created the plots
#       3. links to the images of your plots

# You can save images by using the ggsave() command.
# ggsave() will save the last plot created.
# For example...
#                  qplot(x = price, data = diamonds)
#                  ggsave('priceHistogram.png')

# ggsave currently recognises the extensions eps/ps, tex (pictex),
# pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).

# Copy and paste all of the code that you used for
# your investigation below the line. Submit it when you are ready.
# ===============================================================================
library("lubridate")
library("scales")

# Read the birthday export and parse the `dates` column as month/day/2-digit-year.
bdays <- read.csv("birthdaysExample.csv")
bdays$dates <- as.Date(bdays$dates, format = "%m/%d/%y")

# Force every date into leap year 2016 so February 29th birthdays stay valid
# and all birthdays share a single calendar year.
year(bdays$dates) <- 2016

# Daily histogram of birthdays over 2016, with one x-axis tick per month
# labelled by its abbreviated name.
ggplot(data = bdays, mapping = aes(x = dates)) +
  geom_histogram(binwidth = 1, color = "gray", fill = "blue") +
  scale_x_date(
    labels = date_format("%b"),
    breaks = date_breaks("months"),
    limits = c(as.Date("2016-01-01"), as.Date("2016-12-31"))
  ) +
  labs(x = "Birthday", y = "Count", title = "Histogram of Birthdays")

# Bar chart of the number of birthdays in each month.
# The month is computed from the `dates` column inside aes(). The earlier
# `bdays$date` only resolved through `$` partial matching and bypassed the
# plot's data argument.
ggplot(bdays, aes(x = month(dates))) +
  geom_bar() +
  scale_x_continuous(breaks = seq(1, 12), labels = month.abb) +
  xlab("Month") + ylab("Number of Birthdays") + ggtitle("Birthdays by Month")