Lesson 6

setwd("C:/Users/Ryan/Desktop/MOOCs/Data Analysis with R/lesson6")

Welcome

Notes:

Scatterplot Review

library(ggplot2)
dat = diamonds
ggplot(aes(carat, price), data=dat) +
  geom_point(fill=I('#F79420'),
             color=I('black'),
             shape=21) +
  stat_smooth(method='lm') +
  scale_x_continuous(limits = c(0, quantile(dat$carat, 0.99)))+
  scale_y_continuous(limits = c(0, quantile(dat$price, 0.99)))

## Warning: Removed 926 rows containing non-finite values (stat_smooth).

## Warning: Removed 926 rows containing missing values (geom_point).

## Warning: Removed 4 rows containing missing values (geom_smooth).

  # ylim(0, quantile(dat$price, 0.99))

Price and Carat Relationship

Response:

Frances Gerety

Notes:

A diamonds is

The Rise of Diamonds

Notes:

ggpairs Function

Notes:

# install these if necessary
# install.packages('GGally')
# install.packages('scales')
# install.packages('memisc')
# install.packages('lattice')
# install.packages('MASS')
# install.packages('car')
# install.packages('reshape')
# install.packages('plyr')
# install.packages('ggplot2')

# load the ggplot graphics package and the others
library(ggplot2)
library(GGally)
library(scales)
library(memisc)

## Loading required package: lattice

## Loading required package: MASS

## 
## Attaching package: 'memisc'

## The following object is masked from 'package:scales':
## 
##     percent

## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts

## The following object is masked from 'package:base':
## 
##     as.array

# sample 10,000 diamonds from the data set
set.seed(20022012)
diamond_samp <- diamonds[sample(1:length(diamonds$price), 10000), ]
ggpairs(diamond_samp, 
  lower = list(continuous = wrap("points", shape = I('.'))), 
  upper = list(combo = wrap("box", outlier.shape = I('.'))))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What are some things you notice in the ggpairs output? Response:

The Demand of Diamonds

Notes:

library(gridExtra)
p1 = qplot(x=price, data=dat,
           fill=I('blue')) + 
  ggtitle('Price')
p2 = qplot(x=price, data=dat,
           fill=I('orange')) +
  ggtitle('Price (log10)') +
  scale_x_log10()

grid.arrange(p1,p2,ncol=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Connecting Demand and Price Distributions

Notes:

Scatterplot Transformation

qplot(carat, price, data=dat) +
  scale_y_log10() +
  ggtitle('Price (log10) by Carat')

Create a new function to transform the carat variable

cuberoot_trans = function() trans_new('cuberoot', 
                                      transform = function(x) x^(1/3),
                                      inverse = function(x) x^3)

Use the cuberoot_trans function

ggplot(aes(carat, price), data = diamonds) + 
  geom_point() + 
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')

## Warning: Removed 1683 rows containing missing values (geom_point).

Overplotting Revisited

head(sort(table(dat$carat), decreasing=T))

## 
##  0.3 0.31 1.01  0.7 0.32    1 
## 2604 2249 2242 1981 1840 1558

head(sort(table(dat$price), decreasing=T))

## 
## 605 802 625 828 776 698 
## 132 127 126 125 124 121

ggplot(aes(carat, price), data = diamonds) + 
  geom_point(size=.75, alpha=.5, position='jitter') + 
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')

## Warning: Removed 1691 rows containing missing values (geom_point).

Other Qualitative Factors

Price vs. Carat and Clarity

Alter the code below.

# install and load the RColorBrewer package
# install.packages('RColorBrewer')
library(RColorBrewer)

ggplot(aes(x = carat, y = price), data = diamonds) + 
  geom_point(aes(color=clarity),
             alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
    guide = guide_legend(title = 'Clarity', reverse = T,
    override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
    breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
    breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Clarity')

## Warning: Removed 1693 rows containing missing values (geom_point).

Clarity and Price

Response:

Price vs. Carat and Cut

Alter the code below.

ggplot(aes(x = carat, y = price, color = cut), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Cut', reverse = T,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Cut')

## Warning: Removed 1696 rows containing missing values (geom_point).

Cut and Price

Response:

Price vs. Carat and Color

Alter the code below.

ggplot(aes(x = carat, y = price, color = color), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Color',
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Color')

## Warning: Removed 1688 rows containing missing values (geom_point).

Color and Price

Response:

Linear Models in R

Notes:

Response:

Building the Linear Model

Notes:

m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data = diamonds)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)

## 
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = diamonds)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = diamonds)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = diamonds)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color, 
##     data = diamonds)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color + 
##     clarity, data = diamonds)
## 
## =========================================================================
##                      m1         m2         m3         m4         m5      
## -------------------------------------------------------------------------
##   (Intercept)      2.821***   1.039***   0.874***   0.932***   0.415***  
##                   (0.006)    (0.019)    (0.019)    (0.017)    (0.010)    
##   I(carat^(1/3))   5.558***   8.568***   8.703***   8.438***   9.144***  
##                   (0.007)    (0.032)    (0.031)    (0.028)    (0.016)    
##   carat                      -1.137***  -1.163***  -0.992***  -1.093***  
##                              (0.012)    (0.011)    (0.010)    (0.006)    
##   cut: .L                                0.224***   0.224***   0.120***  
##                                         (0.004)    (0.004)    (0.002)    
##   cut: .Q                               -0.062***  -0.062***  -0.031***  
##                                         (0.004)    (0.003)    (0.002)    
##   cut: .C                                0.051***   0.052***   0.014***  
##                                         (0.003)    (0.003)    (0.002)    
##   cut: ^4                                0.018***   0.018***  -0.002     
##                                         (0.003)    (0.002)    (0.001)    
##   color: .L                                        -0.373***  -0.441***  
##                                                    (0.003)    (0.002)    
##   color: .Q                                        -0.129***  -0.093***  
##                                                    (0.003)    (0.002)    
##   color: .C                                         0.001     -0.013***  
##                                                    (0.003)    (0.002)    
##   color: ^4                                         0.029***   0.012***  
##                                                    (0.003)    (0.002)    
##   color: ^5                                        -0.016***  -0.003*    
##                                                    (0.003)    (0.001)    
##   color: ^6                                        -0.023***   0.001     
##                                                    (0.002)    (0.001)    
##   clarity: .L                                                  0.907***  
##                                                               (0.003)    
##   clarity: .Q                                                 -0.240***  
##                                                               (0.003)    
##   clarity: .C                                                  0.131***  
##                                                               (0.003)    
##   clarity: ^4                                                 -0.063***  
##                                                               (0.002)    
##   clarity: ^5                                                  0.026***  
##                                                               (0.002)    
##   clarity: ^6                                                 -0.002     
##                                                               (0.002)    
##   clarity: ^7                                                  0.032***  
##                                                               (0.001)    
## -------------------------------------------------------------------------
##   R-squared            0.9        0.9        0.9        1.0        1.0   
##   adj. R-squared       0.9        0.9        0.9        1.0        1.0   
##   sigma                0.3        0.3        0.3        0.2        0.1   
##   F               652012.1   387489.4   138654.5    87959.5   173791.1   
##   p                    0.0        0.0        0.0        0.0        0.0   
##   Log-likelihood   -7962.5    -3631.3    -1837.4     4235.2    34091.3   
##   Deviance          4242.8     3613.4     3380.8     2699.2      892.2   
##   AIC              15931.0     7270.6     3690.8    -8442.5   -68140.5   
##   BIC              15957.7     7306.2     3762.0    -8317.9   -67953.7   
##   N                53940      53940      53940      53940      53940     
## =========================================================================

Notice how adding cut to our model does not help explain much of the variance in the price of diamonds. This fits with out exploration earlier.

Model Problems

Video Notes:

Research: (Take some time to come up with 2-4 problems for the model) (You should 10-20 min on this)

Response:

A Bigger, Better Data Set

Notes:

# install.packages('bitops')
# install.packages('RCurl')
library('bitops')
library('RCurl')

# diamondsurl = getBinaryURL("https://raw.github.com/solomonm/diamonds-data/master/BigDiamonds.Rda")
# load(rawConnection(diamondsurl))
big_diamond = read.csv('diamondsbig.csv')

The code used to obtain the data is available here: https://github.com/solomonm/diamonds-data

library(caTools)

set.seed(7)
s1 = subset(big_diamond, cert=='GIA', price<10000)

diamond_samp <- s1[sample(1:length(s1$price), 10000), ]



# sample1 = sample.split(diamond_samp$price, SplitRatio = .8)
# train = subset(diamond_samp, sample1==T & price < 10000 & cert=='GIA')
# test = subset(diamond_samp, sample1==F & price < 10000 )

Building a Model Using the Big Diamonds Data Set

m1 = lm(I(log(price)) ~ I(carat^(1/3)), data = s1)
m2 = update(m1, ~ . + carat)
m3 = update(m2, ~ . + cut)
m4 = update(m3, ~ . + color)
m5 = update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)

## 
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = s1)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = s1)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = s1)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color, 
##     data = s1)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color + 
##     clarity, data = s1)
## 
## =========================================================================
##                      m1         m2         m3         m4         m5      
## -------------------------------------------------------------------------
##   (Intercept)      2.888***   1.194***   0.993***   1.099***   0.462***  
##                   (0.002)    (0.006)    (0.006)    (0.004)    (0.004)    
##   I(carat^(1/3))   5.633***   8.296***   8.322***   8.573***   8.714***  
##                   (0.003)    (0.009)    (0.009)    (0.006)    (0.004)    
##   carat                      -0.831***  -0.849***  -0.827***  -0.851***  
##                              (0.003)    (0.003)    (0.002)    (0.001)    
##   cut: Ideal                             0.249***   0.195***   0.129***  
##                                         (0.002)    (0.001)    (0.001)    
##   cut: V.Good                            0.130***   0.087***   0.065***  
##                                         (0.002)    (0.001)    (0.001)    
##   color: E/D                                       -0.140***  -0.113***  
##                                                    (0.001)    (0.001)    
##   color: F/D                                       -0.202***  -0.163***  
##                                                    (0.001)    (0.001)    
##   color: G/D                                       -0.290***  -0.247***  
##                                                    (0.001)    (0.001)    
##   color: H/D                                       -0.392***  -0.334***  
##                                                    (0.001)    (0.001)    
##   color: I/D                                       -0.540***  -0.483***  
##                                                    (0.002)    (0.001)    
##   color: J/D                                       -0.690***  -0.641***  
##                                                    (0.002)    (0.001)    
##   color: K/D                                       -0.868***  -0.839***  
##                                                    (0.002)    (0.001)    
##   color: L/D                                       -0.993***  -0.962***  
##                                                    (0.003)    (0.002)    
##   clarity: I2                                                 -0.291***  
##                                                               (0.007)    
##   clarity: IF                                                  0.872***  
##                                                               (0.002)    
##   clarity: SI1                                                 0.413***  
##                                                               (0.002)    
##   clarity: SI2                                                 0.279***  
##                                                               (0.002)    
##   clarity: VS1                                                 0.614***  
##                                                               (0.002)    
##   clarity: VS2                                                 0.538***  
##                                                               (0.002)    
##   clarity: VVS1                                                0.748***  
##                                                               (0.002)    
##   clarity: VVS2                                                0.671***  
##                                                               (0.002)    
## -------------------------------------------------------------------------
##   R-squared             0.9        0.9        0.9        1.0        1.0  
##   adj. R-squared        0.9        0.9        0.9        1.0        1.0  
##   sigma                 0.4        0.3        0.3        0.2        0.2  
##   F               5019847.5  3084408.5  1638488.5  1046511.1  1344897.8  
##   p                     0.0        0.0        0.0        0.0        0.0  
##   Log-likelihood  -210283.1  -166233.0  -153162.0   -10015.1   161911.6  
##   Deviance          67236.3    55587.5    52536.3    28310.9    13473.0  
##   AIC              420572.2   332473.9   306336.1    20058.2  -323779.3  
##   BIC              420605.3   332518.1   306402.3    20212.8  -323536.3  
##   N                463066     463066     463066     463066     463066    
## =========================================================================

Predictions

Example Diamond from BlueNile: Round 1.00 Very Good I VS1 $5,601

#Be sure you've loaded the library memisc and have m5 saved as an object in your workspace.
thisDiamond = data.frame(carat = 1.00, cut = "V.Good",
                         color = "I", clarity="VS1")
modelEstimate = predict(m5, newdata = thisDiamond,
                        interval="prediction", level = .95)
exp(modelEstimate)

##        fit      lwr      upr
## 1 5019.162 3592.796 7011.808

Evaluate how well the model predicts the BlueNile diamond’s price. Think about the fitted point estimate as well as the 95% CI.

Final Thoughts

Notes:

Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!