setwd("C:/Users/Ryan/Desktop/MOOCs/Data Analysis with R/lesson6")
Notes:
library(ggplot2)
dat = diamonds
ggplot(aes(carat, price), data=dat) +
geom_point(fill=I('#F79420'),
color=I('black'),
shape=21) +
stat_smooth(method='lm') +
scale_x_continuous(limits = c(0, quantile(dat$carat, 0.99)))+
scale_y_continuous(limits = c(0, quantile(dat$price, 0.99)))
## Warning: Removed 926 rows containing non-finite values (stat_smooth).
## Warning: Removed 926 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_smooth).
# ylim(0, quantile(dat$price, 0.99))
Response:
Notes:
Notes:
Notes:
# install these if necessary
# install.packages('GGally')
# install.packages('scales')
# install.packages('memisc')
# install.packages('lattice')
# install.packages('MASS')
# install.packages('car')
# install.packages('reshape')
# install.packages('plyr')
# install.packages('ggplot2')
# load the ggplot graphics package and the others
library(ggplot2)
library(GGally)
library(scales)
library(memisc)
## Loading required package: lattice
## Loading required package: MASS
##
## Attaching package: 'memisc'
## The following object is masked from 'package:scales':
##
## percent
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
##
## as.array
# sample 10,000 diamonds from the data set
set.seed(20022012)
diamond_samp <- diamonds[sample(1:length(diamonds$price), 10000), ]
ggpairs(diamond_samp,
lower = list(continuous = wrap("points", shape = I('.'))),
upper = list(combo = wrap("box", outlier.shape = I('.'))))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
What are some things you notice in the ggpairs output? Response:
Notes:
library(gridExtra)
p1 = qplot(x=price, data=dat,
fill=I('blue')) +
ggtitle('Price')
p2 = qplot(x=price, data=dat,
fill=I('orange')) +
ggtitle('Price (log10)') +
scale_x_log10()
grid.arrange(p1,p2,ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes:
qplot(carat, price, data=dat) +
scale_y_log10() +
ggtitle('Price (log10) by Carat')
cuberoot_trans = function() trans_new('cuberoot',
transform = function(x) x^(1/3),
inverse = function(x) x^3)
ggplot(aes(carat, price), data = diamonds) +
geom_point() +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat')
## Warning: Removed 1683 rows containing missing values (geom_point).
head(sort(table(dat$carat), decreasing=T))
##
## 0.3 0.31 1.01 0.7 0.32 1
## 2604 2249 2242 1981 1840 1558
head(sort(table(dat$price), decreasing=T))
##
## 605 802 625 828 776 698
## 132 127 126 125 124 121
ggplot(aes(carat, price), data = diamonds) +
geom_point(size=.75, alpha=.5, position='jitter') +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat')
## Warning: Removed 1691 rows containing missing values (geom_point).
Alter the code below.
# install and load the RColorBrewer package
# install.packages('RColorBrewer')
library(RColorBrewer)
ggplot(aes(x = carat, y = price), data = diamonds) +
geom_point(aes(color=clarity),
alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Clarity', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Clarity')
## Warning: Removed 1693 rows containing missing values (geom_point).
Response:
Alter the code below.
ggplot(aes(x = carat, y = price, color = cut), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Cut', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Cut')
## Warning: Removed 1696 rows containing missing values (geom_point).
Response:
Alter the code below.
ggplot(aes(x = carat, y = price, color = color), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Color',
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Color')
## Warning: Removed 1688 rows containing missing values (geom_point).
Response:
Notes:
Response:
Notes:
m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data = diamonds)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)
##
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = diamonds)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = diamonds)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = diamonds)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color,
## data = diamonds)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color +
## clarity, data = diamonds)
##
## =========================================================================
## m1 m2 m3 m4 m5
## -------------------------------------------------------------------------
## (Intercept) 2.821*** 1.039*** 0.874*** 0.932*** 0.415***
## (0.006) (0.019) (0.019) (0.017) (0.010)
## I(carat^(1/3)) 5.558*** 8.568*** 8.703*** 8.438*** 9.144***
## (0.007) (0.032) (0.031) (0.028) (0.016)
## carat -1.137*** -1.163*** -0.992*** -1.093***
## (0.012) (0.011) (0.010) (0.006)
## cut: .L 0.224*** 0.224*** 0.120***
## (0.004) (0.004) (0.002)
## cut: .Q -0.062*** -0.062*** -0.031***
## (0.004) (0.003) (0.002)
## cut: .C 0.051*** 0.052*** 0.014***
## (0.003) (0.003) (0.002)
## cut: ^4 0.018*** 0.018*** -0.002
## (0.003) (0.002) (0.001)
## color: .L -0.373*** -0.441***
## (0.003) (0.002)
## color: .Q -0.129*** -0.093***
## (0.003) (0.002)
## color: .C 0.001 -0.013***
## (0.003) (0.002)
## color: ^4 0.029*** 0.012***
## (0.003) (0.002)
## color: ^5 -0.016*** -0.003*
## (0.003) (0.001)
## color: ^6 -0.023*** 0.001
## (0.002) (0.001)
## clarity: .L 0.907***
## (0.003)
## clarity: .Q -0.240***
## (0.003)
## clarity: .C 0.131***
## (0.003)
## clarity: ^4 -0.063***
## (0.002)
## clarity: ^5 0.026***
## (0.002)
## clarity: ^6 -0.002
## (0.002)
## clarity: ^7 0.032***
## (0.001)
## -------------------------------------------------------------------------
## R-squared 0.9 0.9 0.9 1.0 1.0
## adj. R-squared 0.9 0.9 0.9 1.0 1.0
## sigma 0.3 0.3 0.3 0.2 0.1
## F 652012.1 387489.4 138654.5 87959.5 173791.1
## p 0.0 0.0 0.0 0.0 0.0
## Log-likelihood -7962.5 -3631.3 -1837.4 4235.2 34091.3
## Deviance 4242.8 3613.4 3380.8 2699.2 892.2
## AIC 15931.0 7270.6 3690.8 -8442.5 -68140.5
## BIC 15957.7 7306.2 3762.0 -8317.9 -67953.7
## N 53940 53940 53940 53940 53940
## =========================================================================
Notice how adding cut to our model does not help explain much of the variance in the price of diamonds. This fits with out exploration earlier.
Video Notes:
Research: (Take some time to come up with 2-4 problems for the model) (You should 10-20 min on this)
Response:
Notes:
# install.packages('bitops')
# install.packages('RCurl')
library('bitops')
library('RCurl')
# diamondsurl = getBinaryURL("https://raw.github.com/solomonm/diamonds-data/master/BigDiamonds.Rda")
# load(rawConnection(diamondsurl))
big_diamond = read.csv('diamondsbig.csv')
The code used to obtain the data is available here: https://github.com/solomonm/diamonds-data
library(caTools)
set.seed(7)
s1 = subset(big_diamond, cert=='GIA', price<10000)
diamond_samp <- s1[sample(1:length(s1$price), 10000), ]
# sample1 = sample.split(diamond_samp$price, SplitRatio = .8)
# train = subset(diamond_samp, sample1==T & price < 10000 & cert=='GIA')
# test = subset(diamond_samp, sample1==F & price < 10000 )
m1 = lm(I(log(price)) ~ I(carat^(1/3)), data = s1)
m2 = update(m1, ~ . + carat)
m3 = update(m2, ~ . + cut)
m4 = update(m3, ~ . + color)
m5 = update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)
##
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = s1)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = s1)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = s1)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color,
## data = s1)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color +
## clarity, data = s1)
##
## =========================================================================
## m1 m2 m3 m4 m5
## -------------------------------------------------------------------------
## (Intercept) 2.888*** 1.194*** 0.993*** 1.099*** 0.462***
## (0.002) (0.006) (0.006) (0.004) (0.004)
## I(carat^(1/3)) 5.633*** 8.296*** 8.322*** 8.573*** 8.714***
## (0.003) (0.009) (0.009) (0.006) (0.004)
## carat -0.831*** -0.849*** -0.827*** -0.851***
## (0.003) (0.003) (0.002) (0.001)
## cut: Ideal 0.249*** 0.195*** 0.129***
## (0.002) (0.001) (0.001)
## cut: V.Good 0.130*** 0.087*** 0.065***
## (0.002) (0.001) (0.001)
## color: E/D -0.140*** -0.113***
## (0.001) (0.001)
## color: F/D -0.202*** -0.163***
## (0.001) (0.001)
## color: G/D -0.290*** -0.247***
## (0.001) (0.001)
## color: H/D -0.392*** -0.334***
## (0.001) (0.001)
## color: I/D -0.540*** -0.483***
## (0.002) (0.001)
## color: J/D -0.690*** -0.641***
## (0.002) (0.001)
## color: K/D -0.868*** -0.839***
## (0.002) (0.001)
## color: L/D -0.993*** -0.962***
## (0.003) (0.002)
## clarity: I2 -0.291***
## (0.007)
## clarity: IF 0.872***
## (0.002)
## clarity: SI1 0.413***
## (0.002)
## clarity: SI2 0.279***
## (0.002)
## clarity: VS1 0.614***
## (0.002)
## clarity: VS2 0.538***
## (0.002)
## clarity: VVS1 0.748***
## (0.002)
## clarity: VVS2 0.671***
## (0.002)
## -------------------------------------------------------------------------
## R-squared 0.9 0.9 0.9 1.0 1.0
## adj. R-squared 0.9 0.9 0.9 1.0 1.0
## sigma 0.4 0.3 0.3 0.2 0.2
## F 5019847.5 3084408.5 1638488.5 1046511.1 1344897.8
## p 0.0 0.0 0.0 0.0 0.0
## Log-likelihood -210283.1 -166233.0 -153162.0 -10015.1 161911.6
## Deviance 67236.3 55587.5 52536.3 28310.9 13473.0
## AIC 420572.2 332473.9 306336.1 20058.2 -323779.3
## BIC 420605.3 332518.1 306402.3 20212.8 -323536.3
## N 463066 463066 463066 463066 463066
## =========================================================================
Example Diamond from BlueNile: Round 1.00 Very Good I VS1 $5,601
#Be sure you've loaded the library memisc and have m5 saved as an object in your workspace.
thisDiamond = data.frame(carat = 1.00, cut = "V.Good",
color = "I", clarity="VS1")
modelEstimate = predict(m5, newdata = thisDiamond,
interval="prediction", level = .95)
exp(modelEstimate)
## fit lwr upr
## 1 5019.162 3592.796 7011.808
Evaluate how well the model predicts the BlueNile diamond’s price. Think about the fitted point estimate as well as the 95% CI.
Notes:
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!