Lesson 5

setwd("C:/Users/Ryan/Desktop/MOOCs/Data Analysis with R/lesson5")

pf = read.csv('../lesson3/pseudo_facebook.tsv',
              sep = '\t')

Multivariate Data

Notes:


Moira Perceived Audience Size Colored by Age

Notes:


Third Qualitative Variable

Notes:

setwd("C:/Users/Ryan/Desktop/MOOCs/Data Analysis with R/lesson5")
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
pf.fc_by_age_gender = pf %>% 
  filter(!is.na(gender)) %>% 
  group_by(age, gender) %>% 
  summarise(mean_friend_count = mean(friend_count),
            median_friend_count = as.integer(median(friend_count)),
            n=n()) %>% 
  ungroup() %>% 
  arrange(age)

head(pf.fc_by_age_gender)
## # A tibble: 6 x 5
##     age gender mean_friend_count median_friend_count     n
##   <int> <fctr>             <dbl>               <int> <int>
## 1    13 female          259.1606                 148   193
## 2    13   male          102.1340                  55   291
## 3    14 female          362.4286                 224   847
## 4    14   male          164.1456                  92  1078
## 5    15 female          538.6813                 276  1139
## 6    15   male          200.6658                 106  1478
tail(pf.fc_by_age_gender)
## # A tibble: 6 x 5
##     age gender mean_friend_count median_friend_count     n
##   <int> <fctr>             <dbl>               <int> <int>
## 1   111 female          244.4286                 109     7
## 2   111   male          246.5000                 191    10
## 3   112 female          201.2000                 203     5
## 4   112   male          594.0769                  95    13
## 5   113 female          278.6571                 198   105
## 6   113   male          410.3956                 223    91

Plotting Conditional Summaries

Notes:

ggplot(aes(age, median_friend_count),
       data=pf.fc_by_age_gender) +
  geom_line(aes(color=gender))


Thinking in Ratios

Notes:


Wide and Long Format

Notes:


Reshaping Data

Notes:

library(tidyr)
pf.fc_by_age_gender.wide = 
  subset(pf.fc_by_age_gender[c('age',
                               'gender',
                               'median_friend_count')],
         !is.na(gender)) %>%
  spread(gender, median_friend_count) %>% 
  mutate(ratio = male/female)
head(pf.fc_by_age_gender.wide)
## # A tibble: 6 x 4
##     age female  male     ratio
##   <int>  <int> <int>     <dbl>
## 1    13    148    55 0.3716216
## 2    14    224    92 0.4107143
## 3    15    276   106 0.3840580
## 4    16    258   136 0.5271318
## 5    17    245   125 0.5102041
## 6    18    243   122 0.5020576

Ratio Plot

Notes:

ggplot(aes(age, female/male), 
       data=pf.fc_by_age_gender.wide) +
  geom_line() +
  geom_hline(yintercept = 1,
             color='red',
             linetype=2)


Third Quantitative Variable

Notes:

pf$year_joined = floor(2014-pf$tenure/365)
table(pf$year_joined)              
## 
##  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014 
##     9    15   581  1507  4557  5448  9860 33366 43588    70

Cut a Variable

Notes:

?cut
## starting httpd help server ...
##  done
pf$year_joined.bucket = cut(pf$year_joined, breaks = c(2004,
                                                       2009,
                                                       2011,
                                                       2012,
                                                       2014))
table(pf$year_joined.bucket)
## 
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] 
##        6669       15308       33366       43658

Plotting it All Together

Notes:

ggplot(aes(age, friend_count), data=pf[!is.na(pf$gender),]) +
  geom_line(aes(color=year_joined.bucket),
            stat = 'summary', fun.y=median)


Plot the Grand Mean

Notes:

ggplot(aes(age, friend_count), data=pf[!is.na(pf$gender),]) +
  geom_line(aes(color=year_joined.bucket),
            stat = 'summary', fun.y=mean) + 
  geom_line(linetype=2,
            stat='summary',
            fun.y=mean)


Friending Rate

Notes:

# x = pf %>% 
#   filter(tenure>0)
# x=x$friend_count/x$tenure
with(pf[pf$tenure>0,],
     summary(friend_count/tenure))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   0.0000   0.0775   0.2205   0.6096   0.5658 417.0000        2

Friendships Initiated

Notes:

What is the median friend rate?

What is the maximum friend rate?

ggplot(aes(tenure, 
           friendships_initiated/tenure),
       data=pf[pf$tenure>0,]) + 
  geom_line(aes(color=year_joined.bucket),
            stat = 'summary',
            fun.y=mean)
## Warning: Removed 2 rows containing non-finite values (stat_summary).


Bias-Variance Tradeoff Revisited

Notes:

ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure >= 1)) +
  geom_line(aes(color = year_joined.bucket),
            stat = 'summary',
            fun.y = mean)

ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

ggplot(aes(tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_smooth(aes(color = year_joined.bucket))


Sean’s NFL Fan Sentiment Study

Notes:


yo = read.csv('yogurt.csv')
head(yo)
##   obs      id  time strawberry blueberry pina.colada plain mixed.berry
## 1   1 2100081  9678          0         0           0     0           1
## 2   2 2100081  9697          0         0           0     0           1
## 3   3 2100081  9825          0         0           0     0           1
## 4   4 2100081  9999          0         0           0     0           1
## 5   5 2100081 10015          1         0           1     0           1
## 6   6 2100081 10029          1         0           2     0           1
##   price
## 1 58.96
## 2 58.96
## 3 65.04
## 4 65.04
## 5 48.96
## 6 65.04
str(yo)
## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : int  2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...
yo$id = factor(yo$id)
str(yo)
## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...

Introducing the Yogurt Data Set

Notes:


Histograms Revisited

Notes:

# ggplot(aes(price), data=yo,
#        fill='orange')) +
#   geom_histogram()
qplot(price, data=yo, fill=I('orange'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table(yo$price)
## 
##    20 24.96 33.04  33.2 33.28 33.36 33.52 39.04    44 45.04 48.96 49.52 
##     2    11    54     1     1    22     1   234    21    11    81     1 
##  49.6    50 55.04 58.96    62 63.04 65.04 68.96 
##     1   205     6   303    15     2   799   609

Number of Purchases

Notes:

yo = transform(yo,
               all.purchases=strawberry+blueberry+pina.colada+plain+mixed.berry)

Prices over Time

Notes:

ggplot(aes(time, price), data=yo) +
  geom_jitter(alpha=.25, shape=21,
              fill='orange')

ggplot(aes(time, price), data=yo) +
  geom_smooth()


Sampling Observations

Notes:


Looking at Samples of Households

set.seed(7)
sample.ids = sample(levels(yo$id), 16)

p1=ggplot(aes(time, price), 
       data=subset(yo, id %in% sample.ids)) +
  facet_wrap(~id) +
  geom_line() +
  geom_point(aes(size=all.purchases), pch=1)
p1

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:graphics':
## 
##     layout
ggplotly(p1)

The Limits of Cross Sectional Data

Notes:


Many Variables

Notes:


Scatterplot Matrix

# install.packages('GGally')
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
theme_set(theme_minimal(20))

set.seed(7)
pf_subset = pf[, c(2:15)]
names(pf_subset)
##  [1] "age"                   "dob_day"              
##  [3] "dob_year"              "dob_month"            
##  [5] "gender"                "tenure"               
##  [7] "friend_count"          "friendships_initiated"
##  [9] "likes"                 "likes_received"       
## [11] "mobile_likes"          "mobile_likes_received"
## [13] "www_likes"             "www_likes_received"
ggpairs(pf_subset[sample.int(nrow(pf_subset),
                             1000), ])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


Even More Variables

Notes:


Heat Maps

Notes:

nci <- read.table("nci.tsv")
colnames(nci) <- c(1:64)
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
nci.long.samp <- melt(as.matrix(nci[1:200,]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
##   gene case  value
## 1    1    1  0.300
## 2    2    1  1.180
## 3    3    1  0.550
## 4    4    1  1.140
## 5    5    1 -0.265
## 6    6    1 -0.070
ggplot(aes(y = gene, x = case, fill = value),
  data = nci.long.samp) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("yellow", "red"))(100))

ggplotly()


Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!