Lesson 4


# Session setup: switch to the lesson folder, attach ggplot2, and read the
# pseudo-Facebook sample into `pf`.
# NOTE(review): the absolute path below only exists on the author's machine;
# the relative data path further down is resolved against it.
setwd("C:/Users/Ryan/Desktop/MOOCs/Data Analysis with R/lesson4")
library(ggplot2)
# The file is tab-separated, so the separator is overridden.
pf <- read.csv("../lesson3/pseudo_facebook.tsv", sep = "\t")

Scatterplots and Perceived Audience Size

Notes:


Scatterplots

Notes:

# Scatterplot of friend_count against age, one point per row of `pf`.
# qplot() is deprecated as of ggplot2 3.4.0, so the plot is built with
# ggplot() + geom_point(), which is what qplot() produced for an x/y pair.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point()


What are some things that you notice right away?

Response:


ggplot Syntax

Notes:

# Same scatterplot, with the x axis limited to ages 13-90.
# xlim() drops rows outside the limits, hence the recorded warning.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Five-number summary (plus mean) of the age column.
summary(pf[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00

Overplotting

Notes:

# Reduce overplotting: jitter the points and draw them at 1/20 opacity,
# so it takes 20 overlapping points to reach a fully opaque mark.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5176 rows containing missing values (geom_point).

What do you notice in the plot?

Response:


Coord_trans()

Notes:

# Open the help page for coord_trans().
help("coord_trans")
## starting httpd help server ...
##  done

Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

# Scatterplot with the y axis drawn on a square-root scale.
# geom_point() is used without jitter here.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).

What do you notice?


Alpha and Jitter

Notes:

# Jittered scatter of friendships_initiated against age, coloured by gender.
# geom_jitter() runs with its default jitter on both axes; an x-only variant,
# position_jitter(h = 0), is not enabled here.
ggplot(
  data = pf,
  mapping = aes(x = age, y = friendships_initiated, colour = gender)
) +
  geom_jitter(alpha = 1 / 10) +
  xlim(13, 90)
## Warning: Removed 5184 rows containing missing values (geom_point).


Overplotting and Domain Knowledge

Notes:


Conditional Means

Notes:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Per-age summary of friend_count: mean, median, and number of rows,
# sorted by age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

# Peek at the first six age groups.
head(pf.fc_by_age, n = 6L)
## # A tibble: 6 x 4
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196

Create your plot!

# Line plot of the mean friend count for each age.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()


Overlaying Summaries with Raw Data

Notes:

# Raw data overlaid with per-age summaries of friend_count:
#   - orange points: the raw observations, jittered horizontally only;
#   - solid default-coloured line: mean;
#   - dashed lines: 10th and 90th percentiles;
#   - blue line: median.
# `fun` replaces `fun.y`, which is deprecated since ggplot2 3.3.0, and the
# partially matched arguments `prob` / `h` are spelled out in full as
# `probs` / `height`.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 10,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  geom_line(stat = "summary", fun = mean) +
  geom_line(
    stat = "summary", fun = quantile,
    fun.args = list(probs = 0.1),
    linetype = "dashed"
  ) +
  geom_line(
    stat = "summary", fun = quantile,
    fun.args = list(probs = 0.9),
    linetype = "dashed"
  ) +
  geom_line(stat = "summary", fun = median, color = "blue") +
  # Zoom to ages 13-90 without dropping data, so the summaries are still
  # computed on every row.
  coord_cartesian(xlim = c(13, 90))

What are some of your observations of the plot?

Response:


Moira: Histogram Summary and Scatterplot

See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.

Notes:


Correlation

Notes:

# Pearson correlation between age and friend count, with its significance
# test and confidence interval.
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
## or
# cor() returns just the coefficient; argument order does not matter.
cor(x = pf$friend_count, y = pf$age, method = "pearson")
## [1] -0.02740737

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response:


Correlation on Subsets

Notes:

# Correlation test evaluated inside `pf`, so the columns can be named bare.
# NOTE(review): this section is titled "Correlation on Subsets", but the call
# runs on all of `pf` and the recorded result matches the full-data test.
# Presumably a subset (e.g. an age cut-off) was intended; confirm and re-run.
with(pf, cor.test(x = age, y = friend_count))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Correlation Methods

Notes:


Create Scatterplots

Notes:

# likes_received against www_likes_received, both axes on a log10 scale.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()


Strong Correlations

Notes:

# Same pair of variables with each axis capped at its 95th percentile,
# plus a red linear-model fit. Rows beyond the limits are dropped before
# the fit, as the recorded warnings show.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  geom_smooth(method = "lm", color = "red") +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95))
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

# Correlation over all rows of `pf`, i.e. including the top 5% of values.
cor(x = pf$www_likes_received, y = pf$likes_received)
## [1] 0.9479902

Response:


Moira on Correlation

Notes:


More Caution with Correlation

Notes:

library(alr3)
## Loading required package: car
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Load the Mitchell data set from the attached package and open its help
# page. data() is the loader here; load() is for files, not data-set names.
data("Mitchell")
help("Mitchell")

Create your plot!


Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot.

  2. What is the actual correlation of the two variables? (Round to the thousandths place)

# Scatterplot of Temp against Month from the Mitchell data, used to judge the
# strength of the relationship by eye before computing the correlation.
# No date scale is added: a stand-alone scale_x_date() call is not part of the
# plot and only prints an empty scale object.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()

Making Sense of Data

Notes:

# Correlation matrix of every column in Mitchell; the off-diagonal entry is
# the Month/Temp correlation.
cor(x = Mitchell)
##            Month       Temp
## Month 1.00000000 0.05747063
## Temp  0.05747063 1.00000000
# Temp against Month with an x-axis break every 12 units of Month, from 0
# up to 203.
# Month is numeric, so the breaks are set on a continuous scale; a discrete
# scale is not meant for continuous data.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))


A New Perspective

What do you notice? Response:

Watch the solution video and check out the Instructor Notes! Notes:


Understanding Noise: Age to Age Months

Notes:

# Mean friend count per whole year of age, as a baseline for the noise
# comparison that follows.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()


Age with Months Means

# Fractional age: whole years plus the share of a year left after the birth
# month, (12 - dob_month) / 12.
pf[["age_with_months"]] <- pf[["age"]] + (12 - pf[["dob_month"]]) / 12
str(pf[["age_with_months"]])
##  num [1:99003] 14.1 14.1 14.1 14 14 ...
# Same per-group summary as pf.fc_by_age, but grouped by fractional age:
# mean, median, and row count of friend_count, sorted by age_with_months.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

# First six fractional-age groups; note how small the group sizes are.
head(pf.fc_by_age_months, n = 6L)
## # A tibble: 6 x 4
##   age_with_months friend_count_mean friend_count_median     n
##             <dbl>             <dbl>               <dbl> <int>
## 1        13.16667          46.33333                30.5     6
## 2        13.25000         115.07143                23.5    14
## 3        13.33333         136.20000                44.0    25
## 4        13.41667         164.24242                72.0    33
## 5        13.50000         131.17778                66.0    45
## 6        13.58333         156.81481                64.0    54

Programming Assignment

# Mean friend count by fractional age, restricted to groups below age 71,
# drawn as a blue line only (no point layer).
ggplot(
  data = pf.fc_by_age_months[pf.fc_by_age_months$age_with_months < 71, ],
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line(color = "blue")


Noise in Conditional Means

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# p1: mean friend count per whole year of age (ages below 71), in orange.
p1 <- ggplot(
  data = pf.fc_by_age[pf.fc_by_age$age < 71, ],
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line(color = "orange")

# p2: the same mean at fractional-age resolution (below 71), in blue.
p2 <- ggplot(
  data = pf.fc_by_age_months[pf.fc_by_age_months$age_with_months < 71, ],
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line(color = "blue")

# p3: mean friend count with age rounded to the nearest multiple of 5,
# computed from the raw rows (ages below 71) by the summary stat, in red.
# `fun` replaces `fun.y`, which is deprecated since ggplot2 3.3.0.
p3 <- ggplot(
  data = pf[pf$age < 71, ],
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun = mean, color = "red")

# Stack the three resolutions in one column for comparison.
grid.arrange(p1, p2, p3, ncol = 1)


Smoothing Conditional Means

Notes:

# Add a default smoother to each of the three conditional-mean plots and
# stack them in one column.
# NOTE(review): the variable `c` shares its name with base::c(); calls to
# c() still resolve to the function, but a different name would be clearer.
a <- p1 + geom_smooth()
b <- p2 + geom_smooth()
c <- p3 + geom_smooth()
grid.arrange(a, b, c, ncol = 1)


Which Plot to Choose?

Notes:


Analyzing Two Variables

Reflection:


Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!