# Session setup: the working directory is hard-coded to the lesson4 folder so
# that the relative path to the lesson3 data file below resolves.
setwd("C:/Users/Ryan/Desktop/MOOCs/Data Analysis with R/lesson4")
library(ggplot2)

# Read the tab-separated pseudo-Facebook data into `pf`.
pf <- read.csv(file = "../lesson3/pseudo_facebook.tsv", sep = "\t")
Notes:
Notes:
# Scatterplot of friend_count against age.
# Written with ggplot() + geom_point() instead of qplot(), which is deprecated
# in recent ggplot2 releases; with both x and y supplied, qplot() drew points,
# so the resulting plot is the same.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point()
Response:
Notes:
# Same scatterplot, with the x axis limited to ages 13 through 90.
# Rows outside the limits are dropped (see the warning printed below).
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Five-number summary (plus mean) of the age column.
summary(pf[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
Notes:
# Jittered, mostly transparent points (alpha = 1/20) to reduce overplotting,
# again restricted to ages 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5176 rows containing missing values (geom_point).
Response:
Notes:
# Open the help page for coord_trans.
help("coord_trans")
## starting httpd help server ...
## done
# Transparent scatterplot with a square-root transformation applied to the
# y coordinate via coord_trans.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).
Notes:
# Jittered scatterplot of friendships_initiated against age, coloured by gender.
# A commented-out alternative, position = position_jitter(h = 0), was left in
# the original layer call; it is not applied here either.
ggplot(
  data = pf,
  mapping = aes(x = age, y = friendships_initiated, colour = gender)
) +
  geom_jitter(alpha = 1 / 10) +
  xlim(13, 90)
## Warning: Removed 5184 rows containing missing values (geom_point).
Notes:
Notes:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Per-age summary of friend_count: mean, median and number of rows,
# ordered by age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

# Peek at the first rows of the summary table.
head(pf.fc_by_age)
## # A tibble: 6 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
Create your plot!
# Line plot of the mean friend count at each age.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
Notes:
# Raw data (orange, horizontally jittered only) overlaid with per-age summary
# lines: mean (solid), 10th and 90th percentiles (dashed) and median (blue).
#
# The summary function is passed as `fun`. The older `fun.y` name is not a
# recognised parameter when supplied through geom_line(stat = "summary") in
# ggplot2 >= 3.3.0, so the layers would not compute the requested statistic.
# `probs` and `height` are spelled out in full rather than relying on partial
# argument matching (`prob`, `h`).
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 10,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  geom_line(stat = "summary", fun = mean) +
  geom_line(
    stat = "summary", fun = quantile,
    fun.args = list(probs = 0.1),
    linetype = "dashed"
  ) +
  geom_line(
    stat = "summary", fun = quantile,
    fun.args = list(probs = 0.9),
    linetype = "dashed"
  ) +
  geom_line(stat = "summary", fun = median, color = "blue") +
  # Zoom without dropping rows, so the summaries use all the data.
  coord_cartesian(xlim = c(13, 90))
Response:
See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.
Notes:
Notes:
# Pearson correlation test between age and friend count.
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
## or
# Correlation coefficient only, without the test output.
with(pf, cor(friend_count, age))
## [1] -0.02740737
Look up the documentation for the cor.test function.
What’s the correlation between age and friend count? Round to three decimal places. Response:
Notes:
# Same test, evaluated inside `pf` so the columns can be named directly.
with(pf, cor.test(x = age, y = friend_count, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
Notes:
Notes:
# Likes received overall against likes received via the web, with both axes
# on a log10 scale.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()
Notes:
# Same two variables on linear axes, each cut off at its own 95th percentile,
# with a red linear-model fit drawn over the points.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).
What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
# Correlation over all rows (nothing is trimmed here).
with(pf, cor(www_likes_received, likes_received))
## [1] 0.9479902
Response:
Notes:
Notes:
library(alr3)
## Loading required package: car
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load the Mitchell data set and open its help page.
# (An earlier attempt, load(Mitchell), is intentionally not run.)
data("Mitchell")
help("Mitchell")
Create your plot!
Take a guess for the correlation coefficient for the scatterplot.
What is the actual correlation of the two variables? (Round to the thousandths place)
# Scatterplot of Temp against Month from the Mitchell data set.
# A stray scale_x_date() statement used to follow this plot: because the `+`
# joining it to the plot was commented out, it ran on its own and only printed
# an empty scale object. It has been removed together with that printed output.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()
Notes:
# Pearson correlation matrix of the columns of Mitchell.
cor(Mitchell, method = "pearson")
## Month Temp
## Month 1.00000000 0.05747063
## Temp 0.05747063 1.00000000
# Temp against Month with an x-axis break every 12 months (0, 12, ..., 192).
# Month is numeric (cor(Mitchell) is computed on it), so the breaks belong on
# a continuous scale; a discrete scale does not match numeric breaks to
# continuous data.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))
What do you notice? Response:
Watch the solution video and check out the Instructor Notes! Notes:
Notes:
# Mean friend count by age in whole years, for comparison with the
# month-resolution version built next.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
# Fractional age: whole years plus (12 - birth month) / 12.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)

# Inspect the structure of the new column.
str(pf$age_with_months)
## num [1:99003] 14.1 14.1 14.1 14 14 ...
# Summary of friend_count for each distinct age_with_months value: mean,
# median and number of rows, ordered by age_with_months.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

# Peek at the first rows of the summary table.
head(pf.fc_by_age_months)
## # A tibble: 6 x 4
## age_with_months friend_count_mean friend_count_median n
## <dbl> <dbl> <dbl> <int>
## 1 13.16667 46.33333 30.5 6
## 2 13.25000 115.07143 23.5 14
## 3 13.33333 136.20000 44.0 25
## 4 13.41667 164.24242 72.0 33
## 5 13.50000 131.17778 66.0 45
## 6 13.58333 156.81481 64.0 54
Programming Assignment
# Mean friend count by fractional age, restricted to ages below 71.
# Only the line is drawn; a geom_point() layer is left out, as in the
# commented-out line of the original.
ggplot(
  data = pf.fc_by_age_months[pf.fc_by_age_months$age_with_months < 71, ],
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line(color = "blue")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Three views of mean friend count for ages below 71, at different
# resolutions, stacked in a single column.

# p1: one mean per age in whole years.
p1 <- ggplot(
  data = pf.fc_by_age[pf.fc_by_age$age < 71, ],
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line(color = "orange")

# p2: one mean per fractional (month-resolution) age.
p2 <- ggplot(
  data = pf.fc_by_age_months[pf.fc_by_age_months$age_with_months < 71, ],
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line(color = "blue")

# p3: ages rounded to the nearest multiple of 5, with the mean computed by the
# summary stat. The function is passed as `fun`; the older `fun.y` name is not
# a recognised parameter when supplied through geom_line(stat = "summary") in
# ggplot2 >= 3.3.0, so the mean would not be the statistic explicitly requested.
p3 <- ggplot(
  data = pf[pf$age < 71, ],
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun = mean, color = "red")

grid.arrange(p1, p2, p3, ncol = 1)
Notes:
# Add a default smoother to each of the three plots and stack the results
# in a single column.
a <- p1 + geom_smooth()
b <- p2 + geom_smooth()
c <- p3 + geom_smooth()
grid.arrange(a, b, c, ncol = 1)
Notes:
Reflection:
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!