setwd("C:/Users/Ryan/Desktop/MOOCs/Data Analysis with R/lesson5")
pf = read.csv('../lesson3/pseudo_facebook.tsv',
sep = '\t')
Notes:
Notes:
Notes:
setwd("C:/Users/Ryan/Desktop/MOOCs/Data Analysis with R/lesson5")
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
pf.fc_by_age_gender = pf %>%
filter(!is.na(gender)) %>%
group_by(age, gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = as.integer(median(friend_count)),
n=n()) %>%
ungroup() %>%
arrange(age)
head(pf.fc_by_age_gender)
## # A tibble: 6 x 5
## age gender mean_friend_count median_friend_count n
## <int> <fctr> <dbl> <int> <int>
## 1 13 female 259.1606 148 193
## 2 13 male 102.1340 55 291
## 3 14 female 362.4286 224 847
## 4 14 male 164.1456 92 1078
## 5 15 female 538.6813 276 1139
## 6 15 male 200.6658 106 1478
tail(pf.fc_by_age_gender)
## # A tibble: 6 x 5
## age gender mean_friend_count median_friend_count n
## <int> <fctr> <dbl> <int> <int>
## 1 111 female 244.4286 109 7
## 2 111 male 246.5000 191 10
## 3 112 female 201.2000 203 5
## 4 112 male 594.0769 95 13
## 5 113 female 278.6571 198 105
## 6 113 male 410.3956 223 91
Notes:
ggplot(aes(age, median_friend_count),
data=pf.fc_by_age_gender) +
geom_line(aes(color=gender))
Notes:
Notes:
Notes:
library(tidyr)
pf.fc_by_age_gender.wide =
subset(pf.fc_by_age_gender[c('age',
'gender',
'median_friend_count')],
!is.na(gender)) %>%
spread(gender, median_friend_count) %>%
mutate(ratio = male/female)
head(pf.fc_by_age_gender.wide)
## # A tibble: 6 x 4
## age female male ratio
## <int> <int> <int> <dbl>
## 1 13 148 55 0.3716216
## 2 14 224 92 0.4107143
## 3 15 276 106 0.3840580
## 4 16 258 136 0.5271318
## 5 17 245 125 0.5102041
## 6 18 243 122 0.5020576
Notes:
ggplot(aes(age, female/male),
data=pf.fc_by_age_gender.wide) +
geom_line() +
geom_hline(yintercept = 1,
color='red',
linetype=2)
Notes:
pf$year_joined = floor(2014-pf$tenure/365)
table(pf$year_joined)
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
Notes:
?cut
## starting httpd help server ...
## done
pf$year_joined.bucket = cut(pf$year_joined, breaks = c(2004,
2009,
2011,
2012,
2014))
table(pf$year_joined.bucket)
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014]
## 6669 15308 33366 43658
Notes:
ggplot(aes(age, friend_count), data=pf[!is.na(pf$gender),]) +
geom_line(aes(color=year_joined.bucket),
stat = 'summary', fun.y=median)
Notes:
ggplot(aes(age, friend_count), data=pf[!is.na(pf$gender),]) +
geom_line(aes(color=year_joined.bucket),
stat = 'summary', fun.y=mean) +
geom_line(linetype=2,
stat='summary',
fun.y=mean)
Notes:
# x = pf %>%
# filter(tenure>0)
# x=x$friend_count/x$tenure
with(pf[pf$tenure>0,],
summary(friend_count/tenure))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0775 0.2205 0.6096 0.5658 417.0000 2
Notes:
What is the median friend rate?
What is the maximum friend rate?
ggplot(aes(tenure,
friendships_initiated/tenure),
data=pf[pf$tenure>0,]) +
geom_line(aes(color=year_joined.bucket),
stat = 'summary',
fun.y=mean)
## Warning: Removed 2 rows containing non-finite values (stat_summary).
Notes:
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean)
ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
ggplot(aes(tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_smooth(aes(color = year_joined.bucket))
Notes:
yo = read.csv('yogurt.csv')
head(yo)
## obs id time strawberry blueberry pina.colada plain mixed.berry
## 1 1 2100081 9678 0 0 0 0 1
## 2 2 2100081 9697 0 0 0 0 1
## 3 3 2100081 9825 0 0 0 0 1
## 4 4 2100081 9999 0 0 0 0 1
## 5 5 2100081 10015 1 0 1 0 1
## 6 6 2100081 10029 1 0 2 0 1
## price
## 1 58.96
## 2 58.96
## 3 65.04
## 4 65.04
## 5 48.96
## 6 65.04
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
yo$id = factor(yo$id)
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
Notes:
Notes:
# ggplot(aes(price), data=yo,
# fill='orange')) +
# geom_histogram()
qplot(price, data=yo, fill=I('orange'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table(yo$price)
##
## 20 24.96 33.04 33.2 33.28 33.36 33.52 39.04 44 45.04 48.96 49.52
## 2 11 54 1 1 22 1 234 21 11 81 1
## 49.6 50 55.04 58.96 62 63.04 65.04 68.96
## 1 205 6 303 15 2 799 609
Notes:
yo = transform(yo,
all.purchases=strawberry+blueberry+pina.colada+plain+mixed.berry)
Notes:
ggplot(aes(time, price), data=yo) +
geom_jitter(alpha=.25, shape=21,
fill='orange')
ggplot(aes(time, price), data=yo) +
geom_smooth()
Notes:
set.seed(7)
sample.ids = sample(levels(yo$id), 16)
p1=ggplot(aes(time, price),
data=subset(yo, id %in% sample.ids)) +
facet_wrap(~id) +
geom_line() +
geom_point(aes(size=all.purchases), pch=1)
p1
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:graphics':
##
## layout
ggplotly(p1)
Notes:
Notes:
# install.packages('GGally')
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
theme_set(theme_minimal(20))
set.seed(7)
pf_subset = pf[, c(2:15)]
names(pf_subset)
## [1] "age" "dob_day"
## [3] "dob_year" "dob_month"
## [5] "gender" "tenure"
## [7] "friend_count" "friendships_initiated"
## [9] "likes" "likes_received"
## [11] "mobile_likes" "mobile_likes_received"
## [13] "www_likes" "www_likes_received"
ggpairs(pf_subset[sample.int(nrow(pf_subset),
1000), ])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes:
Notes:
nci <- read.table("nci.tsv")
colnames(nci) <- c(1:64)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
nci.long.samp <- melt(as.matrix(nci[1:200,]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
## gene case value
## 1 1 1 0.300
## 2 2 1 1.180
## 3 3 1 0.550
## 4 4 1 1.140
## 5 5 1 -0.265
## 6 6 1 -0.070
ggplot(aes(y = gene, x = case, fill = value),
data = nci.long.samp) +
geom_tile() +
scale_fill_gradientn(colours = colorRampPalette(c("yellow", "red"))(100))
ggplotly()
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!