food = read.csv(
'https://spreadsheets.google.com/pub?key=0ArfEDsV3bBwCdGlYVVpXX20tbU13STZyVG0yNkRrZnc&output=csv',
header=TRUE, check.names = FALSE)
head(food)
## 1961 1962 1963 1964 1965 1966
## 1 Abkhazia NA NA NA NA NA NA
## 2 Afghanistan NA NA NA NA NA NA
## 3 Akrotiri and Dhekelia NA NA NA NA NA NA
## 4 Albania 2233.67 2248.11 2163.45 2275.92 2258.32 2258.64
## 5 Algeria 1700.44 1657.72 1624.03 1644.20 1704.08 1685.77
## 6 American Samoa NA NA NA NA NA NA
## 1967 1968 1969 1970 1971 1972 1973 1974 1975
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 2265.64 2345.52 2407.55 2418.76 2366.48 2395.14 2444.74 2513.19 2521.74
## 5 1767.02 1831.98 1825.67 1790.36 1832.87 1964.81 1958.93 2092.12 2163.39
## 6 NA NA NA NA NA NA NA NA NA
## 1976 1977 1978 1979 1980 1981 1982 1983 1984
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 2715.62 2819.06 2736.79 2660.29 2659.95 2747.51 2691.75 2840.86 2758.98
## 5 2143.57 2305.94 2437.99 2537.11 2657.34 2685.14 2656.45 2637.55 2584.16
## 6 NA NA NA NA NA NA NA NA NA
## 1985 1986 1987 1988 1989 1990 1991 1992 1993
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 2607.83 2738.90 2552.18 2647.18 2629.80 2656.37 2388.87 2650.29 2855.45
## 5 2700.86 2717.66 2724.66 2795.00 2862.84 2856.21 2835.47 2972.77 2971.61
## 6 NA NA NA NA NA NA NA NA NA
## 1994 1995 1996 1997 1998 1999 2000 2001 2002
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 2938.79 2825.83 2979.30 2746.72 2790.70 2891.76 2832.07 2861.18 2864.93
## 5 2865.20 2891.30 2893.72 2844.07 2904.07 2957.91 2928.84 3003.63 3034.33
## 6 NA NA NA NA NA NA NA NA NA
## 2003 2004 2005 2006 2007
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 2838.24 2849.36 2917.08 2914.95 2879.57
## 5 3073.26 3090.13 3059.24 3101.20 3153.38
## 6 NA NA NA NA NA
to tansform data
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
food = tidyr::gather(food, 'Year',
'kcal', na.rm = T, 2:48)
head(food)
## Year kcal
## 4 Albania 1961 2233.67
## 5 Algeria 1961 1700.44
## 8 Angola 1961 1784.67
## 10 Antigua and Barbuda 1961 1971.91
## 11 Argentina 1961 3099.85
## 14 Australia 1961 3086.60
colnames(food) = c('country', 'year','kcal')
Let’s look at United States data, what is trend?
us = food[food$country=='United States',]
library(ggplot2)
range(us$year)
## [1] "1961" "2007"
# scale with range tick every 5 years
ggplot(aes(year, kcal), data=us) +
geom_point(color='red') +
scale_x_discrete(breaks=seq(1961, 2007, 5))
As expected it increases, but almost 3800 kcal per person?! We must be exporting most of our food right.
Are there any decreasing? Let’s check out some other countries:
food[food$kcal==min(food$kcal[food$year==2007]),]
## country year kcal
## 11964 Congo, Dem. Rep. 2007 1605.08
So Democratic Republic of Congo produces least amount of food (in 2007) let’s look at the plot:
congo = subset(food, food$country=='Congo, Dem. Rep.')
qplot(year, kcal, data=congo) +
scale_x_discrete(breaks=seq(1961, 2007, 5))
Whoa! Huge drop off in the 90’s let’s check out why. DR Congo wiki Sounds like US relations post cold war contributed and Mobutu became more currupt, leading to civil war, haven’t really recovered since.
Let’s try some more interesting plots with Plotly: borrowing a dataset with country codes and GDP
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:graphics':
##
## layout
df <- read.csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')
str(df)
## 'data.frame': 222 obs. of 3 variables:
## $ COUNTRY : Factor w/ 222 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ GDP..BILLIONS.: num 21.71 13.4 227.8 0.75 4.8 ...
## $ CODE : Factor w/ 221 levels "ABW","AFG","AGO",..: 2 5 57 10 6 3 4 11 8 9 ...
colnames(df) = c('country', 'GDP_Billions', 'Country_Code')
set up parameters, boundaries and map options:
# light grey boundaries
l <- list(color = toRGB("grey"), width = 0.5)
# specify map projection/options
g <- list(
showframe = FALSE,
showcoastlines = FALSE,
projection = list(type = 'Mercator')
)
Before plotting need to join the two datasets
x = merge(food, df, by='country', all.x = TRUE)#, all.y = TRUE)
subset(x, is.na(Country_Code) & year==2007)
## country year kcal GDP_Billions Country_Code
## 365 Bahamas 2007 2712.50 NA <NA>
## 1236 Cape Verde 2007 2571.92 NA <NA>
## 1282 Central African Rep. 2007 1985.84 NA <NA>
## 1563 Congo, Dem. Rep. 2007 1605.08 NA <NA>
## 1603 Congo, Rep. 2007 2511.88 NA <NA>
## 1830 Czech Rep. 2007 3260.34 NA <NA>
## 2041 Dominican Rep. 2007 2295.10 NA <NA>
## 2547 Gambia 2007 2384.59 NA <NA>
## 3713 Korea, Dem. Rep. 2007 2087.04 NA <NA>
## 3751 Korea, Rep. 2007 3073.71 NA <NA>
## 4110 Macedonia, FYR 2007 3105.47 NA <NA>
## 4706 Myanmar 2007 2464.91 NA <NA>
## 4900 Netherlands Antilles 2007 3004.49 NA <NA>
## 6101 Slovak Republic 2007 2892.60 NA <NA>
## 7399 West Bank and Gaza 2007 2019.71 NA <NA>
## 7449 Yemen, Rep. 2007 2067.57 NA <NA>
Have a bunch of countries without country codes (‘NA’) let’s fix it:
x$Country_Code[x$country=='Congo, Rep.'] = 'COG'
x$Country_Code[x$country=='Congo, Dem. Rep.'] = 'COD'
x$Country_Code[x$country=='Central African Rep.'] = 'CAF'
x$Country_Code[x$country=='Myanmar'] = 'MMR'
x$Country_Code[x$country=='Yemen, Rep.'] = 'YEM'
x$Country_Code[x$country=='Czech Rep.'] = 'CZE'
x$Country_Code[x$country=='Slovak Republic'] = 'SVK'
x$Country_Code[x$country=='West Bank and Gaza'] = 'WBG'
x$Country_Code[x$country=='Bahamas'] = 'BHM'
x$Country_Code[x$country=='Cape Verde'] = 'CPV'
x$Country_Code[x$country=='Dominican Rep.'] = 'DOM'
x$Country_Code[x$country=='Gambia'] = 'GMB'
x$Country_Code[x$country=='Korea, Dem. Rep.'] = 'KOR'
x$Country_Code[x$country=='Korea, Rep.'] = 'PRK'
x$Country_Code[x$country=='Macedonia, FYR'] = 'MKD'
#x$Country_Code[x$country=='Netherlands Antilles'] = 'ANT'
subset(x, is.na(Country_Code) & year==2007)
## country year kcal GDP_Billions Country_Code
## 4900 Netherlands Antilles 2007 3004.49 NA <NA>
Code for ‘Netherlands and Antilles’ is invalid so we’ll leave it out Let’s check out the map:
plot_ly(x[x$year==2007,], z = kcal, text = country, locations = Country_Code, type = 'choropleth',
color = kcal, colors = 'Purples', marker = list(line = l),
colorbar = list(title = 'kcal/person')) %>%
layout(title = 'food supply per person per day in kcals<br>(2007)',
geo = g)
We have some white holes in the map. Greenland, Afghanstan, and South Sudan all stick out as missing. Can add them so hovering over them at least shows country name:
af = df[df$country=='Afghanistan',]
af$kcal = NA
af$year = 2007
x = rbind(x, af)
green = df[df$country=='Greenland',]
green$kcal=NA
green$year = 2007
x = rbind(x, green)
sud = df[df$country=='South Sudan',]
sud$kcal=NA
sud$year = 2007
x = rbind(x, sud)
Let’s check the map out again!
plot_ly(x[x$year==2007,], z = kcal, text = country, locations = Country_Code, type = 'choropleth',
color = kcal, colors = 'Purples', marker = list(line = l),
colorbar = list(title = 'kcal/person')) %>%
layout(title = 'food supply per person per day in kcals<br>(2007)',
geo = g)
There we go! That looks better. It follows what we would expect: richer countries produce more food. But most countries produce over 2000 kcals per day per person which is interesting. Let’s check out total production really quick.
mean(x$kcal[x$year==2007], na.rm = TRUE)
## [1] 2756.14
So Average kcal production per person in 2007 was 2756. Let’s plot the averages:
means = aggregate(food$kcal, by=list(food$year), FUN='mean')
colnames(means) = c('year', 'mean_kcal')
qplot(year, mean_kcal, data=means,
main = 'mean kcal production') +
geom_point() +
scale_x_discrete(breaks=seq(1961, 2007, 5))
Global production increasing linearly by about 500 kcal since 1961.
means$mean_kcal[means$year==1961]
## [1] 2245.955
and we saw before 2007 mean was about 2756. In 2007 there were ~6.6 Billion people according to the internet, so total production:
means$mean_kcal[means$year==2007]*6600000000
## [1] 1.819052e+13
that is 18.2 trillion kcals of food per DAY!