food = read.csv(
  'https://spreadsheets.google.com/pub?key=0ArfEDsV3bBwCdGlYVVpXX20tbU13STZyVG0yNkRrZnc&output=csv',
  header=TRUE, check.names = FALSE)
head(food)
##                            1961    1962    1963    1964    1965    1966
## 1              Abkhazia      NA      NA      NA      NA      NA      NA
## 2           Afghanistan      NA      NA      NA      NA      NA      NA
## 3 Akrotiri and Dhekelia      NA      NA      NA      NA      NA      NA
## 4               Albania 2233.67 2248.11 2163.45 2275.92 2258.32 2258.64
## 5               Algeria 1700.44 1657.72 1624.03 1644.20 1704.08 1685.77
## 6        American Samoa      NA      NA      NA      NA      NA      NA
##      1967    1968    1969    1970    1971    1972    1973    1974    1975
## 1      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 2      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 3      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 4 2265.64 2345.52 2407.55 2418.76 2366.48 2395.14 2444.74 2513.19 2521.74
## 5 1767.02 1831.98 1825.67 1790.36 1832.87 1964.81 1958.93 2092.12 2163.39
## 6      NA      NA      NA      NA      NA      NA      NA      NA      NA
##      1976    1977    1978    1979    1980    1981    1982    1983    1984
## 1      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 2      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 3      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 4 2715.62 2819.06 2736.79 2660.29 2659.95 2747.51 2691.75 2840.86 2758.98
## 5 2143.57 2305.94 2437.99 2537.11 2657.34 2685.14 2656.45 2637.55 2584.16
## 6      NA      NA      NA      NA      NA      NA      NA      NA      NA
##      1985    1986    1987    1988    1989    1990    1991    1992    1993
## 1      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 2      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 3      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 4 2607.83 2738.90 2552.18 2647.18 2629.80 2656.37 2388.87 2650.29 2855.45
## 5 2700.86 2717.66 2724.66 2795.00 2862.84 2856.21 2835.47 2972.77 2971.61
## 6      NA      NA      NA      NA      NA      NA      NA      NA      NA
##      1994    1995    1996    1997    1998    1999    2000    2001    2002
## 1      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 2      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 3      NA      NA      NA      NA      NA      NA      NA      NA      NA
## 4 2938.79 2825.83 2979.30 2746.72 2790.70 2891.76 2832.07 2861.18 2864.93
## 5 2865.20 2891.30 2893.72 2844.07 2904.07 2957.91 2928.84 3003.63 3034.33
## 6      NA      NA      NA      NA      NA      NA      NA      NA      NA
##      2003    2004    2005    2006    2007
## 1      NA      NA      NA      NA      NA
## 2      NA      NA      NA      NA      NA
## 3      NA      NA      NA      NA      NA
## 4 2838.24 2849.36 2917.08 2914.95 2879.57
## 5 3073.26 3090.13 3059.24 3101.20 3153.38
## 6      NA      NA      NA      NA      NA

diplyr and tidyr

to tansform data

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
food = tidyr::gather(food, 'Year', 
                     'kcal', na.rm = T, 2:48)
head(food)
##                        Year    kcal
## 4              Albania 1961 2233.67
## 5              Algeria 1961 1700.44
## 8               Angola 1961 1784.67
## 10 Antigua and Barbuda 1961 1971.91
## 11           Argentina 1961 3099.85
## 14           Australia 1961 3086.60
colnames(food) = c('country', 'year','kcal')

Explore

Let’s look at United States data, what is trend?

us = food[food$country=='United States',]

library(ggplot2)

range(us$year)
## [1] "1961" "2007"
# scale with range tick every 5 years

ggplot(aes(year, kcal), data=us) +
  geom_point(color='red') +
  scale_x_discrete(breaks=seq(1961, 2007, 5))

As expected it increases, but almost 3800 kcal per person?! We must be exporting most of our food right.

Are there any decreasing? Let’s check out some other countries:

food[food$kcal==min(food$kcal[food$year==2007]),]
##                country year    kcal
## 11964 Congo, Dem. Rep. 2007 1605.08

So Democratic Republic of Congo produces least amount of food (in 2007) let’s look at the plot:

congo = subset(food, food$country=='Congo, Dem. Rep.')
qplot(year, kcal, data=congo) +
  scale_x_discrete(breaks=seq(1961, 2007, 5))

Whoa! Huge drop off in the 90’s let’s check out why. DR Congo wiki Sounds like US relations post cold war contributed and Mobutu became more currupt, leading to civil war, haven’t really recovered since.

Let’s try some more interesting plots with Plotly: borrowing a dataset with country codes and GDP

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:graphics':
## 
##     layout
df <- read.csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')
str(df)
## 'data.frame':    222 obs. of  3 variables:
##  $ COUNTRY       : Factor w/ 222 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ GDP..BILLIONS.: num  21.71 13.4 227.8 0.75 4.8 ...
##  $ CODE          : Factor w/ 221 levels "ABW","AFG","AGO",..: 2 5 57 10 6 3 4 11 8 9 ...
colnames(df) = c('country', 'GDP_Billions', 'Country_Code')

set up parameters, boundaries and map options:

# light grey boundaries
l <- list(color = toRGB("grey"), width = 0.5)

# specify map projection/options
g <- list(
  showframe = FALSE,
  showcoastlines = FALSE,
  projection = list(type = 'Mercator')
)

Before plotting need to join the two datasets

x = merge(food, df, by='country', all.x = TRUE)#, all.y = TRUE)
subset(x, is.na(Country_Code) & year==2007)
##                   country year    kcal GDP_Billions Country_Code
## 365               Bahamas 2007 2712.50           NA         <NA>
## 1236           Cape Verde 2007 2571.92           NA         <NA>
## 1282 Central African Rep. 2007 1985.84           NA         <NA>
## 1563     Congo, Dem. Rep. 2007 1605.08           NA         <NA>
## 1603          Congo, Rep. 2007 2511.88           NA         <NA>
## 1830           Czech Rep. 2007 3260.34           NA         <NA>
## 2041       Dominican Rep. 2007 2295.10           NA         <NA>
## 2547               Gambia 2007 2384.59           NA         <NA>
## 3713     Korea, Dem. Rep. 2007 2087.04           NA         <NA>
## 3751          Korea, Rep. 2007 3073.71           NA         <NA>
## 4110       Macedonia, FYR 2007 3105.47           NA         <NA>
## 4706              Myanmar 2007 2464.91           NA         <NA>
## 4900 Netherlands Antilles 2007 3004.49           NA         <NA>
## 6101      Slovak Republic 2007 2892.60           NA         <NA>
## 7399   West Bank and Gaza 2007 2019.71           NA         <NA>
## 7449          Yemen, Rep. 2007 2067.57           NA         <NA>

Have a bunch of countries without country codes (‘NA’) let’s fix it:

x$Country_Code[x$country=='Congo, Rep.'] = 'COG'
x$Country_Code[x$country=='Congo, Dem. Rep.'] = 'COD'
x$Country_Code[x$country=='Central African Rep.'] = 'CAF'
x$Country_Code[x$country=='Myanmar'] = 'MMR'
x$Country_Code[x$country=='Yemen, Rep.'] = 'YEM'
x$Country_Code[x$country=='Czech Rep.'] = 'CZE'
x$Country_Code[x$country=='Slovak Republic'] = 'SVK'
x$Country_Code[x$country=='West Bank and Gaza'] = 'WBG'
x$Country_Code[x$country=='Bahamas'] = 'BHM'
x$Country_Code[x$country=='Cape Verde'] = 'CPV'
x$Country_Code[x$country=='Dominican Rep.'] = 'DOM'
x$Country_Code[x$country=='Gambia'] = 'GMB'
x$Country_Code[x$country=='Korea, Dem. Rep.'] = 'KOR'
x$Country_Code[x$country=='Korea, Rep.'] = 'PRK'
x$Country_Code[x$country=='Macedonia, FYR'] = 'MKD'
#x$Country_Code[x$country=='Netherlands Antilles'] = 'ANT'
subset(x, is.na(Country_Code) & year==2007)
##                   country year    kcal GDP_Billions Country_Code
## 4900 Netherlands Antilles 2007 3004.49           NA         <NA>

Code for ‘Netherlands and Antilles’ is invalid so we’ll leave it out Let’s check out the map:

plot_ly(x[x$year==2007,], z = kcal, text = country, locations = Country_Code, type = 'choropleth',
        color = kcal, colors = 'Purples', marker = list(line = l),
        colorbar = list(title = 'kcal/person')) %>%
  layout(title = 'food supply per person per day in kcals<br>(2007)',
         geo = g)

We have some white holes in the map. Greenland, Afghanstan, and South Sudan all stick out as missing. Can add them so hovering over them at least shows country name:

af = df[df$country=='Afghanistan',]
af$kcal = NA
af$year = 2007
x = rbind(x, af)

green = df[df$country=='Greenland',]
green$kcal=NA
green$year = 2007
x = rbind(x, green)

sud = df[df$country=='South Sudan',]
sud$kcal=NA
sud$year = 2007
x = rbind(x, sud)

Let’s check the map out again!

plot_ly(x[x$year==2007,], z = kcal, text = country, locations = Country_Code, type = 'choropleth',
        color = kcal, colors = 'Purples', marker = list(line = l),
        colorbar = list(title = 'kcal/person')) %>%
  layout(title = 'food supply per person per day in kcals<br>(2007)',
         geo = g)

There we go! That looks better. It follows what we would expect: richer countries produce more food. But most countries produce over 2000 kcals per day per person which is interesting. Let’s check out total production really quick.

mean(x$kcal[x$year==2007], na.rm = TRUE)
## [1] 2756.14

So Average kcal production per person in 2007 was 2756. Let’s plot the averages:

means = aggregate(food$kcal, by=list(food$year), FUN='mean')
colnames(means) = c('year', 'mean_kcal')
qplot(year, mean_kcal, data=means,
      main = 'mean kcal production') +
  geom_point() + 
  scale_x_discrete(breaks=seq(1961, 2007, 5))

Global production increasing linearly by about 500 kcal since 1961.

means$mean_kcal[means$year==1961]
## [1] 2245.955

and we saw before 2007 mean was about 2756. In 2007 there were ~6.6 Billion people according to the internet, so total production:

means$mean_kcal[means$year==2007]*6600000000
## [1] 1.819052e+13

that is 18.2 trillion kcals of food per DAY!