import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the World Development Indicators table; the path is relative to the
# directory the notebook runs from.
dat = pd.read_csv('../data/world-development-indicators/Indicators.csv')
# Quick look at the table: dimensions, first rows and column dtypes.
dat.shape
dat.head()
dat.dtypes
Let's take a look at the forest area (sq. km) indicator (AG.LND.FRST.K2). How much forest have we lost since 1960?
# helper function
def filter_data(df, feat, filter0, reset_index=False):
    '''
    - Select the rows of a DataFrame whose column equals a given value
    params:
        df: DataFrame to filter.
        feat: name of the column to compare.
        filter0: value the column must equal for a row to be kept.
        reset_index: (default False) renumber the result's index from 0,
                     discarding the original index labels.
    return:
        DataFrame holding only the matching rows; df itself is not modified.
    '''
    d = df[df[feat] == filter0]
    if reset_index:
        # Rebind instead of resetting in place, so the filtered selection
        # is never mutated.
        d = d.reset_index(drop=True)
    return d
# Keep only the forest-area indicator rows, with the index renumbered from 0.
forest_area = filter_data(
    dat, 'IndicatorCode', 'AG.LND.FRST.K2', reset_index=True
)
forest_area.tail(15)

# United States: forest area by year, with the y-axis padded by 10000 on each
# side of the observed range.
us = filter_data(forest_area, 'CountryName', 'United States', True)
us
plt.figure(figsize=(5, 5))
plt.scatter(us['Year'], us['Value'])
plt.ylim(us['Value'].min() - 10000, us['Value'].max() + 10000)
plt.show()

# Brazil: same scatter plot, default axis limits.
brazil = filter_data(forest_area, 'CountryName', 'Brazil', True)
plt.figure(figsize=(5, 5))
plt.scatter(brazil['Year'], brazil['Value'])
plt.show()
# brazil.Value.max()-brazil.Value.min()
brazil['Value'].describe()
import plotly.tools
import plotly.plotly as py
import plotly.graph_objs as go
# plotly.tools.set_credentials_file(username=USER_NAME, api_key=API_KEY)
# Forest-area rows for the two years being compared.
forest_1990 = filter_data(forest_area, 'Year', 1990)
forest_2012 = filter_data(forest_area, 'Year', 2012)
print('1990 Countries: ', forest_1990.shape[0])
print('2012 Countries: ', forest_2012.shape[0])

# List every entry that has a 1990 row but no 2012 row. The names are put in
# a set because `in` on a pandas Series tests the index labels, not the values.
countries_2012 = set(forest_2012.CountryName)
for country in forest_1990.CountryName:
    if country not in countries_2012:
        print(country)

# Drop the 'Arab World' row from the 1990 data before merging.
forest_1990 = forest_1990[forest_1990.CountryName != 'Arab World']
print('1990 Countries: ', forest_1990.shape[0])
forest_meg = pd.merge(forest_1990, forest_2012, how='left', on=['CountryName', 'CountryCode', 'IndicatorName', 'IndicatorCode'],
suffixes=('1990','2012'))
forest_meg.head()
forest_meg['DifferenceValue'] = forest_meg['Value2012'] - forest_meg['Value1990']
forest_meg.head()
?pd.merge
forest_meg[:50]
Looks like rows 0 through 31 are aggregates; I will filter these out for plotting. They are interesting, though. Will explore next!
# #from demo: https://plot.ly/python/cmocean-colorscales/#chlorophyll
# import cmocean
# def cmocean_to_plotly(cmap, pl_entries):
# h = 1.0/(pl_entries-1)
# pl_colorscale = []
# for k in range(pl_entries):
# C = list(map(np.uint8, np.array(cmap(k*h)[:3])*255))
# pl_colorscale.append([k*h, 'rgb'+str((C[0], C[1], C[2]))])
# return pl_colorscale
# chlorophyll = cmocean_to_plotly(cmocean.cm.algae, 20000)
# World choropleth of the change in forest area per country between 1990 and
# 2012 (DifferenceValue = Value2012 - Value1990).
trace = dict(
    type='choropleth',
    locations=forest_meg['CountryCode'],
    z=forest_meg['DifferenceValue'],
    text=forest_meg['CountryName'],
    # Ten-step diverging scale running from red through yellow to blue.
    colorscale=[
        [0.0, 'rgb(165,0,38)'], [0.1111111111111111, 'rgb(215,48,39)'],
        [0.2222222222222222, 'rgb(244,109,67)'], [0.3333333333333333, 'rgb(253,174,97)'],
        [0.4444444444444444, 'rgb(254,224,144)'], [0.5555555555555556, 'rgb(224,243,248)'],
        [0.6666666666666666, 'rgb(171,217,233)'], [0.7777777777777778, 'rgb(116,173,209)'],
        [0.8888888888888888, 'rgb(69,117,180)'], [1.0, 'rgb(49,54,149)'],
    ],
    autocolorscale=False,
    reversescale=False,
    marker=dict(line=dict(color='rgb(180,180,180)', width=0.5)),
    colorbar=dict(autotick=True, title='Gain / Loss'),
    # Set the colour range by hand, padded by 1000 on each side. Rows 0-31 are
    # skipped because they look like aggregates rather than single countries.
    zauto=False,
    zmin=forest_meg[32:]['DifferenceValue'].min() - 1000,
    zmax=forest_meg[32:]['DifferenceValue'].max() + 1000,
)
data = [trace]
layout = dict(
    # The plotted value is the 1990 -> 2012 difference, so the title names
    # that period.
    title='Forest Area Gain/Loss, 1990-2012',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection=dict(type='Mercator'),
    ),
)
fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False, filename='forest gain-world-map')
Brazil and China are clearly outliers here. I knew about Brazil's deforestation, but did not realize the extent of China's reforestation efforts, wow. Let's remove these two to get a better view of the rest of the world.
# Copy of the merged table from row label 32 onward (skipping the leading
# aggregate rows), re-indexed from 0. Brazil and China keep their rows but get
# a missing DifferenceValue: dropping the rows outright puts holes in the map.
forest_meg2 = forest_meg.loc[32:, :].reset_index(drop=True)
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
forest_meg2.loc[
    forest_meg2.CountryName.isin(['Brazil', 'China']), 'DifferenceValue'
] = np.nan
# forest_meg2[20:].head(20)
## converting hex to rgb: int('b4', 16) ==> 180
import re

# Palette of 30 hex colours, from #a52a2a to #008000.
colors = '#a52a2a #a23029 #9f3628 #9c3b28 #994027 #954426 #934725 #8f4b24 #8b4f23 #885222 #845521 #815820 #7d5b1f #7b5d1e #75601c #71631b #6b6619 #696819 #636a17 #5f6d16 #5b6f14 #567013 #507311 #4a740f #44760d #3c790b #337a08 #287d05 #187e02 #008000'.split()

# Convert each '#rrggbb' string to 'rgb(r,g,b)': split the hex digits into
# two-character pairs and parse each pair as a base-16 integer.
colors_rgb = [
    'rgb(' + ','.join(str(int(pair, 16)) for pair in re.findall(r'..', h.lstrip('#'))) + ')'
    for h in colors
]
# colors_rgb = ['rgb(str(for h in colors for i in re.findall('..', h.lstrip('#'))]
# re.findall('..',hex)
# ?hex
# Build a colorscale as [fraction, colour] pairs, with fractions evenly spaced
# from 0.0 (first colour) to 1.0 (last colour). Pairing colour i with fraction
# i/(n-1) keeps the first colour at 0.0 and includes the final colour; no
# placeholder entry is needed.
c_scale = [
    [i / (len(colors_rgb) - 1), color] for i, color in enumerate(colors_rgb)
]
c_scale
# Second world choropleth of the 1990 -> 2012 forest-area change. All rows are
# still plotted; only the colour range is narrowed so the extreme countries no
# longer dominate the scale.
trace = dict(
    type='choropleth',
    locations=forest_meg['CountryCode'],
    z=forest_meg['DifferenceValue'],
    text=forest_meg['CountryName'],
    colorscale='Greens',
    autocolorscale=False,
    reversescale=True,
    marker=dict(line=dict(color='rgb(180,180,180)', width=0.5)),
    colorbar=dict(autotick=True, title='Gain / Loss'),
    zauto=False,
    # Lower bound: smallest difference from row 32 onward, ignoring Indonesia
    # and Brazil.
    zmin=forest_meg[32:].loc[
        (forest_meg.CountryName != 'Indonesia') & (forest_meg.CountryName != 'Brazil'),
        'DifferenceValue',
    ].min(),
    # Upper bound: largest difference from row 32 onward, ignoring the United
    # States and China, plus 25000 of headroom.
    zmax=forest_meg[32:].loc[
        (forest_meg.CountryName != 'United States') & (forest_meg.CountryName != 'China'),
        'DifferenceValue',
    ].max() + 25000,
)
data = [trace]
layout = dict(
    # The plotted value is the 1990 -> 2012 difference, so the title names
    # that period.
    title='Forest Area Gain/Loss, 1990-2012',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection=dict(type='Mercator'),
    ),
)
fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False, filename='forest gain-world-map without')
# Which row has this specific difference value?
forest_meg.loc[forest_meg['DifferenceValue'].eq(-2235712.5), 'CountryName']
# Rows holding the smallest and largest difference found from row 32 onward.
forest_meg.loc[
    forest_meg['DifferenceValue'].eq(forest_meg['DifferenceValue'].iloc[32:].min()),
    'CountryName',
]
forest_meg.loc[
    forest_meg['DifferenceValue'].eq(forest_meg['DifferenceValue'].iloc[32:].max()),
    'CountryName',
]
# Smallest difference once Indonesia is left out (forest_meg2 table).
forest_meg2.loc[forest_meg2['CountryName'].ne('Indonesia'), 'DifferenceValue'].min()
# Largest difference once the United States and China are left out, plus 1000.
1000 + forest_meg.loc[
    ~forest_meg['CountryName'].isin(['United States', 'China']), 'DifferenceValue'
].max()
def ind_byYear(df, indicator_code, years=(1960,), country=None, as_df=False):
    '''
    - Get Indicator for given year(s) and Country
    params:
        df: DataFrame with 'Year', 'IndicatorCode', 'CountryName' and 'Value' columns.
        indicator_code: View top of notebook to get description.
        years: (default 1960) iterable of years to filter on.
        country: (default None) name of the country to filter on; a falsy
                 value keeps every country.
        as_df: (default False) return as a DataFrame, default return is dictionary of values.
    return:
        if as_df is True, a Pandas DataFrame filtered on indicator code, years
        and countryName (index renumbered from 0); otherwise a dictionary
        mapping each matching row's year to its value.
    raises:
        ValueError: a dictionary was requested but some year matches more than
        one row (e.g. no country given); use as_df=True in that case.
    '''
    years = set(years)
    # One combined mask avoids chained boolean indexing, which makes pandas
    # re-align the second mask and emit a UserWarning.
    mask = df['Year'].isin(years) & (df['IndicatorCode'] == indicator_code)
    if country:
        mask = mask & (df['CountryName'] == country)
    d = df[mask].reset_index(drop=True)
    if as_df:
        return d
    if d['Year'].duplicated().any():
        raise ValueError(
            'several rows share the same year; pass a country or use as_df=True'
        )
    # Take each key from the row's own Year so years and values stay aligned,
    # whatever order the rows (or the set of years) come in.
    return dict(zip(d['Year'].tolist(), d['Value'].values))
# filt = dat['Year'].map(lambda x: x in set([1960]))
# dat[filt][].head()
import warnings
# Ignore every UserWarning from this point on.
# NOTE(review): presumably added to quiet a pandas warning raised elsewhere in
# this notebook - confirm, since a blanket filter also hides unrelated
# UserWarnings.
warnings.filterwarnings('ignore', category=UserWarning)
# ?warnings.filterwarnings
# ind_byYear(dat, 'SP.ADO.TFRT', years=list(range(1960,1965)), country='United States', as_df=True)