Exploring World Development Indicators

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

dat = pd.read_csv('../data/world-development-indicators/Indicators.csv')
dat.shape

(5656458, 6)

dat.head()

dat.dtypes

CountryName       object
CountryCode       object
IndicatorName     object
IndicatorCode     object
Year               int64
Value            float64
dtype: object

Let's take a look at the forest area (sq. km) indicator (AG.LND.FRST.K2). How much forest have we lost since 1960? (The rows inspected below start in 1990, so the comparisons use 1990 and 2012.)

# helper function
def filter_data(df, feat, filter0, reset_index=False):
    '''
    - Select the rows of df whose `feat` column equals `filter0`.

    params:
            df: DataFrame to filter.
            feat: name of the column to compare.
            filter0: value the column must equal.
            reset_index: (default False) renumber the result from 0 and
                         discard the old index.
    return:
            the matching rows as a DataFrame.
    '''
    matches = df.loc[df[feat] == filter0]
    return matches.reset_index(drop=True) if reset_index else matches

forest_area = filter_data(dat, 'IndicatorCode', 'AG.LND.FRST.K2', reset_index=True)
forest_area.tail(15)

us = filter_data(forest_area, 'CountryName', 'United States', True)
us

# US forest area by year, with 10,000 of padding on either end of the y-axis.
us_fig, us_ax = plt.subplots(figsize=(5, 5))
us_ax.scatter(us['Year'], us['Value'])
us_low, us_high = us['Value'].min(), us['Value'].max()
us_ax.set_ylim(us_low - 10000, us_high + 10000)

plt.show()

# Brazil's forest area by year, left on matplotlib's automatic axis limits.
brazil = filter_data(forest_area, 'CountryName', 'Brazil', reset_index=True)
brazil_fig, brazil_ax = plt.subplots(figsize=(5, 5))
brazil_ax.scatter(brazil['Year'], brazil['Value'])
plt.show()

# brazil.Value.max()-brazil.Value.min()
brazil.Value.describe()

count    2.300000e+01
mean     5.434645e+06
std      1.887975e+05
min      5.151332e+06
25%      5.272024e+06
50%      5.428532e+06
75%      5.589462e+06
max      5.748390e+06
Name: Value, dtype: float64

import plotly.tools
import plotly.plotly as py
import plotly.graph_objs as go
# plotly.tools.set_credentials_file(username=USER_NAME, api_key=API_KEY)

# Slice the forest-area rows for the two comparison years (original index kept).
forest_1990 = filter_data(forest_area, 'Year', 1990, reset_index=False)
forest_2012 = filter_data(forest_area, 'Year', 2012, reset_index=False)

# Compare how many rows each year has.
print('1990 Countries: ', len(forest_1990))
print('2012 Countries: ', len(forest_2012))
## hmm

1990 Countries:  239
2012 Countries:  238

# List every name that has a 1990 row but no 2012 row.
# `x in Series` tests the index labels, not the values, so membership is
# checked against an explicit set of the 2012 names instead.
names_2012 = set(forest_2012.CountryName)
for country in forest_1990.CountryName:
    if country not in names_2012:
        print(country)

Arab World

forest_1990 = forest_1990[forest_1990.CountryName != 'Arab World']

print('1990 Countries: ', forest_1990.shape[0])

1990 Countries:  238

# Left-join the 1990 rows to the 2012 rows on the identifying columns. Every 1990 row is
# kept; the overlapping Year/Value columns receive the '1990'/'2012' suffixes.
forest_meg = pd.merge(forest_1990, forest_2012, how='left', on=['CountryName', 'CountryCode', 'IndicatorName', 'IndicatorCode'],
                     suffixes=('1990','2012'))

forest_meg.head()

# Positive = forest area gained between 1990 and 2012, negative = lost.
forest_meg['DifferenceValue'] = forest_meg['Value2012'] - forest_meg['Value1990']

forest_meg.head()

?pd.merge

forest_meg[:50]

It looks like rows 0 through 31 are aggregates, so I will filter these out for plotting. They are interesting, though — I will explore them next!

# #from demo: https://plot.ly/python/cmocean-colorscales/#chlorophyll
# import cmocean

# def cmocean_to_plotly(cmap, pl_entries):
#     h = 1.0/(pl_entries-1)
#     pl_colorscale = []
    
#     for k in range(pl_entries):
#         C = list(map(np.uint8, np.array(cmap(k*h)[:3])*255))
#         pl_colorscale.append([k*h, 'rgb'+str((C[0], C[1], C[2]))])
        
#     return pl_colorscale

# chlorophyll = cmocean_to_plotly(cmocean.cm.algae, 20000)
# World choropleth of the change in forest area (Value2012 - Value1990),
# keyed by country code.
trace = dict(
    type='choropleth',
    locations=forest_meg['CountryCode'],
    z=forest_meg['DifferenceValue'],
    text=forest_meg['CountryName'],
    # Ten-step diverging scale: dark red at the low end, blue at the high end.
    colorscale=[
        [0.0, 'rgb(165,0,38)'], [0.1111111111111111, 'rgb(215,48,39)'],
        [0.2222222222222222, 'rgb(244,109,67)'], [0.3333333333333333, 'rgb(253,174,97)'],
        [0.4444444444444444, 'rgb(254,224,144)'], [0.5555555555555556, 'rgb(224,243,248)'],
        [0.6666666666666666, 'rgb(171,217,233)'], [0.7777777777777778, 'rgb(116,173,209)'],
        [0.8888888888888888, 'rgb(69,117,180)'], [1.0, 'rgb(49,54,149)'],
    ],
    autocolorscale=False,
    reversescale=False,
    marker=dict(line=dict(color='rgb(180,180,180)', width=0.5)),
    colorbar=dict(autotick=True, title='Gain / Loss'),
    # The colour range is fixed by hand from rows 32 onward, so the leading
    # aggregate rows do not stretch it; 1000 of headroom is added on each side.
    zauto=False,
    zmin=forest_meg[32:]['DifferenceValue'].min() - 1000,
    zmax=forest_meg[32:]['DifferenceValue'].max() + 1000,
)
data = [trace]

layout = dict(
    # The plotted value is the 2012 figure minus the 1990 figure, so the title
    # names that span.
    title='Forest Area Gain/Loss, 1990-2012',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection=dict(type='Mercator'),
    ),
)
# py.iplot(colorscale_plot(colorscale=chlorophyll, title='Chlorophyll'))

fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False, filename='forest gain-world-map')

Brazil and China are clearly outliers here. I knew about Brazil's deforestation, but I did not realize the scale of China's reforestation efforts — wow. Let's remove these two to get a better view of the rest of the world.

# Dropping the Brazil and China rows outright puts holes in the map, so keep
# the rows and blank out their values instead. Rows before 32 are skipped.
forest_meg2 = forest_meg.loc[32:, :].reset_index(drop=True)
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
forest_meg2.loc[forest_meg2.CountryName.isin(['Brazil', 'China']), 'DifferenceValue'] = np.nan

# forest_meg2[20:].head(20)

## converting hex to rgb: int('b4', 16) ==> 180
import re
colors = '#a52a2a #a23029 #9f3628 #9c3b28 #994027 #954426 #934725 #8f4b24 #8b4f23 #885222 #845521 #815820 #7d5b1f #7b5d1e #75601c #71631b #6b6619 #696819 #636a17 #5f6d16 #5b6f14 #567013 #507311 #4a740f #44760d #3c790b #337a08 #287d05 #187e02 #008000'.split()
# Each '#rrggbb' becomes 'rgb(r,g,b)' by reading its three two-digit hex pairs.
colors_rgb = [
    'rgb(' + ','.join(str(int(pair, 16)) for pair in re.findall('..', h.lstrip('#'))) + ')'
    for h in colors
]
# A colorscale is a list of [position, colour] pairs with positions from 0 to 1,
# so spread the colours evenly: the first colour sits at 0.0 and the last at 1.0.
c_scale = [[i / (len(colors_rgb) - 1), rgb] for i, rgb in enumerate(colors_rgb)]
c_scale

[[0.0, 0],
 [0.034482758620689655, 'rgb(165,42,42)'],
 [0.06896551724137931, 'rgb(162,48,41)'],
 [0.10344827586206896, 'rgb(159,54,40)'],
 [0.13793103448275862, 'rgb(156,59,40)'],
 [0.1724137931034483, 'rgb(153,64,39)'],
 [0.20689655172413793, 'rgb(149,68,38)'],
 [0.2413793103448276, 'rgb(147,71,37)'],
 [0.27586206896551724, 'rgb(143,75,36)'],
 [0.3103448275862069, 'rgb(139,79,35)'],
 [0.3448275862068966, 'rgb(136,82,34)'],
 [0.3793103448275862, 'rgb(132,85,33)'],
 [0.41379310344827586, 'rgb(129,88,32)'],
 [0.4482758620689655, 'rgb(125,91,31)'],
 [0.4827586206896552, 'rgb(123,93,30)'],
 [0.5172413793103449, 'rgb(117,96,28)'],
 [0.5517241379310345, 'rgb(113,99,27)'],
 [0.5862068965517241, 'rgb(107,102,25)'],
 [0.6206896551724138, 'rgb(105,104,25)'],
 [0.6551724137931034, 'rgb(99,106,23)'],
 [0.6896551724137931, 'rgb(95,109,22)'],
 [0.7241379310344828, 'rgb(91,111,20)'],
 [0.7586206896551724, 'rgb(86,112,19)'],
 [0.7931034482758621, 'rgb(80,115,17)'],
 [0.8275862068965517, 'rgb(74,116,15)'],
 [0.8620689655172413, 'rgb(68,118,13)'],
 [0.896551724137931, 'rgb(60,121,11)'],
 [0.9310344827586207, 'rgb(51,122,8)'],
 [0.9655172413793104, 'rgb(40,125,5)'],
 [1.0, 'rgb(24,126,2)']]

# chlorophyll = cmocean_to_plotly(cmocean.cm.algae, 20000)
# colors = '#a52a2a #a23029 #9f3628 #9c3b28 #994027 #954426 #934725 #8f4b24 #8b4f23 #885222 #845521 #815820 #7d5b1f #7b5d1e #75601c #71631b #6b6619 #696819 #636a17 #5f6d16 #5b6f14 #567013 #507311 #4a740f #44760d #3c790b #337a08 #287d05 #187e02 #008000'.split()
# Rebuild the custom scale: one [position, colour] pair per entry of colors_rgb,
# evenly spaced from 0.0 to 1.0, so no colour is dropped and every entry
# carries a colour string.
c_scale = [[i / (len(colors_rgb) - 1), rgb] for i, rgb in enumerate(colors_rgb)]

# Second pass at the map: same data, but with the built-in 'Greens' scale
# (reversed) and a colour range clamped so the extreme rows no longer dominate.
trace = dict(
    type='choropleth',
    locations=forest_meg['CountryCode'],
    z=forest_meg['DifferenceValue'],
    text=forest_meg['CountryName'],
    colorscale='Greens',
    autocolorscale=False,
    reversescale=True,
    marker=dict(line=dict(color='rgb(180,180,180)', width=0.5)),
    colorbar=dict(autotick=True, title='Gain / Loss'),
    zauto=False,
    # Lower bound: smallest change from row 32 onward, ignoring Indonesia and Brazil.
    zmin=forest_meg[32:].loc[
        (forest_meg.CountryName != 'Indonesia') & (forest_meg.CountryName != 'Brazil'),
        'DifferenceValue',
    ].min(),
    # Upper bound: largest change from row 32 onward, ignoring the United States
    # and China, plus 25000 of headroom.
    zmax=forest_meg[32:].loc[
        (forest_meg.CountryName != 'United States') & (forest_meg.CountryName != 'China'),
        'DifferenceValue',
    ].max() + 25000,
)
data = [trace]

layout = dict(
    # The plotted value is the 2012 figure minus the 1990 figure, so the title
    # names that span.
    title='Forest Area Gain/Loss, 1990-2012',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection=dict(type='Mercator'),
    ),
)

# py.iplot(colorscale_plot(colorscale=chlorophyll, title='Chlorophyll'))

fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False, filename='forest gain-world-map without')

forest_meg[forest_meg.DifferenceValue == -2235712.5]['CountryName']

16    Low & middle income
Name: CountryName, dtype: object

forest_meg[forest_meg.DifferenceValue == forest_meg[32:]['DifferenceValue'].min()]['CountryName']

58    Brazil
Name: CountryName, dtype: object

forest_meg[forest_meg.DifferenceValue == forest_meg[32:]['DifferenceValue'].max()]['CountryName']

72    China
Name: CountryName, dtype: object

forest_meg2.loc[forest_meg2.CountryName!='Indonesia','DifferenceValue'].min()

-90122.0

forest_meg.loc[(forest_meg.CountryName!='United States')&(forest_meg.CountryName!='China'),'DifferenceValue'].max()+1000

191834.80000000075

def ind_byYear(df, indicator_code, years=(1960,), country=None, as_df=False):
    '''
    - Get Indicator for given year(s) and Country

    params:
            df: DataFrame with 'Year', 'IndicatorCode', 'CountryName' and
                'Value' columns.
            indicator_code: indicator code to filter on (matched against the
                            'IndicatorCode' column).
            years: (default 1960) iterable of years to filter on.
            country: (default None) name of the country to filter on; a falsy
                     value means no country filter.
            as_df: (default False) return as a DataFrame, default return is
                   a dictionary of values.
    return:
            if as_df is True, the filtered rows as a DataFrame with a fresh
            0..n-1 index; otherwise a {year: value} dictionary built from the
            filtered rows. Years with no matching row are simply absent.
    raises:
            ValueError: when a dictionary is requested but the filtered rows
                        contain the same year more than once (for example
                        several countries), since one value per year cannot
                        represent them.
    '''
    wanted_years = set(years)
    # Build one combined mask on df itself. Filtering in two chained steps
    # applies a full-length mask to an already-filtered frame, which makes
    # pandas reindex the mask and emit a UserWarning.
    mask = df['Year'].isin(wanted_years) & (df['IndicatorCode'] == indicator_code)
    if country:
        mask = mask & (df['CountryName'] == country)
    d = df[mask].reset_index(drop=True)
    if as_df:
        return d
    if d['Year'].duplicated().any():
        raise ValueError(
            'several rows share the same year; pass country=... or as_df=True'
        )
    # Take each key from the row's own Year so a value can never be paired
    # with the wrong year (set iteration order is unrelated to row order).
    return dict(zip(d['Year'].tolist(), d['Value'].values))

# filt = dat['Year'].map(lambda x: x in set([1960]))
# dat[filt][].head()
import warnings
# Ignore every UserWarning raised from here on, not just one specific message.
# NOTE(review): presumably added to hide pandas' "Boolean Series key will be
# reindexed" warning — confirm, since unrelated UserWarnings are hidden too.
warnings.filterwarnings('ignore', category=UserWarning)
# ?warnings.filterwarnings
# ind_byYear(dat, 'SP.ADO.TFRT', years=list(range(1960,1965)), country='United States', as_df=True)

	CountryName	CountryCode	IndicatorName	IndicatorCode	Year	Value
0	Arab World	ARB	Adolescent fertility rate (births per 1,000 wo...	SP.ADO.TFRT	1960	1.335609e+02
1	Arab World	ARB	Age dependency ratio (% of working-age populat...	SP.POP.DPND	1960	8.779760e+01
2	Arab World	ARB	Age dependency ratio, old (% of working-age po...	SP.POP.DPND.OL	1960	6.634579e+00
3	Arab World	ARB	Age dependency ratio, young (% of working-age ...	SP.POP.DPND.YG	1960	8.102333e+01
4	Arab World	ARB	Arms exports (SIPRI trend indicator values)	MS.MIL.XPRT.KD	1960	3.000000e+06

	CountryName	CountryCode	IndicatorName	IndicatorCode	Year	Value
5416	Uganda	UGA	Forest area (sq. km)	AG.LND.FRST.K2	2012	28116.0
5417	Ukraine	UKR	Forest area (sq. km)	AG.LND.FRST.K2	2012	97570.0
5418	United Arab Emirates	ARE	Forest area (sq. km)	AG.LND.FRST.K2	2012	3194.2
5419	United Kingdom	GBR	Forest area (sq. km)	AG.LND.FRST.K2	2012	28954.0
5420	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2012	3047876.0
5421	Uruguay	URY	Forest area (sq. km)	AG.LND.FRST.K2	2012	18336.0
5422	Uzbekistan	UZB	Forest area (sq. km)	AG.LND.FRST.K2	2012	32677.0
5423	Vanuatu	VUT	Forest area (sq. km)	AG.LND.FRST.K2	2012	4400.0
5424	Venezuela, RB	VEN	Forest area (sq. km)	AG.LND.FRST.K2	2012	456998.0
5425	Vietnam	VNM	Forest area (sq. km)	AG.LND.FRST.K2	2012	140850.0
5426	Virgin Islands (U.S.)	VIR	Forest area (sq. km)	AG.LND.FRST.K2	2012	199.0
5427	West Bank and Gaza	WBG	Forest area (sq. km)	AG.LND.FRST.K2	2012	91.7
5428	Yemen, Rep.	YEM	Forest area (sq. km)	AG.LND.FRST.K2	2012	5490.0
5429	Zambia	ZMB	Forest area (sq. km)	AG.LND.FRST.K2	2012	491348.0
5430	Zimbabwe	ZWE	Forest area (sq. km)	AG.LND.FRST.K2	2012	149700.0

	CountryName	CountryCode	IndicatorName	IndicatorCode	Year	Value
0	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	1990	2963350.0
1	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	1991	2967210.0
2	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	1992	2971070.0
3	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	1993	2974930.0
4	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	1994	2978790.0
5	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	1995	2982650.0
6	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	1996	2986510.0
7	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	1997	2990370.0
8	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	1998	2994230.0
9	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	1999	2998090.0
10	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2000	3001950.0
11	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2001	3005776.0
12	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2002	3009602.0
13	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2003	3013428.0
14	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2004	3017254.0
15	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2005	3021080.0
16	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2006	3024908.0
17	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2007	3028736.0
18	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2008	3032564.0
19	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2009	3036392.0
20	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2010	3040220.0
21	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2011	3044048.0
22	United States	USA	Forest area (sq. km)	AG.LND.FRST.K2	2012	3047876.0

	CountryName	CountryCode	IndicatorName	IndicatorCode	Year1990	Value1990	Year2012	Value2012
0	Caribbean small states	CSS	Forest area (sq. km)	AG.LND.FRST.K2	1990	328330.5	2012.0	325699.3
1	Central Europe and the Baltics	CEB	Forest area (sq. km)	AG.LND.FRST.K2	1990	351770.0	2012.0	376490.0
2	East Asia & Pacific (all income levels)	EAS	Forest area (sq. km)	AG.LND.FRST.K2	1990	6550932.2	2012.0	6608271.6
3	East Asia & Pacific (developing only)	EAP	Forest area (sq. km)	AG.LND.FRST.K2	1990	4601840.4	2012.0	4724789.9
4	Euro area	EMU	Forest area (sq. km)	AG.LND.FRST.K2	1990	910298.0	2012.0	1014379.6

	CountryName	CountryCode	IndicatorName	IndicatorCode	Year1990	Value1990	Year2012	Value2012	DifferenceValue
0	Caribbean small states	CSS	Forest area (sq. km)	AG.LND.FRST.K2	1990	328330.5	2012.0	325699.3	-2631.2
1	Central Europe and the Baltics	CEB	Forest area (sq. km)	AG.LND.FRST.K2	1990	351770.0	2012.0	376490.0	24720.0
2	East Asia & Pacific (all income levels)	EAS	Forest area (sq. km)	AG.LND.FRST.K2	1990	6550932.2	2012.0	6608271.6	57339.4
3	East Asia & Pacific (developing only)	EAP	Forest area (sq. km)	AG.LND.FRST.K2	1990	4601840.4	2012.0	4724789.9	122949.5
4	Euro area	EMU	Forest area (sq. km)	AG.LND.FRST.K2	1990	910298.0	2012.0	1014379.6	104081.6