# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 14:49:47 2016

@author: Dr. Z
"""

# MAIN METHOD

##Main analysis
#Initialization
import pandas
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
# Load the Gapminder data set (the variant that carries a 'hemisphere' column).
data = pandas.read_csv('gapminder_with_hemispheres.csv', low_memory=False)

# Convert relevant variables to be numeric.
# Series.convert_objects() was removed in pandas 1.0; pandas.to_numeric() with
# errors="coerce" is its replacement and turns any non-numeric entry (such as
# a blank " " cell) into NaN.
data["incomeperperson"] = pandas.to_numeric(data["incomeperperson"], errors="coerce")
data["urbanrate"] = pandas.to_numeric(data["urbanrate"], errors="coerce")

# Handle missing data.
# Take an explicit copy of the relevant columns so that the assignment below
# modifies an independent frame instead of a slice of `data`
# (avoids SettingWithCopyWarning).
subframe = data[["incomeperperson", "urbanrate", "hemisphere"]].copy()
# The numeric columns already hold NaN for blank cells after the coercion
# above, so only the hemisphere placeholder still needs to be mapped to NaN.
subframe["hemisphere"] = subframe["hemisphere"].replace("NA", np.nan)
# Keep only the rows where all three variables are present.
clean_data = subframe.dropna()

# Split the cleaned data into one partition per value of the hemisphere
# moderator variable.
_north_mask = clean_data['hemisphere'] == "North"
_south_mask = clean_data['hemisphere'] == "South"
sub1 = clean_data.loc[_north_mask]
sub2 = clean_data.loc[_south_mask]

# Report the Pearson correlation (coefficient and p-value) between income per
# person and urban rate for each partition, each followed by a blank line.
_reports = (
    ('association between Income per Person and Urban Rate for North hemisphere countries', sub1),
    ('association between Income per Person and Urban Rate for South hemisphere countries', sub2),
)
for _heading, _partition in _reports:
    print(_heading)
    _result = scipy.stats.pearsonr(_partition['incomeperperson'], _partition['urbanrate'])
    print(_result)
    print('')

# Draw one scatterplot per hemisphere partition: income per person on the
# x-axis against urban rate on the y-axis. Each figure is shown before the
# next one is built.
_figures = (
    ('Scatterplot for the Association Between Income per Person and Urban Rate for NORTH hemisphere countries', sub1),
    ('Scatterplot for the Association Between Income per Person and Urban Rate for SOUTH hemisphere countries', sub2),
)
for _title, _frame in _figures:
    plt.xlabel('Income per Person')
    plt.ylabel('Urban Rate')
    plt.title(_title)
    _x_values = _frame["incomeperperson"]
    _y_values = _frame["urbanrate"]
    plt.scatter(_x_values, _y_values)
    plt.show()