Script to Clean Gapminder Data

clean_gapminder.py
""" Gapminder data cleaning

This script:
- tidies and merges GDP, population and life expectancy data from Gapminder
- converts string numbers e.g. '10M' to floats
- converts population and year to integers
- obtains continent information for each country from the pycountry module

The resultant DataFrame is saved as 'gapminder_clean.csv' in the args.data directory. 


Data files:
-------------------
- `gdp_pcap.xlsx` 
Gross domestic product per capita, accessed from https://www.gapminder.org/data/documentation/gd001/

- `lex.xlsx` 
Life expectancy data, accessed from https://www.gapminder.org/data/documentation/gd004/

- `pop.xlsx` 
Population data, accessed from https://www.gapminder.org/data/documentation/gd003/


Available functions
-------------------
* load_tidy_merge: Loads data, converts data to tidy format and merges into one pd.DataFrame
* filter_between_years: Keeps rows of a pd.DataFrame between start year and end year
* format_numbers: Changes string numbers e.g. '10M' to floats for columns ['gdp', 'pop'], converts 'pop' and 'year' to int64
* get_continents: Creates DataFrame with new 'continent' column using pycountry module

Requirements
------------
* pandas module
* pycountry module
* argparse module
"""
import pycountry
import pycountry_convert as pc
import pandas as pd
import warnings
import argparse

def main(args):

    gapminder = load_tidy_merge(args.data)
    gapminder = filter_between_years(gapminder, 1950, 2019)
    gapminder = format_numbers(gapminder)
    gapminder = get_continents(gapminder, args.log)

    gapminder.to_csv(args.data + 'gapminder_clean.csv', index=False)

def load_tidy_merge(data_path):
    """Loads data, converts data to tidy format and merges into one dataset

    Parameters
    ----------
    data_path : str
        folder where data is stored

    Returns
    -------
    pd.DataFrame
        a Pandas DataFrame in tidy format
    """

    gdp = pd.read_excel(data_path + 'gdp_pcap.xlsx')
    pop = pd.read_excel(data_path + 'pop.xlsx')
    lex = pd.read_excel(data_path + 'lex.xlsx')

    gdp = gdp.melt(id_vars='country',
        var_name='year',
        value_name='gdp')

    pop = pop.melt(id_vars = 'country',
            var_name = 'year',
            value_name = 'pop')

    lex = lex.melt(id_vars = 'country',
            var_name = 'year',
            value_name = 'lex')

    gapminder = gdp.merge(pop, on=['country', 'year']).merge(lex, on=['country', 'year'])

    return gapminder


def filter_between_years(df, start, end):
    """Keeps rows of a pd.DataFrame between start year and end year

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to filter rows of
    start : int
        Starting year (inclusive)
    end : int
        Ending year (inclusive)

    Returns
    -------
    pd.DataFrame
        a pd.DataFrame with rows only between start year and end year
    """

    keep = (df['year'] >= start) & (df['year'] <= end)
    df = df[keep]
    df = df.reset_index().drop(columns='index')

    return df

def format_numbers(df):
    """Changes string numbers e.g. '10M' to floats for columns ['gdp', 'pop'], converts 'pop' and 'year' to int64

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to filter rows of

    Returns
    -------
    pd.DataFrame
        a pd.DataFrame with cleaned numbers
    """

    df[['gdp', 'pop']] = df[['gdp', 'pop']].map(number_to_float)

    df['pop'] = df['pop'].astype('int64')
    df['year'] = df['year'].astype('int64')

    return df

def number_to_float(x):
    if isinstance(x, str):
        if 'k' in x:
            return float(x.replace('k', '')) * 1_000
        elif 'M' in x:
            return float(x.replace('M', '')) * 1_000_000
        elif 'B' in x:
            return float(x.replace('B', '')) * 1_000_000_000
        else:
            try:
                return float(x)  
            except ValueError:
                warnings.warn(f"{x} not converted to float - returning None")
                return None
    else:
        return x 

    
def get_continents(df, log_path):
    """Creates DataFrame with new 'continent' column using pycountry module

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with 'country' column 

    Returns
    -------
    pd.DataFrame
        input pd.DataFrame with 'continent' column added
    """

    df_country = df[['country']].drop_duplicates()

    # hardcoded countries to rename
    rename_countries = {'UAE': 'United Arab Emirates',
            "Cote d'Ivoire": "Côte d'Ivoire",
            'Congo, Dem. Rep.': 'Congo, The Democratic Republic of the',
            'Congo, Rep.': 'Congo',
            'Micronesia, Fed. Sts.': 'Micronesia, Federated States of',
            'UK': 'United Kingdom',
            'Hong Kong, China': 'Hong Kong',
            'Lao': 'Laos',
            'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines'}

    df_country['country'] = df_country['country'].replace(rename_countries)

    # get country codes

    df_country['country_code'] = pd.NA

    for c in df_country['country']:
        try:
            df_country.loc[df_country['country']==c, 'country_code'] = pc.country_name_to_country_alpha2(c)
        except:
            warnings.warn(f"{c} not mapped to country code - no continent will be mapped")
            with open(log_path + 'country_exceptions.txt', 'a') as f:
                f.write(f"{c}\n")
        
    # get continents
    df_country['continent'] = pd.NA
    df_country.loc[df_country['country_code']=='TL', 'continent'] = 'Asia'

    for c in df_country['country_code']:
        if df_country.loc[df_country['country_code']==c, 'continent'].isna().any():
            try:
                cc = pc.country_alpha2_to_continent_code(c)
                df_country.loc[df_country['country_code']==c, 'continent'] = pc.convert_continent_code_to_continent_name(cc)
            except:
                warnings.warn(f"{c} not mapped to continent")
                with open(log_path + 'continent_exceptions.txt', 'a') as f:
                    f.write(f"{c}\n")

    df = df.merge(df_country, on='country')
    
    return df
    

if __name__ == "__main__":

    parser = argparse.ArgumentParser(description='Clean Gapminder data')

    parser.add_argument('--data', type=str, default='../data/',
                        help='path to data folder')
    parser.add_argument('--log', type=str, default="../log/",
                        help='path to log folder')

    args = parser.parse_args()

    main(args)