""" Gapminder data cleaningThis script:- tidies and merges GDP, population and life expectancy data from Gapminder- converts string numbers e.g. '10M' to floats- converts population and year to integers- obtains continent information for each country from the pycountry moduleThe resultant DataFrame is saved as 'gapminder_clean.csv' in the args.data directory. Data files:-------------------- `gdp_pcap.xlsx` Gross domestic product per capita, accessed from https://www.gapminder.org/data/documentation/gd001/- `lex.xlsx` Life expectancy data, accessed from https://www.gapminder.org/data/documentation/gd004/- `pop.xlsx` Population data, accessed from https://www.gapminder.org/data/documentation/gd003/Available functions-------------------* load_tidy_merge: Loads data, converts data to tidy format and merges into one pd.DataFrame* filter_between_years: Keeps rows of a pd.DataFrame between start year and end year* format_numbers: Changes string numbers e.g. '10M' to floats for columns ['gdp', 'pop'], converts 'pop' and 'year' to int64* get_continents: Creates DataFrame with new 'continent' column using pycountry moduleRequirements------------* pandas module* pycountry module* argparse module"""import pycountryimport pycountry_convert as pcimport pandas as pdimport warningsimport argparsedef main(args): gapminder = load_tidy_merge(args.data) gapminder = filter_between_years(gapminder, 1950, 2019) gapminder = format_numbers(gapminder) gapminder = get_continents(gapminder, args.log) gapminder.to_csv(args.data +'gapminder_clean.csv', index=False)def load_tidy_merge(data_path):"""Loads data, converts data to tidy format and merges into one dataset Parameters ---------- data_path : str folder where data is stored Returns ------- pd.DataFrame a Pandas DataFrame in tidy format """ gdp = pd.read_excel(data_path +'gdp_pcap.xlsx') pop = pd.read_excel(data_path +'pop.xlsx') lex = pd.read_excel(data_path +'lex.xlsx') gdp = gdp.melt(id_vars='country', var_name='year', value_name='gdp') pop = pop.melt(id_vars ='country', var_name ='year', value_name ='pop') lex = lex.melt(id_vars ='country', var_name ='year', value_name ='lex') gapminder = gdp.merge(pop, on=['country', 'year']).merge(lex, on=['country', 'year'])return gapminderdef filter_between_years(df, start, end):"""Keeps rows of a pd.DataFrame between start year and end year Parameters ---------- df : pd.DataFrame DataFrame to filter rows of start : int Starting year (inclusive) end : int Ending year (inclusive) Returns ------- pd.DataFrame a pd.DataFrame with rows only between start year and end year """ keep = (df['year'] >= start) & (df['year'] <= end) df = df[keep] df = df.reset_index().drop(columns='index')return dfdef format_numbers(df):"""Changes string numbers e.g. '10M' to floats for columns ['gdp', 'pop'], converts 'pop' and 'year' to int64 Parameters ---------- df : pd.DataFrame DataFrame to filter rows of Returns ------- pd.DataFrame a pd.DataFrame with cleaned numbers """ df[['gdp', 'pop']] = df[['gdp', 'pop']].map(number_to_float) df['pop'] = df['pop'].astype('int64') df['year'] = df['year'].astype('int64')return dfdef number_to_float(x):ifisinstance(x, str):if'k'in x:returnfloat(x.replace('k', '')) *1_000elif'M'in x:returnfloat(x.replace('M', '')) *1_000_000elif'B'in x:returnfloat(x.replace('B', '')) *1_000_000_000else:try:returnfloat(x) exceptValueError: warnings.warn(f"{x} not converted to float - returning None")returnNoneelse:return x def get_continents(df, log_path):"""Creates DataFrame with new 'continent' column using pycountry module Parameters ---------- df : pd.DataFrame DataFrame with 'country' column Returns ------- pd.DataFrame input pd.DataFrame with 'continent' column added """ df_country = df[['country']].drop_duplicates()# hardcoded countries to rename rename_countries = {'UAE': 'United Arab Emirates',"Cote d'Ivoire": "Côte d'Ivoire",'Congo, Dem. Rep.': 'Congo, The Democratic Republic of the','Congo, Rep.': 'Congo','Micronesia, Fed. Sts.': 'Micronesia, Federated States of','UK': 'United Kingdom','Hong Kong, China': 'Hong Kong','Lao': 'Laos','St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines'} df_country['country'] = df_country['country'].replace(rename_countries)# get country codes df_country['country_code'] = pd.NAfor c in df_country['country']:try: df_country.loc[df_country['country']==c, 'country_code'] = pc.country_name_to_country_alpha2(c)except: warnings.warn(f"{c} not mapped to country code - no continent will be mapped")withopen(log_path +'country_exceptions.txt', 'a') as f: f.write(f"{c}\n")# get continents df_country['continent'] = pd.NA df_country.loc[df_country['country_code']=='TL', 'continent'] ='Asia'for c in df_country['country_code']:if df_country.loc[df_country['country_code']==c, 'continent'].isna().any():try: cc = pc.country_alpha2_to_continent_code(c) df_country.loc[df_country['country_code']==c, 'continent'] = pc.convert_continent_code_to_continent_name(cc)except: warnings.warn(f"{c} not mapped to continent")withopen(log_path +'continent_exceptions.txt', 'a') as f: f.write(f"{c}\n") df = df.merge(df_country, on='country')return dfif__name__=="__main__": parser = argparse.ArgumentParser(description='Clean Gapminder data') parser.add_argument('--data', type=str, default='../data/',help='path to data folder') parser.add_argument('--log', type=str, default="../log/",help='path to log folder') args = parser.parse_args() main(args)