"""Aggregate daily COVID-19 records into per-location monthly figures for 2020.

Reads the daily data set from INPUT_CSV_PATH, keeps only rows dated in 2020,
sums new cases/deaths per location and month, attaches the cumulative totals
reported on the last day of each month, computes a monthly case fatality rate
and writes the result to the CSV path given on the command line.
"""
import argparse

import pandas as pd

# Location of the daily input data; the command-line argument is only used
# as the destination of the aggregated output.
INPUT_CSV_PATH = 'data/owid-covid-data.csv'

# Column order of the aggregated output ('date' holds the month number here
# and is renamed to 'month' afterwards).
OUTPUT_COLUMNS = [
    'location',
    'date',
    'case_fatality_rate',
    'total_cases',
    'new_cases',
    'total_deaths',
    'new_deaths',
]


def filter_to_2020(covid_data):
    """Return the rows of *covid_data* dated within 2020, with parsed dates.

    Args:
        covid_data: DataFrame with a 'date' column of 'YYYY-MM-DD' strings.

    Returns:
        A new DataFrame (the input is not modified) restricted to dates
        between 2020-01-01 and 2020-12-31 inclusive, whose 'date' column
        has been converted with ``pd.to_datetime``.
    """
    in_2020 = (
        (covid_data['date'] >= '2020-01-01')
        & (covid_data['date'] <= '2020-12-31')
    )
    # Copy explicitly so the column assignment below writes to an
    # independent frame instead of a slice of the caller's data.
    filtered = covid_data.loc[in_2020].copy()
    filtered['date'] = pd.to_datetime(filtered['date'])
    return filtered


def aggregate_monthly(covid_data):
    """Build the per-location, per-month summary table.

    Args:
        covid_data: DataFrame with a datetime 'date' column and the columns
            'location', 'new_cases', 'new_deaths', 'total_cases' and
            'total_deaths'.

    Returns:
        DataFrame indexed by ('location', 'month') with the columns
        'case_fatality_rate', 'total_cases', 'new_cases', 'total_deaths'
        and 'new_deaths'. 'new_*' are sums over the month; 'total_*' are
        the values found on the month's last calendar day (missing when
        there is no row for that day).
    """
    # Evaluate the month-end flag on the real dates before 'date' is
    # replaced by the month number.
    is_month_end = covid_data['date'].dt.is_month_end
    by_month = covid_data.assign(date=covid_data['date'].dt.month)

    # Sum new cases/deaths per location and month; groupby sorts the keys,
    # so rows come out ordered by location, then month.
    monthly_new = (
        by_month.groupby(['location', 'date'])[['new_cases', 'new_deaths']]
        .sum()
        .reset_index()
    )

    # Cumulative totals as reported on the last day of each month.
    month_end_totals = by_month.loc[
        is_month_end, ['location', 'date', 'total_cases', 'total_deaths']
    ]

    # A single merge carries both total columns, so a repeated
    # (location, month) key cannot multiply rows across two merges.
    aggregated = monthly_new.merge(
        month_end_totals, how='outer', on=['location', 'date']
    )

    # NOTE: months with zero new cases produce NaN or inf here, exactly as
    # the plain division dictates; no masking is applied.
    aggregated['case_fatality_rate'] = (
        aggregated['new_deaths'] / aggregated['new_cases']
    )

    return (
        aggregated.reindex(columns=OUTPUT_COLUMNS)
        .rename(columns={'date': 'month'})
        .set_index(['location', 'month'])
    )


def main():
    """Parse the command line, run the aggregation and write the CSV."""
    parser = argparse.ArgumentParser()
    parser.add_argument('path_to_csv', help='path to the csv file')
    args = parser.parse_args()

    all_covid_data = pd.read_csv(INPUT_CSV_PATH, encoding='ISO-8859-1')
    aggregated_data = aggregate_monthly(filter_to_2020(all_covid_data))

    print(aggregated_data.head(5))
    aggregated_data.to_csv(args.path_to_csv)


if __name__ == '__main__':
    main()