2021-03-01 17:57:17 +11:00
|
|
|
import pandas as pd
|
|
|
|
import argparse
|
2021-04-09 22:46:35 +10:00
|
|
|
|
2021-04-10 15:16:53 +10:00
|
|
|
# parse input arguments
# the script takes one positional argument, exposed as args.path_to_csv
parser = argparse.ArgumentParser()
parser.add_argument("path_to_csv", help="path to the csv file")
args = parser.parse_args()
|
|
|
|
|
2021-04-15 07:47:29 +10:00
|
|
|
# load the covid dataset from its fixed location under data/
all_covid_data = pd.read_csv(
    "data/owid-covid-data.csv",
    encoding="ISO-8859-1",
)

# filter out data past 2020
# both bounds are inclusive; the comparison runs on the raw column values,
# before they are converted to datetimes on the last line
all_covid_data = all_covid_data[
    all_covid_data["date"].between("2020-01-01", "2020-12-31")
]
all_covid_data["date"] = pd.to_datetime(all_covid_data["date"])
|
2021-04-09 22:46:35 +10:00
|
|
|
|
2021-04-10 14:47:26 +10:00
|
|
|
# create groupby objects and sum new cases/deaths by month
# each result is a series indexed by (month number of date, location)
new_cases = all_covid_data.loc[:, ["location", "date", "new_cases"]]
new_cases_grouped = new_cases.groupby(
    [new_cases["date"].dt.month, new_cases["location"]]
)["new_cases"].sum()

new_deaths = all_covid_data.loc[:, ["location", "date", "new_deaths"]]
new_deaths_grouped = new_deaths.groupby(
    [new_deaths["date"].dt.month, new_deaths["location"]]
)["new_deaths"].sum()
|
2021-04-09 22:46:35 +10:00
|
|
|
|
2021-04-10 14:47:26 +10:00
|
|
|
# convert multi-indexed series to dataframe
# reset_index turns the (date, location) index levels into ordinary columns
# next to the summed values, without a detour through numpy record arrays
new_cases_grouped = new_cases_grouped.reset_index()
new_deaths_grouped = new_deaths_grouped.reset_index()
|
2021-04-09 22:46:35 +10:00
|
|
|
|
2021-04-10 14:47:26 +10:00
|
|
|
# sort by location, then date
# (date holds the month number at this point)
new_cases_grouped = new_cases_grouped.sort_values(by=["location", "date"])
new_deaths_grouped = new_deaths_grouped.sort_values(by=["location", "date"])
|
2021-04-09 22:46:35 +10:00
|
|
|
|
2021-04-10 15:16:53 +10:00
|
|
|
# merge new_deaths_grouped and new_cases_grouped
# outer join, so a (location, date) pair found in only one frame is kept
aggregated_data = pd.merge(
    new_cases_grouped,
    new_deaths_grouped,
    how="outer",
    on=["location", "date"],
)
|
2021-04-10 14:47:26 +10:00
|
|
|
|
|
|
|
# filter out all entries that aren't at the end of the month
# the flag is stored as an extra column and then used as the row mask;
# date was already converted to datetime right after the 2020 filter
all_covid_data["end_of_month"] = all_covid_data["date"].dt.is_month_end
all_covid_data = all_covid_data[all_covid_data["end_of_month"]]
|
|
|
|
|
|
|
|
# extract monthly total cases and total deaths
# date is replaced by its month number so it lines up with the month keys
# used for the summed new cases/deaths
total_cases = all_covid_data.loc[:, ["location", "date", "total_cases"]].assign(
    date=lambda frame: frame["date"].dt.month
)

total_deaths = all_covid_data.loc[:, ["location", "date", "total_deaths"]].assign(
    date=lambda frame: frame["date"].dt.month
)
|
|
|
|
|
|
|
|
# merge total_deaths and total_cases into aggregated_data
# both joins are outer joins on the (location, date) pair
aggregated_data = aggregated_data.merge(
    total_cases,
    how="outer",
    on=["location", "date"],
)
aggregated_data = aggregated_data.merge(
    total_deaths,
    how="outer",
    on=["location", "date"],
)
|
2021-04-10 14:59:49 +10:00
|
|
|
|
|
|
|
# compute case fatality rate for each month
# (summed new deaths divided by summed new cases, row by row)
# NOTE(review): a row whose new_cases total is 0 comes out as inf or NaN --
# confirm that is acceptable in the written csv
aggregated_data["case_fatality_rate"] = aggregated_data["new_deaths"].div(
    aggregated_data["new_cases"]
)
|
2021-04-10 14:47:26 +10:00
|
|
|
|
2021-04-10 15:16:53 +10:00
|
|
|
# format aggregated_data and output results
# fix the column order, expose date under the name month, and index the
# rows by (location, month)
aggregated_data = (
    aggregated_data.reindex(
        columns=[
            "location",
            "date",
            "case_fatality_rate",
            "total_cases",
            "new_cases",
            "total_deaths",
            "new_deaths",
        ]
    )
    .rename(columns={"date": "month"})
    .set_index(["location", "month"])
)

# show the first five rows, then write the full table to the given path
print(aggregated_data.head(5))
aggregated_data.to_csv(args.path_to_csv)
|