From 8f7e856039796b9b1b32fc77afe6d03d17fb53d2 Mon Sep 17 00:00:00 2001 From: Rory Healy Date: Sat, 10 Apr 2021 14:47:26 +1000 Subject: [PATCH] sub-task 1 complete --- parta1.py | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/parta1.py b/parta1.py index 0413d73..55b3fd7 100644 --- a/parta1.py +++ b/parta1.py @@ -3,20 +3,51 @@ import argparse all_covid_data = pd.read_csv('data/owid-covid-data.csv', encoding = 'ISO-8859-1') -reduced_data = all_covid_data.loc[:,['location', 'date', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths']] -# reduced_data_grouped = reduced_data.groupby(['location', 'date'], as_index = False) +# filter out data past 2020 +all_covid_data = all_covid_data[(all_covid_data['date'] >= '2020-01-01') & (all_covid_data['date'] <= '2020-12-31')] +all_covid_data.date = pd.to_datetime(all_covid_data.date) +# create groupby objects and sum new cases/deaths by month new_cases = all_covid_data.loc[:, ['location', 'date', 'new_cases']] -new_cases.date = pd.to_datetime(new_cases.date) new_cases_grouped = new_cases.groupby([new_cases.date.dt.month, new_cases.location]).new_cases.sum() new_deaths = all_covid_data.loc[:, ['location', 'date', 'new_deaths']] -new_deaths.date = pd.to_datetime(new_deaths.date) new_deaths_grouped = new_deaths.groupby([new_deaths.date.dt.month, new_deaths.location]).new_deaths.sum() +# convert multi-indexed series to dataframe +new_cases_grouped = new_cases_grouped.to_frame() +new_cases_grouped = pd.DataFrame(new_cases_grouped.to_records()) +new_deaths_grouped = new_deaths_grouped.to_frame() +new_deaths_grouped = pd.DataFrame(new_deaths_grouped.to_records()) -print(new_cases_grouped) +# sort by location, then date +new_cases_grouped.sort_values(by = ['location', 'date'], inplace = True) +new_deaths_grouped.sort_values(by = ['location', 'date'], inplace = True) -print("\n\n") +# rename columns +new_cases_grouped.rename(columns = {'date': 'month'}, inplace = True) +new_deaths_grouped.rename(columns = {'date': 'month'}, inplace = True) -print(new_deaths_grouped) +# merge new_deaths and new_cases +new_cases_grouped = new_cases_grouped.reindex(columns = ['location', 'month', 'new_cases']) +aggregated_data = new_cases_grouped.merge(new_deaths_grouped, how = 'outer', left_on = ['location', 'month'], right_on = ['location', 'month']) + +# filter out all entries that aren't at the end of the month +all_covid_data['end_of_month'] = pd.to_datetime(all_covid_data['date']).dt.is_month_end +all_covid_data = all_covid_data.loc[all_covid_data.end_of_month, :] + +# extract monthly total cases and total deaths +total_cases = all_covid_data.loc[:, ['location', 'date', 'total_cases']] +total_cases.date = total_cases.date.dt.month +total_cases.rename(columns = {'date': 'month'}, inplace = True) + +total_deaths = all_covid_data.loc[:, ['location', 'date', 'total_deaths']] +total_deaths.date = total_deaths.date.dt.month +total_deaths.rename(columns = {'date': 'month'}, inplace = True) + +# merge total_deaths and total_cases into aggregated_data +aggregated_data = aggregated_data.merge(total_cases, how = 'outer', left_on = ['location', 'month'], right_on = ['location', 'month']) +aggregated_data = aggregated_data.merge(total_deaths, how = 'outer', left_on = ['location', 'month'], right_on = ['location', 'month']) +aggregated_data = aggregated_data.reindex(columns = ['location', 'month', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths']) + +print(aggregated_data.head(25))