partb5 complete, refactored code for cleaner design

This commit is contained in:
Rory Healy 2021-04-15 07:47:29 +10:00
parent 8cb005de77
commit a171eb1ca1
11 changed files with 230 additions and 139 deletions

View file

@@ -12,9 +12,12 @@ Using the cricket dataset from the LMS, this part of the project deals with buil
## Dependencies:
- pandas >= 1.2.2
- matplotlib >= 3.3.2
- numpy >= 1.19.2
Python version used: 3.9.4
- pandas >= 1.2.4
- matplotlib >= 3.4.1
- numpy >= 1.20.2
- regex >= 2021.4.4
- scikit-learn >= 0.24.1
- nltk >= 3.6.1
- punkt model needed
- punkt model and Porter stemmer needed
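For reference only (not part of this commit): a minimal setup sketch for the dependency list above. It assumes the packages themselves are already installed via pip at the listed versions, and only fetches the NLTK data the partb scripts rely on (punkt for word_tokenize, stopwords for partb5.py; the Porter stemmer ships with nltk and needs no download).

# setup sketch, not part of the commit
import sys
import nltk

assert sys.version_info >= (3, 9), 'README lists Python 3.9.4'
nltk.download('punkt')       # tokenizer model used by nltk.word_tokenize
nltk.download('stopwords')   # stop word list used in partb5.py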

Binary file not shown.

View file

@@ -6,18 +6,22 @@ parser = argparse.ArgumentParser()
parser.add_argument('path_to_csv', help = 'path to the csv file')
args = parser.parse_args()
all_covid_data = pd.read_csv('data/owid-covid-data.csv', encoding = 'ISO-8859-1')
all_covid_data = pd.read_csv('data/owid-covid-data.csv', \
encoding = 'ISO-8859-1')
# filter out data past 2020
all_covid_data = all_covid_data[(all_covid_data['date'] >= '2020-01-01') & (all_covid_data['date'] <= '2020-12-31')]
all_covid_data = all_covid_data[(all_covid_data['date'] >= '2020-01-01') \
& (all_covid_data['date'] <= '2020-12-31')]
all_covid_data.date = pd.to_datetime(all_covid_data.date)
# create groupby objects and sum new cases/deaths by month
new_cases = all_covid_data.loc[:, ['location', 'date', 'new_cases']]
new_cases_grouped = new_cases.groupby([new_cases.date.dt.month, new_cases.location]).new_cases.sum()
new_cases_grouped = new_cases.groupby([new_cases.date.dt.month, \
new_cases.location]).new_cases.sum()
new_deaths = all_covid_data.loc[:, ['location', 'date', 'new_deaths']]
new_deaths_grouped = new_deaths.groupby([new_deaths.date.dt.month, new_deaths.location]).new_deaths.sum()
new_deaths_grouped = new_deaths.groupby([new_deaths.date.dt.month, \
new_deaths.location]).new_deaths.sum()
# convert multi-indexed series to dataframe
new_cases_grouped = new_cases_grouped.to_frame()
@@ -30,10 +34,13 @@ new_cases_grouped.sort_values(by = ['location', 'date'], inplace = True)
new_deaths_grouped.sort_values(by = ['location', 'date'], inplace = True)
# merge new_deaths_grouped and new_cases_grouped
aggregated_data = new_cases_grouped.merge(new_deaths_grouped, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
aggregated_data = new_cases_grouped.merge(new_deaths_grouped, \
how = 'outer', left_on = ['location', 'date'], \
right_on = ['location', 'date'])
# filter out all entries that aren't at the end of the month
all_covid_data['end_of_month'] = pd.to_datetime(all_covid_data['date']).dt.is_month_end
all_covid_data['end_of_month'] = \
pd.to_datetime(all_covid_data['date']).dt.is_month_end
all_covid_data = all_covid_data.loc[all_covid_data.end_of_month, :]
# extract monthly total cases and total deaths
@@ -44,14 +51,19 @@ total_deaths = all_covid_data.loc[:, ['location', 'date', 'total_deaths']]
total_deaths.date = total_deaths.date.dt.month
# merge total_deaths and total_cases into aggregated_data
aggregated_data = aggregated_data.merge(total_cases, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
aggregated_data = aggregated_data.merge(total_deaths, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
aggregated_data = aggregated_data.merge(total_cases, how = 'outer', \
left_on = ['location', 'date'], right_on = ['location', 'date'])
aggregated_data = aggregated_data.merge(total_deaths, how = 'outer', \
left_on = ['location', 'date'], right_on = ['location', 'date'])
# compute case fatality rate for each month
aggregated_data['case_fatality_rate'] = (aggregated_data['new_deaths'] / aggregated_data['new_cases'])
aggregated_data['case_fatality_rate'] = \
(aggregated_data['new_deaths'] / aggregated_data['new_cases'])
# format aggregated_data and output results
aggregated_data = aggregated_data.reindex(columns = ['location', 'date', 'case_fatality_rate', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths'])
aggregated_data = aggregated_data.reindex(columns = ['location', 'date', \
'case_fatality_rate', 'total_cases', 'new_cases', 'total_deaths', \
'new_deaths'])
aggregated_data.rename(columns = {'date': 'month'}, inplace = True)
aggregated_data.set_index(['location', 'month'], inplace = True)
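For illustration only (not part of this commit): a toy version of the month/location aggregation pattern used in the wrapped lines above, run on a few invented rows so the groupby and merge output can be inspected without the OWID csv.

# toy sketch on invented data, not part of the commit
import pandas as pd

toy = pd.DataFrame({
    'location': ['A', 'A', 'B', 'B'],
    'date': pd.to_datetime(['2020-01-15', '2020-01-31',
                            '2020-02-15', '2020-02-29']),
    'new_cases': [10, 5, 20, 4],
    'new_deaths': [1, 0, 2, 1],
})
cases = toy.groupby([toy.date.dt.month, toy.location]).new_cases.sum()
deaths = toy.groupby([toy.date.dt.month, toy.location]).new_deaths.sum()
monthly = cases.to_frame().merge(deaths.to_frame(), \
    left_index = True, right_index = True)
monthly['case_fatality_rate'] = monthly.new_deaths / monthly.new_cases
print(monthly)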

View file

@@ -9,7 +9,8 @@ parser.add_argument('scatter_a', help = 'output path of figure 1')
parser.add_argument('scatter_b', help = 'output path of figure 2')
args = parser.parse_args()
all_covid_data = pd.read_csv('data/owid-covid-data.csv', encoding = 'ISO-8859-1')
all_covid_data = pd.read_csv('data/owid-covid-data.csv', \
encoding = 'ISO-8859-1')
# filter out data - only need 2020-12-31
all_covid_data = all_covid_data[(all_covid_data['date'] == '2020-12-31')]
@@ -21,13 +22,12 @@ total_cases.set_index(['location'], inplace = True)
total_deaths = all_covid_data.loc[:, ['location', 'total_deaths']]
# merge total_cases and total_deaths
aggregated_data = total_cases.merge(total_deaths, how = 'inner', on = 'location')
aggregated_data = total_cases.merge(total_deaths, how = 'inner', \
on = 'location')
# compute case fatality rate for each country
aggregated_data['case_fatality_rate'] = (aggregated_data['total_deaths'] / aggregated_data['total_cases'])
# format aggregated_data
aggregated_data = aggregated_data.reindex(columns = ['location', 'case_fatality_rate', 'total_deaths', 'total_cases'])
aggregated_data['case_fatality_rate'] = \
(aggregated_data['total_deaths'] / aggregated_data['total_cases'])
# extract case fatality rate from aggregated data
case_fatality_rate = aggregated_data.loc[:, ['case_fatality_rate']]
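For illustration only (not part of this commit): the per-country ratio computed above, on three invented rows, followed by the kind of scatter the full script writes to the paths given as scatter_a and scatter_b (the output filename here is made up).

# toy sketch on invented data, not part of the commit
import pandas as pd
import matplotlib.pyplot as plt

toy = pd.DataFrame({'location': ['A', 'B', 'C'],
                    'total_cases': [1000, 5000, 250],
                    'total_deaths': [10, 120, 2]})
toy['case_fatality_rate'] = toy.total_deaths / toy.total_cases

plt.scatter(toy.total_cases, toy.case_fatality_rate)
plt.xlabel('total cases')
plt.ylabel('case fatality rate')
plt.savefig('toy_scatter.png')   # the real script saves to args.scatter_a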

View file

@@ -13,10 +13,9 @@ pattern = r'[A-Z]{4}-\d{3}[a-zA-Z]?'
os.chdir(os.getcwd() + '/cricket')
# open every file, search each line for the document ID, add it to the list
document_ids = []
filenames = []
# open every file, search each line for the document ID, add it to the list
for filename in os.listdir():
filenames.append(filename)
f = open(filename)
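Not part of the commit: a quick check of the document-ID pattern defined at the top of this hunk, on an invented line of text.

# pattern demo on made-up input, not part of the commit
import re

pattern = r'[A-Z]{4}-\d{3}[a-zA-Z]?'
line = 'match report WICK-042b, day one'   # invented example line
match = re.search(pattern, line)
print(match.group() if match else 'no document ID found')   # -> WICK-042b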

View file

@@ -20,11 +20,16 @@ def apply_preprocessing(f):
f = f.lower()
return f
# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('path_to_file', help = 'path to document')
args = parser.parse_args()
def read_args():
'''Creates an argparse ArgumentParser to read the command line
arguments.'''
parser = argparse.ArgumentParser()
parser.add_argument('path_to_file', help = 'path to document')
args = parser.parse_args()
return args
f = read_document(args.path_to_file)
f = apply_preprocessing(f)
print(f)
if __name__ == '__main__':
args = read_args()
f = read_document(args.path_to_file)
f = apply_preprocessing(f)
print(f)
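Not part of the commit: what apply_preprocessing produces for a sample string (the string is invented; the substitutions are the ones defined earlier in this file).

# preprocessing demo on an invented string, not part of the commit
import re

sample = 'Smith scored 102* off 98 balls!\nAustralia won by 5 wickets.'
cleaned = re.sub(r'[^a-zA-Z\s]', r'', sample)   # drop non-alphabetic characters
cleaned = re.sub(r'\s+', r' ', cleaned)         # collapse runs of whitespace
cleaned = cleaned.lower()
print(cleaned)   # -> smith scored off balls australia won by wickets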

View file

@@ -1,54 +1,50 @@
import re
import pandas as pd
import nltk
import os
import argparse
from partb2 import read_document, apply_preprocessing
def read_document(path):
'''Reads a file when provided with its path, and returns a string
containing the lines of the file.'''
file_given = open(path)
f = ""
for line in file_given:
f += line + " "
file_given.close()
return f
def apply_preprocessing(f):
'''Removes non-alphabetic characters, replaces all whitespace characters
with a single whitespace, and changes all uppercase characters to
lowercase'''
f = re.sub(r'[^a-zA-Z\s]', r'', f)
f = re.sub(r'\s+', r' ', f)
f = f.lower()
return f
# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('keywords', nargs = '+', help = 'keywords to search for \
(1-5 keywords accepted)')
args = parser.parse_args()
if len(args.keywords) > 5:
print("Too many keywords.")
quit()
# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(), \
index = df.filename.tolist())
documents = doc_ids.index.tolist()
matched_doc_ids = []
os.chdir(os.getcwd() + '/cricket')
# search through each document for the keywords
for doc in documents:
f = read_document(doc)
f = apply_preprocessing(f)
tokens = nltk.word_tokenize(f)
# only add the document ID if all the keywords are in the token list
if all(keyword in tokens for keyword in args.keywords):
matched_doc_ids.append(doc_ids.get(doc))
def read_args():
'''Creates an argparse ArgumentParser to read the command line
arguments.'''
parser = argparse.ArgumentParser()
parser.add_argument('keywords', nargs = '+', \
help = 'keywords to search for (1-5 keywords accepted)')
args = parser.parse_args()
if len(args.keywords) > 5:
print("Too many keywords.")
quit()
print(matched_doc_ids)
return args
def load_doc_ids():
'''Loads in the documentIDs from partb1.csv, and returns the lists of
documentIDs and filenames.'''
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(), \
index = df.filename.tolist())
documents = doc_ids.index.tolist()
return doc_ids, documents
def find_matching_docs(doc_ids, documents, args):
'''Takes the document list, applies pre-processing techniques to each
document, tokenises the words, and returns a list containing document IDs
that match the keywords given as arguments to this program.'''
matched_doc_ids = []
for doc in documents:
f = read_document(doc)
f = apply_preprocessing(f)
tokens = nltk.word_tokenize(f)
# only add the document ID if all the keywords are in the token list
if all(keyword in tokens for keyword in args.keywords):
matched_doc_ids.append(doc_ids.get(doc))
return matched_doc_ids
if __name__ == '__main__':
args = read_args()
doc_ids, documents = load_doc_ids()
os.chdir(os.getcwd() + '/cricket')
matched_doc_ids = find_matching_docs(doc_ids, documents, args)
print(matched_doc_ids)
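Not part of the commit: the core of the refactored search in isolation. It tokenises two invented in-memory strings and keeps a document only if every keyword appears among its tokens, which is what find_matching_docs does per file.

# toy keyword match over invented strings, not part of the commit
import nltk

docs = {'d1': 'smith hit a century at the oval',
        'd2': 'rain delayed the match at lords'}
keywords = ['century', 'oval']

matched = []
for doc_id, text in docs.items():
    tokens = nltk.word_tokenize(text)
    # keep the document only if every keyword is among its tokens
    if all(keyword in tokens for keyword in keywords):
        matched.append(doc_id)
print(matched)   # -> ['d1']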

View file

@@ -1,66 +1,37 @@
import re
import pandas as pd
import os
import nltk
from nltk.stem.porter import *
import argparse
from partb2 import read_document, apply_preprocessing
from partb3 import read_args, load_doc_ids
def read_document(path):
'''Reads a file when provided with its path, and returns a string
containing the lines of the file.'''
file_given = open(path)
f = ""
for line in file_given:
f += line + " "
file_given.close()
return f
def find_matching_docs(doc_ids, documents, args):
'''Takes the document list, applies pre-processing techniques to each
document, tokenises the words, and returns a list containing document IDs
that match the keywords given as arguments to this program.'''
matched_doc_ids = []
porter_stemmer = PorterStemmer()
for doc in documents:
f = read_document(doc)
f = apply_preprocessing(f)
def apply_preprocessing(f):
'''Removes non-alphabetic characters, replaces all whitespace characters
with a single whitespace, and changes all uppercase characters to
lowercase'''
f = re.sub(r'[^a-zA-Z\s]', r'', f)
f = re.sub(r'\s+', r' ', f)
f = f.lower()
return f
# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('keywords', nargs = '+', help = 'keywords to search \
for (1-5 keywords accepted)')
args = parser.parse_args()
if len(args.keywords) > 5:
print("Too many keywords.")
quit()
# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(), \
index = df.filename.tolist())
documents = doc_ids.index.tolist()
matched_doc_ids = []
# change directory to get cricket data
os.chdir(os.getcwd() + '/cricket')
# search through each document for the keywords
porter_stemmer = PorterStemmer()
for doc in documents:
f = read_document(doc)
f = apply_preprocessing(f)
# tokenise the document, remove stop words
word_list = nltk.word_tokenize(f)
# tokenise the document
word_list = nltk.word_tokenize(f)
# use the Porter stemmer to add stem words to the word list
for word in word_list:
stemmed_word = porter_stemmer.stem(word)
if stemmed_word not in word_list:
word_list.append(stemmed_word)
# add document ID if all keywords are in this new word list
if all(keyword in word_list for keyword in args.keywords):
matched_doc_ids.append(doc_ids.get(doc))
# use the Porter stemmer to add stem words to the word list
for word in word_list:
stemmed_word = porter_stemmer.stem(word)
if stemmed_word not in word_list:
word_list.append(stemmed_word)
return matched_doc_ids
# add document ID if all keywords are in this new word list
if all(keyword in word_list for keyword in args.keywords):
matched_doc_ids.append(doc_ids.get(doc))
print(matched_doc_ids)
if __name__ == '__main__':
args = read_args()
doc_ids, documents = load_doc_ids()
os.chdir(os.getcwd() + '/cricket')
matched_doc_ids = find_matching_docs(doc_ids, documents, args)
print(matched_doc_ids)
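Not part of the commit: the stemming step that separates partb4 from partb3, shown on a few invented tokens. Each Porter stem is appended to the token list so a stemmed keyword can still match.

# Porter stemming demo on invented tokens, not part of the commit
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
word_list = ['batting', 'bowled', 'wickets']
for word in list(word_list):   # iterate over a copy while appending
    stemmed_word = porter_stemmer.stem(word)
    if stemmed_word not in word_list:
        word_list.append(stemmed_word)
print(word_list)   # e.g. ['batting', 'bowled', 'wickets', 'bat', 'bowl', 'wicket']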

111
partb5.py
View file

@@ -1,6 +1,111 @@
## Part B Task 5
import re
import os
import sys
import pandas as pd
import nltk
import argparse
from math import sqrt
from numpy.linalg import norm
from numpy import dot
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfTransformer
from partb2 import read_document, apply_preprocessing
from partb3 import read_args
def cosine_similarity(x1, x2):
'''Calculates the cosine similarity between two vectors: their dot
product divided by the product of their norms.'''
return dot(x1, x2) / (norm(x1) * norm(x2))
args = read_args()
# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(), \
index = df.filename.tolist())
documents = doc_ids.index.tolist()
# change directory to get cricket data
os.chdir(os.getcwd() + '/cricket')
# build the corpus
corpus = []
porter_stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
for doc in documents:
f = read_document(doc)
f = apply_preprocessing(f)
# tokenise the document, remove stop words
word_list = nltk.word_tokenize(f)
word_list = [w for w in word_list if not w in stop_words]
# add the stem of each word to the corpus (no duplicates), so the corpus
# uses the same stemmed terms as the per-document frequency dictionaries
for word in word_list:
stemmed_word = porter_stemmer.stem(word)
if stemmed_word not in corpus:
corpus.append(stemmed_word)
# build the term counts
term_counts = []
for doc in documents:
curr_term_count = []
f = read_document(doc)
f = apply_preprocessing(f)
# tokenise the document, remove stop words
word_list = nltk.word_tokenize(f)
word_list = [w for w in word_list if not w in stop_words]
# build frequency dictionary of stemmed words
wordDict = {}
for word in word_list:
stemmed_word = porter_stemmer.stem(word)
if stemmed_word in wordDict:
wordDict[stemmed_word] += 1
else:
wordDict[stemmed_word] = 1
# fill in the current count of terms, then add to the overall list
for word in corpus:
if word in wordDict.keys():
curr_term_count.append(wordDict[word])
else:
curr_term_count.append(0)
term_counts.append(curr_term_count)
# calculate the tf-idf scores
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(term_counts)
doc_tfidf = tfidf.toarray()
# construct the query unit vector
query_vector = []
for word in corpus:
if word in args.keywords:
# this assumes that keywords are unique
# and aren't entered more than once
query_vector.append(1)
else:
query_vector.append(0)
query_unit_vector = [x / norm(query_vector) for x in query_vector]
similarities = [cosine_similarity(query_unit_vector, doc_tfidf[d_id]) for \
d_id in range(doc_tfidf.shape[0])]
# this holds the similarities with their respective document IDs
sim_doc_ids = df
sim_doc_ids.insert(1, 'similarity_scores', similarities)
sim_doc_ids.sort_values(by = 'similarity_scores', ascending = False, \
inplace = True)
sorted_doc_ids = sim_doc_ids.documentID.tolist()
sorted_similarities = sim_doc_ids.similarity_scores.tolist()
# print documentID and scores, sorted by scores
print('documentID | score')
for i in range(len(sorted_doc_ids)):
if sorted_similarities[i] > 0:
print(f"{sorted_doc_ids[i]:10} | {sorted_similarities[i]:.4f}")

Binary file not shown.


Binary file not shown.
