partb5 complete, refactored code for cleaner design
parent 8cb005de77
commit a171eb1ca1

11 changed files with 230 additions and 139 deletions
11  README.md

@@ -12,9 +12,12 @@ Using the cricket dataset from the LMS, this part of the project deals with buil
 
 ## Dependencies:
 
-- pandas >= 1.2.2
-- matplotlib >= 3.3.2
-- numpy >= 1.19.2
+- pandas >= 1.2.4
+- matplotlib >= 3.4.1
+- numpy >= 1.20.2
+- regex >= 2021.4.4
+- scikit-learn >= 0.24.1
+- nltk >= 3.6.1
-  - punkt model needed
+  - punkt model and Porter stemmer needed
 
 Python version used: 3.9.4
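The nltk entry above additionally needs the punkt tokenizer models downloaded once (the Porter stemmer ships with nltk itself); a minimal setup sketch:

    # one-off setup: fetch the punkt tokenizer models used by nltk.word_tokenize
    import nltk
    nltk.download('punkt')
    # the Porter stemmer (nltk.stem.porter) needs no extra download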
BIN  owid-covid-2020-visual-analysis.pdf  (normal file)
Binary file not shown.
32  parta1.py

@@ -6,18 +6,22 @@ parser = argparse.ArgumentParser()
 parser.add_argument('path_to_csv', help = 'path to the csv file')
 args = parser.parse_args()
 
-all_covid_data = pd.read_csv('data/owid-covid-data.csv', encoding = 'ISO-8859-1')
+all_covid_data = pd.read_csv('data/owid-covid-data.csv', \
+    encoding = 'ISO-8859-1')
 
 # filter out data past 2020
-all_covid_data = all_covid_data[(all_covid_data['date'] >= '2020-01-01') & (all_covid_data['date'] <= '2020-12-31')]
+all_covid_data = all_covid_data[(all_covid_data['date'] >= '2020-01-01') \
+    & (all_covid_data['date'] <= '2020-12-31')]
 all_covid_data.date = pd.to_datetime(all_covid_data.date)
 
 # create groupby objects and sum new cases/deaths by month
 new_cases = all_covid_data.loc[:, ['location', 'date', 'new_cases']]
-new_cases_grouped = new_cases.groupby([new_cases.date.dt.month, new_cases.location]).new_cases.sum()
+new_cases_grouped = new_cases.groupby([new_cases.date.dt.month, \
+    new_cases.location]).new_cases.sum()
 
 new_deaths = all_covid_data.loc[:, ['location', 'date', 'new_deaths']]
-new_deaths_grouped = new_deaths.groupby([new_deaths.date.dt.month, new_deaths.location]).new_deaths.sum()
+new_deaths_grouped = new_deaths.groupby([new_deaths.date.dt.month, \
+    new_deaths.location]).new_deaths.sum()
 
 # convert multi-indexed series to dataframe
 new_cases_grouped = new_cases_grouped.to_frame()
@@ -30,10 +34,13 @@ new_cases_grouped.sort_values(by = ['location', 'date'], inplace = True)
 new_deaths_grouped.sort_values(by = ['location', 'date'], inplace = True)
 
 # merge new_deaths_grouped and new_cases_grouped
-aggregated_data = new_cases_grouped.merge(new_deaths_grouped, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
+aggregated_data = new_cases_grouped.merge(new_deaths_grouped, \
+    how = 'outer', left_on = ['location', 'date'], \
+    right_on = ['location', 'date'])
 
 # filter out all entries that aren't at the end of the month
-all_covid_data['end_of_month'] = pd.to_datetime(all_covid_data['date']).dt.is_month_end
+all_covid_data['end_of_month'] = \
+    pd.to_datetime(all_covid_data['date']).dt.is_month_end
 all_covid_data = all_covid_data.loc[all_covid_data.end_of_month, :]
 
 # extract monthly total cases and total deaths
@@ -44,14 +51,19 @@ total_deaths = all_covid_data.loc[:, ['location', 'date', 'total_deaths']]
 total_deaths.date = total_deaths.date.dt.month
 
 # merge total_deaths and total_cases into aggregated_data
-aggregated_data = aggregated_data.merge(total_cases, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
-aggregated_data = aggregated_data.merge(total_deaths, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
+aggregated_data = aggregated_data.merge(total_cases, how = 'outer', \
+    left_on = ['location', 'date'], right_on = ['location', 'date'])
+aggregated_data = aggregated_data.merge(total_deaths, how = 'outer', \
+    left_on = ['location', 'date'], right_on = ['location', 'date'])
 
 # compute case fatality rate for each month
-aggregated_data['case_fatality_rate'] = (aggregated_data['new_deaths'] / aggregated_data['new_cases'])
+aggregated_data['case_fatality_rate'] = \
+    (aggregated_data['new_deaths'] / aggregated_data['new_cases'])
 
 # format aggregated_data and output results
-aggregated_data = aggregated_data.reindex(columns = ['location', 'date', 'case_fatality_rate', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths'])
+aggregated_data = aggregated_data.reindex(columns = ['location', 'date', \
+    'case_fatality_rate', 'total_cases', 'new_cases', 'total_deaths', \
+    'new_deaths'])
 aggregated_data.rename(columns = {'date': 'month'}, inplace = True)
 aggregated_data.set_index(['location', 'month'], inplace = True)
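For orientation, the aggregation pattern above (sum new cases/deaths per month and location, then divide) looks like this on made-up toy data rather than data from the commit:

    import pandas as pd

    # toy stand-in for the OWID data (made-up numbers)
    df = pd.DataFrame({'location': ['A', 'A', 'A'],
                       'date': pd.to_datetime(['2020-01-15', '2020-01-31', '2020-02-29']),
                       'new_cases': [10, 20, 40],
                       'new_deaths': [1, 2, 4]})

    # sum new cases/deaths per (month, location), as parta1.py does
    monthly = df.groupby([df.date.dt.month, df.location])[['new_cases', 'new_deaths']].sum()
    monthly['case_fatality_rate'] = monthly['new_deaths'] / monthly['new_cases']
    print(monthly)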
12  parta2.py

@@ -9,7 +9,8 @@ parser.add_argument('scatter_a', help = 'output path of figure 1')
 parser.add_argument('scatter_b', help = 'output path of figure 2')
 args = parser.parse_args()
 
-all_covid_data = pd.read_csv('data/owid-covid-data.csv', encoding = 'ISO-8859-1')
+all_covid_data = pd.read_csv('data/owid-covid-data.csv', \
+    encoding = 'ISO-8859-1')
 
 # filter out data - only need 2020-12-31
 all_covid_data = all_covid_data[(all_covid_data['date'] == '2020-12-31')]
@@ -21,13 +22,12 @@ total_cases.set_index(['location'], inplace = True)
 total_deaths = all_covid_data.loc[:, ['location', 'total_deaths']]
 
 # merge total_cases and total_deaths
-aggregated_data = total_cases.merge(total_deaths, how = 'inner', on = 'location')
+aggregated_data = total_cases.merge(total_deaths, how = 'inner', \
+    on = 'location')
 
 # compute case fatality rate for each country
-aggregated_data['case_fatality_rate'] = (aggregated_data['total_deaths'] / aggregated_data['total_cases'])
-
-# format aggregated_data
-aggregated_data = aggregated_data.reindex(columns = ['location', 'case_fatality_rate', 'total_deaths', 'total_cases'])
+aggregated_data['case_fatality_rate'] = \
+    (aggregated_data['total_deaths'] / aggregated_data['total_cases'])
 
 # extract case fatality rate from aggregated data
 case_fatality_rate = aggregated_data.loc[:, ['case_fatality_rate']]
partb1.py

@@ -13,10 +13,9 @@ pattern = r'[A-Z]{4}-\d{3}[a-zA-Z]?'
 
 os.chdir(os.getcwd() + '/cricket')
 
-# open every file, search each line for the document ID, add it to the list
 document_ids = []
 filenames = []
-
+# open every file, search each line for the document ID, add it to the list
 for filename in os.listdir():
     filenames.append(filename)
     f = open(filename)
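The pattern above matches four capital letters, a hyphen, three digits, and an optional trailing letter; a quick sketch on a hypothetical line of text:

    import re

    pattern = r'[A-Z]{4}-\d{3}[a-zA-Z]?'
    line = 'Match report CRIC-045a: first innings summary'   # hypothetical line
    match = re.search(pattern, line)
    if match:
        print(match.group(0))   # -> CRIC-045a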
19  partb2.py

@@ -20,11 +20,16 @@ def apply_preprocessing(f):
     f = f.lower()
     return f
 
-# parse input arguments
-parser = argparse.ArgumentParser()
-parser.add_argument('path_to_file', help = 'path to document')
-args = parser.parse_args()
+def read_args():
+    '''Creates an argparse ArgumentParser to read the command line
+    arguments.'''
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path_to_file', help = 'path to document')
+    args = parser.parse_args()
+    return args
 
-f = read_document(args.path_to_file)
-f = apply_preprocessing(f)
-print(f)
+if __name__ == '__main__':
+    args = read_args()
+    f = read_document(args.path_to_file)
+    f = apply_preprocessing(f)
+    print(f)
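On a sample string (made-up input), apply_preprocessing above behaves like this:

    import re

    f = 'Bradman scored 334 at Leeds,\n  in 1930!'   # made-up input
    f = re.sub(r'[^a-zA-Z\s]', r'', f)   # drop digits and punctuation
    f = re.sub(r'\s+', r' ', f)          # collapse runs of whitespace
    f = f.lower()
    print(f)                             # -> 'bradman scored at leeds in '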
92  partb3.py

@@ -1,54 +1,50 @@
 import re
 import pandas as pd
 import nltk
 import os
 import argparse
+from partb2 import read_document, apply_preprocessing
 
-def read_document(path):
-    '''Reads a file when provided with its path, and returns a string
-    containing the lines of the file.'''
-    file_given = open(path)
-    f = ""
-    for line in file_given:
-        f += line + " "
-    file_given.close()
-    return f
-
-def apply_preprocessing(f):
-    '''Removes non-alphabetic characters, replaces all whitespace characters
-    with a single whitespace, and changes all uppercase characters to
-    lowercase'''
-    f = re.sub(r'[^a-zA-Z\s]', r'', f)
-    f = re.sub(r'\s+', r' ', f)
-    f = f.lower()
-    return f
-
-# parse input arguments
-parser = argparse.ArgumentParser()
-parser.add_argument('keywords', nargs = '+', help = 'keywords to search for \
-    (1-5 keywords accepted)')
-args = parser.parse_args()
-if len(args.keywords) > 5:
-    print("Too many keywords.")
-    quit()
-
-# load document IDs from csv
-df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
-doc_ids = pd.Series(data = df.documentID.tolist(), \
-    index = df.filename.tolist())
-documents = doc_ids.index.tolist()
-matched_doc_ids = []
-
-os.chdir(os.getcwd() + '/cricket')
-
-# search through each document for the keywords
-for doc in documents:
-    f = read_document(doc)
-    f = apply_preprocessing(f)
-
-    tokens = nltk.word_tokenize(f)
-    # only add the document ID if all the keywords are in the token list
-    if all(keyword in tokens for keyword in args.keywords):
-        matched_doc_ids.append(doc_ids.get(doc))
-
-print(matched_doc_ids)
+def read_args():
+    '''Creates an argparse ArgumentParser to read the command line
+    arguments.'''
+    parser = argparse.ArgumentParser()
+    parser.add_argument('keywords', nargs = '+', \
+        help = 'keywords to search for (1-5 keywords accepted)')
+    args = parser.parse_args()
+    if len(args.keywords) > 5:
+        print("Too many keywords.")
+        quit()
+
+    return args
+
+def load_doc_ids():
+    '''Loads in the documentIDs from partb1.csv, and returns the lists of
+    documentIDs and filenames.'''
+    df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
+    doc_ids = pd.Series(data = df.documentID.tolist(), \
+        index = df.filename.tolist())
+    documents = doc_ids.index.tolist()
+    return doc_ids, documents
+
+def find_matching_docs(doc_ids, documents, args):
+    '''Takes the document list, applies pre-processing techniques to each
+    document, tokenises the words, and returns a list containing document IDs
+    that match the keywords given as arguments to this program.'''
+    matched_doc_ids = []
+    for doc in documents:
+        f = read_document(doc)
+        f = apply_preprocessing(f)
+
+        tokens = nltk.word_tokenize(f)
+        # only add the document ID if all the keywords are in the token list
+        if all(keyword in tokens for keyword in args.keywords):
+            matched_doc_ids.append(doc_ids.get(doc))
+
+    return matched_doc_ids
+
+if __name__ == '__main__':
+    args = read_args()
+    doc_ids, documents = load_doc_ids()
+    os.chdir(os.getcwd() + '/cricket')
+    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
+    print(matched_doc_ids)
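The matching rule in find_matching_docs above is an all() membership test over the tokenised document; a minimal sketch with made-up text and keywords:

    import nltk

    text = 'the opening batsman scored a century'          # made-up document text
    tokens = nltk.word_tokenize(text)                       # needs the punkt models
    keywords = ['batsman', 'century']                       # made-up query
    print(all(keyword in tokens for keyword in keywords))   # -> True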
89  partb4.py

@@ -1,66 +1,37 @@
 import re
 import pandas as pd
 import os
 import nltk
 from nltk.stem.porter import *
 import argparse
+from partb2 import read_document, apply_preprocessing
+from partb3 import read_args, load_doc_ids
 
-def read_document(path):
-    '''Reads a file when provided with its path, and returns a string
-    containing the lines of the file.'''
-    file_given = open(path)
-    f = ""
-    for line in file_given:
-        f += line + " "
-    file_given.close()
-    return f
-
-def apply_preprocessing(f):
-    '''Removes non-alphabetic characters, replaces all whitespace characters
-    with a single whitespace, and changes all uppercase characters to
-    lowercase'''
-    f = re.sub(r'[^a-zA-Z\s]', r'', f)
-    f = re.sub(r'\s+', r' ', f)
-    f = f.lower()
-    return f
-
-# parse input arguments
-parser = argparse.ArgumentParser()
-parser.add_argument('keywords', nargs = '+', help = 'keywords to search \
-    for (1-5 keywords accepted)')
-args = parser.parse_args()
-if len(args.keywords) > 5:
-    print("Too many keywords.")
-    quit()
-
-# load document IDs from csv
-df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
-doc_ids = pd.Series(data = df.documentID.tolist(), \
-    index = df.filename.tolist())
-documents = doc_ids.index.tolist()
-matched_doc_ids = []
-
-# change directory to get cricket data
-os.chdir(os.getcwd() + '/cricket')
-
-# search through each document for the keywords
-porter_stemmer = PorterStemmer()
-
-for doc in documents:
-    f = read_document(doc)
-    f = apply_preprocessing(f)
-
-    # tokenise the document, remove stop words
-    word_list = nltk.word_tokenize(f)
-
-    # use the Porter stemmer to add stem words to the word list
-    for word in word_list:
-        stemmed_word = porter_stemmer.stem(word)
-        if stemmed_word not in word_list:
-            word_list.append(stemmed_word)
-
-    # add document ID if all keywords are in this new word list
-    if all(keyword in word_list for keyword in args.keywords):
-        matched_doc_ids.append(doc_ids.get(doc))
-
-print(matched_doc_ids)
+def find_matching_docs(doc_ids, documents, args):
+    '''Takes the document list, applies pre-processing techniques to each
+    document, tokenises the words, and returns a list containing document IDs
+    that match the keywords given as arguments to this program.'''
+    matched_doc_ids = []
+    porter_stemmer = PorterStemmer()
+    for doc in documents:
+        f = read_document(doc)
+        f = apply_preprocessing(f)
+
+        # tokenise the document
+        word_list = nltk.word_tokenize(f)
+
+        # use the Porter stemmer to add stem words to the word list
+        for word in word_list:
+            stemmed_word = porter_stemmer.stem(word)
+            if stemmed_word not in word_list:
+                word_list.append(stemmed_word)
+
+        # add document ID if all keywords are in this new word list
+        if all(keyword in word_list for keyword in args.keywords):
+            matched_doc_ids.append(doc_ids.get(doc))
+
+    return matched_doc_ids
+
+if __name__ == '__main__':
+    args = read_args()
+    doc_ids, documents = load_doc_ids()
+    os.chdir(os.getcwd() + '/cricket')
+    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
+    print(matched_doc_ids)
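partb4.py grows the token list with Porter stems so that a keyword such as 'bat' can match a document containing 'batting'; a quick sketch of the stemmer's behaviour:

    from nltk.stem.porter import PorterStemmer

    porter_stemmer = PorterStemmer()
    for word in ['running', 'bowling', 'batting']:
        print(word, '->', porter_stemmer.stem(word))
    # running -> run, bowling -> bowl, batting -> bat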
111  partb5.py

@@ -1,6 +1,111 @@
 ## Part B Task 5
+import re
+import os
+import sys
+import pandas as pd
+import nltk
+import argparse
+from math import sqrt
+from numpy.linalg import norm
+from numpy import dot
+from nltk.corpus import stopwords
+from nltk.stem.porter import *
+from sklearn.feature_extraction.text import TfidfTransformer
+from partb2 import read_document, apply_preprocessing
+from partb3 import read_args
+
+def cosine_similarity(x1, x2):
+    '''Calculates the cosine similarity between two vectors. Equal to the
+    cosine definition using the dot product.'''
+    return dot(x1, x2) / (norm(x1) * norm(x2))
+
+args = read_args()
+
+# load document IDs from csv
+df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
+doc_ids = pd.Series(data = df.documentID.tolist(), \
+    index = df.filename.tolist())
+documents = doc_ids.index.tolist()
+
+# change directory to get cricket data
+os.chdir(os.getcwd() + '/cricket')
+
+# build the corpus
+corpus = []
+porter_stemmer = PorterStemmer()
+stop_words = set(stopwords.words('english'))
+
+for doc in documents:
+    f = read_document(doc)
+    f = apply_preprocessing(f)
+
+    # tokenise the document, remove stop words
+    word_list = nltk.word_tokenize(f)
+    word_list = [w for w in word_list if not w in stop_words]
+
+    # add all words and their stems to the corpus
+    for word in word_list:
+        stemmed_word = porter_stemmer.stem(word)
+
+        if stemmed_word not in corpus:
+            corpus.append(word)
+
+# build the term counts
+term_counts = []
+for doc in documents:
+    curr_term_count = []
+    f = read_document(doc)
+    f = apply_preprocessing(f)
+
+    # tokenise the document, remove stop words
+    word_list = nltk.word_tokenize(f)
+    word_list = [w for w in word_list if not w in stop_words]
+
+    # build frequency dictionary of stemmed words
+    wordDict = {}
+    for word in word_list:
+        stemmed_word = porter_stemmer.stem(word)
+
+        if stemmed_word in wordDict:
+            wordDict[stemmed_word] += 1
+        else:
+            wordDict[stemmed_word] = 1
+
+    # fill in the current count of terms, then add to the overall list
+    for word in corpus:
+        if word in wordDict.keys():
+            curr_term_count.append(wordDict[word])
+        else:
+            curr_term_count.append(0)
+
+    term_counts.append(curr_term_count)
+
+# calculate the tf-idf scores
+transformer = TfidfTransformer()
+tfidf = transformer.fit_transform(term_counts)
+doc_tfidf = tfidf.toarray()
+
+# construct the query unit vector
+query_vector = []
+for word in corpus:
+    if word in args.keywords:
+        # this assumes that keywords are unique
+        # and aren't entered more than once
+        query_vector.append(1)
+    else:
+        query_vector.append(0)
+query_unit_vector = [x / norm(query_vector) for x in query_vector]
+
+similarities = [cosine_similarity(query_unit_vector, doc_tfidf[d_id]) for \
+    d_id in range(doc_tfidf.shape[0])]
+
+# this holds the similarities with their respective document IDs
+sim_doc_ids = df
+sim_doc_ids.insert(1, 'similarity_scores', similarities)
+sim_doc_ids.sort_values(by = 'similarity_scores', ascending = False, \
+    inplace = True)
+sorted_doc_ids = sim_doc_ids.documentID.tolist()
+sorted_similarities = sim_doc_ids.similarity_scores.tolist()
+
+# print documentID and scores, sorted by scores
+print('documentID | score')
+for i in range(len(sorted_doc_ids)):
+    if sorted_similarities[i] > 0:
+        print(f"{sorted_doc_ids[i]:10} | {sorted_similarities[i]:.4f}")
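The scoring above reduces to tf-idf document vectors compared against a unit query vector with cosine similarity; a minimal sketch over a hypothetical 3-term corpus:

    from numpy import dot
    from numpy.linalg import norm
    from sklearn.feature_extraction.text import TfidfTransformer

    def cosine_similarity(x1, x2):
        return dot(x1, x2) / (norm(x1) * norm(x2))

    # hypothetical counts for two documents over the corpus ['bat', 'bowl', 'wicket']
    term_counts = [[2, 0, 1],
                   [0, 3, 1]]
    doc_tfidf = TfidfTransformer().fit_transform(term_counts).toarray()

    # query on the first term only -> unit vector [1, 0, 0]
    query_unit_vector = [1, 0, 0]

    for d_id in range(doc_tfidf.shape[0]):
        print(d_id, cosine_similarity(query_unit_vector, doc_tfidf[d_id]))
    # document 0 scores above zero, document 1 scores zero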
BIN  scatter-a.png
Binary file not shown.  (Before: 86 KiB, After: 86 KiB)
BIN  scatter-b.png
Binary file not shown.  (Before: 94 KiB, After: 94 KiB)