diff --git a/README.md b/README.md
index da3972f..91724cf 100644
--- a/README.md
+++ b/README.md
@@ -12,9 +12,12 @@ Using the cricket dataset from the LMS, this part of the project deals with buil
 ## Dependencies:
 
- - pandas >= 1.2.2
- - matplotlib >= 3.3.2
- - numpy >= 1.19.2
+Python version used: 3.9.4
+
+ - pandas >= 1.2.4
+ - matplotlib >= 3.4.1
+ - numpy >= 1.20.2
  - regex >= 2021.4.4
+ - scikit-learn >= 0.24.1
  - nltk >= 3.6.1
-   - punkt model needed
+   - punkt model, stopwords corpus and Porter stemmer needed
 
diff --git a/owid-covid-2020-visual-analysis.pdf b/owid-covid-2020-visual-analysis.pdf
new file mode 100644
index 0000000..6cb8ee3
Binary files /dev/null and b/owid-covid-2020-visual-analysis.pdf differ
diff --git a/parta1.py b/parta1.py
index 4e27bcd..b567cc6 100644
--- a/parta1.py
+++ b/parta1.py
@@ -6,18 +6,22 @@ parser = argparse.ArgumentParser()
 parser.add_argument('path_to_csv', help = 'path to the csv file')
 args = parser.parse_args()
 
-all_covid_data = pd.read_csv('data/owid-covid-data.csv', encoding = 'ISO-8859-1')
+all_covid_data = pd.read_csv('data/owid-covid-data.csv', \
+    encoding = 'ISO-8859-1')
 
 # filter out data past 2020
-all_covid_data = all_covid_data[(all_covid_data['date'] >= '2020-01-01') & (all_covid_data['date'] <= '2020-12-31')]
+all_covid_data = all_covid_data[(all_covid_data['date'] >= '2020-01-01') \
+    & (all_covid_data['date'] <= '2020-12-31')]
 all_covid_data.date = pd.to_datetime(all_covid_data.date)
 
 # create groupby objects and sum new cases/deaths by month
 new_cases = all_covid_data.loc[:, ['location', 'date', 'new_cases']]
-new_cases_grouped = new_cases.groupby([new_cases.date.dt.month, new_cases.location]).new_cases.sum()
+new_cases_grouped = new_cases.groupby([new_cases.date.dt.month, \
+    new_cases.location]).new_cases.sum()
 
 new_deaths = all_covid_data.loc[:, ['location', 'date', 'new_deaths']]
-new_deaths_grouped = new_deaths.groupby([new_deaths.date.dt.month, new_deaths.location]).new_deaths.sum()
+new_deaths_grouped = new_deaths.groupby([new_deaths.date.dt.month, \
+    new_deaths.location]).new_deaths.sum()
 
 # convert multi-indexed series to dataframe
 new_cases_grouped = new_cases_grouped.to_frame()
@@ -30,10 +34,13 @@ new_cases_grouped.sort_values(by = ['location', 'date'], inplace = True)
 new_deaths_grouped.sort_values(by = ['location', 'date'], inplace = True)
 
 # merge new_deaths_grouped and new_cases_grouped
-aggregated_data = new_cases_grouped.merge(new_deaths_grouped, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
+aggregated_data = new_cases_grouped.merge(new_deaths_grouped, \
+    how = 'outer', left_on = ['location', 'date'], \
+    right_on = ['location', 'date'])
 
 # filter out all entries that aren't at the end of the month
-all_covid_data['end_of_month'] = pd.to_datetime(all_covid_data['date']).dt.is_month_end
+all_covid_data['end_of_month'] = \
+    pd.to_datetime(all_covid_data['date']).dt.is_month_end
 all_covid_data = all_covid_data.loc[all_covid_data.end_of_month, :]
 
 # extract monthly total cases and total deaths
@@ -44,14 +51,19 @@
 total_deaths = all_covid_data.loc[:, ['location', 'date', 'total_deaths']]
 total_deaths.date = total_deaths.date.dt.month
 
 # merge total_deaths and total_cases into aggregated_data
-aggregated_data = aggregated_data.merge(total_cases, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
-aggregated_data = aggregated_data.merge(total_deaths, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
+aggregated_data = aggregated_data.merge(total_cases, how = 'outer', \
+    left_on = ['location', 'date'], right_on = ['location', 'date'])
+aggregated_data = aggregated_data.merge(total_deaths, how = 'outer', \
+    left_on = ['location', 'date'], right_on = ['location', 'date'])
 
 # compute case fatality rate for each month
-aggregated_data['case_fatality_rate'] = (aggregated_data['new_deaths'] / aggregated_data['new_cases'])
+aggregated_data['case_fatality_rate'] = \
+    (aggregated_data['new_deaths'] / aggregated_data['new_cases'])
 
 # format aggregated_data and output results
-aggregated_data = aggregated_data.reindex(columns = ['location', 'date', 'case_fatality_rate', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths'])
+aggregated_data = aggregated_data.reindex(columns = ['location', 'date', \
+    'case_fatality_rate', 'total_cases', 'new_cases', 'total_deaths', \
+    'new_deaths'])
 aggregated_data.rename(columns = {'date': 'month'}, inplace = True)
 aggregated_data.set_index(['location', 'month'], inplace = True)
diff --git a/parta2.py b/parta2.py
index 896d572..fab5513 100644
--- a/parta2.py
+++ b/parta2.py
@@ -9,7 +9,8 @@ parser.add_argument('scatter_a', help = 'output path of figure 1')
 parser.add_argument('scatter_b', help = 'output path of figure 2')
 args = parser.parse_args()
 
-all_covid_data = pd.read_csv('data/owid-covid-data.csv', encoding = 'ISO-8859-1')
+all_covid_data = pd.read_csv('data/owid-covid-data.csv', \
+    encoding = 'ISO-8859-1')
 
 # filter out data - only need 2020-12-31
 all_covid_data = all_covid_data[(all_covid_data['date'] == '2020-12-31')]
@@ -21,13 +22,12 @@ total_cases.set_index(['location'], inplace = True)
 total_deaths = all_covid_data.loc[:, ['location', 'total_deaths']]
 
 # merge total_cases and total_deaths
-aggregated_data = total_cases.merge(total_deaths, how = 'inner', on = 'location')
+aggregated_data = total_cases.merge(total_deaths, how = 'inner', \
+    on = 'location')
 
 # compute case fatality rate for each country
-aggregated_data['case_fatality_rate'] = (aggregated_data['total_deaths'] / aggregated_data['total_cases'])
-
-# format aggregated_data
-aggregated_data = aggregated_data.reindex(columns = ['location', 'case_fatality_rate', 'total_deaths', 'total_cases'])
+aggregated_data['case_fatality_rate'] = \
+    (aggregated_data['total_deaths'] / aggregated_data['total_cases'])
 
 # extract case fatality rate from aggregated data
 case_fatality_rate = aggregated_data.loc[:, ['case_fatality_rate']]
diff --git a/partb1.py b/partb1.py
index b90e283..120c091 100644
--- a/partb1.py
+++ b/partb1.py
@@ -13,10 +13,9 @@ pattern = r'[A-Z]{4}-\d{3}[a-zA-Z]?'
 
 os.chdir(os.getcwd() + '/cricket')
 
+# open every file, search each line for the document ID, add it to the list
 document_ids = []
 filenames = []
-
-# open every file, search each line for the document ID, add it to the list
 for filename in os.listdir():
     filenames.append(filename)
     f = open(filename)
diff --git a/partb2.py b/partb2.py
index 94bec37..cf9f371 100644
--- a/partb2.py
+++ b/partb2.py
@@ -20,11 +20,16 @@ def apply_preprocessing(f):
     f = f.lower()
     return f
 
-# parse input arguments
-parser = argparse.ArgumentParser()
-parser.add_argument('path_to_file', help = 'path to document')
-args = parser.parse_args()
+def read_args():
+    '''Creates an argparse ArgumentParser to read the command line
+    arguments.'''
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path_to_file', help = 'path to document')
+    args = parser.parse_args()
+    return args
 
-f = read_document(args.path_to_file)
-f = apply_preprocessing(f)
-print(f)
+if __name__ == '__main__':
+    args = read_args()
+    f = read_document(args.path_to_file)
+    f = apply_preprocessing(f)
+    print(f)
diff --git a/partb3.py b/partb3.py
index 8ee1eee..131ecd8 100644
--- a/partb3.py
+++ b/partb3.py
@@ -1,54 +1,50 @@
-import re
 import pandas as pd
 import nltk
 import os
 import argparse
+from partb2 import read_document, apply_preprocessing
 
-def read_document(path):
-    '''Reads a file when provided with its path, and returns a string
-    containing the lines of the file.'''
-    file_given = open(path)
-    f = ""
-    for line in file_given:
-        f += line + " "
-    file_given.close()
-    return f
-
-def apply_preprocessing(f):
-    '''Removes non-alphabetic characters, replaces all whitespace characters
-    with a single whitespace, and changes all uppercase characters to
-    lowercase'''
-    f = re.sub(r'[^a-zA-Z\s]', r'', f)
-    f = re.sub(r'\s+', r' ', f)
-    f = f.lower()
-    return f
-
-# parse input arguments
-parser = argparse.ArgumentParser()
-parser.add_argument('keywords', nargs = '+', help = 'keywords to search for \
-    (1-5 keywords accepted)')
-args = parser.parse_args()
-if len(args.keywords) > 5:
-    print("Too many keywords.")
-    quit()
-
-# load document IDs from csv
-df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
-doc_ids = pd.Series(data = df.documentID.tolist(), \
-    index = df.filename.tolist())
-documents = doc_ids.index.tolist()
-matched_doc_ids = []
-
-os.chdir(os.getcwd() + '/cricket')
-
-# search through each document for the keywords
-for doc in documents:
-    f = read_document(doc)
-    f = apply_preprocessing(f)
-
-    tokens = nltk.word_tokenize(f)
-    # only add the document ID if all the keywords are in the token list
-    if all(keyword in tokens for keyword in args.keywords):
-        matched_doc_ids.append(doc_ids.get(doc))
+def read_args():
+    '''Creates an argparse ArgumentParser to read the command line
+    arguments.'''
+    parser = argparse.ArgumentParser()
+    parser.add_argument('keywords', nargs = '+', \
+        help = 'keywords to search for (1-5 keywords accepted)')
+    args = parser.parse_args()
+    if len(args.keywords) > 5:
+        print("Too many keywords.")
+        quit()
 
-print(matched_doc_ids)
+    return args
+
+def load_doc_ids():
+    '''Loads in the documentIDs from partb1.csv, and returns the lists of
+    documentIDs and filenames.'''
+    df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
+    doc_ids = pd.Series(data = df.documentID.tolist(), \
+        index = df.filename.tolist())
+    documents = doc_ids.index.tolist()
+    return doc_ids, documents
+
+def find_matching_docs(doc_ids, documents, args):
+    '''Takes the document list, applies pre-processing techniques to each
+    document, tokenises the words, and returns a list containing document IDs
+    that match the keywords given as arguments to this program.'''
+    matched_doc_ids = []
+    for doc in documents:
+        f = read_document(doc)
+        f = apply_preprocessing(f)
+
+        tokens = nltk.word_tokenize(f)
+        # only add the document ID if all the keywords are in the token list
+        if all(keyword in tokens for keyword in args.keywords):
+            matched_doc_ids.append(doc_ids.get(doc))
+
+    return matched_doc_ids
+
+if __name__ == '__main__':
+    args = read_args()
+    doc_ids, documents = load_doc_ids()
+    os.chdir(os.getcwd() + '/cricket')
+    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
+    print(matched_doc_ids)
diff --git a/partb4.py b/partb4.py
index aa5f779..22d12a3 100644
--- a/partb4.py
+++ b/partb4.py
@@ -1,66 +1,37 @@
-import re
-import pandas as pd
 import os
 import nltk
 from nltk.stem.porter import *
-import argparse
+from partb2 import read_document, apply_preprocessing
+from partb3 import read_args, load_doc_ids
 
-def read_document(path):
-    '''Reads a file when provided with its path, and returns a string
-    containing the lines of the file.'''
-    file_given = open(path)
-    f = ""
-    for line in file_given:
-        f += line + " "
-    file_given.close()
-    return f
+def find_matching_docs(doc_ids, documents, args):
+    '''Takes the document list, applies pre-processing techniques to each
+    document, tokenises the words, and returns a list containing document IDs
+    that match the keywords given as arguments to this program.'''
+    matched_doc_ids = []
+    porter_stemmer = PorterStemmer()
+    for doc in documents:
+        f = read_document(doc)
+        f = apply_preprocessing(f)
 
-def apply_preprocessing(f):
-    '''Removes non-alphabetic characters, replaces all whitespace characters
-    with a single whitespace, and changes all uppercase characters to
-    lowercase'''
-    f = re.sub(r'[^a-zA-Z\s]', r'', f)
-    f = re.sub(r'\s+', r' ', f)
-    f = f.lower()
-    return f
-
-# parse input arguments
-parser = argparse.ArgumentParser()
-parser.add_argument('keywords', nargs = '+', help = 'keywords to search \
-    for (1-5 keywords accepted)')
-args = parser.parse_args()
-if len(args.keywords) > 5:
-    print("Too many keywords.")
-    quit()
-
-# load document IDs from csv
-df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
-doc_ids = pd.Series(data = df.documentID.tolist(), \
-    index = df.filename.tolist())
-documents = doc_ids.index.tolist()
-matched_doc_ids = []
-
-# change directory to get cricket data
-os.chdir(os.getcwd() + '/cricket')
-
-# search through each document for the keywords
-porter_stemmer = PorterStemmer()
-
-for doc in documents:
-    f = read_document(doc)
-    f = apply_preprocessing(f)
-
-    # tokenise the document, remove stop words
-    word_list = nltk.word_tokenize(f)
+        # tokenise the document
+        word_list = nltk.word_tokenize(f)
+
+        # use the Porter stemmer to add stem words to the word list
+        for word in word_list:
+            stemmed_word = porter_stemmer.stem(word)
+            if stemmed_word not in word_list:
+                word_list.append(stemmed_word)
+
+        # add document ID if all keywords are in this new word list
+        if all(keyword in word_list for keyword in args.keywords):
+            matched_doc_ids.append(doc_ids.get(doc))
 
-    # use the Porter stemmer to add stem words to the word list
-    for word in word_list:
-        stemmed_word = porter_stemmer.stem(word)
-        if stemmed_word not in word_list:
-            word_list.append(stemmed_word)
+    return matched_doc_ids
 
-    # add document ID if all keywords are in this new word list
-    if all(keyword in word_list for keyword in args.keywords):
-        matched_doc_ids.append(doc_ids.get(doc))
-
-print(matched_doc_ids)
+if __name__ == '__main__':
+    args = read_args()
+    doc_ids, documents = load_doc_ids()
+    os.chdir(os.getcwd() + '/cricket')
+    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
+    print(matched_doc_ids)
diff --git a/partb5.py b/partb5.py
index 6e760fd..66ae585 100644
--- a/partb5.py
+++ b/partb5.py
@@ -1,6 +1,111 @@
-## Part B Task 5
-import re
 import os
-import sys
 import pandas as pd
 import nltk
+import argparse
+from math import sqrt
+from numpy.linalg import norm
+from numpy import dot
+from nltk.corpus import stopwords
+from nltk.stem.porter import *
+from sklearn.feature_extraction.text import TfidfTransformer
+from partb2 import read_document, apply_preprocessing
+from partb3 import read_args
+
+def cosine_similarity(x1, x2):
+    '''Calculates the cosine similarity between two vectors. Equal to the
+    cosine definition using the dot product.'''
+    return dot(x1, x2) / (norm(x1) * norm(x2))
+
+args = read_args()
+
+# load document IDs from csv
+df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
+doc_ids = pd.Series(data = df.documentID.tolist(), \
+    index = df.filename.tolist())
+documents = doc_ids.index.tolist()
+
+# change directory to get cricket data
+os.chdir(os.getcwd() + '/cricket')
+
+# build the corpus
+corpus = []
+porter_stemmer = PorterStemmer()
+stop_words = set(stopwords.words('english'))
+
+for doc in documents:
+    f = read_document(doc)
+    f = apply_preprocessing(f)
+
+    # tokenise the document, remove stop words
+    word_list = nltk.word_tokenize(f)
+    word_list = [w for w in word_list if not w in stop_words]
+
+    # add the stem of each word to the corpus
+    for word in word_list:
+        stemmed_word = porter_stemmer.stem(word)
+
+        if stemmed_word not in corpus:
+            corpus.append(stemmed_word)
+
+# build the term counts
+term_counts = []
+for doc in documents:
+    curr_term_count = []
+    f = read_document(doc)
+    f = apply_preprocessing(f)
+
+    # tokenise the document, remove stop words
+    word_list = nltk.word_tokenize(f)
+    word_list = [w for w in word_list if not w in stop_words]
+
+    # build frequency dictionary of stemmed words
+    wordDict = {}
+    for word in word_list:
+        stemmed_word = porter_stemmer.stem(word)
+
+        if stemmed_word in wordDict:
+            wordDict[stemmed_word] += 1
+        else:
+            wordDict[stemmed_word] = 1
+
+    # fill in the current count of terms, then add to the overall list
+    for word in corpus:
+        if word in wordDict.keys():
+            curr_term_count.append(wordDict[word])
+        else:
+            curr_term_count.append(0)
+
+    term_counts.append(curr_term_count)
+
+# calculate the tf-idf scores
+transformer = TfidfTransformer()
+tfidf = transformer.fit_transform(term_counts)
+doc_tfidf = tfidf.toarray()
+
+# construct the query unit vector; stem the keywords so they match the corpus
+stemmed_keywords = [porter_stemmer.stem(k) for k in args.keywords]
+query_vector = []
+for word in corpus:
+    if word in stemmed_keywords:
+        # this assumes each keyword is unique (not entered more than once)
+        query_vector.append(1)
+    else:
+        query_vector.append(0)
+query_unit_vector = [x / norm(query_vector) for x in query_vector]
+
+similarities = [cosine_similarity(query_unit_vector, doc_tfidf[d_id]) for \
+    d_id in range(doc_tfidf.shape[0])]
+
+# this holds the similarities with their respective document IDs
+sim_doc_ids = df
+sim_doc_ids.insert(1, 'similarity_scores', similarities)
+sim_doc_ids.sort_values(by = 'similarity_scores', ascending = False, \
+    inplace = True)
+sorted_doc_ids = sim_doc_ids.documentID.tolist()
+sorted_similarities = sim_doc_ids.similarity_scores.tolist()
+
+# print documentID and scores, sorted by scores
+print('documentID | score')
+for i in range(len(sorted_doc_ids)):
+    if sorted_similarities[i] > 0:
+        print(f"{sorted_doc_ids[i]:10} | {sorted_similarities[i]:.4f}")
diff --git a/scatter-a.png b/scatter-a.png
index aae6d31..bca3678 100644
Binary files a/scatter-a.png and b/scatter-a.png differ
diff --git a/scatter-b.png b/scatter-b.png
index b8c63eb..aee03f1 100644
Binary files a/scatter-b.png and b/scatter-b.png differ
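
Note (not part of the patch): partb5.py builds the stemmed corpus and per-document term counts by hand and only then applies TfidfTransformer. The same ranking idea can be cross-checked with scikit-learn's TfidfVectorizer plus cosine_similarity, as in the minimal sketch below. This is an illustration under assumptions, not the script's actual pipeline: it skips the NLTK stemming and stop-word steps, and docs and query are made-up placeholders rather than files from the cricket dataset.

    # hedged sketch: rank documents against a keyword query with tf-idf
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    docs = ["pakistan won the test match", "rain delayed the county game"]  # placeholder documents
    query = "test match"                                                     # placeholder keywords

    vectoriser = TfidfVectorizer()
    doc_tfidf = vectoriser.fit_transform(docs)     # one tf-idf row per document
    query_tfidf = vectoriser.transform([query])    # reuses the fitted vocabulary and idf weights

    scores = cosine_similarity(query_tfidf, doc_tfidf)[0]
    ranking = sorted(enumerate(scores), key=lambda p: p[1], reverse=True)
    print(ranking)  # document 0, which contains both query terms, ranks first

Because the query is transformed with the same fitted vocabulary and idf weights as the documents, the resulting scores are directly comparable across rows, which is the property the manual query-unit-vector construction in partb5.py is also relying on.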