partb5 complete, refactored code for cleaner design

Rory Healy 2021-04-15 07:47:29 +10:00
parent 8cb005de77
commit a171eb1ca1
11 changed files with 230 additions and 139 deletions


@@ -12,9 +12,12 @@ Using the cricket dataset from the LMS, this part of the project deals with buil
## Dependencies:
- pandas >= 1.2.2
- matplotlib >= 3.3.2
- numpy >= 1.19.2
Python version used: 3.9.4
- pandas >= 1.2.4
- matplotlib >= 3.4.1
- numpy >= 1.20.2
- regex >= 2021.4.4
- scikit-learn >= 0.24.1
- nltk >= 3.6.1
- punkt model needed
- punkt model and Porter stemmer needed
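A minimal setup sketch (not part of this commit) for fetching the NLTK resources listed above, assuming the standard nltk downloader:

import nltk
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')          # tokeniser model used by nltk.word_tokenize
nltk.download('stopwords')      # stop-word list used in partb5.py
stemmer = PorterStemmer()       # Porter stemmer used in partb4.py and partb5.py
print(stemmer.stem('bowling'))  # prints 'bowl'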

Binary file not shown.


@@ -6,18 +6,22 @@ parser = argparse.ArgumentParser()
parser.add_argument('path_to_csv', help = 'path to the csv file')
args = parser.parse_args()
all_covid_data = pd.read_csv('data/owid-covid-data.csv', encoding = 'ISO-8859-1')
all_covid_data = pd.read_csv('data/owid-covid-data.csv', \
encoding = 'ISO-8859-1')
# filter out data past 2020
all_covid_data = all_covid_data[(all_covid_data['date'] >= '2020-01-01') & (all_covid_data['date'] <= '2020-12-31')]
all_covid_data = all_covid_data[(all_covid_data['date'] >= '2020-01-01') \
& (all_covid_data['date'] <= '2020-12-31')]
all_covid_data.date = pd.to_datetime(all_covid_data.date)
# create groupby objects and sum new cases/deaths by month
new_cases = all_covid_data.loc[:, ['location', 'date', 'new_cases']]
new_cases_grouped = new_cases.groupby([new_cases.date.dt.month, new_cases.location]).new_cases.sum()
new_cases_grouped = new_cases.groupby([new_cases.date.dt.month, \
new_cases.location]).new_cases.sum()
new_deaths = all_covid_data.loc[:, ['location', 'date', 'new_deaths']]
new_deaths_grouped = new_deaths.groupby([new_deaths.date.dt.month, new_deaths.location]).new_deaths.sum()
new_deaths_grouped = new_deaths.groupby([new_deaths.date.dt.month, \
new_deaths.location]).new_deaths.sum()
# convert multi-indexed series to dataframe
new_cases_grouped = new_cases_grouped.to_frame()
@@ -30,10 +34,13 @@ new_cases_grouped.sort_values(by = ['location', 'date'], inplace = True)
new_deaths_grouped.sort_values(by = ['location', 'date'], inplace = True)
# merge new_deaths_grouped and new_cases_grouped
aggregated_data = new_cases_grouped.merge(new_deaths_grouped, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
aggregated_data = new_cases_grouped.merge(new_deaths_grouped, \
how = 'outer', left_on = ['location', 'date'], \
right_on = ['location', 'date'])
# filter out all entries that aren't at the end of the month
all_covid_data['end_of_month'] = pd.to_datetime(all_covid_data['date']).dt.is_month_end
all_covid_data['end_of_month'] = \
pd.to_datetime(all_covid_data['date']).dt.is_month_end
all_covid_data = all_covid_data.loc[all_covid_data.end_of_month, :]
# extract monthly total cases and total deaths
@@ -44,14 +51,19 @@ total_deaths = all_covid_data.loc[:, ['location', 'date', 'total_deaths']]
total_deaths.date = total_deaths.date.dt.month
# merge total_deaths and total_cases into aggregated_data
aggregated_data = aggregated_data.merge(total_cases, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
aggregated_data = aggregated_data.merge(total_deaths, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
aggregated_data = aggregated_data.merge(total_cases, how = 'outer', \
left_on = ['location', 'date'], right_on = ['location', 'date'])
aggregated_data = aggregated_data.merge(total_deaths, how = 'outer', \
left_on = ['location', 'date'], right_on = ['location', 'date'])
# compute case fatality rate for each month
aggregated_data['case_fatality_rate'] = (aggregated_data['new_deaths'] / aggregated_data['new_cases'])
aggregated_data['case_fatality_rate'] = \
(aggregated_data['new_deaths'] / aggregated_data['new_cases'])
# format aggregated_data and output results
aggregated_data = aggregated_data.reindex(columns = ['location', 'date', 'case_fatality_rate', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths'])
aggregated_data = aggregated_data.reindex(columns = ['location', 'date', \
'case_fatality_rate', 'total_cases', 'new_cases', 'total_deaths', \
'new_deaths'])
aggregated_data.rename(columns = {'date': 'month'}, inplace = True)
aggregated_data.set_index(['location', 'month'], inplace = True)
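A toy illustration (hypothetical locations and counts, not OWID data) of the month-wise groupby/sum pattern used in this file:

import pandas as pd

toy = pd.DataFrame({'location': ['A', 'A', 'B'],
                    'date': pd.to_datetime(['2020-01-05', '2020-01-20', '2020-02-01']),
                    'new_cases': [3, 4, 5]})
# group by calendar month and location, then sum new cases per month
monthly = toy.groupby([toy.date.dt.month, toy.location]).new_cases.sum()
print(monthly.to_frame())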


@@ -9,7 +9,8 @@ parser.add_argument('scatter_a', help = 'output path of figure 1')
parser.add_argument('scatter_b', help = 'output path of figure 2')
args = parser.parse_args()
all_covid_data = pd.read_csv('data/owid-covid-data.csv', encoding = 'ISO-8859-1')
all_covid_data = pd.read_csv('data/owid-covid-data.csv', \
encoding = 'ISO-8859-1')
# filter out data - only need 2020-12-31
all_covid_data = all_covid_data[(all_covid_data['date'] == '2020-12-31')]
@@ -21,13 +22,12 @@ total_cases.set_index(['location'], inplace = True)
total_deaths = all_covid_data.loc[:, ['location', 'total_deaths']]
# merge total_cases and total_deaths
aggregated_data = total_cases.merge(total_deaths, how = 'inner', on = 'location')
aggregated_data = total_cases.merge(total_deaths, how = 'inner', \
on = 'location')
# compute case fatality rate for each country
aggregated_data['case_fatality_rate'] = (aggregated_data['total_deaths'] / aggregated_data['total_cases'])
# format aggregated_data
aggregated_data = aggregated_data.reindex(columns = ['location', 'case_fatality_rate', 'total_deaths', 'total_cases'])
aggregated_data['case_fatality_rate'] = \
(aggregated_data['total_deaths'] / aggregated_data['total_cases'])
# extract case fatality rate from aggregated data
case_fatality_rate = aggregated_data.loc[:, ['case_fatality_rate']]


@@ -13,10 +13,9 @@ pattern = r'[A-Z]{4}-\d{3}[a-zA-Z]?'
os.chdir(os.getcwd() + '/cricket')
# open every file, search each line for the document ID, add it to the list
document_ids = []
filenames = []
# open every file, search each line for the document ID, add it to the list
for filename in os.listdir():
filenames.append(filename)
f = open(filename)


@@ -20,11 +20,16 @@ def apply_preprocessing(f):
f = f.lower()
return f
# parse input arguments
def read_args():
'''Creates an argparse ArgumentParser to read the command line
arguments.'''
parser = argparse.ArgumentParser()
parser.add_argument('path_to_file', help = 'path to document')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = read_args()
f = read_document(args.path_to_file)
f = apply_preprocessing(f)
print(f)


@@ -1,47 +1,36 @@
import re
import pandas as pd
import nltk
import os
import argparse
from partb2 import read_document, apply_preprocessing
def read_document(path):
'''Reads a file when provided with its path, and returns a string
containing the lines of the file.'''
file_given = open(path)
f = ""
for line in file_given:
f += line + " "
file_given.close()
return f
def apply_preprocessing(f):
'''Removes non-alphabetic characters, replaces all whitespace characters
with a single whitespace, and changes all uppercase characters to
lowercase'''
f = re.sub(r'[^a-zA-Z\s]', r'', f)
f = re.sub(r'\s+', r' ', f)
f = f.lower()
return f
# parse input arguments
def read_args():
'''Creates an argparse ArgumentParser to read the command line
arguments.'''
parser = argparse.ArgumentParser()
parser.add_argument('keywords', nargs = '+', help = 'keywords to search for \
(1-5 keywords accepted)')
parser.add_argument('keywords', nargs = '+', \
help = 'keywords to search for (1-5 keywords accepted)')
args = parser.parse_args()
if len(args.keywords) > 5:
print("Too many keywords.")
quit()
# load document IDs from csv
return args
def load_doc_ids():
'''Loads in the documentIDs from partb1.csv, and returns the lists of
documentIDs and filenames.'''
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(), \
index = df.filename.tolist())
documents = doc_ids.index.tolist()
return doc_ids, documents
def find_matching_docs(doc_ids, documents, args):
'''Takes the document list, applies pre-processing techniques to each
document, tokenises the words, and returns a list containing document IDs
that match the keywords given as arguments to this program.'''
matched_doc_ids = []
os.chdir(os.getcwd() + '/cricket')
# search through each document for the keywords
for doc in documents:
f = read_document(doc)
f = apply_preprocessing(f)
@@ -51,4 +40,11 @@ for doc in documents:
if all(keyword in tokens for keyword in args.keywords):
matched_doc_ids.append(doc_ids.get(doc))
return matched_doc_ids
if __name__ == '__main__':
args = read_args()
doc_ids, documents = load_doc_ids()
os.chdir(os.getcwd() + '/cricket')
matched_doc_ids = find_matching_docs(doc_ids, documents, args)
print(matched_doc_ids)


@@ -1,56 +1,20 @@
import re
import pandas as pd
import os
import nltk
from nltk.stem.porter import *
import argparse
from partb2 import read_document, apply_preprocessing
from partb3 import read_args, load_doc_ids
def read_document(path):
'''Reads a file when provided with its path, and returns a string
containing the lines of the file.'''
file_given = open(path)
f = ""
for line in file_given:
f += line + " "
file_given.close()
return f
def apply_preprocessing(f):
'''Removes non-alphabetic characters, replaces all whitespace characters
with a single whitespace, and changes all uppercase characters to
lowercase'''
f = re.sub(r'[^a-zA-Z\s]', r'', f)
f = re.sub(r'\s+', r' ', f)
f = f.lower()
return f
# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('keywords', nargs = '+', help = 'keywords to search \
for (1-5 keywords accepted)')
args = parser.parse_args()
if len(args.keywords) > 5:
print("Too many keywords.")
quit()
# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(), \
index = df.filename.tolist())
documents = doc_ids.index.tolist()
def find_matching_docs(doc_ids, documents, args):
'''Takes the document list, applies pre-processing techniques to each
document, tokenises the words, and returns a list containing document IDs
that match the keywords given as arguments to this program.'''
matched_doc_ids = []
# change directory to get cricket data
os.chdir(os.getcwd() + '/cricket')
# search through each document for the keywords
porter_stemmer = PorterStemmer()
for doc in documents:
f = read_document(doc)
f = apply_preprocessing(f)
# tokenise the document, remove stop words
# tokenise the document
word_list = nltk.word_tokenize(f)
# use the Porter stemmer to add stem words to the word list
@@ -63,4 +27,11 @@ for doc in documents:
if all(keyword in word_list for keyword in args.keywords):
matched_doc_ids.append(doc_ids.get(doc))
return matched_doc_ids
if __name__ == '__main__':
args = read_args()
doc_ids, documents = load_doc_ids()
os.chdir(os.getcwd() + '/cricket')
matched_doc_ids = find_matching_docs(doc_ids, documents, args)
print(matched_doc_ids)
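A toy illustration (hypothetical sentence) of the stemming step in this file, where Porter stems are appended alongside the original tokens:

import nltk
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
word_list = nltk.word_tokenize('the batsmen were bowling quickly')
word_list += [porter_stemmer.stem(w) for w in word_list]  # add stem words to the word list
print(word_list)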

partb5.py

@@ -1,6 +1,111 @@
## Part B Task 5
import re
import os
import sys
import pandas as pd
import nltk
import argparse
from math import sqrt
from numpy.linalg import norm
from numpy import dot
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfTransformer
from partb2 import read_document, apply_preprocessing
from partb3 import read_args
def cosine_similarity(x1, x2):
'''Calculates the cosine similarity between two vectors. Equal to the
cosine definition using the dot product.'''
return dot(x1, x2) / (norm(x1) * norm(x2))
args = read_args()
# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(), \
index = df.filename.tolist())
documents = doc_ids.index.tolist()
# change directory to get cricket data
os.chdir(os.getcwd() + '/cricket')
# build the corpus
corpus = []
porter_stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
for doc in documents:
f = read_document(doc)
f = apply_preprocessing(f)
# tokenise the document, remove stop words
word_list = nltk.word_tokenize(f)
word_list = [w for w in word_list if not w in stop_words]
# add all words and their stems to the corpus
for word in word_list:
stemmed_word = porter_stemmer.stem(word)
if stemmed_word not in corpus:
corpus.append(word)
# build the term counts
term_counts = []
for doc in documents:
curr_term_count = []
f = read_document(doc)
f = apply_preprocessing(f)
# tokenise the document, remove stop words
word_list = nltk.word_tokenize(f)
word_list = [w for w in word_list if not w in stop_words]
# build frequency dictionary of stemmed words
wordDict = {}
for word in word_list:
stemmed_word = porter_stemmer.stem(word)
if stemmed_word in wordDict:
wordDict[stemmed_word] += 1
else:
wordDict[stemmed_word] = 1
# fill in the current count of terms, then add to the overall list
for word in corpus:
if word in wordDict.keys():
curr_term_count.append(wordDict[word])
else:
curr_term_count.append(0)
term_counts.append(curr_term_count)
# calculate the tf-idf scores
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(term_counts)
doc_tfidf = tfidf.toarray()
# construct the query unit vector
query_vector = []
for word in corpus:
if word in args.keywords:
# this assumes that keywords are unique
# and aren't entered more than once
query_vector.append(1)
else:
query_vector.append(0)
query_unit_vector = [x / norm(query_vector) for x in query_vector]
similarities = [cosine_similarity(query_unit_vector, doc_tfidf[d_id]) for \
d_id in range(doc_tfidf.shape[0])]
# this holds the similarities with their respective document IDs
sim_doc_ids = df
sim_doc_ids.insert(1, 'similarity_scores', similarities)
sim_doc_ids.sort_values(by = 'similarity_scores', ascending = False, \
inplace = True)
sorted_doc_ids = sim_doc_ids.documentID.tolist()
sorted_similarities = sim_doc_ids.similarity_scores.tolist()
# print documentID and scores, sorted by scores
print('documentID | score')
for i in range(len(sorted_doc_ids)):
if sorted_similarities[i] > 0:
print(f"{sorted_doc_ids[i]:10} | {sorted_similarities[i]:.4f}")

Binary file not shown (size before: 86 KiB, after: 86 KiB).

Binary file not shown (size before: 94 KiB, after: 94 KiB).