From 8cb005de77de445447aea9c11897e9cae547ee99 Mon Sep 17 00:00:00 2001
From: Rory Healy
Date: Wed, 14 Apr 2021 23:42:57 +1000
Subject: [PATCH] partb4 completed, refactored code

---
 partb2.py | 36 +++++++++++++++++++------------
 partb3.py | 31 ++++++++++++++++-----------
 partb4.py | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 102 insertions(+), 29 deletions(-)

diff --git a/partb2.py b/partb2.py
index 140080c..94bec37 100644
--- a/partb2.py
+++ b/partb2.py
@@ -1,22 +1,30 @@
 import re
 import argparse
 
+def read_document(path):
+    '''Reads a file when provided with its path, and returns a string
+    containing the lines of the file.'''
+    file_given = open(path)
+    f = ""
+    for line in file_given:
+        f += line + " "
+    file_given.close()
+    return f
+
+def apply_preprocessing(f):
+    '''Removes non-alphabetic characters, replaces all whitespace characters
+    with a single whitespace, and changes all uppercase characters to
+    lowercase'''
+    f = re.sub(r'[^a-zA-Z\s]', r'', f)
+    f = re.sub(r'\s+', r' ', f)
+    f = f.lower()
+    return f
+
 # parse input arguments
 parser = argparse.ArgumentParser()
-parser.add_argument('path_to_file', help = 'path to the csv file')
+parser.add_argument('path_to_file', help = 'path to document')
 args = parser.parse_args()
 
-# open file, add all lines to a single string
-file_given = open(args.path_to_file)
-f = ""
-for line in file_given:
-    f += line + " "
-file_given.close()
-
-# remove non-alphabetic characters, replace all whitespace characters with a
-# single whitespace, and change all uppercase characters to lowercase
-f = re.sub(r'[^a-zA-Z\s]', r'', f)
-f = re.sub(r'\s+', r' ', f)
-f = f.lower()
-
+f = read_document(args.path_to_file)
+f = apply_preprocessing(f)
 print(f)
diff --git a/partb3.py b/partb3.py
index fa187b4..8ee1eee 100644
--- a/partb3.py
+++ b/partb3.py
@@ -4,23 +4,29 @@ import nltk
 import os
 import argparse
 
+def read_document(path):
+    '''Reads a file when provided with its path, and returns a string
+    containing the lines of the file.'''
+    file_given = open(path)
+    f = ""
+    for line in file_given:
+        f += line + " "
+    file_given.close()
+    return f
+
 def apply_preprocessing(f):
-    '''Applies preprocessing from partb2 to a string f.'''
+    '''Removes non-alphabetic characters, replaces all whitespace characters
+    with a single whitespace, and changes all uppercase characters to
+    lowercase'''
     f = re.sub(r'[^a-zA-Z\s]', r'', f)
     f = re.sub(r'\s+', r' ', f)
     f = f.lower()
     return f
 
-def doc_to_str(doc):
-    '''Returns a string with the contents of a .txt file'''
-    f = ""
-    for line in doc:
-        f += line + " "
-    return f
-
 # parse input arguments
 parser = argparse.ArgumentParser()
-parser.add_argument('keywords', nargs = '+', help = 'keywords to search for (1-5 keywords accepted)')
+parser.add_argument('keywords', nargs = '+', help = 'keywords to search for \
+    (1-5 keywords accepted)')
 args = parser.parse_args()
 if len(args.keywords) > 5:
     print("Too many keywords.")
@@ -28,7 +34,8 @@ if len(args.keywords) > 5:
 
 # load document IDs from csv
 df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
-doc_ids = pd.Series(data = df.documentID.tolist(), index = df.filename.tolist())
+doc_ids = pd.Series(data = df.documentID.tolist(), \
+    index = df.filename.tolist())
 documents = doc_ids.index.tolist()
 matched_doc_ids = []
 
@@ -36,9 +43,7 @@ os.chdir(os.getcwd() + '/cricket')
 
 # search through each document for the keywords
 for doc in documents:
-    curr = open(doc)
-    f = doc_to_str(curr)
-    curr.close()
+    f = read_document(doc)
     f = apply_preprocessing(f)
 
     tokens = nltk.word_tokenize(f)
diff --git a/partb4.py b/partb4.py
index 8dcac8e..aa5f779 100644
--- a/partb4.py
+++ b/partb4.py
@@ -1,6 +1,66 @@
-## Part B Task 4
 import re
 import pandas as pd
 import os
-import sys
 import nltk
+from nltk.stem.porter import PorterStemmer
+import argparse
+
+def read_document(path):
+    '''Reads a file when provided with its path, and returns a string
+    containing the lines of the file.'''
+    file_given = open(path)
+    f = ""
+    for line in file_given:
+        f += line + " "
+    file_given.close()
+    return f
+
+def apply_preprocessing(f):
+    '''Removes non-alphabetic characters, replaces all whitespace characters
+    with a single whitespace, and changes all uppercase characters to
+    lowercase'''
+    f = re.sub(r'[^a-zA-Z\s]', r'', f)
+    f = re.sub(r'\s+', r' ', f)
+    f = f.lower()
+    return f
+
+# parse input arguments
+parser = argparse.ArgumentParser()
+parser.add_argument('keywords', nargs = '+', help = 'keywords to search \
+    for (1-5 keywords accepted)')
+args = parser.parse_args()
+if len(args.keywords) > 5:
+    print("Too many keywords.")
+    quit()
+
+# load document IDs from csv
+df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
+doc_ids = pd.Series(data = df.documentID.tolist(), \
+    index = df.filename.tolist())
+documents = doc_ids.index.tolist()
+matched_doc_ids = []
+
+# change directory to get cricket data
+os.chdir(os.getcwd() + '/cricket')
+
+# search through each document for the keywords
+porter_stemmer = PorterStemmer()
+
+for doc in documents:
+    f = read_document(doc)
+    f = apply_preprocessing(f)
+
+    # tokenise the document
+    word_list = nltk.word_tokenize(f)
+
+    # use the Porter stemmer to add stem words (iterating over a copy)
+    for word in list(word_list):
+        stemmed_word = porter_stemmer.stem(word)
+        if stemmed_word not in word_list:
+            word_list.append(stemmed_word)
+
+    # add document ID if all keywords are in this new word list
+    if all(keyword in word_list for keyword in args.keywords):
+        matched_doc_ids.append(doc_ids.get(doc))
+
+print(matched_doc_ids)