From 8cb005de77de445447aea9c11897e9cae547ee99 Mon Sep 17 00:00:00 2001
From: Rory Healy
Date: Wed, 14 Apr 2021 23:42:57 +1000
Subject: [PATCH] partb4 completed, refactored code

---
 partb2.py | 36 +++++++++++++++++++------------
 partb3.py | 31 ++++++++++++++++-----------
 partb4.py | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 102 insertions(+), 29 deletions(-)

diff --git a/partb2.py b/partb2.py
index 140080c..94bec37 100644
--- a/partb2.py
+++ b/partb2.py
@@ -1,22 +1,30 @@
 import re
 import argparse
 
+def read_document(path):
+    '''Reads a file when provided with its path, and returns a string
+    containing the lines of the file.'''
+    file_given = open(path)
+    f = ""
+    for line in file_given:
+        f += line + " "
+    file_given.close()
+    return f
+
+def apply_preprocessing(f):
+    '''Removes non-alphabetic characters, replaces all whitespace characters
+    with a single whitespace, and changes all uppercase characters to
+    lowercase'''
+    f = re.sub(r'[^a-zA-Z\s]', r'', f)
+    f = re.sub(r'\s+', r' ', f)
+    f = f.lower()
+    return f
+
 # parse input arguments
 parser = argparse.ArgumentParser()
-parser.add_argument('path_to_file', help = 'path to the csv file')
+parser.add_argument('path_to_file', help = 'path to document')
 args = parser.parse_args()
 
-# open file, add all lines to a single string
-file_given = open(args.path_to_file)
-f = ""
-for line in file_given:
-    f += line + " "
-file_given.close()
-
-# remove non-alphabetic characters, replace all whitespace characters with a
-# single whitespace, and change all uppercase characters to lowercase
-f = re.sub(r'[^a-zA-Z\s]', r'', f)
-f = re.sub(r'\s+', r' ', f)
-f = f.lower()
-
+f = read_document(args.path_to_file)
+f = apply_preprocessing(f)
 print(f)
diff --git a/partb3.py b/partb3.py
index fa187b4..8ee1eee 100644
--- a/partb3.py
+++ b/partb3.py
@@ -4,23 +4,29 @@ import nltk
 import os
 import argparse
 
+def read_document(path):
+    '''Reads a file when provided with its path, and returns a string
+    containing the lines of the file.'''
+    file_given = open(path)
+    f = ""
+    for line in file_given:
+        f += line + " "
+    file_given.close()
+    return f
+
 def apply_preprocessing(f):
-    '''Applies preprocessing from partb2 to a string f.'''
+    '''Removes non-alphabetic characters, replaces all whitespace characters
+    with a single whitespace, and changes all uppercase characters to
+    lowercase'''
     f = re.sub(r'[^a-zA-Z\s]', r'', f)
     f = re.sub(r'\s+', r' ', f)
     f = f.lower()
     return f
 
-def doc_to_str(doc):
-    '''Returns a string with the contents of a .txt file'''
-    f = ""
-    for line in doc:
-        f += line + " "
-    return f
-
 # parse input arguments
 parser = argparse.ArgumentParser()
-parser.add_argument('keywords', nargs = '+', help = 'keywords to search for (1-5 keywords accepted)')
+parser.add_argument('keywords', nargs = '+', help = 'keywords to search for \
+    (1-5 keywords accepted)')
 args = parser.parse_args()
 if len(args.keywords) > 5:
     print("Too many keywords.")
@@ -28,7 +34,8 @@ if len(args.keywords) > 5:
 
 # load document IDs from csv
 df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
-doc_ids = pd.Series(data = df.documentID.tolist(), index = df.filename.tolist())
+doc_ids = pd.Series(data = df.documentID.tolist(), \
+    index = df.filename.tolist())
 documents = doc_ids.index.tolist()
 matched_doc_ids = []
 
@@ -36,9 +43,7 @@ os.chdir(os.getcwd() + '/cricket')
 
 # search through each document for the keywords
 for doc in documents:
-    curr = open(doc)
-    f = doc_to_str(curr)
-    curr.close()
+    f = read_document(doc)
     f = apply_preprocessing(f)
 
     tokens = nltk.word_tokenize(f)
diff --git a/partb4.py b/partb4.py
index 8dcac8e..aa5f779 100644
--- a/partb4.py
+++ b/partb4.py
@@ -1,6 +1,66 @@
-## Part B Task 4
 import re
 import pandas as pd
 import os
-import sys
 import nltk
+from nltk.stem.porter import PorterStemmer
+import argparse
+
+def read_document(path):
+    '''Reads a file when provided with its path, and returns a string
+    containing the lines of the file.'''
+    file_given = open(path)
+    f = ""
+    for line in file_given:
+        f += line + " "
+    file_given.close()
+    return f
+
+def apply_preprocessing(f):
+    '''Removes non-alphabetic characters, replaces all whitespace characters
+    with a single whitespace, and changes all uppercase characters to
+    lowercase'''
+    f = re.sub(r'[^a-zA-Z\s]', r'', f)
+    f = re.sub(r'\s+', r' ', f)
+    f = f.lower()
+    return f
+
+# parse input arguments
+parser = argparse.ArgumentParser()
+parser.add_argument('keywords', nargs = '+', help = 'keywords to search \
+    for (1-5 keywords accepted)')
+args = parser.parse_args()
+if len(args.keywords) > 5:
+    print("Too many keywords.")
+    quit()
+
+# load document IDs from csv
+df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
+doc_ids = pd.Series(data = df.documentID.tolist(), \
+    index = df.filename.tolist())
+documents = doc_ids.index.tolist()
+matched_doc_ids = []
+
+# change directory to get cricket data
+os.chdir(os.getcwd() + '/cricket')
+
+# search through each document for the keywords
+porter_stemmer = PorterStemmer()
+
+for doc in documents:
+    f = read_document(doc)
+    f = apply_preprocessing(f)
+
+    # tokenise the document
+    word_list = nltk.word_tokenize(f)
+
+    # use the Porter stemmer to add stem words (iterating over a copy)
+    for word in list(word_list):
+        stemmed_word = porter_stemmer.stem(word)
+        if stemmed_word not in word_list:
+            word_list.append(stemmed_word)
+
+    # add document ID if all keywords are in this new word list
+    if all(keyword in word_list for keyword in args.keywords):
+        matched_doc_ids.append(doc_ids.get(doc))
+
+print(matched_doc_ids)