partb4 completed, refactored code

Rory Healy 2021-04-14 23:42:57 +10:00
parent 9ebd727ca1
commit 8cb005de77
3 changed files with 102 additions and 29 deletions

View file

@@ -1,22 +1,30 @@
 import re
 import argparse
 
+def read_document(path):
+    '''Reads a file when provided with its path, and returns a string
+    containing the lines of the file.'''
+    file_given = open(path)
+    f = ""
+    for line in file_given:
+        f += line + " "
+    file_given.close()
+    return f
+
+def apply_preprocessing(f):
+    '''Removes non-alphabetic characters, replaces all whitespace characters
+    with a single whitespace, and changes all uppercase characters to
+    lowercase'''
+    f = re.sub(r'[^a-zA-Z\s]', r'', f)
+    f = re.sub(r'\s+', r' ', f)
+    f = f.lower()
+    return f
+
 # parse input arguments
 parser = argparse.ArgumentParser()
-parser.add_argument('path_to_file', help = 'path to the csv file')
+parser.add_argument('path_to_file', help = 'path to document')
 args = parser.parse_args()
 
-# open file, add all lines to a single string
-file_given = open(args.path_to_file)
-f = ""
-for line in file_given:
-    f += line + " "
-file_given.close()
-
-# remove non-alphabetic characters, replace all whitespace characters with a
-# single whitespace, and change all uppercase characters to lowercase
-f = re.sub(r'[^a-zA-Z\s]', r'', f)
-f = re.sub(r'\s+', r' ', f)
-f = f.lower()
-
+f = read_document(args.path_to_file)
+f = apply_preprocessing(f)
 print(f)
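A quick sketch (not part of the commit) of what apply_preprocessing does to a made-up sample line, with the helper restated inline so it runs standalone:

import re

def apply_preprocessing(f):
    '''Same logic as the committed helper above.'''
    f = re.sub(r'[^a-zA-Z\s]', r'', f)
    f = re.sub(r'\s+', r' ', f)
    f = f.lower()
    return f

# digits vanish entirely rather than becoming spaces, so '2nd' becomes 'nd'
print(apply_preprocessing("Smith scored 103*\tin the 2nd innings!"))
# smith scored in the nd innings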

View file

@@ -4,23 +4,29 @@ import nltk
 import os
 import argparse
 
+def read_document(path):
+    '''Reads a file when provided with its path, and returns a string
+    containing the lines of the file.'''
+    file_given = open(path)
+    f = ""
+    for line in file_given:
+        f += line + " "
+    file_given.close()
+    return f
+
 def apply_preprocessing(f):
-    '''Applies preprocessing from partb2 to a string f.'''
+    '''Removes non-alphabetic characters, replaces all whitespace characters
+    with a single whitespace, and changes all uppercase characters to
+    lowercase'''
     f = re.sub(r'[^a-zA-Z\s]', r'', f)
     f = re.sub(r'\s+', r' ', f)
     f = f.lower()
     return f
 
-def doc_to_str(doc):
-    '''Returns a string with the contents of a .txt file'''
-    f = ""
-    for line in doc:
-        f += line + " "
-    return f
-
 # parse input arguments
 parser = argparse.ArgumentParser()
-parser.add_argument('keywords', nargs = '+', help = 'keywords to search for (1-5 keywords accepted)')
+parser.add_argument('keywords', nargs = '+', help = 'keywords to search for \
+(1-5 keywords accepted)')
 args = parser.parse_args()
 if len(args.keywords) > 5:
     print("Too many keywords.")
@@ -28,7 +34,8 @@ if len(args.keywords) > 5:
 
 # load document IDs from csv
 df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
-doc_ids = pd.Series(data = df.documentID.tolist(), index = df.filename.tolist())
+doc_ids = pd.Series(data = df.documentID.tolist(), \
+    index = df.filename.tolist())
 documents = doc_ids.index.tolist()
 
 matched_doc_ids = []
@@ -36,9 +43,7 @@ os.chdir(os.getcwd() + '/cricket')
 
 # search through each document for the keywords
 for doc in documents:
-    curr = open(doc)
-    f = doc_to_str(curr)
-    curr.close()
+    f = read_document(doc)
     f = apply_preprocessing(f)
     tokens = nltk.word_tokenize(f)
 
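For context, doc_ids above is a pandas Series used as a filename-to-documentID lookup: the index holds the filenames to scan and the values hold the IDs reported on a match. A minimal sketch with fabricated rows standing in for partb1.csv (the real column values are not shown in this diff):

import pandas as pd

# two made-up rows in the same shape as partb1.csv: documentID, filename
df = pd.DataFrame({'documentID': [10, 11], 'filename': ['001.txt', '002.txt']})
doc_ids = pd.Series(data = df.documentID.tolist(), \
    index = df.filename.tolist())

print(doc_ids.index.tolist())  # ['001.txt', '002.txt'] -- files to search
print(doc_ids.get('002.txt'))  # 11 -- the ID appended to matched_doc_ids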

View file

@@ -1,6 +1,66 @@
-## Part B Task 4
 import re
 import pandas as pd
 import os
-import sys
 import nltk
+from nltk.stem.porter import *
+import argparse
+
+def read_document(path):
+    '''Reads a file when provided with its path, and returns a string
+    containing the lines of the file.'''
+    file_given = open(path)
+    f = ""
+    for line in file_given:
+        f += line + " "
+    file_given.close()
+    return f
+
+def apply_preprocessing(f):
+    '''Removes non-alphabetic characters, replaces all whitespace characters
+    with a single whitespace, and changes all uppercase characters to
+    lowercase'''
+    f = re.sub(r'[^a-zA-Z\s]', r'', f)
+    f = re.sub(r'\s+', r' ', f)
+    f = f.lower()
+    return f
+
+# parse input arguments
+parser = argparse.ArgumentParser()
+parser.add_argument('keywords', nargs = '+', help = 'keywords to search \
+for (1-5 keywords accepted)')
+args = parser.parse_args()
+if len(args.keywords) > 5:
+    print("Too many keywords.")
+    quit()
+
+# load document IDs from csv
+df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
+doc_ids = pd.Series(data = df.documentID.tolist(), \
+    index = df.filename.tolist())
+documents = doc_ids.index.tolist()
+
+matched_doc_ids = []
+
+# change directory to get cricket data
+os.chdir(os.getcwd() + '/cricket')
+
+# search through each document for the keywords
+porter_stemmer = PorterStemmer()
+for doc in documents:
+    f = read_document(doc)
+    f = apply_preprocessing(f)
+
+    # tokenise the document, remove stop words
+    word_list = nltk.word_tokenize(f)
+
+    # use the Porter stemmer to add stem words to the word list
+    for word in word_list:
+        stemmed_word = porter_stemmer.stem(word)
+        if stemmed_word not in word_list:
+            word_list.append(stemmed_word)
+
+    # add document ID if all keywords are in this new word list
+    if all(keyword in word_list for keyword in args.keywords):
+        matched_doc_ids.append(doc_ids.get(doc))
+
+print(matched_doc_ids)
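The loop above appends stems of document tokens to word_list, so a keyword can match either a surface form or its stem; the keywords themselves are never stemmed. A small illustration of the Porter stemmer's behaviour (assuming nltk is installed; the sample words are made up):

from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
for word in ['running', 'batting', 'wickets']:
    print(word, '->', porter_stemmer.stem(word))
# running -> run
# batting -> bat
# wickets -> wicket

One consequence of this design: the keyword 'run' matches a document that only contains 'running', but the keyword 'running' will not match a document that only contains 'run'.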