partb4 completed, refactored code
This commit is contained in:
parent
9ebd727ca1
commit
8cb005de77
3 changed files with 102 additions and 29 deletions
26
partb2.py
26
partb2.py
|
@ -1,22 +1,30 @@
|
||||||
import re
|
import re
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
# parse input arguments
|
def read_document(path):
|
||||||
parser = argparse.ArgumentParser()
|
'''Reads a file when provided with its path, and returns a string
|
||||||
parser.add_argument('path_to_file', help = 'path to the csv file')
|
containing the lines of the file.'''
|
||||||
args = parser.parse_args()
|
file_given = open(path)
|
||||||
|
|
||||||
# open file, add all lines to a single string
|
|
||||||
file_given = open(args.path_to_file)
|
|
||||||
f = ""
|
f = ""
|
||||||
for line in file_given:
|
for line in file_given:
|
||||||
f += line + " "
|
f += line + " "
|
||||||
file_given.close()
|
file_given.close()
|
||||||
|
return f
|
||||||
|
|
||||||
# remove non-alphabetic characters, replace all whitespace characters with a
|
def apply_preprocessing(f):
|
||||||
# single whitespace, and change all uppercase characters to lowercase
|
'''Removes non-alphabetic characters, replaces all whitespace characters
|
||||||
|
with a single whitespace, and changes all uppercase characters to
|
||||||
|
lowercase'''
|
||||||
f = re.sub(r'[^a-zA-Z\s]', r'', f)
|
f = re.sub(r'[^a-zA-Z\s]', r'', f)
|
||||||
f = re.sub(r'\s+', r' ', f)
|
f = re.sub(r'\s+', r' ', f)
|
||||||
f = f.lower()
|
f = f.lower()
|
||||||
|
return f
|
||||||
|
|
||||||
|
# parse input arguments
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument('path_to_file', help = 'path to document')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
f = read_document(args.path_to_file)
|
||||||
|
f = apply_preprocessing(f)
|
||||||
print(f)
|
print(f)
|
||||||
|
|
31
partb3.py
31
partb3.py
|
@ -4,23 +4,29 @@ import nltk
|
||||||
import os
|
import os
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
def read_document(path):
    """Read the text file at *path* and return its contents as one string.

    The file is opened in text mode with the platform default encoding.
    Every line keeps its own line terminator and is followed by a single
    space, so the result is the concatenation of ``line + " "`` over the
    lines of the file.  An empty file yields an empty string.
    """
    # The context manager closes the handle even if reading fails part-way,
    # which a trailing close() call would not.
    with open(path) as file_given:
        return "".join(line + " " for line in file_given)
|
||||||
|
|
||||||
def apply_preprocessing(f):
    """Normalise the string *f* for keyword matching.

    Characters that are neither ASCII letters nor whitespace are removed,
    every run of whitespace is collapsed into a single space, and the
    result is converted to lowercase.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', r'', f)
    single_spaced = re.sub(r'\s+', r' ', letters_only)
    return single_spaced.lower()
|
||||||
|
|
||||||
def doc_to_str(doc):
    """Return the lines of *doc* concatenated into a single string.

    *doc* is any iterable of strings; the caller passes an open text file.
    Each item is followed by a single space, and an empty iterable yields
    an empty string.
    """
    # join() builds the result in one pass instead of repeated string +=
    return "".join(line + " " for line in doc)
|
|
||||||
|
|
||||||
# parse input arguments
|
# parse input arguments
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('keywords', nargs = '+', help = 'keywords to search for (1-5 keywords accepted)')
|
parser.add_argument('keywords', nargs = '+', help = 'keywords to search for \
|
||||||
|
(1-5 keywords accepted)')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if len(args.keywords) > 5:
|
if len(args.keywords) > 5:
|
||||||
print("Too many keywords.")
|
print("Too many keywords.")
|
||||||
|
@ -28,7 +34,8 @@ if len(args.keywords) > 5:
|
||||||
|
|
||||||
# load document IDs from csv
|
# load document IDs from csv
|
||||||
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
|
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
|
||||||
doc_ids = pd.Series(data = df.documentID.tolist(), index = df.filename.tolist())
|
doc_ids = pd.Series(data = df.documentID.tolist(), \
|
||||||
|
index = df.filename.tolist())
|
||||||
documents = doc_ids.index.tolist()
|
documents = doc_ids.index.tolist()
|
||||||
matched_doc_ids = []
|
matched_doc_ids = []
|
||||||
|
|
||||||
|
@ -36,9 +43,7 @@ os.chdir(os.getcwd() + '/cricket')
|
||||||
|
|
||||||
# search through each document for the keywords
|
# search through each document for the keywords
|
||||||
for doc in documents:
|
for doc in documents:
|
||||||
curr = open(doc)
|
f = read_document(doc)
|
||||||
f = doc_to_str(curr)
|
|
||||||
curr.close()
|
|
||||||
f = apply_preprocessing(f)
|
f = apply_preprocessing(f)
|
||||||
|
|
||||||
tokens = nltk.word_tokenize(f)
|
tokens = nltk.word_tokenize(f)
|
||||||
|
|
64
partb4.py
64
partb4.py
|
@ -1,6 +1,66 @@
|
||||||
## Part B Task 4
|
|
||||||
import re
|
import re
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
import nltk
|
import nltk
|
||||||
|
from nltk.stem.porter import *
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
def read_document(path):
    """Read the text file at *path* and return its contents as one string.

    The file is opened in text mode with the platform default encoding.
    Every line keeps its own line terminator and is followed by a single
    space, so the result is the concatenation of ``line + " "`` over the
    lines of the file.  An empty file yields an empty string.
    """
    # The context manager closes the handle even if reading fails part-way,
    # which a trailing close() call would not.
    with open(path) as file_given:
        return "".join(line + " " for line in file_given)
|
||||||
|
|
||||||
|
def apply_preprocessing(f):
    """Normalise the string *f* for keyword matching.

    Characters that are neither ASCII letters nor whitespace are removed,
    every run of whitespace is collapsed into a single space, and the
    result is converted to lowercase.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', r'', f)
    single_spaced = re.sub(r'\s+', r' ', letters_only)
    return single_spaced.lower()
|
||||||
|
|
||||||
|
# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument(
    'keywords',
    nargs='+',
    # A single literal: a backslash continuation inside the string would
    # embed the source indentation of the next line in the help text.
    help='keywords to search for (1-5 keywords accepted)',
)
args = parser.parse_args()
if len(args.keywords) > 5:
    print("Too many keywords.")
    # quit() is a convenience injected by the site module; raising
    # SystemExit ends the script with the same status without relying on it.
    raise SystemExit

# load document IDs from csv; the Series maps each filename to its ID
df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
doc_ids = pd.Series(data=df.documentID.tolist(),
                    index=df.filename.tolist())
documents = doc_ids.index.tolist()
matched_doc_ids = []

# change directory to get cricket data
os.chdir(os.getcwd() + '/cricket')

# search through each document for the keywords
porter_stemmer = PorterStemmer()

for doc in documents:
    f = read_document(doc)
    f = apply_preprocessing(f)

    # tokenise the document; a set gives constant-time membership tests
    words = set(nltk.word_tokenize(f))

    # Add the Porter stem of every word.  Newly added stems are stemmed in
    # turn, so the vocabulary ends up closed under stemming.  An explicit
    # worklist does this without appending to a collection while it is
    # being iterated.
    pending = list(words)
    while pending:
        stemmed_word = porter_stemmer.stem(pending.pop())
        if stemmed_word not in words:
            words.add(stemmed_word)
            pending.append(stemmed_word)

    # add document ID if all keywords are in this extended vocabulary
    if all(keyword in words for keyword in args.keywords):
        matched_doc_ids.append(doc_ids.get(doc))

print(matched_doc_ids)
|
||||||
|
|
Loading…
Reference in a new issue