2021-03-01 17:57:17 +11:00
|
|
|
import re
|
|
|
|
import pandas as pd
|
|
|
|
import nltk
|
|
|
|
import os
|
2021-04-11 23:09:58 +10:00
|
|
|
import argparse
|
2021-03-01 17:57:17 +11:00
|
|
|
|
2021-04-14 23:42:57 +10:00
|
|
|
def read_document(path):
    '''Reads a file when provided with its path, and returns a string
    containing the lines of the file.

    Each line keeps its own line terminator and is followed by one extra
    space, so consecutive lines are separated by "newline + space" and the
    returned string ends with a space whenever the file is non-empty. An
    empty file yields an empty string.

    The file is opened in text mode with the platform's default encoding.

    Parameters:
        path: location of the file to read, as accepted by open().

    Returns:
        The file's lines concatenated into a single string.

    Raises:
        OSError: if the file cannot be opened.
    '''
    # The context manager closes the handle even if reading fails part-way
    # through (for example on a decoding error).
    with open(path) as file_given:
        # join() assembles the result in one pass instead of growing a
        # string with repeated "+=".
        return "".join(line + " " for line in file_given)
|
|
|
|
|
2021-04-11 23:09:58 +10:00
|
|
|
def apply_preprocessing(f):
    '''Normalises raw text ready for tokenising.

    Three steps are applied, in this order:
      1. every character that is neither an ASCII letter nor whitespace is
         deleted;
      2. each run of whitespace is collapsed to one space character;
      3. the result is converted to lowercase.

    Leading and trailing whitespace is collapsed to a single space, not
    stripped.

    Parameters:
        f: the text to normalise.

    Returns:
        The normalised text.
    '''
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', r'', f)
    single_spaced = re.sub(r'\s+', r' ', letters_and_spaces)
    return single_spaced.lower()
|
|
|
|
|
|
|
|
# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('keywords', nargs='+',
                    help='keywords to search for (1-5 keywords accepted)')
args = parser.parse_args()
if len(args.keywords) > 5:
    print("Too many keywords.")
    # quit() is a convenience helper injected by the site module and is not
    # meant for use in programs; raising SystemExit directly ends the script
    # the same way, with the same exit status.
    raise SystemExit

# Document text is lowercased by apply_preprocessing before it is tokenised,
# so a keyword containing an uppercase letter could never match. Lowercase
# the keywords so the comparison is case-insensitive.
keywords = [keyword.lower() for keyword in args.keywords]

# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
# Series mapping each filename (index) to its document ID (value).
doc_ids = pd.Series(data=df.documentID.tolist(),
                    index=df.filename.tolist())
documents = doc_ids.index.tolist()
matched_doc_ids = []

# read_document() is called with the bare filenames from the csv, so switch
# into the directory that holds the documents first.
os.chdir(os.getcwd() + '/cricket')

# search through each document for the keywords
for doc in documents:
    f = read_document(doc)
    f = apply_preprocessing(f)

    # A set gives constant-time membership tests; only the presence of a
    # token matters here, not its position or how often it occurs.
    tokens = set(nltk.word_tokenize(f))
    # only add the document ID if all the keywords are in the token set
    if all(keyword in tokens for keyword in keywords):
        matched_doc_ids.append(doc_ids.get(doc))

print(matched_doc_ids)
|