2021-03-01 17:57:17 +11:00
|
|
|
import pandas as pd
|
|
|
|
import nltk
|
|
|
|
import os
|
2021-04-11 23:09:58 +10:00
|
|
|
import argparse
|
2021-04-15 07:47:29 +10:00
|
|
|
from partb2 import read_document, apply_preprocessing
|
2021-03-01 17:57:17 +11:00
|
|
|
|
2021-04-15 07:47:29 +10:00
|
|
|
def read_args():
    '''Parse the command line and return the resulting namespace.

    The program accepts between one and five positional keywords, which are
    exposed as the list ``args.keywords``.

    Returns:
        argparse.Namespace: the parsed arguments.

    Raises:
        SystemExit: if no keyword, or more than five keywords, are given.
            argparse prints the usage message to stderr and exits with
            status 2.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('keywords', nargs='+',
                        help='keywords to search for (1-5 keywords accepted)')
    args = parser.parse_args()

    # nargs='+' only enforces the lower bound, so the upper bound is checked
    # here. parser.error() reports on stderr and exits with a non-zero
    # status, so shells and calling scripts can detect the failure.
    if len(args.keywords) > 5:
        parser.error('Too many keywords.')

    return args
|
2021-04-11 23:09:58 +10:00
|
|
|
|
2021-04-15 07:47:29 +10:00
|
|
|
def load_doc_ids():
    '''Read partb1.csv and build the filename -> documentID lookup.

    Returns a tuple (doc_ids, documents): doc_ids is a pandas Series holding
    the documentID column indexed by the filename column, and documents is
    the list of filenames in the order they appear in the file.'''
    table = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
    ids = table.documentID.tolist()
    filenames = table.filename.tolist()
    doc_ids = pd.Series(ids, index=filenames)
    return doc_ids, doc_ids.index.tolist()
|
2021-04-11 23:09:58 +10:00
|
|
|
|
2021-04-15 07:47:29 +10:00
|
|
|
def find_matching_docs(doc_ids, documents, args):
    '''Return the document IDs of every document containing all keywords.

    Each entry of documents is read with read_document, passed through
    apply_preprocessing and tokenised with nltk. Its ID (looked up in
    doc_ids under the same entry) is collected only when every item of
    args.keywords appears among the resulting tokens.'''
    hits = []
    for filename in documents:
        text = apply_preprocessing(read_document(filename))
        words = nltk.word_tokenize(text)
        # skip this document as soon as one keyword is absent from its tokens
        if any(term not in words for term in args.keywords):
            continue
        hits.append(doc_ids.get(filename))
    return hits
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    args = read_args()
    # partb1.csv is loaded from the starting directory, before the chdir below
    doc_ids, documents = load_doc_ids()
    # NOTE(review): presumably read_document opens each file relative to the
    # current directory, hence the move into cricket/ -- confirm in partb2
    cricket_dir = os.getcwd() + '/cricket'
    os.chdir(cricket_dir)
    print(find_matching_docs(doc_ids, documents, args))
|