comp20008-project01/partb3.py

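'''Searches the pre-processed cricket documents for the keywords given on the
command line and prints the document IDs of the documents containing all of
them.'''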

import argparse
import os
import sys

import nltk
import pandas as pd

from partb2 import read_document, apply_preprocessing


def read_args():
    '''Creates an argparse ArgumentParser to read the command-line
    arguments.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('keywords', nargs='+',
                        help='keywords to search for (1-5 keywords accepted)')
    args = parser.parse_args()
    if len(args.keywords) > 5:
        sys.exit('Too many keywords.')
    return args
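# read_args example: a hypothetical invocation such as
#   python partb3.py bat ball wicket
# leaves args.keywords == ['bat', 'ball', 'wicket'].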

def load_doc_ids():
    '''Loads the document IDs from partb1.csv and returns a Series mapping
    each filename to its documentID, along with the list of filenames.'''
    df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
    doc_ids = pd.Series(data=df.documentID.tolist(),
                        index=df.filename.tolist())
    documents = doc_ids.index.tolist()
    return doc_ids, documents
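# partb1.csv is assumed to hold 'filename' and 'documentID' columns, so for a
# hypothetical row '001.txt,D0001' the lookup doc_ids['001.txt'] gives 'D0001'.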

def find_matching_docs(doc_ids, documents, args):
    '''Applies pre-processing to each document in the document list, tokenises
    the result, and returns a list of the document IDs of the documents whose
    tokens contain every keyword given as an argument to this program.'''
    matched_doc_ids = []
    for doc in documents:
        f = read_document(doc)
        f = apply_preprocessing(f)
        tokens = nltk.word_tokenize(f)
        # only add the document ID if all the keywords are in the token list
        if all(keyword in tokens for keyword in args.keywords):
            matched_doc_ids.append(doc_ids.get(doc))
    return matched_doc_ids
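# Matching is exact token equality, so the keywords are assumed to already be
# in the form apply_preprocessing produces (e.g. lowercased). A minimal sketch
# of normalising each keyword the same way, assuming apply_preprocessing
# accepts any string and a single-word input survives as one token:
#   normalised = [nltk.word_tokenize(apply_preprocessing(k))[0]
#                 for k in args.keywords]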

if __name__ == '__main__':
    args = read_args()
    doc_ids, documents = load_doc_ids()
    # partb1.csv is read from the working directory, so only change into the
    # cricket document directory afterwards
    os.chdir(os.path.join(os.getcwd(), 'cricket'))
    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
    print(matched_doc_ids)
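
# Example run (assuming a cricket/ directory of documents and partb1.csv in
# the working directory); the IDs shown are hypothetical:
#   $ python partb3.py hello world
#   ['D0042', 'D0108']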