diff --git a/partb3.py b/partb3.py
index a5ff736..fa187b4 100644
--- a/partb3.py
+++ b/partb3.py
@@ -1,7 +1,49 @@
-## Part B Task 3
 import re
-import sys
 import pandas as pd
 import nltk
 import os
+import argparse
 
+def apply_preprocessing(f):
+    '''Applies the preprocessing from partb2 to a string f.'''
+    # keep only letters and whitespace
+    f = re.sub(r'[^a-zA-Z\s]', '', f)
+    # collapse runs of whitespace into single spaces
+    f = re.sub(r'\s+', ' ', f)
+    f = f.lower()
+    return f
+
+def doc_to_str(doc):
+    '''Returns a string with the contents of an open .txt file.'''
+    f = ""
+    for line in doc:
+        f += line + " "
+    return f
+
+# parse input arguments
+parser = argparse.ArgumentParser()
+parser.add_argument('keywords', nargs='+', help='keywords to search for (1-5 keywords accepted)')
+args = parser.parse_args()
+if len(args.keywords) > 5:
+    parser.error('too many keywords (1-5 keywords accepted)')
+
+# load document IDs from csv
+df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
+doc_ids = pd.Series(data=df.documentID.tolist(), index=df.filename.tolist())
+documents = doc_ids.index.tolist()
+matched_doc_ids = []
+
+os.chdir(os.path.join(os.getcwd(), 'cricket'))
+
+# search through each document for the keywords
+for doc in documents:
+    with open(doc) as curr:
+        f = doc_to_str(curr)
+    f = apply_preprocessing(f)
+
+    tokens = nltk.word_tokenize(f)
+    # only add the document ID if all the keywords are in the token list
+    if all(keyword in tokens for keyword in args.keywords):
+        matched_doc_ids.append(doc_ids.get(doc))
+
+print(matched_doc_ids)
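
Reviewer note, not part of the change: the sketch below isolates the preprocessing-plus-all(...) membership test with made-up sample text and keywords, so it runs without partb1.csv or the cricket/ directory. It assumes the NLTK punkt tokenizer data is installed (nltk.download('punkt')); wrapping the tokens in a set is an optional tweak, not something the patch does, that turns each keyword lookup into a hash check instead of a list scan.

    # Standalone sketch of the matching logic; sample text and keywords are invented.
    import re
    import nltk

    def apply_preprocessing(f):
        '''Same preprocessing as in the patch: letters and spaces only, lowercased.'''
        f = re.sub(r'[^a-zA-Z\s]', '', f)
        f = re.sub(r'\s+', ' ', f)
        return f.lower()

    text = "Smith scored 120 runs, and Jones took 3 wickets."
    # a set makes each 'keyword in tokens' test a constant-time lookup
    tokens = set(nltk.word_tokenize(apply_preprocessing(text)))

    keywords = ['smith', 'wickets']
    # a document matches only when every keyword appears among its tokens
    print(all(keyword in tokens for keyword in keywords))  # True

The script itself would be invoked along the lines of `python partb3.py smith wickets`, printing the IDs of every document that contains all of the given keywords.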