"""Search documents in the 'cricket' directory for ones containing all given keywords.

Reads filename -> documentID mappings from partb1.csv, preprocesses each
document's text, tokenizes it, and prints the IDs of documents in which
every command-line keyword appears as a token.
"""

import argparse
import os
import re
import sys

import pandas as pd


def apply_preprocessing(f):
    """Applies preprocessing from partb2 to a string f.

    Strips every character that is not a letter or whitespace, collapses
    runs of whitespace to a single space, and lowercases the result.
    """
    f = re.sub(r'[^a-zA-Z\s]', r'', f)
    f = re.sub(r'\s+', r' ', f)
    return f.lower()


def doc_to_str(doc):
    """Returns a string with the contents of a .txt file.

    Each line (newline included, as the file iterator yields it) is
    followed by a single space, matching the original concatenation loop.
    """
    # str.join avoids the quadratic += string-building pattern.
    return "".join(line + " " for line in doc)


def main():
    """Parse CLI keywords and print the IDs of documents containing all of them."""
    # nltk is imported lazily so the helper functions above stay importable
    # in environments where nltk is not installed.
    import nltk

    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('keywords', nargs='+',
                        help='keywords to search for (1-5 keywords accepted)')
    args = parser.parse_args()

    if len(args.keywords) > 5:
        print("Too many keywords.")
        return  # exit cleanly (status 0), matching the original quit()

    # load document IDs from csv; series maps filename -> documentID
    df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
    doc_ids = pd.Series(data=df.documentID.tolist(), index=df.filename.tolist())

    matched_doc_ids = []
    # Build explicit paths instead of os.chdir(): avoids mutating the
    # process-global working directory.
    cricket_dir = os.path.join(os.getcwd(), 'cricket')

    # search through each document for the keywords
    for doc in doc_ids.index.tolist():
        # context manager guarantees the file handle is closed even on error
        with open(os.path.join(cricket_dir, doc)) as curr:
            f = doc_to_str(curr)
        f = apply_preprocessing(f)
        tokens = nltk.word_tokenize(f)
        # only add the document ID if all the keywords are in the token list
        if all(keyword in tokens for keyword in args.keywords):
            matched_doc_ids.append(doc_ids.get(doc))

    print(matched_doc_ids)


if __name__ == "__main__":
    main()