import os
import argparse

import pandas as pd
import nltk

from partb2 import read_document, apply_preprocessing


def read_args():
    '''Creates an argparse ArgumentParser to read the command line arguments.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('keywords', nargs='+',
                        help='keywords to search for (1-5 keywords accepted)')
    args = parser.parse_args()
    # nargs='+' already enforces the lower bound of one keyword;
    # enforce the upper bound of five here
    if len(args.keywords) > 5:
        parser.error('too many keywords (1-5 keywords accepted)')
    return args


def load_doc_ids():
    '''Loads partb1.csv and returns a Series mapping each filename to its
    documentID, along with the list of filenames.'''
    df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
    doc_ids = pd.Series(data=df.documentID.tolist(),
                        index=df.filename.tolist())
    documents = doc_ids.index.tolist()
    return doc_ids, documents


def find_matching_docs(doc_ids, documents, args):
    '''Takes the document list, applies pre-processing techniques to each
    document, tokenises the words, and returns a list containing the
    document IDs that match the keywords given as arguments to this
    program.'''
    matched_doc_ids = []
    for doc in documents:
        f = read_document(doc)
        f = apply_preprocessing(f)
        tokens = nltk.word_tokenize(f)
        # only add the document ID if all the keywords are in the token list
        if all(keyword in tokens for keyword in args.keywords):
            matched_doc_ids.append(doc_ids.get(doc))
    return matched_doc_ids


if __name__ == '__main__':
    args = read_args()
    doc_ids, documents = load_doc_ids()
    # the documents listed in partb1.csv live in the cricket/ subdirectory
    os.chdir('cricket')
    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
    print(matched_doc_ids)
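

# Example invocation (a sketch: the script name partb3.py, the keywords, and
# the IDs in the output below are illustrative, not taken from real data; it
# assumes the working directory contains partb1.csv and the cricket/ folder):
#
#   $ python partb3.py hello world
#   [101, 205]
#
# The printed list holds the documentID of every document whose token list
# contains all of the supplied keywords. Note that nltk.word_tokenize needs
# the 'punkt' tokenizer models (or 'punkt_tab' on newer NLTK releases), which
# can be fetched once with nltk.download('punkt').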