"""Search the cricket document collection for documents that contain all
of the given keywords (tokens extended with their Porter stems) and
print the matching document IDs."""

import argparse
import os
import re

import nltk
import pandas as pd
from nltk.stem.porter import PorterStemmer

# Compiled once at module level so per-document calls reuse the patterns.
_NON_ALPHA = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE = re.compile(r'\s+')


def read_document(path):
    '''Read the file at *path* and return its lines joined into a single
    string, with a space appended after each line (newlines retained —
    they are collapsed later by apply_preprocessing).'''
    # 'with' guarantees the handle is closed even if reading raises;
    # join builds the string in one pass instead of quadratic +=.
    with open(path) as doc_file:
        return "".join(line + " " for line in doc_file)


def apply_preprocessing(f):
    '''Remove non-alphabetic characters (keeping whitespace), collapse
    runs of whitespace to a single space, and lowercase the text.'''
    f = _NON_ALPHA.sub('', f)
    f = _WHITESPACE.sub(' ', f)
    return f.lower()


def _stem_closure(tokens, stemmer):
    '''Return the set of *tokens* plus every Porter stem reachable from
    them (stems of stems included — this matches the original loop,
    which kept appending stems until no new word appeared).'''
    vocab = set(tokens)
    frontier = vocab
    while frontier:
        # Only stem words not seen yet; terminates once stemming yields
        # nothing new (Porter stemming converges quickly).
        frontier = {stemmer.stem(word) for word in frontier} - vocab
        vocab |= frontier
    return vocab


def main():
    '''Parse the CLI arguments, scan every document listed in
    partb1.csv, and print the IDs of those matching all keywords.'''
    # Parse input arguments: 1-5 keywords to search for.
    parser = argparse.ArgumentParser()
    parser.add_argument('keywords', nargs='+',
                        help='keywords to search for (1-5 keywords accepted)')
    args = parser.parse_args()
    if len(args.keywords) > 5:
        print("Too many keywords.")
        # Exit status 0, matching the original quit() behaviour.
        raise SystemExit

    # Load the filename -> documentID mapping from the csv.
    df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
    doc_ids = pd.Series(data=df.documentID.tolist(),
                        index=df.filename.tolist())

    # Cricket data lives in ./cricket; build explicit paths instead of
    # os.chdir so the working directory is left untouched.
    data_dir = os.path.join(os.getcwd(), 'cricket')

    porter_stemmer = PorterStemmer()
    matched_doc_ids = []
    for doc in doc_ids.index.tolist():
        text = apply_preprocessing(read_document(os.path.join(data_dir, doc)))
        # Tokenise the document, then extend the vocabulary with Porter
        # stems so a keyword matches inflected forms in the text.  A set
        # gives O(1) membership tests instead of scanning a list.
        vocab = _stem_closure(nltk.word_tokenize(text), porter_stemmer)
        # Record the document ID only if every keyword is present.
        if all(keyword in vocab for keyword in args.keywords):
            matched_doc_ids.append(doc_ids.get(doc))

    print(matched_doc_ids)


if __name__ == "__main__":
    main()