import os

import nltk
from nltk.stem.porter import PorterStemmer

from partb2 import read_document, apply_preprocessing
from partb3 import read_args, load_doc_ids


def find_matching_docs(doc_ids, documents, args):
    '''Return the IDs of documents that contain every requested keyword.

    Each document is read, pre-processed, and tokenised with NLTK; the
    Porter stem of every token is added to the searchable vocabulary so a
    keyword matches either a surface token or its stem.

    Args:
        doc_ids: mapping from document filename to document ID.
        documents: iterable of document filenames to scan.
        args: parsed program arguments; ``args.keywords`` is the list of
            keywords that must ALL be present for a document to match.

    Returns:
        List of matching document IDs (``doc_ids.get(doc)`` per match, so
        a filename missing from ``doc_ids`` contributes ``None``).
    '''
    matched_doc_ids = []
    porter_stemmer = PorterStemmer()
    for doc in documents:
        text = apply_preprocessing(read_document(doc))
        tokens = nltk.word_tokenize(text)
        # Build the vocabulary as a set: O(1) membership tests instead of
        # O(n) list scans, and stems are computed from the original tokens
        # only. (The original code appended stems to the token list while
        # iterating it, so it also re-stemmed its own freshly added stems.)
        vocabulary = set(tokens)
        vocabulary.update(porter_stemmer.stem(token) for token in tokens)
        # A document matches only when every keyword is present.
        if all(keyword in vocabulary for keyword in args.keywords):
            matched_doc_ids.append(doc_ids.get(doc))
    return matched_doc_ids


if __name__ == '__main__':
    args = read_args()
    doc_ids, documents = load_doc_ids()
    # The document files live in the 'cricket' subdirectory of the current
    # working directory; chdir before find_matching_docs so read_document
    # resolves the bare filenames.
    os.chdir(os.path.join(os.getcwd(), 'cricket'))
    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
    print(matched_doc_ids)