import os

import nltk
from nltk.stem.porter import *

from partb2 import read_document, apply_preprocessing
from partb3 import read_args, load_doc_ids

def find_matching_docs(doc_ids, documents, args):
    '''Takes the document list, applies pre-processing techniques to each
    document, tokenises the words, and returns a list containing document IDs
    that match the keywords given as arguments to this program.

    Args:
        doc_ids: mapping of document filename -> document ID.
        documents: iterable of document filenames to search.
        args: parsed CLI arguments; only args.keywords is read here.

    Returns:
        List of document IDs whose token/stem vocabulary contains every
        keyword in args.keywords.
    '''
    matched_doc_ids = []
    porter_stemmer = PorterStemmer()

    for doc in documents:
        # Read and normalise the raw document text.
        text = apply_preprocessing(read_document(doc))

        # Tokenise the document.
        tokens = nltk.word_tokenize(text)

        # Build the searchable vocabulary: surface tokens plus their Porter
        # stems. A set gives O(1) membership tests — the original appended
        # stems onto the list it was still iterating over, which both
        # re-stemmed the appended items and made every `in` check a linear
        # scan (accidentally quadratic).
        vocabulary = set(tokens)
        vocabulary.update(porter_stemmer.stem(word) for word in tokens)

        # Record the document's ID when every keyword is present.
        # NOTE(review): doc_ids.get(doc) yields None for an unknown filename —
        # presumably load_doc_ids covers every document; confirm against partb3.
        if all(keyword in vocabulary for keyword in args.keywords):
            matched_doc_ids.append(doc_ids.get(doc))

    return matched_doc_ids
|
2021-04-14 23:42:57 +10:00
|
|
|
|
2021-04-15 07:47:29 +10:00
|
|
|
if __name__ == '__main__':
    # Parse the CLI keyword arguments and the document-ID mapping.
    args = read_args()
    doc_ids, documents = load_doc_ids()

    # The corpus files live in the 'cricket' subdirectory of the current
    # working directory; os.path.join is the portable form of the original
    # string concatenation.
    os.chdir(os.path.join(os.getcwd(), 'cricket'))

    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
    print(matched_doc_ids)
|