# comp20008-project01/partb4.py
import os
import nltk
from nltk.stem.porter import *
from partb2 import read_document, apply_preprocessing
from partb3 import read_args, load_doc_ids
def find_matching_docs(doc_ids, documents, args):
    '''Takes the document list, applies pre-processing techniques to each
    document, tokenises the words, and returns a list containing document IDs
    that match the keywords given as arguments to this program.

    A document matches when every keyword in args.keywords appears among its
    tokens or among the Porter stems of those tokens.
    '''
    matched_doc_ids = []
    porter_stemmer = PorterStemmer()
    for doc in documents:
        text = read_document(doc)
        text = apply_preprocessing(text)
        # tokenise the document
        word_list = nltk.word_tokenize(text)
        # Collect tokens plus their Porter stems in a set.  The original code
        # appended stems to word_list while iterating over it (fragile, and it
        # re-stemmed the appended stems) and used O(n) list membership tests
        # inside the loop; a set gives O(1) lookups and a single clean pass.
        token_set = set(word_list)
        token_set.update(porter_stemmer.stem(word) for word in word_list)
        # add document ID if all keywords are in the token/stem set
        if all(keyword in token_set for keyword in args.keywords):
            matched_doc_ids.append(doc_ids.get(doc))
    return matched_doc_ids
if __name__ == '__main__':
    args = read_args()
    doc_ids, documents = load_doc_ids()
    # The corpus files live in the 'cricket' subdirectory of the working
    # directory; os.path.join keeps the path separator portable instead of
    # hard-coding '/'.
    os.chdir(os.path.join(os.getcwd(), 'cricket'))
    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
    print(matched_doc_ids)