import os

import nltk
from nltk.stem.porter import *

from partb2 import read_document, apply_preprocessing
from partb3 import read_args, load_doc_ids

def find_matching_docs(doc_ids, documents, args):
    '''Takes the document list, applies pre-processing techniques to each
    document, tokenises the words, and returns a list containing document IDs
    that match the keywords given as arguments to this program.

    Args:
        doc_ids: mapping of document filename -> document ID.
        documents: iterable of document filenames to search.
        args: parsed CLI arguments; only args.keywords is read here.

    Returns:
        List of document IDs whose token/stem vocabulary contains every
        keyword in args.keywords.
    '''
    matched_doc_ids = []
    porter_stemmer = PorterStemmer()

    for doc in documents:
        # Read and normalise the raw document text.
        text = apply_preprocessing(read_document(doc))

        # Tokenise the document.
        tokens = nltk.word_tokenize(text)

        # Build the searchable vocabulary: surface tokens plus their Porter
        # stems. A set gives O(1) membership tests — the original appended
        # stems onto the list it was still iterating over, which both
        # re-stemmed the appended items and made every `in` check a linear
        # scan (accidentally quadratic).
        vocabulary = set(tokens)
        vocabulary.update(porter_stemmer.stem(word) for word in tokens)

        # Record the document's ID when every keyword is present.
        # NOTE(review): doc_ids.get(doc) yields None for an unknown filename —
        # presumably load_doc_ids covers every document; confirm against partb3.
        if all(keyword in vocabulary for keyword in args.keywords):
            matched_doc_ids.append(doc_ids.get(doc))

    return matched_doc_ids
|
2021-04-14 23:42:57 +10:00
|
|
|
|
2021-04-15 07:47:29 +10:00
|
|
|
if __name__ == '__main__':
    # Parse the CLI keyword arguments and the document-ID mapping.
    args = read_args()
    doc_ids, documents = load_doc_ids()

    # The corpus files live in the 'cricket' subdirectory of the current
    # working directory; os.path.join is the portable form of the original
    # string concatenation.
    os.chdir(os.path.join(os.getcwd(), 'cricket'))

    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
    print(matched_doc_ids)
|