import os

import nltk
from nltk.stem.porter import PorterStemmer

from partb2 import read_document, apply_preprocessing
from partb3 import read_args, load_doc_ids


def find_matching_docs(doc_ids, documents, args):
    '''Return the IDs of documents that contain every requested keyword.

    Each document is read, pre-processed, and tokenised with NLTK; the
    Porter stem of every token is added to the searchable vocabulary so a
    keyword matches either a surface token or its stem.

    Args:
        doc_ids: mapping from document filename to document ID.
        documents: iterable of document filenames to scan.
        args: parsed program arguments; ``args.keywords`` is the list of
            keywords that must ALL be present for a document to match.

    Returns:
        List of matching document IDs (``doc_ids.get(doc)`` per match, so
        a filename missing from ``doc_ids`` contributes ``None``).
    '''
    matched_doc_ids = []
    porter_stemmer = PorterStemmer()
    for doc in documents:
        text = apply_preprocessing(read_document(doc))
        tokens = nltk.word_tokenize(text)
        # Build the vocabulary as a set: O(1) membership tests instead of
        # O(n) list scans, and stems are computed from the original tokens
        # only. (The original code appended stems to the token list while
        # iterating it, so it also re-stemmed its own freshly added stems.)
        vocabulary = set(tokens)
        vocabulary.update(porter_stemmer.stem(token) for token in tokens)
        # A document matches only when every keyword is present.
        if all(keyword in vocabulary for keyword in args.keywords):
            matched_doc_ids.append(doc_ids.get(doc))
    return matched_doc_ids


if __name__ == '__main__':
    args = read_args()
    doc_ids, documents = load_doc_ids()
    # The document files live in the 'cricket' subdirectory of the current
    # working directory; chdir before find_matching_docs so read_document
    # resolves the bare filenames.
    os.chdir(os.path.join(os.getcwd(), 'cricket'))
    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
    print(matched_doc_ids)