comp20008-project01/partb3.py

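'''Searches the pre-processed cricket documents for the keywords given on the
command line and prints the document IDs of the documents containing all of
them.'''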

import argparse
import os
import sys

import nltk
import pandas as pd

from partb2 import read_document, apply_preprocessing


def read_args():
    '''Creates an argparse ArgumentParser to read the command-line
    arguments.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('keywords', nargs='+',
                        help='keywords to search for (1-5 keywords accepted)')
    args = parser.parse_args()
    if len(args.keywords) > 5:
        sys.exit('Too many keywords.')
    return args
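# read_args example: a hypothetical invocation such as
#   python partb3.py bat ball wicket
# leaves args.keywords == ['bat', 'ball', 'wicket'].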

def load_doc_ids():
    '''Loads the document IDs from partb1.csv and returns a Series mapping
    each filename to its documentID, along with the list of filenames.'''
    df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
    doc_ids = pd.Series(data=df.documentID.tolist(),
                        index=df.filename.tolist())
    documents = doc_ids.index.tolist()
    return doc_ids, documents
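# partb1.csv is assumed to hold 'filename' and 'documentID' columns, so for a
# hypothetical row '001.txt,D0001' the lookup doc_ids['001.txt'] gives 'D0001'.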

def find_matching_docs(doc_ids, documents, args):
    '''Applies pre-processing to each document in the document list, tokenises
    the result, and returns a list of the document IDs of the documents whose
    tokens contain every keyword given as an argument to this program.'''
    matched_doc_ids = []
    for doc in documents:
        f = read_document(doc)
        f = apply_preprocessing(f)
        tokens = nltk.word_tokenize(f)
        # only add the document ID if all the keywords are in the token list
        if all(keyword in tokens for keyword in args.keywords):
            matched_doc_ids.append(doc_ids.get(doc))
    return matched_doc_ids
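# Matching is exact token equality, so the keywords are assumed to already be
# in the form apply_preprocessing produces (e.g. lowercased). A minimal sketch
# of normalising each keyword the same way, assuming apply_preprocessing
# accepts any string and a single-word input survives as one token:
#   normalised = [nltk.word_tokenize(apply_preprocessing(k))[0]
#                 for k in args.keywords]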

if __name__ == '__main__':
    args = read_args()
    doc_ids, documents = load_doc_ids()
    # partb1.csv is read from the working directory, so only change into the
    # cricket document directory afterwards
    os.chdir(os.path.join(os.getcwd(), 'cricket'))
    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
    print(matched_doc_ids)
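
# Example run (assuming a cricket/ directory of documents and partb1.csv in
# the working directory); the IDs shown are hypothetical:
#   $ python partb3.py hello world
#   ['D0042', 'D0108']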