comp20008-project01/partb3.py

import pandas as pd
import nltk
import os
import argparse
from partb2 import read_document, apply_preprocessing

def read_args():
    '''Parses the command line arguments for the keyword search.

    The program takes between one and five positional keywords.

    Returns:
        The argparse Namespace; its `keywords` attribute is the list of
        keyword strings given on the command line.

    Raises:
        SystemExit: if no keyword, or more than five keywords, are supplied
            (exit status 2, with the message written to stderr).
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('keywords', nargs='+',
        help='keywords to search for (1-5 keywords accepted)')
    args = parser.parse_args()
    if len(args.keywords) > 5:
        # parser.error() reports the problem on stderr and exits with
        # status 2, the same way argparse reports its own validation
        # failures. quit() is an interactive helper from the site module
        # and would end the run with a success status.
        parser.error('Too many keywords.')

    return args

def load_doc_ids():
    '''Reads partb1.csv from the current working directory and builds the
    filename -> documentID lookup.

    Returns a tuple (doc_ids, documents): doc_ids is a pandas Series of the
    documentID column indexed by the filename column, and documents is the
    list of filenames taken from that index.'''
    frame = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
    filenames = frame['filename'].tolist()
    ids = frame['documentID'].tolist()
    doc_ids = pd.Series(ids, index=filenames)
    return doc_ids, doc_ids.index.tolist()

def find_matching_docs(doc_ids, documents, args):
    '''Returns the document IDs of every document containing all keywords.

    Each entry of `documents` is passed to read_document(), the result is
    run through apply_preprocessing(), and the processed text is tokenised
    with nltk.word_tokenize(). A document matches when every entry of
    args.keywords appears among its tokens.

    Args:
        doc_ids: lookup from document name to document ID; only its
            .get(name) method is used.
        documents: iterable of document names to search.
        args: object with a `keywords` attribute holding the search terms.

    Returns:
        A list of doc_ids.get(name) for each matching document, in the
        order the documents were given.
    '''
    # The keywords are the same for every document, so build the set once.
    # NOTE(review): keywords are compared verbatim against the processed
    # tokens; if apply_preprocessing() normalises the text (e.g. case), the
    # keywords may need the same treatment -- confirm against partb2.
    keywords = set(args.keywords)

    matched_doc_ids = []
    for doc in documents:
        text = apply_preprocessing(read_document(doc))

        # A set makes each keyword lookup constant-time instead of a scan
        # over the whole token list.
        tokens = set(nltk.word_tokenize(text))
        # only add the document ID if all the keywords are in the token set
        if keywords <= tokens:
            matched_doc_ids.append(doc_ids.get(doc))

    return matched_doc_ids

if __name__ == '__main__':
    args = read_args()
    # partb1.csv is opened relative to the starting directory, so the
    # document index has to be loaded before the working directory changes.
    doc_ids, documents = load_doc_ids()
    # Move into the cricket/ folder; presumably so the documents can be
    # opened by their bare filenames -- confirm against partb2.
    # os.path.join is used instead of concatenating a hard-coded '/'.
    os.chdir(os.path.join(os.getcwd(), 'cricket'))
    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
    print(matched_doc_ids)
Initial commit 2021-03-01 17:57:17 +11:00			`import pandas as pd`
			`import nltk`
			`import os`
added basic search functionality 2021-04-11 23:09:58 +10:00			`import argparse`
partb5 complete, refactored code for cleaner design 2021-04-15 07:47:29 +10:00			`from partb2 import read_document, apply_preprocessing`
Initial commit 2021-03-01 17:57:17 +11:00
partb5 complete, refactored code for cleaner design 2021-04-15 07:47:29 +10:00			`def read_args():`
			`'''Creates an argparse ArgumentParser to read the command line`
			`arguments.'''`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('keywords', nargs = '+', \`
			`help = 'keywords to search for (1-5 keywords accepted)')`
			`args = parser.parse_args()`
			`if len(args.keywords) > 5:`
			`print("Too many keywords.")`
			`quit()`

			`return args`
added basic search functionality 2021-04-11 23:09:58 +10:00
partb5 complete, refactored code for cleaner design 2021-04-15 07:47:29 +10:00			`def load_doc_ids():`
			`'''Loads in the documentIDs from partb1.csv, and returns the lists of`
			`documentIDs and filenames.'''`
			`df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')`
			`doc_ids = pd.Series(data = df.documentID.tolist(), \`
			`index = df.filename.tolist())`
			`documents = doc_ids.index.tolist()`
			`return doc_ids, documents`
added basic search functionality 2021-04-11 23:09:58 +10:00
partb5 complete, refactored code for cleaner design 2021-04-15 07:47:29 +10:00			`def find_matching_docs(doc_ids, documents, args):`
			`'''Takes the document list, applies pre-processing techniques to each`
			`document, tokenises the words, and returns a list containing document IDs`
			`that match the keywords given as arguments to this program.'''`
			`matched_doc_ids = []`
			`for doc in documents:`
			`f = read_document(doc)`
			`f = apply_preprocessing(f)`
added basic search functionality 2021-04-11 23:09:58 +10:00
partb5 complete, refactored code for cleaner design 2021-04-15 07:47:29 +10:00			`tokens = nltk.word_tokenize(f)`
			`# only add the document ID if all the keywords are in the token list`
			`if all(keyword in tokens for keyword in args.keywords):`
			`matched_doc_ids.append(doc_ids.get(doc))`
added basic search functionality 2021-04-11 23:09:58 +10:00
partb5 complete, refactored code for cleaner design 2021-04-15 07:47:29 +10:00			`return matched_doc_ids`

			`if __name__ == '__main__':`
			`args = read_args()`
			`doc_ids, documents = load_doc_ids()`
			`os.chdir(os.getcwd() + '/cricket')`
			`matched_doc_ids = find_matching_docs(doc_ids, documents, args)`
			`print(matched_doc_ids)`