comp20008-project01/partb3.py

import re
import pandas as pd
import nltk
import os
import argparse

def read_document(path):
    '''Reads a file when provided with its path, and returns a string
    containing the lines of the file.

    Every line is kept exactly as read (including its line terminator, if
    it has one) and is followed by one extra space character. An empty file
    therefore yields an empty string.

    path: location of the file to read; it is opened in text mode with the
        platform default encoding.

    Returns the concatenated text as a single str.

    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened.
    '''
    # The context manager closes the file even when reading raises part-way
    # through, which the previous manual open()/close() pair did not.
    with open(path) as file_given:
        # join() assembles the result in one pass instead of growing a
        # string with repeated +=.
        return "".join(line + " " for line in file_given)

def apply_preprocessing(f):
    '''Normalises a piece of text in three steps: every character that is
    neither an ASCII letter nor whitespace is dropped, each run of
    whitespace is squeezed down to a single space, and the result is
    converted to lowercase. Leading or trailing whitespace is collapsed to
    one space but not stripped.'''
    # Lowercasing is deliberately the last step so that the character filter
    # sees the text exactly as it was supplied.
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', r'', f)
    single_spaced = re.sub(r'\s+', r' ', letters_and_spaces)
    return single_spaced.lower()

# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('keywords', nargs='+',
                    help='keywords to search for (1-5 keywords accepted)')
args = parser.parse_args()
if len(args.keywords) > 5:
    print("Too many keywords.")
    # A bare SystemExit ends the program with status 0, exactly as quit()
    # did, without relying on the site module having injected quit() into
    # the builtins.
    raise SystemExit

# Document text is lowercased by apply_preprocessing() before it is
# tokenised, so the keywords are lowercased as well; otherwise a keyword
# typed with any capital letter could never match a token.
keywords = [keyword.lower() for keyword in args.keywords]

# load the filename / document ID columns from csv
df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
matched_doc_ids = []

# the filenames listed in the csv are opened relative to the cricket folder
os.chdir(os.path.join(os.getcwd(), 'cricket'))

# search through each document for the keywords; pairing the two columns
# row by row keeps every csv row's own ID, even if a filename is repeated
doc_pairs = zip(df.filename.tolist(), df.documentID.tolist())
for filename, document_id in doc_pairs:
    text = apply_preprocessing(read_document(filename))

    # a set makes each keyword membership test constant-time
    tokens = set(nltk.word_tokenize(text))
    # only add the document ID if all the keywords are in the token set
    if all(keyword in tokens for keyword in keywords):
        matched_doc_ids.append(document_id)

print(matched_doc_ids)
Initial commit 2021-03-01 17:57:17 +11:00			`import re`
			`import pandas as pd`
			`import nltk`
			`import os`
added basic search functionality 2021-04-11 23:09:58 +10:00			`import argparse`
Initial commit 2021-03-01 17:57:17 +11:00
partb4 completed, refactored code 2021-04-14 23:42:57 +10:00			`def read_document(path):`
			`'''Reads a file when provided with its path, and returns a string`
			`containing the lines of the file.'''`
			`file_given = open(path)`
			`f = ""`
			`for line in file_given:`
			`f += line + " "`
			`file_given.close()`
			`return f`

added basic search functionality 2021-04-11 23:09:58 +10:00			`def apply_preprocessing(f):`
partb4 completed, refactored code 2021-04-14 23:42:57 +10:00			`'''Removes non-alphabetic characters, replaces all whitespace characters`
			`with a single whitespace, and changes all uppercase characters to`
			`lowercase'''`
added basic search functionality 2021-04-11 23:09:58 +10:00			`f = re.sub(r'[^a-zA-Z\s]', r'', f)`
			`f = re.sub(r'\s+', r' ', f)`
			`f = f.lower()`
			`return f`

			`# parse input arguments`
			`parser = argparse.ArgumentParser()`
partb4 completed, refactored code 2021-04-14 23:42:57 +10:00			`parser.add_argument('keywords', nargs = '+', help = 'keywords to search for \`
			`(1-5 keywords accepted)')`
added basic search functionality 2021-04-11 23:09:58 +10:00			`args = parser.parse_args()`
			`if len(args.keywords) > 5:`
			`print("Too many keywords.")`
			`quit()`

			`# load document IDs from csv`
			`df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')`
partb4 completed, refactored code 2021-04-14 23:42:57 +10:00			`doc_ids = pd.Series(data = df.documentID.tolist(), \`
			`index = df.filename.tolist())`
added basic search functionality 2021-04-11 23:09:58 +10:00			`documents = doc_ids.index.tolist()`
			`matched_doc_ids = []`

			`os.chdir(os.getcwd() + '/cricket')`

			`# search through each document for the keywords`
			`for doc in documents:`
partb4 completed, refactored code 2021-04-14 23:42:57 +10:00			`f = read_document(doc)`
added basic search functionality 2021-04-11 23:09:58 +10:00			`f = apply_preprocessing(f)`

			`tokens = nltk.word_tokenize(f)`
			`# only add the document ID if all the keywords are in the token list`
			`if all(keyword in tokens for keyword in args.keywords):`
			`matched_doc_ids.append(doc_ids.get(doc))`

			`print(matched_doc_ids)`