added basic search functionality

This commit is contained in:
Rory Healy 2021-04-11 23:09:58 +10:00
parent cf9299e503
commit 67a1524bd6

View file

@ -1,7 +1,49 @@
## Part B Task 3
import re import re
import sys
import pandas as pd import pandas as pd
import nltk import nltk
import os import os
import argparse
def apply_preprocessing(f):
    """Normalise a string the same way partb2 does.

    Every character that is neither an ASCII letter nor whitespace is
    removed, each run of whitespace is collapsed to a single space, and
    the result is lower-cased. Leading/trailing spaces are not trimmed.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', r'', f)
    single_spaced = re.sub(r'\s+', r' ', letters_only)
    return single_spaced.lower()
def doc_to_str(doc):
    """Return the contents of a text document as one string.

    Args:
        doc: An iterable of lines, such as an open .txt file.

    Returns:
        Each line from ``doc`` followed by a single space, concatenated in
        order. Line endings are left exactly as they appear in ``doc``.
        An empty iterable yields the empty string.
    """
    # Build the result with one join rather than growing a string with +=
    # on every line, which can be quadratic in the document length.
    return "".join(line + " " for line in doc)
# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('keywords', nargs = '+', help = 'keywords to search for (1-5 keywords accepted)')
args = parser.parse_args()
if len(args.keywords) > 5:
    print("Too many keywords.")
    # quit() is a helper installed by the site module for interactive
    # sessions; sys.exit() is the supported way to stop a program and
    # leaves the exit status unchanged.
    sys.exit()

# The documents are lower-cased and stripped of non-letters before being
# tokenized, so the keywords must be normalised the same way; otherwise a
# keyword such as "Cricket" could never equal any token.
keywords = [apply_preprocessing(keyword).strip() for keyword in args.keywords]

# load document IDs from csv (filename -> documentID lookup)
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(), index = df.filename.tolist())
documents = doc_ids.index.tolist()
matched_doc_ids = []

# filenames in the csv are opened relative to the 'cricket' directory
os.chdir(os.path.join(os.getcwd(), 'cricket'))

# search through each document for the keywords
for doc in documents:
    # the context manager closes the file even if reading it raises
    with open(doc) as curr:
        f = doc_to_str(curr)
    f = apply_preprocessing(f)

    # a set gives constant-time membership tests for every keyword
    tokens = set(nltk.word_tokenize(f))

    # only add the document ID if all the keywords are in the token set
    if all(keyword in tokens for keyword in keywords):
        matched_doc_ids.append(doc_ids.get(doc))

print(matched_doc_ids)