added basic search functionality
This commit is contained in:
parent
cf9299e503
commit
67a1524bd6
1 changed files with 44 additions and 2 deletions
46
partb3.py
46
partb3.py
|
@ -1,7 +1,49 @@
|
|||
## Part B Task 3
|
||||
import re
|
||||
import sys
|
||||
import pandas as pd
|
||||
import nltk
|
||||
import os
|
||||
import argparse
|
||||
|
||||
def apply_preprocessing(f):
    """Apply the partb2 preprocessing steps to the string *f*.

    Strips every character that is not an ASCII letter or whitespace,
    collapses each run of whitespace into a single space, and lowercases
    the result.
    """
    # Drop everything except letters and whitespace.
    cleaned = re.sub(r'[^a-zA-Z\s]', r'', f)
    # Normalise whitespace runs to single spaces, then lowercase.
    return re.sub(r'\s+', r' ', cleaned).lower()
|
||||
|
||||
def doc_to_str(doc):
    """Return the contents of a .txt file as a single string.

    Each line of *doc* is followed by one space, reproducing the original
    line-by-line concatenation exactly (lines keep their trailing
    newlines, and a space is appended after every line, including the
    last).

    Parameters:
        doc: an iterable of lines, e.g. an open file object.

    Returns:
        str: all lines concatenated, each followed by a single space.
    """
    # str.join builds the result in linear time; the previous repeated
    # `f += line + " "` was quadratic in the worst case.
    return "".join(line + " " for line in doc)
|
||||
|
||||
# ---------------------------------------------------------------------------
# Script entry: parse 1-5 keywords, load the filename -> documentID table,
# and print the IDs of every document in ./cricket that contains ALL of
# the keywords.
# ---------------------------------------------------------------------------

# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('keywords', nargs = '+', help = 'keywords to search for (1-5 keywords accepted)')
args = parser.parse_args()
if len(args.keywords) > 5:
    print("Too many keywords.")
    # sys.exit() is the supported way to stop a script; quit() is only
    # guaranteed to exist in interactive sessions.
    sys.exit()

# load document IDs from csv (maps filename -> documentID)
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(), index = df.filename.tolist())
documents = doc_ids.index.tolist()
matched_doc_ids = []

# the document files live in the ./cricket subdirectory;
# os.path.join is portable, unlike '+' string concatenation
os.chdir(os.path.join(os.getcwd(), 'cricket'))

# search through each document for the keywords
for doc in documents:
    # `with` guarantees the file is closed even if reading raises
    with open(doc) as curr:
        f = doc_to_str(curr)
    f = apply_preprocessing(f)

    tokens = nltk.word_tokenize(f)
    # only add the document ID if all the keywords are in the token list
    if all(keyword in tokens for keyword in args.keywords):
        matched_doc_ids.append(doc_ids.get(doc))

print(matched_doc_ids)
|
||||
|
|
Loading…
Reference in a new issue