added basic search functionality
This commit is contained in:
parent
cf9299e503
commit
67a1524bd6
1 changed files with 44 additions and 2 deletions
46
partb3.py
46
partb3.py
|
@ -1,7 +1,49 @@
|
|||
## Part B Task 3
|
||||
import re
|
||||
import sys
|
||||
import pandas as pd
|
||||
import nltk
|
||||
import os
|
||||
import argparse
|
||||
|
||||
def apply_preprocessing(f):
    """Apply the partb2 preprocessing steps to the string *f*.

    Strips every character that is not an ASCII letter or whitespace,
    collapses each run of whitespace into a single space, and lowercases
    the result.
    """
    # Drop everything except letters and whitespace.
    cleaned = re.sub(r'[^a-zA-Z\s]', r'', f)
    # Normalise whitespace runs to single spaces, then lowercase.
    return re.sub(r'\s+', r' ', cleaned).lower()
|
||||
|
||||
def doc_to_str(doc):
    """Return the contents of a .txt file as a single string.

    Each line of *doc* is followed by one space, reproducing the original
    line-by-line concatenation exactly (lines keep their trailing
    newlines, and a space is appended after every line, including the
    last).

    Parameters:
        doc: an iterable of lines, e.g. an open file object.

    Returns:
        str: all lines concatenated, each followed by a single space.
    """
    # str.join builds the result in linear time; the previous repeated
    # `f += line + " "` was quadratic in the worst case.
    return "".join(line + " " for line in doc)
|
||||
|
||||
# ---------------------------------------------------------------------------
# Script entry: parse 1-5 keywords, load the filename -> documentID table,
# and print the IDs of every document in ./cricket that contains ALL of
# the keywords.
# ---------------------------------------------------------------------------

# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('keywords', nargs = '+', help = 'keywords to search for (1-5 keywords accepted)')
args = parser.parse_args()
if len(args.keywords) > 5:
    print("Too many keywords.")
    # sys.exit() is the supported way to stop a script; quit() is only
    # guaranteed to exist in interactive sessions.
    sys.exit()

# load document IDs from csv (maps filename -> documentID)
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(), index = df.filename.tolist())
documents = doc_ids.index.tolist()
matched_doc_ids = []

# the document files live in the ./cricket subdirectory;
# os.path.join is portable, unlike '+' string concatenation
os.chdir(os.path.join(os.getcwd(), 'cricket'))

# search through each document for the keywords
for doc in documents:
    # `with` guarantees the file is closed even if reading raises
    with open(doc) as curr:
        f = doc_to_str(curr)
    f = apply_preprocessing(f)

    tokens = nltk.word_tokenize(f)
    # only add the document ID if all the keywords are in the token list
    if all(keyword in tokens for keyword in args.keywords):
        matched_doc_ids.append(doc_ids.get(doc))

print(matched_doc_ids)
|
||||
|
|
Loading…
Reference in a new issue