"""Search the cricket document collection for documents that contain all
of the given keywords (tokens extended with their Porter stems) and
print the matching document IDs."""

import argparse
import os
import re

import nltk
import pandas as pd
from nltk.stem.porter import PorterStemmer

# Compiled once at module level so per-document calls reuse the patterns.
_NON_ALPHA = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE = re.compile(r'\s+')


def read_document(path):
    '''Read the file at *path* and return its lines joined into a single
    string, with a space appended after each line (newlines retained —
    they are collapsed later by apply_preprocessing).'''
    # 'with' guarantees the handle is closed even if reading raises;
    # join builds the string in one pass instead of quadratic +=.
    with open(path) as doc_file:
        return "".join(line + " " for line in doc_file)


def apply_preprocessing(f):
    '''Remove non-alphabetic characters (keeping whitespace), collapse
    runs of whitespace to a single space, and lowercase the text.'''
    f = _NON_ALPHA.sub('', f)
    f = _WHITESPACE.sub(' ', f)
    return f.lower()


def _stem_closure(tokens, stemmer):
    '''Return the set of *tokens* plus every Porter stem reachable from
    them (stems of stems included — this matches the original loop,
    which kept appending stems until no new word appeared).'''
    vocab = set(tokens)
    frontier = vocab
    while frontier:
        # Only stem words not seen yet; terminates once stemming yields
        # nothing new (Porter stemming converges quickly).
        frontier = {stemmer.stem(word) for word in frontier} - vocab
        vocab |= frontier
    return vocab


def main():
    '''Parse the CLI arguments, scan every document listed in
    partb1.csv, and print the IDs of those matching all keywords.'''
    # Parse input arguments: 1-5 keywords to search for.
    parser = argparse.ArgumentParser()
    parser.add_argument('keywords', nargs='+',
                        help='keywords to search for (1-5 keywords accepted)')
    args = parser.parse_args()
    if len(args.keywords) > 5:
        print("Too many keywords.")
        # Exit status 0, matching the original quit() behaviour.
        raise SystemExit

    # Load the filename -> documentID mapping from the csv.
    df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
    doc_ids = pd.Series(data=df.documentID.tolist(),
                        index=df.filename.tolist())

    # Cricket data lives in ./cricket; build explicit paths instead of
    # os.chdir so the working directory is left untouched.
    data_dir = os.path.join(os.getcwd(), 'cricket')

    porter_stemmer = PorterStemmer()
    matched_doc_ids = []
    for doc in doc_ids.index.tolist():
        text = apply_preprocessing(read_document(os.path.join(data_dir, doc)))
        # Tokenise the document, then extend the vocabulary with Porter
        # stems so a keyword matches inflected forms in the text.  A set
        # gives O(1) membership tests instead of scanning a list.
        vocab = _stem_closure(nltk.word_tokenize(text), porter_stemmer)
        # Record the document ID only if every keyword is present.
        if all(keyword in vocab for keyword in args.keywords):
            matched_doc_ids.append(doc_ids.get(doc))

    print(matched_doc_ids)


if __name__ == "__main__":
    main()