comp20008-project01/partb5.py
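'''Part B task 5: keyword search over the cricket document collection.

Loads the document IDs produced in part B1 (partb1.csv), preprocesses and
tokenises each cricket document using the helpers from partb2, builds a
corpus of stemmed terms, computes tf-idf vectors for every document, and
prints the documents ranked by cosine similarity to the keyword query read
by partb3.read_args. Requires the NLTK 'punkt' and 'stopwords' data to be
installed.
'''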

import os
import pandas as pd
import nltk
from numpy.linalg import norm
from numpy import dot
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from partb2 import read_document, apply_preprocessing
from partb3 import read_args

def cosine_similarity(x1, x2):
    '''Calculates the cosine similarity between two vectors: the dot product
    of the vectors divided by the product of their norms.'''
    return dot(x1, x2) / (norm(x1) * norm(x2))
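# sanity check: cosine_similarity([1, 0], [1, 1]) is 1/sqrt(2) (about 0.7071),
# while orthogonal vectors such as [1, 0] and [0, 1] score 0.0
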
args = read_args()
# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(),
                    index = df.filename.tolist())
documents = doc_ids.index.tolist()
# change directory to get cricket data
os.chdir(os.getcwd() + '/cricket')
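# (assumes the script is run from the directory that contains the cricket
# folder of documents)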
# build the corpus
corpus = []
porter_stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
for doc in documents:
    f = read_document(doc)
    f = apply_preprocessing(f)
    # tokenise the document and remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]
    # add each unique stemmed word to the corpus, so that the corpus terms
    # match the stemmed keys used when counting term frequencies below
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)
        if stemmed_word not in corpus:
            corpus.append(stemmed_word)
# build the term counts
term_counts = []
for doc in documents:
    curr_term_count = []
    f = read_document(doc)
    f = apply_preprocessing(f)
    # tokenise the document and remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]
    # build frequency dictionary of stemmed words
    wordDict = {}
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)
        if stemmed_word in wordDict:
            wordDict[stemmed_word] += 1
        else:
            wordDict[stemmed_word] = 1
    # fill in the current document's count for each corpus term,
    # then add it to the overall list
    for word in corpus:
        if word in wordDict:
            curr_term_count.append(wordDict[word])
        else:
            curr_term_count.append(0)
    term_counts.append(curr_term_count)
# calculate the tf-idf scores
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(term_counts)
doc_tfidf = tfidf.toarray()
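# TfidfTransformer applies L2 normalisation by default, so each row of
# doc_tfidf is already a unit vector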
# construct the query unit vector over the corpus terms; the keywords are
# stemmed so that they can match the stemmed corpus (this assumes at least
# one keyword appears in the corpus, and that no keyword is repeated)
stemmed_keywords = [porter_stemmer.stem(k) for k in args.keywords]
query_vector = []
for word in corpus:
    if word in stemmed_keywords:
        query_vector.append(1)
    else:
        query_vector.append(0)
query_unit_vector = [x / norm(query_vector) for x in query_vector]
similarities = [cosine_similarity(query_unit_vector, doc_tfidf[d_id])
                for d_id in range(doc_tfidf.shape[0])]
# this holds the similarities with their respective document IDs
sim_doc_ids = df
sim_doc_ids.insert(1, 'similarity_scores', similarities)
sim_doc_ids.sort_values(by = 'similarity_scores', ascending = False,
                        inplace = True)
sorted_doc_ids = sim_doc_ids.documentID.tolist()
sorted_similarities = sim_doc_ids.similarity_scores.tolist()
# print documentID and scores, sorted by scores
print('documentID | score')
for i in range(len(sorted_doc_ids)):
    if sorted_similarities[i] > 0:
        print(f"{sorted_doc_ids[i]:10} | {sorted_similarities[i]:.4f}")