import os

import pandas as pd
import nltk
from numpy import dot
from numpy.linalg import norm
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer

from partb2 import read_document, apply_preprocessing
from partb3 import read_args


def cosine_similarity(x1, x2):
    '''Calculate the cosine similarity between two vectors, i.e. the
    cosine of the angle between them, computed via the dot product.'''
    return dot(x1, x2) / (norm(x1) * norm(x2))
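
# sanity check (illustrative values only): identical vectors score 1.0
# and orthogonal vectors score 0.0, e.g.
#   cosine_similarity([1, 0], [1, 0]) -> 1.0
#   cosine_similarity([1, 0], [0, 1]) -> 0.0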


args = read_args()

# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
doc_ids = pd.Series(data=df.documentID.tolist(),
                    index=df.filename.tolist())
documents = doc_ids.index.tolist()

# change directory to get cricket data
os.chdir(os.path.join(os.getcwd(), 'cricket'))

# build the corpus (the vocabulary of stemmed words)
corpus = []
porter_stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

for doc in documents:
    f = read_document(doc)
    f = apply_preprocessing(f)

    # tokenise the document, remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]

    # add the stem of each word to the corpus
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)

        if stemmed_word not in corpus:
            corpus.append(stemmed_word)
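
# the position of each stem in corpus fixes the component order of every
# document vector (and of the query vector) built below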

# build the term counts
term_counts = []
for doc in documents:
    curr_term_count = []
    f = read_document(doc)
    f = apply_preprocessing(f)

    # tokenise the document, remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]

    # build frequency dictionary of stemmed words
    word_dict = {}
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)
        word_dict[stemmed_word] = word_dict.get(stemmed_word, 0) + 1

    # fill in the current document's count for each corpus term, then add
    # the row to the overall list
    for word in corpus:
        curr_term_count.append(word_dict.get(word, 0))

    term_counts.append(curr_term_count)
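
# term_counts is now an n_documents x n_terms matrix of raw counts, the
# input format that TfidfTransformer expects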

# calculate the tf-idf scores
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(term_counts)
doc_tfidf = tfidf.toarray()
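
# with its defaults, TfidfTransformer uses smoothed idf weights,
# idf(t) = ln((1 + n) / (1 + df(t))) + 1, and L2-normalises each row,
# so every row of doc_tfidf is already a unit vector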

# construct the query unit vector; the keywords are stemmed so that they
# can be matched against the stemmed corpus
stemmed_keywords = [porter_stemmer.stem(k) for k in args.keywords]
query_vector = []
for word in corpus:
    if word in stemmed_keywords:
        # this assumes that keywords are unique
        # and aren't entered more than once
        query_vector.append(1)
    else:
        query_vector.append(0)
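
# defensive check (an added safeguard, assuming an early exit is
# acceptable here): if no keyword occurs in the corpus, query_vector is
# all zeros and the normalisation below would divide by zero, yielding
# NaN scores
if norm(query_vector) == 0:
    raise SystemExit('none of the keywords appear in the corpus')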

query_norm = norm(query_vector)
query_unit_vector = [x / query_norm for x in query_vector]

# score every document against the query
similarities = [cosine_similarity(query_unit_vector, doc_tfidf[d_id])
                for d_id in range(doc_tfidf.shape[0])]
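
# note: since each row of doc_tfidf is already L2-normalised and the query
# is a unit vector, the cosine here reduces to a plain dot product; the
# explicit form is kept for clarity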

# attach the similarities to their respective document IDs
sim_doc_ids = df.copy()
sim_doc_ids.insert(1, 'similarity_scores', similarities)
sim_doc_ids.sort_values(by='similarity_scores', ascending=False,
                        inplace=True)
sorted_doc_ids = sim_doc_ids.documentID.tolist()
sorted_similarities = sim_doc_ids.similarity_scores.tolist()

# print documentID and scores, sorted by scores
print('documentID | score')
for doc_id, score in zip(sorted_doc_ids, sorted_similarities):
    if score > 0:
        print(f"{doc_id:10} | {score:.4f}")
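
# example invocation (hypothetical file name and arguments; the actual
# command-line interface is defined by read_args in partb3):
#   $ python partb4.py <keyword> [<keyword> ...]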