comp20008-project01/partb5.py
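'''Part B task 5: keyword search over the cricket document collection.

Loads the document IDs produced in part B1 (partb1.csv), preprocesses and
tokenises each cricket document using the helpers from partb2, builds a
corpus of stemmed terms, computes tf-idf vectors for every document, and
prints the documents ranked by cosine similarity to the keyword query read
by partb3.read_args. Requires the NLTK 'punkt' and 'stopwords' data to be
installed.
'''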

import os
import pandas as pd
import nltk
from numpy.linalg import norm
from numpy import dot
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from partb2 import read_document, apply_preprocessing
from partb3 import read_args

def cosine_similarity(x1, x2):
    '''Calculates the cosine similarity between two vectors: the dot product
    of the vectors divided by the product of their norms.'''
    return dot(x1, x2) / (norm(x1) * norm(x2))
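# sanity check: cosine_similarity([1, 0], [1, 1]) is 1/sqrt(2) (about 0.7071),
# while orthogonal vectors such as [1, 0] and [0, 1] score 0.0
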
args = read_args()
# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
doc_ids = pd.Series(data = df.documentID.tolist(),
                    index = df.filename.tolist())
documents = doc_ids.index.tolist()
# change directory to get cricket data
os.chdir(os.getcwd() + '/cricket')
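# (assumes the script is run from the directory that contains the cricket
# folder of documents)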
# build the corpus
corpus = []
porter_stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
for doc in documents:
    f = read_document(doc)
    f = apply_preprocessing(f)
    # tokenise the document and remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]
    # add each unique stemmed word to the corpus, so that the corpus terms
    # match the stemmed keys used when counting term frequencies below
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)
        if stemmed_word not in corpus:
            corpus.append(stemmed_word)
# build the term counts
term_counts = []
for doc in documents:
    curr_term_count = []
    f = read_document(doc)
    f = apply_preprocessing(f)
    # tokenise the document and remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]
    # build frequency dictionary of stemmed words
    wordDict = {}
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)
        if stemmed_word in wordDict:
            wordDict[stemmed_word] += 1
        else:
            wordDict[stemmed_word] = 1
    # fill in the current document's count for each corpus term,
    # then add it to the overall list
    for word in corpus:
        if word in wordDict:
            curr_term_count.append(wordDict[word])
        else:
            curr_term_count.append(0)
    term_counts.append(curr_term_count)
# calculate the tf-idf scores
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(term_counts)
doc_tfidf = tfidf.toarray()
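# TfidfTransformer applies L2 normalisation by default, so each row of
# doc_tfidf is already a unit vector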
# construct the query unit vector over the corpus terms; the keywords are
# stemmed so that they can match the stemmed corpus (this assumes at least
# one keyword appears in the corpus, and that no keyword is repeated)
stemmed_keywords = [porter_stemmer.stem(k) for k in args.keywords]
query_vector = []
for word in corpus:
    if word in stemmed_keywords:
        query_vector.append(1)
    else:
        query_vector.append(0)
query_unit_vector = [x / norm(query_vector) for x in query_vector]
similarities = [cosine_similarity(query_unit_vector, doc_tfidf[d_id])
                for d_id in range(doc_tfidf.shape[0])]
# this holds the similarities with their respective document IDs
sim_doc_ids = df
sim_doc_ids.insert(1, 'similarity_scores', similarities)
sim_doc_ids.sort_values(by = 'similarity_scores', ascending = False,
                        inplace = True)
sorted_doc_ids = sim_doc_ids.documentID.tolist()
sorted_similarities = sim_doc_ids.similarity_scores.tolist()
# print documentID and scores, sorted by scores
print('documentID | score')
for i in range(len(sorted_doc_ids)):
    if sorted_similarities[i] > 0:
        print(f"{sorted_doc_ids[i]:10} | {sorted_similarities[i]:.4f}")