comp20008-project01/partb5.py

import os
import pandas as pd
import nltk
import argparse
from numpy.linalg import norm
from numpy import dot
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from partb2 import read_document, apply_preprocessing
from partb3 import read_args


def cosine_similarity(x1, x2):
    '''Calculate the cosine similarity between two vectors, i.e. the cosine
    of the angle between them, computed from their dot product and norms.'''
    return dot(x1, x2) / (norm(x1) * norm(x2))
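
# Illustrative check (hypothetical values, kept as a comment so nothing extra
# runs in the script): cosine_similarity([1, 0], [1, 1]) gives 1 / sqrt(2),
# about 0.7071, and any vector compared with itself gives 1.0.
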
args = read_args()
# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
doc_ids = pd.Series(data=df.documentID.tolist(),
                    index=df.filename.tolist())
documents = doc_ids.index.tolist()
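# doc_ids maps each filename to its documentID; `documents` is the list of
# filenames in the same row order as df, which keeps the similarity scores
# computed below aligned with the correct documentIDs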
# change directory to get cricket data
os.chdir(os.path.join(os.getcwd(), 'cricket'))
# build the corpus
corpus = []
porter_stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
for doc in documents:
    f = read_document(doc)
    f = apply_preprocessing(f)
    # tokenise the document, remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]
    # add the stem of each word to the corpus, skipping duplicates
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)
        if stemmed_word not in corpus:
            corpus.append(stemmed_word)
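# `corpus` is now the vocabulary: every distinct stemmed, stop-word-filtered
# term seen across the documents, in order of first appearance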
# build the term counts
term_counts = []
for doc in documents:
    curr_term_count = []
    f = read_document(doc)
    f = apply_preprocessing(f)
    # tokenise the document, remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]
    # build frequency dictionary of stemmed words
    word_dict = {}
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)
        if stemmed_word in word_dict:
            word_dict[stemmed_word] += 1
        else:
            word_dict[stemmed_word] = 1
    # fill in the current count of terms, then add to the overall list
    for word in corpus:
        if word in word_dict:
            curr_term_count.append(word_dict[word])
        else:
            curr_term_count.append(0)
    term_counts.append(curr_term_count)
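# term_counts is a (number of documents) x (corpus size) matrix of raw
# stemmed-term frequencies, with one row per document in the same order as df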
# calculate the tf-idf scores
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(term_counts)
doc_tfidf = tfidf.toarray()
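# TfidfTransformer re-weights the raw counts by inverse document frequency;
# with scikit-learn's defaults each row of doc_tfidf is also L2-normalised,
# so every document is represented by a unit vector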
# construct the query unit vector over the corpus terms
query_vector = []
for word in corpus:
    if word in args.keywords:
        # this assumes the keywords are unique, entered only once, and
        # already given in the same preprocessed (stemmed) form as the
        # corpus terms
        query_vector.append(1)
    else:
        query_vector.append(0)
query_unit_vector = [x / norm(query_vector) for x in query_vector]
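# note: if none of the keywords appear in the corpus, norm(query_vector) is 0
# and every entry above becomes NaN; otherwise this yields a unit-length
# query vector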
similarities = [cosine_similarity(query_unit_vector, doc_tfidf[d_id])
                for d_id in range(doc_tfidf.shape[0])]
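# since the query vector and the tf-idf rows are both unit vectors, the
# cosine similarity here is effectively just their dot product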
# this holds the similarities with their respective document IDs
sim_doc_ids = df
sim_doc_ids.insert(1, 'similarity_scores', similarities)
sim_doc_ids.sort_values(by='similarity_scores', ascending=False,
                        inplace=True)
sorted_doc_ids = sim_doc_ids.documentID.tolist()
sorted_similarities = sim_doc_ids.similarity_scores.tolist()
# print documentID and scores, sorted by scores
print('documentID | score')
for i in range(len(sorted_doc_ids)):
    if sorted_similarities[i] > 0:
        print(f"{sorted_doc_ids[i]:10} | {sorted_similarities[i]:.4f}")
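
# Example invocation (hypothetical: this assumes partb3's read_args takes the
# query keywords from the command line, which is not shown in this file):
#   python partb5.py <keyword> [<keyword> ...]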