import os

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from numpy import dot
from numpy.linalg import norm
from sklearn.feature_extraction.text import TfidfTransformer

from partb2 import read_document, apply_preprocessing
from partb3 import read_args


def cosine_similarity(x1, x2):
    '''Calculates the cosine similarity between two vectors,
    i.e. the dot product divided by the product of the norms.'''
    return dot(x1, x2) / (norm(x1) * norm(x2))


args = read_args()

# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
doc_ids = pd.Series(data=df.documentID.tolist(),
                    index=df.filename.tolist())
documents = doc_ids.index.tolist()

# change directory to get cricket data
os.chdir(os.getcwd() + '/cricket')

# build the corpus of stemmed terms
corpus = []
porter_stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
for doc in documents:
    f = read_document(doc)
    f = apply_preprocessing(f)

    # tokenise the document, remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]

    # add each previously unseen stemmed word to the corpus
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)
        if stemmed_word not in corpus:
            corpus.append(stemmed_word)

# build the term counts
term_counts = []
for doc in documents:
    curr_term_count = []
    f = read_document(doc)
    f = apply_preprocessing(f)

    # tokenise the document, remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]

    # build frequency dictionary of stemmed words
    word_dict = {}
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)
        if stemmed_word in word_dict:
            word_dict[stemmed_word] += 1
        else:
            word_dict[stemmed_word] = 1

    # fill in the current count of terms, then add to the overall list
    for word in corpus:
        if word in word_dict:
            curr_term_count.append(word_dict[word])
        else:
            curr_term_count.append(0)
    term_counts.append(curr_term_count)

# calculate the tf-idf scores
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(term_counts)
doc_tfidf = tfidf.toarray()

# construct the query unit vector; the keywords are stemmed so that they
# can match the stemmed corpus terms
stemmed_keywords = {porter_stemmer.stem(k) for k in args.keywords}
query_vector = []
for word in corpus:
    if word in stemmed_keywords:
        # this assumes that keywords are unique
        # and aren't entered more than once
        query_vector.append(1)
    else:
        query_vector.append(0)
query_unit_vector = [x / norm(query_vector) for x in query_vector]

similarities = [cosine_similarity(query_unit_vector, doc_tfidf[d_id])
                for d_id in range(doc_tfidf.shape[0])]

# this holds the similarities with their respective document IDs
sim_doc_ids = df.copy()
sim_doc_ids.insert(1, 'similarity_scores', similarities)
sim_doc_ids.sort_values(by='similarity_scores', ascending=False,
                        inplace=True)
sorted_doc_ids = sim_doc_ids.documentID.tolist()
sorted_similarities = sim_doc_ids.similarity_scores.tolist()

# print documentID and scores, sorted by scores
print('documentID | score')
for i in range(len(sorted_doc_ids)):
    if sorted_similarities[i] > 0:
        print(f"{sorted_doc_ids[i]:10} | {sorted_similarities[i]:.4f}")
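
# Example invocation (a sketch only: the actual script filename and the exact
# argument format depend on how partb3.read_args defines its parser; the name
# 'partb4.py' and the positional keyword list below are assumptions):
#
#   python partb4.py batting wicket innings
#
# Expected output is the header 'documentID | score' followed by one row per
# document with a non-zero similarity, sorted by score in descending order.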