comp20008-project01/partb5.py

import os
import pandas as pd
import nltk
import argparse
from numpy.linalg import norm
from numpy import dot
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from partb2 import read_document, apply_preprocessing
from partb3 import read_args


def cosine_similarity(x1, x2):
    '''Calculate the cosine similarity between two vectors, i.e. the cosine
    of the angle between them, computed from their dot product and norms.'''
    return dot(x1, x2) / (norm(x1) * norm(x2))
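
# Illustrative check (hypothetical values, kept as a comment so nothing extra
# runs in the script): cosine_similarity([1, 0], [1, 1]) gives 1 / sqrt(2),
# about 0.7071, and any vector compared with itself gives 1.0.
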
args = read_args()
# load document IDs from csv
df = pd.read_csv('partb1.csv', encoding='ISO-8859-1')
doc_ids = pd.Series(data=df.documentID.tolist(),
                    index=df.filename.tolist())
documents = doc_ids.index.tolist()
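# doc_ids maps each filename to its documentID; `documents` is the list of
# filenames in the same row order as df, which keeps the similarity scores
# computed below aligned with the correct documentIDs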
# change directory to get cricket data
os.chdir(os.path.join(os.getcwd(), 'cricket'))
# build the corpus
corpus = []
porter_stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
for doc in documents:
    f = read_document(doc)
    f = apply_preprocessing(f)
    # tokenise the document, remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]
    # add the stem of each word to the corpus, skipping duplicates
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)
        if stemmed_word not in corpus:
            corpus.append(stemmed_word)
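# `corpus` is now the vocabulary: every distinct stemmed, stop-word-filtered
# term seen across the documents, in order of first appearance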
# build the term counts
term_counts = []
for doc in documents:
    curr_term_count = []
    f = read_document(doc)
    f = apply_preprocessing(f)
    # tokenise the document, remove stop words
    word_list = nltk.word_tokenize(f)
    word_list = [w for w in word_list if w not in stop_words]
    # build frequency dictionary of stemmed words
    word_dict = {}
    for word in word_list:
        stemmed_word = porter_stemmer.stem(word)
        if stemmed_word in word_dict:
            word_dict[stemmed_word] += 1
        else:
            word_dict[stemmed_word] = 1
    # fill in the current count of terms, then add to the overall list
    for word in corpus:
        if word in word_dict:
            curr_term_count.append(word_dict[word])
        else:
            curr_term_count.append(0)
    term_counts.append(curr_term_count)
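# term_counts is a (number of documents) x (corpus size) matrix of raw
# stemmed-term frequencies, with one row per document in the same order as df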
# calculate the tf-idf scores
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(term_counts)
doc_tfidf = tfidf.toarray()
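# TfidfTransformer re-weights the raw counts by inverse document frequency;
# with scikit-learn's defaults each row of doc_tfidf is also L2-normalised,
# so every document is represented by a unit vector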
# construct the query unit vector over the corpus terms
query_vector = []
for word in corpus:
    if word in args.keywords:
        # this assumes the keywords are unique, entered only once, and
        # already given in the same preprocessed (stemmed) form as the
        # corpus terms
        query_vector.append(1)
    else:
        query_vector.append(0)
query_unit_vector = [x / norm(query_vector) for x in query_vector]
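# note: if none of the keywords appear in the corpus, norm(query_vector) is 0
# and every entry above becomes NaN; otherwise this yields a unit-length
# query vector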
similarities = [cosine_similarity(query_unit_vector, doc_tfidf[d_id])
                for d_id in range(doc_tfidf.shape[0])]
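# since the query vector and the tf-idf rows are both unit vectors, the
# cosine similarity here is effectively just their dot product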
# this holds the similarities with their respective document IDs
sim_doc_ids = df
sim_doc_ids.insert(1, 'similarity_scores', similarities)
sim_doc_ids.sort_values(by='similarity_scores', ascending=False,
                        inplace=True)
sorted_doc_ids = sim_doc_ids.documentID.tolist()
sorted_similarities = sim_doc_ids.similarity_scores.tolist()
# print documentID and scores, sorted by scores
print('documentID | score')
for i in range(len(sorted_doc_ids)):
    if sorted_similarities[i] > 0:
        print(f"{sorted_doc_ids[i]:10} | {sorted_similarities[i]:.4f}")
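
# Example invocation (hypothetical: this assumes partb3's read_args takes the
# query keywords from the command line, which is not shown in this file):
#   python partb5.py <keyword> [<keyword> ...]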