partb5 complete, refactored code for cleaner design

parent 8cb005de77
commit a171eb1ca1

11 changed files with 230 additions and 139 deletions
README.md | 11
@@ -12,9 +12,12 @@ Using the cricket dataset from the LMS, this part of the project deals with buil
 
 ## Dependencies:
 
-- pandas >= 1.2.2
-- matplotlib >= 3.3.2
-- numpy >= 1.19.2
+Python version used: 3.9.4
+
+- pandas >= 1.2.4
+- matplotlib >= 3.4.1
+- numpy >= 1.20.2
 - regex >= 2021.4.4
+- scikit-learn >= 0.24.1
 - nltk >= 3.6.1
-  - punkt model needed
+  - punkt model and Porter stemmer needed
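
Setup note (not part of the commit): the Porter stemmer ships with NLTK itself, so of the items above only the punkt model needs a separate download. A minimal sketch, assuming the standard NLTK downloader is acceptable:

    import nltk

    # fetch the punkt tokeniser model once; PorterStemmer needs no download
    nltk.download('punkt')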
owid-covid-2020-visual-analysis.pdf | BIN (new file; binary file not shown)
parta1.py | 32
@@ -6,18 +6,22 @@ parser = argparse.ArgumentParser()
 parser.add_argument('path_to_csv', help = 'path to the csv file')
 args = parser.parse_args()
 
-all_covid_data = pd.read_csv('data/owid-covid-data.csv', encoding = 'ISO-8859-1')
+all_covid_data = pd.read_csv('data/owid-covid-data.csv', \
+                             encoding = 'ISO-8859-1')
 
 # filter out data past 2020
-all_covid_data = all_covid_data[(all_covid_data['date'] >= '2020-01-01') & (all_covid_data['date'] <= '2020-12-31')]
+all_covid_data = all_covid_data[(all_covid_data['date'] >= '2020-01-01') \
+                                & (all_covid_data['date'] <= '2020-12-31')]
 all_covid_data.date = pd.to_datetime(all_covid_data.date)
 
 # create groupby objects and sum new cases/deaths by month
 new_cases = all_covid_data.loc[:, ['location', 'date', 'new_cases']]
-new_cases_grouped = new_cases.groupby([new_cases.date.dt.month, new_cases.location]).new_cases.sum()
+new_cases_grouped = new_cases.groupby([new_cases.date.dt.month, \
+                                       new_cases.location]).new_cases.sum()
 
 new_deaths = all_covid_data.loc[:, ['location', 'date', 'new_deaths']]
-new_deaths_grouped = new_deaths.groupby([new_deaths.date.dt.month, new_deaths.location]).new_deaths.sum()
+new_deaths_grouped = new_deaths.groupby([new_deaths.date.dt.month, \
+                                         new_deaths.location]).new_deaths.sum()
 
 # convert multi-indexed series to dataframe
 new_cases_grouped = new_cases_grouped.to_frame()
@@ -30,10 +34,13 @@ new_cases_grouped.sort_values(by = ['location', 'date'], inplace = True)
 new_deaths_grouped.sort_values(by = ['location', 'date'], inplace = True)
 
 # merge new_deaths_grouped and new_cases_grouped
-aggregated_data = new_cases_grouped.merge(new_deaths_grouped, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
+aggregated_data = new_cases_grouped.merge(new_deaths_grouped, \
+    how = 'outer', left_on = ['location', 'date'], \
+    right_on = ['location', 'date'])
 
 # filter out all entries that aren't at the end of the month
-all_covid_data['end_of_month'] = pd.to_datetime(all_covid_data['date']).dt.is_month_end
+all_covid_data['end_of_month'] = \
+    pd.to_datetime(all_covid_data['date']).dt.is_month_end
 all_covid_data = all_covid_data.loc[all_covid_data.end_of_month, :]
 
 # extract monthly total cases and total deaths
@@ -44,14 +51,19 @@ total_deaths = all_covid_data.loc[:, ['location', 'date', 'total_deaths']]
 total_deaths.date = total_deaths.date.dt.month
 
 # merge total_deaths and total_cases into aggregated_data
-aggregated_data = aggregated_data.merge(total_cases, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
-aggregated_data = aggregated_data.merge(total_deaths, how = 'outer', left_on = ['location', 'date'], right_on = ['location', 'date'])
+aggregated_data = aggregated_data.merge(total_cases, how = 'outer', \
+    left_on = ['location', 'date'], right_on = ['location', 'date'])
+aggregated_data = aggregated_data.merge(total_deaths, how = 'outer', \
+    left_on = ['location', 'date'], right_on = ['location', 'date'])
 
 # compute case fatality rate for each month
-aggregated_data['case_fatality_rate'] = (aggregated_data['new_deaths'] / aggregated_data['new_cases'])
+aggregated_data['case_fatality_rate'] = \
+    (aggregated_data['new_deaths'] / aggregated_data['new_cases'])
 
 # format aggregated_data and output results
-aggregated_data = aggregated_data.reindex(columns = ['location', 'date', 'case_fatality_rate', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths'])
+aggregated_data = aggregated_data.reindex(columns = ['location', 'date', \
+    'case_fatality_rate', 'total_cases', 'new_cases', 'total_deaths', \
+    'new_deaths'])
 aggregated_data.rename(columns = {'date': 'month'}, inplace = True)
 aggregated_data.set_index(['location', 'month'], inplace = True)
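
For readers skimming the diff, the aggregation pattern this script relies on (group new cases and deaths by month and location, outer-merge the results, then derive a ratio column) is easier to see on a toy example. A minimal sketch with invented data, not the OWID dataset:

    import pandas as pd

    # invented stand-in for owid-covid-data.csv
    toy = pd.DataFrame({
        'location': ['A', 'A', 'B', 'B'],
        'date': pd.to_datetime(['2020-01-15', '2020-02-10',
                                '2020-01-20', '2020-02-25']),
        'new_cases': [10, 20, 5, 15],
        'new_deaths': [1, 2, 0, 3],
    })

    # sum new cases/deaths per (month, location), as parta1.py does
    cases = toy.groupby([toy.date.dt.month, toy.location]).new_cases.sum().to_frame()
    deaths = toy.groupby([toy.date.dt.month, toy.location]).new_deaths.sum().to_frame()

    # outer-merge on the shared (month, location) index and derive the ratio
    merged = cases.merge(deaths, how = 'outer', left_index = True, right_index = True)
    merged['case_fatality_rate'] = merged['new_deaths'] / merged['new_cases']
    print(merged)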
parta2.py | 12
@@ -9,7 +9,8 @@ parser.add_argument('scatter_a', help = 'output path of figure 1')
 parser.add_argument('scatter_b', help = 'output path of figure 2')
 args = parser.parse_args()
 
-all_covid_data = pd.read_csv('data/owid-covid-data.csv', encoding = 'ISO-8859-1')
+all_covid_data = pd.read_csv('data/owid-covid-data.csv', \
+                             encoding = 'ISO-8859-1')
 
 # filter out data - only need 2020-12-31
 all_covid_data = all_covid_data[(all_covid_data['date'] == '2020-12-31')]
@@ -21,13 +22,12 @@ total_cases.set_index(['location'], inplace = True)
 total_deaths = all_covid_data.loc[:, ['location', 'total_deaths']]
 
 # merge total_cases and total_deaths
-aggregated_data = total_cases.merge(total_deaths, how = 'inner', on = 'location')
+aggregated_data = total_cases.merge(total_deaths, how = 'inner', \
+                                    on = 'location')
 
 # compute case fatality rate for each country
-aggregated_data['case_fatality_rate'] = (aggregated_data['total_deaths'] / aggregated_data['total_cases'])
-# format aggregated_data
-aggregated_data = aggregated_data.reindex(columns = ['location', 'case_fatality_rate', 'total_deaths', 'total_cases'])
+aggregated_data['case_fatality_rate'] = \
+    (aggregated_data['total_deaths'] / aggregated_data['total_cases'])
 
 # extract case fatality rate from aggregated data
 case_fatality_rate = aggregated_data.loc[:, ['case_fatality_rate']]
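
The hunks above only cover data preparation; the code that actually writes the figures to args.scatter_a and args.scatter_b is outside this diff. Purely as a hypothetical illustration of that step (invented data; the column and scale choices are assumptions, not the commit's code):

    import pandas as pd
    import matplotlib.pyplot as plt

    # invented stand-in for the script's aggregated_data frame
    aggregated_data = pd.DataFrame({
        'total_cases': [1_000_000, 50_000, 3_000_000],
        'case_fatality_rate': [0.021, 0.008, 0.015],
    })

    fig, ax = plt.subplots()
    ax.scatter(aggregated_data['total_cases'],
               aggregated_data['case_fatality_rate'], s = 10)
    ax.set_xscale('log')   # case totals span several orders of magnitude
    ax.set_xlabel('total cases at 2020-12-31')
    ax.set_ylabel('case fatality rate')
    fig.savefig('scatter-a.png')   # the script writes to args.scatter_a instead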
partb1.py
@@ -13,10 +13,9 @@ pattern = r'[A-Z]{4}-\d{3}[a-zA-Z]?'
 
 os.chdir(os.getcwd() + '/cricket')
 
+# open every file, search each line for the document ID, add it to the list
 document_ids = []
 filenames = []
 
-# open every file, search each line for the document ID, add it to the list
 for filename in os.listdir():
     filenames.append(filename)
     f = open(filename)
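
For context, the document-ID pattern at the top of this file can be exercised on its own; the sample line below is invented:

    import re

    pattern = r'[A-Z]{4}-\d{3}[a-zA-Z]?'

    line = 'Match report NKTS-045a: day one'   # invented sample line
    match = re.search(pattern, line)
    if match:
        print(match.group())   # prints NKTS-045a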
partb2.py | 19
@@ -20,11 +20,16 @@ def apply_preprocessing(f):
     f = f.lower()
     return f
 
-# parse input arguments
-parser = argparse.ArgumentParser()
-parser.add_argument('path_to_file', help = 'path to document')
-args = parser.parse_args()
+def read_args():
+    '''Creates an argparse ArgumentParser to read the command line
+    arguments.'''
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path_to_file', help = 'path to document')
+    args = parser.parse_args()
+    return args
 
-f = read_document(args.path_to_file)
-f = apply_preprocessing(f)
-print(f)
+if __name__ == '__main__':
+    args = read_args()
+    f = read_document(args.path_to_file)
+    f = apply_preprocessing(f)
+    print(f)
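
The point of moving the argparse code into read_args() and putting the script body behind an __main__ guard is that partb3.py-partb5.py can now import read_document() and apply_preprocessing() without the import triggering argument parsing. A small usage sketch (the file path is invented):

    # importing partb2 no longer parses arguments or prints anything
    from partb2 import read_document, apply_preprocessing

    text = read_document('cricket/some_match_report.txt')   # invented path
    text = apply_preprocessing(text)   # alphabetic only, single spaces, lowercase
    print(text.split()[:10])           # first few normalised words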
partb3.py | 82
@@ -1,54 +1,50 @@
-import re
 import pandas as pd
 import nltk
 import os
 import argparse
+from partb2 import read_document, apply_preprocessing
 
-def read_document(path):
-    '''Reads a file when provided with its path, and returns a string
-    containing the lines of the file.'''
-    file_given = open(path)
-    f = ""
-    for line in file_given:
-        f += line + " "
-    file_given.close()
-    return f
-
-def apply_preprocessing(f):
-    '''Removes non-alphabetic characters, replaces all whitespace characters
-    with a single whitespace, and changes all uppercase characters to
-    lowercase'''
-    f = re.sub(r'[^a-zA-Z\s]', r'', f)
-    f = re.sub(r'\s+', r' ', f)
-    f = f.lower()
-    return f
-
-# parse input arguments
-parser = argparse.ArgumentParser()
-parser.add_argument('keywords', nargs = '+', help = 'keywords to search for \
-    (1-5 keywords accepted)')
-args = parser.parse_args()
-if len(args.keywords) > 5:
-    print("Too many keywords.")
-    quit()
-
-# load document IDs from csv
-df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
-doc_ids = pd.Series(data = df.documentID.tolist(), \
-    index = df.filename.tolist())
-documents = doc_ids.index.tolist()
-matched_doc_ids = []
-
-os.chdir(os.getcwd() + '/cricket')
-
-# search through each document for the keywords
-for doc in documents:
-    f = read_document(doc)
-    f = apply_preprocessing(f)
-
-    tokens = nltk.word_tokenize(f)
-    # only add the document ID if all the keywords are in the token list
-    if all(keyword in tokens for keyword in args.keywords):
-        matched_doc_ids.append(doc_ids.get(doc))
-
-print(matched_doc_ids)
+def read_args():
+    '''Creates an argparse ArgumentParser to read the command line
+    arguments.'''
+    parser = argparse.ArgumentParser()
+    parser.add_argument('keywords', nargs = '+', \
+        help = 'keywords to search for (1-5 keywords accepted)')
+    args = parser.parse_args()
+    if len(args.keywords) > 5:
+        print("Too many keywords.")
+        quit()
+    return args
+
+def load_doc_ids():
+    '''Loads in the documentIDs from partb1.csv, and returns the lists of
+    documentIDs and filenames.'''
+    df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
+    doc_ids = pd.Series(data = df.documentID.tolist(), \
+        index = df.filename.tolist())
+    documents = doc_ids.index.tolist()
+    return doc_ids, documents
+
+def find_matching_docs(doc_ids, documents, args):
+    '''Takes the document list, applies pre-processing techniques to each
+    document, tokenises the words, and returns a list containing document IDs
+    that match the keywords given as arguments to this program.'''
+    matched_doc_ids = []
+    for doc in documents:
+        f = read_document(doc)
+        f = apply_preprocessing(f)
+
+        tokens = nltk.word_tokenize(f)
+        # only add the document ID if all the keywords are in the token list
+        if all(keyword in tokens for keyword in args.keywords):
+            matched_doc_ids.append(doc_ids.get(doc))
+
+    return matched_doc_ids
+
+if __name__ == '__main__':
+    args = read_args()
+    doc_ids, documents = load_doc_ids()
+    os.chdir(os.getcwd() + '/cricket')
+    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
+    print(matched_doc_ids)
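
The matching rule inside find_matching_docs() is simply that every keyword must appear among a document's tokens. A tiny self-contained illustration with an invented, already-preprocessed line (punkt must be downloaded once, as the README notes):

    import nltk

    text = 'the batsman scored a century at the oval'   # invented example
    tokens = nltk.word_tokenize(text)

    print(all(k in tokens for k in ['century', 'oval']))     # True
    print(all(k in tokens for k in ['century', 'wicket']))   # False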
partb4.py | 85
@@ -1,66 +1,37 @@
-import re
-import pandas as pd
 import os
 import nltk
 from nltk.stem.porter import *
-import argparse
+from partb2 import read_document, apply_preprocessing
+from partb3 import read_args, load_doc_ids
 
-def read_document(path):
-    '''Reads a file when provided with its path, and returns a string
-    containing the lines of the file.'''
-    file_given = open(path)
-    f = ""
-    for line in file_given:
-        f += line + " "
-    file_given.close()
-    return f
-
-def apply_preprocessing(f):
-    '''Removes non-alphabetic characters, replaces all whitespace characters
-    with a single whitespace, and changes all uppercase characters to
-    lowercase'''
-    f = re.sub(r'[^a-zA-Z\s]', r'', f)
-    f = re.sub(r'\s+', r' ', f)
-    f = f.lower()
-    return f
-
-# parse input arguments
-parser = argparse.ArgumentParser()
-parser.add_argument('keywords', nargs = '+', help = 'keywords to search \
-    for (1-5 keywords accepted)')
-args = parser.parse_args()
-if len(args.keywords) > 5:
-    print("Too many keywords.")
-    quit()
-
-# load document IDs from csv
-df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
-doc_ids = pd.Series(data = df.documentID.tolist(), \
-    index = df.filename.tolist())
-documents = doc_ids.index.tolist()
-matched_doc_ids = []
-
-# change directory to get cricket data
-os.chdir(os.getcwd() + '/cricket')
-
-# search through each document for the keywords
-porter_stemmer = PorterStemmer()
-
-for doc in documents:
-    f = read_document(doc)
-    f = apply_preprocessing(f)
-
-    # tokenise the document, remove stop words
-    word_list = nltk.word_tokenize(f)
-
-    # use the Porter stemmer to add stem words to the word list
-    for word in word_list:
-        stemmed_word = porter_stemmer.stem(word)
-        if stemmed_word not in word_list:
-            word_list.append(stemmed_word)
-
-    # add document ID if all keywords are in this new word list
-    if all(keyword in word_list for keyword in args.keywords):
-        matched_doc_ids.append(doc_ids.get(doc))
-
-print(matched_doc_ids)
+def find_matching_docs(doc_ids, documents, args):
+    '''Takes the document list, applies pre-processing techniques to each
+    document, tokenises the words, and returns a list containing document IDs
+    that match the keywords given as arguments to this program.'''
+    matched_doc_ids = []
+    porter_stemmer = PorterStemmer()
+    for doc in documents:
+        f = read_document(doc)
+        f = apply_preprocessing(f)
+
+        # tokenise the document
+        word_list = nltk.word_tokenize(f)
+
+        # use the Porter stemmer to add stem words to the word list
+        for word in word_list:
+            stemmed_word = porter_stemmer.stem(word)
+            if stemmed_word not in word_list:
+                word_list.append(stemmed_word)
+
+        # add document ID if all keywords are in this new word list
+        if all(keyword in word_list for keyword in args.keywords):
+            matched_doc_ids.append(doc_ids.get(doc))
+
+    return matched_doc_ids
+
+if __name__ == '__main__':
+    args = read_args()
+    doc_ids, documents = load_doc_ids()
+    os.chdir(os.getcwd() + '/cricket')
+    matched_doc_ids = find_matching_docs(doc_ids, documents, args)
+    print(matched_doc_ids)
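
partb4.py differs from partb3.py mainly in that each token's Porter stem is appended to the word list before matching, so a stem-like keyword such as 'bat' can also match a document that only contains 'batting'. A small illustration with invented tokens (iterating over a copy of the list is a safety tweak here; the script itself appends to the list it is looping over):

    from nltk.stem.porter import PorterStemmer

    porter_stemmer = PorterStemmer()
    word_list = ['batting', 'collapse', 'declared']   # invented document tokens

    # add each token's stem, mirroring the loop in find_matching_docs()
    for word in list(word_list):
        stemmed_word = porter_stemmer.stem(word)
        if stemmed_word not in word_list:
            word_list.append(stemmed_word)

    print(word_list)   # e.g. 'bat' is appended as the stem of 'batting'
    print(all(k in word_list for k in ['bat', 'collapse']))   # True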
partb5.py | 111
@@ -1,6 +1,111 @@
-## Part B Task 5
-import re
 import os
-import sys
 import pandas as pd
 import nltk
+import argparse
+from math import sqrt
+from numpy.linalg import norm
+from numpy import dot
+from nltk.corpus import stopwords
+from nltk.stem.porter import *
+from sklearn.feature_extraction.text import TfidfTransformer
+from partb2 import read_document, apply_preprocessing
+from partb3 import read_args
+
+def cosine_similarity(x1, x2):
+    '''Calculates the cosine similarity between two vectors. Equal to the
+    cosine definition using the dot product.'''
+    return dot(x1, x2) / (norm(x1) * norm(x2))
+
+args = read_args()
+
+# load document IDs from csv
+df = pd.read_csv('partb1.csv', encoding = 'ISO-8859-1')
+doc_ids = pd.Series(data = df.documentID.tolist(), \
+                    index = df.filename.tolist())
+documents = doc_ids.index.tolist()
+
+# change directory to get cricket data
+os.chdir(os.getcwd() + '/cricket')
+
+# build the corpus
+corpus = []
+porter_stemmer = PorterStemmer()
+stop_words = set(stopwords.words('english'))
+
+for doc in documents:
+    f = read_document(doc)
+    f = apply_preprocessing(f)
+
+    # tokenise the document, remove stop words
+    word_list = nltk.word_tokenize(f)
+    word_list = [w for w in word_list if not w in stop_words]
+
+    # add all words and their stems to the corpus
+    for word in word_list:
+        stemmed_word = porter_stemmer.stem(word)
+
+        if stemmed_word not in corpus:
+            corpus.append(word)
+
+# build the term counts
+term_counts = []
+for doc in documents:
+    curr_term_count = []
+    f = read_document(doc)
+    f = apply_preprocessing(f)
+
+    # tokenise the document, remove stop words
+    word_list = nltk.word_tokenize(f)
+    word_list = [w for w in word_list if not w in stop_words]
+
+    # build frequency dictionary of stemmed words
+    wordDict = {}
+    for word in word_list:
+        stemmed_word = porter_stemmer.stem(word)
+
+        if stemmed_word in wordDict:
+            wordDict[stemmed_word] += 1
+        else:
+            wordDict[stemmed_word] = 1
+
+    # fill in the current count of terms, then add to the overall list
+    for word in corpus:
+        if word in wordDict.keys():
+            curr_term_count.append(wordDict[word])
+        else:
+            curr_term_count.append(0)
+
+    term_counts.append(curr_term_count)
+
+# calculate the tf-idf scores
+transformer = TfidfTransformer()
+tfidf = transformer.fit_transform(term_counts)
+doc_tfidf = tfidf.toarray()
+
+# construct the query unit vector
+query_vector = []
+for word in corpus:
+    if word in args.keywords:
+        # this assumes that keywords are unique
+        # and aren't entered more than once
+        query_vector.append(1)
+    else:
+        query_vector.append(0)
+query_unit_vector = [x / norm(query_vector) for x in query_vector]
+
+similarities = [cosine_similarity(query_unit_vector, doc_tfidf[d_id]) for \
+                d_id in range(doc_tfidf.shape[0])]
+
+# this holds the similarities with their respective document IDs
+sim_doc_ids = df
+sim_doc_ids.insert(1, 'similarity_scores', similarities)
+sim_doc_ids.sort_values(by = 'similarity_scores', ascending = False, \
+                        inplace = True)
+sorted_doc_ids = sim_doc_ids.documentID.tolist()
+sorted_similarities = sim_doc_ids.similarity_scores.tolist()
+
+# print documentID and scores, sorted by scores
+print('documentID | score')
+for i in range(len(sorted_doc_ids)):
+    if sorted_similarities[i] > 0:
+        print(f"{sorted_doc_ids[i]:10} | {sorted_similarities[i]:.4f}")
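
To see the ranking idea in the new partb5.py in isolation, here is a condensed, self-contained sketch on an invented three-document corpus. It swaps the hand-built term counts plus TfidfTransformer used in the diff for sklearn's TfidfVectorizer, so it illustrates the approach rather than reproducing the script:

    import numpy as np
    from numpy import dot
    from numpy.linalg import norm
    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = ['spin bowler takes five wickets',          # invented mini-corpus
            'opening batsman scores a century',
            'bowler and batsman share the honours']
    keywords = ['bowler', 'batsman']

    # tf-idf matrix: one row per document, one column per vocabulary term
    vectorizer = TfidfVectorizer()
    doc_tfidf = vectorizer.fit_transform(docs).toarray()

    # binary query vector over the same vocabulary, scaled to unit length
    vocab = sorted(vectorizer.vocabulary_, key = vectorizer.vocabulary_.get)
    query = np.array([1.0 if term in keywords else 0.0 for term in vocab])
    query = query / norm(query)

    # rank documents by cosine similarity against the query, highest first
    scores = [dot(query, doc_tfidf[i]) / (norm(query) * norm(doc_tfidf[i]))
              for i in range(len(docs))]
    for i in sorted(range(len(docs)), key = lambda i: -scores[i]):
        print(f'{scores[i]:.4f}  {docs[i]}')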
scatter-a.png | BIN (binary file not shown; size 86 KiB before and after)
scatter-b.png | BIN (binary file not shown; size 94 KiB before and after)