2021-03-01 17:57:17 +11:00
|
|
|
import re
|
2021-04-11 19:43:49 +10:00
|
|
|
import argparse
|
2021-03-01 17:57:17 +11:00
|
|
|
|
2021-04-14 23:42:57 +10:00
|
|
|
def read_document(path):
    '''Read the file at ``path`` and return its contents as one string.

    Each line is kept exactly as read (including its line terminator) and
    is followed by a single space, so the result is the concatenation of
    ``line + " "`` for every line in the file.  An empty file yields ``""``.

    path -- location of the file.  It is handed straight to ``open``, so
            the file is read in text mode with the default encoding.

    Any exception raised by ``open`` or while reading is propagated to the
    caller.
    '''
    # The context manager closes the handle even when reading fails part
    # way through, which the previous manual open()/close() pair did not.
    with open(path) as document:
        # join builds the result in one pass instead of repeated "+=".
        return "".join(line + " " for line in document)
|
|
|
|
|
|
|
|
def apply_preprocessing(f):
    '''Normalise text to lowercase ASCII letters separated by single spaces.

    Three steps are applied in order:

    1. Every character that is neither an ASCII letter (a-z, A-Z) nor
       whitespace is deleted.  Digits, punctuation and non-ASCII letters
       are dropped without inserting a separator, so ``"A-b"`` becomes
       ``"Ab"``.
    2. Each run of whitespace is replaced by one space character.
    3. The text is lowercased.

    Leading and trailing whitespace is collapsed to a single space, not
    stripped.

    f -- the string to normalise.
    Returns the normalised string.
    '''
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', r'', f)
    # Collapse after the deletion above so that whitespace left on either
    # side of a removed character ends up as a single space.
    single_spaced = re.sub(r'\s+', r' ', letters_and_spaces)
    return single_spaced.lower()
|
|
|
|
|
2021-04-15 07:47:29 +10:00
|
|
|
def read_args(argv=None):
    '''Parse the command line and return the resulting namespace.

    The parser accepts one required positional argument, ``path_to_file``,
    which is available as the attribute of the same name on the returned
    object.

    argv -- optional list of argument strings to parse instead of the
            process command line.  When left as ``None`` (the default)
            argparse reads ``sys.argv[1:]``, so existing ``read_args()``
            calls behave as before.

    Returns the ``argparse.Namespace`` produced by ``parse_args``.
    On invalid arguments argparse prints a usage message and exits.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('path_to_file', help='path to document')
    return parser.parse_args(argv)
|
2021-04-11 19:43:49 +10:00
|
|
|
|
2021-04-15 07:47:29 +10:00
|
|
|
if __name__ == '__main__':
    # Script entry point: read the document named on the command line,
    # normalise its text and print the result to standard output.
    args = read_args()
    raw_text = read_document(args.path_to_file)
    print(apply_preprocessing(raw_text))
|