comp20008-project01/partb2.py

31 lines
812 B
Python
Raw Normal View History

2021-03-01 17:57:17 +11:00
import re
2021-04-11 19:43:49 +10:00
import argparse
2021-03-01 17:57:17 +11:00
2021-04-14 23:42:57 +10:00
def read_document(path):
    '''Read the text file at ``path`` and return its contents as one string.

    Every line keeps its own line ending and is followed by one extra
    space, so the lines of the file end up joined into a single string.
    An empty file yields an empty string.

    Parameters:
        path: location of the file to read, passed straight to ``open``.

    Returns:
        A string made of each line of the file followed by a space.

    Raises:
        OSError: if the file cannot be opened (propagated from ``open``).
    '''
    # The context manager closes the file even when reading fails part-way
    # through, which the manual open()/close() pairing did not guarantee.
    with open(path) as file_given:
        # join() builds the result in one pass instead of repeated +=.
        return "".join(line + " " for line in file_given)
def apply_preprocessing(f):
    '''Normalise a piece of text for later processing.

    Three steps are applied in order: every character that is neither an
    ASCII letter nor whitespace is dropped, each run of whitespace is
    collapsed into one space character, and the result is lowercased.

    Parameters:
        f: the text to normalise.

    Returns:
        The normalised, lowercase string.
    '''
    # Keep only letters and whitespace; everything else is deleted outright,
    # so the letters on either side of a removed character become adjacent.
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', r'', f)
    # Newlines, tabs and repeated blanks all become a single space.
    single_spaced = re.sub(r'\s+', r' ', letters_and_spaces)
    return single_spaced.lower()
2021-04-11 19:43:49 +10:00
# Command-line entry: take one positional argument (the path to a document),
# load that document, preprocess its text and print the result to stdout.
parser = argparse.ArgumentParser()
parser.add_argument('path_to_file', help='path to document')
args = parser.parse_args()

# Read the file, then preprocess the raw text, in a single expression.
f = apply_preprocessing(read_document(args.path_to_file))
print(f)