comp20008-project01/partb2.py
2021-04-14 23:42:57 +10:00

30 lines
812 B
Python

import re
import argparse
def read_document(path):
    '''Reads a file when provided with its path, and returns a string
    containing the lines of the file.

    Every line keeps its own line ending and has a single space appended
    after it, so the result for a file holding "a\\nb\\n" is "a\\n b\\n ".
    An empty file yields the empty string.

    path: location of the file to read; it is opened with open()'s
    defaults (text mode, platform default encoding).

    Any exception raised by open() or while reading (for example
    FileNotFoundError) is propagated to the caller.'''
    # The with-block guarantees the handle is closed even if reading fails
    # part-way through, which a manual close() after the loop does not.
    with open(path) as file_given:
        # join builds the result in one pass rather than by repeated +=.
        return "".join(line + " " for line in file_given)
def apply_preprocessing(f):
    '''Normalises a piece of text in three steps and returns the result:
    every character that is neither an ASCII letter nor whitespace is
    dropped, each run of whitespace is collapsed to one space, and the
    remaining letters are lowercased.

    Leading or trailing whitespace is collapsed like any other run, so it
    survives as a single space rather than being stripped.'''
    # Step 1: keep only letters and whitespace.
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', r'', f)
    # Step 2: squeeze every whitespace run (spaces, tabs, newlines) to ' '.
    single_spaced = re.sub(r'\s+', r' ', letters_and_spaces)
    # Step 3: fold to lowercase.
    return single_spaced.lower()
# Command-line entry point: the script takes one positional argument, the
# path of the document to clean, and prints the preprocessed text.
parser = argparse.ArgumentParser()
parser.add_argument('path_to_file', help='path to document')
args = parser.parse_args()

# Read the document, normalise it, and write the result to standard output.
f = apply_preprocessing(read_document(args.path_to_file))
print(f)