"""Read a text document, normalise its contents, and print the result."""

import argparse
import re

# Patterns are compiled once at import time so repeated calls to
# apply_preprocessing() reuse them.
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def read_document(path):
    """Return the contents of the file at *path* as a single string.

    The file is opened in text mode with the default encoding and read line
    by line. Each line keeps its own line ending and is followed by one
    extra space, so ``"a\\nb\\n"`` is returned as ``"a\\n b\\n "``. An empty
    file yields ``""``.

    Args:
        path: Path of the file to read.

    Returns:
        The concatenated lines of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the file even if reading raises part-way.
    with open(path) as file_given:
        return "".join(line + " " for line in file_given)


def apply_preprocessing(f):
    """Normalise text to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. Remove every character that is neither an ASCII letter nor
         whitespace (digits, punctuation and non-ASCII letters are dropped).
      2. Collapse each run of whitespace into a single space.
      3. Convert the result to lowercase.

    Leading and trailing whitespace is collapsed to one space, not stripped.

    Args:
        f: The text to normalise.

    Returns:
        The normalised text.
    """
    letters_only = _NON_ALPHA_RE.sub('', f)
    single_spaced = _WHITESPACE_RE.sub(' ', letters_only)
    return single_spaced.lower()


def main():
    """Parse the command line, then print the preprocessed document."""
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('path_to_file', help='path to document')
    args = parser.parse_args()

    text = read_document(args.path_to_file)
    print(apply_preprocessing(text))


# Guarded so that importing this module (e.g. to reuse the helpers or to test
# them) does not try to parse the importer's command line.
if __name__ == "__main__":
    main()