"""Print a normalised, lowercase, letters-only version of a text document."""

import re
import argparse


def read_document(path):
    """Return the text of the file at *path* with a space added after each line.

    Each line keeps its own line terminator and is followed by one extra
    space, so the words of adjacent lines stay separated even once the
    newlines are collapsed by later processing.

    Args:
        path: Path of the file to read; opened in text mode with the
            platform default encoding.

    Returns:
        A single string holding every line of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(path) as file_given:
        return "".join(line + " " for line in file_given)


def apply_preprocessing(f):
    """Strip non-letters, collapse whitespace and lowercase the text *f*.

    Every character that is neither an ASCII letter nor whitespace is
    deleted (digits, punctuation and accented letters included), each run
    of whitespace becomes a single space, and the result is lowercased.
    Leading or trailing whitespace is collapsed to one space, not removed.

    Args:
        f: The text to normalise.

    Returns:
        The normalised string.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', r'', f)
    single_spaced = re.sub(r'\s+', r' ', letters_only)
    return single_spaced.lower()


def read_args(argv=None):
    """Parse the command line arguments with an argparse ArgumentParser.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the arguments from ``sys.argv``.

    Returns:
        The parsed namespace; ``path_to_file`` holds the document path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('path_to_file', help='path to document')
    return parser.parse_args(argv)


def main():
    """Read the document named on the command line and print it normalised."""
    args = read_args()
    text = read_document(args.path_to_file)
    print(apply_preprocessing(text))


if __name__ == '__main__':
    main()