"""Extract a document ID from every file in ./cricket and export them as CSV.

Usage: python <script> path_to_csv

Each regular file directly inside the ``cricket`` directory (relative to the
current working directory) is scanned line by line for the first substring
matching the document-ID pattern.  The result is written as a two-column CSV
(``filename``, ``documentID``) to ``path_to_csv``.
"""
import argparse
import os
import re

import pandas as pd

# Document ID: four uppercase letters, a hyphen, three digits and an optional
# trailing letter.
DOCUMENT_ID_PATTERN = re.compile(r'[A-Z]{4}-\d{3}[a-zA-Z]?')

# Name of the directory (under the current working directory) that is scanned.
CORPUS_DIRNAME = 'cricket'


def find_document_id(path):
    """Return the first document ID found in the file at *path*, or ``None``.

    The file is read line by line and scanning stops at the first match.
    """
    # errors="replace" keeps one undecodable byte from aborting the whole run;
    # the ID pattern is pure ASCII, so replacement characters cannot affect
    # matching.
    with open(path, errors="replace") as handle:
        for line in handle:
            match = DOCUMENT_ID_PATTERN.search(line)
            if match is not None:
                return match.group(0)
    return None


def collect_document_ids(directory):
    """Return a Series mapping each file name in *directory* to its document ID.

    Exactly one entry is produced per regular file, so IDs can never be paired
    with the wrong file name: files without an ID get ``None`` (an empty CSV
    field) and only the first ID of a file is kept.  Subdirectories and other
    non-regular entries are skipped.

    The Series is named ``documentID`` and its index is named ``filename``.
    """
    filenames = []
    document_ids = []
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        filenames.append(filename)
        document_ids.append(find_document_id(path))

    return pd.Series(
        data=document_ids,
        index=pd.Index(filenames, name='filename', dtype=object),
        name='documentID',
        dtype=object,
    )


def main(argv=None):
    """Parse the command line, scan the corpus directory and write the CSV.

    *argv* is the list of command-line arguments; ``None`` means
    ``sys.argv[1:]`` (argparse's default).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('path_to_csv', help='path to the csv file')
    args = parser.parse_args(argv)

    # Explicit paths instead of os.chdir(): the working directory is left
    # untouched, so a relative path_to_csv still resolves against the
    # directory the script was started from.
    corpus_dir = os.path.join(os.getcwd(), CORPUS_DIRNAME)
    collect_document_ids(corpus_dir).to_csv(args.path_to_csv)


if __name__ == '__main__':
    main()