# comp20008-project01/partb1.py

import re
import pandas as pd
import os
import argparse

# Regex matching a document ID: four capital letters, a hyphen, three digits
# and an optional trailing letter (e.g. "ABCD-123" or "ABCD-123x").
# Compiled once so the per-line search does not repeat the pattern lookup.
DOCUMENT_ID_PATTERN = re.compile(r'[A-Z]{4}-\d{3}[a-zA-Z]?')

# Name of the directory (relative to the current working directory) that
# holds the files to scan.
ARTICLE_DIR = 'cricket'


def find_document_id(lines):
    """Return the first document ID found in *lines*, or None.

    Args:
        lines: an iterable of strings (e.g. an open text file).

    Returns:
        The text of the first match of DOCUMENT_ID_PATTERN, scanning the
        lines in order and each line left to right, or None when no line
        contains a match.
    """
    for line in lines:
        match = DOCUMENT_ID_PATTERN.search(line)
        if match:
            # Stop at the first hit: each file contributes exactly one ID.
            return match.group(0)
    return None


def collect_document_ids(directory):
    """Build a Series mapping each file in *directory* to its document ID.

    Args:
        directory: path of the directory whose files are scanned.

    Returns:
        A pandas Series named 'documentID' whose index (named 'filename')
        holds the file names in sorted order. A file without any document
        ID gets a missing value, so the values always line up one-to-one
        with the file names.

    Raises:
        OSError: if the directory cannot be listed or a file cannot be read.
    """
    filenames = []
    document_ids = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting CSV reproducible between runs and machines.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        # The context manager closes the file even if reading fails.
        with open(path) as f:
            document_id = find_document_id(f)
        filenames.append(filename)
        document_ids.append(document_id)

    index = pd.Index(filenames, name='filename')
    return pd.Series(data=document_ids, index=index, name='documentID')


def main():
    """Parse the command line, scan the article files and write the CSV."""
    parser = argparse.ArgumentParser()
    parser.add_argument('path_to_csv', help = 'path to the csv file')
    args = parser.parse_args()

    # Build the path explicitly instead of changing the working directory,
    # so a relative path_to_csv is still resolved against the directory the
    # script was started from, even if scanning fails part-way.
    series = collect_document_ids(os.path.join(os.getcwd(), ARTICLE_DIR))
    series.to_csv(args.path_to_csv)


if __name__ == '__main__':
    main()