# comp20008-project01/partb1.py
#
# 35 lines
# 920 B
# Python

import re
import pandas as pd
import os
import argparse
# regex pattern matching a document ID: four capital letters, a hyphen, three
# digits and an optional trailing letter; compiled once, reused for every line
DOCUMENT_ID_PATTERN = re.compile(r'[A-Z]{4}-\d{3}[a-zA-Z]?')
# directory holding the documents, relative to the current working directory
DOCUMENTS_DIR = 'cricket'


def find_document_id(lines):
    """Return the first document ID found in *lines*, or None if none is found.

    *lines* is any iterable of strings (an open text file works). Iteration
    stops at the first line containing a match, so exactly one ID is reported
    per document even when several lines contain one.
    """
    for line in lines:
        match = DOCUMENT_ID_PATTERN.search(line)
        if match:
            return match.group()
    return None


def collect_document_ids(directory):
    """Pair every regular file in *directory* with its document ID.

    Returns a tuple ``(filenames, document_ids)`` of two equally long lists.
    Filenames are sorted so the output is reproducible (os.listdir order is
    arbitrary). A file without any document ID gets None, which keeps the two
    lists aligned instead of shifting later IDs onto the wrong file.
    """
    filenames = sorted(
        name for name in os.listdir(directory)
        # sub-directories cannot be opened as documents, so skip them
        if os.path.isfile(os.path.join(directory, name))
    )
    document_ids = []
    for filename in filenames:
        # build the path explicitly rather than changing the working directory
        with open(os.path.join(directory, filename)) as f:
            document_ids.append(find_document_id(f))
    return filenames, document_ids


def main():
    """Parse the command line and write the filename/documentID CSV."""
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('path_to_csv', help = 'path to the csv file')
    args = parser.parse_args()
    filenames, document_ids = collect_document_ids(DOCUMENTS_DIR)
    # construct a Series indexed by filename; the index name and the Series
    # name become the two CSV column headers
    s = pd.Series(
        data=document_ids,
        index=pd.Index(filenames, name='filename'),
        name='documentID',
    )
    s.to_csv(args.path_to_csv)


if __name__ == '__main__':
    main()