2021-03-01 17:57:17 +11:00
|
|
|
import re
|
|
|
|
import pandas as pd
|
|
|
|
import os
|
2021-04-11 19:01:03 +10:00
|
|
|
import argparse
|
|
|
|
|
|
|
|
# regex pattern matching document ID:
# four capital letters, a dash, three digits and an optional trailing letter
pattern = r'[A-Z]{4}-\d{3}[a-zA-Z]?'

# Compiled once so the per-line search does not re-resolve the pattern.
DOCUMENT_ID_RE = re.compile(pattern)

# Directory (relative to the current working directory) holding the documents.
SOURCE_DIR = 'cricket'


def first_document_id(lines):
    """Return the first document ID found in *lines*, or None if none matches.

    *lines* is any iterable of strings (for example an open text file).
    Iteration stops at the first line containing a match, and the leftmost
    match on that line is returned.
    """
    for line in lines:
        match = DOCUMENT_ID_RE.search(line)
        if match:
            return match.group(0)
    return None


def collect_document_ids(directory):
    """Scan every regular file in *directory* for its document ID.

    Returns a ``(filenames, document_ids)`` pair of equally long lists.
    Exactly one entry is recorded per file: the first ID found in it, or
    None when the file contains no ID.  Keeping the two lists in lockstep
    is what lets them be used as the index and the data of one Series.
    """
    filenames = []
    document_ids = []
    # Sorted so the output rows do not depend on the arbitrary order in
    # which os.listdir() happens to return the entries.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Sub-directories and other non-files cannot be opened as text.
        if not os.path.isfile(path):
            continue
        # The context manager closes the file even if reading raises.
        with open(path) as f:
            document_id = first_document_id(f)
        filenames.append(filename)
        document_ids.append(document_id)
    return filenames, document_ids


def build_series(filenames, document_ids):
    """Build the 'documentID' Series indexed by 'filename'.

    dtype is pinned to object so an empty result or missing IDs (None) do
    not change the column type.
    """
    index = pd.Index(filenames, name='filename', dtype=object)
    return pd.Series(data=document_ids, index=index, name='documentID',
                     dtype=object)


def main():
    """Parse the command line, scan the documents and write the CSV."""
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('path_to_csv', help='path to the csv file')
    args = parser.parse_args()

    # open every file, search each line for the document ID, add it to the list
    # (the working directory is left untouched, so a relative path_to_csv is
    # still resolved against the directory the script was started from)
    source_dir = os.path.join(os.getcwd(), SOURCE_DIR)
    filenames, document_ids = collect_document_ids(source_dir)

    # construct a Series with the document IDs and filenames, and create a CSV
    build_series(filenames, document_ids).to_csv(args.path_to_csv)


if __name__ == '__main__':
    main()