From 1c39aa187396a37e6f9bcff4f2b9963409ffe07d Mon Sep 17 00:00:00 2001 From: Rory Healy Date: Sun, 11 Apr 2021 19:01:03 +1000 Subject: [PATCH] partb1 completed --- partb1.csv | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++ partb1.py | 35 ++++++++++++++- 2 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 partb1.csv diff --git a/partb1.csv b/partb1.csv new file mode 100644 index 0000000..fbe03c5 --- /dev/null +++ b/partb1.csv @@ -0,0 +1,125 @@ +filename,documentID +001.txt,JDKC-105M +002.txt,PUQK-674B +003.txt,VPBC-005 +004.txt,YWCE-738I +005.txt,GHXO-669P +006.txt,KAWQ-187J +007.txt,SOCR-012L +008.txt,KDHH-278R +009.txt,JLCK-314G +010.txt,QXYP-302 +011.txt,KBIU-888S +012.txt,AYEY-671H +013.txt,KKXQ-499 +014.txt,ZVCZ-635K +015.txt,HRAM-828D +016.txt,HHIF-616S +017.txt,MLGR-878X +018.txt,NIXS-836H +019.txt,OESP-334 +020.txt,MQHK-220P +021.txt,BWQF-732 +022.txt,AOXA-744D +023.txt,AMCX-238Z +024.txt,ZRLB-963 +025.txt,ZJXS-236K +026.txt,OGVB-726B +027.txt,OTPT-625L +028.txt,DLOI-457S +029.txt,UGNG-987P +030.txt,VLGL-512K +031.txt,MEVF-928T +032.txt,XTUU-890C +033.txt,LMFL-110 +034.txt,PIGH-858P +035.txt,RIZC-127R +036.txt,YBHW-577V +037.txt,CRGU-326 +038.txt,EXMG-013B +039.txt,XBFV-441P +040.txt,PTXN-906 +041.txt,AJUO-808R +042.txt,FZHV-289W +043.txt,IFWQ-428L +044.txt,GFIB-810N +045.txt,LNQR-256 +046.txt,PVYR-593X +047.txt,CXWG-362 +048.txt,AVVP-372N +049.txt,HIKP-557S +050.txt,JMGH-608I +051.txt,RZIA-145G +052.txt,IBKV-251K +053.txt,KTUL-361B +054.txt,PUZZ-195H +055.txt,RSHE-829 +056.txt,BTAR-174V +057.txt,TSAM-385Q +058.txt,RSYY-734T +059.txt,SSUD-401 +060.txt,SBAC-693P +061.txt,LFWR-772Y +062.txt,TSBI-281 +063.txt,RINK-392 +064.txt,ETZD-846 +065.txt,GQMX-983 +066.txt,GGJQ-271 +067.txt,PHYU-165S +068.txt,HLNI-938D +069.txt,PIMH-385U +070.txt,LWEW-582E +071.txt,BTCD-438F +072.txt,EJFC-205 +073.txt,XWBA-608 +074.txt,UGXI-811F +075.txt,FICA-623T +076.txt,AWMI-088K +077.txt,YSNS-697P +078.txt,YZJY-617P +079.txt,XFXG-118T +080.txt,AEBA-345H 
"""Part B Task 1: extract the document ID from every file in ./cricket.

Usage:
    python partb1.py path_to_csv

Every regular file in the ``cricket`` directory (relative to the current
working directory) is scanned for the first text matching the document ID
pattern. The result is written to ``path_to_csv`` as a CSV with the header
``filename,documentID`` and one row per file, sorted by filename.
"""

import re
import pandas as pd
import os
import argparse

# regex pattern matching document ID
pattern = r'[A-Z]{4}-\d{3}[a-zA-Z]?'

# directory, relative to the current working directory, holding the documents
DOCUMENT_DIR = 'cricket'


def _find_document_id(path):
    """Return the first document ID found in the file at *path*, or None.

    The file is read line by line and scanning stops at the first match, so
    each file contributes exactly one value no matter how many lines match.
    """
    with open(path) as f:
        for line in f:
            match = re.search(pattern, line)
            if match:
                return match.group(0)
    return None


def _collect_document_ids(directory):
    """Map each regular file name in *directory* to its document ID.

    Names are visited in sorted order because ``os.listdir`` makes no
    ordering guarantee; sorting keeps the CSV rows deterministic.
    Sub-directories are skipped since they cannot be opened as text files.
    Files without a document ID map to None so that every file still gets
    a row in the output.
    """
    document_ids = {}
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if os.path.isfile(path):
            document_ids[filename] = _find_document_id(path)
    return document_ids


def main(argv=None):
    """Parse the command line, scan the documents and write the CSV.

    Args:
        argv: argument list to parse instead of ``sys.argv[1:]``; the single
            positional argument is the path of the CSV file to create.
    """
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('path_to_csv', help='path to the csv file')
    args = parser.parse_args(argv)

    # Paths are joined rather than changing directory, so a relative
    # path_to_csv is still resolved against the caller's working directory.
    document_ids = _collect_document_ids(
        os.path.join(os.getcwd(), DOCUMENT_DIR))

    # construct a Series with the document IDs and filenames, and create a CSV
    s = pd.Series(
        data=list(document_ids.values()),
        index=pd.Index(list(document_ids), name='filename'),
        name='documentID',
        dtype=object,  # keeps missing IDs (None) alongside the strings
    )
    s.to_csv(args.path_to_csv)


if __name__ == '__main__':
    main()