partb1 completed

This commit is contained in:
Rory Healy 2021-04-11 19:01:03 +10:00
parent 1c2258939e
commit 1c39aa1873
2 changed files with 158 additions and 2 deletions

125
partb1.csv Normal file
View file

@ -0,0 +1,125 @@
filename,documentID
001.txt,JDKC-105M
002.txt,PUQK-674B
003.txt,VPBC-005
004.txt,YWCE-738I
005.txt,GHXO-669P
006.txt,KAWQ-187J
007.txt,SOCR-012L
008.txt,KDHH-278R
009.txt,JLCK-314G
010.txt,QXYP-302
011.txt,KBIU-888S
012.txt,AYEY-671H
013.txt,KKXQ-499
014.txt,ZVCZ-635K
015.txt,HRAM-828D
016.txt,HHIF-616S
017.txt,MLGR-878X
018.txt,NIXS-836H
019.txt,OESP-334
020.txt,MQHK-220P
021.txt,BWQF-732
022.txt,AOXA-744D
023.txt,AMCX-238Z
024.txt,ZRLB-963
025.txt,ZJXS-236K
026.txt,OGVB-726B
027.txt,OTPT-625L
028.txt,DLOI-457S
029.txt,UGNG-987P
030.txt,VLGL-512K
031.txt,MEVF-928T
032.txt,XTUU-890C
033.txt,LMFL-110
034.txt,PIGH-858P
035.txt,RIZC-127R
036.txt,YBHW-577V
037.txt,CRGU-326
038.txt,EXMG-013B
039.txt,XBFV-441P
040.txt,PTXN-906
041.txt,AJUO-808R
042.txt,FZHV-289W
043.txt,IFWQ-428L
044.txt,GFIB-810N
045.txt,LNQR-256
046.txt,PVYR-593X
047.txt,CXWG-362
048.txt,AVVP-372N
049.txt,HIKP-557S
050.txt,JMGH-608I
051.txt,RZIA-145G
052.txt,IBKV-251K
053.txt,KTUL-361B
054.txt,PUZZ-195H
055.txt,RSHE-829
056.txt,BTAR-174V
057.txt,TSAM-385Q
058.txt,RSYY-734T
059.txt,SSUD-401
060.txt,SBAC-693P
061.txt,LFWR-772Y
062.txt,TSBI-281
063.txt,RINK-392
064.txt,ETZD-846
065.txt,GQMX-983
066.txt,GGJQ-271
067.txt,PHYU-165S
068.txt,HLNI-938D
069.txt,PIMH-385U
070.txt,LWEW-582E
071.txt,BTCD-438F
072.txt,EJFC-205
073.txt,XWBA-608
074.txt,UGXI-811F
075.txt,FICA-623T
076.txt,AWMI-088K
077.txt,YSNS-697P
078.txt,YZJY-617P
079.txt,XFXG-118T
080.txt,AEBA-345H
081.txt,SDNM-432V
082.txt,FZHJ-523
083.txt,MEYM-146
084.txt,ELPI-149T
085.txt,USJW-494
086.txt,OJIB-671D
087.txt,ANKW-165P
088.txt,EFHN-444
089.txt,MGOB-327W
090.txt,WDCS-487
091.txt,EALY-521Z
092.txt,FRPL-275B
093.txt,FLZT-426
094.txt,CFQJ-830
095.txt,SPIL-111S
096.txt,HNRN-134B
097.txt,UDND-112
098.txt,EZDK-705A
099.txt,JJWD-835
100.txt,ESHL-668Y
101.txt,LXNO-661O
102.txt,YCUZ-432
103.txt,HQRE-637M
104.txt,TQCI-200A
105.txt,ZQTE-982B
106.txt,CFHG-288
107.txt,UJKR-627
108.txt,YHWW-255C
109.txt,AMLY-573J
110.txt,RHGF-926Y
111.txt,VYRH-360S
112.txt,AMRX-523T
113.txt,GPEK-672T
114.txt,AGCR-591A
115.txt,WNFK-465I
116.txt,EEMR-682A
117.txt,XMBY-038T
118.txt,ETJN-385Z
119.txt,ZFZV-394
120.txt,BLVY-265
121.txt,ERRM-330E
122.txt,JFGO-085F
123.txt,KGPU-366S
124.txt,PMFQ-998Z
1 filename documentID
2 001.txt JDKC-105M
3 002.txt PUQK-674B
4 003.txt VPBC-005
5 004.txt YWCE-738I
6 005.txt GHXO-669P
7 006.txt KAWQ-187J
8 007.txt SOCR-012L
9 008.txt KDHH-278R
10 009.txt JLCK-314G
11 010.txt QXYP-302
12 011.txt KBIU-888S
13 012.txt AYEY-671H
14 013.txt KKXQ-499
15 014.txt ZVCZ-635K
16 015.txt HRAM-828D
17 016.txt HHIF-616S
18 017.txt MLGR-878X
19 018.txt NIXS-836H
20 019.txt OESP-334
21 020.txt MQHK-220P
22 021.txt BWQF-732
23 022.txt AOXA-744D
24 023.txt AMCX-238Z
25 024.txt ZRLB-963
26 025.txt ZJXS-236K
27 026.txt OGVB-726B
28 027.txt OTPT-625L
29 028.txt DLOI-457S
30 029.txt UGNG-987P
31 030.txt VLGL-512K
32 031.txt MEVF-928T
33 032.txt XTUU-890C
34 033.txt LMFL-110
35 034.txt PIGH-858P
36 035.txt RIZC-127R
37 036.txt YBHW-577V
38 037.txt CRGU-326
39 038.txt EXMG-013B
40 039.txt XBFV-441P
41 040.txt PTXN-906
42 041.txt AJUO-808R
43 042.txt FZHV-289W
44 043.txt IFWQ-428L
45 044.txt GFIB-810N
46 045.txt LNQR-256
47 046.txt PVYR-593X
48 047.txt CXWG-362
49 048.txt AVVP-372N
50 049.txt HIKP-557S
51 050.txt JMGH-608I
52 051.txt RZIA-145G
53 052.txt IBKV-251K
54 053.txt KTUL-361B
55 054.txt PUZZ-195H
56 055.txt RSHE-829
57 056.txt BTAR-174V
58 057.txt TSAM-385Q
59 058.txt RSYY-734T
60 059.txt SSUD-401
61 060.txt SBAC-693P
62 061.txt LFWR-772Y
63 062.txt TSBI-281
64 063.txt RINK-392
65 064.txt ETZD-846
66 065.txt GQMX-983
67 066.txt GGJQ-271
68 067.txt PHYU-165S
69 068.txt HLNI-938D
70 069.txt PIMH-385U
71 070.txt LWEW-582E
72 071.txt BTCD-438F
73 072.txt EJFC-205
74 073.txt XWBA-608
75 074.txt UGXI-811F
76 075.txt FICA-623T
77 076.txt AWMI-088K
78 077.txt YSNS-697P
79 078.txt YZJY-617P
80 079.txt XFXG-118T
81 080.txt AEBA-345H
82 081.txt SDNM-432V
83 082.txt FZHJ-523
84 083.txt MEYM-146
85 084.txt ELPI-149T
86 085.txt USJW-494
87 086.txt OJIB-671D
88 087.txt ANKW-165P
89 088.txt EFHN-444
90 089.txt MGOB-327W
91 090.txt WDCS-487
92 091.txt EALY-521Z
93 092.txt FRPL-275B
94 093.txt FLZT-426
95 094.txt CFQJ-830
96 095.txt SPIL-111S
97 096.txt HNRN-134B
98 097.txt UDND-112
99 098.txt EZDK-705A
100 099.txt JJWD-835
101 100.txt ESHL-668Y
102 101.txt LXNO-661O
103 102.txt YCUZ-432
104 103.txt HQRE-637M
105 104.txt TQCI-200A
106 105.txt ZQTE-982B
107 106.txt CFHG-288
108 107.txt UJKR-627
109 108.txt YHWW-255C
110 109.txt AMLY-573J
111 110.txt RHGF-926Y
112 111.txt VYRH-360S
113 112.txt AMRX-523T
114 113.txt GPEK-672T
115 114.txt AGCR-591A
116 115.txt WNFK-465I
117 116.txt EEMR-682A
118 117.txt XMBY-038T
119 118.txt ETJN-385Z
120 119.txt ZFZV-394
121 120.txt BLVY-265
122 121.txt ERRM-330E
123 122.txt JFGO-085F
124 123.txt KGPU-366S
125 124.txt PMFQ-998Z

View file

@ -1,5 +1,36 @@
## Part B Task 1
import re import re
import pandas as pd import pandas as pd
import os import os
import argparse
# Part B Task 1 script body: scan every article in ./cricket for its document
# ID and write a (filename, documentID) table to the CSV path given on the
# command line.

# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument('path_to_csv', help='path to the csv file')
args = parser.parse_args()

# regex pattern matching document ID: four capital letters, a hyphen, three
# digits and an optional trailing letter (e.g. JDKC-105M or VPBC-005);
# compiled once because it is applied to every line of every file
pattern = re.compile(r'[A-Z]{4}-\d{3}[a-zA-Z]?')

# The articles live in ./cricket. Build explicit paths instead of changing the
# working directory, so the relative output path still resolves against the
# directory the script was launched from.
articles_dir = os.path.join(os.getcwd(), 'cricket')

document_ids = []
filenames = []

# open every file and record exactly one document ID per file, so that the two
# lists always stay the same length and aligned; os.listdir() returns entries
# in arbitrary order, so sort for a reproducible CSV
for filename in sorted(os.listdir(articles_dir)):
    document_id = None  # stays None (empty CSV field) if no ID is found
    with open(os.path.join(articles_dir, filename)) as f:
        for line in f:
            match = pattern.search(line)
            if match:
                # first ID in the file wins; later matches are ignored
                document_id = match.group(0)
                break
    filenames.append(filename)
    document_ids.append(document_id)

# construct a Series with the document IDs and filenames, and create a CSV
# with the header "filename,documentID"
s = pd.Series(
    data=document_ids,
    index=pd.Index(filenames, name='filename'),
    name='documentID',
    dtype=object,
)
s.to_csv(args.path_to_csv)