From d6c365e02d40ca12ee095b796bf96d7f9567b3e2 Mon Sep 17 00:00:00 2001 From: muhammadzpw Date: Fri, 20 Dec 2024 17:22:39 +0700 Subject: [PATCH] Add merge export file script, README, and sample --- merge-export-file-script/README.md | 52 ++++++++ merge-export-file-script/merge.py | 117 ++++++++++++++++++ .../sample-input/sample.zip | Bin 0 -> 25535 bytes 3 files changed, 169 insertions(+) create mode 100644 merge-export-file-script/README.md create mode 100644 merge-export-file-script/merge.py create mode 100644 merge-export-file-script/sample-input/sample.zip diff --git a/merge-export-file-script/README.md b/merge-export-file-script/README.md new file mode 100644 index 0000000..4f1afec --- /dev/null +++ b/merge-export-file-script/README.md @@ -0,0 +1,52 @@ +# Merge Export File Script + +This script is designed to merge CSV files from a Datasaur exported ZIP file and output a new ZIP file containing the merged CSVs. + +## Prerequisites + +- Python 3.x +- Ensure you have the necessary permissions to read/write files in the directories you are working with. + +## Installation + +- Clone the repository or download the script to your local machine. +- Ensure Python is installed on your system. You can download it from python.org. + +## Usage + +To run the script, use the following command in your terminal or command prompt: + +```bash +python merge.py -I <input_zip_path> -O <output_zip_path> +``` + +### Arguments + +- `-I`, `--input`: Required. The path to the input Datasaur exported ZIP file. +- `-O`, `--output`: Required. The path where the output ZIP file will be saved. + +### Example + +```bash +python merge.py -I /path/to/input.zip -O /path/to/output.zip +``` + +This command will: + +- Validate the input ZIP file to ensure it exists and is a valid ZIP file. +- Extract the contents of the input ZIP file to a temporary directory. +- Merge all CSV files found in each folder within the extracted contents. +- Create a new ZIP file containing the merged CSV files at the specified output path. 
+- Clean up the temporary directory used during the process. + +## Notes + +- Ensure the input file is a valid ZIP file containing CSV files to be merged. +- The output file path should not already exist, as the script will not overwrite existing files. +- The script will create a temporary directory named tmp in the current working directory. Ensure you have write permissions in this directory. + +## Troubleshooting + +- If you encounter a `FileNotFoundError`, ensure the input file path is correct. +- If you encounter a `FileExistsError`, ensure the output file path does not already exist. +- For any other issues, ensure you have the necessary permissions and that your Python environment is correctly set up. diff --git a/merge-export-file-script/merge.py b/merge-export-file-script/merge.py new file mode 100644 index 0000000..08656af --- /dev/null +++ b/merge-export-file-script/merge.py @@ -0,0 +1,117 @@ +import argparse +import csv +import os +import shutil +import zipfile + + +def create_dirs(path): + if not os.path.exists(path): + os.makedirs(path) + + +def clean_tmp_dir(tmp_dir): + shutil.rmtree(tmp_dir) + + +def validate_input_file(input_file_path): + if not os.path.exists(input_file_path): + raise FileNotFoundError(f"Input file {input_file_path} does not exist") + + if not input_file_path.endswith(".zip"): + raise ValueError(f"Input file {input_file_path} is not a zip file") + + if not zipfile.is_zipfile(input_file_path): + raise ValueError(f"Input file {input_file_path} is not a valid zip file") + + +def validate_output_file(output_file_path): + if os.path.exists(output_file_path): + raise FileExistsError(f"Output file {output_file_path} already exists") + + if not output_file_path.endswith(".zip"): + raise ValueError(f"Output file {output_file_path} is not a zip file") + + +def read_csv_with_dict_reader(csv_file_path): + with open(csv_file_path, "r") as f: + reader = csv.DictReader(f) + return [row for row in reader] + + +def 
write_csv_with_dict_writer(csv_file_path, data): + with open(csv_file_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=data[0].keys()) + writer.writeheader() + writer.writerows(data) + + +def merge_csv_files(csv_files): + data = [] + for csv_file in csv_files: + data.extend(read_csv_with_dict_reader(csv_file)) + return data + + +def do_merge_csv_files_per_folder(folder_path): + csv_files = [ + f"{folder_path}/{file}" + for file in os.listdir(folder_path) + if file.endswith(".csv") + ] + data = merge_csv_files(csv_files) + write_csv_with_dict_writer(f"{folder_path}/all_merged.csv", data) + + +def zip_folder(folder_path, output_path): + with zipfile.ZipFile(output_path, "w") as zipf: + for root, _dirs, files in os.walk(folder_path): + for file in files: + file_path = os.path.join(root, file) + arcname = os.path.relpath(file_path, folder_path) + zipf.write(file_path, arcname) + + +def extract_zip_file(zip_file_path, output_dir): + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + zip_ref.extractall(output_dir) + + +def write_zip_file(zip_file_path, file_path): + with zipfile.ZipFile(zip_file_path, "w") as zipf: + zipf.write(file_path, os.path.basename(file_path)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-I", "--input", required=True, help="Input Datasaur exported zip file path" + ) + parser.add_argument("-O", "--output", required=True, help="Output zip file path") + args = parser.parse_args() + + INPUT_ZIP_FILE = args.input + OUTPUT_ZIP_FILE = args.output + + validate_input_file(INPUT_ZIP_FILE) + validate_output_file(OUTPUT_ZIP_FILE) + + TMP_DIR = "tmp" + create_dirs(TMP_DIR) + + extract_zip_file(INPUT_ZIP_FILE, TMP_DIR) + + BASE_EXTRACTED_PATH = "tmp/{name}".format(name=os.listdir("tmp")[0]) + + folders = [ + f"{BASE_EXTRACTED_PATH}/{folder}" + for folder in os.listdir(BASE_EXTRACTED_PATH) + if os.path.isdir(os.path.join(BASE_EXTRACTED_PATH, folder)) + ] + + for folder in folders: + 
do_merge_csv_files_per_folder(folder) + + zip_folder(BASE_EXTRACTED_PATH, OUTPUT_ZIP_FILE) + + clean_tmp_dir(TMP_DIR) diff --git a/merge-export-file-script/sample-input/sample.zip b/merge-export-file-script/sample-input/sample.zip new file mode 100644 index 0000000000000000000000000000000000000000..46429110446e3f45ddf1c4dee37ae43102d74a89 GIT binary patch literal 25535 zcmeHPc|4Tc|DOsY5@kstBYO?Gr7RIM8v8P~$(Aj$?^{tc5-!qMijXB!(M1xHONA_v zvX$(SEJ-Ri^v&{np8Cq@y7%|Zc)H!!Yd&5tGyL&9?{nVgocB5J^En5rwuXuY2F7XG zSi{d>{`*T3_%uUXcpkPz{9 zL^KZSIBD*}{G#cL*@^x7)ZlNPYC^_S4`?aqYhX0>M4YT`EXeKYOn@|@z!fGezm!6v9+FapGHR#3sIs3Q)dbbxBXy1Q6;9kzBy=vceD zxO)Qs(Zyg9yTy>7E6hw$MH8hidf3CeG^S3oLVg`)Qh8co|Gl)B2dBIZHHt$IsOewI z-J7|VS0LGTS7yRpCCtkkudd)P^4CooC>7>8g~*^YOL=P6Y8bptj?7JeB+c#o`rx^A zL9Dm=n4(a1q5AhW;YJmy6?3YR~=b2=XB!Cn~Nm_ zH08>jf~_Jar1t+h5JQY(&58;+C{u6myt@AcA*K=T&Koq&7uD#Q+TAQc+}7$)XJeen z!ralmey3t<``MKI?3>xLl@m;gnS?nWts`#Y^$aGlf|cA`6J+LU;PXxG3*lor2u2)0 zcm*5=%Yc}03=X4XsHd!=sf76a^`J6F2ZKPVt0Pc)2rW$o%y%?|%9uvg3NaS>9_l$B zZr73Gtn7{#%BMIUY97g)Iy#)QJH9&>8S`Pxy1YT~1OJ;*G(s@M?~+L?FC#Y5e61=v z5qV5a=^1wHxbk+NWYu+4wOo=@t6rrEV%cp6r(yim;%oz5!z?Xjetq6Wcbd+3|H59~ zSC-%V+-_&=_zc64qD%5fWnM9J%8O>SOMd)yoTX{F*JW`S@n$0**8{yHsRXz1-2SIt zRJCGW?CSI%c38ZwzH{2ZYLakqt!pKitR}1BLUf&GA{q`0AJ#_3Dqvv*@PkuK%pV}KK zdGew4R*7q7_piEdte!qQU6VP3mQ$KswYY6$8U6}a1KI|f1_n!nxNRt^Yw4&QKq0U? 
zTB;ZYJtRU;`Fol}X^WaexjYl>xnJ5G&(Y+TlH**~%#At)&-f>KE?iNQxM^{7!}wS@ z#?R3&w3~));IQcq^)AFD;#@P8aN2}~Nz>Z(E$k!H7jG*HORu(&_AZ3CNl>lk(yy8z zL@QjjFibH+c|P5^Y37B(mElm$%@d~jz0zvgf)P&1=J~#D+j?O3{M9JywWO10rO&*tn(W)L!+*m;nDvq54) zNd%W+NSK2q+=Bh#o9<@G3q4K}@RFASxVX;eoU;hQ=DO?esTa3_sE_UG&jIdfR>NTD zp>6{@TAF$Y9W5=5@1;MRfU>U;W5P_H?krP$RmV%Xo3@koa3uBM^` zw|1&Db68FGmIX|YO_!wd4H*aqM!VL^S*OzH=TD`6d@y%)3EHu7I_M6@}Tan&l(C47)^dz`QRgf`Y`OS911a7yX6Ow0Dw(=lnZ z8Ri{-G-cX$+QF!Qt19n#dFFQ4M-J^}VMc@eCdHHI6nTU?&sR=jCqz(qG{>M{1D`EY zJ#p}mP+>Q7>&&~xhfTa;Zk#5GMuo}6sjhcq@pYMZ7#Y}Fa62P5 z*uIDAw1quZo#CZy+ELwf2HgF?!c#K(*rAsxTsIG?b)GnfY>*?ccgstvOj_QpmwVZ6 z5Pm1*@{70Q>k0k#;z9C>XEx+x57~vLJ909vz1S5irun4bRao$yzwk7y=X!-puiIYb z0Bh>OWFes)*mpuwoh94r;P^OJB;l6N)bXcp1>{ZP30cSZdK4rzgOA1ROwb%s(2g35 zc+qOv>=^ob$4vqFrkphMDzs^Xs{)cx6*DAP9(HcaT!>A6&-J%z;kDh@0={VorlBm% zqCi8S2WD$g5O+>2N>>k|u7bnpen(5lj_OjakY`dHqwe8h6uV(Jn;&P*oNMl2XZa?? zDgqpq_hr_1s6|~mNays1Em(Fi@nh_$Z~q2WkzfR`_-q)5bXlbD$o-yYbvqPzfXSL0 zZ|q^MRBT3pb5I0T0p8jtIdDr7Z%*~6-#Z&NV2mdUi_e!emgzqIz?gf8?eb*awfC!G zft5(QBNj|mse~IS12?CAt1n-WW;vwr5hbMAYFol}BUMU1>55irUVZ&}{?>MOPhLml zCpm%8!A8D7^;<>_@*fgVoXB#Ov}4-nEsw8T-FcThDrlB!Ox5yQMb6~#aBNOnn)3@eKTvVfyJMUEQnVFf=3UOInF%SLYLf)M=Jc1dw zdD)c9Z7)sDnFxn*-<366+{>HlrvtYG#(Sg^5;Hb=?!|k?IlJ7g599Vzds0vYigw zS9~v75umpHdZs)NcH(}YK*rT5`82Fc7Zvqd`!&Dy7E^n;-L8#l9C>H8fouzYc7jmb@eUsX3rB8~{lD8-Zq8qbxq+3GF2 zha!#sTD}YI^E!rWqcrhsQ3cIh|s`jIM!~g-pM?E z?2}E^RF2?gtdUnl(t}E4yQ#o6llyH;CU0#Qv}O<3$grk1&4yja`2;)RX0(@qPpjxd zVXE`KMR#{_rl@4c4Ly14OLsgXf^lYzz*yo34}3-^3p4EI6Y+~0F@||3dD4P&&$auk zJA8Vdj&b_5Rg<}cdm|q6fa``jPVA#URlSk@2|)I^@oXwG9UXfr z`dlO?N+8_{i+dK8w0}t2`SKa%1R)!n$DB0x#~nYW|A%HVk@

X?}pnQh>-Lh(})| zElmtUOA(=>sRyi?biaFkT+meSTrRc_{`M2GRN;Ob*9@oY0lTYisfY~8*8{BA!_Q0W zCv1B^q1z~6da@hY_Fn60Pg6cuUWlrc27&pNd4)}JIbQnkA#B3jb|oc5{k=!N8ON0n zjoOE^LXtSQGmLn@4&=<@&zTw5O&bo>+|1^u|9hFvPU4L&`iJ|yt$Dm%EOl{}X_7d^ zwMk1FSg(C@(70rTE}`bA;xkt#qt%)5MxNJ#UnMpe>Zh~8EklY!gfyD@XfDK^#k892 z$Zqr%Vu1Uw=RFyDQDCyCHZ8bWG{m-m_U3S?n`CYD+Mrid)g7zsVnw(&tGJDZe&Wc* zY`x>!Ni5n>V-ppF+myF8(CT(PtTdp7MM(QCkCTb$r4nLX!Gu0>x+R zMmNpV`)6zC;7ZWzVc~A)>Sbl;{6G3H;n2O5V!ixi)=N5cTvJ?^f3NGZA}j8t#~q5{ z^79)mE3&5h3;HQu%XfP%5DGHIX8D)cED_KNHN|212OSm&`!>Z~`7!1SxF1hyu0TjB z#aDsIR{=RB^;P1a8*V9<$`74{#%Q=C0-ohut?eUoi{IniUu}8%4(E`|W)*8xvt^art(R&@%_oYpW31lkicVL+`jQf%2h+UhgiMj)Lmpf=sld;qy6D} z3=A8*ie(3j73ssnJbx8U7;KA6^4;b6<_NK|Jmr4K&&Zj?QTZ$;~AE(*J>i^Qz5mnM_UXkPQ-UH{E8df;$o6FO;&p`U=2kq(t+V~{v zfej*#nInx}qE7>^(4At}V|VVMo)lRp8RWo(ddPaV+eYcm;H&-Jq0iUnR9jWB!Ugzq z*~{;79Lm+*_C6@qQ9Km$i}H=ZMuTX<2{gLA#eiTnE1MPczB(|PA!fRM?l^Po`i!!q zN(YbFy{u14X>i(^!>8rC?pi6fTIJaSKSM_v-xpvy`VQp2$=KUIeOy>uKgo9DEvNc= zjrAvvhBVc^{F-C*XJbu{j8>q7!Kq*letkGCEC^^~qzOzONg-8OSMVjuJVd z{@(^eF~t5oh8RdcX&RD0`CXb~eEodJ7f3T{rqQt#pqXND{l5kmSnZ_AL@OYe6eH^= zHL^giN%Ms~$?swm!|DebRv>?*DM2gHS1HESk1?h|>PU^L6`+n{K>Z*CDuDc9l43Od zK%)uv*#!Vd`x?5yx8)wo0&>yyM3&w_olh2`g`EPrH>vFq0?i94R>V(cMWjH(GsT|x z_u3QD&@>5(W%2V{7H1%$o?>Hsw~Yaj08^}ue~Gmb3Qbp}*d70%-2qXbrdS~Vj0FM_ zofKOHB3lHsut@WAVxSpfDOSl3u}W60W?A#sch~^b0>K0uMS-8owN9{V3vXHKP8;yq zU*3WPK9~EJ|LumGc_d5Uaq}4n@Qa9VfEdYgES!aUT>8S61@vXW6%A0<<4PJ6-wD)#P*!Ed%M6FnGVjd=T+o zw9uAeIZLc>(UWd+f(K~kgNXMB(hi|yU|9=r0ADA+`drx zyVpaDemPUWZ_$&^aKIDY^FhQAcY*>Mk{MDAoUkfdm zw>KX|Ja!{wAXZvq!IQc3LBz`d(*8@;haI{yKq9J( zJ45eg$l$NEo`R>}=7WfT$pIPg6_?V*RbI*k9da^Nbg5HJ3sp|wh735l0=m@Eodw`} zJdgojIqh8fl*&ROM0uBAh(DcK`8&x5k41b5P_o7Om;c(Yha?uH8T4{U^A*5!r1>D? 
zqsV_5xe^Ql{5I!*0id-Y*|hEWhUqML)v63!U!IbOyAmY__ zkdJjmq!WZmx*lEJSj&-Q!z5QCL4c%7PB5=_K8W}nQL+J&ErlRL(pBZ+Ds&bjA0e5N z2Lk*yKXRc8g?Et+aHXXUObVP2BL2y4vH`A~ge{%__601cC2@Pm0Q+~@a0~jfR6g87 zZPD(91i(sb4w&3EA4L3H2}nTyC>;XRlIDYmUn31kVIb5dFt=tti1^uk(BOweufUDY z`5@wdm4yWQN^2OHKrtTxXka$FT+ie05@?rmq3pt>`O9%%Qe9hI)rWxHD=1K}u+}XG wjZ}aHG`ac*3N)!V8Z+NGfKMX+^#Mpgldp-?z>W(HwjKBx6~Vv$4{z;8S^xk5 literal 0 HcmV?d00001