diff --git a/merge-export-file-script/README.md b/merge-export-file-script/README.md new file mode 100644 index 0000000..4f1afec --- /dev/null +++ b/merge-export-file-script/README.md @@ -0,0 +1,52 @@ +# Merge Export File Script + +This script merges the CSV files in a Datasaur exported ZIP file and outputs a new ZIP file containing the merged CSVs. + +## Prerequisites + +- Python 3.x +- Ensure you have the necessary permissions to read/write files in the directories you are working with. + +## Installation + +- Clone the repository or download the script to your local machine. +- Ensure Python is installed on your system. You can download it from python.org. + +## Usage + +To run the script, use the following command in your terminal or command prompt: + +```bash +python merge.py -I INPUT_ZIP_PATH -O OUTPUT_ZIP_PATH +``` + +### Arguments + +- `-I`, `--input`: Required. The path to the input Datasaur exported ZIP file. +- `-O`, `--output`: Required. The path where the output ZIP file will be saved. + +### Example + +```bash +python merge.py -I /path/to/input.zip -O /path/to/output.zip +``` + +This command will: + +- Validate the input ZIP file to ensure it exists and is a valid ZIP file. +- Extract the contents of the input ZIP file to a temporary directory. +- Merge all CSV files found in each folder within the extracted contents into an `all_merged.csv` file in that folder. +- Create a new ZIP file containing the merged CSV files at the specified output path. +- Clean up the temporary directory used during the process. + +## Notes + +- Ensure the input file is a valid ZIP file containing CSV files to be merged. +- The output file path should not already exist, as the script will not overwrite existing files. +- The script will create a temporary directory named tmp in the current working directory. Ensure you have write permissions in this directory. + +## Troubleshooting + +- If you encounter a `FileNotFoundError`, ensure the input file path is correct. 
"""Merge the CSV files of a Datasaur export ZIP into one CSV per folder.

The input archive is extracted into a scratch directory, every sub-folder of
the export's top-level folder gets an ``all_merged.csv`` containing the rows
of all CSV files in that folder, and the result is written to a new ZIP file.

Usage:
    python merge.py -I /path/to/input.zip -O /path/to/output.zip

Troubleshooting:
    * ``FileNotFoundError``: the input path does not exist.
    * ``FileExistsError``: the output path already exists; the script never
      overwrites an existing file.
    * ``ValueError``: a path does not end in ``.zip``, the input is not a
      valid ZIP archive, or the archive has no top-level folder.
"""

import argparse
import csv
import os
import shutil
import tempfile
import zipfile

# Name of the merged file written into every processed folder.
MERGED_CSV_NAME = "all_merged.csv"


def create_dirs(path):
    """Create directory *path* (and any missing parents) if it does not exist."""
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(path, exist_ok=True)


def clean_tmp_dir(tmp_dir):
    """Recursively delete *tmp_dir* and everything inside it."""
    shutil.rmtree(tmp_dir)


def validate_input_file(input_file_path):
    """Check that *input_file_path* points to an existing, valid ZIP file.

    Raises:
        FileNotFoundError: The path does not exist.
        ValueError: The path does not end in ``.zip`` (case-insensitive) or
            the file is not a valid ZIP archive.
    """
    if not os.path.exists(input_file_path):
        raise FileNotFoundError(f"Input file {input_file_path} does not exist")

    if not input_file_path.lower().endswith(".zip"):
        raise ValueError(f"Input file {input_file_path} is not a zip file")

    if not zipfile.is_zipfile(input_file_path):
        raise ValueError(f"Input file {input_file_path} is not a valid zip file")


def validate_output_file(output_file_path):
    """Check that *output_file_path* is a not-yet-existing ``.zip`` path.

    Raises:
        FileExistsError: The path already exists.
        ValueError: The path does not end in ``.zip`` (case-insensitive).
    """
    if os.path.exists(output_file_path):
        raise FileExistsError(f"Output file {output_file_path} already exists")

    if not output_file_path.lower().endswith(".zip"):
        raise ValueError(f"Output file {output_file_path} is not a zip file")


def read_csv_with_dict_reader(csv_file_path, encoding="utf-8"):
    """Return all rows of a CSV file as a list of dicts keyed by header name.

    Args:
        csv_file_path: Path of the CSV file to read.
        encoding: Text encoding of the file.
    """
    # newline="" lets the csv module handle line endings itself, so quoted
    # fields containing newlines are read back correctly.
    with open(csv_file_path, "r", newline="", encoding=encoding) as f:
        return list(csv.DictReader(f))


def write_csv_with_dict_writer(csv_file_path, data, encoding="utf-8"):
    """Write a list of row dicts to *csv_file_path*.

    The header is the union of the keys of all rows, in first-seen order, so
    rows coming from files with different columns can be written together;
    cells a row has no value for are left empty. With no rows, an empty file
    is produced.

    Args:
        csv_file_path: Path of the CSV file to create or overwrite.
        data: Iterable-of-dicts rows, e.g. from ``read_csv_with_dict_reader``.
        encoding: Text encoding of the written file.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    # newline="" prevents the extra blank line per row on Windows.
    with open(csv_file_path, "w", newline="", encoding=encoding) as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
        if fieldnames:
            writer.writeheader()
        writer.writerows(data)


def merge_csv_files(csv_files):
    """Return the rows of all *csv_files* concatenated in the given order."""
    data = []
    for csv_file in csv_files:
        data.extend(read_csv_with_dict_reader(csv_file))
    return data


def do_merge_csv_files_per_folder(folder_path):
    """Merge every CSV file directly inside *folder_path* into one file.

    The result is written to ``all_merged.csv`` in the same folder. Files are
    merged in sorted name order so the output is deterministic. An existing
    ``all_merged.csv`` is not used as input, which keeps re-runs from
    duplicating rows. A folder without CSV files is left untouched.
    """
    csv_files = sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.lower().endswith(".csv")
        and name != MERGED_CSV_NAME
        and os.path.isfile(os.path.join(folder_path, name))
    )
    if not csv_files:
        return

    data = merge_csv_files(csv_files)
    write_csv_with_dict_writer(os.path.join(folder_path, MERGED_CSV_NAME), data)


def zip_folder(folder_path, output_path):
    """Write all files under *folder_path* into a new ZIP at *output_path*.

    Archive member names are relative to *folder_path*; files are added in
    sorted order within each directory.
    """
    with zipfile.ZipFile(output_path, "w") as zipf:
        for root, dirs, files in os.walk(folder_path):
            dirs.sort()  # in-place sort makes os.walk descend deterministically
            for file in sorted(files):
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, folder_path)
                zipf.write(file_path, arcname)


def extract_zip_file(zip_file_path, output_dir):
    """Extract every member of *zip_file_path* into *output_dir*."""
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        zip_ref.extractall(output_dir)


def write_zip_file(zip_file_path, file_path):
    """Create a ZIP at *zip_file_path* containing only the file *file_path*."""
    with zipfile.ZipFile(zip_file_path, "w") as zipf:
        zipf.write(file_path, os.path.basename(file_path))


def _find_export_root(extracted_dir):
    """Return the top-level folder of an extracted export.

    Only directories are considered and the ``__MACOSX`` metadata folder that
    macOS adds to archives is ignored. If several candidates exist, the first
    in sorted order is used.

    Raises:
        ValueError: No top-level folder was found.
    """
    candidates = sorted(
        name
        for name in os.listdir(extracted_dir)
        if name != "__MACOSX" and os.path.isdir(os.path.join(extracted_dir, name))
    )
    if not candidates:
        raise ValueError("Input zip file does not contain a top-level folder")
    return os.path.join(extracted_dir, candidates[0])


def main(argv=None):
    """Run the merge from command-line arguments.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-I", "--input", required=True, help="Input Datasaur exported zip file path"
    )
    parser.add_argument("-O", "--output", required=True, help="Output zip file path")
    args = parser.parse_args(argv)

    validate_input_file(args.input)
    validate_output_file(args.output)

    # Fail early (or create the folder) rather than after all the work is done.
    create_dirs(os.path.dirname(os.path.abspath(args.output)))

    # A fresh, uniquely named "tmp*" directory in the working directory: it
    # never collides with (or deletes) an existing user directory named "tmp".
    tmp_dir = tempfile.mkdtemp(prefix="tmp", dir=os.getcwd())
    try:
        extract_zip_file(args.input, tmp_dir)
        base_extracted_path = _find_export_root(tmp_dir)

        for entry in sorted(os.listdir(base_extracted_path)):
            folder = os.path.join(base_extracted_path, entry)
            if os.path.isdir(folder):
                do_merge_csv_files_per_folder(folder)

        zip_folder(base_extracted_path, args.output)
    finally:
        # Always remove the scratch directory, even when a step above failed.
        clean_tmp_dir(tmp_dir)


if __name__ == "__main__":
    main()