diff --git a/config/config.yaml b/config/config.yaml index aebb12e..9619c85 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,61 +1,64 @@ -# NOTE: All paths should be fully qualified paths +#Name your experiment here +EXPERIMENT_NAME: "test" -# Path to raw ligand data | DATABASE/DATASET/file -INPUT_DIR: "/lustre/project/m2_jgu-smitt/data/raw" +# NOTE: All paths are relative to the workflow directory (or --directory if specified) +# Path to raw ligand data | DATABASE/DATASET/file # if you want to manually upload target pdb file upload these to in a subfolder of the input dir called "/PDB/receptor" - -# Path to output prepared target proteins -PREPARED_DATA_DIR: "/lustre/project/m2_jgu-smitt/data/prepared" - -# Path to energy minimized ligand files -PREPARED_LIGAND_DIR: "/lustre/project/m2_jgu-smitt/data/minimized" - -# Path to scratch directory -TEMP_DATA_DIR: "/lustre/scratch/m2_jgu-smitt" - -# Path where docking results are stored -OUTPUT_DIR: "/lustre/project/m2_jgu-smitt/" - -# Number of best results to be displayed (0" - -#Specify database to use ZINC usees and downloads compounds from ZINC database, others read local input from LOCAL_INPUT_DIR - -DATABASE: ["ZINC"] - -# First letter is the molecular weight bin - a measure of size - horizontal axis, left to right, online. A: 200 D, B: 250, C:300, D: 325, E:350, F: 375 -# Second letter is the logP bin - a measure of polarity - vertical axis, top to bottom, online. -# The third letter is reactivity : A=anodyne. B=Bother (e.g. chromophores) C=clean (but pains ok), E=mild reactivity ok, G=reactive ok, I = hot chemistry ok -# The fourth letter is purchasability: A and B = in stock, C = in stock via agent, D = make on demand, E = boutique (expensive), F=annotated (not for sale) -# The fifth letter is pH range: R = ref (7.4), M = mid (near 7.4), L = low (around 6.4), H=high (around 8.4). -# The sixth and last dimension is net molecular charge. Here we follow the convention of InChIkeys. -# Thus. N = neutral, M = minus 1, L = minus 2 (or greater). O = plus 1, P = plus 2 (or greater). +# Path to folder which contains compounds +# Here, a full qualified path should be indicated. +# NOTE: this will be ignored, if a 'DATABASE' (see above) is specified +LOCAL_INPUT_DIR: "" + +# Specify "ZINC" to obtain compounds from the ZINC database. +# Otherwise read local input from the LOCAL_INPUT_DIR, above. +#TODO: unlist DATABASE +DATABASE: "ZINC" + +# Specify a ZINC mirror site. Options are: +# - files.docking.org +# - ftp.uni-mainz.de/mirror/zink20/ +#ZINC_MIRROR: "ftp.uni-mainz.de/mirror/zink20/" +ZINC_MIRROR: "files.docking.org" + +# Select the part of the ZINC database for screening. This section follows the ZINC notation and is +# outlined, here: +# - the 1st letter is the molecular weight bin - a measure of size - horizontal axis, +# left to right, as shown on the ZINC webpage. A: 200 D, B: 250, C:300, D: 325, E:350, F: 375 +# - the 2nd letter is the logP bin - a measure of polarity - vertical axis, top to bottom, +# as shown on the ZINC webpage. +# - the 3rd letter defines reactivity : A=anodyne, B=Bother (e.g. chromophores), +# C=clean (but pains ok), E=mild reactivity ok, G=reactive ok, I = hot chemistry ok +# - the 4th letter notes purchasability: A and B = in stock, C = in stock via agent, +# D = make on demand, E = boutique (expensive), F=annotated (not for sale) +# - the 5th letter defines pH range: R = ref (7.4), M = mid (near 7.4), L = low (around 6.4), +# H=high (around 8.4). 
+# - the 6th and last dimension is net molecular charge. Here we follow the convention of InChIkeys. +# Thus. N = neutral, M = minus 1, L = minus 2 (or greater). O = plus 1, P = plus 2 (or greater). ZINC_INPUT: - WEIGHT: ["A", "B"] #["C","D","E","F","G"] - LOGP: ["A"] # ,"D","E","F","G", "H","I","J"] + WEIGHT: ["B", "C"] #["C","D","E","F","G"] + LOGP: ["B", "C"] # ,"D","E","F","G", "H","I","J"] REACT: ["A"] #,"B"] # ,"C", "E", "G"] - PURCHASE: ["A"] #, "B"] #, "C", "D", "E"] - PH: ["M"] - CHARGE: ["N"] # ,"M","O","L","P"] + PURCHASE: ["B"] #, "B"] #, "C", "D", "E"] + PH: ["M", "R"] + CHARGE: ["P"] # ,"M","O","L","P"] -#In case you don't want to download tranches from ZINC based on the paramters given above, a ZINC subset can be choosen. Otherwise set subset as TRANCHES -# ex. -SUBSET: "" +# In case you don't want to download tranches from ZINC based on the paramters given above, +# a ZINC subset can be choosen. Otherwise set subset as TRANCHES. +SUBSET: "TRANCHES" #Specify ENAMINE collection ENAMINE_INPUT: @@ -66,23 +69,21 @@ ENAMINE_INPUT: ENAMINE_URL: http://www.enamine.net/files/Stock_Screening_Collections/ +# Specify whether rescreening is desired ("TRUE" or "FALSE") +# Rescreening will be performed on the top results as specified by 'RESULT_NUMBER' and 'CUTOFF_VALUE' +# for the targets specified in 'RESCREENING_TARGETS', below. RESCREENING: "FALSE" -# Specify target enzyme ID and chains format: ["PDB_ID, " - -#Name your experiment here or change it in the final json file - -EXPERIMENT_NAME: "" +GRID_DIR: "GRID" -#parameters for energy minimization +# parameters for energy minimization ENERGY_MIN_ALGORITHM: 'cg' CONVERGENCE_CRITERIA: '1e-6' STEPS: '2500' diff --git a/profiles/Mogon-NHR/config.yaml b/profiles/Mogon-NHR/config.yaml index 9af9c4f..92de414 100644 --- a/profiles/Mogon-NHR/config.yaml +++ b/profiles/Mogon-NHR/config.yaml @@ -1,5 +1,7 @@ default-resources: - slurm_partition: smallcpu + executor: "slurm" + slurm_account: "nhr-zdvhpc" + slurm_partition: "smallcpu" mem_mb_per_cpu: 1800 runtime: "30m" clusters: "mogonnhr" @@ -7,8 +9,9 @@ default-resources: set-resources: docking: mem_mb_per_cpu: 3000 - slurm_partition: parallel - ntasks: 512 + slurm_partition: "parallel" + tasks: 512 + runtime: 500 energyMin: mem_mb: 350 runtime: 90 diff --git a/workflow/Snakefile b/workflow/Snakefile index 4287547..b2191c2 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -5,31 +5,18 @@ from snakemake.utils import min_version min_version("7.19.1") # this is where SLURM support was introduced -INPUT_DIR = config["INPUT_DIR"] - -MIN_DIR = config["PREPARED_LIGAND_DIR"] - -PREPARED_DIR = config["PREPARED_DATA_DIR"] - -OUTPUT_DIR = config["OUTPUT_DIR"] - -TMP_DIR = config["TEMP_DATA_DIR"] LOCAL_INPUT = config["LOCAL_INPUT_DIR"] - DATABASE = config["DATABASE"] - SUBSET = config["SUBSET"] - RESCREENING_TARGETS = config["RESCREENING_TARGETS"] def generateOutput(wildcards): - irods = path.join(OUTPUT_DIR, "results", "irods.json") + irods = path.join("results", "irods.json") if config["RESCREENING"] == "TRUE": out = expand( path.join( - OUTPUT_DIR, "results", "rescreening_{percentage}", "{receptorID}", @@ -40,54 +27,22 @@ def generateOutput(wildcards): combAll=combAll, ) hist = expand( - path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), + path.join("results", "{receptorID}_hist.png"), receptorID=config["TARGETS"][0].split(",")[0], ) - return hist + out + [irods] - else: out = expand( - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), + path.join("results", 
"{receptorID}_{percentage}.csv"), receptorID=config["TARGETS"][0].split(",")[0], percentage=config["RESULT_NUMBER"], ) hist = expand( - path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), + path.join("results", "{receptorID}_hist.png"), receptorID=config["TARGETS"][0].split(",")[0], ) return hist + out + [irods] - -localrules: - all, - generateIRODS, - dockingResultsTxt, - removeDuplicateLigands, - makeVenn, - prepareLigands2, - mergeDocking2, - bestLigands, - prepareSecondDocking, - convertMol2, - makeReceptorPDBQT, - mergeDocking, - mergeLocalInput, - split, - split2, - targetProtein, - getZINCdata, - getZINCSubsets, - gunzip, - ENAMINEdownload, - prepareReceptor, - prepareDocking, - prepareLibrary, - prepareGeometry, - makeHistogram, - cleanLigands, - - targetList = [] # get ProteinIDs from configfile for rescreening for i in config["RESCREENING_TARGETS"]: targetList.append(i.split(",")[0]) @@ -102,7 +57,6 @@ combAll = "_".join(targetList) # combine all rescreening targets def getAllVenn(wildcards): path.join( - OUTPUT_DIR, "output", "rescreening", "{receptorID}", @@ -114,7 +68,6 @@ def IRODSinput(wildcards): if config["RESCREENING"] == "TRUE": out = expand( path.join( - OUTPUT_DIR, "results", "rescreening_{percentage}", "{receptorID}", @@ -126,7 +79,7 @@ def IRODSinput(wildcards): ) else: out = expand( - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), + path.join("results", "{receptorID}_{percentage}.csv"), receptorID=config["TARGETS"][0].split(",")[0], percentage=config["RESULT_NUMBER"], ) @@ -150,13 +103,14 @@ rule generateIRODS: input: IRODSinput, output: - path.join(OUTPUT_DIR, "results", "irods.json"), + path.join("results", "irods.json"), log: "logs/generateIRODS.log", script: "scripts/generateIRODS.py" -include: "rules/analyse.smk" -include: "rules/docking.smk" include: "rules/preparation.smk" +include: "rules/docking.smk" +include: "rules/analyse.smk" + diff --git a/workflow/rules/analyse.smk b/workflow/rules/analyse.smk index 2cb9e3b..1ace296 100644 --- a/workflow/rules/analyse.smk +++ b/workflow/rules/analyse.smk @@ -3,14 +3,66 @@ import re import os import requests from snakemake.logging import logger +import builtins +import importlib +from urllib.parse import urlparse +# all rules in this file are local rules +localrules: dockingResults, dockingResultsTxt, bestLigands, makeHistogram, mergeDocking def url_reachable(url): """ test for reachable URL """ - r = requests.head(url) - return r.status_code == 200 + try: + r = requests.head(url, allow_redirects=True, timeout=5) + # Accept common codes: 200 OK, 301/302 redirects, 403 Forbidden (some servers block HEAD) + return r.status_code in (200, 301, 302, 403) + except Exception: + return False + + +def check_zinc_url(url): + """Robust check for ZINC availability. + + Behaviour: + - If config contains `ZINC_IGNORE_CHECK` and it's truthy, return True. + - If a user override function `zinc_available(url)` exists in + `workflow.scripts.user_checks` or `user_checks`, call it and use its result. + - Otherwise try http and https variants, allow redirects and accept 200/301/302/403. 
+ """ + # allow config to skip checks (useful for clusters/non-interactive runs) + try: + if config.get("ZINC_IGNORE_CHECK"): + return True + except Exception: + pass + + # user override hook: look for zinc_available(url) in user_checks + for modname in ("workflow.scripts.user_checks", "user_checks"): + try: + mod = importlib.import_module(modname) + if hasattr(mod, "zinc_available"): + try: + return bool(mod.zinc_available(url)) + except Exception: + # if the user function errors, fall back to default checks + logger.warning(f"user zinc_available in {modname} raised an exception; falling back to default checks") + except Exception: + continue + + # Try provided url, and fallback to https/http variants + variants = [url] + parsed = urlparse(url) + if parsed.scheme == "http": + variants.insert(0, url.replace("http://", "https://", 1)) + elif parsed.scheme == "https": + variants.insert(0, url.replace("https://", "http://", 1)) + + for u in variants: + if url_reachable(u): + return True + return False def getTranches(): @@ -24,17 +76,24 @@ def getTranches(): return list(set(matches)) -def library(wildcards): +def library_files(wildcards): if DATABASE[0] == "ZINC": # ZINC database selected if SUBSET == "TRANCHES": # Tranches selected out = [] - # test for ZINC reachability: - if not url_reachable("http://files.docking.org/3D/"): + # Use ZINC_MIRROR from config if available + zinc_mirror = config.get("ZINC_MIRROR", "files.docking.org") + if not zinc_mirror.startswith("http://") and not zinc_mirror.startswith("https://"): + zinc_mirror = "http://" + zinc_mirror + zinc_mirror = zinc_mirror.rstrip("/") + zinc_test_url = f"{zinc_mirror}/3D/" + + # test for ZINC reachability (robust): + if not check_zinc_url(zinc_test_url): logger.info( - "The ZINC database is not accessible right now. Perhaps it is temporarily down?" + f"The ZINC database mirror ({zinc_mirror}) is not accessible right now. Perhaps it is temporarily down?" ) - user_input = input( + user_input = __import__("builtins").input( "Have you already run this workflow in the current folder with the same input data?(y/n) \n" ) if user_input == "y": @@ -49,8 +108,9 @@ def library(wildcards): database = config["DATABASE"] for i in tranch_list: - wl = i[0:2] - entry = f"{OUTPUT_DIR}/output/{receptorID}/{receptorID}_{database}_{wl}_{i}.pdbqt.gz" + wl = i[0:2] # first 2 chars are weight+logP = dataset + full_name = i # all 6 chars = name + entry = f"docking/{receptorID}/{receptorID}_{database}_{wl}_{full_name}.pdbqt.gz" out.append(entry) if not out: logger.error("No tranche parameter found in last log file.") @@ -59,45 +119,37 @@ def library(wildcards): return out else: logger.error( - "Aborting the snakemake run for now, as the data from the ZINC database (http://files.docking.org) are temporarily not available. Please try at a later time." + f"Aborting the snakemake run for now, as the data from the ZINC database ({zinc_mirror}) are temporarily not available. Please try at a later time." 
) sys.exit(1) rawOut = expand( - path.join( - OUTPUT_DIR, - "output", + path.join( + "docking", "{receptorID}", - "{receptorID}_{database}_{w}{l}_{w}{l}{r}{p}{ph}{c}.pdbqt.gz", + "{receptorID}_{database}_{dataset}_{name}.pdbqt.gz", ), receptorID=config["TARGETS"][0].split(",")[0], database=config["DATABASE"], - w=config["ZINC_INPUT"]["WEIGHT"], - l=config["ZINC_INPUT"]["LOGP"], - r=config["ZINC_INPUT"]["REACT"], - p=config["ZINC_INPUT"]["PURCHASE"], - ph=config["ZINC_INPUT"]["PH"], - c=config["ZINC_INPUT"]["CHARGE"], - dataset=(config["ZINC_INPUT"]["WEIGHT"]) - + (config["ZINC_INPUT"]["LOGP"]), + dataset=[w+l for w in config["ZINC_INPUT"]["WEIGHT"] for l in config["ZINC_INPUT"]["LOGP"]], + name=[w+l+r+p+ph+c for w in config["ZINC_INPUT"]["WEIGHT"] + for l in config["ZINC_INPUT"]["LOGP"] + for r in config["ZINC_INPUT"]["REACT"] + for p in config["ZINC_INPUT"]["PURCHASE"] + for ph in config["ZINC_INPUT"]["PH"] + for c in config["ZINC_INPUT"]["CHARGE"]], ) for i in rawOut: weighLog = i.split("_")[-2] restAttr = (i.split("_")[-1])[2:6] - url = "".join( - [ - "http://files.docking.org/3D/", - weighLog, - "/", - restAttr, - "/", - weighLog, - restAttr, - ".xaa.pdbqt.gz", - ] - ) - r = requests.get(url) - if r.status_code == 200: - out.append(i) + url = f"{zinc_mirror}/3D/{weighLog}/{restAttr}/{weighLog}{restAttr}.xaa.pdbqt.gz" + try: + r = requests.get(url, allow_redirects=True, timeout=10) + # Accept common positive/redirect/forbidden codes and treat them as present + if r.status_code in (200, 301, 302, 403): + out.append(i) + except Exception: + # treat as not present and continue + continue if not out: logger.error( "All selected tranches are empty; select other parameters!" @@ -109,8 +161,7 @@ def library(wildcards): out = expand( path.join( - OUTPUT_DIR, - "output", + "docking", "{receptorID}", "{receptorID}_{database}_subsets_{subset}.pdbqt.gz", ), @@ -124,17 +175,37 @@ def library(wildcards): + SUBSET + ".mol2?count=all" ) - r = requests.get(url) - if r.status_code == 200: # test if subset is valid - return out + try: + r = requests.get(url, allow_redirects=True, timeout=10) + if r.status_code == 200: # test if subset is valid + return out + except Exception as e: + logger.warning(f"Could not connect to ZINC to validate subset: {e}") - r_zinc = requests.get("https://zinc15.docking.org/") - if r_zinc.status_code != 200: # test if ZINC database is available + try: + r_zinc = requests.get("https://zinc15.docking.org/", allow_redirects=True, timeout=10) + if r_zinc.status_code != 200: # test if ZINC database is available + logger.info( + "The ZINC database is not accessible right now. Perhaps it is temporarily down?" + ) + # if ZINC not available, but dataset is already downloaded --> continue + subset_dir = path.join(INPUT_DIR, config["SUBSET"] + ".mol2") + if os.path.isfile(subset_dir): + return out + else: + logger.error( + "Subset is not availiable in the specified data folder. \n Abort snakemake run, try again later" + ) + sys.exit(1) + else: + logger.error("Invalid subset name!") + sys.exit(1) + except Exception as e: logger.info( - "The ZINC database is not accessible right now. Perhaps it is temporarily down?" + f"The ZINC database is not accessible right now (error: {e}). Perhaps it is temporarily down?" 
) # if ZINC not available, but dataset is already downloaded --> continue - subset_dir = path.join(INPUT_DIR, config["SUBSET"] + ".mol2") + subset_dir = path.join(DATABASE, config["SUBSET"] + ".mol2") if os.path.isfile(subset_dir): return out else: @@ -143,15 +214,10 @@ def library(wildcards): ) sys.exit(1) - else: - logger.error("Invalid subset name!") - sys.exit(1) - else: # not ZINC database --> local input data best = expand( path.join( - OUTPUT_DIR, - "output", + "docking", "{receptorID}", "{receptorID}_{database}_{dataset}_local.pdbqt.gz", ), @@ -164,10 +230,10 @@ def library(wildcards): rule makeHistogram: input: - path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz"), + path.join("results", "{receptorID}.pdbqt.gz"), output: report( - path.join(OUTPUT_DIR, "results", "{receptorID}_hist.png"), + path.join("results", "{receptorID}_hist.png"), category="Histogram", ), log: @@ -182,9 +248,9 @@ rule makeHistogram: rule bestLigands: input: - library, + library_files, output: - path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz"), + path.join("results", "{receptorID}.pdbqt.gz"), log: "logs/bestLigands_{receptorID}.log", script: @@ -193,27 +259,23 @@ rule bestLigands: rule dockingResults: input: - path.join(OUTPUT_DIR, "results", "{receptorID}.pdbqt.gz"), + path.join("results", "{receptorID}.pdbqt.gz"), output: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), + path.join("results", "{receptorID}_{percentage}.pdbqt"), envmodules: config["PYTHON"], log: "logs/dockingResults_{receptorID}_{percentage}.log", - threads: config["DOCKING_RESULTS"]["threads"] - resources: - mem_mb=config["DOCKING_RESULTS"]["mem_mb"], - runtime=config["DOCKING_RESULTS"]["runtime"], - partition=config["DOCKING_RESULTS"]["partition"], + threads: 1 script: "../scripts/sortResult.py" rule dockingResultsTxt: input: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), + path.join("results", "{receptorID}_{percentage}.pdbqt"), output: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.csv"), + path.join("results", "{receptorID}_{percentage}.csv"), log: "logs/dockingResultsTxt_{receptorID}_{percentage}.log", conda: @@ -227,10 +289,10 @@ rule dockingResultsTxt: rule removeDuplicateLigands: input: - path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), + path.join("results", "{receptorID}_{percentage}.pdbqt"), output: path.join( - OUTPUT_DIR, "rescreening", "unique", "{receptorID}_{percentage}.pdbqt" + "rescreening", "unique", "{receptorID}_{percentage}.pdbqt" ), log: "logs/removeDuplicateLigands_{receptorID}_{percentage}.log", @@ -241,12 +303,12 @@ rule removeDuplicateLigands: checkpoint split2: input: path.join( - OUTPUT_DIR, "rescreening", "unique", "{receptorID}_{percentage}.pdbqt" + "rescreening", "unique", "{receptorID}_{percentage}.pdbqt" ), output: - directory( - os.path.join(TMP_DIR, "rescreening_ligands_{percentage}", "{receptorID}") - ), + temp(directory( + os.path.join("scratch", "rescreening_ligands_{percentage}", "{receptorID}") + )), log: "logs/split2_{receptorID}_{percentage}.log", script: @@ -256,11 +318,11 @@ checkpoint split2: rule prepareLigands2: input: ligands=path.join( - TMP_DIR, "rescreening_ligands_{percentage}", "{receptorID}", "{i}.pdbqt" + "scratch", "rescreening_ligands_{percentage}", "{receptorID}", "{i}.pdbqt" ), output: ligands=path.join( - OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt" + "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt" ), log: 
"logs/prepareLigands2_{receptorID}_{percentage}_{name}_{i}.log", @@ -270,14 +332,14 @@ rule prepareLigands2: rule prepareSecondDocking: input: - grid=path.join(OUTPUT_DIR, "grid", "{name}_grid.txt"), - receptor=path.join(PREPARED_DIR, "receptor", "{name}.pdbqt"), + grid=path.join("grid", "{name}_grid.txt"), + receptor=path.join("prepared", "receptor", "{name}.pdbqt"), output: grid=path.join( - OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd" + "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd" ), receptor=path.join( - OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec" + "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec" ), log: "logs/prepareSecondDocking_{name}_{receptorID}_{percentage}.log", @@ -291,17 +353,16 @@ rule prepareSecondDocking: rule docking2: input: ligands=path.join( - OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt" + "rescreening_{percentage}", "{name}_{receptorID}", "{i}.txt" ), grid=path.join( - OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd" + "rescreening_{percentage}", "{name}_{receptorID}", "{name}.grd" ), receptor=path.join( - OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec" + "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec" ), output: path.join( - OUTPUT_DIR, "rescreening_{percentage}", "{name}_{receptorID}", "{name}.rec_{i}.txt.pdbqt.gz", @@ -309,15 +370,9 @@ rule docking2: log: "logs/docking2_{name}_{receptorID}_{percentage}_{i}.log", params: - dir=path.join(OUTPUT_DIR, "rescreening_{percentage}"), + dir=path.join("rescreening_{percentage}"), gridfile=path.join(config["GRID_DIR"], "{name}.gpf"), cutOff=config["CUTOFF_VALUE"], - resources: - mpi=True, - partition=config["DOCKING"]["partition"], - tasks=config["DOCKING"]["ntasks"], - slurm_extra=config["DOCKING"]["slurm_extra"], - runtime=config["DOCKING"]["runtime"], conda: "../envs/vinalc.yml" envmodules: @@ -336,7 +391,7 @@ def aggregate_in2(wildcards): checkpoint_output = checkpoints.split2.get(**wildcards).output[0] files_names = expand( path.join( - OUTPUT_DIR, + "rescreening", "rescreening_{{percentage}}", "{{name}}_{{receptorID}}", "{{name}}.rec_{i}.txt.pdbqt.gz", @@ -351,8 +406,7 @@ rule mergeDocking2: unpack(aggregate_in2), output: path.join( - OUTPUT_DIR, - "output", + "rescreening", "rescreening_{percentage}", "{name}_{receptorID}", "{name}.pdbqt.gz", @@ -368,16 +422,14 @@ rule mergeDocking2: rule dockingResults2: input: path.join( - OUTPUT_DIR, - "output", + "rescreening", "rescreening_{percentage}", "{name}_{receptorID}", "{name}.pdbqt.gz", ), output: path.join( - OUTPUT_DIR, - "output", + "resreening", "rescreening_{percentage}", "{name}_{receptorID}", "{name}_best.pdbqt", @@ -392,11 +444,10 @@ rule dockingResults2: rule makeVenn: input: - best=path.join(OUTPUT_DIR, "results", "{receptorID}_{percentage}.pdbqt"), + best=path.join("results", "{receptorID}_{percentage}.pdbqt"), re_results=expand( path.join( - OUTPUT_DIR, - "output", + "rescreening", "rescreening_{percentage}", "{name}_{receptorID}", "{name}_best.pdbqt", @@ -408,7 +459,6 @@ rule makeVenn: output: report( path.join( - OUTPUT_DIR, "results", "rescreening_{percentage}", "{receptorID}", @@ -418,7 +468,7 @@ rule makeVenn: ), report( path.join( - OUTPUT_DIR, + "rescreening", "results", "rescreening_{percentage}", "{receptorID}", diff --git a/workflow/rules/docking.smk b/workflow/rules/docking.smk index 1470c85..bee23f9 100644 --- a/workflow/rules/docking.smk +++ 
b/workflow/rules/docking.smk @@ -1,68 +1,132 @@ +localrules: prepare_docking_local, prepare_docking_ligand + +from snakemake.exceptions import WorkflowError + def get_spacing(gridfile): - # """ - # function to retrieve the 'spacing' (scale) of a grid file - # """ - # with open(gridfile) as infile: - # for line in gridfile: - # match = re.findall("spacing", line) - # if match: - # try: - # spacing = float(match.split(" ")[1] - # except: # in any case: unable to convert to float - # raise WorkflowError("unable to convert spacing to float from grid file: {gridfile}") - # return spacing - pass + """Return spacing as float parsed from a .gpf grid file. + + Raise WorkflowError if the file does not contain a usable spacing value. + """ + try: + with open(gridfile) as fh: + for line in fh: + if "spacing" in line.lower(): + parts = line.strip().split() + # expect: spacing + try: + return float(parts[-1]) + except Exception: + raise WorkflowError( + f"unable to convert spacing to float from grid file: {gridfile}" + ) + except FileNotFoundError: + raise WorkflowError(f"grid file not found: {gridfile}") + + # nothing found + raise WorkflowError(f"no spacing found in grid file: {gridfile}") + + +rule prepare_docking_local: + """Prepare files for docking: copy receptor, geometry and ligand list into the per-job output directory. + + This runs as a regular (non-MPI) job so we can perform setup steps safely. + """ + input: + receptor=rules.makeReceptorPDBQT.output, + geometry=path.join("grid", "{receptorID}_grid.txt"), + output: + temp(path.join( + "docking", "{receptorID}", "{dataset}", "{receptorID}.txt" + )), + temp(path.join( + "docking", "{receptorID}", "{dataset}", "{receptorID}_grid.txt" + )), + message: + ( + f" Copying receptor from {str(input.receptor)} to {str(output[0])}; " + f"Copying geometry from {str(input.geometry)} to {str(output[1])}" + ), + run: + import shutil + shutil.copy(str(input.receptor), str(output[0])) + shutil.copy(str(input.geometry), str(output[1])) + + + +rule prepare_docking_ligand: + """Copy a single ligand-list entry into the per-job output directory. + + This is kept as a separate rule because ligand files carry additional + wildcards (database, name, i) and must not be mixed with the per-receptor + outputs in `prepare_docking_local`. + """ + input: + ligands=path.join("library", "{database}_{dataset}_{name}_{i}.txt"), + output: + lig=path.join( + "docking", + "{receptorID}", + "{dataset}", + "{database}_{dataset}_{name}_{i}.txt", + ), + params: + directory=path.join("docking"), + run: + import shutil + import os + + outdir = os.path.join(params.directory, wildcards.receptorID, wildcards.dataset) + os.makedirs(outdir, exist_ok=True) + + shutil.copy(input.ligands, output.lig) rule docking: + """MPI docking rule: invoke the MPI program (vinalc) as a single command. + + The preparation step is handled by `prepare_docking_local` which creates the + necessary files in output/{receptorID}/{dataset}/ so this rule can + run the MPI program only. 
+ """ input: - receptor=path.join(OUTPUT_DIR, "receptor", "{receptorID}.txt"), - geometry=path.join(OUTPUT_DIR, "grid", "{receptorID}_grid.txt"), - ligands=path.join(OUTPUT_DIR, "library", "{database}_{dataset}_{name}_{i}.txt"), + rec=rules.prepare_docking_local.output[0], + geo=rules.prepare_docking_local.output[1], + lig=rules.prepare_docking_ligand.output.lig, output: path.join( - OUTPUT_DIR, - "output", + "docking", "{receptorID}", "{dataset}", - "{receptorID}.txt_{database}_{dataset}_{name}_{i}.txt.pdbqt.gz", + "{receptorID}_{database}_{dataset}_{name}_{i}.pdbqt.gz", ), conda: "../envs/vinalc.yml" envmodules: config["VINALC"], params: - dir=path.join(OUTPUT_DIR, "output"), - space=get_spacing(path.join(config["GRID_DIR"], "{receptorID}.gpf")), - errorDir=path.join(OUTPUT_DIR, "errorLigands.txt"), + # get spacing from the receptor's .gpf at runtime using wildcards + space=lambda wildcards: get_spacing(os.path.join(config['GRID_DIR'], f'{wildcards.receptorID}.gpf')), + log: + "logs/docking/{receptorID}_{dataset}_{database}_{name}_{i}.log", resources: - partition=config["DOCKING"]["partition"], - runtime=config["DOCKING"]["runtime"], - slurm_extra=config["DOCKING"]["slurm_extra"], - mpi="srun", - mem_mb_per_cpu=config["DOCKING"]["mem_mb_per_cpu"], - ntasks=config["DOCKING"]["ntasks"], + mpi="mpiexec" shell: - """( - mkdir -p {params.dir}/{wildcards.receptorID}/{wildcards.dataset} - cd {params.dir}/{wildcards.receptorID}/{wildcards.dataset} - cp {input.receptor} . - cp {input.geometry} . - cp {input.ligands} . - {resources.mpi} vinalc --recList {wildcards.receptorID}.txt --ligList {wildcards.database}_{wildcards.dataset}_{wildcards.name}_{wildcards.i}.txt --geoList {wildcards.receptorID}_grid.txt --granularity {params.space} - cd - - )""" + ( + "cd docking/{wildcards.receptorID}/{wildcards.dataset} ; " + "{resources.mpi} vinalc --recList {wildcards.receptorID}.txt " + "--ligList {wildcards.database}_{wildcards.dataset}_{wildcards.name}_{wildcards.i}.txt " + "--geoList {wildcards.receptorID}_grid.txt --granularity {params.space} " + ) def aggregate_in(wildcards): checkpoint_output = checkpoints.split.get(**wildcards).output[0] files_names = expand( path.join( - OUTPUT_DIR, - "output", + "docking", "{{receptorID}}", "{{dataset}}", - "{{receptorID}}.txt_{{database}}_{{dataset}}_{{name}}_{i}.txt.pdbqt.gz", + "{{receptorID}}_{{database}}_{{dataset}}_{{name}}_{i}.pdbqt.gz", ), i=glob_wildcards(os.path.join(checkpoint_output, "{i}.pdbqt")).i, ) @@ -74,8 +138,7 @@ rule mergeDocking: unpack(aggregate_in), output: path.join( - OUTPUT_DIR, - "output", + "docking", "{receptorID}", "{receptorID}_{database}_{dataset}_{name}.pdbqt.gz", ), diff --git a/workflow/rules/preparation.smk b/workflow/rules/preparation.smk index c1da5a3..f6cdb40 100644 --- a/workflow/rules/preparation.smk +++ b/workflow/rules/preparation.smk @@ -1,16 +1,40 @@ ruleorder: convertMol2 > gunzip +# most rules in this file are local rules: +localrules: + targetProtein, + getZINCdata, + getZINCSubsets, + convertMol2, + mergeLocalInput, + ENAMINEdownload, + SDFToPDBQT, + prepareReceptor, + makeReceptorPDBQT, + gunzip, + cleanLigands, + split, + prepareGeometry, + prepareLibrary, + prepareDocking, + + rule targetProtein: output: - path.join(INPUT_DIR, "PDB", "receptor", "{receptorID}.pdb.gz"), + path.join("PDB", "receptor", "{receptorID}.pdb.gz"), + log: + "logs/targetProtein/{receptorID}.log", shell: - "curl -o {output} {config[TARGET_URL]}/{wildcards.receptorID}.pdb.gz" + "curl -o {output} 
{config[TARGET_URL]}/{wildcards.receptorID}.pdb.gz &> {log} 2>&1" rule getZINCdata: output: - path.join(INPUT_DIR, "ZINC", "{dataset}", "{name}.pdbqt.gz"), + temp(path.join(DATABASE, "{dataset}", "{name}.pdbqt.gz")), + log: "logs/downloadZINC/{dataset}_{name}.log", + message: + "Downloading ZINC data for {wildcards.name} from ZINC database {wildcards.dataset}...", script: "../scripts/ZINCdownload.py" @@ -20,7 +44,7 @@ rule getZINCSubsets: sub=config["SUBSET"], output: expand( - path.join(INPUT_DIR, "ZINC", "subsets", "{subset}.mol2"), + path.join(DATABASE, "subsets", "{subset}.mol2"), subset=config["SUBSET"], ), script: @@ -29,9 +53,9 @@ rule getZINCSubsets: rule convertMol2: input: - path.join(INPUT_DIR, "ZINC", "subsets", "{subset}.mol2"), + path.join("ZINC", "subsets", "{subset}.mol2"), output: - path.join(TMP_DIR, "unzipped", "ZINC", "subsets", "{subset}.pdbqt"), + temp(path.join("scratch", "unzipped", "ZINC", "subsets", "{subset}.pdbqt")), conda: "../envs/openbabel.yml" envmodules: @@ -44,7 +68,7 @@ rule mergeLocalInput: params: in_dir=LOCAL_INPUT, output: - path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "local.pdbqt"), + temp(path.join("scratch", "unzipped", "{database}", "{dataset}", "total.pdbqt")), conda: "../envs/openbabel.yml" envmodules: @@ -56,7 +80,7 @@ rule mergeLocalInput: rule ENAMINEdownload: output: expand( - path.join(INPUT_DIR, "ENAMINE", "{ENAMINE_collection}"), + path.join("ENAMINE", "{ENAMINE_collection}"), ENAMINE_collection=config["ENAMINE_INPUT"], ), script: @@ -65,9 +89,11 @@ rule ENAMINEdownload: rule SDFToPDBQT: input: - path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.sdf"), + path.join("scratch", "unzipped", "{database}", "{dataset}", "{name}.sdf"), output: - path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.pdbqt"), + temp( + path.join("scratch", "unzipped", "{database}", "{dataset}", "{name}.pdbqt") + ), conda: "../envs/openbabel.yml" envmodules: @@ -78,57 +104,86 @@ rule SDFToPDBQT: rule prepareReceptor: input: - path.join(TMP_DIR, "unzipped", "PDB", "receptor", "{name}.pdb"), + rules.targetProtein.output, output: - path.join(TMP_DIR, "PDB", "receptor", "{name}.pdb"), + temp(path.join("scratch", "PDB", "receptor", "{receptorID}.pdb")), conda: "../envs/biopython.yml" envmodules: config["BIOPYTHON"], + log: + "logs/prepareReceptor/{receptorID}.log", script: "../scripts/prepareReceptor.py" rule makeReceptorPDBQT: input: - path.join(TMP_DIR, "PDB", "receptor", "{name}.pdb"), + path.join("scratch", "PDB", "receptor", "{receptorID}.pdb"), output: - path.join(PREPARED_DIR, "receptor", "{name}.pdbqt"), + path.join("prepared", "receptor", "{receptorID}.pdbqt"), conda: "../envs/openbabel.yml" envmodules: config["OPENBABEL"], + log: + "logs/makeReceptorPDBQT/{receptorID}.log", shell: - "obabel -ipdb {input} -opdbqt -O {output} -xr" + "obabel -ipdb {input} -opdbqt -O {output} -xr > {log} 2>&1" rule gunzip: input: - path.join(INPUT_DIR, "{database}", "{dataset}", "{name}.{filetype}.gz"), + path.join("{database}", "{dataset}", "{name}.{filetype}.gz"), output: - path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.{filetype}"), + temp( + path.join( + "scratch", "unzipped", "{database}", "{dataset}", "{name}.{filetype}" + ) + ), conda: "../envs/basic.yml" - shell: - "gunzip < {input} > {output} || touch {output}" + log: + "logs/gunzip/{database}_{dataset}_{name}.log", + run: + import gzip, shutil, os + try: + with gzip.open(input[0], "rb") as src, open(output[0], "wb") as dst: + shutil.copyfileobj(src, dst) + 
except Exception as e: + # log the error if a log file is defined, then create an empty output (touch) + try: + with open(log[0], "a") as lf: + lf.write(str(e) + "\n") + except Exception: + pass + open(output[0], "wb").close() rule cleanLigands: input: - path.join(TMP_DIR, "unzipped", "{database}", "{dataset}", "{name}.pdbqt"), + path.join("scratch", "unzipped", "{database}", "{dataset}", "{name}.pdbqt"), output: - path.join(TMP_DIR, "cleaned", "{database}", "{dataset}", "{name}.pdbqt"), + temp(path.join("scratch", "cleaned", "{database}", "{dataset}", "{name}.pdbqt")), + log: + "logs/cleanLigands/{database}_{dataset}_{name}.log", shell: "awk '/MODEL/ {{s=1}} s {{a[++c]=$0}} /Si/ {{p=1}} /ENDMDL/ {{if (!p) for (i=1;i<=c;i++) print a[i]; delete a;s=p=c=0;next}} !s' {input} > {output}" checkpoint split: input: - path.join(TMP_DIR, "cleaned", "{database}", "{dataset}", "{name}.pdbqt"), + path.join("scratch", "cleaned", "{database}", "{dataset}", "{name}.pdbqt"), output: - directory( - os.path.join(TMP_DIR, "prepared", "{database}", "{dataset}", "{name}_dir") + temp( + directory( + os.path.join( + "scratch", "prepared", "{database}", "{dataset}", "{name}_dir" + ) + ) ), + log: + "logs/split/{database}_{dataset}_{name}.log", script: "../scripts/splitFile.py" @@ -136,33 +191,33 @@ checkpoint split: rule energyMin: input: path.join( - TMP_DIR, "prepared", "{database}", "{dataset}", "{name}_dir", "{i}.pdbqt" + "scratch", "prepared", "{database}", "{dataset}", "{name}_dir", "{i}.pdbqt" ), output: - path.join(MIN_DIR, "{database}", "{dataset}", "{name}", "{i}.pdbqt"), + path.join("minimized", "{database}", "{dataset}", "{name}", "{i}.pdbqt"), params: algorithm=config["ENERGY_MIN_ALGORITHM"], steps=config["STEPS"], forcefield=config["FORCEFIELD"], convergence=config["CONVERGENCE_CRITERIA"], - threads: config["ENERGY_MIN"]["threads"] - resources: - partition=config["ENERGY_MIN"]["partition"], - runtime=config["ENERGY_MIN"]["runtime"], - mem_mb=config["ENERGY_MIN"]["mem_mb"], + threads: 8 + log: + "logs/energyMin/{database}_{dataset}_{name}_{i}.log", conda: "../envs/openbabel.yml" envmodules: config["OPENBABEL"], shell: - "obabel -ipdbqt {input} -opdbqt -O {output} --minimize --{params.algorithm} --steps {params.steps} --ff {params.forcefield} --crit {params.convergence}" + "obabel -ipdbqt {input} -opdbqt -O {output} --minimize --{params.algorithm} --steps {params.steps} --ff {params.forcefield} --crit {params.convergence} > {log} 2>&1" rule prepareGeometry: input: path.join(config["GRID_DIR"], "{receptorID}.gpf"), output: - path.join(OUTPUT_DIR, "grid", "{receptorID}_grid.txt"), + path.join("grid", "{receptorID}_grid.txt"), + log: + "logs/prepareGeometry/{receptorID}.log", run: grid_params = [] @@ -185,17 +240,21 @@ rule prepareGeometry: rule prepareLibrary: input: - path.join(MIN_DIR, "{database}", "{dataset}", "{name}", "{i}.pdbqt"), + path.join("minimized", "{database}", "{dataset}", "{receptorID}", "{i}.pdbqt"), output: - library=path.join(OUTPUT_DIR, "library", "{database}_{dataset}_{name}_{i}.txt"), + library=path.join("library", "{database}_{dataset}_{receptorID}_{i}.txt"), + log: + "logs/prepareLibrary/{database}_{dataset}_{receptorID}_{i}.log", shell: "echo {input} > {output}" rule prepareDocking: input: - path.join(PREPARED_DIR, "receptor", "{receptorID}.pdbqt"), + rules.makeReceptorPDBQT.output, output: - path.join(OUTPUT_DIR, "receptor", "{receptorID}.txt"), + path.join("receptor", "{receptorID}.txt"), + log: + "logs/prepareDocking/{receptorID}.log", shell: "echo {input} > {output}" diff 
--git a/workflow/scripts/ENAMINEdownload.py b/workflow/scripts/ENAMINEdownload.py
index 456f05d..df22cd3 100644
--- a/workflow/scripts/ENAMINEdownload.py
+++ b/workflow/scripts/ENAMINEdownload.py
@@ -6,11 +6,11 @@
 for collection in snakemake.config["ENAMINE_INPUT"]:
     url = os.path.join(snakemake.config["ENAMINE_URL"], collection, ".zip")
     folder = os.path.join(
-        snakemake.config["INPUT_DIR"], "ENAMINE", os.path.dirname(collection)
+        "ENAMINE", os.path.dirname(collection)
     )
     r = requests.get(url, allow_redirects=True, timout=60)
     with open(
-        os.path.join(snakemake.config["INPUT_DIR"], "ENAMINE", collection), "wb"
+        os.path.join("ENAMINE", collection), "wb"
     ) as outfile:
         outfile.write(r.content)
     # hashed = (folder+".sh2")
diff --git a/workflow/scripts/ZINCdownload.py b/workflow/scripts/ZINCdownload.py
index 455e420..8d0d1a2 100644
--- a/workflow/scripts/ZINCdownload.py
+++ b/workflow/scripts/ZINCdownload.py
@@ -1,22 +1,158 @@
-"""download ligands in pdbqt format from ZINC database"""
+"""download ligands in pdbqt format from ZINC database
+ZINC tranches are organized as:
+- First 2 letters (WEIGHT+LOGP): first subdirectory, e.g., "BB", "CB"
+- Next 4 letters (REACT+PURCHASE+PH+CHARGE): second subdirectory, e.g., "ABRP"
+- All 6 letters: filename prefix, e.g., "CBABRP"
+- Chunks: .xaa.pdbqt.gz, .xab.pdbqt.gz, etc.
+
+This script:
+1. Discovers all available chunks for the tranche
+2. Downloads each chunk
+3. Combines them into a single .pdbqt.gz file (without chunk indicator)
+"""
+
+import itertools
 import os
-import requests
+import subprocess
+import hashlib
+import gzip
+import sys
+
+# Redirect all stdout/stderr to the log file
+sys.stdout = open(snakemake.log[0], 'w', buffering=1)  # line buffering
+sys.stderr = sys.stdout
+
+# Use the snakemake output path directly
+output_file = str(snakemake.output[0])
+
+# Get ZINC mirror from config
+zinc_mirror = snakemake.config.get("ZINC_MIRROR", "files.docking.org")
+if not zinc_mirror.startswith("http://") and not zinc_mirror.startswith("https://"):
+    zinc_mirror = "https://" + zinc_mirror
+zinc_mirror = zinc_mirror.rstrip("/")
+
+# Extract path components from the configuration file
+zinc_params = snakemake.config.get("ZINC_INPUT", {})
+print(f"ZINC input parameters from config: {zinc_params}")
+
+# construct all download paths from the zinc_params
+# The first two letters indicate the tranche sets (all combinations of the WEIGHT and LOGP lists)
+tranches = [
+    "".join(_)
+    for _ in itertools.product(
+        zinc_params.get("WEIGHT", []), zinc_params.get("LOGP", [])
+    )
+]
+# The following four letters are REACT+PURCHASE+PH+CHARGE
+# We need to generate all combinations based on the provided parameters,
+# keeping the order consistent with ZINC naming conventions.
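+# For example, with the values set in config.yaml above (WEIGHT ["B","C"], LOGP ["B","C"],
+# REACT ["A"], PURCHASE ["B"], PH ["M","R"], CHARGE ["P"]) this yields
+# tranches ["BB", "BC", "CB", "CC"] and subsets ["ABMP", "ABRP"], so a chunk URL looks like
+# https://files.docking.org/3D/BB/ABMP/BBABMP.xaa.pdbqt.gz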
+subsets = []
+for r in itertools.product(
+    zinc_params.get("REACT", []),
+    zinc_params.get("PURCHASE", []),
+    zinc_params.get("PH", []),
+    zinc_params.get("CHARGE", []),
+):
+    subsets.append("".join(r))
+
+print(f"Generated tranches: {tranches}")
+print(f"Generated subsets: {subsets}")
+
+print(f"Output file: {output_file}")
+
+# Ensure output directory exists
+directory = os.path.dirname(output_file)
+os.makedirs(directory, exist_ok=True)
+
+# file to store hashes of all downloaded chunks in the format "<sha256 digest> <path>"
+hashfile = os.path.join(directory, "hashes.txt")
+
+# memorize all downloaded chunks in the hashfile
+downloaded_chunks = []
+if os.path.exists(hashfile):
+    with open(hashfile, "r") as hf:
+        for line in hf:
+            parts = line.strip().split(" ")
+            if len(parts) == 2:
+                downloaded_chunks.append(parts[1])
+
+# Download all combinations of tranches and subsets
+for tranche in tranches:
+    chunks = []
+    for subset in subsets:
+        full_tranche = tranche + subset
+        print(f"Processing tranche: {full_tranche}")
+        # there may (!) be more than one chunk per tranche
+        for chunk in ("xaa", "xab", "xac"):
+            chunk_url = (
+                f"{zinc_mirror}/3D/{tranche}/{subset}/{full_tranche}.{chunk}.pdbqt.gz"
+            )
+            chunkfile = os.path.join(directory, f"{full_tranche}.{chunk}.pdbqt.gz")
+
+            # skip the download if the chunk file already exists and the hashfile has an entry for it
+            if os.path.exists(chunkfile) and chunkfile in downloaded_chunks:
+                chunks.append(chunkfile)
+                print(f" Found existing chunk file: {full_tranche}.{chunk}.pdbqt.gz")
+                print(" Skipping download...")
+                continue
+
+            # Download the chunk with curl; --fail makes curl return non-zero if the chunk does not exist
+            cmd = ["curl", "--remote-time", "--fail", "-o", chunkfile, chunk_url]
+            proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+            if proc.returncode == 0:
+                chunks.append(chunkfile)
+                # compute sha256 for the chunk and append it to the hashfile
+                print(f" Downloaded: {full_tranche}.{chunk}.pdbqt.gz")
+                print(" Computing SHA-256 checksum...")
+                hashobj = hashlib.sha256()
+                with open(chunkfile, "rb") as f:
+                    for c in iter(lambda: f.read(8192), b""):
+                        hashobj.update(c)
+                digest = hashobj.hexdigest()
+                with open(hashfile, "a") as hf:
+                    hf.write(f"{digest} {chunkfile}\n")
+
+                print(f" Ready: {full_tranche}.{chunk}.pdbqt.gz")
+            else:
+                # No more chunks found - delete any partial file
+                if os.path.exists(chunkfile):
+                    os.remove(chunkfile)
+                break
+        if proc.returncode != 0 and len(chunks) > 0:
+            continue
+
+if not chunks:
+    raise RuntimeError(f"No chunks found for tranche {tranche}/{subset}")
+
+print(f"\nCombining {len(chunks)} chunk(s) into {output_file}")
-out = str(snakemake.output)
-weight_log = out.split("/")[-2]
-attr = out.rsplit("/", 1)[-1]
+# Combine all chunks into the final output file
+# Decompress each chunk, concatenate, then recompress
+with gzip.open(output_file, "wb") as outf:
+    for chunk_path in chunks:
+        print(f" Reading {os.path.basename(chunk_path)}")
+        with gzip.open(chunk_path, "rb") as inf:
+            outf.write(inf.read())
-file_name = "".join([attr[:6], ".xaa.pdbqt.gz"])
-url = os.path.join("http://files.docking.org/3D", weight_log, attr[2:6], file_name)
+        # Clean up chunk file
+        os.remove(chunk_path)
-folder = os.path.join(
-    snakemake.config["INPUT_DIR"], "ZINC", weight_log, "".join([attr[:6], ".pdbqt.gz"])
-)
+print(f"Successfully created {output_file}")
-r = requests.get(url, timeout=60)
-with open(folder, "wb") as outfile:
-    outfile.write(r.content)
+# Compute sha256 and write to the .sh2 file (same format as `sha256sum`)
+print("Computing SHA-256 checksum...")
+hashed = output_file + ".sh2"
+hashobj = hashlib.sha256()
+with open(output_file, "rb") as f:
+    for chunk in iter(lambda: f.read(8192), b""):
+        hashobj.update(chunk)
+digest = hashobj.hexdigest()
+with open(hashed, "w") as hf:
+    hf.write(f"{digest} {output_file}\n")
-hashed = folder + ".sh2"
-shell: "sha256sum {folder} > {hashed}"
+print(f"Checksum written to {hashed}")
+print("Done!")
diff --git a/workflow/scripts/prepareDocking.py b/workflow/scripts/prepareDocking.py
index 80ed987..305036f 100644
--- a/workflow/scripts/prepareDocking.py
+++ b/workflow/scripts/prepareDocking.py
@@ -6,7 +6,7 @@ import os
 
 input_directory = snakemake.input.in_dir
-output_directory = snakemake.config["OUTPUT_DIR"]
+output_directory = snakemake.output.
 
 if "ZINC" in snakemake.config["DATABASE"]:
     weightLog = input_directory[-2:]
diff --git a/workflow/scripts/prepareReceptor.py b/workflow/scripts/prepareReceptor.py
index 239b81c..ffd5f6a 100644
--- a/workflow/scripts/prepareReceptor.py
+++ b/workflow/scripts/prepareReceptor.py
@@ -1,8 +1,13 @@
 """preparation of proteins specified in configfile"""
 import os
+import sys
+import tempfile
 from Bio.PDB import PDBParser, PDBIO
 
+# Redirect all stdout/stderr to the log file
+sys.stdout = open(snakemake.log[0], 'w', buffering=1)  # line buffering
+sys.stderr = sys.stdout
 
 
 def removeChains(model, chainlist):
     """
@@ -39,18 +43,31 @@ def prepareRec(inputfile, outputfile, target):
     """
     select chains to delete depending on config definition
     """
-    print(target)
+    print(f"Preparing target: {target}")
+    # target might be a gzipped file
+    if inputfile.endswith(".gz"):
+        import gzip
+        import shutil
+        # the unzipped file needs to be temporary
+        with tempfile.NamedTemporaryFile(delete=False) as f:
+            print(f" Unzipping {inputfile} to temporary file.")
+            ungzipped = f.name
+        with gzip.open(inputfile, "rb") as f_in:
+            with open(ungzipped, "wb") as f_out:
+                shutil.copyfileobj(f_in, f_out)
+        inputfile = ungzipped
     ID = target.split(",")
     chains = ID[1].split(" ")
 
     parser = PDBParser()  # MMCIFParser()
     structure = parser.get_structure(ID[0], inputfile)
     model = structure[0]
 
+    print(f" Removing chains not in: {chains}")
     removeChains(model, chains)
 
     io = PDBIO()
     io.set_structure(structure)
     out = outputfile
-    print("printing outfile")
+    print(f" Printing outfile: {out}")
     io.save(out)
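Note on the ZINC reachability check: the check_zinc_url() helper added in workflow/rules/analyse.smk first honors a truthy ZINC_IGNORE_CHECK entry in the config, then looks for a zinc_available(url) hook in workflow.scripts.user_checks or user_checks before falling back to its own HEAD-request probes. A minimal sketch of such an override module is given below; the file location, probe logic, and accepted status codes are assumptions, only the zinc_available(url) signature and its truthy-means-reachable contract come from the workflow.

# workflow/scripts/user_checks.py (hypothetical override module)
import requests


def zinc_available(url):
    """Called by check_zinc_url() with the mirror's /3D/ URL; a truthy return means "reachable"."""
    try:
        r = requests.head(url, allow_redirects=True, timeout=10)
        # treat anything short of a server error as "the mirror is up"
        return r.status_code < 500
    except requests.RequestException:
        return False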