From 4d62bb411bb2f8cbd4d837db66d881b82111e57f Mon Sep 17 00:00:00 2001 From: Maya Sheth Date: Wed, 12 Mar 2025 10:38:50 -0700 Subject: [PATCH 1/2] modify load function for compatibility with raw E2G pred --- workflow/scripts/crisprComparisonLoadInputData.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/workflow/scripts/crisprComparisonLoadInputData.R b/workflow/scripts/crisprComparisonLoadInputData.R index fda4778..153a5ff 100644 --- a/workflow/scripts/crisprComparisonLoadInputData.R +++ b/workflow/scripts/crisprComparisonLoadInputData.R @@ -198,6 +198,12 @@ load_encode_pred_file <- function(file, showProgress) { # load predictions and remove optional "#" in header row pred <- fread(file) colnames(pred)[[1]] <- sub("^#", "", colnames(pred)[[1]]) + + if ("PredictionCellType" %in% colnames(pred)) { + pred <- pred %>% rename(CellType = PredictionCellType) + } + + pred <- pred %>% mutate(name = paste0(chr, ":", start, "-", end)) return(pred) From f3630577275b67cf7aa5fad740197b922572c331 Mon Sep 17 00:00:00 2001 From: Maya Sheth Date: Wed, 12 Mar 2025 18:02:18 -0700 Subject: [PATCH 2/2] add memory estimation function --- workflow/Snakefile | 2 ++ workflow/rules/crispr_comparison.smk | 6 +++--- workflow/rules/utils.smk | 10 ++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 workflow/rules/utils.smk diff --git a/workflow/Snakefile b/workflow/Snakefile index ba33077..60474dd 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -3,8 +3,10 @@ configfile: "config/config.yml" # rules for CRISPR comparisons +include: "rules/utils.smk" include: "rules/crispr_comparison.smk" + # perform all comparisons listed in config.yml rule all: input: diff --git a/workflow/rules/crispr_comparison.smk b/workflow/rules/crispr_comparison.smk index 814ea87..4cd60ef 100644 --- a/workflow/rules/crispr_comparison.smk +++ b/workflow/rules/crispr_comparison.smk @@ -65,7 +65,7 @@ rule mergePredictionsWithExperiment: log: 
"results/{comparison}/logs/mergePredictionsWithExperiment.log" conda: "../envs/r_crispr_comparison.yml" resources: - mem_mb = 32000 + mem_mb = determine_mem_mb script: "../../workflow/scripts/mergePredictionsWithExperiment.R" @@ -80,7 +80,7 @@ rule annotateEnhFeatures: "results/{comparison}/expt_pred_merged_annot.txt.gz" conda: "../envs/r_crispr_comparison.yml" resources: - mem_mb = 32000 + mem_mb = determine_mem_mb script: "../../workflow/scripts/annotateMergedData.R" @@ -99,7 +99,7 @@ rule comparePredictionsToExperiment: include_col = lambda wildcards: get_optional_parameter(wildcards, "include_col", None) conda: "../envs/r_crispr_comparison.yml" resources: - mem_mb = 32000, + mem_mb = determine_mem_mb, runtime = "6h" script: "../../workflow/scripts/comparePredictionsToExperiment.Rmd" diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk new file mode 100644 index 0000000..c7f1d90 --- /dev/null +++ b/workflow/rules/utils.smk @@ -0,0 +1,10 @@ +MAX_MEM_MB = 250 * 1000 # 250GB + +def determine_mem_mb(wildcards, input, attempt, min_gb=8): + # Memory resource calculator for snakemake rules + input_size_mb = input.size_mb + if ".gz" in str(input): + input_size_mb *= 8 # assume gz compressed the file <= 8x + attempt_multiplier = 2 ** (attempt - 1) # Double memory for each retry + mem_to_use_mb = attempt_multiplier * max(4 * input_size_mb, min_gb * 1000) + return min(mem_to_use_mb, MAX_MEM_MB) \ No newline at end of file