From 4d62bb411bb2f8cbd4d837db66d881b82111e57f Mon Sep 17 00:00:00 2001
From: Maya Sheth <shethm@sh04-ln03.stanford.edu>
Date: Wed, 12 Mar 2025 10:38:50 -0700
Subject: [PATCH 1/2] modify load function for compatibility with raw E2G pred

---
 workflow/scripts/crisprComparisonLoadInputData.R | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/workflow/scripts/crisprComparisonLoadInputData.R b/workflow/scripts/crisprComparisonLoadInputData.R
index fda4778..153a5ff 100644
--- a/workflow/scripts/crisprComparisonLoadInputData.R
+++ b/workflow/scripts/crisprComparisonLoadInputData.R
@@ -198,6 +198,12 @@ load_encode_pred_file <- function(file, showProgress) {
   # load predictions and remove optional "#" in header row
   pred <- fread(file)
   colnames(pred)[[1]] <- sub("^#", "", colnames(pred)[[1]])
+
+  if ("PredictionCellType" %in% colnames(pred)) {
+    pred <- pred %>% rename(CellType = PredictionCellType)
+  }
+
+  pred <- pred %>% mutate(name = paste0(chr, ":", start, "-", end))
   
   return(pred)
   

From f3630577275b67cf7aa5fad740197b922572c331 Mon Sep 17 00:00:00 2001
From: Maya Sheth <shethm@sh04-02n16.int>
Date: Wed, 12 Mar 2025 18:02:18 -0700
Subject: [PATCH 2/2] add memory estimation function

---
 workflow/Snakefile                   |  2 ++
 workflow/rules/crispr_comparison.smk |  6 +++---
 workflow/rules/utils.smk             | 10 ++++++++++
 3 files changed, 15 insertions(+), 3 deletions(-)
 create mode 100644 workflow/rules/utils.smk

diff --git a/workflow/Snakefile b/workflow/Snakefile
index ba33077..60474dd 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -3,8 +3,10 @@
 configfile: "config/config.yml"
 
 # rules for CRISPR comparisons
+include: "rules/utils.smk"
 include: "rules/crispr_comparison.smk"
 
+
 # perform all comparisons listed in config.yml
 rule all:
   input:
diff --git a/workflow/rules/crispr_comparison.smk b/workflow/rules/crispr_comparison.smk
index 814ea87..4cd60ef 100644
--- a/workflow/rules/crispr_comparison.smk
+++ b/workflow/rules/crispr_comparison.smk
@@ -65,7 +65,7 @@ rule mergePredictionsWithExperiment:
   log: "results/{comparison}/logs/mergePredictionsWithExperiment.log"
   conda: "../envs/r_crispr_comparison.yml"
   resources:
-    mem_mb = 32000
+    mem_mb = determine_mem_mb
   script:
    "../../workflow/scripts/mergePredictionsWithExperiment.R"
    
@@ -80,7 +80,7 @@ rule annotateEnhFeatures:
     "results/{comparison}/expt_pred_merged_annot.txt.gz"
   conda: "../envs/r_crispr_comparison.yml"
   resources:
-    mem_mb = 32000
+    mem_mb = determine_mem_mb
   script:
     "../../workflow/scripts/annotateMergedData.R"
    
@@ -99,7 +99,7 @@ rule comparePredictionsToExperiment:
      include_col = lambda wildcards: get_optional_parameter(wildcards, "include_col", None)
   conda: "../envs/r_crispr_comparison.yml"
   resources:
-    mem_mb = 32000,
+    mem_mb = determine_mem_mb,
     runtime = "6h"
   script:
     "../../workflow/scripts/comparePredictionsToExperiment.Rmd"
diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk
new file mode 100644
index 0000000..c7f1d90
--- /dev/null
+++ b/workflow/rules/utils.smk
@@ -0,0 +1,25 @@
+MAX_MEM_MB = 250 * 1000  # 250GB
+
+def determine_mem_mb(wildcards, input, attempt, min_gb=8):
+	"""Estimate the memory (in MB) to request for a snakemake job.
+
+	Intended as a callable for a rule's `resources: mem_mb`. The estimate is
+	4x the total input size (inflated 8x first if any input is gzipped),
+	with a floor of `min_gb` GB, doubled on every retry and capped at
+	MAX_MEM_MB.
+
+	wildcards: unused here; part of the resource-callable signature.
+	input: the job's input files; must provide `size_mb`.
+	attempt: attempt number of the job, starting at 1.
+	min_gb: minimum memory to request on the first attempt, in GB.
+
+	Returns the memory to request in MB as an int.
+	"""
+	input_size_mb = input.size_mb
+	# Match on the file suffix so ".gz" elsewhere in a path does not count
+	if any(str(path).endswith(".gz") for path in input):
+		input_size_mb *= 8  # assume gz compressed the file <= 8x
+	attempt_multiplier = 2 ** (attempt - 1)  # Double memory for each retry
+	mem_to_use_mb = attempt_multiplier * max(4 * input_size_mb, min_gb * 1000)
+	# input.size_mb may be fractional; request a whole number of MB
+	return int(min(mem_to_use_mb, MAX_MEM_MB))
\ No newline at end of file