diff --git a/README.md b/README.md index f5e5255..03298c3 100755 --- a/README.md +++ b/README.md @@ -49,16 +49,18 @@ The way we choose the model depends on the biosamples input. The code for model ## Train model -**Important: Only train models for biosamples matching the corresponding CRISPR data (in this case, K562)** +**Important: Only train models for biosamples matching the corresponding CRISPR data** - Much of the model training code was adapted from Alireza Karbalayghareh's [original implementation](https://github.com/karbalayghareh/ENCODE-E2G). Modify `config/config_training.yaml` with your model and dataset configs -- `model_config` has columns: model, dataset, ABC_directory, feature_table, polynomial (do you want to use polynomial features?), and override_params (are there model training parameters you would like to change from the default logistic regression settings specfied in `config/config_training.yaml`?) +- `model_config` has columns: model, dataset, crispr_dataset, feature_table, polynomial (do you want to use polynomial features?), and override_params (are there model training parameters you would like to change from the default logistic regression settings specified in `config/config_training.yaml`?) - See [this example](https://pastebin.com/zt1868R3) `model_config` for how to specify override parameters. If there are no override_params, leave the column blank but still include the header. + - The `dataset` column is a comma-separated list with values corresponding to biosamples in `dataset_config`. Datasets must be provided for each `crispr_cell_type` included in the corresponding `crispr_dataset`. + - The `crispr_dataset` column corresponds to a key under `crispr_dataset` in `config/config_training.yaml`. - Feature tables must be specified for each model (example: `resources/feature_tables`) with columns: feature (name in final table), input_col (name in ABC output), second_input (multiplied by input_col if provided), aggregate_function (how to combine feature values when a CRISPR element overlaps more than one ABC element), fill_value (how to replace NAs), nice_name (used when plotting) - Note that trained models generated using polynomial features cannot directly be used in the **Apply model** workflow - `dataset_config` is an ABC biosamples config to generate ABC predictions for datasets without an existing ABC directory. -- Each dataset must correspond to a unique ABC_directory, with "biosample" in `dataset_config` equals "dataset" in `model_config`. If no ABC_directory is indicated in `model_config`, it must have an entry in `dataset_config`. + Each dataset must correspond to a row in `dataset_config` whose "biosample" equals the "dataset" value in `model_config`. The column `crispr_cell_type` indicates which cell type in the CRISPR data corresponds to this biosample. - If you are including features in addition to those generated within the pipeline (e.g.
a value in input_col or second_input of a feature table is not included in `reference_features` in `config/config_training.yaml`), you must also define how to add these values with an external_features_config, which you include in `dataset_config` in the optional column external_features_config: - An `external_features_config` has columns feature (corresponding to the missing input_col or second_input value), source_col (column name in the source file), aggregate_function (how to combine values when merging different element definitions), join_by, and source_file - join_by must be either "TargetGene" (feature is defined per gene) or "overlap" (feature is defined per element-gene pair) diff --git a/config/config_biosamples_chr22.tsv b/config/config_biosamples_chr22.tsv index b9559d7..670633b 100755 --- a/config/config_biosamples_chr22.tsv +++ b/config/config_biosamples_chr22.tsv @@ -1,2 +1,2 @@ -biosample DHS ATAC H3K27ac default_accessibility_feature HiC_file HiC_type HiC_resolution alt_TSS alt_genes -K562_chr22 ABC/example_chr/chr22/ENCFF860XAE.chr22.sorted.se.bam DHS https://www.encodeproject.org/files/ENCFF621AIY/@@download/ENCFF621AIY.hic hic 5000 ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.TSS500bp.bed ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.bed \ No newline at end of file +biosample crispr_cell_type DHS ATAC H3K27ac default_accessibility_feature HiC_file HiC_type HiC_resolution alt_TSS alt_genes +K562_chr22 K562 ABC/example_chr/chr22/ENCFF860XAE.chr22.sorted.se.bam DHS https://www.encodeproject.org/files/ENCFF621AIY/@@download/ENCFF621AIY.hic hic 5000 ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.TSS500bp.bed ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.bed \ No newline at end of file diff --git a/config/config_models_test.tsv b/config/config_models_test.tsv index a4a7988..392ed18 100644 --- a/config/config_models_test.tsv +++ b/config/config_models_test.tsv @@ -1,2 +1,2 @@ -model dataset ABC_directory feature_table polynomial override_params -DNase_megamap K562_chr22 resources/feature_tables/final_feature_set_DNase_hic.tsv False +model dataset crispr_dataset feature_table polynomial override_params +DNase_megamap K562_chr22 training resources/feature_tables/final_feature_set_DNase_hic.tsv FALSE \ No newline at end of file diff --git a/config/config_training.yaml b/config/config_training.yaml index fa8536b..1b78793 100755 --- a/config/config_training.yaml +++ b/config/config_training.yaml @@ -12,7 +12,10 @@ max_memory_allocation_mb: 250000 # for submiting jobs gene_TSS500: "reference/CollapsedGeneBounds.hg38.TSS500bp.bed" # TSS reference file chr_sizes: "reference/GRCh38_EBV.no_alt.chrom.sizes.tsv" gene_classes: "resources/external_features/gene_promoter_class_RefSeqCurated.170308.bed.CollapsedGeneBounds.hg38.TSS500bp.tsv" -crispr_dataset: "reference/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz" # CRISPR dataset +crispr_dataset: + training: "reference/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz" # CRISPR dataset +crispr_cell_types: + training: ["K562"] # list of features that are generated by default, referenced by their column names in source files; reference_features: ["numTSSEnhGene", "distance", "activity_base", "TargetGenePromoterActivityQuantile", "numNearbyEnhancers", "sumNearbyEnhancers", "is_ubiquitous_uniform", "P2PromoterClass", "numCandidateEnhGene", "hic_contact_pl_scaled_adj", "ABC.Score", "ABC.Numerator", "ABC.Denominator",
"normalized_dhs_prom", "normalized_dhs_enh", "normalized_atac_prom", "normalized_atac_enh", "normalized_h3k27ac_enh", "normalized_h3k27ac_prom"] diff --git a/workflow/Snakefile_training b/workflow/Snakefile_training index 545a8fe..c844bdd 100755 --- a/workflow/Snakefile_training +++ b/workflow/Snakefile_training @@ -1,27 +1,34 @@ from snakemake.utils import min_version min_version("7.0") +conda: "mamba" import pandas as pd import os import yaml -configfile: "config/config_training.yaml" model_config = pd.read_table(config["model_config"], na_values="").fillna("None").set_index("model", drop=False) dataset_config = pd.read_table(config["dataset_config"], na_values="").set_index("biosample", drop=False) conda: "mamba" + +# import utils and make config paths absolute MAX_MEM_MB = config["max_memory_allocation_mb"] include: "rules/utils.smk" E2G_DIR_PATH = os.path.abspath(config["E2G_DIR_PATH"]) config = make_paths_absolute(config, E2G_DIR_PATH) +config["results_dir"] = os.path.join(E2G_DIR_PATH, config["results_dir"]) # manually modify results dir since may not exist -# Need to manually make results_dir an absolute path since above may -# not work if results_dir folder isn't created -# If results_dir is already an absolute path, this is a no-op -config["results_dir"] = os.path.join(E2G_DIR_PATH, config["results_dir"]) +# define some global variables +RESULTS_DIR = config["results_dir"] +MODELS_RESULTS_DIR = os.path.join(config["results_dir"], "model_results") +SCRIPTS_DIR = os.path.join(E2G_DIR_PATH, "workflow/scripts") +# process configs model_config = process_model_config(model_config) +model_dataset_dict = make_model_dataset_dict(model_config, dataset_config) # dictionary of dictionaries: model: {crispr_ct: dataset, ...} +print(model_dataset_dict) +# import ABC submodule module ABC: snakefile: f"{config['ABC_DIR_PATH']}/workflow/Snakefile" @@ -31,39 +38,39 @@ abc_config = get_abc_config(config) use rule * from ABC exclude all as abc_* -RESULTS_DIR = config["results_dir"] -SCRIPTS_DIR = os.path.join(E2G_DIR_PATH, "workflow/scripts") - -# These rules requires the variables above to be defined +# import all rules (require the variables above to be defined) include: "rules/genomewide_features.smk" include: "rules/crispr_features.smk" include: "rules/train_model.smk" include: "rules/feature_analysis.smk" include: "rules/compare_models.smk" -# Validate ABC_directory column and make ABC directory dict +# validate ABC_directory column and make ABC directory dict ABC_BIOSAMPLES_DIR = process_abc_directory_column(model_config) # specify target files output_files = [] -output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), zip, dataset=model_config["dataset"], model=model_config["model"])) -output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "model_full.pkl"), zip, dataset=model_config["dataset"], model=model_config["model"])) # trained models +output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "model_full.pkl"), model=model_config["model"])) # trained models -output_files.append(os.path.join(RESULTS_DIR, "performance_across_models.tsv")) # comparison across models -output_files.extend(expand(os.path.join(RESULTS_DIR, "performance_across_models_{metric}.pdf"), metric=["auprc", "precision"])) # plot of comparisons +# output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", 
"for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), zip, dataset=model_config["dataset"], model=model_config["model"])) +# output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "model_full.pkl"), zip, dataset=model_config["dataset"], model=model_config["model"])) # trained models + +output_files.append(os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv")) # comparison across models +output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "performance_across_models_{metric}.pdf"), metric=["auprc", "precision"])) # plot of comparisons if config["run_feature_analysis"]: - output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"])) - output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"])) - output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"])) + output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), model=model_config["model"])) + output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), model=model_config["model"])) + output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), model=model_config["model"])) # only test all feature sets if polynomial==False and n_features<14 for row in model_config.itertuples(index=False): if not row.polynomial == 'True': features = pd.read_table(row.feature_table) n_features = len(features) + if n_features<14: - output_files.append(os.path.join(RESULTS_DIR, row.dataset, row.model, "feature_analysis", "all_feature_sets.tsv")) + output_files.append(os.path.join(MODELS_RESULTS_DIR, row.model, "feature_analysis", "all_feature_sets.tsv")) rule all: input: diff --git a/workflow/rules/compare_models.smk b/workflow/rules/compare_models.smk index b0623d8..d6ce2ed 100755 --- a/workflow/rules/compare_models.smk +++ b/workflow/rules/compare_models.smk @@ -2,14 +2,16 @@ # compare cv-performance on training data across all models (note, this is not the true benchmarking performance CRISPR elements not overlapping prediction elements aren't considered) rule gather_model_performances: input: - all_predictions = expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_predictions.tsv"), zip, dataset=model_config["dataset"], model=model_config["model"]) + all_predictions = expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_predictions.tsv"), model=model_config["model"]), + all_missing = expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "merged_CRISPR_dataset.missing_from_features.NAfilled.tsv.gz"), model=model_config["model"]), output: - comp_table = os.path.join(RESULTS_DIR, "performance_across_models.tsv") + comp_table = os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv") params: scripts_dir = SCRIPTS_DIR, out_dir = RESULTS_DIR, model_config_file = config["model_config"], - crispr_dataset = config["crispr_dataset"] + crispr_dataset_names = [n for n in 
model_config["crispr_dataset"].unique()], + crispr_dataset = lambda wildcards: [config["crispr_dataset"][cd] for cd in model_config["crispr_dataset"].unique()] conda: "../envs/encode_re2g.yml" resources: @@ -17,18 +19,21 @@ rule gather_model_performances: shell: """ python {params.scripts_dir}/model_training/compare_all_models.py \ + --all_pred "{input.all_predictions}" \ + --all_missing "{input.all_missing}" \ --model_config_file {params.model_config_file} \ --output_file {output.comp_table} \ - --crispr_data {params.crispr_dataset} \ + --crispr_names "{params.crispr_dataset_names}" \ + --crispr_data "{params.crispr_dataset}" \ --out_dir {params.out_dir} """ rule plot_model_performances: input: - comp_table = os.path.join(RESULTS_DIR, "performance_across_models.tsv") + comp_table = os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv") output: - comp_plot_auprc = os.path.join(RESULTS_DIR, "performance_across_models_auprc.pdf"), - comp_plot_prec = os.path.join(RESULTS_DIR, "performance_across_models_precision.pdf") + comp_plot_auprc = os.path.join(MODELS_RESULTS_DIR, "performance_across_models_auprc.pdf"), + comp_plot_prec = os.path.join(MODELS_RESULTS_DIR, "performance_across_models_precision.pdf") conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/crispr_features.smk b/workflow/rules/crispr_features.smk index 378fdf0..b7a9e3a 100644 --- a/workflow/rules/crispr_features.smk +++ b/workflow/rules/crispr_features.smk @@ -28,18 +28,18 @@ rule format_external_features_config: script: "../scripts/feature_tables/format_external_features_config.R" -# overlap feature table with K562 CRISPR data -rule overlap_features_crispr: +# overlap feature table with CRISPR data for this dataset +rule overlap_features_crispr_for_dataset: input: features = os.path.join(RESULTS_DIR, "{dataset}", "genomewide_features.tsv.gz"), - crispr = config['crispr_dataset'], feature_table_file = os.path.join(RESULTS_DIR, "{dataset}", "feature_table.tsv"), + crispr = lambda wildcards: config['crispr_dataset'][wildcards.crispr_dataset], tss = config['gene_TSS500'] params: - tpm_threshold = lambda wildcards: model_config.loc[wildcards.model, 'tpm_threshold'] + crispr_cell_type = lambda wildcards: dataset_config.loc[wildcards.dataset, "crispr_cell_type"] output: - features = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz"), - missing = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "missing.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz") + features = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset_{crispr_dataset}.overlapping_features.{nafill}.tsv.gz"), + missing = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset_{crispr_dataset}.missing_from_features.{nafill}.tsv.gz") conda: "../envs/encode_re2g.yml" resources: @@ -47,15 +47,23 @@ rule overlap_features_crispr: script: "../scripts/feature_tables/overlap_features_with_crispr_data.R" -# process data for model training: rename columns, apply filter features, filter to gene list +# process data for model training: rename columns, apply filter features, filter to gene list, combine across datasets # note: we use the NAfilled CRISPR feature data here! 
+def get_crispr_files_for_model(wildcards, file_type): + datasets = model_dataset_dict[wildcards.model].values() + crispr_dataset = model_config.loc[wildcards.model, "crispr_dataset"] + files = [os.path.join(RESULTS_DIR, ds, f"CRISPR_dataset_{crispr_dataset}.{file_type}.{wildcards.nafill}.tsv.gz") for ds in datasets] + return files + rule process_crispr_data: input: - crispr_features = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz") + crispr_features = lambda wildcards: get_crispr_files_for_model(wildcards, "overlapping_features"), + crispr_missing = lambda wildcards: get_crispr_files_for_model(wildcards, "missing_from_features"), params: - genes = config["gene_TSS500"] + genes = config["gene_TSS500"], output: - processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz") + processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.{nafill}.tsv.gz"), + missing = os.path.join(MODELS_RESULTS_DIR, "{model}", "merged_CRISPR_dataset.missing_from_features.{nafill}.tsv.gz") conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/feature_analysis.smk b/workflow/rules/feature_analysis.smk index 02354a1..3f6925e 100755 --- a/workflow/rules/feature_analysis.smk +++ b/workflow/rules/feature_analysis.smk @@ -1,16 +1,16 @@ # forward sequential feature selection rule calculate_forward_feature_selection: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -28,13 +28,13 @@ rule calculate_forward_feature_selection: rule plot_forward_feature_selection: input: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection.tsv"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], params: polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'] output: - out_auprc = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), - out_prec = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection_precision.pdf") + out_auprc = os.path.join(MODELS_RESULTS_DIR, "{model}", 
"feature_analysis", "forward_feature_selection_auprc.pdf"), + out_prec = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection_precision.pdf") conda: "../envs/encode_re2g.yml" resources: @@ -45,16 +45,16 @@ rule plot_forward_feature_selection: # backward sequential feature selection rule calculate_backward_feature_selection: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -72,13 +72,13 @@ rule calculate_backward_feature_selection: rule plot_backward_feature_selection: input: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection.tsv"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'] params: polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'] output: - out_auprc = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), - out_prec = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_precision.pdf") + out_auprc = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), + out_prec = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection_precision.pdf") conda: "../envs/encode_re2g.yml" resources: @@ -89,15 +89,15 @@ rule plot_backward_feature_selection: # compare all features sets rule compare_all_feature_sets: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - 
results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "all_feature_sets.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "all_feature_sets.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -116,17 +116,17 @@ rule compare_all_feature_sets: # permuation feature importance rule calculate_permutation_feature_importance: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], n_repeats = 20, scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -145,14 +145,14 @@ rule calculate_permutation_feature_importance: rule plot_permutation_feature_importance: input: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance.tsv"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'] params: polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], n_repeats = 20 output: - out_auprc = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), - out_prec = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance_precision.pdf") + out_auprc = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), + out_prec = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance_precision.pdf") conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/genomewide_features.smk b/workflow/rules/genomewide_features.smk index 8afe56e..0e7c8f8 100644 --- a/workflow/rules/genomewide_features.smk +++ b/workflow/rules/genomewide_features.smk @@ -172,7 +172,7 @@ rule add_external_features: input: predictions_extended = os.path.join(RESULTS_DIR, "{biosample}", "ActivityOnly_features.tsv.gz"), feature_table_file = os.path.join(RESULTS_DIR, "{biosample}", "feature_table.tsv"), - external_features_config = ancient(os.path.join(RESULTS_DIR, "{biosample}", "external_features_config.tsv")) + external_features_config = os.path.join(RESULTS_DIR, "{biosample}", "external_features_config.tsv") output: plus_external_features = os.path.join(RESULTS_DIR, "{biosample}", "ActivityOnly_plus_external_features.tsv.gz") conda: diff --git a/workflow/rules/train_model.smk 
b/workflow/rules/train_model.smk index 9a7e683..4f8093b 100644 --- a/workflow/rules/train_model.smk +++ b/workflow/rules/train_model.smk @@ -7,7 +7,7 @@ rule generate_model_params: default_params = config["default_params"], scripts_dir = SCRIPTS_DIR output: - final_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + final_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") conda: "../envs/encode_re2g.yml" shell: @@ -21,19 +21,17 @@ # generate trained model and cross-validated predictions on CRISPR data rule train_model: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "model") output: - trained_model = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "model_full.pkl"), - pred = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_predictions.tsv"), - #in_order = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_data_in_order.tsv"), - #shap = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "shap_scores.tsv") + trained_model = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "model_full.pkl"), + pred = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_predictions.tsv"), conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index fa1db11..2fcc66b 100755 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -35,6 +35,34 @@ def process_model_config(model_config): return model_config +# create dictionary of dictionaries to map datasets to crispr cell types: {model: {crispr_ct: dataset, ...}} +def make_model_dataset_dict(model_config, dataset_config): + + # dict of dataset: crispr_cell_type + dataset_celltype_dict = dict(zip(dataset_config['biosample'], dataset_config['crispr_cell_type'])) + + model_dicts = [] + for index, row in model_config.iterrows(): + model_datasets = [item.strip() for item in row["dataset"].split(",")] # return list of datasets + mapped_celltypes = [dataset_celltype_dict[dataset] for dataset in model_datasets] + + # make sure 1:1 correspondence with CRISPR cell types + crispr_data_cell_types = config["crispr_cell_types"][row["crispr_dataset"]] + + if sorted(crispr_data_cell_types) != sorted(mapped_celltypes): + print(f"Model datasets: {model_datasets} -> cell types: {mapped_celltypes}") + raise Exception(f"Datasets specified for {row['model']} do not map to all CRISPR cell types.") + + this_model_dict = dict(zip(mapped_celltypes, model_datasets)) + model_dicts.append(this_model_dict) + + # combine into dictionary of dictionaries + model_dataset_dict = dict(zip(model_config["model"], model_dicts)) + + return model_dataset_dict + + 
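For reference, a self-contained sketch of the mapping that `make_model_dataset_dict` builds, using the example configs from this diff (`config_models_test.tsv`, `config_biosamples_chr22.tsv`, and the `crispr_cell_types` block in `config_training.yaml`); the dataframes below stand in for the parsed config files.

```python
import pandas as pd

# stand-ins for the parsed model_config and dataset_config tables from this diff
model_config = pd.DataFrame(
    {"model": ["DNase_megamap"], "dataset": ["K562_chr22"], "crispr_dataset": ["training"]}
)
dataset_config = pd.DataFrame(
    {"biosample": ["K562_chr22"], "crispr_cell_type": ["K562"]}
)
crispr_cell_types = {"training": ["K562"]}  # from config_training.yaml

# map each biosample to the CRISPR cell type it represents
dataset_celltype = dict(zip(dataset_config["biosample"], dataset_config["crispr_cell_type"]))

model_dataset_dict = {}
for row in model_config.itertuples(index=False):
    datasets = [d.strip() for d in row.dataset.split(",")]
    celltypes = [dataset_celltype[d] for d in datasets]
    # every cell type in the chosen CRISPR dataset must be covered by exactly one biosample
    assert sorted(celltypes) == sorted(crispr_cell_types[row.crispr_dataset])
    model_dataset_dict[row.model] = dict(zip(celltypes, datasets))

print(model_dataset_dict)
# {'DNase_megamap': {'K562': 'K562_chr22'}}
```

A model whose `dataset` column listed two biosamples mapping to the same cell type, or missed a cell type present in the CRISPR dataset, would fail this check, mirroring the exception raised in the function above.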
+ def get_abc_config(config): abc_config_file = os.path.join(config["ABC_DIR_PATH"], "config/config.yaml") with open(abc_config_file, 'r') as stream: @@ -86,7 +114,7 @@ def make_accessibility_file_df(biosample_df, biosample_activities): def get_input_for_bw(this_biosample, this_simple_id): df_sub = ACCESSIBILITY_DF.loc[(ACCESSIBILITY_DF["biosample"]==this_biosample) & (ACCESSIBILITY_DF["access_simple_id"]==this_simple_id)] - return df_sub["single_access_file"][0] + return df_sub["single_access_file"].item() def expand_biosample_df(biosample_df): # add new columns @@ -120,13 +148,15 @@ def process_abc_directory_column(model_config): # Make a dictionary of dataset:ABC_dir pairs ABC_BIOSAMPLES_DIR = {} for row in model_config.itertuples(index=False): - if row.dataset not in ABC_BIOSAMPLES_DIR: - if row.ABC_directory=="None": # ABC directory is not provided - if row.dataset not in dataset_config['biosample']: # is there info to run ABC? - raise Exception(f"Dataset {row.dataset} not specified in dataset_config.") - ABC_BIOSAMPLES_DIR[row.dataset] = os.path.join(RESULTS_DIR, row.dataset) - else: # ABC directory is provided - ABC_BIOSAMPLES_DIR[row.dataset] = row.ABC_directory + model_datasets = [item.strip() for item in row.dataset.split(",")] + for ds in model_datasets: + if ds not in ABC_BIOSAMPLES_DIR: + if row.ABC_directory=="None": # ABC directory is not provided + if ds not in dataset_config['biosample']: # is there info to run ABC? + raise Exception(f"Dataset {ds} not specified in dataset_config.") + ABC_BIOSAMPLES_DIR[ds] = os.path.join(RESULTS_DIR, ds) + else: # ABC directory is provided + ABC_BIOSAMPLES_DIR[ds] = ds return ABC_BIOSAMPLES_DIR @@ -199,4 +229,4 @@ def get_tpm_threshold(biosample, model_name, biosample_df=None): return 0 else: tpm_file = os.path.basename(tpm_files[0]) - return tpm_file.split("threshold_")[1] \ No newline at end of file + return tpm_file.split("threshold_")[1] diff --git a/workflow/scripts/feature_tables/combine_feature_tables.R b/workflow/scripts/feature_tables/combine_feature_tables.R index 54cbe91..0268b49 100644 --- a/workflow/scripts/feature_tables/combine_feature_tables.R +++ b/workflow/scripts/feature_tables/combine_feature_tables.R @@ -4,15 +4,19 @@ library(dplyr) # inputs from snakemake model_config = fread(snakemake@input$model_config) -ds = snakemake@wildcards$dataset +ds = snakemake@wildcards$dataset -# merge feature tables -models_this = dplyr::filter(model_config, dataset==ds) -for (i in 1:nrow(models_this)){ - ft = fread(models_this$feature_table[i]) - if (i==1){df = ft} else {df = rbind(df, ft)} +# merge feature tables for models with this dataset +ft_files = c() +for (i in 1:nrow(model_config)){ + model_datasets = model_config$dataset[i] %>% strsplit(",") %>% unlist() %>% trimws() + if (ds %in% model_datasets) { + ft_files = c(ft_files, model_config$feature_table[i]) + } } +df <- lapply(ft_files, fread) %>% rbindlist() %>% as.data.frame() + # for sc-E2G pipeline if (("ARC.E2G.Score" %in% df$feature) | ("Kendall" %in% df$feature)){ ARC_rows = data.frame(c("RNA_meanLogNorm", "RNA_pseudobulkTPM", "RNA_percentCellsDetected", "Kendall", "ARC.E2G.Score", "ABC.Score", "normalizedATAC_enh"), diff --git a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py index f1ecca0..6ea4300 100644 --- a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py @@ -44,7 +44,7 @@ def 
determine_num_candidate_enh_gene(pred_df, out_file): @click.option("--abc_predictions") @click.option("--out_file") def main(abc_predictions, out_file): - pred_df = pd.read_csv(abc_predictions, sep="\t") + pred_df = pd.read_csv(abc_predictions, sep="\t", compression="gzip") if len(pred_df) == 0: raise Exception("Did not find any enhancers in the Predictions file") diff --git a/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py b/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py index d1aab36..942fcca 100644 --- a/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py +++ b/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py @@ -35,11 +35,13 @@ def generate_num_sum_enhancers( ) # select columns from EnhancerPredictionsAllPutative - os.system( - "zcat {} | csvtk cut -t -f chr,start,end,name,activity_base | sed '1d' > {}".format( - pred_file, pred_slim - ) - ) + df = pd.read_csv(pred_file, sep="\t", usecols=["chr", "start", "end", "name", "activity_base"], compression= "gzip") + df.to_csv(pred_slim, sep="\t", index=False, header=False) + # os.system( + # "zcat {} | csvtk cut -t -f chr,start,end,name,activity_base | sed '1d' > {}".format( + # pred_file, pred_slim + # ) + # ) # intersect with expanded enhancer regions and count n enhancers os.system( diff --git a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py index 211a2bb..4643858 100644 --- a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py @@ -53,7 +53,7 @@ def determine_num_tss_enh_gene( @click.option("--enhancer_tss_int") @click.option("--out_file") def main(abc_predictions, ref_gene_tss, extended_enhancers, enhancer_tss_int, out_file): - pred_df = pd.read_csv(abc_predictions, sep="\t") + pred_df = pd.read_csv(abc_predictions, sep="\t", compression="gzip") if len(pred_df) == 0: raise Exception("Did not find any enhancers in the Predictions file") diff --git a/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R b/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R index 34d2850..ca470d9 100755 --- a/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R +++ b/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R @@ -57,7 +57,7 @@ merge_feature_to_crispr <- function(crispr, features, feature_score_cols, agg_fu output <- merged # sort output by cre position - output <- output[order(dataset, chrom, chromStart, chromEnd, measuredGeneSymbol), ] + output <- output[order(chrom, chromStart, chromEnd, measuredGeneSymbol), ] return(list(merged=output, missing=missing)) } @@ -81,9 +81,10 @@ aggregate_features <- function(merged, feature_score_cols, agg_cols, agg_fun) { # load feature table features <- fread(snakemake@input$features) -# load crispri data and only retain relevant columns +# load crispri data and only retain relevant columns and filter to cell type crispr <- fread(snakemake@input$crispr) -crispr <- select(crispr, -c(pair_uid, merged_uid, merged_start, merged_end)) +crispr <- select(crispr, -any_of(c("pair_uid", "merged_uid", "merged_start", "merged_end"))) %>% + filter(CellType == snakemake@params$crispr_cell_type) # load feature config file and only retain entries for features in input data config <- fread(snakemake@input$feature_table_file) diff --git a/workflow/scripts/feature_tables/process_crispr_data.R b/workflow/scripts/feature_tables/process_crispr_data.R index e382a16..73f070c 100755 --- 
a/workflow/scripts/feature_tables/process_crispr_data.R +++ b/workflow/scripts/feature_tables/process_crispr_data.R @@ -8,19 +8,45 @@ suppressPackageStartupMessages({ library(tibble) }) +combine_crispr_features <- function(space_sep_string) { + crispr_feature_files <- space_sep_string %>% strsplit(" ") %>% unlist() %>% trimws() + crispr_features <- lapply(crispr_feature_files, fread) + + # get common columns (e.g. features in all) + col_names_list <- lapply(crispr_features, colnames) + common_cols <- Reduce(intersect, col_names_list) + + # filter crispr features to selected columns then combine + crispr_features <- lapply(crispr_features, function(df) select(df, all_of(common_cols))) %>% + rbindlist() %>% as.data.frame() + + return(crispr_features) +} + + + # load inputs -df <- fread(snakemake@input$crispr_features) +crispr_features <- combine_crispr_features(snakemake@input$crispr_features) +crispr_missing <- combine_crispr_features(snakemake@input$crispr_missing) + + genes <- fread(snakemake@params$genes) colnames(genes) <- c('chr', 'start', 'end', 'gene', 'score', 'strand', 'Ensembl_ID', 'gene_type') # rename some column names for later utility -df = dplyr::rename(df, 'chr'='chrom', 'start'='chromStart', 'end'='chromEnd', 'TargetGene'='measuredGeneSymbol') +crispr_features <- crispr_features %>% + rename('chr'='chrom', 'start'='chromStart', 'end'='chromEnd', 'TargetGene'='measuredGeneSymbol') +crispr_missing <- crispr_missing %>% + rename('chr'='chrom', 'start'='chromStart', 'end'='chromEnd', 'TargetGene'='measuredGeneSymbol') # drop elements where Regulated=NA -df = dplyr::filter(df, Regulated!="NA") +crispr_features = filter(crispr_features, Regulated != "NA" & !is.na(Regulated)) +crispr_missing = filter(crispr_missing, Regulated != "NA" & !is.na(Regulated)) # filter to elements with target gene in gene universe (should be redundant with applying filter_genes=tss_universe in feature overlap) -df = dplyr::filter(df, TargetGene %in% genes$gene) +crispr_features = filter(crispr_features, TargetGene %in% genes$gene) +crispr_missing = filter(crispr_missing, TargetGene %in% genes$gene) # save output to file -fwrite(df, file = snakemake@output$processed, sep = "\t", na = "NA", quote = FALSE) +fwrite(crispr_features, file = snakemake@output$processed, sep = "\t", na = "NA", quote = FALSE) +fwrite(crispr_missing, file = snakemake@output$missing, sep = "\t", na = "NA", quote = FALSE) diff --git a/workflow/scripts/model_training/compare_all_feature_sets.py b/workflow/scripts/model_training/compare_all_feature_sets.py index fc84e78..04546c1 100644 --- a/workflow/scripts/model_training/compare_all_feature_sets.py +++ b/workflow/scripts/model_training/compare_all_feature_sets.py @@ -5,11 +5,9 @@ import scipy from training_functions import ( statistic_aupr, - statistic_precision, + statistic_precision_at_threshold, train_and_predict_once, - bootstrap_pvalue, - statistic_delta_aupr, - statistic_delta_precision, + threshold_70_pct_recall ) @@ -32,10 +30,13 @@ def compare_feature_sets(df_dataset, feature_table, epsilon, params, n_boot): df.loc[len(df)] = fill_zeros df["features"] = df.apply(lambda row: list(df.columns[row == 1]), axis=1) - df["n_features"] = 0 - df["AUPRC"] = 0 - df["AUPRC_95CI_low"] = 0 - df["AUPRC_95CI_high"] = 0 + df["n_features"] = 0.0 + df["AUPRC"] = 0.0 + df["AUPRC_95CI_low"] = 0.0 + df["AUPRC_95CI_high"] = 0.0 + df["precision_70pct_recall"] = 0.0 + df["precision_95CI_low"] = 0.0 + df["precision_95CI_high"] = 0.0 for i in range(len(df)): # iterate through sets model_name = "row_" 
+ str(i) @@ -48,9 +49,8 @@ def compare_feature_sets(df_dataset, feature_table, epsilon, params, n_boot): ) Y_pred = df_dataset[model_name + ".Score"] - data = (Y_true, Y_pred) res_aupr = scipy.stats.bootstrap( - data, + (Y_true, Y_pred), statistic_aupr, n_resamples=n_boot, paired=True, @@ -61,6 +61,20 @@ def compare_feature_sets(df_dataset, feature_table, epsilon, params, n_boot): df.loc[i, "AUPRC_95CI_low"] = res_aupr.confidence_interval[0] df.loc[i, "AUPRC_95CI_high"] = res_aupr.confidence_interval[1] + thresh = threshold_70_pct_recall(Y_true, Y_pred) + + res_prec = scipy.stats.bootstrap( + (Y_true, Y_pred), + lambda Y_true, Y_pred: statistic_precision_at_threshold(Y_true, Y_pred, thresh), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + df.loc[i, "precision_70pct_recall"] = np.mean(res_prec.bootstrap_distribution) + df.loc[i, "precision_95CI_low"] = res_prec.confidence_interval[0] + df.loc[i, "precision_95CI_high"] = res_prec.confidence_interval[1] + # sort table by AUPRC df = df.sort_values(by="AUPRC", ascending=False) diff --git a/workflow/scripts/model_training/compare_all_models.py b/workflow/scripts/model_training/compare_all_models.py index 9b38528..b9244c8 100644 --- a/workflow/scripts/model_training/compare_all_models.py +++ b/workflow/scripts/model_training/compare_all_models.py @@ -12,10 +12,10 @@ def performance_summary( - model_id, dataset, model_name, out_dir, crispr_data="", n_boot=1000 + model_id, dataset, pred_file, missing_file, model_name, out_dir, crispr_data="", n_boot=1000 ): # read in predicitons - if model_id == "distance": + if model_id.startswith("distance"): crispr_data = pd.read_csv(crispr_data, sep="\t") crispr_data["distance"] = np.abs( (crispr_data["chromStart"] + crispr_data["chromEnd"]) / 2 @@ -26,15 +26,15 @@ def performance_summary( Y_pred_all = crispr_data["distance"] * -1 pct_missing = 0 else: # normal models - pred_file = os.path.join( - out_dir, dataset, model_id, "model", "training_predictions.tsv" - ) - missing_file = os.path.join( - out_dir, - dataset, - model_id, - "missing.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz", - ) + # pred_file = os.path.join( + # out_dir, dataset, model_id, "model", "training_predictions.tsv" + # ) + # missing_file = os.path.join( + # out_dir, + # dataset, + # model_id, + # "missing.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz", + # ) pred_df = pd.read_csv(pred_file, sep="\t") missing_df = pd.read_csv(missing_file, sep="\t") @@ -101,17 +101,27 @@ def performance_summary( @click.command() +@click.option("--all_pred", required=True) +@click.option("--all_missing", required=True) @click.option("--model_config_file", required=True) @click.option("--output_file", required=True) @click.option("--crispr_data", required=True) +@click.option("--crispr_names", required=True) @click.option("--out_dir", required=True) -def main(model_config_file, output_file, crispr_data, out_dir): +def main(all_pred, all_missing, model_config_file, output_file, crispr_data, crispr_names, out_dir): + all_pred_files = all_pred.split(" ") + all_missing_files = all_missing.split(" ") + crispr_dict = dict(zip(crispr_names.split(" "), crispr_data.split(" "))) + model_name = "E2G" model_config = ( pd.read_table(model_config_file, na_values="") .fillna("None") .set_index("model", drop=False) ) + model_config["prediction_file"] = all_pred_files + model_config["missing_file"] = all_missing_files + # initiate final df df = pd.DataFrame( columns=[ @@ -130,13 +140,15 @@ def 
main(model_config_file, output_file, crispr_data, out_dir): # iterate through rows of model config and add results to final df for row in model_config.itertuples(index=False): - res_row = performance_summary(row.model, row.dataset, model_name, out_dir) + res_row = performance_summary(row.model, row.dataset, row.prediction_file, row.missing_file, model_name, out_dir) + df = pd.concat([df, res_row]) + + for key, file_path in crispr_dict.items(): + # add row(s) for distance + res_row = performance_summary( + f"distance_{key}", "baseline", "", "", "", out_dir, crispr_data=file_path + ) df = pd.concat([df, res_row]) - # add row for distance - res_row = performance_summary( - "distance", "baseline", "", out_dir, crispr_data=crispr_data - ) - df = pd.concat([df, res_row]) # sort table by AUPRC df = df.sort_values(by="AUPRC", ascending=False) diff --git a/workflow/scripts/model_training/train_model.py b/workflow/scripts/model_training/train_model.py index 653104f..8d88ef4 100644 --- a/workflow/scripts/model_training/train_model.py +++ b/workflow/scripts/model_training/train_model.py @@ -25,6 +25,7 @@ def train_and_predict( polynomial_names = poly.get_feature_names_out(feature_list_core) X = pd.DataFrame(X, columns=polynomial_names) + Y = df_dataset["Regulated"].values.astype(np.int64) # initialize df for feature weights & metrics @@ -88,32 +89,34 @@ def train_and_predict( n_test_pos = np.sum(Y_test) n_test_neg = len(Y_test) - n_test_pos + multiple_labels = n_test_pos > 0 and n_test_neg > 0 + ll_train = log_loss(Y_train, model.predict_proba(X_train)[:, 1]) ll_test_full = ( log_loss(Y_test, probs_full[idx_test, 1]) - if n_test_pos > 0 + if multiple_labels else np.NaN ) - ll_test = log_loss(Y_test, probs[:, 1]) if n_test_pos > 0 else np.NaN + ll_test = log_loss(Y_test, probs[:, 1]) if multiple_labels else np.NaN auroc_train = roc_auc_score(Y_train, model.predict_proba(X_train)[:, 1]) auroc_test_full = ( roc_auc_score(Y_test, probs_full[idx_test, 1]) - if n_test_pos > 0 + if multiple_labels else np.NaN ) auroc_test = ( - roc_auc_score(Y_test, probs[:, 1]) if n_test_pos > 0 else np.NaN + roc_auc_score(Y_test, probs[:, 1]) if multiple_labels > 0 else np.NaN ) auprc_train = statistic_aupr( Y_train, model.predict_proba(X_train)[:, 1] ) auprc_test_full = ( statistic_aupr(Y_test, probs_full[idx_test, 1]) - if n_test_pos > 0 + if multiple_labels else np.NaN ) auprc_test = ( - statistic_aupr(Y_test, probs[:, 1]) if n_test_pos > 0 else np.NaN + statistic_aupr(Y_test, probs[:, 1]) if multiple_labels else np.NaN ) df_temp = pd.DataFrame(