From 0eb97b491a924664e6bda7c9e7833df9ea2ad30d Mon Sep 17 00:00:00 2001 From: dltamayo Date: Fri, 26 Dec 2025 19:14:38 -0500 Subject: [PATCH 01/14] Retain original samplesheet sample order --- .../local/samplesheet/samplesheet_resolve.nf | 34 ++++++++++++++++--- subworkflows/local/resolve_samplesheet.nf | 1 + 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/modules/local/samplesheet/samplesheet_resolve.nf b/modules/local/samplesheet/samplesheet_resolve.nf index 61ca0c1..3128eff 100644 --- a/modules/local/samplesheet/samplesheet_resolve.nf +++ b/modules/local/samplesheet/samplesheet_resolve.nf @@ -3,6 +3,7 @@ process SAMPLESHEET_RESOLVE { container "ghcr.io/karchinlab/tcrtoolkit:main" input: + path samplesheet_utf8 val(resolved_rows) // List of tab-separated strings val(resolved_header) // Comma-separated header line @@ -11,10 +12,35 @@ process SAMPLESHEET_RESOLVE { script: """ - echo \"$resolved_header\" > samplesheet_resolved.csv +# Write resolved rows to a temp file +cat << 'EOF' > resolved.tmp +${resolved_rows.join('\n')} +EOF - for row in ${resolved_rows.collect{"\"${it}\""}.join(' ')}; do - echo -e "\$row" >> samplesheet_resolved.csv - done +# Emit header +echo "${resolved_header}" > samplesheet_resolved.csv + +# Two-pass awk: +# - pass 1: read original samplesheet, store sample order +# - pass 2: read resolved rows, store rows by sample +awk -F',' ' + NR==FNR { + if (FNR > 1) order[++n] = \$1 + next + } + { + resolved[\$1] = \$0 + } + END { + for (i = 1; i <= n; i++) { + s = order[i] + if (!(s in resolved)) { + printf "ERROR: missing resolved row for %s\\n", s > "/dev/stderr" + exit 1 + } + print resolved[s] + } + } +' "${samplesheet_utf8}" resolved.tmp >> samplesheet_resolved.csv """ } \ No newline at end of file diff --git a/subworkflows/local/resolve_samplesheet.nf b/subworkflows/local/resolve_samplesheet.nf index 98abe4d..a9f03a5 100644 --- a/subworkflows/local/resolve_samplesheet.nf +++ b/subworkflows/local/resolve_samplesheet.nf @@ -34,6 +34,7 @@ workflow RESOLVE_SAMPLESHEET { .set { resolved_header } SAMPLESHEET_RESOLVE( + samplesheet_utf8, resolved_rows, resolved_header ) From 175ffcfc7b923d8be80c1da8ea4f23641200f701 Mon Sep 17 00:00:00 2001 From: dltamayo Date: Fri, 26 Dec 2025 19:22:58 -0500 Subject: [PATCH 02/14] Refactor compare_calc - Removed utils.py code, placed functions into compare_calc.py - Refactored compare_calc.py for efficient access of dataframes - Fixed bug where NaN `junction_aa` values were not dropped prior to calculation of Jaccard and Sorensen matrices --- bin/compare_calc.py | 269 +++++++++++++++++++++++--------------------- bin/utils.py | 85 -------------- 2 files changed, 143 insertions(+), 211 deletions(-) delete mode 100755 bin/utils.py diff --git a/bin/compare_calc.py b/bin/compare_calc.py index b86700c..81a83ef 100755 --- a/bin/compare_calc.py +++ b/bin/compare_calc.py @@ -1,133 +1,150 @@ #!/usr/bin/env python3 """ -Description: this script calculates overlap measures between TCR repertoires - -@author: Domenick Braccia +Description: Calculate overlap measures between TCR repertoires +Author: Dylan Tamayo, Domenick Braccia """ import argparse import pandas as pd import numpy as np -import os -import sys -import csv -from scipy.stats import entropy -from utils import jaccard_index, sorensen_index, morisita_horn_index #, jensen_shannon_distance - -print('-- ENTERED compare_calc.py--') -print('-- THE TIME IS: --' + str(pd.Timestamp.now())) - -# initialize parser -parser = argparse.ArgumentParser(description='Calculate clonality of a TCR 
repertoire') - -# add arguments -parser.add_argument('-s', '--sample_utf8', - metavar='sample_utf8', - type=str, - help='sample CSV file initially passed to nextflow run command') -# parser.add_argument('-m', '--meta_data', -# metavar='meta_data', -# type=str, -# help='metadata CSV file initially passed to nextflow run command') - -args = parser.parse_args() - -## Read in sample table CSV file -## convert metadata to list -s = args.sample_utf8 -sample_utf8 = pd.read_csv(args.sample_utf8, sep=',', header=0) -print('sample_utf8 looks like this: ' + str(sample_utf8)) -print('sample_utf8 columns: \n') -print(sample_utf8.columns) - -# Read in metadata table CSV file -# meta_data = pd.read_csv(args.meta_data, sep=',', header=0) -# print('meta_data looks like this: ' + str(meta_data)) -# print('meta_data columns: \n') -# print(meta_data.columns) - -# Import TCR count tables into dictionary of dataframes -files = sample_utf8['file'] -dfs = {} -for file in files: - # load data - df = pd.read_csv(file, sep='\t', header=0) - dfs[file] = df - -print('number of files in dfs: ' + str(len(dfs))) - -## calculate the jaccard index between each sample pair in dfs and store in an nxn matrix and write to file -samples = list(dfs.keys()) - -print('- calculating jaccard index... -') -jaccard_mat = np.zeros((len(samples), len(samples))) -for i, sample1 in enumerate(samples): - for j, sample2 in enumerate(samples): - # calculate jaccard index - value = jaccard_index(dfs[sample1]['junction_aa'], dfs[sample2]['junction_aa']) - # store in numpy array - jaccard_mat[i, j] = value - -# define column and index names -sample_names= [os.path.basename(sample).split('.')[0] for sample in samples] -jaccard_df = pd.DataFrame(jaccard_mat, columns=sample_names, index=sample_names) - -# save jacard_df to csv -jaccard_df.to_csv('jaccard_mat.csv', index=True, header=True) - -## calculate the sorensen index between each sample pair in dfs and store in an nxn matrix and write to file -print('- calculating sorensen index... -') -sorensen_mat = np.zeros((len(samples), len(samples))) -for i, sample1 in enumerate(samples): - for j, sample2 in enumerate(samples): - # calculate sorensen index - value = sorensen_index(dfs[sample1]['junction_aa'], dfs[sample2]['junction_aa']) - # store in numpy array - sorensen_mat[i, j] = value - -# define column and index names -sorensen_df = pd.DataFrame(sorensen_mat, columns=sample_names, index=sample_names) - -# save sorensen_df to csv -sorensen_df.to_csv('sorensen_mat.csv', index=True, header=True) - -## calculate the morisita index between each sample pair in dfs and store in an nxn matrix and write to file -print('- calculating morisita index... -') -morisita_mat = np.zeros((len(samples), len(samples))) -for i in range(len(samples)): - print('-- on sample ' + str(i) + ' --') - for j in range(i+1): - # calculate morisita index - value = morisita_horn_index(dfs, samples[i], samples[j]) - # store in numpy array - morisita_mat[i, j] = value - -# Copy the lower triangle to the upper triangle -morisita_mat = morisita_mat + morisita_mat.T - np.diag(morisita_mat.diagonal()) - -# define column and index names -morisita_df = pd.DataFrame(morisita_mat, columns=sample_names, index=sample_names) - -# save morisita_df to csv -morisita_df.to_csv('morisita_mat.csv', index=True, header=True) - -## calculate jensen shannon distance between each sample pair in dfs and store in an nxn matrix and write to file -# print('- calculating jensen shannon distance... 
-') -# jsd_mat = np.zeros((len(samples), len(samples))) -# for i, sample1 in enumerate(samples): -# for j, sample2 in enumerate(samples): -# # calculate jensen shannon distance -# value = jensen_shannon_distance(dfs[sample1][['junction_aa', 'duplicate_count']], dfs[sample2][['junction_aa', 'duplicate_count']]) -# # store in numpy array -# jsd_mat[i, j] = value - -# # Copy the lower triangle to the upper triangle -# jsd_mat = jsd_mat + jsd_mat.T - np.diag(jsd_mat.diagonal()) - -# # define column and index names -# jsd_df = pd.DataFrame(jsd_mat, columns=sample_names, index=sample_names) - -# # save jsd_df to csv -# jsd_df.to_csv('jsd_mat.csv', index=True, header=True) - -## ========================================================================== ## + +# ------------------------- +# Similarity functions +# ------------------------- +def jaccard_index(set1, set2): + union = len(set1 | set2) + return len(set1 & set2) / union if union else 0.0 + + +def sorensen_index(set1, set2): + denom = len(set1) + len(set2) + return (2 * len(set1 & set2) / denom) if denom else 0.0 + + +def morisita_horn_index(counts1, counts2): + X = counts1.sum() + Y = counts2.sum() + + if X == 0 or Y == 0: + return 0.0 + + prod_sum = np.sum(counts1 * counts2) + lambda1 = np.sum(counts1 ** 2) / (X ** 2) + lambda2 = np.sum(counts2 ** 2) / (Y ** 2) + + return (2 * prod_sum) / ((lambda1 + lambda2) * X * Y) + +if __name__ == "__main__": + # ------------------------- + # Argument parsing + # ------------------------- + parser = argparse.ArgumentParser( + description="Calculate overlap metrics for TCR repertoires" + ) + parser.add_argument( + "-s", "--sample_utf8", + required=True, + help="Samplesheet CSV passed from Nextflow" + ) + args = parser.parse_args() + + + # ------------------------- + # Load samplesheet + # ------------------------- + sample_df = pd.read_csv(args.sample_utf8) + + samples = sample_df["sample"].tolist() + files = sample_df["file"].tolist() + n = len(samples) + + print(f"Loaded {n} samples") + + # ------------------------- + # Preload data structures + # ------------------------- + junction_sets = {} + count_vectors = {} + + for sample, file in zip(samples, files): + df = pd.read_csv(file, sep="\t", usecols=["junction_aa", "duplicate_count"]) + df = df.dropna(subset=["junction_aa"]) + + # Set for presence/absence metrics + junction_sets[sample] = set(df["junction_aa"]) + + # Counts for Morisita–Horn + count_vectors[sample] = ( + df.groupby("junction_aa")["duplicate_count"] + .sum() + ) + + + # ------------------------- + # Align count vectors across union space + # ------------------------- + all_junctions = sorted( + set().union(*junction_sets.values()) + ) + + for sample in samples: + count_vectors[sample] = ( + count_vectors[sample] + .reindex(all_junctions, fill_value=0) + .to_numpy() + ) + + + # ------------------------- + # Initialize matrices + # ------------------------- + jaccard_mat = np.zeros((n, n)) + sorensen_mat = np.zeros((n, n)) + morisita_mat = np.zeros((n, n)) + + + # ------------------------- + # Compute upper triangle only + # ------------------------- + print("Calculating overlap metrics...") + + for i in range(n): + s1 = samples[i] + set1 = junction_sets[s1] + counts1 = count_vectors[s1] + + # Diagonal + jaccard_mat[i, i] = 1.0 + sorensen_mat[i, i] = 1.0 + morisita_mat[i, i] = 1.0 + + for j in range(i + 1, n): + s2 = samples[j] + + j_val = jaccard_index(set1, junction_sets[s2]) + s_val = sorensen_index(set1, junction_sets[s2]) + m_val = morisita_horn_index(counts1, 
count_vectors[s2]) + + jaccard_mat[i, j] = jaccard_mat[j, i] = j_val + sorensen_mat[i, j] = sorensen_mat[j, i] = s_val + morisita_mat[i, j] = morisita_mat[j, i] = m_val + + + # ------------------------- + # Write outputs + # ------------------------- + index_names = samples + + pd.DataFrame( + jaccard_mat, index=index_names, columns=index_names + ).to_csv("jaccard_mat.csv") + + pd.DataFrame( + sorensen_mat, index=index_names, columns=index_names + ).to_csv("sorensen_mat.csv") + + pd.DataFrame( + morisita_mat, index=index_names, columns=index_names + ).to_csv("morisita_mat.csv") + + print("Finished writing all matrices") \ No newline at end of file diff --git a/bin/utils.py b/bin/utils.py deleted file mode 100755 index f4391f1..0000000 --- a/bin/utils.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 - -""" -Description: utility functions for plotting simple TCR repertoire statistics - -Authors: Domenick Braccia -""" - -## import packages -import time -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns -from scipy.spatial import distance - -def TicTocGenerator(): - # Generator that returns time differences - ti = 0 # initial time - tf = time.time() # final time - while True: - ti = tf - tf = time.time() - yield tf-ti # returns the time difference - -TicToc = TicTocGenerator() # create an instance of the TicTocGen generator - -# This will be the main function through which we define both tic() and toc() -def toc(tempBool=True): - # Prints the time difference yielded by generator instance TicToc - tempTimeInterval = next(TicToc) - if tempBool: - print( "Elapsed time: %f seconds.\n" %tempTimeInterval ) - -def tic(): - # Records a time in TicToc, marks the beginning of a time interval - toc(False) - -# Defining sample comparison functions -def jaccard_index(sample1, sample2): - set1 = set(sample1) - set2 = set(sample2) - intersection = len(set1.intersection(set2)) - union = len(set1.union(set2)) - return intersection / union - -def sorensen_index(sample1, sample2): - set1 = set(sample1) - set2 = set(sample2) - intersection = len(set1.intersection(set2)) - return 2 * intersection / (len(set1) + len(set2)) - -def morisita_horn_index(dfs, sample1, sample2): - # create sets of amino acid sequences - set1 = set(dfs[sample1]['junction_aa']) - set2 = set(dfs[sample2]['junction_aa']) - - # identify union of sets - union = set1.union(set2) - - # get counts of aa sequences in sample1 and sample2 - df1 = dfs[sample1].groupby('junction_aa')['duplicate_count'].sum().reindex(union).fillna(0) - df2 = dfs[sample2].groupby('junction_aa')['duplicate_count'].sum().reindex(union).fillna(0) - n1i = df1.values - n2i = df2.values - - # calculate product of counts - products = n1i * n2i - - # calculate simpson index values for sample1 and sample2 - print(type(df1)) - X = df1.sum() - Y = df2.sum() - - s1_si = sum(count**2 for count in df1)/(X**2) - s2_si = sum(count**2 for count in df2)/(Y**2) - - numerator = 2 * sum(products) - denominator = (s1_si + s2_si) * (X * Y) - return numerator / denominator - -def jensen_shannon_distance(sample1, sample2): - # Merge the two samples based on junction_aa column - merged = pd.merge(sample1, sample2, on='junction_aa', how='outer', suffixes=('_1', '_2')).fillna(0) - # Enter probability distributions into the distance function - return distance.jensenshannon(merged['duplicate_count_1'], merged['duplicate_count_2']) \ No newline at end of file From 8433d77b6222e2c47f19bfab14693a611b70375c Mon Sep 17 00:00:00 2001 From: dltamayo Date: Mon, 29 Dec 2025 
11:24:49 -0500
Subject: [PATCH 03/14] Address linting issues, add param validation

Reformatted code to address new Nextflow linting issues for strict syntax
compatibility, namely:
- moving statements into workflows
- making implicit closure parameters explicit
- removing use of projectDir in processes

Added nf-schema for parameter validation
---
 conf/base.config                          |  34 +++-
 main.nf                                   |  12 --
 modules/local/compare/compare_plot.nf     |   1 -
 modules/local/compare/gliph2.nf           |   1 -
 modules/local/sample/sample_aggregate.nf  |   2 +-
 modules/local/sample/sample_plot.nf       |   1 -
 modules/local/sample/tcrdist3.nf          |  38 ++---
 nextflow.config                           |  42 +----
 nextflow_schema.json                      | 186 ++++++++++++++++++++++
 notebooks/compare_stats_template.qmd      |  34 +---
 notebooks/gliph2_report_template.qmd      |   2 -
 notebooks/sample_stats_template.qmd       |   9 +-
 subworkflows/local/input_check.nf         |   2 +-
 subworkflows/local/resolve_samplesheet.nf |   2 +-
 subworkflows/local/sample.nf              |  24 ++-
 subworkflows/local/validate_params.nf     |   8 +
 workflows/tcrtoolkit.nf                   |  27 +---
 17 files changed, 269 insertions(+), 156 deletions(-)
 create mode 100644 nextflow_schema.json
 create mode 100644 subworkflows/local/validate_params.nf

diff --git a/conf/base.config b/conf/base.config
index daf7fa4..7129697 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -9,7 +9,6 @@
 */

 process {
-    // TODO nf-core: Check the defaults for all processes
     cpus = { check_max( 1 * task.attempt, 'cpus' ) }
     memory = { check_max( 4.GB * task.attempt, 'memory' ) }
@@ -63,4 +62,37 @@ process {
         maxRetries = 2
     }

+}
+
+// Function to ensure that resource requirements don't go beyond
+// a maximum limit
+def check_max(obj, type) {
+    if (type == 'memory') {
+        try {
+            if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
+                return params.max_memory as nextflow.util.MemoryUnit
+            else
+                return obj
+        } catch (all) {
+            println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj"
+            return obj
+        }
+    } else if (type == 'time') {
+        try {
+            if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
+                return params.max_time as nextflow.util.Duration
+            else
+                return obj
+        } catch (all) {
+            println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj"
+            return obj
+        }
+    } else if (type == 'cpus') {
+        try {
+            return Math.min( obj, params.max_cpus as int )
+        } catch (all) {
+            println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
+            return obj
+        }
+    }
+}
\ No newline at end of file
diff --git a/main.nf b/main.nf
index a09fdb0..c84b10f 100644
--- a/main.nf
+++ b/main.nf
@@ -27,18 +27,6 @@ workflow {
     TCRTOOLKIT()
 }

-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    COMPLETION EMAIL AND SUMMARY
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-workflow.onComplete {
-
-    log.info(workflow.success ? "All done!"
: "Please check your inputs.") - -} - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END diff --git a/modules/local/compare/compare_plot.nf b/modules/local/compare/compare_plot.nf index 471ddaa..460f536 100644 --- a/modules/local/compare/compare_plot.nf +++ b/modules/local/compare/compare_plot.nf @@ -24,7 +24,6 @@ process COMPARE_PLOT { quarto render compare_stats.qmd \ -P project_name:$project_name \ -P workflow_cmd:'$workflow.commandLine' \ - -P project_dir:$projectDir \ -P jaccard_mat:$jaccard_mat \ -P sorensen_mat:$sorensen_mat \ -P morisita_mat:$morisita_mat \ diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf index 486442d..1a45e4c 100644 --- a/modules/local/compare/gliph2.nf +++ b/modules/local/compare/gliph2.nf @@ -77,7 +77,6 @@ process GLIPH2_PLOT { quarto render gliph2_report.qmd \ -P project_name:$params.project_name \ -P workflow_cmd:'$workflow.commandLine' \ - -P project_dir:$projectDir \ -P results_dir:'./' \ # -P clusters:$cluster_member_details \ diff --git a/modules/local/sample/sample_aggregate.nf b/modules/local/sample/sample_aggregate.nf index bb4a293..ea7b43a 100644 --- a/modules/local/sample/sample_aggregate.nf +++ b/modules/local/sample/sample_aggregate.nf @@ -15,7 +15,7 @@ process SAMPLE_AGGREGATE { python3 < '"' + input_file.getName() + '"' }.join(', ')}] dfs = [pd.read_csv(f) for f in input_files] merged = pd.concat(dfs, axis=0, ignore_index=True) merged.to_csv("${output_file}", index=False) diff --git a/modules/local/sample/sample_plot.nf b/modules/local/sample/sample_plot.nf index 5ca0494..3ae48a1 100644 --- a/modules/local/sample/sample_plot.nf +++ b/modules/local/sample/sample_plot.nf @@ -22,7 +22,6 @@ process SAMPLE_PLOT { quarto render sample_stats.qmd \ -P project_name:$params.project_name \ -P workflow_cmd:'$workflow.commandLine' \ - -P project_dir:$projectDir \ -P sample_table:$sample_table \ -P sample_stats_csv:$sample_stats_csv \ -P v_family_csv:$v_family_csv \ diff --git a/modules/local/sample/tcrdist3.nf b/modules/local/sample/tcrdist3.nf index b37aab5..67dd2b0 100644 --- a/modules/local/sample/tcrdist3.nf +++ b/modules/local/sample/tcrdist3.nf @@ -3,30 +3,21 @@ process TCRDIST3_MATRIX { container "ghcr.io/karchinlab/tcrtoolkit:main" cpus { - if (task.memory > 256.GB) - return 16 * task.attempt - else if (task.memory > 64.GB) - return 8 * task.attempt - else if (task.memory > 4.GB) - return 4 * task.attempt - else - return 2 * task.attempt - } + task.memory > 256.GB ? 16 * task.attempt: + task.memory > 64.GB ? 8 * task.attempt: + task.memory > 4.GB ? 4 * task.attempt: + 2 * task.attempt + } + + memory { - def sz = count_table.size() - def mb = 1024 * 1024 - if (sz > 26 * mb) - return 512.GB * task.attempt - else if (sz > 20 * mb) - return 256.GB * task.attempt - else if (sz > 10 * mb) - return 128.GB * task.attempt - else if (sz > 4 * mb) - return 64.GB * task.attempt - else if (sz > 2 * mb) - return 16.GB * task.attempt - else - return 4.GB * task.attempt + size -> count_table.size() + count_table.size() > 26 * 1024**2 ? 512.GB * task.attempt: + count_table.size() > 20 * 1024**2 ? 256.GB * task.attempt: + count_table.size() > 10 * 1024**2 ? 128.GB * task.attempt: + count_table.size() > 4 * 1024**2 ? 64.GB * task.attempt: + count_table.size() > 2 * 1024**2 ? 
16.GB * task.attempt: + 4.GB * task.attempt } input: @@ -42,7 +33,6 @@ process TCRDIST3_MATRIX { script: """ - # Run tcrdist3 on input tcrdist3_matrix.py ${count_table} ${sample_meta.sample} ${matrix_sparsity} ${distance_metric} ${ref_db} ${task.cpus} """ } diff --git a/nextflow.config b/nextflow.config index f685ead..b569dcb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -7,7 +7,14 @@ docker { // Load base.config by default for all pipelines includeConfig 'conf/base.config' +plugins { + id 'nf-schema@2.6.1' +} + params { + samplesheet = null + outdir = 'out' + publish_dir_mode = 'copy' // Max resource options @@ -16,7 +23,7 @@ params { max_cpus = 192 max_time = '48.h' - input_format = "airr" + input_format = "airr" // cellranger, adaptive airr_schema = "${projectDir}/assets/airr/airr_rearrangement_schema.json" imgt_lookup = "${projectDir}/assets/airr/imgt_adaptive_lookup.tsv" @@ -57,36 +64,3 @@ params { } includeConfig 'conf/modules.config' - -// Function to ensure that resource requirements don't go beyond -// a maximum limit -def check_max(obj, type) { - if (type == 'memory') { - try { - if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) - return params.max_memory as nextflow.util.MemoryUnit - else - return obj - } catch (all) { - println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'time') { - try { - if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) - return params.max_time as nextflow.util.Duration - else - return obj - } catch (all) { - println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'cpus') { - try { - return Math.min( obj, params.max_cpus as int ) - } catch (all) { - println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" - return obj - } - } -} \ No newline at end of file diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 0000000..c2a202f --- /dev/null +++ b/nextflow_schema.json @@ -0,0 +1,186 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/break-through-cancer/tcrtoolkit-pipeline/main/nextflow_schema.json", + "title": "tcrtoolkit pipeline parameters", + "description": "BTC TCR Toolkit pipeline", + "type": "object", + + "$defs": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["samplesheet", "outdir"], + "properties": { + "samplesheet": { + "type": "string", + "format": "file-path", + "pattern": ".*.csv$", + "description": "Path to the samplesheet describing input AIRR data.", + "help_text": "A CSV of samples and metadata for this TCR analysis.", + "fa_icon": "fas fa-file-csv" + }, + "outdir": { + "type": "string", + "format": "directory-path", + "default": "out", + "description": "Output directory where results will be saved.", + "fa_icon": "fas fa-folder-open" + } + } + }, + + "resource_options": { + "title": "Max resource options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "properties": { + "max_cpus": { + "type": "integer", + "default": 192, + "description": "Maximum CPUs that can be requested by any process.", + "fa_icon": "fas fa-microchip" + }, + "max_memory": { + "type": "string", + "default": "768.GB", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "description": "Maximum memory for any process.", + "fa_icon": "fas fa-memory" + }, + "max_time": { + "type": "string", + "default": "48.h", + "description": "Maximum walltime for any job.", + "fa_icon": "far fa-clock" + } + } + }, + + "workflow_options": { + "title": "Workflow parameters", + "type": "object", + "fa_icon": "fas fa-project-diagram", + "description": "General pipeline workflow settings.", + "properties": { + "workflow_level": { + "type": "string", + "default": "sample,compare", + "enum": ["sample,compare", "sample", "compare", "convert"], + "description": "Comma-separated workflow stages (sample, compare)." + }, + "project_name": { + "type": "string", + "description": "Name of this analysis project." + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "enum": ["copy", "move", "link", "symlink"], + "description": "Method used by `publishDir` to save outputs." + } + } + }, + + "airr_options": { + "title": "AIRR data options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Parameters related to AIRR format and schema references.", + "properties": { + "input_format": { + "type": "string", + "default": "airr", + "enum": ["airr", "adaptive", "cellranger"], + "description": "Input data format." + }, + "airr_schema": { + "type": "string", + "description": "Path to AIRR rearrangement schema JSON." + }, + "imgt_lookup": { + "type": "string", + "description": "Path to imgt lookup table." + }, + "sample_stats_template": { + "type": "string", + "description": "Path to sample notebook template." + }, + "compare_stats_template": { + "type": "string", + "description": "Path to compare notebook template." 
+ } + } + }, + + "plotting_options": { + "title": "Plotting and metadata options", + "type": "object", + "fa_icon": "fas fa-chart-bar", + "description": "Parameters for plotting and metadata columns.", + "properties": { + "samplechart_x_col": { "type": "string", "default": "timepoint" }, + "samplechart_color_col": { "type": "string", "default": "origin" }, + "vgene_subject_col": { "type": "string", "default": "subject_id" }, + "vgene_x_cols": { "type": "string", "default": "origin,timepoint" } + } + }, + + "giana_options": { + "title": "GIANA clustering options", + "type": "object", + "fa_icon": "fas fa-brain", + "properties": { + "threshold": { "type": "number", "default": 7.0 }, + "threshold_score": { "type": "number", "default": 3.6 }, + "threshold_vgene": { "type": "number", "default": 3.7 } + } + }, + + "gliph2_options": { + "title": "GLIPH2 clustering options", + "type": "object", + "fa_icon": "fas fa-code-branch", + "properties": { + "gliph2_report_template": { "type": "string" }, + "ref_files": { "type": "string" }, + "local_min_pvalue": { "type": "string", "default": "0.001" }, + "p_depth": { "type": "string", "default": "1000" }, + "global_convergence_cutoff": { "type": "string", "default": "1" }, + "simulation_depth": { "type": "string", "default": "1000" }, + "kmer_min_depth": { "type": "string", "default": "3" }, + "local_min_OVE": { "type": "string", "default": "c(1000, 100, 10)" }, + "algorithm": { "type": "string", "default": "GLIPH2" }, + "all_aa_interchangeable": { "type": "string", "default": "1" } + } + }, + + "tcrdist3_options": { + "title": "TCRdist3 distance options", + "type": "object", + "fa_icon": "fas fa-ruler-combined", + "properties": { + "matrix_sparsity": { + "type": "string", + "default": "sparse", + "enum": ["sparse", "full"] + }, + "distance_metric": { "type": "string", "default": "tcrdist" }, + "db_path": { "type": "string" } + } + } + }, + + "allOf": [ + { "$ref": "#/$defs/input_output_options" }, + { "$ref": "#/$defs/resource_options" }, + { "$ref": "#/$defs/workflow_options" }, + { "$ref": "#/$defs/airr_options" }, + { "$ref": "#/$defs/plotting_options" }, + { "$ref": "#/$defs/giana_options" }, + { "$ref": "#/$defs/gliph2_options" }, + { "$ref": "#/$defs/tcrdist3_options" } + ] +} \ No newline at end of file diff --git a/notebooks/compare_stats_template.qmd b/notebooks/compare_stats_template.qmd index 4df9221..01b2c66 100644 --- a/notebooks/compare_stats_template.qmd +++ b/notebooks/compare_stats_template.qmd @@ -30,7 +30,6 @@ Thank you for using TCRtoolkit! This report is generated from sample data and me #Default inputs are overwritten at the command line in `modules/local/plot_sample.nf` workflow_cmd='' project_name='path/to/project_name' -project_dir='path/to/project_dir' jaccard_mat='path/to/jaccard_mat.csv' sorensen_mat='path/to/sorensen_mat.csv' morisita_mat='path/to/morisita_mat.csv' @@ -57,43 +56,18 @@ import seaborn as sns print('Pipeline information and parameters:' + '\n') print('Project Name: ' + project_name) print('Workflow command: ' + workflow_cmd) -print('Pipeline Directory: ' + project_dir) print('Date and time: ' + str(datetime.datetime.now())) -# 3. Importing custom plotting functions -## Plotting functions are defined in `bin/utils.py`. -# sys.path.append(project_dir + '/bin/') -# source_file = os.path.join(project_dir, 'bin', 'utils.py') -# destination_file = os.path.join(os.getcwd(), 'utils.py') -# shutil.copyfile(source_file, destination_file) -# from utils import TicTocGenerator, tic, toc -# TicToc = TicTocGenerator() - -# 4. 
Importing similarity data -## 4a. jaccard similarity matrix +# 3. Importing similarity data +## 3a. jaccard similarity matrix jaccard_df = pd.read_csv(jaccard_mat, sep=',', header=0, index_col=0) -## 4b. sorensen similarity matrix +## 3b. sorensen similarity matrix sorensen_df = pd.read_csv(sorensen_mat, sep=',', header=0, index_col=0) -## 4c. morisita similarity matrix +## 3c. morisita similarity matrix morisita_df = pd.read_csv(morisita_mat, sep=',', header=0, index_col=0) -## 4d. jensen-shannon matrix -# jsd_df = pd.read_csv(jsd_mat, sep=',', header=0, index_col=0) - -# 5. Importing sample level counts -# sample_utf8 = pd.read_csv(sample_utf8, sep=',', header=0, index_col=0) -# files = sample_utf8['file'] -# dfs = {} -# for file in files: -# # load data -# df = pd.read_csv(file, sep='\t', header=0) - -# # Rename columns -# df = df.rename(columns={'count (templates/reads)': 'read_count', 'frequencyCount (%)': 'frequency'}) -# sample_id = os.path.basename(file).split('.')[0] -# dfs[sample_id] = df ``` # Analysis diff --git a/notebooks/gliph2_report_template.qmd b/notebooks/gliph2_report_template.qmd index 5d48831..e39d248 100644 --- a/notebooks/gliph2_report_template.qmd +++ b/notebooks/gliph2_report_template.qmd @@ -30,7 +30,6 @@ Thank you for using TCRtoolkit! This report is generated from sample data and me #Default inputs are overwritten at the command line in `modules/local/plot_gliph2.nf` workflow_cmd='' project_name='path/to/project_name' -project_dir='path/to/project_dir' clusters='path/to/{project_name}_cluster.csv' cluster_stats='path/to/{project_name}_cluster.txt' ``` @@ -54,5 +53,4 @@ import seaborn as sns print('Pipeline information and parameters:' + '\n') print('Project Name: ' + project_name) print('Workflow command: ' + workflow_cmd) -print('Pipeline Directory: ' + project_dir) print('Date and time: ' + str(datetime.datetime.now())) diff --git a/notebooks/sample_stats_template.qmd b/notebooks/sample_stats_template.qmd index 5a97598..a85d7ef 100644 --- a/notebooks/sample_stats_template.qmd +++ b/notebooks/sample_stats_template.qmd @@ -32,7 +32,6 @@ Thank you for using TCRtoolkit! This report is generated from sample data and me workflow_cmd='' project_name='' -project_dir='' sample_table='' sample_stats_csv='' v_family_csv='' @@ -70,15 +69,9 @@ warnings.filterwarnings( print('Project Name: ' + project_name) print('Workflow command: ' + workflow_cmd) -print('Pipeline Directory: ' + project_dir) print('Date and time: ' + str(datetime.datetime.now())) -# 3. Importing custom plotting functions -# sys.path.append(project_dir + '/bin/') -# from utils import TicTocGenerator, tic, toc -# TicToc = TicTocGenerator() - -# 4. Loading data +# 3. 
Loading data ## reading sample metadata meta = pd.read_csv(sample_table, sep=',') meta_cols = meta.columns.tolist() diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 08a2fdb..b197003 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -19,7 +19,7 @@ workflow INPUT_CHECK { samplesheet_utf8 .splitCsv(header: true, sep: ',') .map { row -> - def meta = row.findAll { k, v -> k != 'file' } // everything except the file column + def meta = row.findAll { k, _v -> k != 'file' } // everything except the file column def file_obj = file(row.file) return [meta, file_obj] } diff --git a/subworkflows/local/resolve_samplesheet.nf b/subworkflows/local/resolve_samplesheet.nf index a9f03a5..e6d8c06 100644 --- a/subworkflows/local/resolve_samplesheet.nf +++ b/subworkflows/local/resolve_samplesheet.nf @@ -28,7 +28,7 @@ workflow RESOLVE_SAMPLESHEET { .splitCsv(header: true, sep: ',') .first() .map { row -> - def header = row.keySet().findAll { it != 'file' } + ['file'] + def header = row.keySet().findAll { header_col -> header_col != 'file' } + ['file'] return header.join(',') // <-- convert to string } .set { resolved_header } diff --git a/subworkflows/local/sample.nf b/subworkflows/local/sample.nf index 9278537..c63bbe6 100644 --- a/subworkflows/local/sample.nf +++ b/subworkflows/local/sample.nf @@ -61,13 +61,11 @@ workflow SAMPLE { ) TCRDIST3_MATRIX.out.max_matrix_value - .map { it.text.trim().toDouble() } + .map { tcrdist_xmax -> tcrdist_xmax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { global_x_max_value } - - // Use `global_max_value` in downstream processes or print it - global_x_max_value.view { "Global x max matrix value: $it" } + global_x_max_value.view { global_xmax -> "Global x max matrix value: $global_xmax" } TCRDIST3_HISTOGRAM_CALC( TCRDIST3_MATRIX.out.tcrdist_output, @@ -77,13 +75,11 @@ workflow SAMPLE { ) TCRDIST3_HISTOGRAM_CALC.out.max_histogram_count - .map { it.text.trim().toDouble() } + .map { tcrdist_ymax -> tcrdist_ymax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { global_y_max_value } - - // Use `global_max_value` in downstream processes or print it - global_y_max_value.view { "Global y max matrix value: $it" } + global_y_max_value.view { global_ymax -> "Global y max matrix value: $global_ymax" } TCRDIST3_HISTOGRAM_PLOT( TCRDIST3_HISTOGRAM_CALC.out.histogram_data, @@ -93,27 +89,27 @@ workflow SAMPLE { OLGA_PGEN_CALC ( sample_map ) OLGA_PGEN_CALC.out.olga_xmin - .map { it.text.trim().toDouble() } + .map { xmin -> xmin.text.trim().toDouble() } .collect() .map { values -> values.min() } .set { olga_x_min_value } - olga_x_min_value.view { "Olga x min matrix value: $it" } + olga_x_min_value.view { olga_xmin -> "Olga x min matrix value: $olga_xmin" } OLGA_PGEN_CALC.out.olga_xmax - .map { it.text.trim().toDouble() } + .map { xmax -> xmax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { olga_x_max_value } - olga_x_max_value.view { "Olga x max matrix value: $it" } + olga_x_max_value.view { olga_xmax -> "Olga x max matrix value: $olga_xmax" } OLGA_HISTOGRAM_CALC ( OLGA_PGEN_CALC.out.olga_pgen, olga_x_min_value, olga_x_max_value ) OLGA_HISTOGRAM_CALC.out.olga_ymax - .map { it.text.trim().toDouble() } + .map { ymax -> ymax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { olga_y_max_value } - olga_y_max_value.view { "Olga y max matrix value: $it" } + olga_y_max_value.view { olga_ymax -> "Olga y max matrix value: 
$olga_ymax" } OLGA_HISTOGRAM_PLOT( OLGA_HISTOGRAM_CALC.out.olga_histogram, olga_y_max_value ) diff --git a/subworkflows/local/validate_params.nf b/subworkflows/local/validate_params.nf new file mode 100644 index 0000000..d92f83f --- /dev/null +++ b/subworkflows/local/validate_params.nf @@ -0,0 +1,8 @@ +include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema' + +workflow VALIDATE_PARAMS{ + + main: + validateParameters() + log.info paramsSummaryLog(workflow) +} \ No newline at end of file diff --git a/workflows/tcrtoolkit.nf b/workflows/tcrtoolkit.nf index 056a2d8..5329576 100644 --- a/workflows/tcrtoolkit.nf +++ b/workflows/tcrtoolkit.nf @@ -1,16 +1,3 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE & PRINT PARAMETER SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -// Validate pipeline parameters -def checkPathParamList = [ params.samplesheet] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } - -// Check mandatory parameters -if (params.samplesheet) { samplesheet = file(params.samplesheet) } else { exit 1, 'Samplesheet not specified. Please, provide a --samplesheet=/path/to/samplesheet.csv !' } -if (params.outdir) { outdir = params.outdir } else { exit 1, 'Output directory not specified. Please, provide a --outdir=/path/to/outdir !' } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -27,6 +14,7 @@ include { AIRR_CONVERT } from '../subworkflows/local/airr_convert' include { RESOLVE_SAMPLESHEET } from '../subworkflows/local/resolve_samplesheet' include { SAMPLE } from '../subworkflows/local/sample' include { COMPARE } from '../subworkflows/local/compare' +include { VALIDATE_PARAMS } from '../subworkflows/local/validate_params' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -36,6 +24,7 @@ include { COMPARE } from '../subworkflows/local/compare' workflow TCRTOOLKIT { + VALIDATE_PARAMS() println("Running TCRTOOLKIT workflow...") @@ -80,18 +69,6 @@ workflow TCRTOOLKIT { } } -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - COMPLETION EMAIL AND SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -// workflow.onComplete { - -// log.info(workflow.success ? "Finished tcrtoolkit!" 
: "Please check your inputs.") - -// } - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END From b2f8f561506f8520d48b40e6011e5fed4f84e19d Mon Sep 17 00:00:00 2001 From: dltamayo Date: Mon, 29 Dec 2025 11:30:25 -0500 Subject: [PATCH 04/14] Init license --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..44ec6b9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file From d50144176d3af0cd3310b9c57e95ad52b2db4aca Mon Sep 17 00:00:00 2001 From: dltamayo Date: Mon, 29 Dec 2025 11:31:33 -0500 Subject: [PATCH 05/14] Update memory specification --- modules/local/sample/tcrdist3.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/local/sample/tcrdist3.nf b/modules/local/sample/tcrdist3.nf index 67dd2b0..8fb8aa6 100644 --- a/modules/local/sample/tcrdist3.nf +++ b/modules/local/sample/tcrdist3.nf @@ -11,7 +11,6 @@ process TCRDIST3_MATRIX { memory { - size -> count_table.size() count_table.size() > 26 * 1024**2 ? 512.GB * task.attempt: count_table.size() > 20 * 1024**2 ? 256.GB * task.attempt: count_table.size() > 10 * 1024**2 ? 
128.GB * task.attempt: From 3d2500d78f6ac685a2ee7d248d793aa1f36f9142 Mon Sep 17 00:00:00 2001 From: dltamayo Date: Tue, 30 Dec 2025 11:44:58 -0500 Subject: [PATCH 06/14] Move container to .config Moving container to modules.config instead of specifying in every process --- conf/modules.config | 1 + modules/local/airr_convert/convert_adaptive.nf | 1 - modules/local/airr_convert/pseudobulk_cellranger.nf | 1 - modules/local/compare/compare_calc.nf | 1 - modules/local/compare/compare_plot.nf | 1 - modules/local/compare/giana.nf | 1 - modules/local/compare/gliph2.nf | 2 -- modules/local/compare/tcrsharing.nf | 3 --- modules/local/sample/convergence.nf | 1 - modules/local/sample/olga.nf | 4 ---- modules/local/sample/sample_aggregate.nf | 1 - modules/local/sample/sample_calc.nf | 1 - modules/local/sample/sample_plot.nf | 1 - modules/local/sample/tcrdist3.nf | 3 --- modules/local/sample/tcrpheno.nf | 1 - modules/local/sample/tcrspecificity.nf | 2 -- modules/local/samplesheet/samplesheet_check.nf | 1 - modules/local/samplesheet/samplesheet_resolve.nf | 1 - 18 files changed, 1 insertion(+), 26 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 8fc2514..3e891f7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -11,6 +11,7 @@ */ process { + container = "ghcr.io/karchinlab/tcrtoolkit:main" publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, diff --git a/modules/local/airr_convert/convert_adaptive.nf b/modules/local/airr_convert/convert_adaptive.nf index aa818cc..a476bb1 100644 --- a/modules/local/airr_convert/convert_adaptive.nf +++ b/modules/local/airr_convert/convert_adaptive.nf @@ -1,7 +1,6 @@ process CONVERT_ADAPTIVE { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/airr_convert/pseudobulk_cellranger.nf b/modules/local/airr_convert/pseudobulk_cellranger.nf index ec1916c..653b2a9 100644 --- a/modules/local/airr_convert/pseudobulk_cellranger.nf +++ b/modules/local/airr_convert/pseudobulk_cellranger.nf @@ -1,7 +1,6 @@ process PSEUDOBULK_CELLRANGER { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/compare/compare_calc.nf b/modules/local/compare/compare_calc.nf index a6d0a60..2206fdb 100644 --- a/modules/local/compare/compare_calc.nf +++ b/modules/local/compare/compare_calc.nf @@ -1,6 +1,5 @@ process COMPARE_CALC { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path sample_utf8 diff --git a/modules/local/compare/compare_plot.nf b/modules/local/compare/compare_plot.nf index 460f536..1eb5792 100644 --- a/modules/local/compare/compare_plot.nf +++ b/modules/local/compare/compare_plot.nf @@ -1,6 +1,5 @@ process COMPARE_PLOT { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path sample_utf8 diff --git a/modules/local/compare/giana.nf b/modules/local/compare/giana.nf index 561c1b3..7140448 100644 --- a/modules/local/compare/giana.nf +++ b/modules/local/compare/giana.nf @@ -1,6 +1,5 @@ process GIANA_CALC { label 'process_medium' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path concat_cdr3 diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf index 1a45e4c..68dd080 100644 --- a/modules/local/compare/gliph2.nf +++ b/modules/local/compare/gliph2.nf @@ -2,7 +2,6 @@ process 
GLIPH2_TURBOGLIPH { label 'process_high' label 'process_high_compute' label 'process_high_memory' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path concat_cdr3 @@ -53,7 +52,6 @@ process GLIPH2_TURBOGLIPH { process GLIPH2_PLOT { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path gliph2_report_template diff --git a/modules/local/compare/tcrsharing.nf b/modules/local/compare/tcrsharing.nf index ff68619..2bc5e1f 100644 --- a/modules/local/compare/tcrsharing.nf +++ b/modules/local/compare/tcrsharing.nf @@ -1,6 +1,5 @@ process TCRSHARING_CALC { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path concat_cdr3 @@ -73,7 +72,6 @@ process TCRSHARING_CALC { process TCRSHARING_HISTOGRAM { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path shared_cdr3 @@ -114,7 +112,6 @@ process TCRSHARING_HISTOGRAM { process TCRSHARING_SCATTERPLOT { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path shared_cdr3 diff --git a/modules/local/sample/convergence.nf b/modules/local/sample/convergence.nf index 1e077ac..962d041 100644 --- a/modules/local/sample/convergence.nf +++ b/modules/local/sample/convergence.nf @@ -1,7 +1,6 @@ process CONVERGENCE { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/sample/olga.nf b/modules/local/sample/olga.nf index dfc71f2..d7b76ba 100644 --- a/modules/local/sample/olga.nf +++ b/modules/local/sample/olga.nf @@ -1,7 +1,6 @@ process OLGA_PGEN_CALC { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) @@ -56,7 +55,6 @@ process OLGA_PGEN_CALC { process OLGA_HISTOGRAM_CALC { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(olga_pgen) @@ -113,7 +111,6 @@ process OLGA_HISTOGRAM_CALC { process OLGA_HISTOGRAM_PLOT { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(olga_histogram) @@ -172,7 +169,6 @@ process OLGA_HISTOGRAM_PLOT { process OLGA_WRITE_MAX { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: val olga_global_xmin diff --git a/modules/local/sample/sample_aggregate.nf b/modules/local/sample/sample_aggregate.nf index ea7b43a..ae9e952 100644 --- a/modules/local/sample/sample_aggregate.nf +++ b/modules/local/sample/sample_aggregate.nf @@ -1,7 +1,6 @@ process SAMPLE_AGGREGATE { tag "${output_file}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path csv_files diff --git a/modules/local/sample/sample_calc.nf b/modules/local/sample/sample_calc.nf index dff4c0a..d6e4e90 100644 --- a/modules/local/sample/sample_calc.nf +++ b/modules/local/sample/sample_calc.nf @@ -1,7 +1,6 @@ process SAMPLE_CALC { tag "${sample_meta.sample}" label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/sample/sample_plot.nf b/modules/local/sample/sample_plot.nf index 3ae48a1..e19dd16 100644 --- a/modules/local/sample/sample_plot.nf +++ b/modules/local/sample/sample_plot.nf @@ -2,7 +2,6 @@ process SAMPLE_PLOT { tag "${sample_stats_csv}" label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path sample_table diff --git 
a/modules/local/sample/tcrdist3.nf b/modules/local/sample/tcrdist3.nf index 8fb8aa6..6369df4 100644 --- a/modules/local/sample/tcrdist3.nf +++ b/modules/local/sample/tcrdist3.nf @@ -1,6 +1,5 @@ process TCRDIST3_MATRIX { tag "${sample_meta.sample}" - container "ghcr.io/karchinlab/tcrtoolkit:main" cpus { task.memory > 256.GB ? 16 * task.attempt: @@ -39,7 +38,6 @@ process TCRDIST3_MATRIX { process TCRDIST3_HISTOGRAM_CALC { tag "${sample_meta.sample}" label 'process_high' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(distance_matrix) @@ -120,7 +118,6 @@ process TCRDIST3_HISTOGRAM_CALC { process TCRDIST3_HISTOGRAM_PLOT { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(histogram_data) diff --git a/modules/local/sample/tcrpheno.nf b/modules/local/sample/tcrpheno.nf index e71ed63..f4be077 100644 --- a/modules/local/sample/tcrpheno.nf +++ b/modules/local/sample/tcrpheno.nf @@ -1,7 +1,6 @@ process TCRPHENO { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/sample/tcrspecificity.nf b/modules/local/sample/tcrspecificity.nf index e92cdb6..30ce184 100644 --- a/modules/local/sample/tcrspecificity.nf +++ b/modules/local/sample/tcrspecificity.nf @@ -1,6 +1,5 @@ process VDJDB_GET { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" output: path("vdjdb-2025-02-21/"), emit: ref_db @@ -15,7 +14,6 @@ process VDJDB_GET { process VDJDB_VDJMATCH { tag "${sample_meta.sample}" label 'process_medium' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/samplesheet/samplesheet_check.nf b/modules/local/samplesheet/samplesheet_check.nf index 527ed9c..04bd0d7 100644 --- a/modules/local/samplesheet/samplesheet_check.nf +++ b/modules/local/samplesheet/samplesheet_check.nf @@ -1,7 +1,6 @@ process SAMPLESHEET_CHECK { tag "${samplesheet}" label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path samplesheet diff --git a/modules/local/samplesheet/samplesheet_resolve.nf b/modules/local/samplesheet/samplesheet_resolve.nf index 3128eff..ea31239 100644 --- a/modules/local/samplesheet/samplesheet_resolve.nf +++ b/modules/local/samplesheet/samplesheet_resolve.nf @@ -1,6 +1,5 @@ process SAMPLESHEET_RESOLVE { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path samplesheet_utf8 From fb3461fd549448239287c3aea23a82b16e7bc39d Mon Sep 17 00:00:00 2001 From: dltamayo Date: Tue, 30 Dec 2025 11:46:24 -0500 Subject: [PATCH 07/14] Update dockerfile --- Dockerfile | 64 ++++++++++++++++++++++++++++++++++-------------------- env.yml | 1 + 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/Dockerfile b/Dockerfile index 148ae3a..7d5ba29 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,44 @@ -FROM condaforge/miniforge3:24.9.2-0 +FROM mambaorg/micromamba:1.5.8 -# Copy the environment file into /tmp +# Ensure we run as root for apt +USER root + +# Update the conda base environment with required packages COPY env.yml /tmp/env.yml +WORKDIR /tmp -# Install system dependencies -RUN apt-get update \ - && apt-get install -y \ - build-essential \ - curl \ - gcc \ - g++ \ +RUN apt-get update && apt-get install -y \ + # runtime CLIs (KEEP) + curl \ + wget \ + git \ + unzip \ + zip \ + jq \ + \ + # build-only deps (REMOVE LATER) + build-essential \ + gcc \ 
+ g++ \ + && micromamba install -y -n base -f /tmp/env.yml \ + && micromamba clean -afy \ + \ + # R packages (need compilers) + && micromamba run -n base Rscript -e "remotes::install_github('HetzDra/turboGliph')" \ + && micromamba run -n base Rscript -e "remotes::install_github('kalaga27/tcrpheno')" \ + \ + # R cleanup + && rm -rf /tmp/Rtmp* /root/.cache/R \ + \ + # REMOVE build deps ONLY + && apt-get purge -y \ + build-essential \ + gcc \ + g++ \ + && apt-get autoremove -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Update the conda base environment with required packages -WORKDIR /tmp -RUN conda env update -n base --file env.yml - # Install GIANA, patch shebang, symlink for PATH command availability RUN git init /opt/GIANA && \ cd /opt/GIANA && \ @@ -29,26 +51,20 @@ RUN git init /opt/GIANA && \ # Install quarto RUN mkdir -p /opt/quarto/1.6.42 \ - && curl -o quarto.tar.gz -L \ + && curl -o /tmp/quarto.tar.gz -L \ "https://github.com/quarto-dev/quarto-cli/releases/download/v1.6.42/quarto-1.6.42-linux-amd64.tar.gz" \ && tar -zxvf quarto.tar.gz \ -C "/opt/quarto/1.6.42" \ --strip-components=1 \ - && rm quarto.tar.gz + && rm /tmp/quarto.tar.gz -# Install R package not available via conda -RUN Rscript -e "remotes::install_github('HetzDra/turboGliph')" -RUN Rscript -e "remotes::install_github('kalaga27/tcrpheno')" - -# Install VDJmatch +# Install VDJmatch and symlink RUN mkdir -p /opt/vdjmatch/1.3.1 \ && curl -L -o vdjmatch.zip \ "https://github.com/antigenomics/vdjmatch/releases/download/1.3.1/vdjmatch-1.3.1.zip" \ && unzip vdjmatch.zip -d /opt/vdjmatch/1.3.1 \ - && rm vdjmatch.zip - -# symlink VDJmatch -RUN ln -s /opt/vdjmatch/1.3.1/vdjmatch-1.3.1/vdjmatch-1.3.1.jar /usr/local/bin/vdjmatch.jar + && rm vdjmatch.zip \ + && ln -s /opt/vdjmatch/1.3.1/vdjmatch-1.3.1/vdjmatch-1.3.1.jar /usr/local/bin/vdjmatch.jar # Add to PATH ENV PATH="/opt/quarto/1.6.42/bin:${PATH}" diff --git a/env.yml b/env.yml index 27ce717..331ef45 100644 --- a/env.yml +++ b/env.yml @@ -29,6 +29,7 @@ dependencies: # R and R packages - r-base=4.4.2 + - r-grr=0.9.5 - r-igraph=2.0.3 - r-pheatmap=1.0.12 - r-remotes=2.5.0 From 28efeebff8a8f7717412ed9b643e0e0bb3a2f6d3 Mon Sep 17 00:00:00 2001 From: dltamayo Date: Mon, 5 Jan 2026 12:57:58 -0500 Subject: [PATCH 08/14] Implement Copilot suggestions --- Dockerfile | 6 +++--- modules/local/sample/sample_aggregate.nf | 7 +++++-- modules/local/sample/tcrdist3.nf | 16 ++++++++-------- modules/local/samplesheet/samplesheet_resolve.nf | 4 +++- nextflow_schema.json | 3 ++- subworkflows/local/validate_params.nf | 2 +- 6 files changed, 22 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7d5ba29..3aff1f6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,8 +24,8 @@ RUN apt-get update && apt-get install -y \ && micromamba clean -afy \ \ # R packages (need compilers) - && micromamba run -n base Rscript -e "remotes::install_github('HetzDra/turboGliph')" \ - && micromamba run -n base Rscript -e "remotes::install_github('kalaga27/tcrpheno')" \ + && micromamba run -n base Rscript -e "remotes::install_github('HetzDra/turboGliph@2a5264b')" \ + && micromamba run -n base Rscript -e "remotes::install_github('kalaga27/tcrpheno@56f9372')" \ \ # R cleanup && rm -rf /tmp/Rtmp* /root/.cache/R \ @@ -53,7 +53,7 @@ RUN git init /opt/GIANA && \ RUN mkdir -p /opt/quarto/1.6.42 \ && curl -o /tmp/quarto.tar.gz -L \ "https://github.com/quarto-dev/quarto-cli/releases/download/v1.6.42/quarto-1.6.42-linux-amd64.tar.gz" \ - && tar -zxvf quarto.tar.gz \ + && tar -zxvf /tmp/quarto.tar.gz \ -C 
"/opt/quarto/1.6.42" \ --strip-components=1 \ && rm /tmp/quarto.tar.gz diff --git a/modules/local/sample/sample_aggregate.nf b/modules/local/sample/sample_aggregate.nf index ae9e952..a1c9c55 100644 --- a/modules/local/sample/sample_aggregate.nf +++ b/modules/local/sample/sample_aggregate.nf @@ -11,13 +11,16 @@ process SAMPLE_AGGREGATE { script: """ - python3 < aggregate.py < '"' + input_file.getName() + '"' }.join(', ')}] + input_files = sys.argv[1:] dfs = [pd.read_csv(f) for f in input_files] merged = pd.concat(dfs, axis=0, ignore_index=True) merged.to_csv("${output_file}", index=False) EOF + + python3 aggregate.py ${csv_files} """ } \ No newline at end of file diff --git a/modules/local/sample/tcrdist3.nf b/modules/local/sample/tcrdist3.nf index 6369df4..f592675 100644 --- a/modules/local/sample/tcrdist3.nf +++ b/modules/local/sample/tcrdist3.nf @@ -2,19 +2,19 @@ process TCRDIST3_MATRIX { tag "${sample_meta.sample}" cpus { - task.memory > 256.GB ? 16 * task.attempt: - task.memory > 64.GB ? 8 * task.attempt: - task.memory > 4.GB ? 4 * task.attempt: + task.memory > 256.GB ? 16 * task.attempt : + task.memory > 64.GB ? 8 * task.attempt : + task.memory > 4.GB ? 4 * task.attempt : 2 * task.attempt } memory { - count_table.size() > 26 * 1024**2 ? 512.GB * task.attempt: - count_table.size() > 20 * 1024**2 ? 256.GB * task.attempt: - count_table.size() > 10 * 1024**2 ? 128.GB * task.attempt: - count_table.size() > 4 * 1024**2 ? 64.GB * task.attempt: - count_table.size() > 2 * 1024**2 ? 16.GB * task.attempt: + count_table.size() > 26 * 1024**2 ? 512.GB * task.attempt : + count_table.size() > 20 * 1024**2 ? 256.GB * task.attempt : + count_table.size() > 10 * 1024**2 ? 128.GB * task.attempt : + count_table.size() > 4 * 1024**2 ? 64.GB * task.attempt : + count_table.size() > 2 * 1024**2 ? 
16.GB * task.attempt : 4.GB * task.attempt } diff --git a/modules/local/samplesheet/samplesheet_resolve.nf b/modules/local/samplesheet/samplesheet_resolve.nf index ea31239..4dc0746 100644 --- a/modules/local/samplesheet/samplesheet_resolve.nf +++ b/modules/local/samplesheet/samplesheet_resolve.nf @@ -17,7 +17,9 @@ ${resolved_rows.join('\n')} EOF # Emit header -echo "${resolved_header}" > samplesheet_resolved.csv +cat << 'EOF' > samplesheet_resolved.csv +${resolved_header} +EOF # Two-pass awk: # - pass 1: read original samplesheet, store sample order diff --git a/nextflow_schema.json b/nextflow_schema.json index c2a202f..d333039 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -46,13 +46,14 @@ "max_memory": { "type": "string", "default": "768.GB", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "pattern": "^\\d+(?:\\.\\d+|\\.)?\\s*(?:[KMGT]?B|[KMGT])$", "description": "Maximum memory for any process.", "fa_icon": "fas fa-memory" }, "max_time": { "type": "string", "default": "48.h", + "pattern": "^\\d+(\\.\\d+)?\\s*(s|m|h|d)$", "description": "Maximum walltime for any job.", "fa_icon": "far fa-clock" } diff --git a/subworkflows/local/validate_params.nf b/subworkflows/local/validate_params.nf index d92f83f..b4b2f22 100644 --- a/subworkflows/local/validate_params.nf +++ b/subworkflows/local/validate_params.nf @@ -1,6 +1,6 @@ include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema' -workflow VALIDATE_PARAMS{ +workflow VALIDATE_PARAMS { main: validateParameters() From c8f85ff404d28e43bae0b72a510a6eca672f78f4 Mon Sep 17 00:00:00 2001 From: dltamayo Date: Mon, 5 Jan 2026 13:09:42 -0500 Subject: [PATCH 09/14] Update schema --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index d333039..e81a2e5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -53,7 +53,7 @@ "max_time": { "type": "string", "default": "48.h", - "pattern": "^\\d+(\\.\\d+)?\\s*(s|m|h|d)$", + "pattern": "^\\d+(?:\\.\\d+|\\.)?\\s*(s|m|h|d)$", "description": "Maximum walltime for any job.", "fa_icon": "far fa-clock" } From 748df0f2c516aca697dd225609a51fa55d1f17a0 Mon Sep 17 00:00:00 2001 From: dimalvovs Date: Wed, 7 Jan 2026 10:07:17 -0500 Subject: [PATCH 10/14] make one container definition --- conf/base.config | 1 + modules/local/airr_convert/convert_adaptive.nf | 1 - modules/local/airr_convert/pseudobulk_cellranger.nf | 1 - modules/local/compare/compare_calc.nf | 3 +-- modules/local/compare/compare_concatenate.nf | 1 - modules/local/compare/compare_plot.nf | 1 - modules/local/compare/giana.nf | 1 - modules/local/compare/gliph2.nf | 2 -- modules/local/compare/tcrsharing.nf | 3 --- modules/local/sample/convergence.nf | 1 - modules/local/sample/olga.nf | 4 ---- modules/local/sample/sample_calc.nf | 1 - modules/local/sample/sample_plot.nf | 1 - modules/local/sample/tcrdist3.nf | 3 --- modules/local/sample/tcrpheno.nf | 1 - modules/local/sample/tcrspecificity.nf | 2 -- modules/local/samplesheet/samplesheet_check.nf | 1 - modules/local/samplesheet/samplesheet_resolve.nf | 1 - 18 files changed, 2 insertions(+), 27 deletions(-) diff --git a/conf/base.config b/conf/base.config index daf7fa4..4ce1d96 100644 --- a/conf/base.config +++ b/conf/base.config @@ -9,6 +9,7 @@ */ process { + container = "ghcr.io/karchinlab/tcrtoolkit:main" // TODO nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } diff --git a/modules/local/airr_convert/convert_adaptive.nf 
From 748df0f2c516aca697dd225609a51fa55d1f17a0 Mon Sep 17 00:00:00 2001
From: dimalvovs
Date: Wed, 7 Jan 2026 10:07:17 -0500
Subject: [PATCH 10/14] make one container definition

---
 conf/base.config                                    | 1 +
 modules/local/airr_convert/convert_adaptive.nf      | 1 -
 modules/local/airr_convert/pseudobulk_cellranger.nf | 1 -
 modules/local/compare/compare_calc.nf               | 3 +--
 modules/local/compare/compare_concatenate.nf        | 1 -
 modules/local/compare/compare_plot.nf               | 1 -
 modules/local/compare/giana.nf                      | 1 -
 modules/local/compare/gliph2.nf                     | 2 --
 modules/local/compare/tcrsharing.nf                 | 3 ---
 modules/local/sample/convergence.nf                 | 1 -
 modules/local/sample/olga.nf                        | 4 ----
 modules/local/sample/sample_calc.nf                 | 1 -
 modules/local/sample/sample_plot.nf                 | 1 -
 modules/local/sample/tcrdist3.nf                    | 3 ---
 modules/local/sample/tcrpheno.nf                    | 1 -
 modules/local/sample/tcrspecificity.nf              | 2 --
 modules/local/samplesheet/samplesheet_check.nf      | 1 -
 modules/local/samplesheet/samplesheet_resolve.nf    | 1 -
 18 files changed, 2 insertions(+), 27 deletions(-)

diff --git a/conf/base.config b/conf/base.config
index daf7fa4..4ce1d96 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -9,6 +9,7 @@
 */
 
 process {
+    container = "ghcr.io/karchinlab/tcrtoolkit:main"
 
     // TODO nf-core: Check the defaults for all processes
     cpus = { check_max( 1 * task.attempt, 'cpus' ) }
diff --git a/modules/local/airr_convert/convert_adaptive.nf b/modules/local/airr_convert/convert_adaptive.nf
index aa818cc..a476bb1 100644
--- a/modules/local/airr_convert/convert_adaptive.nf
+++ b/modules/local/airr_convert/convert_adaptive.nf
@@ -1,7 +1,6 @@
 process CONVERT_ADAPTIVE {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
diff --git a/modules/local/airr_convert/pseudobulk_cellranger.nf b/modules/local/airr_convert/pseudobulk_cellranger.nf
index ec1916c..653b2a9 100644
--- a/modules/local/airr_convert/pseudobulk_cellranger.nf
+++ b/modules/local/airr_convert/pseudobulk_cellranger.nf
@@ -1,7 +1,6 @@
 process PSEUDOBULK_CELLRANGER {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
diff --git a/modules/local/compare/compare_calc.nf b/modules/local/compare/compare_calc.nf
index a6d0a60..05b5800 100644
--- a/modules/local/compare/compare_calc.nf
+++ b/modules/local/compare/compare_calc.nf
@@ -1,7 +1,6 @@
 process COMPARE_CALC {
     label 'process_single'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
-    
+
     input:
     path sample_utf8
     path all_sample_files
diff --git a/modules/local/compare/compare_concatenate.nf b/modules/local/compare/compare_concatenate.nf
index a05e50c..c955013 100644
--- a/modules/local/compare/compare_concatenate.nf
+++ b/modules/local/compare/compare_concatenate.nf
@@ -1,6 +1,5 @@
 process COMPARE_CONCATENATE {
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path samplesheet_utf8
diff --git a/modules/local/compare/compare_plot.nf b/modules/local/compare/compare_plot.nf
index 471ddaa..478096f 100644
--- a/modules/local/compare/compare_plot.nf
+++ b/modules/local/compare/compare_plot.nf
@@ -1,6 +1,5 @@
 process COMPARE_PLOT {
     label 'process_single'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
    input:
    path sample_utf8
diff --git a/modules/local/compare/giana.nf b/modules/local/compare/giana.nf
index b062f5f..6e4a237 100644
--- a/modules/local/compare/giana.nf
+++ b/modules/local/compare/giana.nf
@@ -1,6 +1,5 @@
 process GIANA_CALC {
     label 'process_medium'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path concat_cdr3
diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf
index 36ca9f4..5958af1 100644
--- a/modules/local/compare/gliph2.nf
+++ b/modules/local/compare/gliph2.nf
@@ -2,7 +2,6 @@ process GLIPH2_TURBOGLIPH {
     label 'process_high'
     label 'process_high_compute'
     label 'process_high_memory'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path concat_cdr3
@@ -56,7 +55,6 @@
 
 process GLIPH2_PLOT {
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path gliph2_report_template
diff --git a/modules/local/compare/tcrsharing.nf b/modules/local/compare/tcrsharing.nf
index b4b9fab..34c4349 100644
--- a/modules/local/compare/tcrsharing.nf
+++ b/modules/local/compare/tcrsharing.nf
@@ -1,6 +1,5 @@
 process TCRSHARING_CALC {
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path concat_cdr3
@@ -67,7 +66,6 @@
 
 process TCRSHARING_HISTOGRAM {
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path shared_cdr3
@@ -108,7 +106,6 @@
 
 process TCRSHARING_SCATTERPLOT {
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path shared_cdr3
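Every hunk in this patch deletes the same per-module `container` line; the single definition added to `conf/base.config` now applies to every process, and Nextflow's selector syntax still allows a per-process override if one tool ever needs a different image. A minimal sketch of the resulting config shape (the `withName` override and alternate image tag are hypothetical, shown only to illustrate the escape hatch):

    process {
        container = "ghcr.io/karchinlab/tcrtoolkit:main"

        // Hypothetical override for a single process:
        withName: 'GIANA_CALC' {
            container = "ghcr.io/karchinlab/some-other-image:v1"
        }
    }
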
diff --git a/modules/local/sample/convergence.nf b/modules/local/sample/convergence.nf
index 1e077ac..962d041 100644
--- a/modules/local/sample/convergence.nf
+++ b/modules/local/sample/convergence.nf
@@ -1,7 +1,6 @@
 process CONVERGENCE {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
diff --git a/modules/local/sample/olga.nf b/modules/local/sample/olga.nf
index dfc71f2..d7b76ba 100644
--- a/modules/local/sample/olga.nf
+++ b/modules/local/sample/olga.nf
@@ -1,7 +1,6 @@
 process OLGA_PGEN_CALC {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
@@ -56,7 +55,6 @@
 process OLGA_HISTOGRAM_CALC {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(olga_pgen)
@@ -113,7 +111,6 @@
 process OLGA_HISTOGRAM_PLOT {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(olga_histogram)
@@ -172,7 +169,6 @@
 
 process OLGA_WRITE_MAX {
     label 'process_single'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     val olga_global_xmin
diff --git a/modules/local/sample/sample_calc.nf b/modules/local/sample/sample_calc.nf
index ef88d43..c989aad 100644
--- a/modules/local/sample/sample_calc.nf
+++ b/modules/local/sample/sample_calc.nf
@@ -1,7 +1,6 @@
 process SAMPLE_CALC {
     tag "${sample_meta.sample}"
     label 'process_single'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
diff --git a/modules/local/sample/sample_plot.nf b/modules/local/sample/sample_plot.nf
index 272533d..9502b06 100644
--- a/modules/local/sample/sample_plot.nf
+++ b/modules/local/sample/sample_plot.nf
@@ -2,7 +2,6 @@ process SAMPLE_PLOT {
     tag "${sample_stats_csv}"
     label 'process_single'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path sample_table
 
diff --git a/modules/local/sample/tcrdist3.nf b/modules/local/sample/tcrdist3.nf
index 9635030..963182e 100644
--- a/modules/local/sample/tcrdist3.nf
+++ b/modules/local/sample/tcrdist3.nf
@@ -1,6 +1,5 @@
 process TCRDIST3_MATRIX {
     tag "${sample_meta.sample}"
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     cpus params.max_cpus
     memory {
@@ -41,7 +40,6 @@
 process TCRDIST3_HISTOGRAM_CALC {
     tag "${sample_meta.sample}"
     label 'process_high'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(distance_matrix)
@@ -122,7 +120,6 @@
 process TCRDIST3_HISTOGRAM_PLOT {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(histogram_data)
diff --git a/modules/local/sample/tcrpheno.nf b/modules/local/sample/tcrpheno.nf
index e71ed63..f4be077 100644
--- a/modules/local/sample/tcrpheno.nf
+++ b/modules/local/sample/tcrpheno.nf
@@ -1,7 +1,6 @@
 process TCRPHENO {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
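With the container lines gone, each module declares only its resource label (`process_single`, `process_low`, and so on), and the label-to-resource mapping lives entirely in configuration. In the nf-core-style layout this repository follows, that mapping typically looks like the sketch below (the numbers are illustrative defaults; the actual values live in this repo's `conf/base.config`, which already uses the `check_max` helper shown in the hunk above):

    process {
        withLabel: 'process_single' {
            cpus   = { check_max( 1,                   'cpus'   ) }
            memory = { check_max( 6.GB * task.attempt, 'memory' ) }
        }
        withLabel: 'process_low' {
            cpus   = { check_max( 2     * task.attempt, 'cpus'   ) }
            memory = { check_max( 12.GB * task.attempt, 'memory' ) }
        }
    }
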
"ghcr.io/karchinlab/tcrtoolkit:main" output: path("vdjdb-2025-02-21/"), emit: ref_db @@ -15,7 +14,6 @@ process VDJDB_GET { process VDJDB_VDJMATCH { tag "${sample_meta.sample}" label 'process_medium' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/samplesheet/samplesheet_check.nf b/modules/local/samplesheet/samplesheet_check.nf index 527ed9c..04bd0d7 100644 --- a/modules/local/samplesheet/samplesheet_check.nf +++ b/modules/local/samplesheet/samplesheet_check.nf @@ -1,7 +1,6 @@ process SAMPLESHEET_CHECK { tag "${samplesheet}" label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path samplesheet diff --git a/modules/local/samplesheet/samplesheet_resolve.nf b/modules/local/samplesheet/samplesheet_resolve.nf index 61ca0c1..1ab9cce 100644 --- a/modules/local/samplesheet/samplesheet_resolve.nf +++ b/modules/local/samplesheet/samplesheet_resolve.nf @@ -1,6 +1,5 @@ process SAMPLESHEET_RESOLVE { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: val(resolved_rows) // List of tab-separated strings From 877a68e7fed42c11a9565976e1da515174740bbc Mon Sep 17 00:00:00 2001 From: dimalvovs Date: Wed, 7 Jan 2026 10:44:50 -0500 Subject: [PATCH 11/14] push on changed Dockerfile --- .github/workflows/build-push-container.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-push-container.yml b/.github/workflows/build-push-container.yml index 1e0a5d9..1e76253 100644 --- a/.github/workflows/build-push-container.yml +++ b/.github/workflows/build-push-container.yml @@ -4,7 +4,10 @@ name: build-push-container # Configures this workflow to run every time a change is pushed to the branch called `release`. on: push: - branches: ['main'] + paths: + - '**/Dockerfile' + - '**/*.dockerfile' + - '.github/workflows/build-push-container.yml' workflow_dispatch: # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. @@ -39,6 +42,8 @@ jobs: uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=sha,format=short,prefix= # Generates a tag like '860c190' # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. 
From a8168d86194b3d25c2a1ee5a6c2170be889dd62b Mon Sep 17 00:00:00 2001
From: dimalvovs
Date: Wed, 7 Jan 2026 11:42:09 -0500
Subject: [PATCH 12/14] add main tag for back-compatibility

---
 .github/workflows/build-push-container.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build-push-container.yml b/.github/workflows/build-push-container.yml
index 1e76253..9fa5ca6 100644
--- a/.github/workflows/build-push-container.yml
+++ b/.github/workflows/build-push-container.yml
@@ -44,6 +44,7 @@
           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
           tags: |
             type=sha,format=short,prefix= # Generates a tag like '860c190'
+            main
       # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
       # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository.
       # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.

From 26a6b29c27e96c24ef46e0c572d2c82c61afaed0 Mon Sep 17 00:00:00 2001
From: dltamayo
Date: Wed, 7 Jan 2026 14:39:12 -0500
Subject: [PATCH 13/14] Update docker, workflow

---
 .github/workflows/build-push-container.yml | 1 +
 Dockerfile                                 | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/build-push-container.yml b/.github/workflows/build-push-container.yml
index 9fa5ca6..d4975b2 100644
--- a/.github/workflows/build-push-container.yml
+++ b/.github/workflows/build-push-container.yml
@@ -5,6 +5,7 @@ name: build-push-container
 on:
   push:
     paths:
+      - '**/env.yml'
       - '**/Dockerfile'
       - '**/*.dockerfile'
       - '.github/workflows/build-push-container.yml'
diff --git a/Dockerfile b/Dockerfile
index 3aff1f6..cab497a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,6 +15,7 @@ RUN apt-get update && apt-get install -y \
     unzip \
     zip \
     jq \
+    ps \
     \
     # build-only deps (REMOVE LATER)
     build-essential \

From bfa4f504b67fba1982f4d24a937e04aad65d89ce Mon Sep 17 00:00:00 2001
From: dltamayo
Date: Wed, 7 Jan 2026 14:56:46 -0500
Subject: [PATCH 14/14] Update docker

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index cab497a..50f3cb3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,7 +15,7 @@ RUN apt-get update && apt-get install -y \
     unzip \
     zip \
     jq \
-    ps \
+    procps \
     \
     # build-only deps (REMOVE LATER)
     build-essential \
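A closing note on the last two patches: Debian ships no package named `ps`; the binary comes from `procps`, so PATCH 13's `apt-get install ... ps` would fail the image build, which PATCH 14 corrects. Having `procps` in the image also matters at runtime, since Nextflow shells out to `ps` inside task containers when collecting per-task metrics, for example when a trace report is enabled (a sketch; the output path is illustrative):

    // nextflow.config
    trace {
        enabled = true
        file    = 'results/pipeline_trace.txt'
    }
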