From 0eb97b491a924664e6bda7c9e7833df9ea2ad30d Mon Sep 17 00:00:00 2001 From: dltamayo Date: Fri, 26 Dec 2025 19:14:38 -0500 Subject: [PATCH 01/14] Retain original samplesheet sample order --- .../local/samplesheet/samplesheet_resolve.nf | 34 ++++++++++++++++--- subworkflows/local/resolve_samplesheet.nf | 1 + 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/modules/local/samplesheet/samplesheet_resolve.nf b/modules/local/samplesheet/samplesheet_resolve.nf index 61ca0c1..3128eff 100644 --- a/modules/local/samplesheet/samplesheet_resolve.nf +++ b/modules/local/samplesheet/samplesheet_resolve.nf @@ -3,6 +3,7 @@ process SAMPLESHEET_RESOLVE { container "ghcr.io/karchinlab/tcrtoolkit:main" input: + path samplesheet_utf8 val(resolved_rows) // List of tab-separated strings val(resolved_header) // Comma-separated header line @@ -11,10 +12,35 @@ process SAMPLESHEET_RESOLVE { script: """ - echo \"$resolved_header\" > samplesheet_resolved.csv +# Write resolved rows to a temp file +cat << 'EOF' > resolved.tmp +${resolved_rows.join('\n')} +EOF - for row in ${resolved_rows.collect{"\"${it}\""}.join(' ')}; do - echo -e "\$row" >> samplesheet_resolved.csv - done +# Emit header +echo "${resolved_header}" > samplesheet_resolved.csv + +# Two-pass awk: +# - pass 1: read original samplesheet, store sample order +# - pass 2: read resolved rows, store rows by sample +awk -F',' ' + NR==FNR { + if (FNR > 1) order[++n] = \$1 + next + } + { + resolved[\$1] = \$0 + } + END { + for (i = 1; i <= n; i++) { + s = order[i] + if (!(s in resolved)) { + printf "ERROR: missing resolved row for %s\\n", s > "/dev/stderr" + exit 1 + } + print resolved[s] + } + } +' "${samplesheet_utf8}" resolved.tmp >> samplesheet_resolved.csv """ } \ No newline at end of file diff --git a/subworkflows/local/resolve_samplesheet.nf b/subworkflows/local/resolve_samplesheet.nf index 98abe4d..a9f03a5 100644 --- a/subworkflows/local/resolve_samplesheet.nf +++ b/subworkflows/local/resolve_samplesheet.nf @@ -34,6 +34,7 @@ workflow RESOLVE_SAMPLESHEET { .set { resolved_header } SAMPLESHEET_RESOLVE( + samplesheet_utf8, resolved_rows, resolved_header ) From 175ffcfc7b923d8be80c1da8ea4f23641200f701 Mon Sep 17 00:00:00 2001 From: dltamayo Date: Fri, 26 Dec 2025 19:22:58 -0500 Subject: [PATCH 02/14] Refactor compare_calc - Removed utils.py code, placed functions into compare_calc.py - Refactored compare_calc.py for efficient access of dataframes - Fixed bug where NaN `junction_aa` values were not dropped prior to calculation of Jaccard and Sorensen matrices --- bin/compare_calc.py | 269 +++++++++++++++++++++++--------------------- bin/utils.py | 85 -------------- 2 files changed, 143 insertions(+), 211 deletions(-) delete mode 100755 bin/utils.py diff --git a/bin/compare_calc.py b/bin/compare_calc.py index b86700c..81a83ef 100755 --- a/bin/compare_calc.py +++ b/bin/compare_calc.py @@ -1,133 +1,150 @@ #!/usr/bin/env python3 """ -Description: this script calculates overlap measures between TCR repertoires - -@author: Domenick Braccia +Description: Calculate overlap measures between TCR repertoires +Author: Dylan Tamayo, Domenick Braccia """ import argparse import pandas as pd import numpy as np -import os -import sys -import csv -from scipy.stats import entropy -from utils import jaccard_index, sorensen_index, morisita_horn_index #, jensen_shannon_distance - -print('-- ENTERED compare_calc.py--') -print('-- THE TIME IS: --' + str(pd.Timestamp.now())) - -# initialize parser -parser = argparse.ArgumentParser(description='Calculate clonality of a TCR 
repertoire') - -# add arguments -parser.add_argument('-s', '--sample_utf8', - metavar='sample_utf8', - type=str, - help='sample CSV file initially passed to nextflow run command') -# parser.add_argument('-m', '--meta_data', -# metavar='meta_data', -# type=str, -# help='metadata CSV file initially passed to nextflow run command') - -args = parser.parse_args() - -## Read in sample table CSV file -## convert metadata to list -s = args.sample_utf8 -sample_utf8 = pd.read_csv(args.sample_utf8, sep=',', header=0) -print('sample_utf8 looks like this: ' + str(sample_utf8)) -print('sample_utf8 columns: \n') -print(sample_utf8.columns) - -# Read in metadata table CSV file -# meta_data = pd.read_csv(args.meta_data, sep=',', header=0) -# print('meta_data looks like this: ' + str(meta_data)) -# print('meta_data columns: \n') -# print(meta_data.columns) - -# Import TCR count tables into dictionary of dataframes -files = sample_utf8['file'] -dfs = {} -for file in files: - # load data - df = pd.read_csv(file, sep='\t', header=0) - dfs[file] = df - -print('number of files in dfs: ' + str(len(dfs))) - -## calculate the jaccard index between each sample pair in dfs and store in an nxn matrix and write to file -samples = list(dfs.keys()) - -print('- calculating jaccard index... -') -jaccard_mat = np.zeros((len(samples), len(samples))) -for i, sample1 in enumerate(samples): - for j, sample2 in enumerate(samples): - # calculate jaccard index - value = jaccard_index(dfs[sample1]['junction_aa'], dfs[sample2]['junction_aa']) - # store in numpy array - jaccard_mat[i, j] = value - -# define column and index names -sample_names= [os.path.basename(sample).split('.')[0] for sample in samples] -jaccard_df = pd.DataFrame(jaccard_mat, columns=sample_names, index=sample_names) - -# save jacard_df to csv -jaccard_df.to_csv('jaccard_mat.csv', index=True, header=True) - -## calculate the sorensen index between each sample pair in dfs and store in an nxn matrix and write to file -print('- calculating sorensen index... -') -sorensen_mat = np.zeros((len(samples), len(samples))) -for i, sample1 in enumerate(samples): - for j, sample2 in enumerate(samples): - # calculate sorensen index - value = sorensen_index(dfs[sample1]['junction_aa'], dfs[sample2]['junction_aa']) - # store in numpy array - sorensen_mat[i, j] = value - -# define column and index names -sorensen_df = pd.DataFrame(sorensen_mat, columns=sample_names, index=sample_names) - -# save sorensen_df to csv -sorensen_df.to_csv('sorensen_mat.csv', index=True, header=True) - -## calculate the morisita index between each sample pair in dfs and store in an nxn matrix and write to file -print('- calculating morisita index... -') -morisita_mat = np.zeros((len(samples), len(samples))) -for i in range(len(samples)): - print('-- on sample ' + str(i) + ' --') - for j in range(i+1): - # calculate morisita index - value = morisita_horn_index(dfs, samples[i], samples[j]) - # store in numpy array - morisita_mat[i, j] = value - -# Copy the lower triangle to the upper triangle -morisita_mat = morisita_mat + morisita_mat.T - np.diag(morisita_mat.diagonal()) - -# define column and index names -morisita_df = pd.DataFrame(morisita_mat, columns=sample_names, index=sample_names) - -# save morisita_df to csv -morisita_df.to_csv('morisita_mat.csv', index=True, header=True) - -## calculate jensen shannon distance between each sample pair in dfs and store in an nxn matrix and write to file -# print('- calculating jensen shannon distance... 
-') -# jsd_mat = np.zeros((len(samples), len(samples))) -# for i, sample1 in enumerate(samples): -# for j, sample2 in enumerate(samples): -# # calculate jensen shannon distance -# value = jensen_shannon_distance(dfs[sample1][['junction_aa', 'duplicate_count']], dfs[sample2][['junction_aa', 'duplicate_count']]) -# # store in numpy array -# jsd_mat[i, j] = value - -# # Copy the lower triangle to the upper triangle -# jsd_mat = jsd_mat + jsd_mat.T - np.diag(jsd_mat.diagonal()) - -# # define column and index names -# jsd_df = pd.DataFrame(jsd_mat, columns=sample_names, index=sample_names) - -# # save jsd_df to csv -# jsd_df.to_csv('jsd_mat.csv', index=True, header=True) - -## ========================================================================== ## + +# ------------------------- +# Similarity functions +# ------------------------- +def jaccard_index(set1, set2): + union = len(set1 | set2) + return len(set1 & set2) / union if union else 0.0 + + +def sorensen_index(set1, set2): + denom = len(set1) + len(set2) + return (2 * len(set1 & set2) / denom) if denom else 0.0 + + +def morisita_horn_index(counts1, counts2): + X = counts1.sum() + Y = counts2.sum() + + if X == 0 or Y == 0: + return 0.0 + + prod_sum = np.sum(counts1 * counts2) + lambda1 = np.sum(counts1 ** 2) / (X ** 2) + lambda2 = np.sum(counts2 ** 2) / (Y ** 2) + + return (2 * prod_sum) / ((lambda1 + lambda2) * X * Y) + +if __name__ == "__main__": + # ------------------------- + # Argument parsing + # ------------------------- + parser = argparse.ArgumentParser( + description="Calculate overlap metrics for TCR repertoires" + ) + parser.add_argument( + "-s", "--sample_utf8", + required=True, + help="Samplesheet CSV passed from Nextflow" + ) + args = parser.parse_args() + + + # ------------------------- + # Load samplesheet + # ------------------------- + sample_df = pd.read_csv(args.sample_utf8) + + samples = sample_df["sample"].tolist() + files = sample_df["file"].tolist() + n = len(samples) + + print(f"Loaded {n} samples") + + # ------------------------- + # Preload data structures + # ------------------------- + junction_sets = {} + count_vectors = {} + + for sample, file in zip(samples, files): + df = pd.read_csv(file, sep="\t", usecols=["junction_aa", "duplicate_count"]) + df = df.dropna(subset=["junction_aa"]) + + # Set for presence/absence metrics + junction_sets[sample] = set(df["junction_aa"]) + + # Counts for Morisita–Horn + count_vectors[sample] = ( + df.groupby("junction_aa")["duplicate_count"] + .sum() + ) + + + # ------------------------- + # Align count vectors across union space + # ------------------------- + all_junctions = sorted( + set().union(*junction_sets.values()) + ) + + for sample in samples: + count_vectors[sample] = ( + count_vectors[sample] + .reindex(all_junctions, fill_value=0) + .to_numpy() + ) + + + # ------------------------- + # Initialize matrices + # ------------------------- + jaccard_mat = np.zeros((n, n)) + sorensen_mat = np.zeros((n, n)) + morisita_mat = np.zeros((n, n)) + + + # ------------------------- + # Compute upper triangle only + # ------------------------- + print("Calculating overlap metrics...") + + for i in range(n): + s1 = samples[i] + set1 = junction_sets[s1] + counts1 = count_vectors[s1] + + # Diagonal + jaccard_mat[i, i] = 1.0 + sorensen_mat[i, i] = 1.0 + morisita_mat[i, i] = 1.0 + + for j in range(i + 1, n): + s2 = samples[j] + + j_val = jaccard_index(set1, junction_sets[s2]) + s_val = sorensen_index(set1, junction_sets[s2]) + m_val = morisita_horn_index(counts1, 
count_vectors[s2]) + + jaccard_mat[i, j] = jaccard_mat[j, i] = j_val + sorensen_mat[i, j] = sorensen_mat[j, i] = s_val + morisita_mat[i, j] = morisita_mat[j, i] = m_val + + + # ------------------------- + # Write outputs + # ------------------------- + index_names = samples + + pd.DataFrame( + jaccard_mat, index=index_names, columns=index_names + ).to_csv("jaccard_mat.csv") + + pd.DataFrame( + sorensen_mat, index=index_names, columns=index_names + ).to_csv("sorensen_mat.csv") + + pd.DataFrame( + morisita_mat, index=index_names, columns=index_names + ).to_csv("morisita_mat.csv") + + print("Finished writing all matrices") \ No newline at end of file diff --git a/bin/utils.py b/bin/utils.py deleted file mode 100755 index f4391f1..0000000 --- a/bin/utils.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 - -""" -Description: utility functions for plotting simple TCR repertoire statistics - -Authors: Domenick Braccia -""" - -## import packages -import time -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns -from scipy.spatial import distance - -def TicTocGenerator(): - # Generator that returns time differences - ti = 0 # initial time - tf = time.time() # final time - while True: - ti = tf - tf = time.time() - yield tf-ti # returns the time difference - -TicToc = TicTocGenerator() # create an instance of the TicTocGen generator - -# This will be the main function through which we define both tic() and toc() -def toc(tempBool=True): - # Prints the time difference yielded by generator instance TicToc - tempTimeInterval = next(TicToc) - if tempBool: - print( "Elapsed time: %f seconds.\n" %tempTimeInterval ) - -def tic(): - # Records a time in TicToc, marks the beginning of a time interval - toc(False) - -# Defining sample comparison functions -def jaccard_index(sample1, sample2): - set1 = set(sample1) - set2 = set(sample2) - intersection = len(set1.intersection(set2)) - union = len(set1.union(set2)) - return intersection / union - -def sorensen_index(sample1, sample2): - set1 = set(sample1) - set2 = set(sample2) - intersection = len(set1.intersection(set2)) - return 2 * intersection / (len(set1) + len(set2)) - -def morisita_horn_index(dfs, sample1, sample2): - # create sets of amino acid sequences - set1 = set(dfs[sample1]['junction_aa']) - set2 = set(dfs[sample2]['junction_aa']) - - # identify union of sets - union = set1.union(set2) - - # get counts of aa sequences in sample1 and sample2 - df1 = dfs[sample1].groupby('junction_aa')['duplicate_count'].sum().reindex(union).fillna(0) - df2 = dfs[sample2].groupby('junction_aa')['duplicate_count'].sum().reindex(union).fillna(0) - n1i = df1.values - n2i = df2.values - - # calculate product of counts - products = n1i * n2i - - # calculate simpson index values for sample1 and sample2 - print(type(df1)) - X = df1.sum() - Y = df2.sum() - - s1_si = sum(count**2 for count in df1)/(X**2) - s2_si = sum(count**2 for count in df2)/(Y**2) - - numerator = 2 * sum(products) - denominator = (s1_si + s2_si) * (X * Y) - return numerator / denominator - -def jensen_shannon_distance(sample1, sample2): - # Merge the two samples based on junction_aa column - merged = pd.merge(sample1, sample2, on='junction_aa', how='outer', suffixes=('_1', '_2')).fillna(0) - # Enter probability distributions into the distance function - return distance.jensenshannon(merged['duplicate_count_1'], merged['duplicate_count_2']) \ No newline at end of file From 8433d77b6222e2c47f19bfab14693a611b70375c Mon Sep 17 00:00:00 2001 From: dltamayo Date: Mon, 29 Dec 2025 
11:24:49 -0500
Subject: [PATCH 03/14] Address linting issues, add param validation

Reformatted code to address new Nextflow linting issues for strict syntax
compatibility, namely:
- moving statements into workflows
- making implicit closure parameters explicit
- removing use of projectDir in processes

Added nf-schema for parameter validation
---
 conf/base.config                          |  34 +++-
 main.nf                                   |  12 --
 modules/local/compare/compare_plot.nf     |   1 -
 modules/local/compare/gliph2.nf           |   1 -
 modules/local/sample/sample_aggregate.nf  |   2 +-
 modules/local/sample/sample_plot.nf       |   1 -
 modules/local/sample/tcrdist3.nf          |  38 ++---
 nextflow.config                           |  42 +----
 nextflow_schema.json                      | 186 ++++++++++++++++++++++
 notebooks/compare_stats_template.qmd      |  34 +---
 notebooks/gliph2_report_template.qmd      |   2 -
 notebooks/sample_stats_template.qmd       |   9 +-
 subworkflows/local/input_check.nf         |   2 +-
 subworkflows/local/resolve_samplesheet.nf |   2 +-
 subworkflows/local/sample.nf              |  24 ++-
 subworkflows/local/validate_params.nf     |   8 +
 workflows/tcrtoolkit.nf                   |  27 +---
 17 files changed, 269 insertions(+), 156 deletions(-)
 create mode 100644 nextflow_schema.json
 create mode 100644 subworkflows/local/validate_params.nf

diff --git a/conf/base.config b/conf/base.config
index daf7fa4..7129697 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -9,7 +9,6 @@
 */

 process {
-    // TODO nf-core: Check the defaults for all processes
     cpus = { check_max( 1 * task.attempt, 'cpus' ) }
     memory = { check_max( 4.GB * task.attempt, 'memory' ) }
@@ -63,4 +62,37 @@ process {
         maxRetries = 2
     }

+}
+
+// Function to ensure that resource requirements don't go beyond
+// a maximum limit
+def check_max(obj, type) {
+    if (type == 'memory') {
+        try {
+            if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
+                return params.max_memory as nextflow.util.MemoryUnit
+            else
+                return obj
+        } catch (all) {
+            println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj"
+            return obj
+        }
+    } else if (type == 'time') {
+        try {
+            if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
+                return params.max_time as nextflow.util.Duration
+            else
+                return obj
+        } catch (all) {
+            println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj"
+            return obj
+        }
+    } else if (type == 'cpus') {
+        try {
+            return Math.min( obj, params.max_cpus as int )
+        } catch (all) {
+            println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
+            return obj
+        }
+    }
+}
\ No newline at end of file
diff --git a/main.nf b/main.nf
index a09fdb0..c84b10f 100644
--- a/main.nf
+++ b/main.nf
@@ -27,18 +27,6 @@ workflow {
     TCRTOOLKIT()
 }

-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    COMPLETION EMAIL AND SUMMARY
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-workflow.onComplete {
-
-    log.info(workflow.success ? "All done!"
: "Please check your inputs.") - -} - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END diff --git a/modules/local/compare/compare_plot.nf b/modules/local/compare/compare_plot.nf index 471ddaa..460f536 100644 --- a/modules/local/compare/compare_plot.nf +++ b/modules/local/compare/compare_plot.nf @@ -24,7 +24,6 @@ process COMPARE_PLOT { quarto render compare_stats.qmd \ -P project_name:$project_name \ -P workflow_cmd:'$workflow.commandLine' \ - -P project_dir:$projectDir \ -P jaccard_mat:$jaccard_mat \ -P sorensen_mat:$sorensen_mat \ -P morisita_mat:$morisita_mat \ diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf index 486442d..1a45e4c 100644 --- a/modules/local/compare/gliph2.nf +++ b/modules/local/compare/gliph2.nf @@ -77,7 +77,6 @@ process GLIPH2_PLOT { quarto render gliph2_report.qmd \ -P project_name:$params.project_name \ -P workflow_cmd:'$workflow.commandLine' \ - -P project_dir:$projectDir \ -P results_dir:'./' \ # -P clusters:$cluster_member_details \ diff --git a/modules/local/sample/sample_aggregate.nf b/modules/local/sample/sample_aggregate.nf index bb4a293..ea7b43a 100644 --- a/modules/local/sample/sample_aggregate.nf +++ b/modules/local/sample/sample_aggregate.nf @@ -15,7 +15,7 @@ process SAMPLE_AGGREGATE { python3 < '"' + input_file.getName() + '"' }.join(', ')}] dfs = [pd.read_csv(f) for f in input_files] merged = pd.concat(dfs, axis=0, ignore_index=True) merged.to_csv("${output_file}", index=False) diff --git a/modules/local/sample/sample_plot.nf b/modules/local/sample/sample_plot.nf index 5ca0494..3ae48a1 100644 --- a/modules/local/sample/sample_plot.nf +++ b/modules/local/sample/sample_plot.nf @@ -22,7 +22,6 @@ process SAMPLE_PLOT { quarto render sample_stats.qmd \ -P project_name:$params.project_name \ -P workflow_cmd:'$workflow.commandLine' \ - -P project_dir:$projectDir \ -P sample_table:$sample_table \ -P sample_stats_csv:$sample_stats_csv \ -P v_family_csv:$v_family_csv \ diff --git a/modules/local/sample/tcrdist3.nf b/modules/local/sample/tcrdist3.nf index b37aab5..67dd2b0 100644 --- a/modules/local/sample/tcrdist3.nf +++ b/modules/local/sample/tcrdist3.nf @@ -3,30 +3,21 @@ process TCRDIST3_MATRIX { container "ghcr.io/karchinlab/tcrtoolkit:main" cpus { - if (task.memory > 256.GB) - return 16 * task.attempt - else if (task.memory > 64.GB) - return 8 * task.attempt - else if (task.memory > 4.GB) - return 4 * task.attempt - else - return 2 * task.attempt - } + task.memory > 256.GB ? 16 * task.attempt: + task.memory > 64.GB ? 8 * task.attempt: + task.memory > 4.GB ? 4 * task.attempt: + 2 * task.attempt + } + + memory { - def sz = count_table.size() - def mb = 1024 * 1024 - if (sz > 26 * mb) - return 512.GB * task.attempt - else if (sz > 20 * mb) - return 256.GB * task.attempt - else if (sz > 10 * mb) - return 128.GB * task.attempt - else if (sz > 4 * mb) - return 64.GB * task.attempt - else if (sz > 2 * mb) - return 16.GB * task.attempt - else - return 4.GB * task.attempt + size -> count_table.size() + count_table.size() > 26 * 1024**2 ? 512.GB * task.attempt: + count_table.size() > 20 * 1024**2 ? 256.GB * task.attempt: + count_table.size() > 10 * 1024**2 ? 128.GB * task.attempt: + count_table.size() > 4 * 1024**2 ? 64.GB * task.attempt: + count_table.size() > 2 * 1024**2 ? 
16.GB * task.attempt: + 4.GB * task.attempt } input: @@ -42,7 +33,6 @@ process TCRDIST3_MATRIX { script: """ - # Run tcrdist3 on input tcrdist3_matrix.py ${count_table} ${sample_meta.sample} ${matrix_sparsity} ${distance_metric} ${ref_db} ${task.cpus} """ } diff --git a/nextflow.config b/nextflow.config index f685ead..b569dcb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -7,7 +7,14 @@ docker { // Load base.config by default for all pipelines includeConfig 'conf/base.config' +plugins { + id 'nf-schema@2.6.1' +} + params { + samplesheet = null + outdir = 'out' + publish_dir_mode = 'copy' // Max resource options @@ -16,7 +23,7 @@ params { max_cpus = 192 max_time = '48.h' - input_format = "airr" + input_format = "airr" // cellranger, adaptive airr_schema = "${projectDir}/assets/airr/airr_rearrangement_schema.json" imgt_lookup = "${projectDir}/assets/airr/imgt_adaptive_lookup.tsv" @@ -57,36 +64,3 @@ params { } includeConfig 'conf/modules.config' - -// Function to ensure that resource requirements don't go beyond -// a maximum limit -def check_max(obj, type) { - if (type == 'memory') { - try { - if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) - return params.max_memory as nextflow.util.MemoryUnit - else - return obj - } catch (all) { - println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'time') { - try { - if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) - return params.max_time as nextflow.util.Duration - else - return obj - } catch (all) { - println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'cpus') { - try { - return Math.min( obj, params.max_cpus as int ) - } catch (all) { - println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" - return obj - } - } -} \ No newline at end of file diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 0000000..c2a202f --- /dev/null +++ b/nextflow_schema.json @@ -0,0 +1,186 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/break-through-cancer/tcrtoolkit-pipeline/main/nextflow_schema.json", + "title": "tcrtoolkit pipeline parameters", + "description": "BTC TCR Toolkit pipeline", + "type": "object", + + "$defs": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["samplesheet", "outdir"], + "properties": { + "samplesheet": { + "type": "string", + "format": "file-path", + "pattern": ".*.csv$", + "description": "Path to the samplesheet describing input AIRR data.", + "help_text": "A CSV of samples and metadata for this TCR analysis.", + "fa_icon": "fas fa-file-csv" + }, + "outdir": { + "type": "string", + "format": "directory-path", + "default": "out", + "description": "Output directory where results will be saved.", + "fa_icon": "fas fa-folder-open" + } + } + }, + + "resource_options": { + "title": "Max resource options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "properties": { + "max_cpus": { + "type": "integer", + "default": 192, + "description": "Maximum CPUs that can be requested by any process.", + "fa_icon": "fas fa-microchip" + }, + "max_memory": { + "type": "string", + "default": "768.GB", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "description": "Maximum memory for any process.", + "fa_icon": "fas fa-memory" + }, + "max_time": { + "type": "string", + "default": "48.h", + "description": "Maximum walltime for any job.", + "fa_icon": "far fa-clock" + } + } + }, + + "workflow_options": { + "title": "Workflow parameters", + "type": "object", + "fa_icon": "fas fa-project-diagram", + "description": "General pipeline workflow settings.", + "properties": { + "workflow_level": { + "type": "string", + "default": "sample,compare", + "enum": ["sample,compare", "sample", "compare", "convert"], + "description": "Comma-separated workflow stages (sample, compare)." + }, + "project_name": { + "type": "string", + "description": "Name of this analysis project." + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "enum": ["copy", "move", "link", "symlink"], + "description": "Method used by `publishDir` to save outputs." + } + } + }, + + "airr_options": { + "title": "AIRR data options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Parameters related to AIRR format and schema references.", + "properties": { + "input_format": { + "type": "string", + "default": "airr", + "enum": ["airr", "adaptive", "cellranger"], + "description": "Input data format." + }, + "airr_schema": { + "type": "string", + "description": "Path to AIRR rearrangement schema JSON." + }, + "imgt_lookup": { + "type": "string", + "description": "Path to imgt lookup table." + }, + "sample_stats_template": { + "type": "string", + "description": "Path to sample notebook template." + }, + "compare_stats_template": { + "type": "string", + "description": "Path to compare notebook template." 
+ } + } + }, + + "plotting_options": { + "title": "Plotting and metadata options", + "type": "object", + "fa_icon": "fas fa-chart-bar", + "description": "Parameters for plotting and metadata columns.", + "properties": { + "samplechart_x_col": { "type": "string", "default": "timepoint" }, + "samplechart_color_col": { "type": "string", "default": "origin" }, + "vgene_subject_col": { "type": "string", "default": "subject_id" }, + "vgene_x_cols": { "type": "string", "default": "origin,timepoint" } + } + }, + + "giana_options": { + "title": "GIANA clustering options", + "type": "object", + "fa_icon": "fas fa-brain", + "properties": { + "threshold": { "type": "number", "default": 7.0 }, + "threshold_score": { "type": "number", "default": 3.6 }, + "threshold_vgene": { "type": "number", "default": 3.7 } + } + }, + + "gliph2_options": { + "title": "GLIPH2 clustering options", + "type": "object", + "fa_icon": "fas fa-code-branch", + "properties": { + "gliph2_report_template": { "type": "string" }, + "ref_files": { "type": "string" }, + "local_min_pvalue": { "type": "string", "default": "0.001" }, + "p_depth": { "type": "string", "default": "1000" }, + "global_convergence_cutoff": { "type": "string", "default": "1" }, + "simulation_depth": { "type": "string", "default": "1000" }, + "kmer_min_depth": { "type": "string", "default": "3" }, + "local_min_OVE": { "type": "string", "default": "c(1000, 100, 10)" }, + "algorithm": { "type": "string", "default": "GLIPH2" }, + "all_aa_interchangeable": { "type": "string", "default": "1" } + } + }, + + "tcrdist3_options": { + "title": "TCRdist3 distance options", + "type": "object", + "fa_icon": "fas fa-ruler-combined", + "properties": { + "matrix_sparsity": { + "type": "string", + "default": "sparse", + "enum": ["sparse", "full"] + }, + "distance_metric": { "type": "string", "default": "tcrdist" }, + "db_path": { "type": "string" } + } + } + }, + + "allOf": [ + { "$ref": "#/$defs/input_output_options" }, + { "$ref": "#/$defs/resource_options" }, + { "$ref": "#/$defs/workflow_options" }, + { "$ref": "#/$defs/airr_options" }, + { "$ref": "#/$defs/plotting_options" }, + { "$ref": "#/$defs/giana_options" }, + { "$ref": "#/$defs/gliph2_options" }, + { "$ref": "#/$defs/tcrdist3_options" } + ] +} \ No newline at end of file diff --git a/notebooks/compare_stats_template.qmd b/notebooks/compare_stats_template.qmd index 4df9221..01b2c66 100644 --- a/notebooks/compare_stats_template.qmd +++ b/notebooks/compare_stats_template.qmd @@ -30,7 +30,6 @@ Thank you for using TCRtoolkit! This report is generated from sample data and me #Default inputs are overwritten at the command line in `modules/local/plot_sample.nf` workflow_cmd='' project_name='path/to/project_name' -project_dir='path/to/project_dir' jaccard_mat='path/to/jaccard_mat.csv' sorensen_mat='path/to/sorensen_mat.csv' morisita_mat='path/to/morisita_mat.csv' @@ -57,43 +56,18 @@ import seaborn as sns print('Pipeline information and parameters:' + '\n') print('Project Name: ' + project_name) print('Workflow command: ' + workflow_cmd) -print('Pipeline Directory: ' + project_dir) print('Date and time: ' + str(datetime.datetime.now())) -# 3. Importing custom plotting functions -## Plotting functions are defined in `bin/utils.py`. -# sys.path.append(project_dir + '/bin/') -# source_file = os.path.join(project_dir, 'bin', 'utils.py') -# destination_file = os.path.join(os.getcwd(), 'utils.py') -# shutil.copyfile(source_file, destination_file) -# from utils import TicTocGenerator, tic, toc -# TicToc = TicTocGenerator() - -# 4. 
Importing similarity data -## 4a. jaccard similarity matrix +# 3. Importing similarity data +## 3a. jaccard similarity matrix jaccard_df = pd.read_csv(jaccard_mat, sep=',', header=0, index_col=0) -## 4b. sorensen similarity matrix +## 3b. sorensen similarity matrix sorensen_df = pd.read_csv(sorensen_mat, sep=',', header=0, index_col=0) -## 4c. morisita similarity matrix +## 3c. morisita similarity matrix morisita_df = pd.read_csv(morisita_mat, sep=',', header=0, index_col=0) -## 4d. jensen-shannon matrix -# jsd_df = pd.read_csv(jsd_mat, sep=',', header=0, index_col=0) - -# 5. Importing sample level counts -# sample_utf8 = pd.read_csv(sample_utf8, sep=',', header=0, index_col=0) -# files = sample_utf8['file'] -# dfs = {} -# for file in files: -# # load data -# df = pd.read_csv(file, sep='\t', header=0) - -# # Rename columns -# df = df.rename(columns={'count (templates/reads)': 'read_count', 'frequencyCount (%)': 'frequency'}) -# sample_id = os.path.basename(file).split('.')[0] -# dfs[sample_id] = df ``` # Analysis diff --git a/notebooks/gliph2_report_template.qmd b/notebooks/gliph2_report_template.qmd index 5d48831..e39d248 100644 --- a/notebooks/gliph2_report_template.qmd +++ b/notebooks/gliph2_report_template.qmd @@ -30,7 +30,6 @@ Thank you for using TCRtoolkit! This report is generated from sample data and me #Default inputs are overwritten at the command line in `modules/local/plot_gliph2.nf` workflow_cmd='' project_name='path/to/project_name' -project_dir='path/to/project_dir' clusters='path/to/{project_name}_cluster.csv' cluster_stats='path/to/{project_name}_cluster.txt' ``` @@ -54,5 +53,4 @@ import seaborn as sns print('Pipeline information and parameters:' + '\n') print('Project Name: ' + project_name) print('Workflow command: ' + workflow_cmd) -print('Pipeline Directory: ' + project_dir) print('Date and time: ' + str(datetime.datetime.now())) diff --git a/notebooks/sample_stats_template.qmd b/notebooks/sample_stats_template.qmd index 5a97598..a85d7ef 100644 --- a/notebooks/sample_stats_template.qmd +++ b/notebooks/sample_stats_template.qmd @@ -32,7 +32,6 @@ Thank you for using TCRtoolkit! This report is generated from sample data and me workflow_cmd='' project_name='' -project_dir='' sample_table='' sample_stats_csv='' v_family_csv='' @@ -70,15 +69,9 @@ warnings.filterwarnings( print('Project Name: ' + project_name) print('Workflow command: ' + workflow_cmd) -print('Pipeline Directory: ' + project_dir) print('Date and time: ' + str(datetime.datetime.now())) -# 3. Importing custom plotting functions -# sys.path.append(project_dir + '/bin/') -# from utils import TicTocGenerator, tic, toc -# TicToc = TicTocGenerator() - -# 4. Loading data +# 3. 
Loading data ## reading sample metadata meta = pd.read_csv(sample_table, sep=',') meta_cols = meta.columns.tolist() diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 08a2fdb..b197003 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -19,7 +19,7 @@ workflow INPUT_CHECK { samplesheet_utf8 .splitCsv(header: true, sep: ',') .map { row -> - def meta = row.findAll { k, v -> k != 'file' } // everything except the file column + def meta = row.findAll { k, _v -> k != 'file' } // everything except the file column def file_obj = file(row.file) return [meta, file_obj] } diff --git a/subworkflows/local/resolve_samplesheet.nf b/subworkflows/local/resolve_samplesheet.nf index a9f03a5..e6d8c06 100644 --- a/subworkflows/local/resolve_samplesheet.nf +++ b/subworkflows/local/resolve_samplesheet.nf @@ -28,7 +28,7 @@ workflow RESOLVE_SAMPLESHEET { .splitCsv(header: true, sep: ',') .first() .map { row -> - def header = row.keySet().findAll { it != 'file' } + ['file'] + def header = row.keySet().findAll { header_col -> header_col != 'file' } + ['file'] return header.join(',') // <-- convert to string } .set { resolved_header } diff --git a/subworkflows/local/sample.nf b/subworkflows/local/sample.nf index 9278537..c63bbe6 100644 --- a/subworkflows/local/sample.nf +++ b/subworkflows/local/sample.nf @@ -61,13 +61,11 @@ workflow SAMPLE { ) TCRDIST3_MATRIX.out.max_matrix_value - .map { it.text.trim().toDouble() } + .map { tcrdist_xmax -> tcrdist_xmax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { global_x_max_value } - - // Use `global_max_value` in downstream processes or print it - global_x_max_value.view { "Global x max matrix value: $it" } + global_x_max_value.view { global_xmax -> "Global x max matrix value: $global_xmax" } TCRDIST3_HISTOGRAM_CALC( TCRDIST3_MATRIX.out.tcrdist_output, @@ -77,13 +75,11 @@ workflow SAMPLE { ) TCRDIST3_HISTOGRAM_CALC.out.max_histogram_count - .map { it.text.trim().toDouble() } + .map { tcrdist_ymax -> tcrdist_ymax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { global_y_max_value } - - // Use `global_max_value` in downstream processes or print it - global_y_max_value.view { "Global y max matrix value: $it" } + global_y_max_value.view { global_ymax -> "Global y max matrix value: $global_ymax" } TCRDIST3_HISTOGRAM_PLOT( TCRDIST3_HISTOGRAM_CALC.out.histogram_data, @@ -93,27 +89,27 @@ workflow SAMPLE { OLGA_PGEN_CALC ( sample_map ) OLGA_PGEN_CALC.out.olga_xmin - .map { it.text.trim().toDouble() } + .map { xmin -> xmin.text.trim().toDouble() } .collect() .map { values -> values.min() } .set { olga_x_min_value } - olga_x_min_value.view { "Olga x min matrix value: $it" } + olga_x_min_value.view { olga_xmin -> "Olga x min matrix value: $olga_xmin" } OLGA_PGEN_CALC.out.olga_xmax - .map { it.text.trim().toDouble() } + .map { xmax -> xmax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { olga_x_max_value } - olga_x_max_value.view { "Olga x max matrix value: $it" } + olga_x_max_value.view { olga_xmax -> "Olga x max matrix value: $olga_xmax" } OLGA_HISTOGRAM_CALC ( OLGA_PGEN_CALC.out.olga_pgen, olga_x_min_value, olga_x_max_value ) OLGA_HISTOGRAM_CALC.out.olga_ymax - .map { it.text.trim().toDouble() } + .map { ymax -> ymax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { olga_y_max_value } - olga_y_max_value.view { "Olga y max matrix value: $it" } + olga_y_max_value.view { olga_ymax -> "Olga y max matrix value: 
$olga_ymax" } OLGA_HISTOGRAM_PLOT( OLGA_HISTOGRAM_CALC.out.olga_histogram, olga_y_max_value ) diff --git a/subworkflows/local/validate_params.nf b/subworkflows/local/validate_params.nf new file mode 100644 index 0000000..d92f83f --- /dev/null +++ b/subworkflows/local/validate_params.nf @@ -0,0 +1,8 @@ +include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema' + +workflow VALIDATE_PARAMS{ + + main: + validateParameters() + log.info paramsSummaryLog(workflow) +} \ No newline at end of file diff --git a/workflows/tcrtoolkit.nf b/workflows/tcrtoolkit.nf index 056a2d8..5329576 100644 --- a/workflows/tcrtoolkit.nf +++ b/workflows/tcrtoolkit.nf @@ -1,16 +1,3 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE & PRINT PARAMETER SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -// Validate pipeline parameters -def checkPathParamList = [ params.samplesheet] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } - -// Check mandatory parameters -if (params.samplesheet) { samplesheet = file(params.samplesheet) } else { exit 1, 'Samplesheet not specified. Please, provide a --samplesheet=/path/to/samplesheet.csv !' } -if (params.outdir) { outdir = params.outdir } else { exit 1, 'Output directory not specified. Please, provide a --outdir=/path/to/outdir !' } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -27,6 +14,7 @@ include { AIRR_CONVERT } from '../subworkflows/local/airr_convert' include { RESOLVE_SAMPLESHEET } from '../subworkflows/local/resolve_samplesheet' include { SAMPLE } from '../subworkflows/local/sample' include { COMPARE } from '../subworkflows/local/compare' +include { VALIDATE_PARAMS } from '../subworkflows/local/validate_params' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -36,6 +24,7 @@ include { COMPARE } from '../subworkflows/local/compare' workflow TCRTOOLKIT { + VALIDATE_PARAMS() println("Running TCRTOOLKIT workflow...") @@ -80,18 +69,6 @@ workflow TCRTOOLKIT { } } -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - COMPLETION EMAIL AND SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -// workflow.onComplete { - -// log.info(workflow.success ? "Finished tcrtoolkit!" 
: "Please check your inputs.") - -// } - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END From b2f8f561506f8520d48b40e6011e5fed4f84e19d Mon Sep 17 00:00:00 2001 From: dltamayo Date: Mon, 29 Dec 2025 11:30:25 -0500 Subject: [PATCH 04/14] Init license --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..44ec6b9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file From d50144176d3af0cd3310b9c57e95ad52b2db4aca Mon Sep 17 00:00:00 2001 From: dltamayo Date: Mon, 29 Dec 2025 11:31:33 -0500 Subject: [PATCH 05/14] Update memory specification --- modules/local/sample/tcrdist3.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/local/sample/tcrdist3.nf b/modules/local/sample/tcrdist3.nf index 67dd2b0..8fb8aa6 100644 --- a/modules/local/sample/tcrdist3.nf +++ b/modules/local/sample/tcrdist3.nf @@ -11,7 +11,6 @@ process TCRDIST3_MATRIX { memory { - size -> count_table.size() count_table.size() > 26 * 1024**2 ? 512.GB * task.attempt: count_table.size() > 20 * 1024**2 ? 256.GB * task.attempt: count_table.size() > 10 * 1024**2 ? 
128.GB * task.attempt: From 3d2500d78f6ac685a2ee7d248d793aa1f36f9142 Mon Sep 17 00:00:00 2001 From: dltamayo Date: Tue, 30 Dec 2025 11:44:58 -0500 Subject: [PATCH 06/14] Move container to .config Moving container to modules.config instead of specifying in every process --- conf/modules.config | 1 + modules/local/airr_convert/convert_adaptive.nf | 1 - modules/local/airr_convert/pseudobulk_cellranger.nf | 1 - modules/local/compare/compare_calc.nf | 1 - modules/local/compare/compare_plot.nf | 1 - modules/local/compare/giana.nf | 1 - modules/local/compare/gliph2.nf | 2 -- modules/local/compare/tcrsharing.nf | 3 --- modules/local/sample/convergence.nf | 1 - modules/local/sample/olga.nf | 4 ---- modules/local/sample/sample_aggregate.nf | 1 - modules/local/sample/sample_calc.nf | 1 - modules/local/sample/sample_plot.nf | 1 - modules/local/sample/tcrdist3.nf | 3 --- modules/local/sample/tcrpheno.nf | 1 - modules/local/sample/tcrspecificity.nf | 2 -- modules/local/samplesheet/samplesheet_check.nf | 1 - modules/local/samplesheet/samplesheet_resolve.nf | 1 - 18 files changed, 1 insertion(+), 26 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 8fc2514..3e891f7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -11,6 +11,7 @@ */ process { + container = "ghcr.io/karchinlab/tcrtoolkit:main" publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, diff --git a/modules/local/airr_convert/convert_adaptive.nf b/modules/local/airr_convert/convert_adaptive.nf index aa818cc..a476bb1 100644 --- a/modules/local/airr_convert/convert_adaptive.nf +++ b/modules/local/airr_convert/convert_adaptive.nf @@ -1,7 +1,6 @@ process CONVERT_ADAPTIVE { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/airr_convert/pseudobulk_cellranger.nf b/modules/local/airr_convert/pseudobulk_cellranger.nf index ec1916c..653b2a9 100644 --- a/modules/local/airr_convert/pseudobulk_cellranger.nf +++ b/modules/local/airr_convert/pseudobulk_cellranger.nf @@ -1,7 +1,6 @@ process PSEUDOBULK_CELLRANGER { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/compare/compare_calc.nf b/modules/local/compare/compare_calc.nf index a6d0a60..2206fdb 100644 --- a/modules/local/compare/compare_calc.nf +++ b/modules/local/compare/compare_calc.nf @@ -1,6 +1,5 @@ process COMPARE_CALC { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path sample_utf8 diff --git a/modules/local/compare/compare_plot.nf b/modules/local/compare/compare_plot.nf index 460f536..1eb5792 100644 --- a/modules/local/compare/compare_plot.nf +++ b/modules/local/compare/compare_plot.nf @@ -1,6 +1,5 @@ process COMPARE_PLOT { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path sample_utf8 diff --git a/modules/local/compare/giana.nf b/modules/local/compare/giana.nf index 561c1b3..7140448 100644 --- a/modules/local/compare/giana.nf +++ b/modules/local/compare/giana.nf @@ -1,6 +1,5 @@ process GIANA_CALC { label 'process_medium' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path concat_cdr3 diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf index 1a45e4c..68dd080 100644 --- a/modules/local/compare/gliph2.nf +++ b/modules/local/compare/gliph2.nf @@ -2,7 +2,6 @@ process 
GLIPH2_TURBOGLIPH { label 'process_high' label 'process_high_compute' label 'process_high_memory' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path concat_cdr3 @@ -53,7 +52,6 @@ process GLIPH2_TURBOGLIPH { process GLIPH2_PLOT { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path gliph2_report_template diff --git a/modules/local/compare/tcrsharing.nf b/modules/local/compare/tcrsharing.nf index ff68619..2bc5e1f 100644 --- a/modules/local/compare/tcrsharing.nf +++ b/modules/local/compare/tcrsharing.nf @@ -1,6 +1,5 @@ process TCRSHARING_CALC { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path concat_cdr3 @@ -73,7 +72,6 @@ process TCRSHARING_CALC { process TCRSHARING_HISTOGRAM { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path shared_cdr3 @@ -114,7 +112,6 @@ process TCRSHARING_HISTOGRAM { process TCRSHARING_SCATTERPLOT { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path shared_cdr3 diff --git a/modules/local/sample/convergence.nf b/modules/local/sample/convergence.nf index 1e077ac..962d041 100644 --- a/modules/local/sample/convergence.nf +++ b/modules/local/sample/convergence.nf @@ -1,7 +1,6 @@ process CONVERGENCE { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/sample/olga.nf b/modules/local/sample/olga.nf index dfc71f2..d7b76ba 100644 --- a/modules/local/sample/olga.nf +++ b/modules/local/sample/olga.nf @@ -1,7 +1,6 @@ process OLGA_PGEN_CALC { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) @@ -56,7 +55,6 @@ process OLGA_PGEN_CALC { process OLGA_HISTOGRAM_CALC { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(olga_pgen) @@ -113,7 +111,6 @@ process OLGA_HISTOGRAM_CALC { process OLGA_HISTOGRAM_PLOT { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(olga_histogram) @@ -172,7 +169,6 @@ process OLGA_HISTOGRAM_PLOT { process OLGA_WRITE_MAX { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: val olga_global_xmin diff --git a/modules/local/sample/sample_aggregate.nf b/modules/local/sample/sample_aggregate.nf index ea7b43a..ae9e952 100644 --- a/modules/local/sample/sample_aggregate.nf +++ b/modules/local/sample/sample_aggregate.nf @@ -1,7 +1,6 @@ process SAMPLE_AGGREGATE { tag "${output_file}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path csv_files diff --git a/modules/local/sample/sample_calc.nf b/modules/local/sample/sample_calc.nf index dff4c0a..d6e4e90 100644 --- a/modules/local/sample/sample_calc.nf +++ b/modules/local/sample/sample_calc.nf @@ -1,7 +1,6 @@ process SAMPLE_CALC { tag "${sample_meta.sample}" label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/sample/sample_plot.nf b/modules/local/sample/sample_plot.nf index 3ae48a1..e19dd16 100644 --- a/modules/local/sample/sample_plot.nf +++ b/modules/local/sample/sample_plot.nf @@ -2,7 +2,6 @@ process SAMPLE_PLOT { tag "${sample_stats_csv}" label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path sample_table diff --git 
a/modules/local/sample/tcrdist3.nf b/modules/local/sample/tcrdist3.nf index 8fb8aa6..6369df4 100644 --- a/modules/local/sample/tcrdist3.nf +++ b/modules/local/sample/tcrdist3.nf @@ -1,6 +1,5 @@ process TCRDIST3_MATRIX { tag "${sample_meta.sample}" - container "ghcr.io/karchinlab/tcrtoolkit:main" cpus { task.memory > 256.GB ? 16 * task.attempt: @@ -39,7 +38,6 @@ process TCRDIST3_MATRIX { process TCRDIST3_HISTOGRAM_CALC { tag "${sample_meta.sample}" label 'process_high' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(distance_matrix) @@ -120,7 +118,6 @@ process TCRDIST3_HISTOGRAM_CALC { process TCRDIST3_HISTOGRAM_PLOT { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(histogram_data) diff --git a/modules/local/sample/tcrpheno.nf b/modules/local/sample/tcrpheno.nf index e71ed63..f4be077 100644 --- a/modules/local/sample/tcrpheno.nf +++ b/modules/local/sample/tcrpheno.nf @@ -1,7 +1,6 @@ process TCRPHENO { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/sample/tcrspecificity.nf b/modules/local/sample/tcrspecificity.nf index e92cdb6..30ce184 100644 --- a/modules/local/sample/tcrspecificity.nf +++ b/modules/local/sample/tcrspecificity.nf @@ -1,6 +1,5 @@ process VDJDB_GET { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" output: path("vdjdb-2025-02-21/"), emit: ref_db @@ -15,7 +14,6 @@ process VDJDB_GET { process VDJDB_VDJMATCH { tag "${sample_meta.sample}" label 'process_medium' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/samplesheet/samplesheet_check.nf b/modules/local/samplesheet/samplesheet_check.nf index 527ed9c..04bd0d7 100644 --- a/modules/local/samplesheet/samplesheet_check.nf +++ b/modules/local/samplesheet/samplesheet_check.nf @@ -1,7 +1,6 @@ process SAMPLESHEET_CHECK { tag "${samplesheet}" label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path samplesheet diff --git a/modules/local/samplesheet/samplesheet_resolve.nf b/modules/local/samplesheet/samplesheet_resolve.nf index 3128eff..ea31239 100644 --- a/modules/local/samplesheet/samplesheet_resolve.nf +++ b/modules/local/samplesheet/samplesheet_resolve.nf @@ -1,6 +1,5 @@ process SAMPLESHEET_RESOLVE { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path samplesheet_utf8 From fb3461fd549448239287c3aea23a82b16e7bc39d Mon Sep 17 00:00:00 2001 From: dltamayo Date: Tue, 30 Dec 2025 11:46:24 -0500 Subject: [PATCH 07/14] Update dockerfile --- Dockerfile | 64 ++++++++++++++++++++++++++++++++++-------------------- env.yml | 1 + 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/Dockerfile b/Dockerfile index 148ae3a..7d5ba29 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,44 @@ -FROM condaforge/miniforge3:24.9.2-0 +FROM mambaorg/micromamba:1.5.8 -# Copy the environment file into /tmp +# Ensure we run as root for apt +USER root + +# Update the conda base environment with required packages COPY env.yml /tmp/env.yml +WORKDIR /tmp -# Install system dependencies -RUN apt-get update \ - && apt-get install -y \ - build-essential \ - curl \ - gcc \ - g++ \ +RUN apt-get update && apt-get install -y \ + # runtime CLIs (KEEP) + curl \ + wget \ + git \ + unzip \ + zip \ + jq \ + \ + # build-only deps (REMOVE LATER) + build-essential \ + gcc \ 
+ g++ \ + && micromamba install -y -n base -f /tmp/env.yml \ + && micromamba clean -afy \ + \ + # R packages (need compilers) + && micromamba run -n base Rscript -e "remotes::install_github('HetzDra/turboGliph')" \ + && micromamba run -n base Rscript -e "remotes::install_github('kalaga27/tcrpheno')" \ + \ + # R cleanup + && rm -rf /tmp/Rtmp* /root/.cache/R \ + \ + # REMOVE build deps ONLY + && apt-get purge -y \ + build-essential \ + gcc \ + g++ \ + && apt-get autoremove -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Update the conda base environment with required packages -WORKDIR /tmp -RUN conda env update -n base --file env.yml - # Install GIANA, patch shebang, symlink for PATH command availability RUN git init /opt/GIANA && \ cd /opt/GIANA && \ @@ -29,26 +51,20 @@ RUN git init /opt/GIANA && \ # Install quarto RUN mkdir -p /opt/quarto/1.6.42 \ - && curl -o quarto.tar.gz -L \ + && curl -o /tmp/quarto.tar.gz -L \ "https://github.com/quarto-dev/quarto-cli/releases/download/v1.6.42/quarto-1.6.42-linux-amd64.tar.gz" \ && tar -zxvf quarto.tar.gz \ -C "/opt/quarto/1.6.42" \ --strip-components=1 \ - && rm quarto.tar.gz + && rm /tmp/quarto.tar.gz -# Install R package not available via conda -RUN Rscript -e "remotes::install_github('HetzDra/turboGliph')" -RUN Rscript -e "remotes::install_github('kalaga27/tcrpheno')" - -# Install VDJmatch +# Install VDJmatch and symlink RUN mkdir -p /opt/vdjmatch/1.3.1 \ && curl -L -o vdjmatch.zip \ "https://github.com/antigenomics/vdjmatch/releases/download/1.3.1/vdjmatch-1.3.1.zip" \ && unzip vdjmatch.zip -d /opt/vdjmatch/1.3.1 \ - && rm vdjmatch.zip - -# symlink VDJmatch -RUN ln -s /opt/vdjmatch/1.3.1/vdjmatch-1.3.1/vdjmatch-1.3.1.jar /usr/local/bin/vdjmatch.jar + && rm vdjmatch.zip \ + && ln -s /opt/vdjmatch/1.3.1/vdjmatch-1.3.1/vdjmatch-1.3.1.jar /usr/local/bin/vdjmatch.jar # Add to PATH ENV PATH="/opt/quarto/1.6.42/bin:${PATH}" diff --git a/env.yml b/env.yml index 27ce717..331ef45 100644 --- a/env.yml +++ b/env.yml @@ -29,6 +29,7 @@ dependencies: # R and R packages - r-base=4.4.2 + - r-grr=0.9.5 - r-igraph=2.0.3 - r-pheatmap=1.0.12 - r-remotes=2.5.0 From 28efeebff8a8f7717412ed9b643e0e0bb3a2f6d3 Mon Sep 17 00:00:00 2001 From: dltamayo Date: Mon, 5 Jan 2026 12:57:58 -0500 Subject: [PATCH 08/14] Implement Copilot suggestions --- Dockerfile | 6 +++--- modules/local/sample/sample_aggregate.nf | 7 +++++-- modules/local/sample/tcrdist3.nf | 16 ++++++++-------- modules/local/samplesheet/samplesheet_resolve.nf | 4 +++- nextflow_schema.json | 3 ++- subworkflows/local/validate_params.nf | 2 +- 6 files changed, 22 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7d5ba29..3aff1f6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,8 +24,8 @@ RUN apt-get update && apt-get install -y \ && micromamba clean -afy \ \ # R packages (need compilers) - && micromamba run -n base Rscript -e "remotes::install_github('HetzDra/turboGliph')" \ - && micromamba run -n base Rscript -e "remotes::install_github('kalaga27/tcrpheno')" \ + && micromamba run -n base Rscript -e "remotes::install_github('HetzDra/turboGliph@2a5264b')" \ + && micromamba run -n base Rscript -e "remotes::install_github('kalaga27/tcrpheno@56f9372')" \ \ # R cleanup && rm -rf /tmp/Rtmp* /root/.cache/R \ @@ -53,7 +53,7 @@ RUN git init /opt/GIANA && \ RUN mkdir -p /opt/quarto/1.6.42 \ && curl -o /tmp/quarto.tar.gz -L \ "https://github.com/quarto-dev/quarto-cli/releases/download/v1.6.42/quarto-1.6.42-linux-amd64.tar.gz" \ - && tar -zxvf quarto.tar.gz \ + && tar -zxvf /tmp/quarto.tar.gz \ -C 
"/opt/quarto/1.6.42" \ --strip-components=1 \ && rm /tmp/quarto.tar.gz diff --git a/modules/local/sample/sample_aggregate.nf b/modules/local/sample/sample_aggregate.nf index ae9e952..a1c9c55 100644 --- a/modules/local/sample/sample_aggregate.nf +++ b/modules/local/sample/sample_aggregate.nf @@ -11,13 +11,16 @@ process SAMPLE_AGGREGATE { script: """ - python3 < aggregate.py < '"' + input_file.getName() + '"' }.join(', ')}] + input_files = sys.argv[1:] dfs = [pd.read_csv(f) for f in input_files] merged = pd.concat(dfs, axis=0, ignore_index=True) merged.to_csv("${output_file}", index=False) EOF + + python3 aggregate.py ${csv_files} """ } \ No newline at end of file diff --git a/modules/local/sample/tcrdist3.nf b/modules/local/sample/tcrdist3.nf index 6369df4..f592675 100644 --- a/modules/local/sample/tcrdist3.nf +++ b/modules/local/sample/tcrdist3.nf @@ -2,19 +2,19 @@ process TCRDIST3_MATRIX { tag "${sample_meta.sample}" cpus { - task.memory > 256.GB ? 16 * task.attempt: - task.memory > 64.GB ? 8 * task.attempt: - task.memory > 4.GB ? 4 * task.attempt: + task.memory > 256.GB ? 16 * task.attempt : + task.memory > 64.GB ? 8 * task.attempt : + task.memory > 4.GB ? 4 * task.attempt : 2 * task.attempt } memory { - count_table.size() > 26 * 1024**2 ? 512.GB * task.attempt: - count_table.size() > 20 * 1024**2 ? 256.GB * task.attempt: - count_table.size() > 10 * 1024**2 ? 128.GB * task.attempt: - count_table.size() > 4 * 1024**2 ? 64.GB * task.attempt: - count_table.size() > 2 * 1024**2 ? 16.GB * task.attempt: + count_table.size() > 26 * 1024**2 ? 512.GB * task.attempt : + count_table.size() > 20 * 1024**2 ? 256.GB * task.attempt : + count_table.size() > 10 * 1024**2 ? 128.GB * task.attempt : + count_table.size() > 4 * 1024**2 ? 64.GB * task.attempt : + count_table.size() > 2 * 1024**2 ? 
16.GB * task.attempt : 4.GB * task.attempt } diff --git a/modules/local/samplesheet/samplesheet_resolve.nf b/modules/local/samplesheet/samplesheet_resolve.nf index ea31239..4dc0746 100644 --- a/modules/local/samplesheet/samplesheet_resolve.nf +++ b/modules/local/samplesheet/samplesheet_resolve.nf @@ -17,7 +17,9 @@ ${resolved_rows.join('\n')} EOF # Emit header -echo "${resolved_header}" > samplesheet_resolved.csv +cat << 'EOF' > samplesheet_resolved.csv +${resolved_header} +EOF # Two-pass awk: # - pass 1: read original samplesheet, store sample order diff --git a/nextflow_schema.json b/nextflow_schema.json index c2a202f..d333039 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -46,13 +46,14 @@ "max_memory": { "type": "string", "default": "768.GB", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "pattern": "^\\d+(?:\\.\\d+|\\.)?\\s*(?:[KMGT]?B|[KMGT])$", "description": "Maximum memory for any process.", "fa_icon": "fas fa-memory" }, "max_time": { "type": "string", "default": "48.h", + "pattern": "^\\d+(\\.\\d+)?\\s*(s|m|h|d)$", "description": "Maximum walltime for any job.", "fa_icon": "far fa-clock" } diff --git a/subworkflows/local/validate_params.nf b/subworkflows/local/validate_params.nf index d92f83f..b4b2f22 100644 --- a/subworkflows/local/validate_params.nf +++ b/subworkflows/local/validate_params.nf @@ -1,6 +1,6 @@ include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema' -workflow VALIDATE_PARAMS{ +workflow VALIDATE_PARAMS { main: validateParameters() From c8f85ff404d28e43bae0b72a510a6eca672f78f4 Mon Sep 17 00:00:00 2001 From: dltamayo Date: Mon, 5 Jan 2026 13:09:42 -0500 Subject: [PATCH 09/14] Update schema --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index d333039..e81a2e5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -53,7 +53,7 @@ "max_time": { "type": "string", "default": "48.h", - "pattern": "^\\d+(\\.\\d+)?\\s*(s|m|h|d)$", + "pattern": "^\\d+(?:\\.\\d+|\\.)?\\s*(s|m|h|d)$", "description": "Maximum walltime for any job.", "fa_icon": "far fa-clock" } From 748df0f2c516aca697dd225609a51fa55d1f17a0 Mon Sep 17 00:00:00 2001 From: dimalvovs Date: Wed, 7 Jan 2026 10:07:17 -0500 Subject: [PATCH 10/14] make one container definition --- conf/base.config | 1 + modules/local/airr_convert/convert_adaptive.nf | 1 - modules/local/airr_convert/pseudobulk_cellranger.nf | 1 - modules/local/compare/compare_calc.nf | 3 +-- modules/local/compare/compare_concatenate.nf | 1 - modules/local/compare/compare_plot.nf | 1 - modules/local/compare/giana.nf | 1 - modules/local/compare/gliph2.nf | 2 -- modules/local/compare/tcrsharing.nf | 3 --- modules/local/sample/convergence.nf | 1 - modules/local/sample/olga.nf | 4 ---- modules/local/sample/sample_calc.nf | 1 - modules/local/sample/sample_plot.nf | 1 - modules/local/sample/tcrdist3.nf | 3 --- modules/local/sample/tcrpheno.nf | 1 - modules/local/sample/tcrspecificity.nf | 2 -- modules/local/samplesheet/samplesheet_check.nf | 1 - modules/local/samplesheet/samplesheet_resolve.nf | 1 - 18 files changed, 2 insertions(+), 27 deletions(-) diff --git a/conf/base.config b/conf/base.config index daf7fa4..4ce1d96 100644 --- a/conf/base.config +++ b/conf/base.config @@ -9,6 +9,7 @@ */ process { + container = "ghcr.io/karchinlab/tcrtoolkit:main" // TODO nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } diff --git a/modules/local/airr_convert/convert_adaptive.nf 
From 748df0f2c516aca697dd225609a51fa55d1f17a0 Mon Sep 17 00:00:00 2001
From: dimalvovs
Date: Wed, 7 Jan 2026 10:07:17 -0500
Subject: [PATCH 10/14] make one container definition

---
 conf/base.config                                    | 1 +
 modules/local/airr_convert/convert_adaptive.nf      | 1 -
 modules/local/airr_convert/pseudobulk_cellranger.nf | 1 -
 modules/local/compare/compare_calc.nf               | 3 +--
 modules/local/compare/compare_concatenate.nf        | 1 -
 modules/local/compare/compare_plot.nf               | 1 -
 modules/local/compare/giana.nf                      | 1 -
 modules/local/compare/gliph2.nf                     | 2 --
 modules/local/compare/tcrsharing.nf                 | 3 ---
 modules/local/sample/convergence.nf                 | 1 -
 modules/local/sample/olga.nf                        | 4 ----
 modules/local/sample/sample_calc.nf                 | 1 -
 modules/local/sample/sample_plot.nf                 | 1 -
 modules/local/sample/tcrdist3.nf                    | 3 ---
 modules/local/sample/tcrpheno.nf                    | 1 -
 modules/local/sample/tcrspecificity.nf              | 2 --
 modules/local/samplesheet/samplesheet_check.nf      | 1 -
 modules/local/samplesheet/samplesheet_resolve.nf    | 1 -
 18 files changed, 2 insertions(+), 27 deletions(-)

diff --git a/conf/base.config b/conf/base.config
index daf7fa4..4ce1d96 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -9,6 +9,7 @@
 */
 
 process {
+    container = "ghcr.io/karchinlab/tcrtoolkit:main"
 
     // TODO nf-core: Check the defaults for all processes
     cpus = { check_max( 1 * task.attempt, 'cpus' ) }
diff --git a/modules/local/airr_convert/convert_adaptive.nf b/modules/local/airr_convert/convert_adaptive.nf
index aa818cc..a476bb1 100644
--- a/modules/local/airr_convert/convert_adaptive.nf
+++ b/modules/local/airr_convert/convert_adaptive.nf
@@ -1,7 +1,6 @@
 process CONVERT_ADAPTIVE {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
diff --git a/modules/local/airr_convert/pseudobulk_cellranger.nf b/modules/local/airr_convert/pseudobulk_cellranger.nf
index ec1916c..653b2a9 100644
--- a/modules/local/airr_convert/pseudobulk_cellranger.nf
+++ b/modules/local/airr_convert/pseudobulk_cellranger.nf
@@ -1,7 +1,6 @@
 process PSEUDOBULK_CELLRANGER {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
diff --git a/modules/local/compare/compare_calc.nf b/modules/local/compare/compare_calc.nf
index a6d0a60..05b5800 100644
--- a/modules/local/compare/compare_calc.nf
+++ b/modules/local/compare/compare_calc.nf
@@ -1,7 +1,6 @@
 process COMPARE_CALC {
     label 'process_single'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
-    
+
     input:
     path sample_utf8
     path all_sample_files
diff --git a/modules/local/compare/compare_concatenate.nf b/modules/local/compare/compare_concatenate.nf
index a05e50c..c955013 100644
--- a/modules/local/compare/compare_concatenate.nf
+++ b/modules/local/compare/compare_concatenate.nf
@@ -1,6 +1,5 @@
 process COMPARE_CONCATENATE {
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path samplesheet_utf8
diff --git a/modules/local/compare/compare_plot.nf b/modules/local/compare/compare_plot.nf
index 471ddaa..478096f 100644
--- a/modules/local/compare/compare_plot.nf
+++ b/modules/local/compare/compare_plot.nf
@@ -1,6 +1,5 @@
 process COMPARE_PLOT {
     label 'process_single'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
    input:
    path sample_utf8
diff --git a/modules/local/compare/giana.nf b/modules/local/compare/giana.nf
index b062f5f..6e4a237 100644
--- a/modules/local/compare/giana.nf
+++ b/modules/local/compare/giana.nf
@@ -1,6 +1,5 @@
 process GIANA_CALC {
     label 'process_medium'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path concat_cdr3
diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf
index 36ca9f4..5958af1 100644
--- a/modules/local/compare/gliph2.nf
+++ b/modules/local/compare/gliph2.nf
@@ -2,7 +2,6 @@ process GLIPH2_TURBOGLIPH {
     label 'process_high'
     label 'process_high_compute'
     label 'process_high_memory'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path concat_cdr3
@@ -56,7 +55,6 @@
 
 process GLIPH2_PLOT {
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path gliph2_report_template
diff --git a/modules/local/compare/tcrsharing.nf b/modules/local/compare/tcrsharing.nf
index b4b9fab..34c4349 100644
--- a/modules/local/compare/tcrsharing.nf
+++ b/modules/local/compare/tcrsharing.nf
@@ -1,6 +1,5 @@
 process TCRSHARING_CALC {
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path concat_cdr3
@@ -67,7 +66,6 @@
 
 process TCRSHARING_HISTOGRAM {
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path shared_cdr3
@@ -108,7 +106,6 @@
 
 process TCRSHARING_SCATTERPLOT {
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path shared_cdr3
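Every hunk in this patch deletes the same per-module `container` line; the single definition added to `conf/base.config` now applies to every process, and Nextflow's selector syntax still allows a per-process override if one tool ever needs a different image. A minimal sketch of the resulting config shape (the `withName` override and alternate image tag are hypothetical, shown only to illustrate the escape hatch):

    process {
        container = "ghcr.io/karchinlab/tcrtoolkit:main"

        // Hypothetical override for a single process:
        withName: 'GIANA_CALC' {
            container = "ghcr.io/karchinlab/some-other-image:v1"
        }
    }
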
diff --git a/modules/local/sample/convergence.nf b/modules/local/sample/convergence.nf
index 1e077ac..962d041 100644
--- a/modules/local/sample/convergence.nf
+++ b/modules/local/sample/convergence.nf
@@ -1,7 +1,6 @@
 process CONVERGENCE {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
diff --git a/modules/local/sample/olga.nf b/modules/local/sample/olga.nf
index dfc71f2..d7b76ba 100644
--- a/modules/local/sample/olga.nf
+++ b/modules/local/sample/olga.nf
@@ -1,7 +1,6 @@
 process OLGA_PGEN_CALC {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
@@ -56,7 +55,6 @@
 process OLGA_HISTOGRAM_CALC {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(olga_pgen)
@@ -113,7 +111,6 @@
 process OLGA_HISTOGRAM_PLOT {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(olga_histogram)
@@ -172,7 +169,6 @@
 
 process OLGA_WRITE_MAX {
     label 'process_single'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     val olga_global_xmin
diff --git a/modules/local/sample/sample_calc.nf b/modules/local/sample/sample_calc.nf
index ef88d43..c989aad 100644
--- a/modules/local/sample/sample_calc.nf
+++ b/modules/local/sample/sample_calc.nf
@@ -1,7 +1,6 @@
 process SAMPLE_CALC {
     tag "${sample_meta.sample}"
     label 'process_single'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
diff --git a/modules/local/sample/sample_plot.nf b/modules/local/sample/sample_plot.nf
index 272533d..9502b06 100644
--- a/modules/local/sample/sample_plot.nf
+++ b/modules/local/sample/sample_plot.nf
@@ -2,7 +2,6 @@ process SAMPLE_PLOT {
     tag "${sample_stats_csv}"
     label 'process_single'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     path sample_table
 
diff --git a/modules/local/sample/tcrdist3.nf b/modules/local/sample/tcrdist3.nf
index 9635030..963182e 100644
--- a/modules/local/sample/tcrdist3.nf
+++ b/modules/local/sample/tcrdist3.nf
@@ -1,6 +1,5 @@
 process TCRDIST3_MATRIX {
     tag "${sample_meta.sample}"
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     cpus params.max_cpus
     memory {
@@ -41,7 +40,6 @@
 process TCRDIST3_HISTOGRAM_CALC {
     tag "${sample_meta.sample}"
     label 'process_high'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(distance_matrix)
@@ -122,7 +120,6 @@
 process TCRDIST3_HISTOGRAM_PLOT {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(histogram_data)
diff --git a/modules/local/sample/tcrpheno.nf b/modules/local/sample/tcrpheno.nf
index e71ed63..f4be077 100644
--- a/modules/local/sample/tcrpheno.nf
+++ b/modules/local/sample/tcrpheno.nf
@@ -1,7 +1,6 @@
 process TCRPHENO {
     tag "${sample_meta.sample}"
     label 'process_low'
-    container "ghcr.io/karchinlab/tcrtoolkit:main"
 
     input:
     tuple val(sample_meta), path(count_table)
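With the container lines gone, each module declares only its resource label (`process_single`, `process_low`, and so on), and the label-to-resource mapping lives entirely in configuration. In the nf-core-style layout this repository follows, that mapping typically looks like the sketch below (the numbers are illustrative defaults; the actual values live in this repo's `conf/base.config`, which already uses the `check_max` helper shown in the hunk above):

    process {
        withLabel: 'process_single' {
            cpus   = { check_max( 1,                   'cpus'   ) }
            memory = { check_max( 6.GB * task.attempt, 'memory' ) }
        }
        withLabel: 'process_low' {
            cpus   = { check_max( 2     * task.attempt, 'cpus'   ) }
            memory = { check_max( 12.GB * task.attempt, 'memory' ) }
        }
    }
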
"ghcr.io/karchinlab/tcrtoolkit:main" output: path("vdjdb-2025-02-21/"), emit: ref_db @@ -15,7 +14,6 @@ process VDJDB_GET { process VDJDB_VDJMATCH { tag "${sample_meta.sample}" label 'process_medium' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/samplesheet/samplesheet_check.nf b/modules/local/samplesheet/samplesheet_check.nf index 527ed9c..04bd0d7 100644 --- a/modules/local/samplesheet/samplesheet_check.nf +++ b/modules/local/samplesheet/samplesheet_check.nf @@ -1,7 +1,6 @@ process SAMPLESHEET_CHECK { tag "${samplesheet}" label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path samplesheet diff --git a/modules/local/samplesheet/samplesheet_resolve.nf b/modules/local/samplesheet/samplesheet_resolve.nf index 61ca0c1..1ab9cce 100644 --- a/modules/local/samplesheet/samplesheet_resolve.nf +++ b/modules/local/samplesheet/samplesheet_resolve.nf @@ -1,6 +1,5 @@ process SAMPLESHEET_RESOLVE { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: val(resolved_rows) // List of tab-separated strings From 877a68e7fed42c11a9565976e1da515174740bbc Mon Sep 17 00:00:00 2001 From: dimalvovs Date: Wed, 7 Jan 2026 10:44:50 -0500 Subject: [PATCH 11/14] push on changed Dockerfile --- .github/workflows/build-push-container.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-push-container.yml b/.github/workflows/build-push-container.yml index 1e0a5d9..1e76253 100644 --- a/.github/workflows/build-push-container.yml +++ b/.github/workflows/build-push-container.yml @@ -4,7 +4,10 @@ name: build-push-container # Configures this workflow to run every time a change is pushed to the branch called `release`. on: push: - branches: ['main'] + paths: + - '**/Dockerfile' + - '**/*.dockerfile' + - '.github/workflows/build-push-container.yml' workflow_dispatch: # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. @@ -39,6 +42,8 @@ jobs: uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=sha,format=short,prefix= # Generates a tag like '860c190' # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. 
From a8168d86194b3d25c2a1ee5a6c2170be889dd62b Mon Sep 17 00:00:00 2001
From: dimalvovs
Date: Wed, 7 Jan 2026 11:42:09 -0500
Subject: [PATCH 12/14] add main tag for back-compatibility

---
 .github/workflows/build-push-container.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build-push-container.yml b/.github/workflows/build-push-container.yml
index 1e76253..9fa5ca6 100644
--- a/.github/workflows/build-push-container.yml
+++ b/.github/workflows/build-push-container.yml
@@ -44,6 +44,7 @@
           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
           tags: |
             type=sha,format=short,prefix= # Generates a tag like '860c190'
+            main
       # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
       # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository.
       # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.

From 26a6b29c27e96c24ef46e0c572d2c82c61afaed0 Mon Sep 17 00:00:00 2001
From: dltamayo
Date: Wed, 7 Jan 2026 14:39:12 -0500
Subject: [PATCH 13/14] Update docker, workflow

---
 .github/workflows/build-push-container.yml | 1 +
 Dockerfile                                 | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/build-push-container.yml b/.github/workflows/build-push-container.yml
index 9fa5ca6..d4975b2 100644
--- a/.github/workflows/build-push-container.yml
+++ b/.github/workflows/build-push-container.yml
@@ -5,6 +5,7 @@ name: build-push-container
 on:
   push:
     paths:
+      - '**/env.yml'
       - '**/Dockerfile'
       - '**/*.dockerfile'
       - '.github/workflows/build-push-container.yml'
diff --git a/Dockerfile b/Dockerfile
index 3aff1f6..cab497a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,6 +15,7 @@ RUN apt-get update && apt-get install -y \
     unzip \
     zip \
     jq \
+    ps \
     \
     # build-only deps (REMOVE LATER)
     build-essential \

From bfa4f504b67fba1982f4d24a937e04aad65d89ce Mon Sep 17 00:00:00 2001
From: dltamayo
Date: Wed, 7 Jan 2026 14:56:46 -0500
Subject: [PATCH 14/14] Update docker

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index cab497a..50f3cb3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,7 +15,7 @@ RUN apt-get update && apt-get install -y \
     unzip \
     zip \
     jq \
-    ps \
+    procps \
     \
     # build-only deps (REMOVE LATER)
     build-essential \
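A closing note on the last two patches: Debian ships no package named `ps`; the binary comes from `procps`, so PATCH 13's `apt-get install ... ps` would fail the image build, which PATCH 14 corrects. Having `procps` in the image also matters at runtime, since Nextflow shells out to `ps` inside task containers when collecting per-task metrics, for example when a trace report is enabled (a sketch; the output path is illustrative):

    // nextflow.config
    trace {
        enabled = true
        file    = 'results/pipeline_trace.txt'
    }
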