diff --git a/.github/workflows/build-push-container.yml b/.github/workflows/build-push-container.yml index 1e0a5d9..d4975b2 100644 --- a/.github/workflows/build-push-container.yml +++ b/.github/workflows/build-push-container.yml @@ -4,7 +4,11 @@ name: build-push-container # Configures this workflow to run every time a change is pushed to the branch called `release`. on: push: - branches: ['main'] + paths: + - '**/env.yml' + - '**/Dockerfile' + - '**/*.dockerfile' + - '.github/workflows/build-push-container.yml' workflow_dispatch: # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. @@ -39,6 +43,9 @@ jobs: uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=sha,format=short,prefix= # Generates a tag like '860c190' + main # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. diff --git a/Dockerfile b/Dockerfile index 148ae3a..50f3cb3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,45 @@ -FROM condaforge/miniforge3:24.9.2-0 +FROM mambaorg/micromamba:1.5.8 -# Copy the environment file into /tmp +# Ensure we run as root for apt +USER root + +# Update the conda base environment with required packages COPY env.yml /tmp/env.yml +WORKDIR /tmp -# Install system dependencies -RUN apt-get update \ - && apt-get install -y \ - build-essential \ - curl \ - gcc \ - g++ \ +RUN apt-get update && apt-get install -y \ + # runtime CLIs (KEEP) + curl \ + wget \ + git \ + unzip \ + zip \ + jq \ + procps \ + \ + # build-only deps (REMOVE LATER) + build-essential \ + gcc \ + g++ \ + && micromamba install -y -n base -f /tmp/env.yml \ + && micromamba clean -afy \ + \ + # R packages (need compilers) + && micromamba run -n base Rscript -e "remotes::install_github('HetzDra/turboGliph@2a5264b')" \ + && micromamba run -n base Rscript -e "remotes::install_github('kalaga27/tcrpheno@56f9372')" \ + \ + # R cleanup + && rm -rf /tmp/Rtmp* /root/.cache/R \ + \ + # REMOVE build deps ONLY + && apt-get purge -y \ + build-essential \ + gcc \ + g++ \ + && apt-get autoremove -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -# Update the conda base environment with required packages -WORKDIR /tmp -RUN conda env update -n base --file env.yml - # Install GIANA, patch shebang, symlink for PATH command availability RUN git init /opt/GIANA && \ cd /opt/GIANA && \ @@ -29,26 +52,20 @@ RUN git init /opt/GIANA && \ # Install quarto RUN mkdir -p /opt/quarto/1.6.42 \ - && curl -o quarto.tar.gz -L \ + && curl -o /tmp/quarto.tar.gz -L \ "https://github.com/quarto-dev/quarto-cli/releases/download/v1.6.42/quarto-1.6.42-linux-amd64.tar.gz" \ - && tar -zxvf quarto.tar.gz \ + && tar -zxvf /tmp/quarto.tar.gz \ -C "/opt/quarto/1.6.42" \ --strip-components=1 \ - && rm quarto.tar.gz + && rm /tmp/quarto.tar.gz -# Install R package not available via conda -RUN Rscript -e "remotes::install_github('HetzDra/turboGliph')" -RUN Rscript -e 
"remotes::install_github('kalaga27/tcrpheno')" - -# Install VDJmatch +# Install VDJmatch and symlink RUN mkdir -p /opt/vdjmatch/1.3.1 \ && curl -L -o vdjmatch.zip \ "https://github.com/antigenomics/vdjmatch/releases/download/1.3.1/vdjmatch-1.3.1.zip" \ && unzip vdjmatch.zip -d /opt/vdjmatch/1.3.1 \ - && rm vdjmatch.zip - -# symlink VDJmatch -RUN ln -s /opt/vdjmatch/1.3.1/vdjmatch-1.3.1/vdjmatch-1.3.1.jar /usr/local/bin/vdjmatch.jar + && rm vdjmatch.zip \ + && ln -s /opt/vdjmatch/1.3.1/vdjmatch-1.3.1/vdjmatch-1.3.1.jar /usr/local/bin/vdjmatch.jar # Add to PATH ENV PATH="/opt/quarto/1.6.42/bin:${PATH}" diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..44ec6b9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/bin/compare_calc.py b/bin/compare_calc.py index b86700c..81a83ef 100755 --- a/bin/compare_calc.py +++ b/bin/compare_calc.py @@ -1,133 +1,150 @@ #!/usr/bin/env python3 """ -Description: this script calculates overlap measures between TCR repertoires - -@author: Domenick Braccia +Description: Calculate overlap measures between TCR repertoires +Author: Dylan Tamayo, Domenick Braccia """ import argparse import pandas as pd import numpy as np -import os -import sys -import csv -from scipy.stats import entropy -from utils import jaccard_index, sorensen_index, morisita_horn_index #, jensen_shannon_distance - -print('-- ENTERED compare_calc.py--') -print('-- THE TIME IS: --' + str(pd.Timestamp.now())) - -# initialize parser -parser = argparse.ArgumentParser(description='Calculate clonality of a TCR repertoire') - -# add arguments -parser.add_argument('-s', '--sample_utf8', - metavar='sample_utf8', - type=str, - help='sample CSV file initially passed to nextflow run command') -# parser.add_argument('-m', '--meta_data', -# metavar='meta_data', -# type=str, -# help='metadata CSV file initially passed to nextflow run command') - -args = parser.parse_args() - -## Read in sample table CSV file -## convert metadata to list -s = args.sample_utf8 -sample_utf8 = pd.read_csv(args.sample_utf8, sep=',', header=0) -print('sample_utf8 looks like this: ' + str(sample_utf8)) -print('sample_utf8 columns: \n') -print(sample_utf8.columns) - -# Read in metadata table CSV file -# meta_data = pd.read_csv(args.meta_data, sep=',', header=0) -# print('meta_data looks like this: ' + str(meta_data)) -# print('meta_data columns: \n') -# print(meta_data.columns) - -# Import TCR count tables into dictionary of dataframes -files = sample_utf8['file'] -dfs = {} -for file in files: - # load data - df = pd.read_csv(file, sep='\t', header=0) - dfs[file] = df - -print('number of files in dfs: ' + str(len(dfs))) - -## calculate the jaccard index between each sample pair in dfs and store in an nxn matrix and write to file -samples = list(dfs.keys()) - -print('- calculating jaccard index... -') -jaccard_mat = np.zeros((len(samples), len(samples))) -for i, sample1 in enumerate(samples): - for j, sample2 in enumerate(samples): - # calculate jaccard index - value = jaccard_index(dfs[sample1]['junction_aa'], dfs[sample2]['junction_aa']) - # store in numpy array - jaccard_mat[i, j] = value - -# define column and index names -sample_names= [os.path.basename(sample).split('.')[0] for sample in samples] -jaccard_df = pd.DataFrame(jaccard_mat, columns=sample_names, index=sample_names) - -# save jacard_df to csv -jaccard_df.to_csv('jaccard_mat.csv', index=True, header=True) - -## calculate the sorensen index between each sample pair in dfs and store in an nxn matrix and write to file -print('- calculating sorensen index... -') -sorensen_mat = np.zeros((len(samples), len(samples))) -for i, sample1 in enumerate(samples): - for j, sample2 in enumerate(samples): - # calculate sorensen index - value = sorensen_index(dfs[sample1]['junction_aa'], dfs[sample2]['junction_aa']) - # store in numpy array - sorensen_mat[i, j] = value - -# define column and index names -sorensen_df = pd.DataFrame(sorensen_mat, columns=sample_names, index=sample_names) - -# save sorensen_df to csv -sorensen_df.to_csv('sorensen_mat.csv', index=True, header=True) - -## calculate the morisita index between each sample pair in dfs and store in an nxn matrix and write to file -print('- calculating morisita index... 
-') -morisita_mat = np.zeros((len(samples), len(samples))) -for i in range(len(samples)): - print('-- on sample ' + str(i) + ' --') - for j in range(i+1): - # calculate morisita index - value = morisita_horn_index(dfs, samples[i], samples[j]) - # store in numpy array - morisita_mat[i, j] = value - -# Copy the lower triangle to the upper triangle -morisita_mat = morisita_mat + morisita_mat.T - np.diag(morisita_mat.diagonal()) - -# define column and index names -morisita_df = pd.DataFrame(morisita_mat, columns=sample_names, index=sample_names) - -# save morisita_df to csv -morisita_df.to_csv('morisita_mat.csv', index=True, header=True) - -## calculate jensen shannon distance between each sample pair in dfs and store in an nxn matrix and write to file -# print('- calculating jensen shannon distance... -') -# jsd_mat = np.zeros((len(samples), len(samples))) -# for i, sample1 in enumerate(samples): -# for j, sample2 in enumerate(samples): -# # calculate jensen shannon distance -# value = jensen_shannon_distance(dfs[sample1][['junction_aa', 'duplicate_count']], dfs[sample2][['junction_aa', 'duplicate_count']]) -# # store in numpy array -# jsd_mat[i, j] = value - -# # Copy the lower triangle to the upper triangle -# jsd_mat = jsd_mat + jsd_mat.T - np.diag(jsd_mat.diagonal()) - -# # define column and index names -# jsd_df = pd.DataFrame(jsd_mat, columns=sample_names, index=sample_names) - -# # save jsd_df to csv -# jsd_df.to_csv('jsd_mat.csv', index=True, header=True) - -## ========================================================================== ## + +# ------------------------- +# Similarity functions +# ------------------------- +def jaccard_index(set1, set2): + union = len(set1 | set2) + return len(set1 & set2) / union if union else 0.0 + + +def sorensen_index(set1, set2): + denom = len(set1) + len(set2) + return (2 * len(set1 & set2) / denom) if denom else 0.0 + + +def morisita_horn_index(counts1, counts2): + X = counts1.sum() + Y = counts2.sum() + + if X == 0 or Y == 0: + return 0.0 + + prod_sum = np.sum(counts1 * counts2) + lambda1 = np.sum(counts1 ** 2) / (X ** 2) + lambda2 = np.sum(counts2 ** 2) / (Y ** 2) + + return (2 * prod_sum) / ((lambda1 + lambda2) * X * Y) + +if __name__ == "__main__": + # ------------------------- + # Argument parsing + # ------------------------- + parser = argparse.ArgumentParser( + description="Calculate overlap metrics for TCR repertoires" + ) + parser.add_argument( + "-s", "--sample_utf8", + required=True, + help="Samplesheet CSV passed from Nextflow" + ) + args = parser.parse_args() + + + # ------------------------- + # Load samplesheet + # ------------------------- + sample_df = pd.read_csv(args.sample_utf8) + + samples = sample_df["sample"].tolist() + files = sample_df["file"].tolist() + n = len(samples) + + print(f"Loaded {n} samples") + + # ------------------------- + # Preload data structures + # ------------------------- + junction_sets = {} + count_vectors = {} + + for sample, file in zip(samples, files): + df = pd.read_csv(file, sep="\t", usecols=["junction_aa", "duplicate_count"]) + df = df.dropna(subset=["junction_aa"]) + + # Set for presence/absence metrics + junction_sets[sample] = set(df["junction_aa"]) + + # Counts for Morisita–Horn + count_vectors[sample] = ( + df.groupby("junction_aa")["duplicate_count"] + .sum() + ) + + + # ------------------------- + # Align count vectors across union space + # ------------------------- + all_junctions = sorted( + set().union(*junction_sets.values()) + ) + + for sample in samples: + count_vectors[sample] = 
( + count_vectors[sample] + .reindex(all_junctions, fill_value=0) + .to_numpy() + ) + + + # ------------------------- + # Initialize matrices + # ------------------------- + jaccard_mat = np.zeros((n, n)) + sorensen_mat = np.zeros((n, n)) + morisita_mat = np.zeros((n, n)) + + + # ------------------------- + # Compute upper triangle only + # ------------------------- + print("Calculating overlap metrics...") + + for i in range(n): + s1 = samples[i] + set1 = junction_sets[s1] + counts1 = count_vectors[s1] + + # Diagonal + jaccard_mat[i, i] = 1.0 + sorensen_mat[i, i] = 1.0 + morisita_mat[i, i] = 1.0 + + for j in range(i + 1, n): + s2 = samples[j] + + j_val = jaccard_index(set1, junction_sets[s2]) + s_val = sorensen_index(set1, junction_sets[s2]) + m_val = morisita_horn_index(counts1, count_vectors[s2]) + + jaccard_mat[i, j] = jaccard_mat[j, i] = j_val + sorensen_mat[i, j] = sorensen_mat[j, i] = s_val + morisita_mat[i, j] = morisita_mat[j, i] = m_val + + + # ------------------------- + # Write outputs + # ------------------------- + index_names = samples + + pd.DataFrame( + jaccard_mat, index=index_names, columns=index_names + ).to_csv("jaccard_mat.csv") + + pd.DataFrame( + sorensen_mat, index=index_names, columns=index_names + ).to_csv("sorensen_mat.csv") + + pd.DataFrame( + morisita_mat, index=index_names, columns=index_names + ).to_csv("morisita_mat.csv") + + print("Finished writing all matrices") \ No newline at end of file diff --git a/bin/utils.py b/bin/utils.py deleted file mode 100755 index f4391f1..0000000 --- a/bin/utils.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 - -""" -Description: utility functions for plotting simple TCR repertoire statistics - -Authors: Domenick Braccia -""" - -## import packages -import time -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns -from scipy.spatial import distance - -def TicTocGenerator(): - # Generator that returns time differences - ti = 0 # initial time - tf = time.time() # final time - while True: - ti = tf - tf = time.time() - yield tf-ti # returns the time difference - -TicToc = TicTocGenerator() # create an instance of the TicTocGen generator - -# This will be the main function through which we define both tic() and toc() -def toc(tempBool=True): - # Prints the time difference yielded by generator instance TicToc - tempTimeInterval = next(TicToc) - if tempBool: - print( "Elapsed time: %f seconds.\n" %tempTimeInterval ) - -def tic(): - # Records a time in TicToc, marks the beginning of a time interval - toc(False) - -# Defining sample comparison functions -def jaccard_index(sample1, sample2): - set1 = set(sample1) - set2 = set(sample2) - intersection = len(set1.intersection(set2)) - union = len(set1.union(set2)) - return intersection / union - -def sorensen_index(sample1, sample2): - set1 = set(sample1) - set2 = set(sample2) - intersection = len(set1.intersection(set2)) - return 2 * intersection / (len(set1) + len(set2)) - -def morisita_horn_index(dfs, sample1, sample2): - # create sets of amino acid sequences - set1 = set(dfs[sample1]['junction_aa']) - set2 = set(dfs[sample2]['junction_aa']) - - # identify union of sets - union = set1.union(set2) - - # get counts of aa sequences in sample1 and sample2 - df1 = dfs[sample1].groupby('junction_aa')['duplicate_count'].sum().reindex(union).fillna(0) - df2 = dfs[sample2].groupby('junction_aa')['duplicate_count'].sum().reindex(union).fillna(0) - n1i = df1.values - n2i = df2.values - - # calculate product of counts - products = n1i * n2i - - # calculate 
simpson index values for sample1 and sample2 - print(type(df1)) - X = df1.sum() - Y = df2.sum() - - s1_si = sum(count**2 for count in df1)/(X**2) - s2_si = sum(count**2 for count in df2)/(Y**2) - - numerator = 2 * sum(products) - denominator = (s1_si + s2_si) * (X * Y) - return numerator / denominator - -def jensen_shannon_distance(sample1, sample2): - # Merge the two samples based on junction_aa column - merged = pd.merge(sample1, sample2, on='junction_aa', how='outer', suffixes=('_1', '_2')).fillna(0) - # Enter probability distributions into the distance function - return distance.jensenshannon(merged['duplicate_count_1'], merged['duplicate_count_2']) \ No newline at end of file diff --git a/conf/base.config b/conf/base.config index daf7fa4..07ce073 100644 --- a/conf/base.config +++ b/conf/base.config @@ -9,6 +9,7 @@ */ process { + container = "ghcr.io/karchinlab/tcrtoolkit:main" // TODO nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } @@ -63,4 +64,37 @@ process { maxRetries = 2 } +} + +// Function to ensure that resource requirements don't go beyond +// a maximum limit +def check_max(obj, type) { + if (type == 'memory') { + try { + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'time') { + try { + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'cpus') { + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj" + return obj + } + } } \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index 8fc2514..3e891f7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -11,6 +11,7 @@ */ process { + container = "ghcr.io/karchinlab/tcrtoolkit:main" publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, diff --git a/env.yml b/env.yml index 27ce717..331ef45 100644 --- a/env.yml +++ b/env.yml @@ -29,6 +29,7 @@ dependencies: # R and R packages - r-base=4.4.2 + - r-grr=0.9.5 - r-igraph=2.0.3 - r-pheatmap=1.0.12 - r-remotes=2.5.0 diff --git a/main.nf b/main.nf index a09fdb0..c84b10f 100644 --- a/main.nf +++ b/main.nf @@ -27,18 +27,6 @@ workflow { TCRTOOLKIT() } -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - COMPLETION EMAIL AND SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -workflow.onComplete { - - log.info(workflow.success ? "All done!" 
: "Please check your inputs.") - -} - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END diff --git a/modules/local/airr_convert/convert_adaptive.nf b/modules/local/airr_convert/convert_adaptive.nf index aa818cc..a476bb1 100644 --- a/modules/local/airr_convert/convert_adaptive.nf +++ b/modules/local/airr_convert/convert_adaptive.nf @@ -1,7 +1,6 @@ process CONVERT_ADAPTIVE { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/airr_convert/pseudobulk_cellranger.nf b/modules/local/airr_convert/pseudobulk_cellranger.nf index ec1916c..653b2a9 100644 --- a/modules/local/airr_convert/pseudobulk_cellranger.nf +++ b/modules/local/airr_convert/pseudobulk_cellranger.nf @@ -1,7 +1,6 @@ process PSEUDOBULK_CELLRANGER { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/compare/compare_calc.nf b/modules/local/compare/compare_calc.nf index a6d0a60..2206fdb 100644 --- a/modules/local/compare/compare_calc.nf +++ b/modules/local/compare/compare_calc.nf @@ -1,6 +1,5 @@ process COMPARE_CALC { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path sample_utf8 diff --git a/modules/local/compare/compare_concatenate.nf b/modules/local/compare/compare_concatenate.nf index a05e50c..c955013 100644 --- a/modules/local/compare/compare_concatenate.nf +++ b/modules/local/compare/compare_concatenate.nf @@ -1,6 +1,5 @@ process COMPARE_CONCATENATE { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path samplesheet_utf8 diff --git a/modules/local/compare/compare_plot.nf b/modules/local/compare/compare_plot.nf index 471ddaa..1eb5792 100644 --- a/modules/local/compare/compare_plot.nf +++ b/modules/local/compare/compare_plot.nf @@ -1,6 +1,5 @@ process COMPARE_PLOT { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path sample_utf8 @@ -24,7 +23,6 @@ process COMPARE_PLOT { quarto render compare_stats.qmd \ -P project_name:$project_name \ -P workflow_cmd:'$workflow.commandLine' \ - -P project_dir:$projectDir \ -P jaccard_mat:$jaccard_mat \ -P sorensen_mat:$sorensen_mat \ -P morisita_mat:$morisita_mat \ diff --git a/modules/local/compare/giana.nf b/modules/local/compare/giana.nf index 561c1b3..7140448 100644 --- a/modules/local/compare/giana.nf +++ b/modules/local/compare/giana.nf @@ -1,6 +1,5 @@ process GIANA_CALC { label 'process_medium' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path concat_cdr3 diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf index 486442d..68dd080 100644 --- a/modules/local/compare/gliph2.nf +++ b/modules/local/compare/gliph2.nf @@ -2,7 +2,6 @@ process GLIPH2_TURBOGLIPH { label 'process_high' label 'process_high_compute' label 'process_high_memory' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path concat_cdr3 @@ -53,7 +52,6 @@ process GLIPH2_TURBOGLIPH { process GLIPH2_PLOT { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path gliph2_report_template @@ -77,7 +75,6 @@ process GLIPH2_PLOT { quarto render gliph2_report.qmd \ -P project_name:$params.project_name \ -P workflow_cmd:'$workflow.commandLine' \ - -P project_dir:$projectDir \ -P results_dir:'./' \ # -P clusters:$cluster_member_details \ diff --git a/modules/local/compare/tcrsharing.nf 
b/modules/local/compare/tcrsharing.nf index ff68619..2bc5e1f 100644 --- a/modules/local/compare/tcrsharing.nf +++ b/modules/local/compare/tcrsharing.nf @@ -1,6 +1,5 @@ process TCRSHARING_CALC { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path concat_cdr3 @@ -73,7 +72,6 @@ process TCRSHARING_CALC { process TCRSHARING_HISTOGRAM { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path shared_cdr3 @@ -114,7 +112,6 @@ process TCRSHARING_HISTOGRAM { process TCRSHARING_SCATTERPLOT { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path shared_cdr3 diff --git a/modules/local/sample/convergence.nf b/modules/local/sample/convergence.nf index 1e077ac..962d041 100644 --- a/modules/local/sample/convergence.nf +++ b/modules/local/sample/convergence.nf @@ -1,7 +1,6 @@ process CONVERGENCE { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/sample/olga.nf b/modules/local/sample/olga.nf index dfc71f2..d7b76ba 100644 --- a/modules/local/sample/olga.nf +++ b/modules/local/sample/olga.nf @@ -1,7 +1,6 @@ process OLGA_PGEN_CALC { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) @@ -56,7 +55,6 @@ process OLGA_PGEN_CALC { process OLGA_HISTOGRAM_CALC { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(olga_pgen) @@ -113,7 +111,6 @@ process OLGA_HISTOGRAM_CALC { process OLGA_HISTOGRAM_PLOT { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(olga_histogram) @@ -172,7 +169,6 @@ process OLGA_HISTOGRAM_PLOT { process OLGA_WRITE_MAX { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: val olga_global_xmin diff --git a/modules/local/sample/sample_aggregate.nf b/modules/local/sample/sample_aggregate.nf index bb4a293..a1c9c55 100644 --- a/modules/local/sample/sample_aggregate.nf +++ b/modules/local/sample/sample_aggregate.nf @@ -1,7 +1,6 @@ process SAMPLE_AGGREGATE { tag "${output_file}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path csv_files @@ -12,13 +11,16 @@ process SAMPLE_AGGREGATE { script: """ - python3 < aggregate.py < 256.GB) - return 16 * task.attempt - else if (task.memory > 64.GB) - return 8 * task.attempt - else if (task.memory > 4.GB) - return 4 * task.attempt - else - return 2 * task.attempt - } + task.memory > 256.GB ? 16 * task.attempt : + task.memory > 64.GB ? 8 * task.attempt : + task.memory > 4.GB ? 4 * task.attempt : + 2 * task.attempt + } + + memory { - def sz = count_table.size() - def mb = 1024 * 1024 - if (sz > 26 * mb) - return 512.GB * task.attempt - else if (sz > 20 * mb) - return 256.GB * task.attempt - else if (sz > 10 * mb) - return 128.GB * task.attempt - else if (sz > 4 * mb) - return 64.GB * task.attempt - else if (sz > 2 * mb) - return 16.GB * task.attempt - else - return 4.GB * task.attempt + count_table.size() > 26 * 1024**2 ? 512.GB * task.attempt : + count_table.size() > 20 * 1024**2 ? 256.GB * task.attempt : + count_table.size() > 10 * 1024**2 ? 128.GB * task.attempt : + count_table.size() > 4 * 1024**2 ? 64.GB * task.attempt : + count_table.size() > 2 * 1024**2 ? 
16.GB * task.attempt : + 4.GB * task.attempt } input: @@ -42,7 +31,6 @@ process TCRDIST3_MATRIX { script: """ - # Run tcrdist3 on input tcrdist3_matrix.py ${count_table} ${sample_meta.sample} ${matrix_sparsity} ${distance_metric} ${ref_db} ${task.cpus} """ } @@ -50,7 +38,6 @@ process TCRDIST3_MATRIX { process TCRDIST3_HISTOGRAM_CALC { tag "${sample_meta.sample}" label 'process_high' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(distance_matrix) @@ -131,7 +118,6 @@ process TCRDIST3_HISTOGRAM_CALC { process TCRDIST3_HISTOGRAM_PLOT { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(histogram_data) diff --git a/modules/local/sample/tcrpheno.nf b/modules/local/sample/tcrpheno.nf index e71ed63..f4be077 100644 --- a/modules/local/sample/tcrpheno.nf +++ b/modules/local/sample/tcrpheno.nf @@ -1,7 +1,6 @@ process TCRPHENO { tag "${sample_meta.sample}" label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/sample/tcrspecificity.nf b/modules/local/sample/tcrspecificity.nf index e92cdb6..30ce184 100644 --- a/modules/local/sample/tcrspecificity.nf +++ b/modules/local/sample/tcrspecificity.nf @@ -1,6 +1,5 @@ process VDJDB_GET { label 'process_low' - container "ghcr.io/karchinlab/tcrtoolkit:main" output: path("vdjdb-2025-02-21/"), emit: ref_db @@ -15,7 +14,6 @@ process VDJDB_GET { process VDJDB_VDJMATCH { tag "${sample_meta.sample}" label 'process_medium' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/samplesheet/samplesheet_check.nf b/modules/local/samplesheet/samplesheet_check.nf index 527ed9c..04bd0d7 100644 --- a/modules/local/samplesheet/samplesheet_check.nf +++ b/modules/local/samplesheet/samplesheet_check.nf @@ -1,7 +1,6 @@ process SAMPLESHEET_CHECK { tag "${samplesheet}" label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: path samplesheet diff --git a/modules/local/samplesheet/samplesheet_resolve.nf b/modules/local/samplesheet/samplesheet_resolve.nf index 61ca0c1..4dc0746 100644 --- a/modules/local/samplesheet/samplesheet_resolve.nf +++ b/modules/local/samplesheet/samplesheet_resolve.nf @@ -1,8 +1,8 @@ process SAMPLESHEET_RESOLVE { label 'process_single' - container "ghcr.io/karchinlab/tcrtoolkit:main" input: + path samplesheet_utf8 val(resolved_rows) // List of tab-separated strings val(resolved_header) // Comma-separated header line @@ -11,10 +11,37 @@ process SAMPLESHEET_RESOLVE { script: """ - echo \"$resolved_header\" > samplesheet_resolved.csv +# Write resolved rows to a temp file +cat << 'EOF' > resolved.tmp +${resolved_rows.join('\n')} +EOF - for row in ${resolved_rows.collect{"\"${it}\""}.join(' ')}; do - echo -e "\$row" >> samplesheet_resolved.csv - done +# Emit header +cat << 'EOF' > samplesheet_resolved.csv +${resolved_header} +EOF + +# Two-pass awk: +# - pass 1: read original samplesheet, store sample order +# - pass 2: read resolved rows, store rows by sample +awk -F',' ' + NR==FNR { + if (FNR > 1) order[++n] = \$1 + next + } + { + resolved[\$1] = \$0 + } + END { + for (i = 1; i <= n; i++) { + s = order[i] + if (!(s in resolved)) { + printf "ERROR: missing resolved row for %s\\n", s > "/dev/stderr" + exit 1 + } + print resolved[s] + } + } +' "${samplesheet_utf8}" resolved.tmp >> samplesheet_resolved.csv """ } \ No newline at end of file diff --git a/nextflow.config 
b/nextflow.config index f685ead..b569dcb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -7,7 +7,14 @@ docker { // Load base.config by default for all pipelines includeConfig 'conf/base.config' +plugins { + id 'nf-schema@2.6.1' +} + params { + samplesheet = null + outdir = 'out' + publish_dir_mode = 'copy' // Max resource options @@ -16,7 +23,7 @@ params { max_cpus = 192 max_time = '48.h' - input_format = "airr" + input_format = "airr" // cellranger, adaptive airr_schema = "${projectDir}/assets/airr/airr_rearrangement_schema.json" imgt_lookup = "${projectDir}/assets/airr/imgt_adaptive_lookup.tsv" @@ -57,36 +64,3 @@ params { } includeConfig 'conf/modules.config' - -// Function to ensure that resource requirements don't go beyond -// a maximum limit -def check_max(obj, type) { - if (type == 'memory') { - try { - if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) - return params.max_memory as nextflow.util.MemoryUnit - else - return obj - } catch (all) { - println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'time') { - try { - if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) - return params.max_time as nextflow.util.Duration - else - return obj - } catch (all) { - println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'cpus') { - try { - return Math.min( obj, params.max_cpus as int ) - } catch (all) { - println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj" - return obj - } - } -} \ No newline at end of file diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 0000000..e81a2e5 --- /dev/null +++ b/nextflow_schema.json @@ -0,0 +1,187 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/break-through-cancer/tcrtoolkit-pipeline/main/nextflow_schema.json", + "title": "tcrtoolkit pipeline parameters", + "description": "BTC TCR Toolkit pipeline", + "type": "object", + + "$defs": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["samplesheet", "outdir"], + "properties": { + "samplesheet": { + "type": "string", + "format": "file-path", + "pattern": ".*.csv$", + "description": "Path to the samplesheet describing input AIRR data.", + "help_text": "A CSV of samples and metadata for this TCR analysis.", + "fa_icon": "fas fa-file-csv" + }, + "outdir": { + "type": "string", + "format": "directory-path", + "default": "out", + "description": "Output directory where results will be saved.", + "fa_icon": "fas fa-folder-open" + } + } + }, + + "resource_options": { + "title": "Max resource options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "properties": { + "max_cpus": { + "type": "integer", + "default": 192, + "description": "Maximum CPUs that can be requested by any process.", + "fa_icon": "fas fa-microchip" + }, + "max_memory": { + "type": "string", + "default": "768.GB", + "pattern": "^\\d+(?:\\.\\d+|\\.)?\\s*(?:[KMGT]?B|[KMGT])$", + "description": "Maximum memory for any process.", + "fa_icon": "fas fa-memory" + }, + "max_time": { + "type": "string", + "default": "48.h", + "pattern": 
"^\\d+(?:\\.\\d+|\\.)?\\s*(s|m|h|d)$", + "description": "Maximum walltime for any job.", + "fa_icon": "far fa-clock" + } + } + }, + + "workflow_options": { + "title": "Workflow parameters", + "type": "object", + "fa_icon": "fas fa-project-diagram", + "description": "General pipeline workflow settings.", + "properties": { + "workflow_level": { + "type": "string", + "default": "sample,compare", + "enum": ["sample,compare", "sample", "compare", "convert"], + "description": "Comma-separated workflow stages (sample, compare)." + }, + "project_name": { + "type": "string", + "description": "Name of this analysis project." + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "enum": ["copy", "move", "link", "symlink"], + "description": "Method used by `publishDir` to save outputs." + } + } + }, + + "airr_options": { + "title": "AIRR data options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Parameters related to AIRR format and schema references.", + "properties": { + "input_format": { + "type": "string", + "default": "airr", + "enum": ["airr", "adaptive", "cellranger"], + "description": "Input data format." + }, + "airr_schema": { + "type": "string", + "description": "Path to AIRR rearrangement schema JSON." + }, + "imgt_lookup": { + "type": "string", + "description": "Path to imgt lookup table." + }, + "sample_stats_template": { + "type": "string", + "description": "Path to sample notebook template." + }, + "compare_stats_template": { + "type": "string", + "description": "Path to compare notebook template." + } + } + }, + + "plotting_options": { + "title": "Plotting and metadata options", + "type": "object", + "fa_icon": "fas fa-chart-bar", + "description": "Parameters for plotting and metadata columns.", + "properties": { + "samplechart_x_col": { "type": "string", "default": "timepoint" }, + "samplechart_color_col": { "type": "string", "default": "origin" }, + "vgene_subject_col": { "type": "string", "default": "subject_id" }, + "vgene_x_cols": { "type": "string", "default": "origin,timepoint" } + } + }, + + "giana_options": { + "title": "GIANA clustering options", + "type": "object", + "fa_icon": "fas fa-brain", + "properties": { + "threshold": { "type": "number", "default": 7.0 }, + "threshold_score": { "type": "number", "default": 3.6 }, + "threshold_vgene": { "type": "number", "default": 3.7 } + } + }, + + "gliph2_options": { + "title": "GLIPH2 clustering options", + "type": "object", + "fa_icon": "fas fa-code-branch", + "properties": { + "gliph2_report_template": { "type": "string" }, + "ref_files": { "type": "string" }, + "local_min_pvalue": { "type": "string", "default": "0.001" }, + "p_depth": { "type": "string", "default": "1000" }, + "global_convergence_cutoff": { "type": "string", "default": "1" }, + "simulation_depth": { "type": "string", "default": "1000" }, + "kmer_min_depth": { "type": "string", "default": "3" }, + "local_min_OVE": { "type": "string", "default": "c(1000, 100, 10)" }, + "algorithm": { "type": "string", "default": "GLIPH2" }, + "all_aa_interchangeable": { "type": "string", "default": "1" } + } + }, + + "tcrdist3_options": { + "title": "TCRdist3 distance options", + "type": "object", + "fa_icon": "fas fa-ruler-combined", + "properties": { + "matrix_sparsity": { + "type": "string", + "default": "sparse", + "enum": ["sparse", "full"] + }, + "distance_metric": { "type": "string", "default": "tcrdist" }, + "db_path": { "type": "string" } + } + } + }, + + "allOf": [ + { "$ref": "#/$defs/input_output_options" }, + { "$ref": 
"#/$defs/resource_options" }, + { "$ref": "#/$defs/workflow_options" }, + { "$ref": "#/$defs/airr_options" }, + { "$ref": "#/$defs/plotting_options" }, + { "$ref": "#/$defs/giana_options" }, + { "$ref": "#/$defs/gliph2_options" }, + { "$ref": "#/$defs/tcrdist3_options" } + ] +} \ No newline at end of file diff --git a/notebooks/compare_stats_template.qmd b/notebooks/compare_stats_template.qmd index 4df9221..01b2c66 100644 --- a/notebooks/compare_stats_template.qmd +++ b/notebooks/compare_stats_template.qmd @@ -30,7 +30,6 @@ Thank you for using TCRtoolkit! This report is generated from sample data and me #Default inputs are overwritten at the command line in `modules/local/plot_sample.nf` workflow_cmd='' project_name='path/to/project_name' -project_dir='path/to/project_dir' jaccard_mat='path/to/jaccard_mat.csv' sorensen_mat='path/to/sorensen_mat.csv' morisita_mat='path/to/morisita_mat.csv' @@ -57,43 +56,18 @@ import seaborn as sns print('Pipeline information and parameters:' + '\n') print('Project Name: ' + project_name) print('Workflow command: ' + workflow_cmd) -print('Pipeline Directory: ' + project_dir) print('Date and time: ' + str(datetime.datetime.now())) -# 3. Importing custom plotting functions -## Plotting functions are defined in `bin/utils.py`. -# sys.path.append(project_dir + '/bin/') -# source_file = os.path.join(project_dir, 'bin', 'utils.py') -# destination_file = os.path.join(os.getcwd(), 'utils.py') -# shutil.copyfile(source_file, destination_file) -# from utils import TicTocGenerator, tic, toc -# TicToc = TicTocGenerator() - -# 4. Importing similarity data -## 4a. jaccard similarity matrix +# 3. Importing similarity data +## 3a. jaccard similarity matrix jaccard_df = pd.read_csv(jaccard_mat, sep=',', header=0, index_col=0) -## 4b. sorensen similarity matrix +## 3b. sorensen similarity matrix sorensen_df = pd.read_csv(sorensen_mat, sep=',', header=0, index_col=0) -## 4c. morisita similarity matrix +## 3c. morisita similarity matrix morisita_df = pd.read_csv(morisita_mat, sep=',', header=0, index_col=0) -## 4d. jensen-shannon matrix -# jsd_df = pd.read_csv(jsd_mat, sep=',', header=0, index_col=0) - -# 5. Importing sample level counts -# sample_utf8 = pd.read_csv(sample_utf8, sep=',', header=0, index_col=0) -# files = sample_utf8['file'] -# dfs = {} -# for file in files: -# # load data -# df = pd.read_csv(file, sep='\t', header=0) - -# # Rename columns -# df = df.rename(columns={'count (templates/reads)': 'read_count', 'frequencyCount (%)': 'frequency'}) -# sample_id = os.path.basename(file).split('.')[0] -# dfs[sample_id] = df ``` # Analysis diff --git a/notebooks/gliph2_report_template.qmd b/notebooks/gliph2_report_template.qmd index 5d48831..e39d248 100644 --- a/notebooks/gliph2_report_template.qmd +++ b/notebooks/gliph2_report_template.qmd @@ -30,7 +30,6 @@ Thank you for using TCRtoolkit! 
This report is generated from sample data and me #Default inputs are overwritten at the command line in `modules/local/plot_gliph2.nf` workflow_cmd='' project_name='path/to/project_name' -project_dir='path/to/project_dir' clusters='path/to/{project_name}_cluster.csv' cluster_stats='path/to/{project_name}_cluster.txt' ``` @@ -54,5 +53,4 @@ import seaborn as sns print('Pipeline information and parameters:' + '\n') print('Project Name: ' + project_name) print('Workflow command: ' + workflow_cmd) -print('Pipeline Directory: ' + project_dir) print('Date and time: ' + str(datetime.datetime.now())) diff --git a/notebooks/sample_stats_template.qmd b/notebooks/sample_stats_template.qmd index 5a97598..a85d7ef 100644 --- a/notebooks/sample_stats_template.qmd +++ b/notebooks/sample_stats_template.qmd @@ -32,7 +32,6 @@ Thank you for using TCRtoolkit! This report is generated from sample data and me workflow_cmd='' project_name='' -project_dir='' sample_table='' sample_stats_csv='' v_family_csv='' @@ -70,15 +69,9 @@ warnings.filterwarnings( print('Project Name: ' + project_name) print('Workflow command: ' + workflow_cmd) -print('Pipeline Directory: ' + project_dir) print('Date and time: ' + str(datetime.datetime.now())) -# 3. Importing custom plotting functions -# sys.path.append(project_dir + '/bin/') -# from utils import TicTocGenerator, tic, toc -# TicToc = TicTocGenerator() - -# 4. Loading data +# 3. Loading data ## reading sample metadata meta = pd.read_csv(sample_table, sep=',') meta_cols = meta.columns.tolist() diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 08a2fdb..b197003 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -19,7 +19,7 @@ workflow INPUT_CHECK { samplesheet_utf8 .splitCsv(header: true, sep: ',') .map { row -> - def meta = row.findAll { k, v -> k != 'file' } // everything except the file column + def meta = row.findAll { k, _v -> k != 'file' } // everything except the file column def file_obj = file(row.file) return [meta, file_obj] } diff --git a/subworkflows/local/resolve_samplesheet.nf b/subworkflows/local/resolve_samplesheet.nf index 98abe4d..e6d8c06 100644 --- a/subworkflows/local/resolve_samplesheet.nf +++ b/subworkflows/local/resolve_samplesheet.nf @@ -28,12 +28,13 @@ workflow RESOLVE_SAMPLESHEET { .splitCsv(header: true, sep: ',') .first() .map { row -> - def header = row.keySet().findAll { it != 'file' } + ['file'] + def header = row.keySet().findAll { header_col -> header_col != 'file' } + ['file'] return header.join(',') // <-- convert to string } .set { resolved_header } SAMPLESHEET_RESOLVE( + samplesheet_utf8, resolved_rows, resolved_header ) diff --git a/subworkflows/local/sample.nf b/subworkflows/local/sample.nf index 9278537..c63bbe6 100644 --- a/subworkflows/local/sample.nf +++ b/subworkflows/local/sample.nf @@ -61,13 +61,11 @@ workflow SAMPLE { ) TCRDIST3_MATRIX.out.max_matrix_value - .map { it.text.trim().toDouble() } + .map { tcrdist_xmax -> tcrdist_xmax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { global_x_max_value } - - // Use `global_max_value` in downstream processes or print it - global_x_max_value.view { "Global x max matrix value: $it" } + global_x_max_value.view { global_xmax -> "Global x max matrix value: $global_xmax" } TCRDIST3_HISTOGRAM_CALC( TCRDIST3_MATRIX.out.tcrdist_output, @@ -77,13 +75,11 @@ workflow SAMPLE { ) TCRDIST3_HISTOGRAM_CALC.out.max_histogram_count - .map { it.text.trim().toDouble() } + .map { tcrdist_ymax -> 
tcrdist_ymax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { global_y_max_value } - - // Use `global_max_value` in downstream processes or print it - global_y_max_value.view { "Global y max matrix value: $it" } + global_y_max_value.view { global_ymax -> "Global y max matrix value: $global_ymax" } TCRDIST3_HISTOGRAM_PLOT( TCRDIST3_HISTOGRAM_CALC.out.histogram_data, @@ -93,27 +89,27 @@ workflow SAMPLE { OLGA_PGEN_CALC ( sample_map ) OLGA_PGEN_CALC.out.olga_xmin - .map { it.text.trim().toDouble() } + .map { xmin -> xmin.text.trim().toDouble() } .collect() .map { values -> values.min() } .set { olga_x_min_value } - olga_x_min_value.view { "Olga x min matrix value: $it" } + olga_x_min_value.view { olga_xmin -> "Olga x min matrix value: $olga_xmin" } OLGA_PGEN_CALC.out.olga_xmax - .map { it.text.trim().toDouble() } + .map { xmax -> xmax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { olga_x_max_value } - olga_x_max_value.view { "Olga x max matrix value: $it" } + olga_x_max_value.view { olga_xmax -> "Olga x max matrix value: $olga_xmax" } OLGA_HISTOGRAM_CALC ( OLGA_PGEN_CALC.out.olga_pgen, olga_x_min_value, olga_x_max_value ) OLGA_HISTOGRAM_CALC.out.olga_ymax - .map { it.text.trim().toDouble() } + .map { ymax -> ymax.text.trim().toDouble() } .collect() .map { values -> values.max() } .set { olga_y_max_value } - olga_y_max_value.view { "Olga y max matrix value: $it" } + olga_y_max_value.view { olga_ymax -> "Olga y max matrix value: $olga_ymax" } OLGA_HISTOGRAM_PLOT( OLGA_HISTOGRAM_CALC.out.olga_histogram, olga_y_max_value ) diff --git a/subworkflows/local/validate_params.nf b/subworkflows/local/validate_params.nf new file mode 100644 index 0000000..b4b2f22 --- /dev/null +++ b/subworkflows/local/validate_params.nf @@ -0,0 +1,8 @@ +include { validateParameters; paramsSummaryLog } from 'plugin/nf-schema' + +workflow VALIDATE_PARAMS { + + main: + validateParameters() + log.info paramsSummaryLog(workflow) +} \ No newline at end of file diff --git a/workflows/tcrtoolkit.nf b/workflows/tcrtoolkit.nf index 056a2d8..5329576 100644 --- a/workflows/tcrtoolkit.nf +++ b/workflows/tcrtoolkit.nf @@ -1,16 +1,3 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE & PRINT PARAMETER SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -// Validate pipeline parameters -def checkPathParamList = [ params.samplesheet] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } - -// Check mandatory parameters -if (params.samplesheet) { samplesheet = file(params.samplesheet) } else { exit 1, 'Samplesheet not specified. Please, provide a --samplesheet=/path/to/samplesheet.csv !' } -if (params.outdir) { outdir = params.outdir } else { exit 1, 'Output directory not specified. Please, provide a --outdir=/path/to/outdir !' 
} /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -27,6 +14,7 @@ include { AIRR_CONVERT } from '../subworkflows/local/airr_convert' include { RESOLVE_SAMPLESHEET } from '../subworkflows/local/resolve_samplesheet' include { SAMPLE } from '../subworkflows/local/sample' include { COMPARE } from '../subworkflows/local/compare' +include { VALIDATE_PARAMS } from '../subworkflows/local/validate_params' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -36,6 +24,7 @@ include { COMPARE } from '../subworkflows/local/compare' workflow TCRTOOLKIT { + VALIDATE_PARAMS() println("Running TCRTOOLKIT workflow...") @@ -80,18 +69,6 @@ workflow TCRTOOLKIT { } } -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - COMPLETION EMAIL AND SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -// workflow.onComplete { - -// log.info(workflow.success ? "Finished tcrtoolkit!" : "Please check your inputs.") - -// } - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END
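
Note on the rewritten bin/compare_calc.py: a minimal toy check of the three overlap functions it now defines (jaccard_index, sorensen_index, morisita_horn_index), assuming bin/ is on PYTHONPATH so the module can be imported; the __main__ guard keeps the CLI from running on import, and the repertoire values below are made up for illustration.

#!/usr/bin/env python3
# Toy sanity check of the overlap metrics defined in bin/compare_calc.py.
import numpy as np
import pandas as pd

from compare_calc import jaccard_index, sorensen_index, morisita_horn_index

# Two tiny repertoires keyed by CDR3 amino-acid sequence (made-up counts).
counts_a = pd.Series({"CASSLGF": 10, "CASSPGQ": 5, "CASRDNF": 1})
counts_b = pd.Series({"CASSLGF": 8, "CASSIRT": 2})

# Presence/absence metrics operate on sets of junction_aa values.
set_a, set_b = set(counts_a.index), set(counts_b.index)
print(jaccard_index(set_a, set_b))   # 1 shared / 4 total = 0.25
print(sorensen_index(set_a, set_b))  # 2 * 1 / (3 + 2) = 0.4

# Morisita-Horn expects count vectors aligned over the union of junctions,
# exactly as compare_calc.py builds them with reindex(..., fill_value=0).
union = sorted(set_a | set_b)
vec_a = counts_a.reindex(union, fill_value=0).to_numpy()
vec_b = counts_b.reindex(union, fill_value=0).to_numpy()
print(morisita_horn_index(vec_a, vec_b))

# A repertoire compared with itself should score 1.0, matching the
# hard-coded diagonal in the script.
assert np.isclose(morisita_horn_index(vec_a, vec_a), 1.0)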
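
End-to-end smoke test for the same script: a sketch that writes two toy AIRR-style TSVs plus a samplesheet with the 'sample' and 'file' columns compare_calc.py reads, then invokes it. It assumes it is run from the repository root; the sample names, sequences, and counts are invented.

# Smoke test for bin/compare_calc.py (run from the repo root).
import subprocess
import pandas as pd

for name, rows in {
    "s1": [("CASSLGF", 10), ("CASSPGQ", 5)],
    "s2": [("CASSLGF", 8), ("CASSIRT", 2)],
}.items():
    # Minimal AIRR-style rearrangement table with the two columns the script uses.
    pd.DataFrame(rows, columns=["junction_aa", "duplicate_count"]) \
        .to_csv(f"{name}.tsv", sep="\t", index=False)

# Samplesheet with the columns compare_calc.py expects.
pd.DataFrame({"sample": ["s1", "s2"], "file": ["s1.tsv", "s2.tsv"]}) \
    .to_csv("samplesheet.csv", index=False)

subprocess.run(["python3", "bin/compare_calc.py", "-s", "samplesheet.csv"], check=True)
# Expected outputs in the working directory:
#   jaccard_mat.csv, sorensen_mat.csv, morisita_mat.csv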
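
Note on the TCRDIST3_MATRIX resource directives: the size-to-memory tiers that the new ternary chain applies to count_table.size() are written out below in Python purely so the breakpoints are easy to read; this helper is illustrative only and is not part of the pipeline.

# Illustration of the memory tiers in modules/local/sample/tcrdist3.nf.
MB = 1024 ** 2

def tcrdist3_memory_gb(count_table_bytes, attempt=1):
    # Tiers mirror the ternary chain: (> threshold bytes, GB per attempt).
    tiers = [(26 * MB, 512), (20 * MB, 256), (10 * MB, 128),
             (4 * MB, 64), (2 * MB, 16)]
    for threshold, gb in tiers:
        if count_table_bytes > threshold:
            return gb * attempt
    return 4 * attempt

print(tcrdist3_memory_gb(25 * MB))     # 256 (falls in the >20 MB tier)
print(tcrdist3_memory_gb(3 * MB, 2))   # 32  (>2 MB tier, second attempt)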
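
Note on SAMPLESHEET_RESOLVE: the two-pass awk re-emits resolved rows in the order samples appear in the original samplesheet and fails hard if one is missing. The sketch below restates that logic in Python for readability only; it keys rows on a 'sample' column assumed to be the first field (the awk uses $1 with -F','), and assumes resolved rows are comma-separated, which may differ from the tab-separated rows the process comment describes.

# Illustration of the reorder-and-check logic in SAMPLESHEET_RESOLVE.
import csv
import sys

def reorder_resolved(samplesheet_csv, resolved_rows):
    """Emit resolved rows in original samplesheet order; error if one is missing."""
    with open(samplesheet_csv, newline="") as fh:
        order = [row["sample"] for row in csv.DictReader(fh)]
    resolved = {line.split(",", 1)[0]: line for line in resolved_rows}
    missing = [s for s in order if s not in resolved]
    if missing:
        sys.exit(f"ERROR: missing resolved row for {missing[0]}")
    return [resolved[s] for s in order]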