diff --git a/bin/compare_calc.py b/bin/compare_calc.py index 81a83ef..4f611b7 100755 --- a/bin/compare_calc.py +++ b/bin/compare_calc.py @@ -52,8 +52,13 @@ def morisita_horn_index(counts1, counts2): # ------------------------- # Load samplesheet # ------------------------- - sample_df = pd.read_csv(args.sample_utf8) + sample_df = pd.read_csv(args.sample_utf8, index_col=None) + # Basic hygiene: drop rows missing sample or file + sample_df = sample_df.dropna(subset=["sample", "file"]) + print(sample_df.head()) + print(sample_df.columns.tolist()) + samples = sample_df["sample"].tolist() files = sample_df["file"].tolist() n = len(samples) @@ -64,44 +69,46 @@ def morisita_horn_index(counts1, counts2): # Preload data structures # ------------------------- junction_sets = {} - count_vectors = {} + count_series = {} for sample, file in zip(samples, files): df = pd.read_csv(file, sep="\t", usecols=["junction_aa", "duplicate_count"]) df = df.dropna(subset=["junction_aa"]) + # Ensure counts are numeric + df["duplicate_count"] = pd.to_numeric(df["duplicate_count"], errors="coerce").fillna(0) + # Set for presence/absence metrics junction_sets[sample] = set(df["junction_aa"]) - # Counts for Morisita–Horn - count_vectors[sample] = ( + # Counts for Morisita–Horn as a pandas Series (index = junction_aa) + counts = ( df.groupby("junction_aa")["duplicate_count"] .sum() ) - + # Ensure we have a Series with a named index + if not isinstance(counts, pd.Series): + counts = pd.Series(counts) + count_series[sample] = counts # ------------------------- - # Align count vectors across union space + # Build union index and align counts # ------------------------- - all_junctions = sorted( - set().union(*junction_sets.values()) - ) + union_set = set().union(*junction_sets.values()) if junction_sets else set() + all_junctions = pd.Index(sorted(union_set)) + aligned_vectors = {} for sample in samples: - count_vectors[sample] = ( - count_vectors[sample] - .reindex(all_junctions, fill_value=0) - .to_numpy() - ) - + aligned = count_series[sample].reindex(all_junctions, fill_value=0) + # Store as numpy for MH computation + aligned_vectors[sample] = aligned.to_numpy(dtype=float) # ------------------------- # Initialize matrices # ------------------------- - jaccard_mat = np.zeros((n, n)) - sorensen_mat = np.zeros((n, n)) - morisita_mat = np.zeros((n, n)) - + jaccard_mat = np.zeros((n, n), dtype=float) + sorensen_mat = np.zeros((n, n), dtype=float) + morisita_mat = np.zeros((n, n), dtype=float) # ------------------------- # Compute upper triangle only @@ -111,7 +118,7 @@ def morisita_horn_index(counts1, counts2): for i in range(n): s1 = samples[i] set1 = junction_sets[s1] - counts1 = count_vectors[s1] + counts1 = aligned_vectors[s1] # Diagonal jaccard_mat[i, i] = 1.0 @@ -120,16 +127,14 @@ def morisita_horn_index(counts1, counts2): for j in range(i + 1, n): s2 = samples[j] - j_val = jaccard_index(set1, junction_sets[s2]) s_val = sorensen_index(set1, junction_sets[s2]) - m_val = morisita_horn_index(counts1, count_vectors[s2]) + m_val = morisita_horn_index(counts1, aligned_vectors[s2]) jaccard_mat[i, j] = jaccard_mat[j, i] = j_val sorensen_mat[i, j] = sorensen_mat[j, i] = s_val morisita_mat[i, j] = morisita_mat[j, i] = m_val - # ------------------------- # Write outputs # ------------------------- diff --git a/bin/create_pheno_samplesheet.py b/bin/create_pheno_samplesheet.py new file mode 100755 index 0000000..64832ac --- /dev/null +++ b/bin/create_pheno_samplesheet.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +import sys 
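+# Expected invocation (mirrors the GENERATE_PHENO_SAMPLESHEET process script;
+# the file names below are the ones that workflow passes in):
+#   create_pheno_samplesheet.py phenotype_meta.json phenotype_samplesheet.csv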
+import json
+from pathlib import Path
+
+# This script takes two positional arguments:
+#   1. the input JSON file (a list of per-phenotype meta maps)
+#   2. the output CSV filename
+meta_json_file = sys.argv[1]
+output_csv = sys.argv[2]
+
+# Load the list from the JSON file
+with open(meta_json_file, 'r') as f_in:
+    # This will be a list of meta maps, each carrying its own file path, e.g.,
+    # [ {'sample': 'Patient01_Base_CD4', ..., 'file': 'file1.tsv'},
+    #   {'sample': 'Patient01_Base_CD8', ..., 'file': 'file2.tsv'} ]
+    all_new_meta = json.load(f_in)
+
+with open(output_csv, 'w') as f_out:
+
+    # Write the new header
+    f_out.write('sample,subject_id,timepoint,origin,phenotype,file\n')
+
+    # Iterate over each item in the list
+    for item in all_new_meta:
+        meta = item  # each item is a meta map
+        file_path = str(meta['file'])  # the file path travels inside the meta map
+
+        f_out.write(
+            f"{meta['sample']},"
+            f"{meta['subject_id']},"
+            f"{meta['timepoint']},"
+            f"{meta['origin']},"
+            f"{meta['phenotype']},"
+            f"{file_path}\n"
+        )
diff --git a/bin/pseudobulk.py b/bin/pseudobulk.py
index e0dc7aa..15cb30e 100755
--- a/bin/pseudobulk.py
+++ b/bin/pseudobulk.py
@@ -141,6 +141,120 @@ def pseudobulk(input_df, basename, airr_schema):
     bulk_df.to_csv(output_path, sep='\t', index=False)
     logger.info(f"Pseudobulked AIRR file written to: {output_path}")
 
+
+def add_phenotype_info(input_df, basename, sobject_gex):
+    import numpy as np
+    import pandas as pd
+    from rpy2.robjects import r
+    from rpy2.robjects import pandas2ri
+    import rpy2.robjects as ro
+    from rpy2.robjects.conversion import localconverter
+
+    # Read the Seurat object
+    readRDS = r['readRDS']
+    seurat_obj = readRDS(sobject_gex)
+    # Extract and convert the cell metadata into a pandas DataFrame
+    with localconverter(ro.default_converter + pandas2ri.converter):
+        cell_metadata_pd = ro.conversion.rpy2py(seurat_obj.slots['meta.data'])
+
+    cell_metadata_pd["cell_id"] = cell_metadata_pd["library"] + "_" + cell_metadata_pd["cell_id"]
+    cell_metadata_pd = cell_metadata_pd.loc[:, ["cell_id", "phenotype"]]
+    cell_metadata_pd['phenotype'] = cell_metadata_pd['phenotype'].str.replace(' ', '_')
+
+    # ------ Prefix TCR barcodes with the sample basename so they match the GEX metadata ------ #
+    input_df["cell_id"] = basename + "_" + input_df["cell_id"]
+
+    # ------ Merge phenotype labels onto the TCR table ------ #
+    merged_df = pd.merge(input_df, cell_metadata_pd, on="cell_id", how="left")
+    return merged_df
+
+def pseudobulk_phenotype(input_df, basename, airr_schema):
+    # Keep only TRB rearrangements (both V and J calls must be TRB)
+    df = input_df
+    df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")]
+
+    # Define required aggregations
+    agg_columns = {
+        "cell_id": pd.Series.nunique,
+        "junction": lambda x: assert_single_unique(x, "junction"),
+        "junction_aa": lambda x: assert_single_unique(x, "junction_aa"),
+        "v_call": lambda x: assert_single_unique(x, "v_call"),
+        "j_call": lambda x: assert_single_unique(x, "j_call"),
+        "consensus_count": "sum",
+        "duplicate_count": "sum"
+    }
+
+    # Identify columns not already aggregated
+    excluded_cols = set(agg_columns) | {"sequence", "phenotype"}
+    remaining_cols = [col for col in df_trb.columns if col not in excluded_cols]
+
+    # Group by sequence and phenotype
+    grouped = df_trb.groupby(["sequence", "phenotype"])
+
+    # Determine which of the remaining columns have consistent values
+    consensus_cols = []
+    for col in remaining_cols:
+        if (grouped[col].nunique(dropna=True) <= 1).all():
+            consensus_cols.append(col)
+
+    # Add those columns to the aggregation with the unique value
+    for col in consensus_cols:
+        agg_columns[col] = lambda x, col=col: assert_single_unique(x, col)
+
+    # Aggregate
+    bulk_df = grouped.agg(agg_columns).reset_index()
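+    # bulk_df now holds one row per (sequence, phenotype) pair: cell_id carries
+    # the number of unique cells per clone (renamed to cell_count below) and the
+    # count columns carry per-clone sums. Illustrative row (hypothetical values):
+    #   sequence=GACTC..., phenotype=CD8_Tem, cell_id=12, duplicate_count=340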
+
+    # Rename and reorder columns
+    bulk_df.rename(columns={"cell_id": "cell_count"}, inplace=True)
+    bulk_df["sequence_id"] = [f"{basename}|sequence{i+1}|{bulk_df['phenotype'].iloc[i]}" for i in range(len(bulk_df))]
+    total_duplicate = bulk_df['duplicate_count'].sum()
+    bulk_df['duplicate_frequency_percent'] = bulk_df['duplicate_count'] / total_duplicate * 100
+
+    # Load schema
+    schema = airr_schema
+
+    required = schema["Rearrangement"]["required"]
+    properties = list(schema["Rearrangement"]["properties"].keys())
+
+    # Add missing required fields as blank
+    for col in required:
+        if col not in bulk_df.columns:
+            bulk_df[col] = pd.NA
+
+    # Normalize gene calls to AIRR-compliant names where those columns exist
+    for col in ['v_call', 'd_call', 'j_call', 'c_call']:
+        if col in bulk_df.columns:
+            bulk_df[col] = bulk_df[col].apply(airr_compliant_gene)
+
+    # Identify all boolean columns from schema
+    bool_cols = [
+        name for name, spec in schema["Rearrangement"]["properties"].items()
+        if spec.get("type") == "boolean"
+    ]
+
+    bool_cols = bool_cols + ['is_cell']
+    for col in bool_cols:
+        if col in bulk_df.columns:
+            bulk_df[col] = bulk_df[col].apply(normalize_bool)
+
+    # Order columns: schema-defined columns first, then any extras
+    ordered_cols = [col for col in properties if col in bulk_df.columns]
+    extra_cols = [col for col in bulk_df.columns if col not in ordered_cols]
+    bulk_df = bulk_df[ordered_cols + extra_cols]
+
+    # Iterate through each unique phenotype and save it to a separate file
+    for phenotype in bulk_df['phenotype'].unique():
+        # Filter the dataframe for the current phenotype
+        phenotype_df = bulk_df[bulk_df['phenotype'] == phenotype]
+
+        # Define the output path based on basename and phenotype
+        output_path = f"{basename}_{phenotype}_pseudobulk_phenotype.tsv"
+
+        # Save the filtered dataframe to a TSV file
+        phenotype_df.to_csv(output_path, sep='\t', index=False)
+        logger.info(f"Pseudobulked AIRR file for phenotype '{phenotype}' written to: {output_path}")
+
+
 if __name__ == "__main__":
     # Parse input arguments
     parser = argparse.ArgumentParser(description="Take positional args")
@@ -163,6 +277,18 @@ def pseudobulk(input_df, basename, airr_schema):
         help="Path to airr json schema for validating adaptive conversion.",
     )
 
+    parser.add_argument(
+        "--phenotype",
+        action="store_true",
+        help="Whether pseudobulking should be split by single-cell phenotype; requires --sobject_gex.",
+    )
+
+    parser.add_argument(
+        "--sobject_gex",
+        type=str,
+        help="Path to a Seurat object (.rds); 'phenotype' must be a column in its meta.data slot.",
+    )
+
     add_logging_args(parser)
 
     args = parser.parse_args()
@@ -181,5 +307,10 @@ def pseudobulk(input_df, basename, airr_schema):
     with open(args.airr_schema) as f:
         airr_schema = json.load(f)
     basename = args.basename
-
-    pseudobulk(input_df, basename, airr_schema)
\ No newline at end of file
+
+    # Pseudobulk by phenotype (if requested)
+    if args.phenotype:
+        input_df = add_phenotype_info(input_df, basename, args.sobject_gex)  # Attach single-cell phenotype labels from the Seurat object.
+        pseudobulk_phenotype(input_df, basename, airr_schema)
+    else:
+        pseudobulk(input_df, basename, airr_schema)
\ No newline at end of file
diff --git a/conf/modules.config b/conf/modules.config
index 3e891f7..03d0b3d 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -35,6 +35,22 @@ process {
         ]
     }
 
+    withName: /SAMPLE_.*_PHENO/ {
+        publishDir = [
+            path: { "${params.outdir}/sample_phenotype" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ?
null : filename } + ] + } + + withName: /COMPARE_.*_PHENO/ { + publishDir = [ + path: { "${params.outdir}/compare_phenotype" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: TCRDIST3_MATRIX { label = params.matrix_sparsity == 'sparse' ? 'process_medium' : ['process_high', 'process_high_memory'] } diff --git a/modules/local/airr_convert/pseudobulk_cellranger.nf b/modules/local/airr_convert/pseudobulk_cellranger.nf index 653b2a9..ec1916c 100644 --- a/modules/local/airr_convert/pseudobulk_cellranger.nf +++ b/modules/local/airr_convert/pseudobulk_cellranger.nf @@ -1,6 +1,7 @@ process PSEUDOBULK_CELLRANGER { tag "${sample_meta.sample}" label 'process_low' + container "ghcr.io/karchinlab/tcrtoolkit:main" input: tuple val(sample_meta), path(count_table) diff --git a/modules/local/airr_convert/pseudobulk_phenotype_cellranger.nf b/modules/local/airr_convert/pseudobulk_phenotype_cellranger.nf new file mode 100644 index 0000000..3866e9f --- /dev/null +++ b/modules/local/airr_convert/pseudobulk_phenotype_cellranger.nf @@ -0,0 +1,25 @@ +process PSEUDOBULK_CELLRANGER_PHENOTYPE { + tag "${sample_meta.sample}" + label 'process_low' + container "ghcr.io/karchinlab/tcrtoolkit:main" + + input: + tuple val(sample_meta), path(count_table) + path airr_schema + path sobject_gex + + output: + tuple val(sample_meta), path("${sample_meta.sample}_*_pseudobulk_phenotype.tsv"), emit: "cellranger_pseudobulk_phenotype" + + script: + // Dynamically add phenotype arguments only if sobject_gex is provided (i.e., not null) + def phenotype_args = sobject_gex ? "--phenotype --sobject_gex ${sobject_gex}" : "" + + """ + pseudobulk.py \\ + ${count_table} \\ + ${sample_meta.sample} \\ + ${airr_schema} \\ + ${phenotype_args} + """ +} \ No newline at end of file diff --git a/modules/local/samplesheet/generate_pheno_samplesheet.nf b/modules/local/samplesheet/generate_pheno_samplesheet.nf new file mode 100644 index 0000000..f8b4587 --- /dev/null +++ b/modules/local/samplesheet/generate_pheno_samplesheet.nf @@ -0,0 +1,20 @@ +process GENERATE_PHENO_SAMPLESHEET { + tag "Generating phenotype samplesheet" + label 'process_single' + // container "ghcr.io/karchinlab/tcrtoolkit:main" + + input: + path original_samplesheet + path meta_json + path py_script // NOTE: Remove when in container + + output: + path "phenotype_samplesheet.csv", emit: samplesheet + + script: + def out_file = "phenotype_samplesheet.csv" // Define the output file variable + + """ + ${py_script} ${meta_json} ${out_file} + """ +} \ No newline at end of file diff --git a/subworkflows/local/airr_convert.nf b/subworkflows/local/airr_convert.nf index c625557..eb5395f 100644 --- a/subworkflows/local/airr_convert.nf +++ b/subworkflows/local/airr_convert.nf @@ -7,6 +7,7 @@ include { CONVERT_ADAPTIVE } from '../../modules/local/airr_convert/convert_adaptive' include { PSEUDOBULK_CELLRANGER } from '../../modules/local/airr_convert/pseudobulk_cellranger' +include { PSEUDOBULK_CELLRANGER_PHENOTYPE } from '../../modules/local/airr_convert/pseudobulk_phenotype_cellranger' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -18,8 +19,11 @@ workflow AIRR_CONVERT { take: sample_map input_format + sobject_gex main: + ch_pseudobulk_phenotype = Channel.empty() // Initialize empty channel for phenotype files + if (input_format == 'adaptive') { CONVERT_ADAPTIVE( sample_map, @@ -35,8 +39,20 @@ workflow AIRR_CONVERT { ) .cellranger_pseudobulk .set { sample_map_converted 
}
+
+        // Pseudobulk by phenotype
+        if (sobject_gex) {
+            PSEUDOBULK_CELLRANGER_PHENOTYPE(
+                sample_map,
+                params.airr_schema,
+                file(sobject_gex) // file() is redundant when sobject_gex already arrives as a Path, but harmless
+            )
+            // Capture phenotype outputs
+            ch_pseudobulk_phenotype = PSEUDOBULK_CELLRANGER_PHENOTYPE.out.cellranger_pseudobulk_phenotype
+        }
     }
 
     emit:
     sample_map_converted
+    pseudobulk_phenotype_files = ch_pseudobulk_phenotype
 }
\ No newline at end of file
diff --git a/subworkflows/local/compare.nf b/subworkflows/local/compare.nf
index 2924a72..c7831a6 100644
--- a/subworkflows/local/compare.nf
+++ b/subworkflows/local/compare.nf
@@ -27,9 +27,25 @@ workflow COMPARE {
     all_sample_files
 
     main:
+
     COMPARE_CALC( samplesheet_resolved,
                   all_sample_files )
 
+    COMPARE_CALC.out.jaccard_mat
+        .collectFile(name: 'jaccard_mat.csv', sort: true,
+                     storeDir: "${params.outdir}/compare")
+        .set { jaccard_mat }
+
+    COMPARE_CALC.out.sorensen_mat
+        .collectFile(name: 'sorensen_mat.csv', sort: true,
+                     storeDir: "${params.outdir}/compare")
+        .set { sorensen_mat }
+
+    COMPARE_CALC.out.morisita_mat
+        .collectFile(name: 'morisita_mat.csv', sort: true,
+                     storeDir: "${params.outdir}/compare")
+        .set { morisita_mat }
+
     COMPARE_PLOT( samplesheet_resolved,
                   COMPARE_CALC.out.jaccard_mat,
                   COMPARE_CALC.out.sorensen_mat,
@@ -65,7 +81,4 @@
                   TCRSHARING_CALC.out.shared_cdr3
     )
 
-    // emit:
-    //    compare_stats_html
-    //    versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
 }
\ No newline at end of file
diff --git a/subworkflows/local/compare_pheno.nf b/subworkflows/local/compare_pheno.nf
new file mode 100644
index 0000000..6294ac8
--- /dev/null
+++ b/subworkflows/local/compare_pheno.nf
@@ -0,0 +1,46 @@
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    IMPORT LOCAL MODULES
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+include { COMPARE_CALC as COMPARE_CALC_PHENO } from '../../modules/local/compare/compare_calc'
+// include { COMPARE_PLOT } from '../../modules/local/compare/compare_plot'
+include { COMPARE_CONCATENATE as COMPARE_CONCATENATE_PHENO } from '../../modules/local/compare/compare_concatenate'
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    RUN MAIN SUBWORKFLOW
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+workflow COMPARE_PHENO {
+
+    // println("Welcome to the BULK TCRSEQ pipeline!
-- COMPARE_PHENO ")
+
+    take:
+    samplesheet_resolved
+    all_sample_files
+
+    main:
+
+    COMPARE_CALC_PHENO( samplesheet_resolved,
+                        all_sample_files )
+
+    COMPARE_CONCATENATE_PHENO( samplesheet_resolved,
+                               all_sample_files )
+
+
+    /////// =================== COMPARE NOTEBOOK =================== ///////
+
+    // COMPARE_PLOT( samplesheet_resolved,
+    //               COMPARE_CALC_PHENO.out.jaccard_mat,
+    //               COMPARE_CALC_PHENO.out.sorensen_mat,
+    //               COMPARE_CALC_PHENO.out.morisita_mat,
+    //               file(params.compare_stats_template),
+    //               params.project_name,
+    //               all_sample_files
+    //             )
+
+}
\ No newline at end of file
diff --git a/subworkflows/local/map_phenotypes.nf b/subworkflows/local/map_phenotypes.nf
new file mode 100644
index 0000000..e0d2966
--- /dev/null
+++ b/subworkflows/local/map_phenotypes.nf
@@ -0,0 +1,68 @@
+//
+// Generate phenotype-specific samplesheet and transform phenotype files
+//
+
+include { GENERATE_PHENO_SAMPLESHEET } from '../../modules/local/samplesheet/generate_pheno_samplesheet.nf'
+import groovy.json.JsonOutput
+
+workflow MAP_PHENOTYPES {
+
+    take:
+    ch_phenotype_files_raw
+    original_samplesheet
+
+    main:
+
+    // --- 1. Define transformation logic ---
+    ch_phenotype_files_transformed = ch_phenotype_files_raw
+        .transpose()
+        .map { meta, file ->
+            def original_sample_id = meta.sample
+            // Derive the phenotype by stripping the "<sample>_" prefix and the
+            // "_pseudobulk_phenotype.tsv" suffix from the file name
+            def phenotype = file.name
+                .replaceFirst("^${original_sample_id}_", "")
+                .replaceFirst("_pseudobulk_phenotype.tsv\$", "")
+
+            def new_sample_id = original_sample_id + '_' + phenotype
+
+            def new_meta = meta.clone()
+            new_meta.sample = new_sample_id
+            new_meta.phenotype = phenotype
+            new_meta.original_sample = original_sample_id
+
+            new_meta.file = file.toString()
+            return [ new_meta, file ]
+
+        }
+
+    // --- 2. Prepare inputs for samplesheet process ---
+    ch_meta_json = ch_phenotype_files_transformed
+        .map { meta, file -> meta }
+        .collect() // Collect channel into a list
+        .map { collected_list ->
+            def json_string = JsonOutput.toJson(collected_list)
+            def json_file = file("phenotype_meta.json") // Write to local temp file
+            json_file.write(json_string)
+            return json_file
+        }
+
+    // original_samplesheet_ = file(original_samplesheet)
+    ch_script = file("$baseDir/bin/create_pheno_samplesheet.py")
+
+    // --- 3.
Call GENERATE_PHENO_SAMPLESHEET ---
+    GENERATE_PHENO_SAMPLESHEET(
+        original_samplesheet,
+        ch_meta_json,
+        ch_script
+    )
+
+    // View the output (for debugging)
+    GENERATE_PHENO_SAMPLESHEET.out.samplesheet.view { file ->
+        "--- START: Output of phenotype_samplesheet.csv ---\n" +
+        file.text +
+        "--- END: Output of phenotype_samplesheet.csv ---"
+    }
+
+    emit:
+    files_transformed = ch_phenotype_files_transformed
+    samplesheet_pheno = GENERATE_PHENO_SAMPLESHEET.out.samplesheet
+}
\ No newline at end of file
diff --git a/subworkflows/local/resolve_samplesheet_pheno.nf b/subworkflows/local/resolve_samplesheet_pheno.nf
new file mode 100644
index 0000000..c8be87a
--- /dev/null
+++ b/subworkflows/local/resolve_samplesheet_pheno.nf
@@ -0,0 +1,18 @@
+//
+// Collect the per-phenotype sample files into a single list
+//
+
+workflow RESOLVE_SAMPLESHEET_PHENO {
+    take:
+    sample_map_final
+
+    main:
+    // Gather every phenotype-level file into one list
+    sample_map_final
+        .map { _meta, f -> f }
+        .collect()
+        .set { all_sample_files }
+
+    emit:
+    all_sample_files // file list consumed by the phenotype comparison tasks
+}
diff --git a/subworkflows/local/sample.nf b/subworkflows/local/sample.nf
index c63bbe6..a919617 100644
--- a/subworkflows/local/sample.nf
+++ b/subworkflows/local/sample.nf
@@ -127,9 +127,4 @@ workflow SAMPLE {
     VDJDB_VDJMATCH (sample_map,
                     VDJDB_GET.out.ref_db)
 
-    // emit:
-    //    sample_stats_csv
-    //    v_family_csv
-    //    sample_meta_csv
-    //    versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
 }
\ No newline at end of file
diff --git a/subworkflows/local/sample_pheno.nf b/subworkflows/local/sample_pheno.nf
new file mode 100644
index 0000000..a7c737b
--- /dev/null
+++ b/subworkflows/local/sample_pheno.nf
@@ -0,0 +1,53 @@
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    IMPORT LOCAL MODULES
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+include { SAMPLE_CALC as SAMPLE_CALC_PHENO } from '../../modules/local/sample/sample_calc'
+include { SAMPLE_AGGREGATE as SAMPLE_AGG_STAT_PHENO } from '../../modules/local/sample/sample_aggregate'
+include { SAMPLE_AGGREGATE as SAMPLE_AGG_V_PHENO } from '../../modules/local/sample/sample_aggregate'
+include { SAMPLE_AGGREGATE as SAMPLE_AGG_D_PHENO } from '../../modules/local/sample/sample_aggregate'
+include { SAMPLE_AGGREGATE as SAMPLE_AGG_J_PHENO } from '../../modules/local/sample/sample_aggregate'
+// include { SAMPLE_PLOT } from '../../modules/local/sample/sample_plot'
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    RUN MAIN SUBWORKFLOW
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+workflow SAMPLE_PHENO {
+
+    take:
+    sample_map
+    pheno_samplesheet
+
+    main:
+
+    /////// =================== CALC SAMPLE =================== ///////
+
+    SAMPLE_CALC_PHENO( sample_map )
+
+    SAMPLE_CALC_PHENO.out.sample_csv.collect().set { sample_csv_files }
+    SAMPLE_CALC_PHENO.out.v_family_csv.collect().set { v_family_csv_files }
+    SAMPLE_CALC_PHENO.out.d_family_csv.collect().set { d_family_csv_files }
+    SAMPLE_CALC_PHENO.out.j_family_csv.collect().set { j_family_csv_files }
+
+    SAMPLE_AGG_STAT_PHENO(sample_csv_files, "sample_stats.csv")
+    SAMPLE_AGG_V_PHENO(v_family_csv_files, "v_family.csv")
+    SAMPLE_AGG_D_PHENO(d_family_csv_files, "d_family.csv")
+    SAMPLE_AGG_J_PHENO(j_family_csv_files, "j_family.csv")
+
+
+    /////// =================== SAMPLE
NOTEBOOK =================== ///////
+
+    // SAMPLE_PLOT (
+    //     pheno_samplesheet,
+    //     file(params.sample_stats_template),
+    //     sample_stats_csv,
+    //     v_family_csv
+    // )
+
+}
\ No newline at end of file
diff --git a/workflows/tcrtoolkit.nf b/workflows/tcrtoolkit.nf
index 5329576..257e389 100644
--- a/workflows/tcrtoolkit.nf
+++ b/workflows/tcrtoolkit.nf
@@ -16,6 +16,11 @@
 include { SAMPLE } from '../subworkflows/local/sample'
 include { COMPARE } from '../subworkflows/local/compare'
 include { VALIDATE_PARAMS } from '../subworkflows/local/validate_params'
+include { MAP_PHENOTYPES } from '../subworkflows/local/map_phenotypes'
+include { RESOLVE_SAMPLESHEET_PHENO } from '../subworkflows/local/resolve_samplesheet_pheno'
+include { SAMPLE_PHENO } from '../subworkflows/local/sample_pheno'
+include { COMPARE_PHENO } from '../subworkflows/local/compare_pheno'
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
@@ -42,19 +47,38 @@ workflow TCRTOOLKIT {
 
     // Checking input tables
     INPUT_CHECK( file(params.samplesheet) )
+    ch_samplesheet_utf8 = INPUT_CHECK.out.samplesheet_utf8
+
+    // Initialize empty channels
+    ch_phenotype_files_transformed = Channel.empty()
+    ch_phenotype_samplesheet = Channel.empty()
 
     if (input_format in ['adaptive', 'cellranger']) {
+        def sobject_gex_file = params.sobject_gex ? file(params.sobject_gex) : []
+
         AIRR_CONVERT(
             INPUT_CHECK.out.sample_map,
-            input_format
+            input_format, sobject_gex_file
         )
         .sample_map_converted
         .set { sample_map_final }
+
+        if (params.sobject_gex) {
+            MAP_PHENOTYPES(
+                AIRR_CONVERT.out.pseudobulk_phenotype_files,
+                ch_samplesheet_utf8
+            )
+            ch_phenotype_files_transformed = MAP_PHENOTYPES.out.files_transformed
+            ch_phenotype_samplesheet = MAP_PHENOTYPES.out.samplesheet_pheno
+        }
+
     } else {
         INPUT_CHECK.out.sample_map
             .set { sample_map_final }
     }
 
-    RESOLVE_SAMPLESHEET( INPUT_CHECK.out.samplesheet_utf8,
+    // --- Main Analysis ---
+
+    RESOLVE_SAMPLESHEET( ch_samplesheet_utf8,
                          sample_map_final )
 
     // Running sample level analysis
@@ -67,6 +91,27 @@
         COMPARE( RESOLVE_SAMPLESHEET.out.samplesheet_resolved,
                  RESOLVE_SAMPLESHEET.out.all_sample_files)
     }
+
+    // --- Phenotype Analysis ---
+
+    if (params.sobject_gex) {
+
+        RESOLVE_SAMPLESHEET_PHENO(
+            ch_phenotype_files_transformed
+        )
+
+        if (levels.contains('sample')) {
+            SAMPLE_PHENO( ch_phenotype_files_transformed, ch_phenotype_samplesheet )
+        }
+
+        if (levels.contains('compare')) {
+            COMPARE_PHENO(
+                ch_phenotype_samplesheet,
+                RESOLVE_SAMPLESHEET_PHENO.out.all_sample_files
+            )
+        }
+    }
+
 }
 
 /*