25 changes: 24 additions & 1 deletion .cirro/process-form.json
@@ -50,8 +50,31 @@
"description": "p_depth (GLIPH2)",
"title": "p_depth",
"type": "string"
},
"samplechart_x_col": {
"default": "timepoint",
"description": "Metadata column for x axis of sample-level plots for Sample workflow notebook",
"title": "Sample plot X axis column",
"type": "string"
},
"samplechart_color_col": {
"default": "origin",
"description": "Metadata column for legend color of sample-level plots for Sample workflow notebook",
"title": "Sample plot X axis column",
"type": "string"
},
"vgene_subject_col": {
"default": "subject_id",
"description": "Metadata column for grouping of samples for V gene plots for Sample workflow notebook",
"title": "V gene plot subject column",
"type": "string"
},
"vgene_x_cols": {
"default": "timepoint,origin",
"description": "Comma-separated list of metadata columns for x axis of V gene plot (eg. timepoint,origin or timepoint)",
"title": "V gene plot X axis columns",
"type": "string"
}

}
},
"ui": {}
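`vgene_x_cols` travels through the form as a single comma-separated string; whatever consumes it must split on commas. A minimal sketch of that parsing, using the default value above (the split itself is an assumed downstream detail, not code from this PR):

```python
# Minimal sketch: turning the comma-separated vgene_x_cols form value
# into a list of metadata column names (assumed downstream behavior).
vgene_x_cols = "timepoint,origin"  # default from the form field above

x_cols = [col.strip() for col in vgene_x_cols.split(",") if col.strip()]
print(x_cols)  # ['timepoint', 'origin']
```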
6 changes: 5 additions & 1 deletion .cirro/process-input.json
@@ -9,5 +9,9 @@
"local_min_OVE": "$.params.dataset.paramJson.local_min_OVE",
"local_min_pvalue": "$.params.dataset.paramJson.local_min_pvalue",
"outdir": "$.params.dataset.s3|/data/",
"p_depth": "$.params.dataset.paramJson.p_depth"
"p_depth": "$.params.dataset.paramJson.p_depth",
"samplechart_x_col": "$.params.dataset.paramJson.samplechart_x_col",
"samplechart_color_col": "$.params.dataset.paramJson.samplechart_color_col",
"vgene_subject_col": "$.params.dataset.paramJson.vgene_subject_col",
"vgene_x_cols": "$.params.dataset.paramJson.vgene_x_cols"
}
12 changes: 3 additions & 9 deletions bin/compare_concatenate.py
@@ -29,18 +29,12 @@ def main():
# Read the TSV file into a dataframe
file_path = str(row['file'])
df = pd.read_csv(file_path, sep="\t", header=0)

# Get metadata
subject_id = row['subject_id']
timepoint = row['timepoint']
origin = row['origin']


# Add patient column
df['patient'] = f"{subject_id}:{timepoint}_{origin}"
df['sample'] = row['sample']

# Select relevant columns
df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'patient', 'sample']]
df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'sample']]
dfs.append(df)


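With the `patient` column gone, the loop body in this script reduces to read, tag, subset, append. A toy version of the same pattern, assuming a samplesheet with `file` and `sample` columns as the script does:

```python
import pandas as pd

# Toy samplesheet standing in for the rows the script iterates over
samplesheet = pd.DataFrame({
    "file": ["s01.tsv", "s02.tsv"],
    "sample": ["S01", "S02"],
})

dfs = []
for _, row in samplesheet.iterrows():
    df = pd.read_csv(str(row["file"]), sep="\t", header=0)
    df["sample"] = row["sample"]  # tag each clonotype with its sample
    df = df[["junction_aa", "v_call", "j_call", "duplicate_count", "sample"]]
    dfs.append(df)

combined = pd.concat(dfs, ignore_index=True)
```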
54 changes: 15 additions & 39 deletions bin/sample_calc.py
@@ -22,18 +22,7 @@ def extract_trb_family(allele):
match = re.match(r'(TRB[V|D|J])(\d+)', allele)
return f"{match.group(1)}{match.group(2)}" if match else None

def compute_gene_family_table(counts, col_name, all_families, sample_meta):
fam_col = f"{col_name}FamilyName"
counts[fam_col] = counts[col_name].apply(extract_trb_family)
fam_df = counts[fam_col].value_counts(dropna=False).to_frame().T.sort_index(axis=1)
fam_df = fam_df.reindex(columns=all_families, fill_value=0)

for col in ['origin', 'timepoint', 'subject_id']:
fam_df.insert(0, col, sample_meta[col])

return fam_df

def calc_gene_family(counts, gene_column, family_prefix, max_index, output_file, meta_df):
def calc_gene_family(sample_name, counts, gene_column, family_prefix, max_index, output_file):
# Build list of all possible family names
all_fams = [f'{family_prefix}{i}' for i in range(1, max_index + 1)]

@@ -43,12 +32,12 @@ def calc_gene_family(counts, gene_column, family_prefix, max_index, output_file,
# Reindex to include all families
fam_df = pd.DataFrame([fam_df.reindex(columns=all_fams, fill_value=0).iloc[0]]).reset_index(drop=True)

# Add metadata columns
fam_df = pd.concat([meta_df, fam_df], axis=1)
# Add sample column
fam_df.insert(0, 'sample', sample_name)

fam_df.to_csv(output_file, header=True, index=False)

def calc_sample_stats(meta_df, counts, output_file):
def calc_sample_stats(sample_name, counts, output_file):
"""Calculate sample level statistics of TCR repertoire."""

## first pass stats
@@ -105,8 +94,8 @@ def calc_sample_stats(meta_df, counts, output_file):
# Convert to single-row dataframe
df_stats = pd.DataFrame([row_data])

# Add metadata columns
df_stats = pd.concat([meta_df, df_stats], axis=1)
# Add sample column
df_stats.insert(0, 'sample', sample_name)

# Save to CSV
df_stats.to_csv(output_file, header=True, index=False)
@@ -117,40 +106,27 @@ def main():
parser = argparse.ArgumentParser(description='Calculate clonality of a TCR repertoire')

# add arguments
parser.add_argument('-s', '--sample_meta',
metavar='sample_meta',
parser.add_argument('-s', '--sample_name',
metavar='sample_name',
type=str,
help='sample metadata passed in as json format')
help='sample name')
parser.add_argument('-c', '--count_table',
metavar='count_table',
type=argparse.FileType('r'),
help='counts file in TSV format')

args = parser.parse_args()

## convert metadata to list
sample_meta = json.loads(args.sample_meta)
sample = args.sample_name

# Read in the counts file
counts = pd.read_csv(args.count_table, sep='\t', header=0)

# Build metadata row from selected keys
meta_keys = ['subject_id', 'timepoint', 'origin']
meta_row = {k: sample_meta[k] for k in meta_keys}
meta_df = pd.DataFrame([meta_row])

sample = sample_meta['sample']

calc_gene_family(counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv', meta_df)
calc_gene_family(counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv', meta_df)
calc_gene_family(counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv', meta_df)
counts = pd.read_csv(args.count_table, sep='\t')

# Build metadata row from selected keys
meta_keys = ['sample', 'subject_id', 'timepoint', 'origin']
meta_row = {k: sample_meta[k] for k in meta_keys}
meta_df = pd.DataFrame([meta_row])
calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv')
calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv')
calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv')

calc_sample_stats(meta_df, counts, f'stats/sample_stats_{sample}.csv')
calc_sample_stats(sample, counts, f'stats/sample_stats_{sample}.csv')

if __name__ == "__main__":
main()
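The refactor replaces the JSON metadata frame with a single `sample` column prepended to each output. A condensed toy run of the family-counting logic, following the script's regex and reindex pattern (written here as `[VDJ]`; the pipe inside the script's `[V|D|J]` class is redundant):

```python
import re
import pandas as pd

def extract_trb_family(allele):
    # Keep the TRBV/TRBD/TRBJ prefix plus family number, e.g. TRBV5-1*01 -> TRBV5
    match = re.match(r"(TRB[VDJ])(\d+)", allele)
    return f"{match.group(1)}{match.group(2)}" if match else None

counts = pd.DataFrame({"v_call": ["TRBV5-1*01", "TRBV5-4*01", "TRBV20-1*01"]})
fam = counts["v_call"].apply(extract_trb_family).value_counts().to_frame().T
all_fams = [f"TRBV{i}" for i in range(1, 31)]  # TRBV1..TRBV30, as in the script
fam = fam.reindex(columns=all_fams, fill_value=0).reset_index(drop=True)
fam.insert(0, "sample", "S01")  # sample name becomes the first column
print(fam[["sample", "TRBV5", "TRBV20"]])  # counts: TRBV5 = 2, TRBV20 = 1
```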
38 changes: 14 additions & 24 deletions modules/local/compare/giana.nf
@@ -28,30 +28,20 @@ process GIANA_CALC {
> giana.log 2>&1

# Insert header after GIANA comments
python3 - <<EOF
input_file = "giana_RotationEncodingBL62.txt"
concat_header_file = "${concat_cdr3}"

with open(concat_header_file, 'r', encoding='utf-8') as f:
header = f.readline().strip().split('\\t')
header.insert(1, "cluster")
header_line = '\\t'.join(header)

with open(input_file, 'r', encoding='utf-8') as infile:
lines = infile.readlines()

with open(input_file, 'w', encoding='utf-8') as outfile:
inserted = False
for line in lines:
if line.startswith("##"):
outfile.write(line)
elif not inserted:
outfile.write(header_line + '\\n')
outfile.write(line)
inserted = True
else:
outfile.write(line)
EOF
insert=\$(head -n 1 "${concat_cdr3}")
insert=\$(echo "\$insert" | awk -F'\t' 'BEGIN{OFS="\t"} {
out = \$1 OFS "cluster"
for (i=2; i<=NF; i++) {
out = out OFS \$i
}
print out
}')

awk -v insert="\$insert" '
/^##/ { print; next }
!inserted { print insert; inserted=1 }
{ print }
' giana_RotationEncodingBL62.txt > tmp && mv tmp giana_RotationEncodingBL62.txt

mv giana_RotationEncodingBL62.txt_EncodingMatrix.txt giana_EncodingMatrix.txt
"""
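To make the awk block concrete: it lifts the header from the concatenated CDR3 table, splices a `cluster` field in at position two, and emits the result after GIANA's leading `##` comment lines. A toy walk-through of that transformation in Python, with made-up contents:

```python
# Toy demonstration of the header-insertion step (made-up file contents).
giana_lines = [
    "## GIANA output comment",   # leading comment block stays on top
    "CASSLGQ\t3\tTRBV5\tS01",    # first data line
    "CASSQDR\t3\tTRBV20\tS02",
]
header = ["CDR3b", "TRBV", "sample"]  # first line of concat_cdr3 (toy)
header.insert(1, "cluster")           # cluster becomes field two

out, inserted = [], False
for line in giana_lines:
    if line.startswith("##"):
        out.append(line)
    elif not inserted:
        out.append("\t".join(header))  # header lands after the comments
        inserted = True
        out.append(line)
    else:
        out.append(line)
# out == ["## GIANA output comment", "CDR3b\tcluster\tTRBV\tsample", <data lines>]
```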
13 changes: 5 additions & 8 deletions modules/local/compare/gliph2.nf
@@ -18,15 +18,14 @@ process GLIPH2_TURBOGLIPH {

script:
"""
# R script starts here
cat > run_gliph2.R <<EOF
Rscript - <<EOF
#!/usr/bin/env Rscript

library(turboGliph)

# During testing, including the TRBJ column caused issues in the clustering step. It is removed here and reinserted afterwards.
df <- read.csv("$concat_cdr3", sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)
# df2 <- subset(df, select = c('CDR3b', 'TRBV', 'patient', 'counts'))
df[,'patient'] <- df[,'sample']

result <- turboGliph::gliph2(
cdr3_sequences = df,
@@ -40,14 +39,12 @@ process GLIPH2_TURBOGLIPH {
)

df3 <- read.csv('cluster_member_details.txt', sep = '\t', stringsAsFactors = FALSE, check.names = FALSE)
df3 <- merge(df3, df[, c("CDR3b", "TRBV", "patient", "TRBJ", 'counts')], by = c("CDR3b", "TRBV", "patient", 'counts'), all.x = TRUE)
df3[,'sample'] <- df3[,'patient']
df3 <- merge(df3, df[, c("CDR3b", "TRBV", "sample", 'counts')], by = c("CDR3b", "TRBV", "sample", 'counts'), all.x = TRUE)
df3 <- df3[, c('CDR3b', 'TRBV', 'TRBJ', 'counts', 'sample', 'tag', 'seq_ID', 'ultCDR3b')]
write.table(df3, "cluster_member_details.txt", sep = "\t", row.names = FALSE, quote = FALSE)

EOF

# Run the R script
Rscript run_gliph2.R

# Rename local_similarities file to standardize output name
input_file="local_similarities_*.txt"
cat \$input_file > local_similarities.txt
32 changes: 19 additions & 13 deletions modules/local/compare/tcrsharing.nf
@@ -19,30 +19,36 @@
# Load data
df = pd.read_csv("${concat_cdr3}", sep="\t")

# Step 1: Map samples to integers
sample_mapping = {sample: i + 1 for i, sample in enumerate(df['sample'].unique())}
sample_map_df = pd.DataFrame.from_dict(sample_mapping, orient='index', columns=['sample_id']).reset_index()
sample_map_df.columns = ['patient', 'sample_id']
# Map sample to integer codes
df['sample'] = df['sample'].astype('category')
df['sample_id'] = df['sample'].cat.codes + 1

# Export mapping (uses category lookup directly)
sample_map_df = pd.DataFrame({
'patient': df['sample'].cat.categories,
'sample_id': np.arange(1, len(df['sample'].cat.categories) + 1)
})
sample_map_df.to_csv("sample_mapping.tsv", sep="\t", index=False)

# Step 2: Group by CDR3b and aggregate sample_ids
df['sample_id'] = df['sample'].map(sample_mapping)

# Get unique sample_ids per CDR3b (vectorized)
grouped = (
df.groupby('CDR3b')['sample_id']
.apply(lambda x: sorted(set(x))) # remove duplicates if any
.unique() # one array of ids per CDR3b
.apply(np.sort) # sorted for stable output
.reset_index()
)

# Step 3: Add comma-separated list and total count
grouped['samples_present'] = grouped['sample_id'].apply(lambda x: ",".join(map(str, x)))
# Calculate counts
grouped['total_samples'] = grouped['sample_id'].apply(len)
grouped['samples_present'] = grouped['sample_id'].apply(
lambda arr: ",".join(arr.astype(str))
)

# Step 4: Final output — drop raw list
# Drop raw list
final_df = grouped[['CDR3b', 'total_samples', 'samples_present']]
final_df = final_df.sort_values(by='total_samples', axis=0, ascending=False)
final_df = final_df.sort_values(by="total_samples", ascending=False)

# Step 5: Export both outputs
# Export final list
final_df.to_csv("cdr3_sharing.tsv", sep="\t", index=False)
EOF

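The rewrite swaps the hand-built `enumerate` mapping for pandas categoricals, which keeps both the id assignment and the grouping vectorized. A toy run of the new approach, with made-up sample names:

```python
import numpy as np
import pandas as pd

# Toy data: one clonotype shared by two samples, one private to S02
df = pd.DataFrame({
    "CDR3b": ["CASSL", "CASSL", "CASSQ"],
    "sample": ["S01", "S02", "S02"],
})

df["sample"] = df["sample"].astype("category")
df["sample_id"] = df["sample"].cat.codes + 1  # stable 1-based ids

grouped = (
    df.groupby("CDR3b")["sample_id"]
    .unique()          # one array of sample ids per CDR3b
    .apply(np.sort)
    .reset_index()
)
grouped["total_samples"] = grouped["sample_id"].apply(len)
grouped["samples_present"] = grouped["sample_id"].apply(
    lambda arr: ",".join(arr.astype(str))
)
# CASSL -> total_samples 2, samples_present "1,2"; CASSQ -> 1, "2"
```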
4 changes: 1 addition & 3 deletions modules/local/sample/sample_calc.nf
@@ -14,13 +14,11 @@ process SAMPLE_CALC {
val sample_meta , emit: sample_meta

script:
def meta_json = groovy.json.JsonOutput.toJson(sample_meta)

"""
mkdir -p stats
mkdir -p vdj

sample_calc.py -s '${meta_json}' -c ${count_table}
sample_calc.py -s '${sample_meta.sample}' -c ${count_table}
"""

stub:
6 changes: 5 additions & 1 deletion modules/local/sample/sample_plot.nf
@@ -13,7 +13,7 @@ process SAMPLE_PLOT {
output:
path 'sample_stats.html'

script:
script:
"""
## copy quarto notebook to output directory
cp $sample_stats_template sample_stats.qmd
@@ -26,6 +26,10 @@
-P sample_table:$sample_table \
-P sample_stats_csv:$sample_stats_csv \
-P v_family_csv:$v_family_csv \
-P samplechart_x_col:${params.samplechart_x_col} \
-P samplechart_color_col:${params.samplechart_color_col} \
-P vgene_subject_col:${params.vgene_subject_col} \
-P vgene_x_cols:${params.vgene_x_cols} \
--to html
"""

6 changes: 6 additions & 0 deletions nextflow.config
@@ -26,6 +26,12 @@ params {
sample_stats_template = "${projectDir}/notebooks/sample_stats_template.qmd"
compare_stats_template = "${projectDir}/notebooks/compare_stats_template.qmd"

// Sample stats metadata parameters
samplechart_x_col = 'timepoint'
samplechart_color_col = 'origin'
vgene_subject_col = 'subject_id'
vgene_x_cols = 'origin,timepoint'

// GIANA parameters
threshold = 7.0
threshold_score = 3.6