From 542f21f8dfedcafd9a3326c278a8e7af259b27ef Mon Sep 17 00:00:00 2001
From: dltamayo <DLT@breakthroughcancer.org>
Date: Mon, 15 Dec 2025 14:15:31 -0500
Subject: [PATCH 1/7] Vectorize tcrsharing

---
 modules/local/compare/tcrsharing.nf | 32 +++++++++++++++++------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/modules/local/compare/tcrsharing.nf b/modules/local/compare/tcrsharing.nf
index b4b9fab..ff68619 100644
--- a/modules/local/compare/tcrsharing.nf
+++ b/modules/local/compare/tcrsharing.nf
@@ -19,30 +19,36 @@ process TCRSHARING_CALC {
     # Load data
     df = pd.read_csv("${concat_cdr3}", sep="\t")
 
-    # Step 1: Map samples to integers
-    sample_mapping = {sample: i + 1 for i, sample in enumerate(df['sample'].unique())}
-    sample_map_df = pd.DataFrame.from_dict(sample_mapping, orient='index', columns=['sample_id']).reset_index()
-    sample_map_df.columns = ['patient', 'sample_id']
+    # Map sample to integer codes
+    df['sample'] = df['sample'].astype('category')
+    df['sample_id'] = df['sample'].cat.codes + 1
+
+    # Export mapping (uses category lookup directly)
+    sample_map_df = pd.DataFrame({
+        'patient': df['sample'].cat.categories,
+        'sample_id': np.arange(1, len(df['sample'].cat.categories) + 1)
+    })
     sample_map_df.to_csv("sample_mapping.tsv", sep="\t", index=False)
 
-    # Step 2: Group by CDR3b and aggregate sample_ids
-    df['sample_id'] = df['sample'].map(sample_mapping)
-
+    # Get unique sample_ids per CDR3b — vectorized
     grouped = (
         df.groupby('CDR3b')['sample_id']
-        .apply(lambda x: sorted(set(x)))  # remove duplicates if any
+        .unique()     # UNIQUE — fast & vectorized
+        .apply(np.sort)  # SORT — vectorized
         .reset_index()
     )
 
-    # Step 3: Add comma-separated list and total count
-    grouped['samples_present'] = grouped['sample_id'].apply(lambda x: ",".join(map(str, x)))
+    # Calculate counts
     grouped['total_samples'] = grouped['sample_id'].apply(len)
+    grouped['samples_present'] = grouped['sample_id'].apply(
+        lambda arr: ",".join(arr.astype(str))
+    )
 
-    # Step 4: Final output — drop raw list
+    # Drop raw list
     final_df = grouped[['CDR3b', 'total_samples', 'samples_present']]
-    final_df = final_df.sort_values(by='total_samples', axis=0, ascending=False)
+    final_df = final_df.sort_values(by="total_samples", ascending=False)
 
-    # Step 5: Export both outputs
+    # Export final list
     final_df.to_csv("cdr3_sharing.tsv", sep="\t", index=False)
     EOF
 

From 2adef3b364008d8bf109776a6ebca2d6912bc774 Mon Sep 17 00:00:00 2001
From: dltamayo <DLT@breakthroughcancer.org>
Date: Tue, 16 Dec 2025 15:08:22 -0500
Subject: [PATCH 2/7] Update GIANA post-processing Use awk instead of python to
 avoid reading files into memory

---
 modules/local/compare/giana.nf | 38 +++++++++++++---------------------
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/modules/local/compare/giana.nf b/modules/local/compare/giana.nf
index b062f5f..561c1b3 100644
--- a/modules/local/compare/giana.nf
+++ b/modules/local/compare/giana.nf
@@ -28,30 +28,20 @@ process GIANA_CALC {
         > giana.log 2>&1
 
     # Insert header after GIANA comments
-    python3 - <<EOF
-    input_file = "giana_RotationEncodingBL62.txt"
-    concat_header_file = "${concat_cdr3}"
-
-    with open(concat_header_file, 'r', encoding='utf-8') as f:
-        header = f.readline().strip().split('\\t')
-    header.insert(1, "cluster")
-    header_line = '\\t'.join(header)
-
-    with open(input_file, 'r', encoding='utf-8') as infile:
-        lines = infile.readlines()
-
-    with open(input_file, 'w', encoding='utf-8') as outfile:
-        inserted = False
-        for line in lines:
-            if line.startswith("##"):
-                outfile.write(line)
-            elif not inserted:
-                outfile.write(header_line + '\\n')
-                outfile.write(line)
-                inserted = True
-            else:
-                outfile.write(line)
-    EOF
+    insert=\$(head -n 1 "${concat_cdr3}")
+    insert=\$(echo "\$insert" | awk -F'\t' 'BEGIN{OFS="\t"} {
+        out = \$1 OFS "cluster"
+        for (i=2; i<=NF; i++) {
+            out = out OFS \$i
+        }
+        print out
+    }')
+
+    awk -v insert="\$insert" '
+    /^##/ { print; next }
+    !inserted { print insert; inserted=1 }
+    { print }
+    ' giana_RotationEncodingBL62.txt > tmp && mv tmp giana_RotationEncodingBL62.txt
 
     mv giana_RotationEncodingBL62.txt_EncodingMatrix.txt giana_EncodingMatrix.txt
     """

From 626f5fa51d71b6e689d9edd0e311f7722d4cb80a Mon Sep 17 00:00:00 2001
From: dltamayo <DLT@breakthroughcancer.org>
Date: Wed, 17 Dec 2025 15:59:13 -0500
Subject: [PATCH 3/7] Remove hardcoded metadata from compare - `sample` is the
 unique identifier within samplesheet, no need to include other metadata
 columns for the purposes of clustering.

---
 bin/compare_concatenate.py      | 12 +++---------
 modules/local/compare/gliph2.nf | 13 +++++--------
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/bin/compare_concatenate.py b/bin/compare_concatenate.py
index 7b92891..2138a2f 100755
--- a/bin/compare_concatenate.py
+++ b/bin/compare_concatenate.py
@@ -29,18 +29,12 @@ def main():
         # Read the TSV file into a dataframe
         file_path = str(row['file'])
         df = pd.read_csv(file_path, sep="\t", header=0)
-        
-        # Get metadata
-        subject_id = row['subject_id']
-        timepoint = row['timepoint']
-        origin = row['origin']
-            
+
         # Add patient column
-        df['patient'] = f"{subject_id}:{timepoint}_{origin}"
         df['sample'] = row['sample']
-        
+
         # Select relevant columns
-        df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'patient', 'sample']]
+        df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'sample']]
         dfs.append(df)
 
 
diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf
index 36ca9f4..fafefde 100644
--- a/modules/local/compare/gliph2.nf
+++ b/modules/local/compare/gliph2.nf
@@ -18,15 +18,14 @@ process GLIPH2_TURBOGLIPH {
     
     script:
     """
-    # R script starts here
-    cat > run_gliph2.R <<EOF
+    Rscript - <<EOF
     #!/usr/bin/env Rscript
 
     library(turboGliph)
 
     # During testing, including TRBJ column was causing issues in clustering step. Removing and reinserting afterwards.
     df <- read.csv("$concat_cdr3", sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)
-    # df2 <- subset(df, select = c('CDR3b', 'TRBV', 'patient', 'counts'))
+    df['patient'] <- df['sample']
 
     result <- turboGliph::gliph2(
         cdr3_sequences = df,
@@ -40,14 +39,12 @@ process GLIPH2_TURBOGLIPH {
     )
 
     df3 <- read.csv('cluster_member_details.txt', sep = '\t', stringsAsFactors = FALSE, check.names = FALSE)
-    df3 <- merge(df3, df[, c("CDR3b", "TRBV", "patient", "TRBJ", 'counts')], by = c("CDR3b", "TRBV", "patient", 'counts'), all.x = TRUE)
+    df3['sample'] <- df3['patient']
+    df3 <- merge(df3, df[, c("CDR3b", "TRBV", "sample", 'counts')], by = c("CDR3b", "TRBV", "sample", 'counts'), all.x = TRUE)
+    df3 <- df3[, c('CDR3b', 'TRBV', 'TRBJ', 'counts', 'sample', 'tag', 'seq_ID', 'ultCDR3b')]
     write.table(df3, "cluster_member_details.txt", sep = "\t", row.names = FALSE, quote = FALSE)
-
     EOF
 
-    # Run the R script
-    Rscript run_gliph2.R
-
     # Rename local_similarities file to standardize output name
     input_file="local_similarities_*.txt"
     cat \$input_file > local_similarities.txt

From bc09d6b3a7b6e40a23c3bad1bdb211b4ba839654 Mon Sep 17 00:00:00 2001
From: dltamayo <DLT@breakthroughcancer.org>
Date: Wed, 17 Dec 2025 16:18:52 -0500
Subject: [PATCH 4/7] Remove harcoded metadata from sample - reformatted
 sample_calc.py to only add sample as a metadata column; the rest of the
 metadata columns are now merged in the quarto notebook based on sample -
 added adjustable parameters allowing user to specify which metadata columns
 to use for notebook charts - cleaned up a lot of code within the notebook for
 aesthetics, reducing code redundancy

---
 bin/sample_calc.py                  |  54 +++-----
 modules/local/sample/sample_calc.nf |   4 +-
 modules/local/sample/sample_plot.nf |   6 +-
 nextflow.config                     |   6 +
 notebooks/sample_stats_template.qmd | 191 ++++++++++++++++------------
 5 files changed, 139 insertions(+), 122 deletions(-)

diff --git a/bin/sample_calc.py b/bin/sample_calc.py
index f5d219a..9614775 100755
--- a/bin/sample_calc.py
+++ b/bin/sample_calc.py
@@ -22,18 +22,7 @@ def extract_trb_family(allele):
     match = re.match(r'(TRB[V|D|J])(\d+)', allele)
     return f"{match.group(1)}{match.group(2)}" if match else None
 
-def compute_gene_family_table(counts, col_name, all_families, sample_meta):
-    fam_col = f"{col_name}FamilyName"
-    counts[fam_col] = counts[col_name].apply(extract_trb_family)
-    fam_df = counts[fam_col].value_counts(dropna=False).to_frame().T.sort_index(axis=1)
-    fam_df = fam_df.reindex(columns=all_families, fill_value=0)
-
-    for col in ['origin', 'timepoint', 'subject_id']:
-        fam_df.insert(0, col, sample_meta[col])
-
-    return fam_df
-
-def calc_gene_family(counts, gene_column, family_prefix, max_index, output_file, meta_df):
+def calc_gene_family(sample_name, counts, gene_column, family_prefix, max_index, output_file):
     # Build list of all possible family names
     all_fams = [f'{family_prefix}{i}' for i in range(1, max_index + 1)]
 
@@ -43,12 +32,12 @@ def calc_gene_family(counts, gene_column, family_prefix, max_index, output_file,
     # Reindex to include all families
     fam_df = pd.DataFrame([fam_df.reindex(columns=all_fams, fill_value=0).iloc[0]]).reset_index(drop=True)
 
-    # Add metadata columns
-    fam_df = pd.concat([meta_df, fam_df], axis=1)
+    # Add sample column
+    fam_df.insert(0, 'sample', sample_name)
 
     fam_df.to_csv(output_file, header=True, index=False)
 
-def calc_sample_stats(meta_df, counts, output_file):
+def calc_sample_stats(sample_name, counts, output_file):
     """Calculate sample level statistics of TCR repertoire."""
 
     ## first pass stats
@@ -105,8 +94,8 @@ def calc_sample_stats(meta_df, counts, output_file):
     # Convert to single-row dataframe
     df_stats = pd.DataFrame([row_data])
 
-    # Add metadata columns
-    df_stats = pd.concat([meta_df, df_stats], axis=1)
+    # Add sample column
+    df_stats.insert(0, 'sample', sample_name)
 
     # Save to CSV
     df_stats.to_csv(output_file, header=True, index=False)
@@ -117,10 +106,10 @@ def main():
     parser = argparse.ArgumentParser(description='Calculate clonality of a TCR repertoire')
 
     # add arguments
-    parser.add_argument('-s', '--sample_meta', 
-                        metavar='sample_meta', 
+    parser.add_argument('-s', '--sample_name', 
+                        metavar='sample_name', 
                         type=str, 
-                        help='sample metadata passed in as json format')
+                        help='sample name')
     parser.add_argument('-c', '--count_table', 
                         metavar='count_table', 
                         type=argparse.FileType('r'), 
@@ -128,29 +117,16 @@ def main():
 
     args = parser.parse_args() 
 
-    ## convert metadata to list
-    sample_meta = json.loads(args.sample_meta)
+    sample = args.sample_name
 
     # Read in the counts file
-    counts = pd.read_csv(args.count_table, sep='\t', header=0)
-
-    # Build metadata row from selected keys
-    meta_keys = ['subject_id', 'timepoint', 'origin']
-    meta_row = {k: sample_meta[k] for k in meta_keys}
-    meta_df = pd.DataFrame([meta_row])
-
-    sample = sample_meta['sample']
-
-    calc_gene_family(counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv', meta_df)
-    calc_gene_family(counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv', meta_df)
-    calc_gene_family(counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv', meta_df)
+    counts = pd.read_csv(args.count_table, sep='\t')
 
-    # Build metadata row from selected keys
-    meta_keys = ['sample', 'subject_id', 'timepoint', 'origin']
-    meta_row = {k: sample_meta[k] for k in meta_keys}
-    meta_df = pd.DataFrame([meta_row])
+    calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv')
+    calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv')
+    calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv')
     
-    calc_sample_stats(meta_df, counts, f'stats/sample_stats_{sample}.csv')
+    calc_sample_stats(sample, counts, f'stats/sample_stats_{sample}.csv')
 
 if __name__ == "__main__":
     main()
diff --git a/modules/local/sample/sample_calc.nf b/modules/local/sample/sample_calc.nf
index 1a72867..dff4c0a 100644
--- a/modules/local/sample/sample_calc.nf
+++ b/modules/local/sample/sample_calc.nf
@@ -14,13 +14,11 @@ process SAMPLE_CALC {
     val sample_meta                                  , emit: sample_meta
 
     script:
-    def meta_json = groovy.json.JsonOutput.toJson(sample_meta)
-
     """
     mkdir -p stats
     mkdir -p vdj
     
-    sample_calc.py -s '${meta_json}' -c ${count_table}
+    sample_calc.py -s '${sample_meta.sample}' -c ${count_table}
     """
 
     stub:
diff --git a/modules/local/sample/sample_plot.nf b/modules/local/sample/sample_plot.nf
index 272533d..5ca0494 100644
--- a/modules/local/sample/sample_plot.nf
+++ b/modules/local/sample/sample_plot.nf
@@ -13,7 +13,7 @@ process SAMPLE_PLOT {
     output:
     path 'sample_stats.html'
 
-    script:    
+    script:
     """
     ## copy quarto notebook to output directory
     cp $sample_stats_template sample_stats.qmd
@@ -26,6 +26,10 @@ process SAMPLE_PLOT {
         -P sample_table:$sample_table \
         -P sample_stats_csv:$sample_stats_csv \
         -P v_family_csv:$v_family_csv \
+        -P samplechart_x_col:${params.samplechart_x_col} \
+        -P samplechart_color_col:${params.samplechart_color_col} \
+        -P vgene_subject_col:${params.vgene_subject_col} \
+        -P vgene_x_cols:${params.vgene_x_cols} \
         --to html
     """
 
diff --git a/nextflow.config b/nextflow.config
index 32f7938..45b8fbd 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -26,6 +26,12 @@ params {
     sample_stats_template = "${projectDir}/notebooks/sample_stats_template.qmd"
     compare_stats_template = "${projectDir}/notebooks/compare_stats_template.qmd"
 
+    // Sample stats metadata parameters
+    samplechart_x_col = 'timepoint'
+    samplechart_color_col = 'origin'
+    vgene_subject_col = 'subject_id'
+    vgene_x_cols = 'timepoint,origin'
+
     // GIANA parameters
     threshold = 7.0
     threshold_score = 3.6
diff --git a/notebooks/sample_stats_template.qmd b/notebooks/sample_stats_template.qmd
index 648732f..d3b89ed 100644
--- a/notebooks/sample_stats_template.qmd
+++ b/notebooks/sample_stats_template.qmd
@@ -31,14 +31,21 @@ Thank you for using TCRtoolkit! This report is generated from sample data and me
 #| echo: false
 
 workflow_cmd='<command used to run the pipeline>'
-project_name='path/to/project_name'
-project_dir='path/to/project_dir'
-sample_table='path/to/sample_table.csv'
-sample_stats_csv='path/to/sample_stats.csv'
-v_family_csv='path/to/v_family.csv'
+project_name='<path/to/project_name>'
+project_dir='<path/to/project_dir>'
+sample_table='<path/to/sample_table.csv>'
+sample_stats_csv='<path/to/sample_stats.csv>'
+v_family_csv='<path/to/v_family.csv>'
+
+samplechart_x_col='<x column for section 2>'
+samplechart_color_col='<colors for section 2>'
+vgene_subject_col='<subject column for section 3>'
+vgene_x_cols='<x columns for section 3>'
 ```
 
 ```{python}
+#| code-fold: true
+
 # 1. Load Packages
 from IPython.display import Image
 import os
@@ -65,19 +72,20 @@ print('Date and time:         ' + str(datetime.datetime.now()))
 # TicToc = TicTocGenerator()
 
 # 4. Loading data
+## reading sample metadata
+meta = pd.read_csv(sample_table, sep=',')
+meta_cols = meta.columns.tolist()[0:-1]
 
 ## reading combined repertoire statistics
 df = pd.read_csv(sample_stats_csv, sep=',')
-# print('-- Imported sample_stats_csv as `df`...')
-
-## reading sample metadata
-meta = pd.read_csv(sample_table, sep=',', header=None, index_col=None,
-                   names=['sample', 'file', 'subject_id', 'timepoint', 'origin'])
-# print('-- Imported sample_table as `meta`...')
+df = pd.merge(df, meta, on='sample', how='left')
+df = df[meta_cols + [c for c in df.columns if c not in meta_cols]]
 
 ## reading V gene family usage 
 v_family = pd.read_csv(v_family_csv, sep=',')
-v_family = v_family.sort_values(by=['subject_id', 'timepoint'])
+v_family = pd.merge(v_family, meta, on='sample', how='left')
+v_family = v_family[meta_cols + [c for c in v_family.columns if c not in meta_cols]]
+v_family = v_family.sort_values(by=[vgene_subject_col])
 ```
 
 # Sample level statistics {#sec-sample-level-stats}
@@ -86,46 +94,56 @@ Below are plots showing basic T cell repertoire statistics. Each plot has a desc
 
 Version 3 of these plots features plotly express interactive plots. This version is exploratory and may be updated in the future.
 
+```{python}
+#| code-fold: true
+
+
+x_category = df[samplechart_x_col].unique().tolist()
+x_category.sort()
+
+def samplechart(df, y_col):
+    fig = px.box(
+        df, 
+        x=samplechart_x_col, 
+        y=y_col, 
+        color=samplechart_color_col, 
+        points='all', hover_data=['sample'],
+        category_orders={f'{samplechart_x_col}': x_category}
+    )
+
+    fig.update_layout(
+        margin=dict(b=80)
+    )
+
+    fig.show()
+```
+
 ## Number of clones
 
 ```{python}
-timepts = df.timepoint.unique().tolist()
-timepts.sort()
-fig = px.box(df, 
-             x = 'timepoint', 
-             y='num_clones', 
-             color='origin', 
-             points='all', hover_data=['sample'],
-             category_orders={'timepoint': timepts})
-fig.show()
+#| code-fold: true
+
+samplechart(df, y_col='num_clones')
 ```
 
-**Figure 1. Number of clones across timepoints.** A clone is defined as a T cell with a unique CDR3 amino acid sequence. The number of clones is shown on the y-axis and 'origin_timepoint' is shown on the x-axis. 
+**Figure 1. Number of clones across sample groupings.** A clone is defined as a T cell with a unique CDR3 amino acid sequence. The number of clones is shown on the y-axis. The x-axis represents sample groupings defined by user-specified metadata fields (eg. origin, timepoint).
 
 ## Clonality
 
 ```{python}
-fig = px.box(df, 
-             x = 'timepoint', 
-             y='clonality', 
-             color='origin', 
-             points='all', hover_data=['sample'],
-             category_orders={'timepoint': timepts})
-fig.show()
+#| code-fold: true
+
+samplechart(df, y_col='clonality')
 ```
 
-**Figure 2. The clonality of samples across timepoints.** Clonality is a measure of T cell clonal expansion and reflects the degree to which the sample is dominated by 1 or more T cell clones. Clonality is calculated via: $$Clonality = \frac {1-H} {\log_{2} N} \quad\text{,}\quad H = -\sum\limits_{i=1}^N p_i \log_{2}{p_i}$$ where $H$ is the Shannon entropy of a given sample, $N$ is the number of unique TCRs in the sample, and $p_i$ is the frequency of the $i$ th unique TCR in the sample. 
+**Figure 2. The clonality of samples across sample groupings.** Clonality is a measure of T cell clonal expansion and reflects the degree to which the sample is dominated by 1 or more T cell clones. Clonality is calculated via: $$Clonality = \frac {1-H} {\log_{2} N} \quad\text{,}\quad H = -\sum\limits_{i=1}^N p_i \log_{2}{p_i}$$ where $H$ is the Shannon entropy of a given sample, $N$ is the number of unique TCRs in the sample, and $p_i$ is the frequency of the $i$ th unique TCR in the sample. 
 
 ## Simpson Index
 
 ```{python}
-fig = px.box(df, 
-             x = 'timepoint', 
-             y='simpson_index_corrected', 
-             color='origin', 
-             points='all', hover_data=['sample'],
-             category_orders={'timepoint': timepts})
-fig.show()
+#| code-fold: true
+
+samplechart(df, y_col='simpson_index_corrected')
 ```
 
 **Figure 3. Corrected Simpson Index.** The Simpson Index is a measure of diversity that takes into account the number of clones and the relative abundance of each clone in a sample. The corrected Simpson Index, $D$, is calculated as: 
@@ -137,13 +155,9 @@ Where $N$ is the number of unique TCRs in the sample, $p_i$ is the frequency of
 ## Percent of productive TCRs
 
 ```{python}
-fig = px.box(df, 
-             x = 'timepoint', 
-             y='pct_prod', 
-             color='origin', 
-             points='all', hover_data=['sample'],
-             category_orders={'timepoint': timepts})
-fig.show()
+#| code-fold: true
+
+samplechart(df, y_col='pct_prod')
 ```
 
 **Figure 4. Percent of productive TCRs.** A productive TCR is a DNA/RNA sequence that can be translated into a protein sequence, i.e. it does not contain a premature stop codon or an out of frame rearrangement. The percent of productive TCRs is calculated as: 
@@ -155,13 +169,9 @@ where $P$ is the number of productive TCRs and $N$ is the total number of TCRs i
 ## Average productive CDR3 Length
 
 ```{python}
-fig = px.box(df, 
-             x = 'timepoint', 
-             y='productive_cdr3_avg_len', 
-             color='origin', 
-             points='all', hover_data=['sample'],
-             category_orders={'timepoint': timepts})
-fig.show()
+#| code-fold: true
+
+samplechart(df, y_col='productive_cdr3_avg_len')
 ```
 
 **Figure 5. Average Productive CDR3 Length** The average length of the CDR3 region of the TCR for productive clones. The CDR3 region is the most variable region of the TCR and is the region that determines antigen specificity.
@@ -169,13 +179,9 @@ fig.show()
 ## TCR Convergence
 
 ```{python}
-fig = px.box(df, 
-             x = 'timepoint', 
-             y='ratio_convergent', 
-             color='origin', 
-             points='all', hover_data=['sample'],
-             category_orders={'timepoint': timepts})
-fig.show()
+#| code-fold: true
+
+samplechart(df, y_col='ratio_convergent')
 ```
 
 **Figure 6. TCR Convergence** The ratio of convergent TCRs to total TCRs. A convergent TCR is a TCR that is generated via 2 or more unique nucleotide sequences via codon degeneracy. 
@@ -195,43 +201,70 @@ $$
 where $N_{k}$ is the number of TCRs that use the $k$ th V gene, and T is the total number of TCRs in the sample.
 
 ```{python}
+#| code-fold: true
+
 ## code adapted from https://www.moritzkoerber.com/posts/plotly-grouped-stacked-bar-chart/
 colors = ["#fafa70","#fdef6b","#ffe566","#ffda63","#ffd061","#ffc660","#ffbb5f","#fdb15f","#fba860","#f79e61","#f39562","#ef8c63","#e98365","#e37b66","#dd7367","#d66b68","#ce6469","#c65e6a","#bd576b","#b4526b","#ab4c6b","#a1476a","#974369","#8c3e68","#823a66","#773764","#6d3361","#62305e","#572c5a","#4d2956"]
+vgene_x_cols = vgene_x_cols.split(',')
 
 ## calculate calulate proportions and add to v_family_long
-v_family_long = pd.melt(v_family, id_vars=['subject_id', 'timepoint', 'origin'], value_vars=v_family.columns[3:], var_name='v_gene', value_name='count')
-v_family_long['proportion'] = v_family_long.groupby(['subject_id', 'timepoint', 'origin'])['count'].transform(lambda x: x / x.sum())
+v_family_long = pd.melt(v_family,
+                        id_vars=meta_cols,
+                        value_vars=[c for c in v_family.columns if c.startswith('TRBV')],
+                        var_name='v_gene',
+                        value_name='count')
+v_family_long['proportion'] = v_family_long.groupby(meta_cols)['count'].transform(lambda x: x / x.sum())
 
 ## add in the total number of v genes for each sample
-total_v_genes = v_family_long.groupby(['subject_id', 'timepoint', 'origin'])['count'].sum().reset_index()
-total_v_genes.columns = ['subject_id', 'timepoint', 'origin', 'total_v_genes']
-v_family_long = pd.merge(v_family_long, total_v_genes, on=['subject_id', 'timepoint', 'origin'])
+total_v_genes = v_family_long.groupby(meta_cols)['count'].sum().reset_index().rename(columns={'count': 'total_v_genes'})
+v_family_long = v_family_long.merge(total_v_genes, on=meta_cols)
+
+subjects = (
+    v_family_long[vgene_subject_col]
+    .dropna()
+    .unique()
+)
 
-for patient in v_family_long.subject_id.unique().tolist():
-    current = v_family_long[v_family_long.subject_id == patient]
+for subject in subjects:
+    current = v_family_long[v_family_long[vgene_subject_col] == subject]
     fig = go.Figure()
     fig.update_layout(
         template="simple_white",
-        title_text=f"Patient: {patient}",
-        xaxis=dict(title_text="timepoint"),
+        title_text=f"Subject: {subject}",
+        xaxis=dict(title_text=f"{', '.join(vgene_x_cols)}"),
         yaxis=dict(title_text="proportion"),
         barmode="stack",
     )
 
     for g, c in zip(current.v_gene.unique(), colors):
         plot_df = current[current.v_gene == g]
-        if g == 'TRBV30':
-            fig.add_trace(
-                go.Bar(x=[plot_df.timepoint, plot_df.origin], 
-                       y=plot_df.proportion, name=g, marker_color=c,
-                       text=plot_df['total_v_genes'], textposition='outside'
-                      )
-                )
-        else:
-            fig.add_trace(
-                go.Bar(x=[plot_df.timepoint, plot_df.origin], 
-                       y=plot_df.proportion, name=g, marker_color=c
-                       )
+        fig.add_trace(
+            go.Bar(
+                x=[plot_df[col] for col in vgene_x_cols],
+                y=plot_df['proportion'],
+                name=g,
+                marker_color=c,
+                legendgroup=g,
+                text=plot_df['total_v_genes'] if g == 'TRBV30' else None,
+                textposition='outside' if g == 'TRBV30' else None,
+            )
+        )
+        
+        fig.update_layout(
+            margin=dict(b=80, r=250),
+            legend=dict(
+                orientation="v",
+                x=1.02,
+                y=1,
+                yanchor="top",
+                itemsizing="constant",
+                traceorder="reversed"
+            )
+        )
+
+        fig.update_xaxes(
+            automargin=True,
+            title_standoff=30
         )
 
     fig.show()

From f63cfeb7bb99976c14ca227ebecd398f2447b82b Mon Sep 17 00:00:00 2001
From: dltamayo <DLT@breakthroughcancer.org>
Date: Fri, 19 Dec 2025 12:21:51 -0500
Subject: [PATCH 5/7] Address Copilot suggestions

---
 modules/local/compare/gliph2.nf     | 4 ++--
 notebooks/sample_stats_template.qmd | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf
index fafefde..486442d 100644
--- a/modules/local/compare/gliph2.nf
+++ b/modules/local/compare/gliph2.nf
@@ -25,7 +25,7 @@ process GLIPH2_TURBOGLIPH {
 
     # During testing, including TRBJ column was causing issues in clustering step. Removing and reinserting afterwards.
     df <- read.csv("$concat_cdr3", sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)
-    df['patient'] <- df['sample']
+    df[,'patient'] <- df[,'sample']
 
     result <- turboGliph::gliph2(
         cdr3_sequences = df,
@@ -39,7 +39,7 @@ process GLIPH2_TURBOGLIPH {
     )
 
     df3 <- read.csv('cluster_member_details.txt', sep = '\t', stringsAsFactors = FALSE, check.names = FALSE)
-    df3['sample'] <- df3['patient']
+    df3[,'sample'] <- df3[,'patient']
     df3 <- merge(df3, df[, c("CDR3b", "TRBV", "sample", 'counts')], by = c("CDR3b", "TRBV", "sample", 'counts'), all.x = TRUE)
     df3 <- df3[, c('CDR3b', 'TRBV', 'TRBJ', 'counts', 'sample', 'tag', 'seq_ID', 'ultCDR3b')]
     write.table(df3, "cluster_member_details.txt", sep = "\t", row.names = FALSE, quote = FALSE)
diff --git a/notebooks/sample_stats_template.qmd b/notebooks/sample_stats_template.qmd
index d3b89ed..9e683d8 100644
--- a/notebooks/sample_stats_template.qmd
+++ b/notebooks/sample_stats_template.qmd
@@ -74,7 +74,7 @@ print('Date and time:         ' + str(datetime.datetime.now()))
 # 4. Loading data
 ## reading sample metadata
 meta = pd.read_csv(sample_table, sep=',')
-meta_cols = meta.columns.tolist()[0:-1]
+meta_cols = meta.columns.tolist()
 
 ## reading combined repertoire statistics
 df = pd.read_csv(sample_stats_csv, sep=',')
@@ -207,7 +207,7 @@ where $N_{k}$ is the number of TCRs that use the $k$ th V gene, and T is the tot
 colors = ["#fafa70","#fdef6b","#ffe566","#ffda63","#ffd061","#ffc660","#ffbb5f","#fdb15f","#fba860","#f79e61","#f39562","#ef8c63","#e98365","#e37b66","#dd7367","#d66b68","#ce6469","#c65e6a","#bd576b","#b4526b","#ab4c6b","#a1476a","#974369","#8c3e68","#823a66","#773764","#6d3361","#62305e","#572c5a","#4d2956"]
 vgene_x_cols = vgene_x_cols.split(',')
 
-## calculate calulate proportions and add to v_family_long
+## calculate proportions and add to v_family_long
 v_family_long = pd.melt(v_family,
                         id_vars=meta_cols,
                         value_vars=[c for c in v_family.columns if c.startswith('TRBV')],

From ef327655ba41ebb9b98b4cc597fccfa846287903 Mon Sep 17 00:00:00 2001
From: dltamayo <DLT@breakthroughcancer.org>
Date: Fri, 19 Dec 2025 12:23:19 -0500
Subject: [PATCH 6/7] Update notebook - add logic to parse when submitting one
 grouping metadata column versus multiple for vgene chart

---
 notebooks/sample_stats_template.qmd | 94 ++++++++++++++++++++++-------
 1 file changed, 73 insertions(+), 21 deletions(-)

diff --git a/notebooks/sample_stats_template.qmd b/notebooks/sample_stats_template.qmd
index 9e683d8..5a97598 100644
--- a/notebooks/sample_stats_template.qmd
+++ b/notebooks/sample_stats_template.qmd
@@ -59,6 +59,13 @@ from matplotlib.colors import LinearSegmentedColormap
 import plotly.express as px
 import plotly.graph_objects as go
 
+import warnings
+warnings.filterwarnings(
+    "ignore",
+    category=FutureWarning,
+    module="plotly"
+)
+
 # 2. Print pipeline parameters
 
 print('Project Name:          ' + project_name)
@@ -107,7 +114,8 @@ def samplechart(df, y_col):
         x=samplechart_x_col, 
         y=y_col, 
         color=samplechart_color_col, 
-        points='all', hover_data=['sample'],
+        points='all',
+        hover_data='sample',
         category_orders={f'{samplechart_x_col}': x_category}
     )
 
@@ -230,42 +238,86 @@ for subject in subjects:
     fig = go.Figure()
     fig.update_layout(
         template="simple_white",
-        title_text=f"Subject: {subject}",
+        title_text=f"{vgene_subject_col}: {subject}",
         xaxis=dict(title_text=f"{', '.join(vgene_x_cols)}"),
         yaxis=dict(title_text="proportion"),
         barmode="stack",
+        margin=dict(b=80, r=115),
+        legend=dict(
+            orientation="v",
+            x=1.02,
+            y=1,
+            yanchor="top",
+            itemsizing="constant",
+            traceorder="reversed"
+        )
+    )
+
+    fig.update_xaxes(
+        automargin=True,
+        title_standoff=30
+    )
+
+    if len(vgene_x_cols) == 1:
+        x_vals = current[vgene_x_cols[0]]
+    else:
+        x_vals = current[vgene_x_cols].astype(str).agg('_'.join, axis=1)
+    summary = (
+        current
+        .assign(x=x_vals)
+        .drop_duplicates(subset=meta_cols)
+        .groupby('x', as_index=False)
+        .agg(
+            total_v_genes=('total_v_genes', 'sum'),
+        )
     )
+    y_df = (
+        current
+        .assign(x=x_vals)
+        .groupby('x', as_index=False)
+        .agg(
+            y_top=('proportion', 'sum')
+        )
+    )
+    summary = y_df.merge(summary, on='x')
 
     for g, c in zip(current.v_gene.unique(), colors):
         plot_df = current[current.v_gene == g]
+        if len(vgene_x_cols) == 1:
+            x = plot_df[vgene_x_cols[0]]
+        else:
+            x = [plot_df[col] for col in vgene_x_cols]
         fig.add_trace(
             go.Bar(
-                x=[plot_df[col] for col in vgene_x_cols],
+                x=x,
                 y=plot_df['proportion'],
                 name=g,
                 marker_color=c,
                 legendgroup=g,
-                text=plot_df['total_v_genes'] if g == 'TRBV30' else None,
-                textposition='outside' if g == 'TRBV30' else None,
-            )
-        )
-        
-        fig.update_layout(
-            margin=dict(b=80, r=250),
-            legend=dict(
-                orientation="v",
-                x=1.02,
-                y=1,
-                yanchor="top",
-                itemsizing="constant",
-                traceorder="reversed"
+
+                customdata=plot_df[['sample']],
+                hovertemplate=(
+                    "<b>V gene:</b> %{fullData.name}<br>"
+                    "<b>Sample:</b> %{customdata[0]}<br>"
+                    "<b>Proportion:</b> %{y}<br>"
+                    "<extra></extra>"
+                ),
+
+                text=plot_df['total_v_genes'] if ((g == 'TRBV30') and (len(vgene_x_cols) != 1)) else None,
+                textposition='outside',
             )
         )
 
-        fig.update_xaxes(
-            automargin=True,
-            title_standoff=30
-        )
+    if len(vgene_x_cols) == 1:
+        for _, row in summary.iterrows():
+            fig.add_annotation(
+                x=row['x'],
+                    y=row['y_top'] + 0.02,
+                    text=str(row['total_v_genes']),
+                    showarrow=False,
+                    yanchor='bottom',
+                    font=dict(size=12)
+            )
 
     fig.show()
 ```

From 8760a312465e01cb56566173580f3b16cd58799b Mon Sep 17 00:00:00 2001
From: dltamayo <DLT@breakthroughcancer.org>
Date: Fri, 19 Dec 2025 13:07:00 -0500
Subject: [PATCH 7/7] Add Cirro config

---
 .cirro/process-form.json  | 25 ++++++++++++++++++++++++-
 .cirro/process-input.json |  6 +++++-
 nextflow.config           |  2 +-
 3 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/.cirro/process-form.json b/.cirro/process-form.json
index ee8c4df..da2d687 100644
--- a/.cirro/process-form.json
+++ b/.cirro/process-form.json
@@ -50,8 +50,31 @@
                 "description": "p_depth (GLIPH2)",
                 "title": "p_depth",
                 "type": "string"
+            },
+            "samplechart_x_col": {
+                "default": "timepoint",
+                "description": "Metadata column for x axis of sample-level plots for Sample workflow notebook",
+                "title": "Sample plot X axis column",
+                "type": "string"
+            },
+            "samplechart_color_col": {
+                "default": "origin",
+                "description": "Metadata column for legend color of sample-level plots for Sample workflow notebook",
+                "title": "Sample plot X axis column",
+                "type": "string"
+            },
+            "vgene_subject_col": {
+                "default": "subject_id",
+                "description": "Metadata column for grouping of samples for V gene plots for Sample workflow notebook",
+                "title": "V gene plot subject column",
+                "type": "string"
+            },
+            "vgene_x_cols": {
+                "default": "timepoint,origin",
+                "description": "Comma-separated list of metadata columns for x axis of V gene plot (eg. timepoint,origin or timepoint)",
+                "title": "V gene plot X axis columns",
+                "type": "string"
             }
-            
         }
     },
     "ui": {}
diff --git a/.cirro/process-input.json b/.cirro/process-input.json
index 98cf403..798fc2f 100644
--- a/.cirro/process-input.json
+++ b/.cirro/process-input.json
@@ -9,5 +9,9 @@
     "local_min_OVE": "$.params.dataset.paramJson.local_min_OVE",
     "local_min_pvalue": "$.params.dataset.paramJson.local_min_pvalue",
     "outdir": "$.params.dataset.s3|/data/",
-    "p_depth": "$.params.dataset.paramJson.p_depth"
+    "p_depth": "$.params.dataset.paramJson.p_depth",
+    "samplechart_x_col": "$.params.dataset.paramJson.samplechart_x_col",
+    "samplechart_color_col": "$.params.dataset.paramJson.samplechart_color_col",
+    "vgene_subject_col": "$.params.dataset.paramJson.vgene_subject_col",
+    "vgene_x_cols": "$.params.dataset.paramJson.vgene_x_cols"
 }
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index d15c1ac..f685ead 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -30,7 +30,7 @@ params {
     samplechart_x_col = 'timepoint'
     samplechart_color_col = 'origin'
     vgene_subject_col = 'subject_id'
-    vgene_x_cols = 'timepoint,origin'
+    vgene_x_cols = 'origin,timepoint'
 
     // GIANA parameters
     threshold = 7.0