diff --git a/.cirro/process-form.json b/.cirro/process-form.json index ee8c4df..da2d687 100644 --- a/.cirro/process-form.json +++ b/.cirro/process-form.json @@ -50,8 +50,31 @@ "description": "p_depth (GLIPH2)", "title": "p_depth", "type": "string" + }, + "samplechart_x_col": { + "default": "timepoint", + "description": "Metadata column for x axis of sample-level plots for Sample workflow notebook", + "title": "Sample plot X axis column", + "type": "string" + }, + "samplechart_color_col": { + "default": "origin", + "description": "Metadata column for legend color of sample-level plots for Sample workflow notebook", + "title": "Sample plot color column", + "type": "string" + }, + "vgene_subject_col": { + "default": "subject_id", + "description": "Metadata column for grouping of samples for V gene plots for Sample workflow notebook", + "title": "V gene plot subject column", + "type": "string" + }, + "vgene_x_cols": { + "default": "timepoint,origin", + "description": "Comma-separated list of metadata columns for x axis of V gene plot (eg. 
timepoint,origin or timepoint)", + "title": "V gene plot X axis columns", + "type": "string" } - } }, "ui": {} diff --git a/.cirro/process-input.json b/.cirro/process-input.json index 98cf403..798fc2f 100644 --- a/.cirro/process-input.json +++ b/.cirro/process-input.json @@ -9,5 +9,9 @@ "local_min_OVE": "$.params.dataset.paramJson.local_min_OVE", "local_min_pvalue": "$.params.dataset.paramJson.local_min_pvalue", "outdir": "$.params.dataset.s3|/data/", - "p_depth": "$.params.dataset.paramJson.p_depth" + "p_depth": "$.params.dataset.paramJson.p_depth", + "samplechart_x_col": "$.params.dataset.paramJson.samplechart_x_col", + "samplechart_color_col": "$.params.dataset.paramJson.samplechart_color_col", + "vgene_subject_col": "$.params.dataset.paramJson.vgene_subject_col", + "vgene_x_cols": "$.params.dataset.paramJson.vgene_x_cols" } \ No newline at end of file diff --git a/bin/compare_concatenate.py b/bin/compare_concatenate.py index 7b92891..2138a2f 100755 --- a/bin/compare_concatenate.py +++ b/bin/compare_concatenate.py @@ -29,18 +29,12 @@ def main(): # Read the TSV file into a dataframe file_path = str(row['file']) df = pd.read_csv(file_path, sep="\t", header=0) - - # Get metadata - subject_id = row['subject_id'] - timepoint = row['timepoint'] - origin = row['origin'] - + # Add patient column - df['patient'] = f"{subject_id}:{timepoint}_{origin}" df['sample'] = row['sample'] - + # Select relevant columns - df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'patient', 'sample']] + df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'sample']] dfs.append(df) diff --git a/bin/sample_calc.py b/bin/sample_calc.py index f5d219a..9614775 100755 --- a/bin/sample_calc.py +++ b/bin/sample_calc.py @@ -22,18 +22,7 @@ def extract_trb_family(allele): match = re.match(r'(TRB[V|D|J])(\d+)', allele) return f"{match.group(1)}{match.group(2)}" if match else None -def compute_gene_family_table(counts, col_name, all_families, sample_meta): - fam_col = 
f"{col_name}FamilyName" - counts[fam_col] = counts[col_name].apply(extract_trb_family) - fam_df = counts[fam_col].value_counts(dropna=False).to_frame().T.sort_index(axis=1) - fam_df = fam_df.reindex(columns=all_families, fill_value=0) - - for col in ['origin', 'timepoint', 'subject_id']: - fam_df.insert(0, col, sample_meta[col]) - - return fam_df - -def calc_gene_family(counts, gene_column, family_prefix, max_index, output_file, meta_df): +def calc_gene_family(sample_name, counts, gene_column, family_prefix, max_index, output_file): # Build list of all possible family names all_fams = [f'{family_prefix}{i}' for i in range(1, max_index + 1)] @@ -43,12 +32,12 @@ def calc_gene_family(counts, gene_column, family_prefix, max_index, output_file, # Reindex to include all families fam_df = pd.DataFrame([fam_df.reindex(columns=all_fams, fill_value=0).iloc[0]]).reset_index(drop=True) - # Add metadata columns - fam_df = pd.concat([meta_df, fam_df], axis=1) + # Add sample column + fam_df.insert(0, 'sample', sample_name) fam_df.to_csv(output_file, header=True, index=False) -def calc_sample_stats(meta_df, counts, output_file): +def calc_sample_stats(sample_name, counts, output_file): """Calculate sample level statistics of TCR repertoire.""" ## first pass stats @@ -105,8 +94,8 @@ def calc_sample_stats(meta_df, counts, output_file): # Convert to single-row dataframe df_stats = pd.DataFrame([row_data]) - # Add metadata columns - df_stats = pd.concat([meta_df, df_stats], axis=1) + # Add sample column + df_stats.insert(0, 'sample', sample_name) # Save to CSV df_stats.to_csv(output_file, header=True, index=False) @@ -117,10 +106,10 @@ def main(): parser = argparse.ArgumentParser(description='Calculate clonality of a TCR repertoire') # add arguments - parser.add_argument('-s', '--sample_meta', - metavar='sample_meta', + parser.add_argument('-s', '--sample_name', + metavar='sample_name', type=str, - help='sample metadata passed in as json format') + help='sample name') 
parser.add_argument('-c', '--count_table', metavar='count_table', type=argparse.FileType('r'), @@ -128,29 +117,16 @@ def main(): args = parser.parse_args() - ## convert metadata to list - sample_meta = json.loads(args.sample_meta) + sample = args.sample_name # Read in the counts file - counts = pd.read_csv(args.count_table, sep='\t', header=0) - - # Build metadata row from selected keys - meta_keys = ['subject_id', 'timepoint', 'origin'] - meta_row = {k: sample_meta[k] for k in meta_keys} - meta_df = pd.DataFrame([meta_row]) - - sample = sample_meta['sample'] - - calc_gene_family(counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv', meta_df) - calc_gene_family(counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv', meta_df) - calc_gene_family(counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv', meta_df) + counts = pd.read_csv(args.count_table, sep='\t') - # Build metadata row from selected keys - meta_keys = ['sample', 'subject_id', 'timepoint', 'origin'] - meta_row = {k: sample_meta[k] for k in meta_keys} - meta_df = pd.DataFrame([meta_row]) + calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv') + calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv') + calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv') - calc_sample_stats(meta_df, counts, f'stats/sample_stats_{sample}.csv') + calc_sample_stats(sample, counts, f'stats/sample_stats_{sample}.csv') if __name__ == "__main__": main() diff --git a/modules/local/compare/giana.nf b/modules/local/compare/giana.nf index b062f5f..561c1b3 100644 --- a/modules/local/compare/giana.nf +++ b/modules/local/compare/giana.nf @@ -28,30 +28,20 @@ process GIANA_CALC { > giana.log 2>&1 # Insert header after GIANA comments - python3 - < tmp && mv tmp giana_RotationEncodingBL62.txt mv giana_RotationEncodingBL62.txt_EncodingMatrix.txt giana_EncodingMatrix.txt """ diff --git a/modules/local/compare/gliph2.nf 
b/modules/local/compare/gliph2.nf index 36ca9f4..486442d 100644 --- a/modules/local/compare/gliph2.nf +++ b/modules/local/compare/gliph2.nf @@ -18,15 +18,14 @@ process GLIPH2_TURBOGLIPH { script: """ - # R script starts here - cat > run_gliph2.R < local_similarities.txt diff --git a/modules/local/compare/tcrsharing.nf b/modules/local/compare/tcrsharing.nf index b4b9fab..ff68619 100644 --- a/modules/local/compare/tcrsharing.nf +++ b/modules/local/compare/tcrsharing.nf @@ -19,30 +19,36 @@ process TCRSHARING_CALC { # Load data df = pd.read_csv("${concat_cdr3}", sep="\t") - # Step 1: Map samples to integers - sample_mapping = {sample: i + 1 for i, sample in enumerate(df['sample'].unique())} - sample_map_df = pd.DataFrame.from_dict(sample_mapping, orient='index', columns=['sample_id']).reset_index() - sample_map_df.columns = ['patient', 'sample_id'] + # Map sample to integer codes + df['sample'] = df['sample'].astype('category') + df['sample_id'] = df['sample'].cat.codes + 1 + + # Export mapping (uses category lookup directly) + sample_map_df = pd.DataFrame({ + 'patient': df['sample'].cat.categories, + 'sample_id': np.arange(1, len(df['sample'].cat.categories) + 1) + }) sample_map_df.to_csv("sample_mapping.tsv", sep="\t", index=False) - # Step 2: Group by CDR3b and aggregate sample_ids - df['sample_id'] = df['sample'].map(sample_mapping) - + # Get unique sample_ids per CDR3b — vectorized grouped = ( df.groupby('CDR3b')['sample_id'] - .apply(lambda x: sorted(set(x))) # remove duplicates if any + .unique() # UNIQUE — fast & vectorized + .apply(np.sort) # SORT — vectorized .reset_index() ) - # Step 3: Add comma-separated list and total count - grouped['samples_present'] = grouped['sample_id'].apply(lambda x: ",".join(map(str, x))) + # Calculate counts grouped['total_samples'] = grouped['sample_id'].apply(len) + grouped['samples_present'] = grouped['sample_id'].apply( + lambda arr: ",".join(arr.astype(str)) + ) - # Step 4: Final output — drop raw list + # Drop raw list 
final_df = grouped[['CDR3b', 'total_samples', 'samples_present']] - final_df = final_df.sort_values(by='total_samples', axis=0, ascending=False) + final_df = final_df.sort_values(by="total_samples", ascending=False) - # Step 5: Export both outputs + # Export final list final_df.to_csv("cdr3_sharing.tsv", sep="\t", index=False) EOF diff --git a/modules/local/sample/sample_calc.nf b/modules/local/sample/sample_calc.nf index 1a72867..dff4c0a 100644 --- a/modules/local/sample/sample_calc.nf +++ b/modules/local/sample/sample_calc.nf @@ -14,13 +14,11 @@ process SAMPLE_CALC { val sample_meta , emit: sample_meta script: - def meta_json = groovy.json.JsonOutput.toJson(sample_meta) - """ mkdir -p stats mkdir -p vdj - sample_calc.py -s '${meta_json}' -c ${count_table} + sample_calc.py -s '${sample_meta.sample}' -c ${count_table} """ stub: diff --git a/modules/local/sample/sample_plot.nf b/modules/local/sample/sample_plot.nf index 272533d..5ca0494 100644 --- a/modules/local/sample/sample_plot.nf +++ b/modules/local/sample/sample_plot.nf @@ -13,7 +13,7 @@ process SAMPLE_PLOT { output: path 'sample_stats.html' - script: + script: """ ## copy quarto notebook to output directory cp $sample_stats_template sample_stats.qmd @@ -26,6 +26,10 @@ process SAMPLE_PLOT { -P sample_table:$sample_table \ -P sample_stats_csv:$sample_stats_csv \ -P v_family_csv:$v_family_csv \ + -P samplechart_x_col:${params.samplechart_x_col} \ + -P samplechart_color_col:${params.samplechart_color_col} \ + -P vgene_subject_col:${params.vgene_subject_col} \ + -P vgene_x_cols:${params.vgene_x_cols} \ --to html """ diff --git a/nextflow.config b/nextflow.config index 2e531d0..f685ead 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,6 +26,12 @@ params { sample_stats_template = "${projectDir}/notebooks/sample_stats_template.qmd" compare_stats_template = "${projectDir}/notebooks/compare_stats_template.qmd" + // Sample stats metadata parameters + samplechart_x_col = 'timepoint' + samplechart_color_col = 
'origin' + vgene_subject_col = 'subject_id' + vgene_x_cols = 'origin,timepoint' + // GIANA parameters threshold = 7.0 threshold_score = 3.6 diff --git a/notebooks/sample_stats_template.qmd b/notebooks/sample_stats_template.qmd index 648732f..5a97598 100644 --- a/notebooks/sample_stats_template.qmd +++ b/notebooks/sample_stats_template.qmd @@ -31,14 +31,21 @@ Thank you for using TCRtoolkit! This report is generated from sample data and me #| echo: false workflow_cmd='' -project_name='path/to/project_name' -project_dir='path/to/project_dir' -sample_table='path/to/sample_table.csv' -sample_stats_csv='path/to/sample_stats.csv' -v_family_csv='path/to/v_family.csv' +project_name='' +project_dir='' +sample_table='' +sample_stats_csv='' +v_family_csv='' + +samplechart_x_col='' +samplechart_color_col='' +vgene_subject_col='' +vgene_x_cols='' ``` ```{python} +#| code-fold: true + # 1. Load Packages from IPython.display import Image import os @@ -52,6 +59,13 @@ from matplotlib.colors import LinearSegmentedColormap import plotly.express as px import plotly.graph_objects as go +import warnings +warnings.filterwarnings( + "ignore", + category=FutureWarning, + module="plotly" +) + # 2. Print pipeline parameters print('Project Name: ' + project_name) @@ -65,19 +79,20 @@ print('Date and time: ' + str(datetime.datetime.now())) # TicToc = TicTocGenerator() # 4. 
Loading data +## reading sample metadata +meta = pd.read_csv(sample_table, sep=',') +meta_cols = meta.columns.tolist() ## reading combined repertoire statistics df = pd.read_csv(sample_stats_csv, sep=',') -# print('-- Imported sample_stats_csv as `df`...') - -## reading sample metadata -meta = pd.read_csv(sample_table, sep=',', header=None, index_col=None, - names=['sample', 'file', 'subject_id', 'timepoint', 'origin']) -# print('-- Imported sample_table as `meta`...') +df = pd.merge(df, meta, on='sample', how='left') +df = df[meta_cols + [c for c in df.columns if c not in meta_cols]] ## reading V gene family usage v_family = pd.read_csv(v_family_csv, sep=',') -v_family = v_family.sort_values(by=['subject_id', 'timepoint']) +v_family = pd.merge(v_family, meta, on='sample', how='left') +v_family = v_family[meta_cols + [c for c in v_family.columns if c not in meta_cols]] +v_family = v_family.sort_values(by=[vgene_subject_col]) ``` # Sample level statistics {#sec-sample-level-stats} @@ -86,46 +101,57 @@ Below are plots showing basic T cell repertoire statistics. Each plot has a desc Version 3 of these plots features plotly express interactive plots. This version is exploratory and may be updated in the future. +```{python} +#| code-fold: true + + +x_category = df[samplechart_x_col].unique().tolist() +x_category.sort() + +def samplechart(df, y_col): + fig = px.box( + df, + x=samplechart_x_col, + y=y_col, + color=samplechart_color_col, + points='all', + hover_data='sample', + category_orders={f'{samplechart_x_col}': x_category} + ) + + fig.update_layout( + margin=dict(b=80) + ) + + fig.show() +``` + ## Number of clones ```{python} -timepts = df.timepoint.unique().tolist() -timepts.sort() -fig = px.box(df, - x = 'timepoint', - y='num_clones', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='num_clones') ``` -**Figure 1. 
Number of clones across timepoints.** A clone is defined as a T cell with a unique CDR3 amino acid sequence. The number of clones is shown on the y-axis and 'origin_timepoint' is shown on the x-axis. +**Figure 1. Number of clones across sample groupings.** A clone is defined as a T cell with a unique CDR3 amino acid sequence. The number of clones is shown on the y-axis. The x-axis represents sample groupings defined by user-specified metadata fields (eg. origin, timepoint). ## Clonality ```{python} -fig = px.box(df, - x = 'timepoint', - y='clonality', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='clonality') ``` -**Figure 2. The clonality of samples across timepoints.** Clonality is a measure of T cell clonal expansion and reflects the degree to which the sample is dominated by 1 or more T cell clones. Clonality is calculated via: $$Clonality = \frac {1-H} {\log_{2} N} \quad\text{,}\quad H = -\sum\limits_{i=1}^N p_i \log_{2}{p_i}$$ where $H$ is the Shannon entropy of a given sample, $N$ is the number of unique TCRs in the sample, and $p_i$ is the frequency of the $i$ th unique TCR in the sample. +**Figure 2. The clonality of samples across sample groupings.** Clonality is a measure of T cell clonal expansion and reflects the degree to which the sample is dominated by 1 or more T cell clones. Clonality is calculated via: $$Clonality = \frac {1-H} {\log_{2} N} \quad\text{,}\quad H = -\sum\limits_{i=1}^N p_i \log_{2}{p_i}$$ where $H$ is the Shannon entropy of a given sample, $N$ is the number of unique TCRs in the sample, and $p_i$ is the frequency of the $i$ th unique TCR in the sample. 
## Simpson Index ```{python} -fig = px.box(df, - x = 'timepoint', - y='simpson_index_corrected', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='simpson_index_corrected') ``` **Figure 3. Corrected Simpson Index.** The Simpson Index is a measure of diversity that takes into account the number of clones and the relative abundance of each clone in a sample. The corrected Simpson Index, $D$, is calculated as: @@ -137,13 +163,9 @@ Where $N$ is the number of unique TCRs in the sample, $p_i$ is the frequency of ## Percent of productive TCRs ```{python} -fig = px.box(df, - x = 'timepoint', - y='pct_prod', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='pct_prod') ``` **Figure 4. Percent of productive TCRs.** A productive TCR is a DNA/RNA sequence that can be translated into a protein sequence, i.e. it does not contain a premature stop codon or an out of frame rearrangement. The percent of productive TCRs is calculated as: @@ -155,13 +177,9 @@ where $P$ is the number of productive TCRs and $N$ is the total number of TCRs i ## Average productive CDR3 Length ```{python} -fig = px.box(df, - x = 'timepoint', - y='productive_cdr3_avg_len', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='productive_cdr3_avg_len') ``` **Figure 5. Average Productive CDR3 Length** The average length of the CDR3 region of the TCR for productive clones. The CDR3 region is the most variable region of the TCR and is the region that determines antigen specificity. 
@@ -169,13 +187,9 @@ fig.show() ## TCR Convergence ```{python} -fig = px.box(df, - x = 'timepoint', - y='ratio_convergent', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='ratio_convergent') ``` **Figure 6. TCR Convergence** The ratio of convergent TCRs to total TCRs. A convergent TCR is a TCR that is generated via 2 or more unique nucleotide sequences via codon degeneracy. @@ -195,44 +209,115 @@ $$ where $N_{k}$ is the number of TCRs that use the $k$ th V gene, and T is the total number of TCRs in the sample. ```{python} +#| code-fold: true + ## code adapted from https://www.moritzkoerber.com/posts/plotly-grouped-stacked-bar-chart/ colors = ["#fafa70","#fdef6b","#ffe566","#ffda63","#ffd061","#ffc660","#ffbb5f","#fdb15f","#fba860","#f79e61","#f39562","#ef8c63","#e98365","#e37b66","#dd7367","#d66b68","#ce6469","#c65e6a","#bd576b","#b4526b","#ab4c6b","#a1476a","#974369","#8c3e68","#823a66","#773764","#6d3361","#62305e","#572c5a","#4d2956"] +vgene_x_cols = vgene_x_cols.split(',') -## calculate calulate proportions and add to v_family_long -v_family_long = pd.melt(v_family, id_vars=['subject_id', 'timepoint', 'origin'], value_vars=v_family.columns[3:], var_name='v_gene', value_name='count') -v_family_long['proportion'] = v_family_long.groupby(['subject_id', 'timepoint', 'origin'])['count'].transform(lambda x: x / x.sum()) +## calculate proportions and add to v_family_long +v_family_long = pd.melt(v_family, + id_vars=meta_cols, + value_vars=[c for c in v_family.columns if c.startswith('TRBV')], + var_name='v_gene', + value_name='count') +v_family_long['proportion'] = v_family_long.groupby(meta_cols)['count'].transform(lambda x: x / x.sum()) ## add in the total number of v genes for each sample -total_v_genes = v_family_long.groupby(['subject_id', 'timepoint', 'origin'])['count'].sum().reset_index() -total_v_genes.columns = ['subject_id', 'timepoint', 'origin', 
'total_v_genes'] -v_family_long = pd.merge(v_family_long, total_v_genes, on=['subject_id', 'timepoint', 'origin']) +total_v_genes = v_family_long.groupby(meta_cols)['count'].sum().reset_index().rename(columns={'count': 'total_v_genes'}) +v_family_long = v_family_long.merge(total_v_genes, on=meta_cols) -for patient in v_family_long.subject_id.unique().tolist(): - current = v_family_long[v_family_long.subject_id == patient] +subjects = ( + v_family_long[vgene_subject_col] + .dropna() + .unique() +) + +for subject in subjects: + current = v_family_long[v_family_long[vgene_subject_col] == subject] fig = go.Figure() fig.update_layout( template="simple_white", - title_text=f"Patient: {patient}", - xaxis=dict(title_text="timepoint"), + title_text=f"{vgene_subject_col}: {subject}", + xaxis=dict(title_text=f"{', '.join(vgene_x_cols)}"), yaxis=dict(title_text="proportion"), barmode="stack", + margin=dict(b=80, r=115), + legend=dict( + orientation="v", + x=1.02, + y=1, + yanchor="top", + itemsizing="constant", + traceorder="reversed" + ) ) + fig.update_xaxes( + automargin=True, + title_standoff=30 + ) + + if len(vgene_x_cols) == 1: + x_vals = current[vgene_x_cols[0]] + else: + x_vals = current[vgene_x_cols].astype(str).agg('_'.join, axis=1) + summary = ( + current + .assign(x=x_vals) + .drop_duplicates(subset=meta_cols) + .groupby('x', as_index=False) + .agg( + total_v_genes=('total_v_genes', 'sum'), + ) + ) + y_df = ( + current + .assign(x=x_vals) + .groupby('x', as_index=False) + .agg( + y_top=('proportion', 'sum') + ) + ) + summary = y_df.merge(summary, on='x') + for g, c in zip(current.v_gene.unique(), colors): plot_df = current[current.v_gene == g] - if g == 'TRBV30': - fig.add_trace( - go.Bar(x=[plot_df.timepoint, plot_df.origin], - y=plot_df.proportion, name=g, marker_color=c, - text=plot_df['total_v_genes'], textposition='outside' - ) - ) + if len(vgene_x_cols) == 1: + x = plot_df[vgene_x_cols[0]] else: - fig.add_trace( - go.Bar(x=[plot_df.timepoint, plot_df.origin], 
- y=plot_df.proportion, name=g, marker_color=c - ) + x = [plot_df[col] for col in vgene_x_cols] + fig.add_trace( + go.Bar( + x=x, + y=plot_df['proportion'], + name=g, + marker_color=c, + legendgroup=g, + + customdata=plot_df[['sample']], + hovertemplate=( + "V gene: %{fullData.name}
<br>" + "Sample: %{customdata[0]}
<br>" + "Proportion: %{y}
<br>" + "<extra></extra>" + ), + + text=plot_df['total_v_genes'] if ((g == 'TRBV30') and (len(vgene_x_cols) != 1)) else None, + textposition='outside', + ) ) + if len(vgene_x_cols) == 1: + for _, row in summary.iterrows(): + fig.add_annotation( + x=row['x'], + y=row['y_top'] + 0.02, + text=str(row['total_v_genes']), + showarrow=False, + yanchor='bottom', + font=dict(size=12) + ) + fig.show() ```