From 542f21f8dfedcafd9a3326c278a8e7af259b27ef Mon Sep 17 00:00:00 2001 From: dltamayo Date: Mon, 15 Dec 2025 14:15:31 -0500 Subject: [PATCH 1/7] Vectorize tcrsharing --- modules/local/compare/tcrsharing.nf | 32 +++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/modules/local/compare/tcrsharing.nf b/modules/local/compare/tcrsharing.nf index b4b9fab..ff68619 100644 --- a/modules/local/compare/tcrsharing.nf +++ b/modules/local/compare/tcrsharing.nf @@ -19,30 +19,36 @@ process TCRSHARING_CALC { # Load data df = pd.read_csv("${concat_cdr3}", sep="\t") - # Step 1: Map samples to integers - sample_mapping = {sample: i + 1 for i, sample in enumerate(df['sample'].unique())} - sample_map_df = pd.DataFrame.from_dict(sample_mapping, orient='index', columns=['sample_id']).reset_index() - sample_map_df.columns = ['patient', 'sample_id'] + # Map sample to integer codes + df['sample'] = df['sample'].astype('category') + df['sample_id'] = df['sample'].cat.codes + 1 + + # Export mapping (uses category lookup directly) + sample_map_df = pd.DataFrame({ + 'patient': df['sample'].cat.categories, + 'sample_id': np.arange(1, len(df['sample'].cat.categories) + 1) + }) sample_map_df.to_csv("sample_mapping.tsv", sep="\t", index=False) - # Step 2: Group by CDR3b and aggregate sample_ids - df['sample_id'] = df['sample'].map(sample_mapping) - + # Get unique sample_ids per CDR3b — vectorized grouped = ( df.groupby('CDR3b')['sample_id'] - .apply(lambda x: sorted(set(x))) # remove duplicates if any + .unique() # UNIQUE — fast & vectorized + .apply(np.sort) # SORT — vectorized .reset_index() ) - # Step 3: Add comma-separated list and total count - grouped['samples_present'] = grouped['sample_id'].apply(lambda x: ",".join(map(str, x))) + # Calculate counts grouped['total_samples'] = grouped['sample_id'].apply(len) + grouped['samples_present'] = grouped['sample_id'].apply( + lambda arr: ",".join(arr.astype(str)) + ) - # Step 4: Final output — drop raw list + # Drop raw list final_df = grouped[['CDR3b', 'total_samples', 'samples_present']] - final_df = final_df.sort_values(by='total_samples', axis=0, ascending=False) + final_df = final_df.sort_values(by="total_samples", ascending=False) - # Step 5: Export both outputs + # Export final list final_df.to_csv("cdr3_sharing.tsv", sep="\t", index=False) EOF From 2adef3b364008d8bf109776a6ebca2d6912bc774 Mon Sep 17 00:00:00 2001 From: dltamayo Date: Tue, 16 Dec 2025 15:08:22 -0500 Subject: [PATCH 2/7] Update GIANA post-processing Use awk instead of python to avoid reading files into memory --- modules/local/compare/giana.nf | 38 +++++++++++++--------------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/modules/local/compare/giana.nf b/modules/local/compare/giana.nf index b062f5f..561c1b3 100644 --- a/modules/local/compare/giana.nf +++ b/modules/local/compare/giana.nf @@ -28,30 +28,20 @@ process GIANA_CALC { > giana.log 2>&1 # Insert header after GIANA comments - python3 - < tmp && mv tmp giana_RotationEncodingBL62.txt mv giana_RotationEncodingBL62.txt_EncodingMatrix.txt giana_EncodingMatrix.txt """ From 626f5fa51d71b6e689d9edd0e311f7722d4cb80a Mon Sep 17 00:00:00 2001 From: dltamayo Date: Wed, 17 Dec 2025 15:59:13 -0500 Subject: [PATCH 3/7] Remove hardcoded metadata from compare - `sample` is the unique identifier within samplesheet, no need to include other metadata columns for the purposes of clustering. --- bin/compare_concatenate.py | 12 +++--------- modules/local/compare/gliph2.nf | 13 +++++-------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/bin/compare_concatenate.py b/bin/compare_concatenate.py index 7b92891..2138a2f 100755 --- a/bin/compare_concatenate.py +++ b/bin/compare_concatenate.py @@ -29,18 +29,12 @@ def main(): # Read the TSV file into a dataframe file_path = str(row['file']) df = pd.read_csv(file_path, sep="\t", header=0) - - # Get metadata - subject_id = row['subject_id'] - timepoint = row['timepoint'] - origin = row['origin'] - + # Add patient column - df['patient'] = f"{subject_id}:{timepoint}_{origin}" df['sample'] = row['sample'] - + # Select relevant columns - df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'patient', 'sample']] + df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'sample']] dfs.append(df) diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf index 36ca9f4..fafefde 100644 --- a/modules/local/compare/gliph2.nf +++ b/modules/local/compare/gliph2.nf @@ -18,15 +18,14 @@ process GLIPH2_TURBOGLIPH { script: """ - # R script starts here - cat > run_gliph2.R < local_similarities.txt From bc09d6b3a7b6e40a23c3bad1bdb211b4ba839654 Mon Sep 17 00:00:00 2001 From: dltamayo Date: Wed, 17 Dec 2025 16:18:52 -0500 Subject: [PATCH 4/7] Remove harcoded metadata from sample - reformatted sample_calc.py to only add sample as a metadata column; the rest of the metadata columns are now merged in the quarto notebook based on sample - added adjustable parameters allowing user to specify which metadata columns to use for notebook charts - cleaned up a lot of code within the notebook for aesthetics, reducing code redundancy --- bin/sample_calc.py | 54 +++----- modules/local/sample/sample_calc.nf | 4 +- modules/local/sample/sample_plot.nf | 6 +- nextflow.config | 6 + notebooks/sample_stats_template.qmd | 191 ++++++++++++++++------------ 5 files changed, 139 insertions(+), 122 deletions(-) diff --git a/bin/sample_calc.py b/bin/sample_calc.py index f5d219a..9614775 100755 --- a/bin/sample_calc.py +++ b/bin/sample_calc.py @@ -22,18 +22,7 @@ def extract_trb_family(allele): match = re.match(r'(TRB[V|D|J])(\d+)', allele) return f"{match.group(1)}{match.group(2)}" if match else None -def compute_gene_family_table(counts, col_name, all_families, sample_meta): - fam_col = f"{col_name}FamilyName" - counts[fam_col] = counts[col_name].apply(extract_trb_family) - fam_df = counts[fam_col].value_counts(dropna=False).to_frame().T.sort_index(axis=1) - fam_df = fam_df.reindex(columns=all_families, fill_value=0) - - for col in ['origin', 'timepoint', 'subject_id']: - fam_df.insert(0, col, sample_meta[col]) - - return fam_df - -def calc_gene_family(counts, gene_column, family_prefix, max_index, output_file, meta_df): +def calc_gene_family(sample_name, counts, gene_column, family_prefix, max_index, output_file): # Build list of all possible family names all_fams = [f'{family_prefix}{i}' for i in range(1, max_index + 1)] @@ -43,12 +32,12 @@ def calc_gene_family(counts, gene_column, family_prefix, max_index, output_file, # Reindex to include all families fam_df = pd.DataFrame([fam_df.reindex(columns=all_fams, fill_value=0).iloc[0]]).reset_index(drop=True) - # Add metadata columns - fam_df = pd.concat([meta_df, fam_df], axis=1) + # Add sample column + fam_df.insert(0, 'sample', sample_name) fam_df.to_csv(output_file, header=True, index=False) -def calc_sample_stats(meta_df, counts, output_file): +def calc_sample_stats(sample_name, counts, output_file): """Calculate sample level statistics of TCR repertoire.""" ## first pass stats @@ -105,8 +94,8 @@ def calc_sample_stats(meta_df, counts, output_file): # Convert to single-row dataframe df_stats = pd.DataFrame([row_data]) - # Add metadata columns - df_stats = pd.concat([meta_df, df_stats], axis=1) + # Add sample column + df_stats.insert(0, 'sample', sample_name) # Save to CSV df_stats.to_csv(output_file, header=True, index=False) @@ -117,10 +106,10 @@ def main(): parser = argparse.ArgumentParser(description='Calculate clonality of a TCR repertoire') # add arguments - parser.add_argument('-s', '--sample_meta', - metavar='sample_meta', + parser.add_argument('-s', '--sample_name', + metavar='sample_name', type=str, - help='sample metadata passed in as json format') + help='sample name') parser.add_argument('-c', '--count_table', metavar='count_table', type=argparse.FileType('r'), @@ -128,29 +117,16 @@ def main(): args = parser.parse_args() - ## convert metadata to list - sample_meta = json.loads(args.sample_meta) + sample = args.sample_name # Read in the counts file - counts = pd.read_csv(args.count_table, sep='\t', header=0) - - # Build metadata row from selected keys - meta_keys = ['subject_id', 'timepoint', 'origin'] - meta_row = {k: sample_meta[k] for k in meta_keys} - meta_df = pd.DataFrame([meta_row]) - - sample = sample_meta['sample'] - - calc_gene_family(counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv', meta_df) - calc_gene_family(counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv', meta_df) - calc_gene_family(counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv', meta_df) + counts = pd.read_csv(args.count_table, sep='\t') - # Build metadata row from selected keys - meta_keys = ['sample', 'subject_id', 'timepoint', 'origin'] - meta_row = {k: sample_meta[k] for k in meta_keys} - meta_df = pd.DataFrame([meta_row]) + calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv') + calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv') + calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv') - calc_sample_stats(meta_df, counts, f'stats/sample_stats_{sample}.csv') + calc_sample_stats(sample, counts, f'stats/sample_stats_{sample}.csv') if __name__ == "__main__": main() diff --git a/modules/local/sample/sample_calc.nf b/modules/local/sample/sample_calc.nf index 1a72867..dff4c0a 100644 --- a/modules/local/sample/sample_calc.nf +++ b/modules/local/sample/sample_calc.nf @@ -14,13 +14,11 @@ process SAMPLE_CALC { val sample_meta , emit: sample_meta script: - def meta_json = groovy.json.JsonOutput.toJson(sample_meta) - """ mkdir -p stats mkdir -p vdj - sample_calc.py -s '${meta_json}' -c ${count_table} + sample_calc.py -s '${sample_meta.sample}' -c ${count_table} """ stub: diff --git a/modules/local/sample/sample_plot.nf b/modules/local/sample/sample_plot.nf index 272533d..5ca0494 100644 --- a/modules/local/sample/sample_plot.nf +++ b/modules/local/sample/sample_plot.nf @@ -13,7 +13,7 @@ process SAMPLE_PLOT { output: path 'sample_stats.html' - script: + script: """ ## copy quarto notebook to output directory cp $sample_stats_template sample_stats.qmd @@ -26,6 +26,10 @@ process SAMPLE_PLOT { -P sample_table:$sample_table \ -P sample_stats_csv:$sample_stats_csv \ -P v_family_csv:$v_family_csv \ + -P samplechart_x_col:${params.samplechart_x_col} \ + -P samplechart_color_col:${params.samplechart_color_col} \ + -P vgene_subject_col:${params.vgene_subject_col} \ + -P vgene_x_cols:${params.vgene_x_cols} \ --to html """ diff --git a/nextflow.config b/nextflow.config index 32f7938..45b8fbd 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,6 +26,12 @@ params { sample_stats_template = "${projectDir}/notebooks/sample_stats_template.qmd" compare_stats_template = "${projectDir}/notebooks/compare_stats_template.qmd" + // Sample stats metadata parameters + samplechart_x_col = 'timepoint' + samplechart_color_col = 'origin' + vgene_subject_col = 'subject_id' + vgene_x_cols = 'timepoint,origin' + // GIANA parameters threshold = 7.0 threshold_score = 3.6 diff --git a/notebooks/sample_stats_template.qmd b/notebooks/sample_stats_template.qmd index 648732f..d3b89ed 100644 --- a/notebooks/sample_stats_template.qmd +++ b/notebooks/sample_stats_template.qmd @@ -31,14 +31,21 @@ Thank you for using TCRtoolkit! This report is generated from sample data and me #| echo: false workflow_cmd='' -project_name='path/to/project_name' -project_dir='path/to/project_dir' -sample_table='path/to/sample_table.csv' -sample_stats_csv='path/to/sample_stats.csv' -v_family_csv='path/to/v_family.csv' +project_name='' +project_dir='' +sample_table='' +sample_stats_csv='' +v_family_csv='' + +samplechart_x_col='' +samplechart_color_col='' +vgene_subject_col='' +vgene_x_cols='' ``` ```{python} +#| code-fold: true + # 1. Load Packages from IPython.display import Image import os @@ -65,19 +72,20 @@ print('Date and time: ' + str(datetime.datetime.now())) # TicToc = TicTocGenerator() # 4. Loading data +## reading sample metadata +meta = pd.read_csv(sample_table, sep=',') +meta_cols = meta.columns.tolist()[0:-1] ## reading combined repertoire statistics df = pd.read_csv(sample_stats_csv, sep=',') -# print('-- Imported sample_stats_csv as `df`...') - -## reading sample metadata -meta = pd.read_csv(sample_table, sep=',', header=None, index_col=None, - names=['sample', 'file', 'subject_id', 'timepoint', 'origin']) -# print('-- Imported sample_table as `meta`...') +df = pd.merge(df, meta, on='sample', how='left') +df = df[meta_cols + [c for c in df.columns if c not in meta_cols]] ## reading V gene family usage v_family = pd.read_csv(v_family_csv, sep=',') -v_family = v_family.sort_values(by=['subject_id', 'timepoint']) +v_family = pd.merge(v_family, meta, on='sample', how='left') +v_family = v_family[meta_cols + [c for c in v_family.columns if c not in meta_cols]] +v_family = v_family.sort_values(by=[vgene_subject_col]) ``` # Sample level statistics {#sec-sample-level-stats} @@ -86,46 +94,56 @@ Below are plots showing basic T cell repertoire statistics. Each plot has a desc Version 3 of these plots features plotly express interactive plots. This version is exploratory and may be updated in the future. +```{python} +#| code-fold: true + + +x_category = df[samplechart_x_col].unique().tolist() +x_category.sort() + +def samplechart(df, y_col): + fig = px.box( + df, + x=samplechart_x_col, + y=y_col, + color=samplechart_color_col, + points='all', hover_data=['sample'], + category_orders={f'{samplechart_x_col}': x_category} + ) + + fig.update_layout( + margin=dict(b=80) + ) + + fig.show() +``` + ## Number of clones ```{python} -timepts = df.timepoint.unique().tolist() -timepts.sort() -fig = px.box(df, - x = 'timepoint', - y='num_clones', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='num_clones') ``` -**Figure 1. Number of clones across timepoints.** A clone is defined as a T cell with a unique CDR3 amino acid sequence. The number of clones is shown on the y-axis and 'origin_timepoint' is shown on the x-axis. +**Figure 1. Number of clones across sample groupings.** A clone is defined as a T cell with a unique CDR3 amino acid sequence. The number of clones is shown on the y-axis. The x-axis represents sample groupings defined by user-specified metadata fields (eg. origin, timepoint). ## Clonality ```{python} -fig = px.box(df, - x = 'timepoint', - y='clonality', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='clonality') ``` -**Figure 2. The clonality of samples across timepoints.** Clonality is a measure of T cell clonal expansion and reflects the degree to which the sample is dominated by 1 or more T cell clones. Clonality is calculated via: $$Clonality = \frac {1-H} {\log_{2} N} \quad\text{,}\quad H = -\sum\limits_{i=1}^N p_i \log_{2}{p_i}$$ where $H$ is the Shannon entropy of a given sample, $N$ is the number of unique TCRs in the sample, and $p_i$ is the frequency of the $i$ th unique TCR in the sample. +**Figure 2. The clonality of samples across sample groupings.** Clonality is a measure of T cell clonal expansion and reflects the degree to which the sample is dominated by 1 or more T cell clones. Clonality is calculated via: $$Clonality = \frac {1-H} {\log_{2} N} \quad\text{,}\quad H = -\sum\limits_{i=1}^N p_i \log_{2}{p_i}$$ where $H$ is the Shannon entropy of a given sample, $N$ is the number of unique TCRs in the sample, and $p_i$ is the frequency of the $i$ th unique TCR in the sample. ## Simpson Index ```{python} -fig = px.box(df, - x = 'timepoint', - y='simpson_index_corrected', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='simpson_index_corrected') ``` **Figure 3. Corrected Simpson Index.** The Simpson Index is a measure of diversity that takes into account the number of clones and the relative abundance of each clone in a sample. The corrected Simpson Index, $D$, is calculated as: @@ -137,13 +155,9 @@ Where $N$ is the number of unique TCRs in the sample, $p_i$ is the frequency of ## Percent of productive TCRs ```{python} -fig = px.box(df, - x = 'timepoint', - y='pct_prod', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='pct_prod') ``` **Figure 4. Percent of productive TCRs.** A productive TCR is a DNA/RNA sequence that can be translated into a protein sequence, i.e. it does not contain a premature stop codon or an out of frame rearrangement. The percent of productive TCRs is calculated as: @@ -155,13 +169,9 @@ where $P$ is the number of productive TCRs and $N$ is the total number of TCRs i ## Average productive CDR3 Length ```{python} -fig = px.box(df, - x = 'timepoint', - y='productive_cdr3_avg_len', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='productive_cdr3_avg_len') ``` **Figure 5. Average Productive CDR3 Length** The average length of the CDR3 region of the TCR for productive clones. The CDR3 region is the most variable region of the TCR and is the region that determines antigen specificity. @@ -169,13 +179,9 @@ fig.show() ## TCR Convergence ```{python} -fig = px.box(df, - x = 'timepoint', - y='ratio_convergent', - color='origin', - points='all', hover_data=['sample'], - category_orders={'timepoint': timepts}) -fig.show() +#| code-fold: true + +samplechart(df, y_col='ratio_convergent') ``` **Figure 6. TCR Convergence** The ratio of convergent TCRs to total TCRs. A convergent TCR is a TCR that is generated via 2 or more unique nucleotide sequences via codon degeneracy. @@ -195,43 +201,70 @@ $$ where $N_{k}$ is the number of TCRs that use the $k$ th V gene, and T is the total number of TCRs in the sample. ```{python} +#| code-fold: true + ## code adapted from https://www.moritzkoerber.com/posts/plotly-grouped-stacked-bar-chart/ colors = ["#fafa70","#fdef6b","#ffe566","#ffda63","#ffd061","#ffc660","#ffbb5f","#fdb15f","#fba860","#f79e61","#f39562","#ef8c63","#e98365","#e37b66","#dd7367","#d66b68","#ce6469","#c65e6a","#bd576b","#b4526b","#ab4c6b","#a1476a","#974369","#8c3e68","#823a66","#773764","#6d3361","#62305e","#572c5a","#4d2956"] +vgene_x_cols = vgene_x_cols.split(',') ## calculate calulate proportions and add to v_family_long -v_family_long = pd.melt(v_family, id_vars=['subject_id', 'timepoint', 'origin'], value_vars=v_family.columns[3:], var_name='v_gene', value_name='count') -v_family_long['proportion'] = v_family_long.groupby(['subject_id', 'timepoint', 'origin'])['count'].transform(lambda x: x / x.sum()) +v_family_long = pd.melt(v_family, + id_vars=meta_cols, + value_vars=[c for c in v_family.columns if c.startswith('TRBV')], + var_name='v_gene', + value_name='count') +v_family_long['proportion'] = v_family_long.groupby(meta_cols)['count'].transform(lambda x: x / x.sum()) ## add in the total number of v genes for each sample -total_v_genes = v_family_long.groupby(['subject_id', 'timepoint', 'origin'])['count'].sum().reset_index() -total_v_genes.columns = ['subject_id', 'timepoint', 'origin', 'total_v_genes'] -v_family_long = pd.merge(v_family_long, total_v_genes, on=['subject_id', 'timepoint', 'origin']) +total_v_genes = v_family_long.groupby(meta_cols)['count'].sum().reset_index().rename(columns={'count': 'total_v_genes'}) +v_family_long = v_family_long.merge(total_v_genes, on=meta_cols) + +subjects = ( + v_family_long[vgene_subject_col] + .dropna() + .unique() +) -for patient in v_family_long.subject_id.unique().tolist(): - current = v_family_long[v_family_long.subject_id == patient] +for subject in subjects: + current = v_family_long[v_family_long[vgene_subject_col] == subject] fig = go.Figure() fig.update_layout( template="simple_white", - title_text=f"Patient: {patient}", - xaxis=dict(title_text="timepoint"), + title_text=f"Subject: {subject}", + xaxis=dict(title_text=f"{', '.join(vgene_x_cols)}"), yaxis=dict(title_text="proportion"), barmode="stack", ) for g, c in zip(current.v_gene.unique(), colors): plot_df = current[current.v_gene == g] - if g == 'TRBV30': - fig.add_trace( - go.Bar(x=[plot_df.timepoint, plot_df.origin], - y=plot_df.proportion, name=g, marker_color=c, - text=plot_df['total_v_genes'], textposition='outside' - ) - ) - else: - fig.add_trace( - go.Bar(x=[plot_df.timepoint, plot_df.origin], - y=plot_df.proportion, name=g, marker_color=c - ) + fig.add_trace( + go.Bar( + x=[plot_df[col] for col in vgene_x_cols], + y=plot_df['proportion'], + name=g, + marker_color=c, + legendgroup=g, + text=plot_df['total_v_genes'] if g == 'TRBV30' else None, + textposition='outside' if g == 'TRBV30' else None, + ) + ) + + fig.update_layout( + margin=dict(b=80, r=250), + legend=dict( + orientation="v", + x=1.02, + y=1, + yanchor="top", + itemsizing="constant", + traceorder="reversed" + ) + ) + + fig.update_xaxes( + automargin=True, + title_standoff=30 ) fig.show() From f63cfeb7bb99976c14ca227ebecd398f2447b82b Mon Sep 17 00:00:00 2001 From: dltamayo Date: Fri, 19 Dec 2025 12:21:51 -0500 Subject: [PATCH 5/7] Address Copilot suggestions --- modules/local/compare/gliph2.nf | 4 ++-- notebooks/sample_stats_template.qmd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf index fafefde..486442d 100644 --- a/modules/local/compare/gliph2.nf +++ b/modules/local/compare/gliph2.nf @@ -25,7 +25,7 @@ process GLIPH2_TURBOGLIPH { # During testing, including TRBJ column was causing issues in clustering step. Removing and reinserting afterwards. df <- read.csv("$concat_cdr3", sep = "\t", stringsAsFactors = FALSE, check.names = FALSE) - df['patient'] <- df['sample'] + df[,'patient'] <- df[,'sample'] result <- turboGliph::gliph2( cdr3_sequences = df, @@ -39,7 +39,7 @@ process GLIPH2_TURBOGLIPH { ) df3 <- read.csv('cluster_member_details.txt', sep = '\t', stringsAsFactors = FALSE, check.names = FALSE) - df3['sample'] <- df3['patient'] + df3[,'sample'] <- df3[,'patient'] df3 <- merge(df3, df[, c("CDR3b", "TRBV", "sample", 'counts')], by = c("CDR3b", "TRBV", "sample", 'counts'), all.x = TRUE) df3 <- df3[, c('CDR3b', 'TRBV', 'TRBJ', 'counts', 'sample', 'tag', 'seq_ID', 'ultCDR3b')] write.table(df3, "cluster_member_details.txt", sep = "\t", row.names = FALSE, quote = FALSE) diff --git a/notebooks/sample_stats_template.qmd b/notebooks/sample_stats_template.qmd index d3b89ed..9e683d8 100644 --- a/notebooks/sample_stats_template.qmd +++ b/notebooks/sample_stats_template.qmd @@ -74,7 +74,7 @@ print('Date and time: ' + str(datetime.datetime.now())) # 4. Loading data ## reading sample metadata meta = pd.read_csv(sample_table, sep=',') -meta_cols = meta.columns.tolist()[0:-1] +meta_cols = meta.columns.tolist() ## reading combined repertoire statistics df = pd.read_csv(sample_stats_csv, sep=',') @@ -207,7 +207,7 @@ where $N_{k}$ is the number of TCRs that use the $k$ th V gene, and T is the tot colors = ["#fafa70","#fdef6b","#ffe566","#ffda63","#ffd061","#ffc660","#ffbb5f","#fdb15f","#fba860","#f79e61","#f39562","#ef8c63","#e98365","#e37b66","#dd7367","#d66b68","#ce6469","#c65e6a","#bd576b","#b4526b","#ab4c6b","#a1476a","#974369","#8c3e68","#823a66","#773764","#6d3361","#62305e","#572c5a","#4d2956"] vgene_x_cols = vgene_x_cols.split(',') -## calculate calulate proportions and add to v_family_long +## calculate proportions and add to v_family_long v_family_long = pd.melt(v_family, id_vars=meta_cols, value_vars=[c for c in v_family.columns if c.startswith('TRBV')], From ef327655ba41ebb9b98b4cc597fccfa846287903 Mon Sep 17 00:00:00 2001 From: dltamayo Date: Fri, 19 Dec 2025 12:23:19 -0500 Subject: [PATCH 6/7] Update notebook - add logic to parse when submitting one grouping metadata column versus multiple for vgene chart --- notebooks/sample_stats_template.qmd | 94 ++++++++++++++++++++++------- 1 file changed, 73 insertions(+), 21 deletions(-) diff --git a/notebooks/sample_stats_template.qmd b/notebooks/sample_stats_template.qmd index 9e683d8..5a97598 100644 --- a/notebooks/sample_stats_template.qmd +++ b/notebooks/sample_stats_template.qmd @@ -59,6 +59,13 @@ from matplotlib.colors import LinearSegmentedColormap import plotly.express as px import plotly.graph_objects as go +import warnings +warnings.filterwarnings( + "ignore", + category=FutureWarning, + module="plotly" +) + # 2. Print pipeline parameters print('Project Name: ' + project_name) @@ -107,7 +114,8 @@ def samplechart(df, y_col): x=samplechart_x_col, y=y_col, color=samplechart_color_col, - points='all', hover_data=['sample'], + points='all', + hover_data='sample', category_orders={f'{samplechart_x_col}': x_category} ) @@ -230,42 +238,86 @@ for subject in subjects: fig = go.Figure() fig.update_layout( template="simple_white", - title_text=f"Subject: {subject}", + title_text=f"{vgene_subject_col}: {subject}", xaxis=dict(title_text=f"{', '.join(vgene_x_cols)}"), yaxis=dict(title_text="proportion"), barmode="stack", + margin=dict(b=80, r=115), + legend=dict( + orientation="v", + x=1.02, + y=1, + yanchor="top", + itemsizing="constant", + traceorder="reversed" + ) + ) + + fig.update_xaxes( + automargin=True, + title_standoff=30 + ) + + if len(vgene_x_cols) == 1: + x_vals = current[vgene_x_cols[0]] + else: + x_vals = current[vgene_x_cols].astype(str).agg('_'.join, axis=1) + summary = ( + current + .assign(x=x_vals) + .drop_duplicates(subset=meta_cols) + .groupby('x', as_index=False) + .agg( + total_v_genes=('total_v_genes', 'sum'), + ) ) + y_df = ( + current + .assign(x=x_vals) + .groupby('x', as_index=False) + .agg( + y_top=('proportion', 'sum') + ) + ) + summary = y_df.merge(summary, on='x') for g, c in zip(current.v_gene.unique(), colors): plot_df = current[current.v_gene == g] + if len(vgene_x_cols) == 1: + x = plot_df[vgene_x_cols[0]] + else: + x = [plot_df[col] for col in vgene_x_cols] fig.add_trace( go.Bar( - x=[plot_df[col] for col in vgene_x_cols], + x=x, y=plot_df['proportion'], name=g, marker_color=c, legendgroup=g, - text=plot_df['total_v_genes'] if g == 'TRBV30' else None, - textposition='outside' if g == 'TRBV30' else None, - ) - ) - - fig.update_layout( - margin=dict(b=80, r=250), - legend=dict( - orientation="v", - x=1.02, - y=1, - yanchor="top", - itemsizing="constant", - traceorder="reversed" + + customdata=plot_df[['sample']], + hovertemplate=( + "V gene: %{fullData.name}
" + "Sample: %{customdata[0]}
" + "Proportion: %{y}
" + "" + ), + + text=plot_df['total_v_genes'] if ((g == 'TRBV30') and (len(vgene_x_cols) != 1)) else None, + textposition='outside', ) ) - fig.update_xaxes( - automargin=True, - title_standoff=30 - ) + if len(vgene_x_cols) == 1: + for _, row in summary.iterrows(): + fig.add_annotation( + x=row['x'], + y=row['y_top'] + 0.02, + text=str(row['total_v_genes']), + showarrow=False, + yanchor='bottom', + font=dict(size=12) + ) fig.show() ``` From 8760a312465e01cb56566173580f3b16cd58799b Mon Sep 17 00:00:00 2001 From: dltamayo Date: Fri, 19 Dec 2025 13:07:00 -0500 Subject: [PATCH 7/7] Add Cirro config --- .cirro/process-form.json | 25 ++++++++++++++++++++++++- .cirro/process-input.json | 6 +++++- nextflow.config | 2 +- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/.cirro/process-form.json b/.cirro/process-form.json index ee8c4df..da2d687 100644 --- a/.cirro/process-form.json +++ b/.cirro/process-form.json @@ -50,8 +50,31 @@ "description": "p_depth (GLIPH2)", "title": "p_depth", "type": "string" + }, + "samplechart_x_col": { + "default": "timepoint", + "description": "Metadata column for x axis of sample-level plots for Sample workflow notebook", + "title": "Sample plot X axis column", + "type": "string" + }, + "samplechart_color_col": { + "default": "origin", + "description": "Metadata column for legend color of sample-level plots for Sample workflow notebook", + "title": "Sample plot X axis column", + "type": "string" + }, + "vgene_subject_col": { + "default": "subject_id", + "description": "Metadata column for grouping of samples for V gene plots for Sample workflow notebook", + "title": "V gene plot subject column", + "type": "string" + }, + "vgene_x_cols": { + "default": "timepoint,origin", + "description": "Comma-separated list of metadata columns for x axis of V gene plot (eg. timepoint,origin or timepoint)", + "title": "V gene plot X axis columns", + "type": "string" } - } }, "ui": {} diff --git a/.cirro/process-input.json b/.cirro/process-input.json index 98cf403..798fc2f 100644 --- a/.cirro/process-input.json +++ b/.cirro/process-input.json @@ -9,5 +9,9 @@ "local_min_OVE": "$.params.dataset.paramJson.local_min_OVE", "local_min_pvalue": "$.params.dataset.paramJson.local_min_pvalue", "outdir": "$.params.dataset.s3|/data/", - "p_depth": "$.params.dataset.paramJson.p_depth" + "p_depth": "$.params.dataset.paramJson.p_depth", + "samplechart_x_col": "$.params.dataset.paramJson.samplechart_x_col", + "samplechart_color_col": "$.params.dataset.paramJson.samplechart_color_col", + "vgene_subject_col": "$.params.dataset.paramJson.vgene_subject_col", + "vgene_x_cols": "$.params.dataset.paramJson.vgene_x_cols" } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index d15c1ac..f685ead 100644 --- a/nextflow.config +++ b/nextflow.config @@ -30,7 +30,7 @@ params { samplechart_x_col = 'timepoint' samplechart_color_col = 'origin' vgene_subject_col = 'subject_id' - vgene_x_cols = 'timepoint,origin' + vgene_x_cols = 'origin,timepoint' // GIANA parameters threshold = 7.0