51 changes: 28 additions & 23 deletions bin/compare_calc.py
@@ -52,8 +52,13 @@ def morisita_horn_index(counts1, counts2):
# -------------------------
# Load samplesheet
# -------------------------
sample_df = pd.read_csv(args.sample_utf8)
sample_df = pd.read_csv(args.sample_utf8, index_col=None)

# Basic hygiene: drop rows missing sample or file
sample_df = sample_df.dropna(subset=["sample", "file"])
print(sample_df.head())
print(sample_df.columns.tolist())

samples = sample_df["sample"].tolist()
files = sample_df["file"].tolist()
n = len(samples)
@@ -64,44 +69,46 @@ def morisita_horn_index(counts1, counts2):
# Preload data structures
# -------------------------
junction_sets = {}
count_vectors = {}
count_series = {}

for sample, file in zip(samples, files):
    df = pd.read_csv(file, sep="\t", usecols=["junction_aa", "duplicate_count"])
    df = df.dropna(subset=["junction_aa"])

    # Ensure counts are numeric
    df["duplicate_count"] = pd.to_numeric(df["duplicate_count"], errors="coerce").fillna(0)

    # Set for presence/absence metrics
    junction_sets[sample] = set(df["junction_aa"])

    # Counts for Morisita–Horn
    count_vectors[sample] = (
    # Counts for Morisita–Horn as a pandas Series (index = junction_aa)
    counts = (
        df.groupby("junction_aa")["duplicate_count"]
        .sum()
    )

    # Ensure we have a Series with a named index
    if not isinstance(counts, pd.Series):
        counts = pd.Series(counts)
    count_series[sample] = counts

# -------------------------
# Align count vectors across union space
# Build union index and align counts
# -------------------------
all_junctions = sorted(
set().union(*junction_sets.values())
)
union_set = set().union(*junction_sets.values()) if junction_sets else set()
all_junctions = pd.Index(sorted(union_set))

aligned_vectors = {}
for sample in samples:
    count_vectors[sample] = (
        count_vectors[sample]
        .reindex(all_junctions, fill_value=0)
        .to_numpy()
    )

    aligned = count_series[sample].reindex(all_junctions, fill_value=0)
    # Store as numpy for MH computation
    aligned_vectors[sample] = aligned.to_numpy(dtype=float)
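For reviewers unfamiliar with the reindex pattern, here is a minimal, self-contained illustration of the alignment step (toy junction counts, not data from this pipeline):

```python
import pandas as pd

# Two toy per-sample count Series, reindexed onto the union of junctions so the
# Morisita–Horn vectors share one coordinate space; missing junctions become 0.
a = pd.Series({"CASSF": 5, "CASSG": 1})
b = pd.Series({"CASSG": 2, "CASSH": 7})
all_junctions = pd.Index(sorted(set(a.index) | set(b.index)))
print(a.reindex(all_junctions, fill_value=0).to_numpy(dtype=float))  # [5. 1. 0.]
print(b.reindex(all_junctions, fill_value=0).to_numpy(dtype=float))  # [0. 2. 7.]
```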

# -------------------------
# Initialize matrices
# -------------------------
jaccard_mat = np.zeros((n, n))
sorensen_mat = np.zeros((n, n))
morisita_mat = np.zeros((n, n))

jaccard_mat = np.zeros((n, n), dtype=float)
sorensen_mat = np.zeros((n, n), dtype=float)
morisita_mat = np.zeros((n, n), dtype=float)

# -------------------------
# Compute upper triangle only
@@ -111,7 +118,7 @@ def morisita_horn_index(counts1, counts2):
for i in range(n):
    s1 = samples[i]
    set1 = junction_sets[s1]
    counts1 = count_vectors[s1]
    counts1 = aligned_vectors[s1]

    # Diagonal
    jaccard_mat[i, i] = 1.0
@@ -120,16 +127,14 @@ def morisita_horn_index(counts1, counts2):

    for j in range(i + 1, n):
        s2 = samples[j]

        j_val = jaccard_index(set1, junction_sets[s2])
        s_val = sorensen_index(set1, junction_sets[s2])
        m_val = morisita_horn_index(counts1, count_vectors[s2])
        m_val = morisita_horn_index(counts1, aligned_vectors[s2])

        jaccard_mat[i, j] = jaccard_mat[j, i] = j_val
        sorensen_mat[i, j] = sorensen_mat[j, i] = s_val
        morisita_mat[i, j] = morisita_mat[j, i] = m_val


# -------------------------
# Write outputs
# -------------------------
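The `jaccard_index`, `sorensen_index`, and `morisita_horn_index` helpers sit above the collapsed part of this diff; for reference, a sketch of the standard definitions they are assumed to implement (the repo's versions may handle edge cases differently):

```python
import numpy as np

def jaccard_index(set1, set2):
    # |A ∩ B| / |A ∪ B|; returns 0.0 for two empty sets (assumed convention).
    union = set1 | set2
    return len(set1 & set2) / len(union) if union else 0.0

def sorensen_index(set1, set2):
    # 2|A ∩ B| / (|A| + |B|)
    denom = len(set1) + len(set2)
    return 2 * len(set1 & set2) / denom if denom else 0.0

def morisita_horn_index(counts1, counts2):
    # counts1/counts2: equal-length arrays aligned on the same junction index.
    x = np.asarray(counts1, dtype=float)
    y = np.asarray(counts2, dtype=float)
    X, Y = x.sum(), y.sum()
    if X == 0 or Y == 0:
        return 0.0
    d_x = (x ** 2).sum() / X ** 2
    d_y = (y ** 2).sum() / Y ** 2
    return 2.0 * (x * y).sum() / ((d_x + d_y) * X * Y)
```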
37 changes: 37 additions & 0 deletions bin/create_pheno_samplesheet.py
@@ -0,0 +1,37 @@
#!/usr/bin/env python3

import sys
import json
from pathlib import Path

# The script now takes two simple arguments:
# 1. The input JSON file
# 2. The output CSV filename
meta_json_file = sys.argv[1]
output_csv = sys.argv[2]

# Load the list from the JSON file
with open(meta_json_file, 'r') as f_in:
    # This will be a list of meta maps, e.g.,
    # [ {'sample': 'Patient01_Base_CD4', ..., 'file': 'file1.tsv'},
    #   {'sample': 'Patient01_Base_CD8', ..., 'file': 'file2.tsv'} ]
    all_new_meta = json.load(f_in)

with open(output_csv, 'w') as f_out:

    # Write the new header
    f_out.write('sample,subject_id,timepoint,origin,phenotype,file\n')

    # Iterate over each item in the list
    for item in all_new_meta:
        meta = item  # 'item' is the meta map
        file_path = str(meta['file'])  # Get the file path from the map

        f_out.write(
            f"{meta['sample']},"
            f"{meta['subject_id']},"
            f"{meta['timepoint']},"
            f"{meta['origin']},"
            f"{meta['phenotype']},"
            f"{file_path}\n"
        )
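A quick sanity check of the input/output shapes this script expects, run from the repository root; the metadata values below are hypothetical:

```python
import json, pathlib, subprocess, tempfile

# One hypothetical meta map per sample; keys match the CSV header written above.
meta = [{
    "sample": "Patient01_Base_CD4",
    "subject_id": "Patient01",
    "timepoint": "Base",
    "origin": "PBMC",
    "phenotype": "CD4",
    "file": "Patient01_Base_CD4_pseudobulk.tsv",
}]

with tempfile.TemporaryDirectory() as tmp:
    meta_json = pathlib.Path(tmp) / "meta.json"
    out_csv = pathlib.Path(tmp) / "phenotype_samplesheet.csv"
    meta_json.write_text(json.dumps(meta))
    subprocess.run(
        ["python3", "bin/create_pheno_samplesheet.py", str(meta_json), str(out_csv)],
        check=True,
    )
    print(out_csv.read_text())
    # sample,subject_id,timepoint,origin,phenotype,file
    # Patient01_Base_CD4,Patient01,Base,PBMC,CD4,Patient01_Base_CD4_pseudobulk.tsv
```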
135 changes: 133 additions & 2 deletions bin/pseudobulk.py
@@ -141,6 +141,120 @@ def pseudobulk(input_df, basename, airr_schema):
    bulk_df.to_csv(output_path, sep='\t', index=False)
    logger.info(f"Pseudobulked AIRR file written to: {output_path}")


def add_phenotype_info(input_df, basename, sobject_gex):
    import numpy as np
    import pandas as pd
    from rpy2.robjects import r
    from rpy2.robjects import pandas2ri
    import rpy2.robjects as ro
    from rpy2.robjects.conversion import localconverter

    # Read Seurat Object
    readRDS = r['readRDS']
    seurat_obj = readRDS(sobject_gex)
    # Extract and Convert Metadata into a pandas DataFrame
    with localconverter(ro.default_converter + pandas2ri.converter):
        cell_metadata_pd = ro.conversion.rpy2py(seurat_obj.slots['meta.data'])

    cell_metadata_pd["cell_id"] = cell_metadata_pd["library"] + "_" + cell_metadata_pd["cell_id"]
    cell_metadata_pd = cell_metadata_pd.loc[:,["cell_id","phenotype"]]
    cell_metadata_pd['phenotype'] = cell_metadata_pd['phenotype'].str.replace(' ', '_')

    # ------ Fix TCR Barcodes ------ #
    input_df["cell_id"] = basename + "_" + input_df["cell_id"]

    # ------ Merge ------ #
    merged_df = pd.merge(input_df, cell_metadata_pd, on="cell_id", how="left")
    return merged_df
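The rpy2/Seurat plumbing above is hard to eyeball; here is a pandas-only miniature of the barcode prefixing and left merge it performs, with the converted Seurat metadata replaced by a plain DataFrame:

```python
import pandas as pd

# Stand-in for the converted meta.data slot: cell_id already carries the library prefix.
cell_metadata_pd = pd.DataFrame({
    "cell_id": ["S1_AAACCTGAGCGTCAAG-1"],
    "phenotype": ["CD8_TEM"],
})

# Stand-in for the per-sample AIRR table; barcodes lack the prefix until we add it.
input_df = pd.DataFrame({"cell_id": ["AAACCTGAGCGTCAAG-1"], "junction_aa": ["CASSF"]})
basename = "S1"
input_df["cell_id"] = basename + "_" + input_df["cell_id"]

merged_df = pd.merge(input_df, cell_metadata_pd, on="cell_id", how="left")
print(merged_df)  # phenotype is NaN for any barcode absent from the GEX metadata
```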

def pseudobulk_phenotype(input_df, basename, airr_schema):
    # Load data, filter out TRA calls
    df = input_df
    df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")]

    # Define required aggregations
    agg_columns = {
        "cell_id": pd.Series.nunique,
        "junction": lambda x: assert_single_unique(x, "junction"),
        "junction_aa": lambda x: assert_single_unique(x, "junction_aa"),
        "v_call": lambda x: assert_single_unique(x, "v_call"),
        "j_call": lambda x: assert_single_unique(x, "j_call"),
        "consensus_count": "sum",
        "duplicate_count": "sum"
    }

    # Identify columns not already aggregated
    excluded_cols = set(agg_columns) | {"sequence", "phenotype"}
    remaining_cols = [col for col in df_trb.columns if col not in excluded_cols]

    # Group by sequence
    grouped = df_trb.groupby(["sequence", "phenotype"])

    # Determine which of the remaining columns have consistent values
    consensus_cols = []
    for col in remaining_cols:
        if (grouped[col].nunique(dropna=True) <= 1).all():
            consensus_cols.append(col)

    # Add those columns to the aggregation with the unique value
    for col in consensus_cols:
        agg_columns[col] = lambda x, col=col: assert_single_unique(x, col)

    # Aggregate
    bulk_df = grouped.agg(agg_columns).reset_index()

    # Rename and reorder columns
    bulk_df.rename(columns={"cell_id": "cell_count"}, inplace=True)
    bulk_df["sequence_id"] = [f"{basename}|sequence{i+1}|{bulk_df['phenotype'].iloc[i]}" for i in range(len(bulk_df))]
    total_duplicate = bulk_df['duplicate_count'].sum()
    bulk_df['duplicate_frequency_percent'] = bulk_df['duplicate_count'] / total_duplicate * 100

    # Load schema
    schema = airr_schema

    required = schema["Rearrangement"]["required"]
    properties = list(schema["Rearrangement"]["properties"].keys())

    # Add missing required fields as blank
    for col in required:
        if col not in bulk_df.columns:
            bulk_df[col] = pd.NA

    # Apply to relevant columns if they exist
    for col in ['v_call', 'd_call', 'j_call', 'c_call']:
        if col in df.columns:
            bulk_df[col] = bulk_df[col].apply(airr_compliant_gene)

    # Identify all boolean columns from schema
    bool_cols = [
        name for name, spec in schema["Rearrangement"]["properties"].items()
        if spec.get("type") == "boolean"
    ]

    bool_cols = bool_cols + ['is_cell']
    for col in bool_cols:
        if col in bulk_df.columns:
            bulk_df[col] = bulk_df[col].apply(normalize_bool)

    # Order columns
    ordered_cols = [col for col in properties if col in bulk_df.columns]
    extra_cols = [col for col in bulk_df.columns if col not in ordered_cols]
    bulk_df = bulk_df[ordered_cols + extra_cols]

    # Iterate through each unique phenotype and save to a separate file
    for phenotype in bulk_df['phenotype'].unique():
        # Filter the dataframe for the current phenotype
        phenotype_df = bulk_df[bulk_df['phenotype'] == phenotype]

        # Define the output path based on basename and phenotype
        output_path = f"{basename}_{phenotype}_pseudobulk_phenotype.tsv"

        # Save the filtered dataframe to a TSV file
        phenotype_df.to_csv(output_path, sep='\t', index=False)
        logger.info(f"Pseudobulked AIRR file for phenotype '{phenotype}' written to: {output_path}")
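One subtle point in the aggregation above is the `col=col` keyword default; without it every lambda would close over the loop variable and check only the last column. A runnable miniature of the pattern, with a stand-in `assert_single_unique` since the real helper is outside the shown hunk:

```python
import pandas as pd

def assert_single_unique(values, col):
    # Stand-in for the repo's helper: return the single value of a grouped column, or fail.
    uniques = pd.Series(values).dropna().unique()
    if len(uniques) > 1:
        raise ValueError(f"Column '{col}' is not constant within a group: {uniques!r}")
    return uniques[0] if len(uniques) else pd.NA

df_trb = pd.DataFrame({
    "sequence": ["AAA", "AAA", "CCC"],
    "phenotype": ["CD4", "CD4", "CD8"],
    "v_call": ["TRBV9*01", "TRBV9*01", "TRBV20-1*01"],
    "duplicate_count": [3, 2, 5],
})

agg_columns = {"duplicate_count": "sum"}
for col in ["v_call"]:
    # col=col freezes the current name; a bare lambda would see the final loop value.
    agg_columns[col] = lambda x, col=col: assert_single_unique(x, col)

print(df_trb.groupby(["sequence", "phenotype"]).agg(agg_columns).reset_index())
```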


if __name__ == "__main__":
    # Parse input arguments
    parser = argparse.ArgumentParser(description="Take positional args")
@@ -163,6 +277,18 @@ def pseudobulk(input_df, basename, airr_schema):
        help="Path to airr json schema for validating adaptive conversion.",
    )

    parser.add_argument(
        "--phenotype",
        action="store_true",
        help="A boolean flag to indicate whether the pseudobulk approach will consider phenotype.",
    )

    parser.add_argument(
        "--sobject_gex",
        type=str,
        help="Path to Seurat Object, 'phenotype' should be a column in meta.data slot.",
    )

    add_logging_args(parser)
    args = parser.parse_args()

@@ -181,5 +307,10 @@ def pseudobulk(input_df, basename, airr_schema):
    with open(args.airr_schema) as f:
        airr_schema = json.load(f)
    basename = args.basename

    pseudobulk(input_df, basename, airr_schema)

    # Pseudobulk by phenotype (if available)
    if args.phenotype:
        input_df = add_phenotype_info(input_df, basename, args.sobject_gex)  # Add single-cell phenotype info from Seurat Object.
        pseudobulk_phenotype(input_df, basename, airr_schema)
    else:
        pseudobulk(input_df, basename, airr_schema)
16 changes: 16 additions & 0 deletions conf/modules.config
@@ -35,6 +35,22 @@ process {
        ]
    }

    withName: /SAMPLE_.*_PHENO/ {
        publishDir = [
            path: { "${params.outdir}/sample_phenotype" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    withName: /COMPARE_.*_PHENO/ {
        publishDir = [
            path: { "${params.outdir}/compare_phenotype" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    withName: TCRDIST3_MATRIX {
        label = params.matrix_sparsity == 'sparse' ? 'process_medium' : ['process_high', 'process_high_memory']
    }
1 change: 1 addition & 0 deletions modules/local/airr_convert/pseudobulk_cellranger.nf
@@ -1,6 +1,7 @@
process PSEUDOBULK_CELLRANGER {
    tag "${sample_meta.sample}"
    label 'process_low'
    container "ghcr.io/karchinlab/tcrtoolkit:main"

    input:
    tuple val(sample_meta), path(count_table)
25 changes: 25 additions & 0 deletions modules/local/airr_convert/pseudobulk_phenotype_cellranger.nf
@@ -0,0 +1,25 @@
process PSEUDOBULK_CELLRANGER_PHENOTYPE {
    tag "${sample_meta.sample}"
    label 'process_low'
    container "ghcr.io/karchinlab/tcrtoolkit:main"

    input:
    tuple val(sample_meta), path(count_table)
    path airr_schema
    path sobject_gex

    output:
    tuple val(sample_meta), path("${sample_meta.sample}_*_pseudobulk_phenotype.tsv"), emit: "cellranger_pseudobulk_phenotype"

    script:
    // Dynamically add phenotype arguments only if sobject_gex is provided (i.e., not null)
    def phenotype_args = sobject_gex ? "--phenotype --sobject_gex ${sobject_gex}" : ""

    """
    pseudobulk.py \\
        ${count_table} \\
        ${sample_meta.sample} \\
        ${airr_schema} \\
        ${phenotype_args}
    """
}
20 changes: 20 additions & 0 deletions modules/local/samplesheet/generate_pheno_samplesheet.nf
@@ -0,0 +1,20 @@
process GENERATE_PHENO_SAMPLESHEET {
    tag "Generating phenotype samplesheet"
    label 'process_single'
    // container "ghcr.io/karchinlab/tcrtoolkit:main"

    input:
    path original_samplesheet
    path meta_json
    path py_script // NOTE: Remove when in container

    output:
    path "phenotype_samplesheet.csv", emit: samplesheet

    script:
    def out_file = "phenotype_samplesheet.csv" // Define the output file variable

    """
    ${py_script} ${meta_json} ${out_file}
    """
}