51 changes: 28 additions & 23 deletions bin/compare_calc.py
@@ -52,8 +52,13 @@ def morisita_horn_index(counts1, counts2):
# -------------------------
# Load samplesheet
# -------------------------
sample_df = pd.read_csv(args.sample_utf8)
sample_df = pd.read_csv(args.sample_utf8, index_col=None)

# Basic hygiene: drop rows missing sample or file
sample_df = sample_df.dropna(subset=["sample", "file"])
print(sample_df.head())
print(sample_df.columns.tolist())

samples = sample_df["sample"].tolist()
files = sample_df["file"].tolist()
n = len(samples)
@@ -64,44 +69,46 @@ def morisita_horn_index(counts1, counts2):
# Preload data structures
# -------------------------
junction_sets = {}
count_vectors = {}
count_series = {}

for sample, file in zip(samples, files):
    df = pd.read_csv(file, sep="\t", usecols=["junction_aa", "duplicate_count"])
    df = df.dropna(subset=["junction_aa"])

    # Ensure counts are numeric
    df["duplicate_count"] = pd.to_numeric(df["duplicate_count"], errors="coerce").fillna(0)

    # Set for presence/absence metrics
    junction_sets[sample] = set(df["junction_aa"])

    # Counts for Morisita–Horn
    count_vectors[sample] = (
    # Counts for Morisita–Horn as a pandas Series (index = junction_aa)
    counts = (
        df.groupby("junction_aa")["duplicate_count"]
        .sum()
    )

    # Ensure we have a Series with a named index
    if not isinstance(counts, pd.Series):
        counts = pd.Series(counts)
    count_series[sample] = counts

# -------------------------
# Align count vectors across union space
# Build union index and align counts
# -------------------------
all_junctions = sorted(
set().union(*junction_sets.values())
)
union_set = set().union(*junction_sets.values()) if junction_sets else set()
all_junctions = pd.Index(sorted(union_set))

aligned_vectors = {}
for sample in samples:
    count_vectors[sample] = (
        count_vectors[sample]
        .reindex(all_junctions, fill_value=0)
        .to_numpy()
    )

    aligned = count_series[sample].reindex(all_junctions, fill_value=0)
    # Store as numpy for MH computation
    aligned_vectors[sample] = aligned.to_numpy(dtype=float)
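For reviewers unfamiliar with the reindex pattern, here is a minimal, self-contained illustration of the alignment step (toy junction counts, not data from this pipeline):

```python
import pandas as pd

# Two toy per-sample count Series, reindexed onto the union of junctions so the
# Morisita–Horn vectors share one coordinate space; missing junctions become 0.
a = pd.Series({"CASSF": 5, "CASSG": 1})
b = pd.Series({"CASSG": 2, "CASSH": 7})
all_junctions = pd.Index(sorted(set(a.index) | set(b.index)))
print(a.reindex(all_junctions, fill_value=0).to_numpy(dtype=float))  # [5. 1. 0.]
print(b.reindex(all_junctions, fill_value=0).to_numpy(dtype=float))  # [0. 2. 7.]
```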

# -------------------------
# Initialize matrices
# -------------------------
jaccard_mat = np.zeros((n, n))
sorensen_mat = np.zeros((n, n))
morisita_mat = np.zeros((n, n))

jaccard_mat = np.zeros((n, n), dtype=float)
sorensen_mat = np.zeros((n, n), dtype=float)
morisita_mat = np.zeros((n, n), dtype=float)

# -------------------------
# Compute upper triangle only
@@ -111,7 +118,7 @@ def morisita_horn_index(counts1, counts2):
for i in range(n):
    s1 = samples[i]
    set1 = junction_sets[s1]
    counts1 = count_vectors[s1]
    counts1 = aligned_vectors[s1]

    # Diagonal
    jaccard_mat[i, i] = 1.0
@@ -120,16 +127,14 @@ def morisita_horn_index(counts1, counts2):

    for j in range(i + 1, n):
        s2 = samples[j]

        j_val = jaccard_index(set1, junction_sets[s2])
        s_val = sorensen_index(set1, junction_sets[s2])
        m_val = morisita_horn_index(counts1, count_vectors[s2])
        m_val = morisita_horn_index(counts1, aligned_vectors[s2])

        jaccard_mat[i, j] = jaccard_mat[j, i] = j_val
        sorensen_mat[i, j] = sorensen_mat[j, i] = s_val
        morisita_mat[i, j] = morisita_mat[j, i] = m_val


# -------------------------
# Write outputs
# -------------------------
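The `jaccard_index`, `sorensen_index`, and `morisita_horn_index` helpers sit above the collapsed part of this diff; for reference, a sketch of the standard definitions they are assumed to implement (the repo's versions may handle edge cases differently):

```python
import numpy as np

def jaccard_index(set1, set2):
    # |A ∩ B| / |A ∪ B|; returns 0.0 for two empty sets (assumed convention).
    union = set1 | set2
    return len(set1 & set2) / len(union) if union else 0.0

def sorensen_index(set1, set2):
    # 2|A ∩ B| / (|A| + |B|)
    denom = len(set1) + len(set2)
    return 2 * len(set1 & set2) / denom if denom else 0.0

def morisita_horn_index(counts1, counts2):
    # counts1/counts2: equal-length arrays aligned on the same junction index.
    x = np.asarray(counts1, dtype=float)
    y = np.asarray(counts2, dtype=float)
    X, Y = x.sum(), y.sum()
    if X == 0 or Y == 0:
        return 0.0
    d_x = (x ** 2).sum() / X ** 2
    d_y = (y ** 2).sum() / Y ** 2
    return 2.0 * (x * y).sum() / ((d_x + d_y) * X * Y)
```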
37 changes: 37 additions & 0 deletions bin/create_pheno_samplesheet.py
@@ -0,0 +1,37 @@
#!/usr/bin/env python3

import sys
import json
from pathlib import Path

# The script now takes two simple arguments:
# 1. The input JSON file
# 2. The output CSV filename
meta_json_file = sys.argv[1]
output_csv = sys.argv[2]

# Load the list from the JSON file
with open(meta_json_file, 'r') as f_in:
    # This will be a list of meta maps, e.g.,
    # [ {'sample': 'Patient01_Base_CD4', ..., 'file': 'file1.tsv'},
    #   {'sample': 'Patient01_Base_CD8', ..., 'file': 'file2.tsv'} ]
    all_new_meta = json.load(f_in)

with open(output_csv, 'w') as f_out:

    # Write the new header
    f_out.write('sample,subject_id,timepoint,origin,phenotype,file\n')

    # Iterate over each item in the list
    for item in all_new_meta:
        meta = item  # 'item' is the meta map
        file_path = str(meta['file'])  # Get the file path from the map

        f_out.write(
            f"{meta['sample']},"
            f"{meta['subject_id']},"
            f"{meta['timepoint']},"
            f"{meta['origin']},"
            f"{meta['phenotype']},"
            f"{file_path}\n"
        )
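A quick sanity check of the input/output shapes this script expects, run from the repository root; the metadata values below are hypothetical:

```python
import json, pathlib, subprocess, tempfile

# One hypothetical meta map per sample; keys match the CSV header written above.
meta = [{
    "sample": "Patient01_Base_CD4",
    "subject_id": "Patient01",
    "timepoint": "Base",
    "origin": "PBMC",
    "phenotype": "CD4",
    "file": "Patient01_Base_CD4_pseudobulk.tsv",
}]

with tempfile.TemporaryDirectory() as tmp:
    meta_json = pathlib.Path(tmp) / "meta.json"
    out_csv = pathlib.Path(tmp) / "phenotype_samplesheet.csv"
    meta_json.write_text(json.dumps(meta))
    subprocess.run(
        ["python3", "bin/create_pheno_samplesheet.py", str(meta_json), str(out_csv)],
        check=True,
    )
    print(out_csv.read_text())
    # sample,subject_id,timepoint,origin,phenotype,file
    # Patient01_Base_CD4,Patient01,Base,PBMC,CD4,Patient01_Base_CD4_pseudobulk.tsv
```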
135 changes: 133 additions & 2 deletions bin/pseudobulk.py
@@ -141,6 +141,120 @@ def pseudobulk(input_df, basename, airr_schema):
    bulk_df.to_csv(output_path, sep='\t', index=False)
    logger.info(f"Pseudobulked AIRR file written to: {output_path}")


def add_phenotype_info(input_df, basename, sobject_gex):
    import numpy as np
    import pandas as pd
    from rpy2.robjects import r
    from rpy2.robjects import pandas2ri
    import rpy2.robjects as ro
    from rpy2.robjects.conversion import localconverter

    # Read Seurat Object
    readRDS = r['readRDS']
    seurat_obj = readRDS(sobject_gex)
    # Extract and Convert Metadata into a pandas DataFrame
    with localconverter(ro.default_converter + pandas2ri.converter):
        cell_metadata_pd = ro.conversion.rpy2py(seurat_obj.slots['meta.data'])

    cell_metadata_pd["cell_id"] = cell_metadata_pd["library"] + "_" + cell_metadata_pd["cell_id"]
    cell_metadata_pd = cell_metadata_pd.loc[:,["cell_id","phenotype"]]
    cell_metadata_pd['phenotype'] = cell_metadata_pd['phenotype'].str.replace(' ', '_')

    # ------ Fix TCR Barcodes ------ #
    input_df["cell_id"] = basename + "_" + input_df["cell_id"]

    # ------ Merge ------ #
    merged_df = pd.merge(input_df, cell_metadata_pd, on="cell_id", how="left")
    return merged_df
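The rpy2/Seurat plumbing above is hard to eyeball; here is a pandas-only miniature of the barcode prefixing and left merge it performs, with the converted Seurat metadata replaced by a plain DataFrame:

```python
import pandas as pd

# Stand-in for the converted meta.data slot: cell_id already carries the library prefix.
cell_metadata_pd = pd.DataFrame({
    "cell_id": ["S1_AAACCTGAGCGTCAAG-1"],
    "phenotype": ["CD8_TEM"],
})

# Stand-in for the per-sample AIRR table; barcodes lack the prefix until we add it.
input_df = pd.DataFrame({"cell_id": ["AAACCTGAGCGTCAAG-1"], "junction_aa": ["CASSF"]})
basename = "S1"
input_df["cell_id"] = basename + "_" + input_df["cell_id"]

merged_df = pd.merge(input_df, cell_metadata_pd, on="cell_id", how="left")
print(merged_df)  # phenotype is NaN for any barcode absent from the GEX metadata
```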

def pseudobulk_phenotype(input_df, basename, airr_schema):
    # Load data, filter out TRA calls
    df = input_df
    df_trb = df[df["v_call"].str.startswith("TRB") & df["j_call"].str.startswith("TRB")]

    # Define required aggregations
    agg_columns = {
        "cell_id": pd.Series.nunique,
        "junction": lambda x: assert_single_unique(x, "junction"),
        "junction_aa": lambda x: assert_single_unique(x, "junction_aa"),
        "v_call": lambda x: assert_single_unique(x, "v_call"),
        "j_call": lambda x: assert_single_unique(x, "j_call"),
        "consensus_count": "sum",
        "duplicate_count": "sum"
    }

    # Identify columns not already aggregated
    excluded_cols = set(agg_columns) | {"sequence", "phenotype"}
    remaining_cols = [col for col in df_trb.columns if col not in excluded_cols]

    # Group by sequence
    grouped = df_trb.groupby(["sequence", "phenotype"])

    # Determine which of the remaining columns have consistent values
    consensus_cols = []
    for col in remaining_cols:
        if (grouped[col].nunique(dropna=True) <= 1).all():
            consensus_cols.append(col)

    # Add those columns to the aggregation with the unique value
    for col in consensus_cols:
        agg_columns[col] = lambda x, col=col: assert_single_unique(x, col)

    # Aggregate
    bulk_df = grouped.agg(agg_columns).reset_index()

    # Rename and reorder columns
    bulk_df.rename(columns={"cell_id": "cell_count"}, inplace=True)
    bulk_df["sequence_id"] = [f"{basename}|sequence{i+1}|{bulk_df['phenotype'].iloc[i]}" for i in range(len(bulk_df))]
    total_duplicate = bulk_df['duplicate_count'].sum()
    bulk_df['duplicate_frequency_percent'] = bulk_df['duplicate_count'] / total_duplicate * 100

    # Load schema
    schema = airr_schema

    required = schema["Rearrangement"]["required"]
    properties = list(schema["Rearrangement"]["properties"].keys())

    # Add missing required fields as blank
    for col in required:
        if col not in bulk_df.columns:
            bulk_df[col] = pd.NA

    # Apply to relevant columns if they exist
    for col in ['v_call', 'd_call', 'j_call', 'c_call']:
        if col in df.columns:
            bulk_df[col] = bulk_df[col].apply(airr_compliant_gene)

    # Identify all boolean columns from schema
    bool_cols = [
        name for name, spec in schema["Rearrangement"]["properties"].items()
        if spec.get("type") == "boolean"
    ]

    bool_cols = bool_cols + ['is_cell']
    for col in bool_cols:
        if col in bulk_df.columns:
            bulk_df[col] = bulk_df[col].apply(normalize_bool)

    # Order columns
    ordered_cols = [col for col in properties if col in bulk_df.columns]
    extra_cols = [col for col in bulk_df.columns if col not in ordered_cols]
    bulk_df = bulk_df[ordered_cols + extra_cols]

    # Iterate through each unique phenotype and save to a separate file
    for phenotype in bulk_df['phenotype'].unique():
        # Filter the dataframe for the current phenotype
        phenotype_df = bulk_df[bulk_df['phenotype'] == phenotype]

        # Define the output path based on basename and phenotype
        output_path = f"{basename}_{phenotype}_pseudobulk_phenotype.tsv"

        # Save the filtered dataframe to a TSV file
        phenotype_df.to_csv(output_path, sep='\t', index=False)
        logger.info(f"Pseudobulked AIRR file for phenotype '{phenotype}' written to: {output_path}")
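One subtle point in the aggregation above is the `col=col` keyword default; without it every lambda would close over the loop variable and check only the last column. A runnable miniature of the pattern, with a stand-in `assert_single_unique` since the real helper is outside the shown hunk:

```python
import pandas as pd

def assert_single_unique(values, col):
    # Stand-in for the repo's helper: return the single value of a grouped column, or fail.
    uniques = pd.Series(values).dropna().unique()
    if len(uniques) > 1:
        raise ValueError(f"Column '{col}' is not constant within a group: {uniques!r}")
    return uniques[0] if len(uniques) else pd.NA

df_trb = pd.DataFrame({
    "sequence": ["AAA", "AAA", "CCC"],
    "phenotype": ["CD4", "CD4", "CD8"],
    "v_call": ["TRBV9*01", "TRBV9*01", "TRBV20-1*01"],
    "duplicate_count": [3, 2, 5],
})

agg_columns = {"duplicate_count": "sum"}
for col in ["v_call"]:
    # col=col freezes the current name; a bare lambda would see the final loop value.
    agg_columns[col] = lambda x, col=col: assert_single_unique(x, col)

print(df_trb.groupby(["sequence", "phenotype"]).agg(agg_columns).reset_index())
```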


if __name__ == "__main__":
    # Parse input arguments
    parser = argparse.ArgumentParser(description="Take positional args")
@@ -163,6 +277,18 @@ def pseudobulk(input_df, basename, airr_schema):
        help="Path to airr json schema for validating adaptive conversion.",
    )

    parser.add_argument(
        "--phenotype",
        action="store_true",
        help="A boolean flag to indicate whether the pseudobulk approach will consider phenotype.",
    )

    parser.add_argument(
        "--sobject_gex",
        type=str,
        help="Path to Seurat Object, 'phenotype' should be a column in meta.data slot.",
    )

    add_logging_args(parser)
    args = parser.parse_args()

@@ -181,5 +307,10 @@ def pseudobulk(input_df, basename, airr_schema):
    with open(args.airr_schema) as f:
        airr_schema = json.load(f)
    basename = args.basename

    pseudobulk(input_df, basename, airr_schema)

    # Pseudobulk by phenotype (if available)
    if args.phenotype:
        input_df = add_phenotype_info(input_df, basename, args.sobject_gex)  # Add single-cell phenotype info from Seurat Object.
        pseudobulk_phenotype(input_df, basename, airr_schema)
    else:
        pseudobulk(input_df, basename, airr_schema)
16 changes: 16 additions & 0 deletions conf/modules.config
@@ -35,6 +35,22 @@ process {
        ]
    }

    withName: /SAMPLE_.*_PHENO/ {
        publishDir = [
            path: { "${params.outdir}/sample_phenotype" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    withName: /COMPARE_.*_PHENO/ {
        publishDir = [
            path: { "${params.outdir}/compare_phenotype" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

    withName: TCRDIST3_MATRIX {
        label = params.matrix_sparsity == 'sparse' ? 'process_medium' : ['process_high', 'process_high_memory']
    }
1 change: 1 addition & 0 deletions modules/local/airr_convert/pseudobulk_cellranger.nf
@@ -1,6 +1,7 @@
process PSEUDOBULK_CELLRANGER {
    tag "${sample_meta.sample}"
    label 'process_low'
    container "ghcr.io/karchinlab/tcrtoolkit:main"

    input:
    tuple val(sample_meta), path(count_table)
25 changes: 25 additions & 0 deletions modules/local/airr_convert/pseudobulk_phenotype_cellranger.nf
@@ -0,0 +1,25 @@
process PSEUDOBULK_CELLRANGER_PHENOTYPE {
    tag "${sample_meta.sample}"
    label 'process_low'
    container "ghcr.io/karchinlab/tcrtoolkit:main"

    input:
    tuple val(sample_meta), path(count_table)
    path airr_schema
    path sobject_gex

    output:
    tuple val(sample_meta), path("${sample_meta.sample}_*_pseudobulk_phenotype.tsv"), emit: "cellranger_pseudobulk_phenotype"

    script:
    // Dynamically add phenotype arguments only if sobject_gex is provided (i.e., not null)
    def phenotype_args = sobject_gex ? "--phenotype --sobject_gex ${sobject_gex}" : ""

    """
    pseudobulk.py \\
        ${count_table} \\
        ${sample_meta.sample} \\
        ${airr_schema} \\
        ${phenotype_args}
    """
}
20 changes: 20 additions & 0 deletions modules/local/samplesheet/generate_pheno_samplesheet.nf
@@ -0,0 +1,20 @@
process GENERATE_PHENO_SAMPLESHEET {
    tag "Generating phenotype samplesheet"
    label 'process_single'
    // container "ghcr.io/karchinlab/tcrtoolkit:main"

    input:
    path original_samplesheet
    path meta_json
    path py_script // NOTE: Remove when in container

    output:
    path "phenotype_samplesheet.csv", emit: samplesheet

    script:
    def out_file = "phenotype_samplesheet.csv" // Define the output file variable

    """
    ${py_script} ${meta_json} ${out_file}
    """
}