25 changes: 24 additions & 1 deletion .cirro/process-form.json
@@ -50,8 +50,31 @@
"description": "p_depth (GLIPH2)",
"title": "p_depth",
"type": "string"
},
"samplechart_x_col": {
"default": "timepoint",
"description": "Metadata column for x axis of sample-level plots for Sample workflow notebook",
"title": "Sample plot X axis column",
"type": "string"
},
"samplechart_color_col": {
"default": "origin",
"description": "Metadata column for legend color of sample-level plots for Sample workflow notebook",
"title": "Sample plot X axis column",
"type": "string"
},
"vgene_subject_col": {
"default": "subject_id",
"description": "Metadata column for grouping of samples for V gene plots for Sample workflow notebook",
"title": "V gene plot subject column",
"type": "string"
},
"vgene_x_cols": {
"default": "timepoint,origin",
"description": "Comma-separated list of metadata columns for x axis of V gene plot (eg. timepoint,origin or timepoint)",
"title": "V gene plot X axis columns",
"type": "string"
}

}
},
"ui": {}
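`vgene_x_cols` travels through the form as a single comma-separated string; whatever consumes it must split on commas. A minimal sketch of that parsing, using the default value above (the split itself is an assumed downstream detail, not code from this PR):

```python
# Minimal sketch: turning the comma-separated vgene_x_cols form value
# into a list of metadata column names (assumed downstream behavior).
vgene_x_cols = "timepoint,origin"  # default from the form field above

x_cols = [col.strip() for col in vgene_x_cols.split(",") if col.strip()]
print(x_cols)  # ['timepoint', 'origin']
```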
6 changes: 5 additions & 1 deletion .cirro/process-input.json
@@ -9,5 +9,9 @@
"local_min_OVE": "$.params.dataset.paramJson.local_min_OVE",
"local_min_pvalue": "$.params.dataset.paramJson.local_min_pvalue",
"outdir": "$.params.dataset.s3|/data/",
"p_depth": "$.params.dataset.paramJson.p_depth"
"p_depth": "$.params.dataset.paramJson.p_depth",
"samplechart_x_col": "$.params.dataset.paramJson.samplechart_x_col",
"samplechart_color_col": "$.params.dataset.paramJson.samplechart_color_col",
"vgene_subject_col": "$.params.dataset.paramJson.vgene_subject_col",
"vgene_x_cols": "$.params.dataset.paramJson.vgene_x_cols"
}
12 changes: 3 additions & 9 deletions bin/compare_concatenate.py
@@ -29,18 +29,12 @@ def main():
# Read the TSV file into a dataframe
file_path = str(row['file'])
df = pd.read_csv(file_path, sep="\t", header=0)

# Get metadata
subject_id = row['subject_id']
timepoint = row['timepoint']
origin = row['origin']


# Add patient column
df['patient'] = f"{subject_id}:{timepoint}_{origin}"
df['sample'] = row['sample']

# Select relevant columns
df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'patient', 'sample']]
df = df[['junction_aa', 'v_call', 'j_call', 'duplicate_count', 'sample']]
dfs.append(df)


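With the `patient` column gone, the loop body in this script reduces to read, tag, subset, append. A toy version of the same pattern, assuming a samplesheet with `file` and `sample` columns as the script does:

```python
import pandas as pd

# Toy samplesheet standing in for the rows the script iterates over
samplesheet = pd.DataFrame({
    "file": ["s01.tsv", "s02.tsv"],
    "sample": ["S01", "S02"],
})

dfs = []
for _, row in samplesheet.iterrows():
    df = pd.read_csv(str(row["file"]), sep="\t", header=0)
    df["sample"] = row["sample"]  # tag each clonotype with its sample
    df = df[["junction_aa", "v_call", "j_call", "duplicate_count", "sample"]]
    dfs.append(df)

combined = pd.concat(dfs, ignore_index=True)
```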
54 changes: 15 additions & 39 deletions bin/sample_calc.py
@@ -22,18 +22,7 @@ def extract_trb_family(allele):
match = re.match(r'(TRB[V|D|J])(\d+)', allele)
return f"{match.group(1)}{match.group(2)}" if match else None

def compute_gene_family_table(counts, col_name, all_families, sample_meta):
fam_col = f"{col_name}FamilyName"
counts[fam_col] = counts[col_name].apply(extract_trb_family)
fam_df = counts[fam_col].value_counts(dropna=False).to_frame().T.sort_index(axis=1)
fam_df = fam_df.reindex(columns=all_families, fill_value=0)

for col in ['origin', 'timepoint', 'subject_id']:
fam_df.insert(0, col, sample_meta[col])

return fam_df

def calc_gene_family(counts, gene_column, family_prefix, max_index, output_file, meta_df):
def calc_gene_family(sample_name, counts, gene_column, family_prefix, max_index, output_file):
# Build list of all possible family names
all_fams = [f'{family_prefix}{i}' for i in range(1, max_index + 1)]

@@ -43,12 +32,12 @@ def calc_gene_family(counts, gene_column, family_prefix, max_index, output_file,
# Reindex to include all families
fam_df = pd.DataFrame([fam_df.reindex(columns=all_fams, fill_value=0).iloc[0]]).reset_index(drop=True)

# Add metadata columns
fam_df = pd.concat([meta_df, fam_df], axis=1)
# Add sample column
fam_df.insert(0, 'sample', sample_name)

fam_df.to_csv(output_file, header=True, index=False)

def calc_sample_stats(meta_df, counts, output_file):
def calc_sample_stats(sample_name, counts, output_file):
"""Calculate sample level statistics of TCR repertoire."""

## first pass stats
@@ -105,8 +94,8 @@ def calc_sample_stats(meta_df, counts, output_file):
# Convert to single-row dataframe
df_stats = pd.DataFrame([row_data])

# Add metadata columns
df_stats = pd.concat([meta_df, df_stats], axis=1)
# Add sample column
df_stats.insert(0, 'sample', sample_name)

# Save to CSV
df_stats.to_csv(output_file, header=True, index=False)
@@ -117,40 +106,27 @@ def main():
parser = argparse.ArgumentParser(description='Calculate clonality of a TCR repertoire')

# add arguments
parser.add_argument('-s', '--sample_meta',
metavar='sample_meta',
parser.add_argument('-s', '--sample_name',
metavar='sample_name',
type=str,
help='sample metadata passed in as json format')
help='sample name')
parser.add_argument('-c', '--count_table',
metavar='count_table',
type=argparse.FileType('r'),
help='counts file in TSV format')

args = parser.parse_args()

## convert metadata to list
sample_meta = json.loads(args.sample_meta)
sample = args.sample_name

# Read in the counts file
counts = pd.read_csv(args.count_table, sep='\t', header=0)

# Build metadata row from selected keys
meta_keys = ['subject_id', 'timepoint', 'origin']
meta_row = {k: sample_meta[k] for k in meta_keys}
meta_df = pd.DataFrame([meta_row])

sample = sample_meta['sample']

calc_gene_family(counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv', meta_df)
calc_gene_family(counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv', meta_df)
calc_gene_family(counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv', meta_df)
counts = pd.read_csv(args.count_table, sep='\t')

# Build metadata row from selected keys
meta_keys = ['sample', 'subject_id', 'timepoint', 'origin']
meta_row = {k: sample_meta[k] for k in meta_keys}
meta_df = pd.DataFrame([meta_row])
calc_gene_family(sample, counts, 'v_call', 'TRBV', 30, f'vdj/v_family_{sample}.csv')
calc_gene_family(sample, counts, 'd_call', 'TRBD', 2, f'vdj/d_family_{sample}.csv')
calc_gene_family(sample, counts, 'j_call', 'TRBJ', 2, f'vdj/j_family_{sample}.csv')

calc_sample_stats(meta_df, counts, f'stats/sample_stats_{sample}.csv')
calc_sample_stats(sample, counts, f'stats/sample_stats_{sample}.csv')

if __name__ == "__main__":
main()
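The refactor replaces the JSON metadata frame with a single `sample` column prepended to each output. A condensed toy run of the family-counting logic, following the script's regex and reindex pattern (written here as `[VDJ]`; the pipe inside the script's `[V|D|J]` class is redundant):

```python
import re
import pandas as pd

def extract_trb_family(allele):
    # Keep the TRBV/TRBD/TRBJ prefix plus family number, e.g. TRBV5-1*01 -> TRBV5
    match = re.match(r"(TRB[VDJ])(\d+)", allele)
    return f"{match.group(1)}{match.group(2)}" if match else None

counts = pd.DataFrame({"v_call": ["TRBV5-1*01", "TRBV5-4*01", "TRBV20-1*01"]})
fam = counts["v_call"].apply(extract_trb_family).value_counts().to_frame().T
all_fams = [f"TRBV{i}" for i in range(1, 31)]  # TRBV1..TRBV30, as in the script
fam = fam.reindex(columns=all_fams, fill_value=0).reset_index(drop=True)
fam.insert(0, "sample", "S01")  # sample name becomes the first column
print(fam[["sample", "TRBV5", "TRBV20"]])  # counts: TRBV5 = 2, TRBV20 = 1
```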
38 changes: 14 additions & 24 deletions modules/local/compare/giana.nf
@@ -28,30 +28,20 @@ process GIANA_CALC {
> giana.log 2>&1

# Insert header after GIANA comments
python3 - <<EOF
input_file = "giana_RotationEncodingBL62.txt"
concat_header_file = "${concat_cdr3}"

with open(concat_header_file, 'r', encoding='utf-8') as f:
header = f.readline().strip().split('\\t')
header.insert(1, "cluster")
header_line = '\\t'.join(header)

with open(input_file, 'r', encoding='utf-8') as infile:
lines = infile.readlines()

with open(input_file, 'w', encoding='utf-8') as outfile:
inserted = False
for line in lines:
if line.startswith("##"):
outfile.write(line)
elif not inserted:
outfile.write(header_line + '\\n')
outfile.write(line)
inserted = True
else:
outfile.write(line)
EOF
insert=\$(head -n 1 "${concat_cdr3}")
insert=\$(echo "\$insert" | awk -F'\t' 'BEGIN{OFS="\t"} {
out = \$1 OFS "cluster"
for (i=2; i<=NF; i++) {
out = out OFS \$i
}
print out
}')

awk -v insert="\$insert" '
/^##/ { print; next }
!inserted { print insert; inserted=1 }
{ print }
' giana_RotationEncodingBL62.txt > tmp && mv tmp giana_RotationEncodingBL62.txt

mv giana_RotationEncodingBL62.txt_EncodingMatrix.txt giana_EncodingMatrix.txt
"""
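To make the awk block concrete: it lifts the header from the concatenated CDR3 table, splices a `cluster` field in at position two, and emits the result after GIANA's leading `##` comment lines. A toy walk-through of that transformation in Python, with made-up contents:

```python
# Toy demonstration of the header-insertion step (made-up file contents).
giana_lines = [
    "## GIANA output comment",   # leading comment block stays on top
    "CASSLGQ\t3\tTRBV5\tS01",    # first data line
    "CASSQDR\t3\tTRBV20\tS02",
]
header = ["CDR3b", "TRBV", "sample"]  # first line of concat_cdr3 (toy)
header.insert(1, "cluster")           # cluster becomes field two

out, inserted = [], False
for line in giana_lines:
    if line.startswith("##"):
        out.append(line)
    elif not inserted:
        out.append("\t".join(header))  # header lands after the comments
        inserted = True
        out.append(line)
    else:
        out.append(line)
# out == ["## GIANA output comment", "CDR3b\tcluster\tTRBV\tsample", <data lines>]
```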
13 changes: 5 additions & 8 deletions modules/local/compare/gliph2.nf
@@ -18,15 +18,14 @@ process GLIPH2_TURBOGLIPH {

script:
"""
# R script starts here
cat > run_gliph2.R <<EOF
Rscript - <<EOF
#!/usr/bin/env Rscript

library(turboGliph)

# During testing, including the TRBJ column caused issues in the clustering step. It is removed here and reinserted afterwards.
df <- read.csv("$concat_cdr3", sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)
# df2 <- subset(df, select = c('CDR3b', 'TRBV', 'patient', 'counts'))
df[,'patient'] <- df[,'sample']

result <- turboGliph::gliph2(
cdr3_sequences = df,
@@ -40,14 +39,12 @@ process GLIPH2_TURBOGLIPH {
)

df3 <- read.csv('cluster_member_details.txt', sep = '\t', stringsAsFactors = FALSE, check.names = FALSE)
df3 <- merge(df3, df[, c("CDR3b", "TRBV", "patient", "TRBJ", 'counts')], by = c("CDR3b", "TRBV", "patient", 'counts'), all.x = TRUE)
df3[,'sample'] <- df3[,'patient']
df3 <- merge(df3, df[, c("CDR3b", "TRBV", "sample", 'counts')], by = c("CDR3b", "TRBV", "sample", 'counts'), all.x = TRUE)
df3 <- df3[, c('CDR3b', 'TRBV', 'TRBJ', 'counts', 'sample', 'tag', 'seq_ID', 'ultCDR3b')]
write.table(df3, "cluster_member_details.txt", sep = "\t", row.names = FALSE, quote = FALSE)

EOF

# Run the R script
Rscript run_gliph2.R

# Rename local_similarities file to standardize output name
input_file="local_similarities_*.txt"
cat \$input_file > local_similarities.txt
32 changes: 19 additions & 13 deletions modules/local/compare/tcrsharing.nf
@@ -19,30 +19,36 @@
# Load data
df = pd.read_csv("${concat_cdr3}", sep="\t")

# Step 1: Map samples to integers
sample_mapping = {sample: i + 1 for i, sample in enumerate(df['sample'].unique())}
sample_map_df = pd.DataFrame.from_dict(sample_mapping, orient='index', columns=['sample_id']).reset_index()
sample_map_df.columns = ['patient', 'sample_id']
# Map sample to integer codes
df['sample'] = df['sample'].astype('category')
df['sample_id'] = df['sample'].cat.codes + 1

# Export mapping (uses category lookup directly)
sample_map_df = pd.DataFrame({
'patient': df['sample'].cat.categories,
'sample_id': np.arange(1, len(df['sample'].cat.categories) + 1)
})
sample_map_df.to_csv("sample_mapping.tsv", sep="\t", index=False)

# Step 2: Group by CDR3b and aggregate sample_ids
df['sample_id'] = df['sample'].map(sample_mapping)

# Get unique sample_ids per CDR3b (vectorized)
grouped = (
df.groupby('CDR3b')['sample_id']
.apply(lambda x: sorted(set(x))) # remove duplicates if any
.unique() # one array of ids per CDR3b
.apply(np.sort) # sorted for stable output
.reset_index()
)

# Step 3: Add comma-separated list and total count
grouped['samples_present'] = grouped['sample_id'].apply(lambda x: ",".join(map(str, x)))
# Calculate counts
grouped['total_samples'] = grouped['sample_id'].apply(len)
grouped['samples_present'] = grouped['sample_id'].apply(
lambda arr: ",".join(arr.astype(str))
)

# Step 4: Final output — drop raw list
# Drop raw list
final_df = grouped[['CDR3b', 'total_samples', 'samples_present']]
final_df = final_df.sort_values(by='total_samples', axis=0, ascending=False)
final_df = final_df.sort_values(by="total_samples", ascending=False)

# Step 5: Export both outputs
# Export final list
final_df.to_csv("cdr3_sharing.tsv", sep="\t", index=False)
EOF

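The rewrite swaps the hand-built `enumerate` mapping for pandas categoricals, which keeps both the id assignment and the grouping vectorized. A toy run of the new approach, with made-up sample names:

```python
import numpy as np
import pandas as pd

# Toy data: one clonotype shared by two samples, one private to S02
df = pd.DataFrame({
    "CDR3b": ["CASSL", "CASSL", "CASSQ"],
    "sample": ["S01", "S02", "S02"],
})

df["sample"] = df["sample"].astype("category")
df["sample_id"] = df["sample"].cat.codes + 1  # stable 1-based ids

grouped = (
    df.groupby("CDR3b")["sample_id"]
    .unique()          # one array of sample ids per CDR3b
    .apply(np.sort)
    .reset_index()
)
grouped["total_samples"] = grouped["sample_id"].apply(len)
grouped["samples_present"] = grouped["sample_id"].apply(
    lambda arr: ",".join(arr.astype(str))
)
# CASSL -> total_samples 2, samples_present "1,2"; CASSQ -> 1, "2"
```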
4 changes: 1 addition & 3 deletions modules/local/sample/sample_calc.nf
@@ -14,13 +14,11 @@ process SAMPLE_CALC {
val sample_meta , emit: sample_meta

script:
def meta_json = groovy.json.JsonOutput.toJson(sample_meta)

"""
mkdir -p stats
mkdir -p vdj

sample_calc.py -s '${meta_json}' -c ${count_table}
sample_calc.py -s '${sample_meta.sample}' -c ${count_table}
"""

stub:
6 changes: 5 additions & 1 deletion modules/local/sample/sample_plot.nf
@@ -13,7 +13,7 @@ process SAMPLE_PLOT {
output:
path 'sample_stats.html'

script:
script:
"""
## copy quarto notebook to output directory
cp $sample_stats_template sample_stats.qmd
@@ -26,6 +26,10 @@
-P sample_table:$sample_table \
-P sample_stats_csv:$sample_stats_csv \
-P v_family_csv:$v_family_csv \
-P samplechart_x_col:${params.samplechart_x_col} \
-P samplechart_color_col:${params.samplechart_color_col} \
-P vgene_subject_col:${params.vgene_subject_col} \
-P vgene_x_cols:${params.vgene_x_cols} \
--to html
"""

6 changes: 6 additions & 0 deletions nextflow.config
@@ -26,6 +26,12 @@ params {
sample_stats_template = "${projectDir}/notebooks/sample_stats_template.qmd"
compare_stats_template = "${projectDir}/notebooks/compare_stats_template.qmd"

// Sample stats metadata parameters
samplechart_x_col = 'timepoint'
samplechart_color_col = 'origin'
vgene_subject_col = 'subject_id'
vgene_x_cols = 'origin,timepoint'

// GIANA parameters
threshold = 7.0
threshold_score = 3.6