Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docker/minfi/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ RUN R --no-save <<SCRIPT
BiocManager::install("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
SCRIPT

COPY --from=scripts --chmod=777 methylation/methylation-preprocess.R /scripts/methylation/methylation-preprocess.R
COPY --from=scripts --chmod=777 methylation/methylation-preprocess.R /scripts/methylation/methylation-preprocess.R
COPY --from=scripts --chmod=777 methylation/list-sex-probes.R /scripts/methylation/list-sex-probes.R
2 changes: 1 addition & 1 deletion docker/minfi/package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "minfi",
"version": "1.48.0",
"revision": "7"
"revision": "8"
}
2 changes: 1 addition & 1 deletion docker/pandas/package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "pandas",
"version": "2.2.1",
"revision": "6"
"revision": "7"
}
11 changes: 11 additions & 0 deletions scripts/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/).

Any change to the `scripts/` directory should be accompanied by version increases in the `docker/` directory! If you are editing this file, please ensure these changes propagate!

## 2025 December

### Added

- Added `list-sex-probes.R` to dump the list of probes that align to the sex chromosomes [#283](https://github.com/stjudecloud/workflows/pull/283)

### Changed

- `filter.py` now accepts an optional set of files listing probes to exclude [#283](https://github.com/stjudecloud/workflows/pull/283)
- `methylation-preprocess.R` now outputs non-genomic probes, SNP affected probes, and non-SNP affected probes [#283](https://github.com/stjudecloud/workflows/pull/283)

## 2025 September

### Added
Expand Down
18 changes: 17 additions & 1 deletion scripts/methylation/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ def get_args():
help="Fraction of samples that must exceed the p-value threshold.",
)
parser.add_argument("--pval", type=str, help="P-values CSV file.")
parser.add_argument(
"--exclude", type=str, action="append", help="Files with probes to exclude."
)
parser.add_argument("beta", type=str, help="Beta values CSV file.")
args = parser.parse_args()

Expand All @@ -40,6 +43,15 @@ def get_args():
if __name__ == "__main__":
args = get_args()

# Read probes to exclude
probes_to_exclude = []
if args.exclude is not None:
for exclude_file in args.exclude:
with open(exclude_file, "r") as ef:
for line in ef:
probes_to_exclude.append(line.strip())
print("Number of probes to exclude:", len(probes_to_exclude))

# Read p-values and find probes with high p-value in too many samples
high_pval_probes = []
if args.pval is not None:
Expand All @@ -66,6 +78,10 @@ def get_args():
"high_pval_probes.csv", index=False, header=False
)

# Combine with probes to exclude
exclude_probe_list = set(high_pval_probes).union(set(probes_to_exclude))
print("Total number of probes to exclude:", len(exclude_probe_list))

# Read beta values and compute standard deviation
data = []

Expand All @@ -78,7 +94,7 @@ def get_args():

probe = line[0]
sd = np.std([float(x) for x in line[1:]])
if probe not in high_pval_probes:
if probe not in exclude_probe_list:
data.append([probe, sd])

sd_df = pd.DataFrame(data, columns=["probe", "sd"]).set_index("probe")
Expand Down
15 changes: 15 additions & 0 deletions scripts/methylation/list-sex-probes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
library(minfi)
library(IlluminaHumanMethylationEPICmanifest)
library(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)


ann <- getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
good_probes <- rownames(ann)[ann$chr == "chrX" | ann$chr == "chrY"]

write.table(
good_probes,
Comment on lines +7 to +10
Copy link

Copilot AI Dec 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The variable name good_probes is misleading. These are sex chromosome probes, not necessarily 'good' probes. Consider renaming to sex_probes or sex_chromosome_probes for clarity.

Suggested change
good_probes <- rownames(ann)[ann$chr == "chrX" | ann$chr == "chrY"]
write.table(
good_probes,
sex_probes <- rownames(ann)[ann$chr == "chrX" | ann$chr == "chrY"]
write.table(
sex_probes,

Copilot uses AI. Check for mistakes.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

surprisingly finding myself agreeing with copilot 😅

file = "sex_probes.txt",
quote = FALSE,
row.names = FALSE,
col.names = FALSE
)
35 changes: 35 additions & 0 deletions scripts/methylation/methylation-preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ saveRDS(r_set, paste0(args$out_base, ".RSet.rds"))
gr_set <- mapToGenome(r_set)
saveRDS(gr_set, paste0(args$out_base, ".GRSet.rds"))

non_genomic_probes <- setdiff(featureNames(r_set), featureNames(gr_set))

# Get the list of sites with SNPs
snps <- getSnpInfo(gr_set)
gr_set_snps <- addSnpInfo(gr_set)
# Remove probes with SNPs at CpG or single-base extension sites
gr_set_snps <- dropLociWithSnps(gr_set_snps, snps = c("SBE", "CpG"), maf = 0)
probes_without_snps <- featureNames(gr_set_snps)

# Take the genomic mapped RatioSet and fill Beta values (non-normalized).
# Get the NON-normalized beta values:
beta <- getBeta(gr_set)
Expand All @@ -81,6 +90,32 @@ write.csv(sample_names, paste0(args$out_base, ".sampleNames.csv"))
probe_names <- featureNames(gr_set)
write.csv(probe_names, paste0(args$out_base, ".probeNames.csv"))

# Write probes with SNPs
probes_with_snps <- setdiff(probe_names, probes_without_snps)
write.table(
probes_with_snps,
paste0(args$out_base, ".probes_with_snps.tab"),
row.names = FALSE,
col.names = FALSE,
quote = FALSE,
)
write.table(
probes_without_snps,
paste0(args$out_base, ".probes_without_snps.tab"),
row.names = FALSE,
col.names = FALSE,
quote = FALSE,
)

# Write non-genomic probe list
write.table(
non_genomic_probes,
paste0(args$out_base, ".non_genomic_probes.tab"),
row.names = FALSE,
col.names = FALSE,
quote = FALSE,
)

gr <- granges(gr_set)
write.csv(gr, paste0(args$out_base, ".gr.csv"))

Expand Down
11 changes: 10 additions & 1 deletion workflows/methylation/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,16 @@
All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](http://keepachangelog.com/).


## 2025 December

### Added

- Now outputs list of probes that map to sex chromosomes [#283](https://github.com/stjudecloud/workflows/pull/283)
- Pipeline writes out the list of filtered probes [#283](https://github.com/stjudecloud/workflows/pull/283)
- Pipeline writes out list of probes that are affected by SNPs [#283](https://github.com/stjudecloud/workflows/pull/283)
- Pipeline writes out list of probes that were filtered for not mapping to genomic coordinates [#283](https://github.com/stjudecloud/workflows/pull/283)

## 2025 September

### Added
Expand Down
14 changes: 12 additions & 2 deletions workflows/methylation/methylation-cohort.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,22 @@ workflow methylation_cohort {
umap_embedding: "UMAP embedding for all samples",
umap_plot: "UMAP plot for all samples",
probe_pvalues: "Matrix (in CSV format) containing detection p-values for every (common) probe on the array as rows and all of the input samples as columns.",
high_pval_probes: "List of probes that were filtered out due to high p-values",
}
allowNestedInputs: true
}

parameter_meta {
unfiltered_normalized_beta: "Array of unfiltered normalized beta values for each sample"
sex_probe_list: "List of probes mapping to sex chromosomes to optionally filter"
p_values: "Array of detection p-value files for each sample."
skip_pvalue_check: "Skip filtering based on p-values, even if `p_values` is supplied."
num_probes: "Number of probes to use when filtering to the top `num_probes` probes with the highest standard deviation."
}

input {
Array[File] unfiltered_normalized_beta
File? sex_probe_list
Array[File] p_values = []
Boolean skip_pvalue_check = false
Int num_probes = 10000
Expand Down Expand Up @@ -120,6 +123,7 @@ workflow methylation_cohort {
),
p_values = pval_file,
num_probes,
additional_probes_to_exclude = select_all([sex_probe_list]),
}

call generate_umap { input:
Expand All @@ -142,6 +146,7 @@ workflow methylation_cohort {
File umap_embedding = generate_umap.umap
File umap_plot = plot_umap.umap_plot
File? probe_pvalues = pval_file
File? high_pval_probes = filter_probes.high_pval_probes
}
}

Expand Down Expand Up @@ -191,7 +196,7 @@ task combine_data {
}

runtime {
container: "ghcr.io/stjudecloud/pandas:2.2.1-6"
container: "ghcr.io/stjudecloud/pandas:2.2.1-7"
memory: "~{memory_gb} GB"
cpu: 1
disks: "~{disk_size_gb} GB"
Expand All @@ -206,12 +211,14 @@ task filter_probes {
outputs: {
filtered_beta_values: "Filtered beta values for all samples",
filtered_probes: "Probes that were retained after filtering.",
high_pval_probes: "Probes that were filtered out due to high p-values",
}
}

parameter_meta {
beta_values: "Beta values for all samples"
p_values: "P-values for all samples"
additional_probes_to_exclude: "Additional probes to exclude from the analysis"
prefix: "Prefix for the output files. The extensions `.beta.csv` and `.probes.csv` will be appended."
pval_threshold: "P-value cutoff to determine poor quality probes"
pval_sample_fraction: "Fraction of samples that must exceed p-value threshold to exclude probe"
Expand All @@ -221,6 +228,7 @@ task filter_probes {
input {
File beta_values
File? p_values
Array[File] additional_probes_to_exclude = []
String prefix = "filtered"
Float pval_threshold = 0.01
Float pval_sample_fraction = 0.5
Expand All @@ -237,16 +245,18 @@ task filter_probes {
--pval-threshold ~{pval_threshold} \
--pval-sample-fraction ~{pval_sample_fraction} \
~{"--pval '" + p_values + "'"} \
~{sep(" ", prefix("--exclude ", quote(additional_probes_to_exclude)))} \
"~{beta_values}"
>>>

output {
File filtered_beta_values = "~{prefix}.beta.csv"
File filtered_probes = "~{prefix}.probes.csv"
File? high_pval_probes = "high_pval_probes.csv"
}

runtime {
container: "ghcr.io/stjudecloud/pandas:2.2.1-6"
container: "ghcr.io/stjudecloud/pandas:2.2.1-7"
memory: "8 GB"
cpu: 1
disks: "~{disk_size_gb} GB"
Expand Down
41 changes: 40 additions & 1 deletion workflows/methylation/methylation-preprocess.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ task process_raw_idats {
m_values: "M values",
probe_names: "Probe names found on the array",
probe_pvalues: "Matrix (in CSV format) containing detection p-values for every (common) probe on the array as rows.",
probes_with_snps: "List of probes that contain SNPs",
probes_without_snps: "List of probes that do not contain SNPs",
non_genomic_probes: "List of probes that do not map to the genome",
}
}

Expand Down Expand Up @@ -45,6 +48,8 @@ task process_raw_idats {
--idat_base "~{idat_base}" \
--out_base "~{out_base}" \
--seed ~{seed}

rm ./*.idat
>>>

output {
Expand All @@ -58,13 +63,47 @@ task process_raw_idats {
File m_values = out_base + ".m_values.csv"
File probe_names = out_base + ".probeNames.csv"
File probe_pvalues = out_base + ".detectionP.csv"
File probes_with_snps = out_base + ".probes_with_snps.tab"
File probes_without_snps = out_base + ".probes_without_snps.tab"
File non_genomic_probes = out_base + ".non_genomic_probes.tab"
}

runtime {
container: "ghcr.io/stjudecloud/minfi:1.48.0-7"
container: "ghcr.io/stjudecloud/minfi:1.48.0-8"
memory: "8 GB"
cpu: 1
disks: "~{disk_size_gb} GB"
maxRetries: 1
}
}

task list_sex_probes {
meta {
description: "List probes that map to the sex chromosomes"
outputs: {
probe_list: "List of probe names that map to the sex chromosomes"
}
}

parameter_meta {}

input {}

command <<<
set -euo pipefail

Rscript /scripts/methylation/list-sex-probes.R
>>>

output {
File probe_list = "sex_probes.txt"
}

runtime {
container: "ghcr.io/stjudecloud/minfi:1.48.0-8"
memory: "8 GB"
cpu: 1
disks: "2 GB"
maxRetries: 1
}
}
Loading
Loading