11 changes: 6 additions & 5 deletions Dockerfile
@@ -1,6 +1,6 @@
FROM condaforge/miniforge3:latest
LABEL io.github.snakemake.containerized="true"
LABEL io.github.snakemake.conda_env_hash="7071b22b1161190c06be3ac061ab0019a1a8d3038532c9134e070a3875414ef5"
LABEL io.github.snakemake.conda_env_hash="a74592ad8bd4ec05cb4d3f2bde91dd12ba674df1828dc48cc7e2c39b95aed05c"

# Step 2: Retrieve conda environments

@@ -149,15 +149,16 @@ COPY workflow/envs/snpeff.yaml /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/envi

# Conda environment:
# source: workflow/envs/var_calling.yaml
# prefix: /conda-envs/5150d0f0a91d7f7a789a06f453d63479
# prefix: /conda-envs/81e46c677a6cc0618c93963d57d17d3f
# channels:
# - conda-forge
# - bioconda
# dependencies:
# - ivar==1.4.2
# - samtools==1.17
RUN mkdir -p /conda-envs/5150d0f0a91d7f7a789a06f453d63479
COPY workflow/envs/var_calling.yaml /conda-envs/5150d0f0a91d7f7a789a06f453d63479/environment.yaml
# - bcftools==1.17
RUN mkdir -p /conda-envs/81e46c677a6cc0618c93963d57d17d3f
COPY workflow/envs/var_calling.yaml /conda-envs/81e46c677a6cc0618c93963d57d17d3f/environment.yaml

# Step 3: Generate conda environments

@@ -172,7 +173,7 @@ RUN conda env create --prefix /conda-envs/9c24a867826615972cc288081976e7fc --fil
conda env create --prefix /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1 --file /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1/environment.yaml && \
conda env create --prefix /conda-envs/8ad6cdcf265d30289788da99d5bf9fff --file /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/environment.yaml && \
conda env create --prefix /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95 --file /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/environment.yaml && \
conda env create --prefix /conda-envs/5150d0f0a91d7f7a789a06f453d63479 --file /conda-envs/5150d0f0a91d7f7a789a06f453d63479/environment.yaml && \
conda env create --prefix /conda-envs/81e46c677a6cc0618c93963d57d17d3f --file /conda-envs/81e46c677a6cc0618c93963d57d17d3f/environment.yaml && \
conda clean --all -y

# Step 4: Run post-deploy scripts
28 changes: 19 additions & 9 deletions template.qmd
@@ -65,6 +65,7 @@ library(tidyr)
library(dplyr)
library(heatmaply)
library(readr)
library(tibble)

# Diversity
div_values <- fromJSON(params$div_value)
@@ -89,11 +90,9 @@ table <- read.csv(params$table)
n.samples <- table %>% pull(Sample) %>% unique() %>% length()

# Heatmap
heatmap.df <- read.csv(params$heat_tab)
row.names(heatmap.df) <- heatmap.df$`...1`
heatmap.df[1] <- NULL
cor.mat <- cor(heatmap.df, method = params$cor_method)
cor.mat[is.na(cor.mat)] <- 0
cor.mat <- read_csv(params$heat_tab) %>%
  column_to_rownames("...1") %>%
  as.matrix()
if (params$cor_method == "pearson") {
  cor.method.name <- "Pearson's"
} else if (params$cor_method == "spearman") {
@@ -247,10 +246,11 @@ Labeled dots represent nucleotide variants whose allele frequency is correlated

Significantly correlated nucleotide variants are described in more detail in @fig-panel.

![Time series of relative allele frequencies. The shown positions include
nucleotide variants with a significant correlation with time and sites with more
than two possible states. Each subplot depicts the progression of the allele
frequencies in time for a given genome position.](`r params$panel`){#fig-panel}
![Time series of relative allele frequencies. Nucleotide variants with a significant
correlation with time and sites with more than two possible states are included. Each
subplot depicts the progression of the allele frequencies in time for a given genome
position; lines connecting points indicate data from contiguous time points, while
unconnected points denote intervals with missing data.](`r params$panel`){#fig-panel}

A `r dist.tree.algo` tree has been constructed using pairwise distances
between target samples (@fig-tree), based on the allele frequencies measured from
@@ -277,8 +277,18 @@ The heatmap is interactive and allows zooming in on specific regions.
```{r fig-heatmap, echo = F, message = F, warning = F, fig.align = 'center'}
#| fig-cap: "Interactive heatmap with hierarchical clustering of the pairwise correlation coefficients between the time series of allele frequencies in the case study."

# Calculate dendrograms using data with no NAs
cor.mat.clustering <- cor.mat
cor.mat.clustering[is.na(cor.mat.clustering)] <- 0
dist.rows <- dist(cor.mat.clustering)
dend.rows <- hclust(dist.rows)
dend.cols <- hclust(dist(t(cor.mat.clustering)))

# Display custom dendrograms over real data
heatmaply_cor(
  cor.mat,
  Rowv = as.dendrogram(dend.rows),
  Colv = as.dendrogram(dend.cols),
  grid_gap = 0.25,
  fontsize_row = 7,
  fontsize_col = 7,
1 change: 1 addition & 0 deletions workflow/envs/var_calling.yaml
@@ -4,3 +4,4 @@ channels:
dependencies:
  - ivar==1.4.2
  - samtools==1.17
  - bcftools==1.17
21 changes: 19 additions & 2 deletions workflow/rules/report.smk
@@ -345,7 +345,7 @@ rule af_time_correlation_data:
        cor_method = config["COR"]["METHOD"],
        cor_exact = config["COR"]["EXACT"],
    input:
        variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
        variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv",
        metadata = config["METADATA"],
    output:
        fmt_variants = temp(REPORT_DIR_TABLES/"variants.filled.dated.tsv"),
@@ -392,6 +392,23 @@ rule af_trajectory_panel_plot:
"../scripts/report/af_trajectory_panel_plot.R"


rule pairwise_trajectory_correlation_data:
    conda: "../envs/renv.yaml"
    params:
        cor_method = config["COR"]["METHOD"],
        cor_use = "pairwise.complete.obs",
    input:
        variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv",
        metadata = config["METADATA"],
    output:
        table = REPORT_DIR_TABLES/"pairwise_trajectory_frequency_data.csv",
        matrix = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation_matrix.csv"),
    log:
        LOGDIR / "pairwise_trajectory_correlation_data" / "log.txt"
    script:
        "../scripts/report/pairwise_trajectory_correlation_data.R"


rule summary_table:
    conda: "../envs/renv.yaml"
    input:
@@ -423,7 +440,7 @@ rule report:
        temest = report(REPORT_DIR_PLOTS/"time_signal.png"),
        evo = report(REPORT_DIR_PLOTS/"dn_and_ds.png"),
        omega_plot = report(REPORT_DIR_PLOTS/"dnds.png"),
        heat_table = report(OUTDIR/"vaf"/"pairwise_trajectory_correlation.csv"),
        heat_table = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation_matrix.csv"),
        freyja_ts = OUTDIR/"demixing"/"freyja_data"/"last_barcode_update.txt",
        value = REPORT_DIR_TABLES/"diversity.json",
        stats_lm = REPORT_DIR_TABLES/"time_signal.json",
79 changes: 68 additions & 11 deletions workflow/rules/vaf.smk
@@ -14,7 +14,8 @@ rule snps_to_ancestor:
        bam = get_input_bam,
        gff = OUTDIR/"reference.gff3"
    output:
        tsv = temp(OUTDIR/"vaf"/"{sample}.tsv")
        tsv = temp(OUTDIR/"vaf"/"{sample}.tsv"),
        reference_fasta_renamed = temp(OUTDIR/"vaf"/"{sample}.reference.fasta"),
    log:
        LOGDIR / "snps_to_ancestor" / "{sample}.log.txt"
    shell:
@@ -27,9 +28,9 @@
        echo Reference: $ref
        echo FASTA before:
        grep ">" {input.reference_fasta}
        sed 's/>.*/>'$ref'/g' {input.reference_fasta} > renamed_reference.fasta
        sed 's/>.*/>'$ref'/g' {input.reference_fasta} >{output.reference_fasta_renamed:q}
        echo FASTA after:
        grep ">" renamed_reference.fasta
        grep ">" {output.reference_fasta_renamed:q}

        echo Starting VC
        samtools mpileup \
            --count-orphans \
            --no-BAQ \
            -Q {params.mpileup_quality} \
            -f renamed_reference.fasta \
            -f {output.reference_fasta_renamed:q} \
            {input.bam} \
        | ivar variants \
            -p variants \
            -q {params.ivar_quality} \
            -t {params.ivar_freq} \
            -m {params.ivar_depth} \
            -g {input.gff} \
            -r renamed_reference.fasta
            -r {output.reference_fasta_renamed:q}
        mv variants.tsv {output.tsv:q}
        """

@@ -205,14 +206,70 @@ use rule concat_vcf_fields as concat_variants with:
        OUTDIR/f"{OUTPUT_NAME}.variants.tsv",


rule pairwise_trajectory_correlation:
rule bcftools_mpileup_all_sites:
    threads: 1
    conda: "../envs/var_calling.yaml"
    params:
        min_mq = 0,
        min_bq = config["VC"]["MIN_QUALITY"],
        mpileup_extra = "--no-BAQ"
    input:
        bam = get_input_bam,
        reference = OUTDIR/"vaf"/"{sample}.reference.fasta",
    output:
        mpileup = temp(OUTDIR / "all_sites" / "{sample}.mpileup.vcf"),
        query = temp(OUTDIR / "all_sites" / "{sample}.query.tsv"),
    log:
        mpileup = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.mpileup.txt",
        query = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.query.txt",
    shell:
        "bcftools mpileup {params.mpileup_extra} -a AD,ADF,ADR --fasta-ref {input.reference:q} --threads {threads} -Q {params.min_bq} -q {params.min_mq} -Ov -o {output.mpileup:q} {input.bam:q} >{log.mpileup:q} 2>&1 && "
        "echo 'CHROM\tPOS\tREF\tALT\tDP\tAD\tADF\tADR' >{output.query:q} && "
        "bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t[ %AD]\t[ %ADF]\t[ %ADR]\n' {output.mpileup:q} >>{output.query:q} 2>{log.query:q}"


rule filter_mpileup_all_sites:
    threads: 1
    params:
        min_total_AD = config["VC"]["MIN_DEPTH"],
        min_total_ADF = 0,
        min_total_ADR = 0,
    input:
        OUTDIR / "all_sites" / "{sample}.query.tsv",
    output:
        temp(OUTDIR / "all_sites" / "{sample}.filtered_sites.tsv"),
    log:
        LOGDIR / "filter_mpileup_all_sites" / "{sample}.txt"
    run:
        import pandas as pd
        df = pd.read_csv(input[0], sep="\t")
        df["SAMPLE"] = wildcards.sample
        df["REF_AD"] = df.AD.str.split(",").apply(lambda values: int(values[0]))
        df["TOTAL_AD"] = df.AD.str.split(",").apply(lambda values: sum(int(n) for n in values))
        df["TOTAL_ADF"] = df.ADF.str.split(",").apply(lambda values: sum(int(n) for n in values))
        df["TOTAL_ADR"] = df.ADR.str.split(",").apply(lambda values: sum(int(n) for n in values))
        df[
            (df.TOTAL_AD >= params.min_total_AD) &
            (df.TOTAL_ADF >= params.min_total_ADF) &
            (df.TOTAL_ADR >= params.min_total_ADR)
        ].to_csv(output[0], sep="\t", index=False)


use rule concat_vcf_fields as merge_filtered_mpileup_all_sites with:
    input:
        expand(OUTDIR / "all_sites" / "{sample}.filtered_sites.tsv", sample=iter_samples()),
    output:
        OUTDIR / f"{OUTPUT_NAME}.filtered_sites.tsv",


rule fill_all_sites:
    conda: "../envs/renv.yaml"
    input:
        variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
        metadata = config["METADATA"],
        variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
        sites = OUTDIR / f"{OUTPUT_NAME}.filtered_sites.tsv",
    output:
        table = report(OUTDIR/"vaf"/"pairwise_trajectory_correlation.csv"),
        variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv",
    log:
        LOGDIR / "pairwise_trajectory_correlation" / "log.txt"
        LOGDIR / "fill_all_sites" / "log.txt"
    script:
        "../scripts/pairwise_trajectory_correlation.R"
        "../scripts/fill_all_sites.R"
54 changes: 54 additions & 0 deletions workflow/scripts/fill_all_sites.R
@@ -0,0 +1,54 @@
#!/usr/bin/env Rscript

# Write stdout and stderr to log file
log <- file(snakemake@log[[1]], open = "wt")
sink(log, type = "message")
sink(log, type = "output")

library(dplyr)
library(readr)
library(tidyr)
library(logger)

log_threshold(INFO)

log_info("Reading variants")
variants <- read_tsv(snakemake@input[["variants"]])

# Create a mapping of variant names to their genomic position
variant_coords <- variants %>%
  select(VARIANT_NAME, REGION, POS) %>%
  distinct()

log_info("Reading filtered sites")
sites <- read_tsv(snakemake@input[["sites"]]) %>%
  select(SAMPLE, POS) %>% # TODO: consider region/chrom
  distinct() %>%
  mutate(FILTER_PASS = TRUE)

log_info("Processing variants")
all_variants <- variants %>%
  # Select minimal columns
  select(VARIANT_NAME, REGION, SAMPLE, ALT_FREQ) %>%
  # Handle duplicates
  group_by(SAMPLE, VARIANT_NAME, REGION) %>%
  summarise(ALT_FREQ = sum(ALT_FREQ, na.rm = TRUE), .groups = "drop") %>%
  # Complete with NA
  complete(SAMPLE, VARIANT_NAME, REGION) %>%
  # Assign genomic positions for all combinations
  left_join(variant_coords, by = c("REGION", "VARIANT_NAME")) %>%
  # Merge filtered sites
  # TODO: consider region/chrom
  left_join(sites, by = c("SAMPLE", "POS")) %>%
  replace_na(list(FILTER_PASS = FALSE)) %>%
  # Fill missing frequencies conditionally
  mutate(
    ALT_FREQ = case_when(
      !is.na(ALT_FREQ) ~ ALT_FREQ, # variant was called, keep the frequency
      FILTER_PASS ~ 0,             # missing but site is covered: 0 (reference)
      TRUE ~ NA                    # missing and not covered: NA (unknown)
    )
  )

log_info("Saving table")
write_tsv(all_variants, snakemake@output[["variants"]])
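
A minimal sketch of the conditional fill implemented above, using invented toy values rather than workflow data; it assumes dplyr >= 1.1 so the bare `NA` in `case_when()` is coerced to double, as in the script:

```r
library(dplyr)
library(tibble)

# Toy rows: one called variant, one missing call at a covered site,
# one missing call at an uncovered site.
toy <- tibble(
  ALT_FREQ    = c(0.42, NA,   NA),
  FILTER_PASS = c(TRUE, TRUE, FALSE)
)
toy %>%
  mutate(ALT_FREQ = case_when(
    !is.na(ALT_FREQ) ~ ALT_FREQ, # called: keep the frequency
    FILTER_PASS      ~ 0,        # covered but not called: reference (0)
    TRUE             ~ NA        # not covered: unknown (NA)
  ))
# Resulting ALT_FREQ column: 0.42, 0, NA
```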
4 changes: 2 additions & 2 deletions workflow/scripts/report/af_time_correlation_data.R
@@ -24,11 +24,11 @@ variants <- read_delim(
"POS"
)
) %>%
# Fill positions without alt frequency with 0
# Fill positions without alt frequency with NA
complete(
nesting(REGION, VARIANT_NAME, POS),
SAMPLE,
fill = list(ALT_FREQ = 0)
fill = list(ALT_FREQ = NA)
)

log_info("Reading metadata")
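
For the `fill = list(ALT_FREQ = NA)` change above, a toy sketch (invented values, simplified to two key columns) of what `tidyr::complete()` does with an `NA` fill:

```r
library(tidyr)
library(tibble)

toy <- tibble(
  VARIANT_NAME = c("A123T", "A123T", "C456G"),  # hypothetical variant labels
  SAMPLE       = c("s1",    "s2",    "s1"),
  ALT_FREQ     = c(0.40,    0.60,    0.10)
)
# The missing C456G/s2 combination is added as a row; leaving its ALT_FREQ as NA
# (instead of 0) keeps "not observed" distinguishable from "frequency zero".
complete(toy, VARIANT_NAME, SAMPLE, fill = list(ALT_FREQ = NA))
```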
workflow/scripts/report/pairwise_trajectory_correlation_data.R
@@ -14,6 +14,7 @@ library(logger)

log_threshold(INFO)

log_info("Reading variants")
variants <- read_tsv(snakemake@input[["variants"]])

# Obtain sample names ordered by CollectionDate
@@ -22,20 +23,28 @@ date_order <- read_csv(snakemake@input[["metadata"]]) %>%
  pull(ID) %>%
  unique()

# Create SNP variable and select useful variables
variants <- variants %>% select(VARIANT_NAME, SAMPLE, ALT_FREQ)

variants <- variants %>%
log_info("Formatting variants")
all_variants_wider <- variants %>%
  select(SAMPLE, VARIANT_NAME, ALT_FREQ) %>%
  pivot_wider(
    names_from = VARIANT_NAME,
    values_from = ALT_FREQ,
    values_fill = 0,
    values_fn = sum
    values_from = ALT_FREQ
  ) %>%
  # Apply chronological ordering
  arrange(factor(SAMPLE, levels = date_order)) %>%
  # Removes "|"-separated annotations, keeping the first one + ellipsis (clarifies heatmap)
  # Removes "|"-separated annotations, keeping the first one + ellipsis
  rename_with(~ str_replace(., "^([^|]+)\\|.*$", "\\1(...)"), -SAMPLE) %>%
  column_to_rownames(var = "SAMPLE")

log_info("Saving table to create heatmap")
write.csv(variants, snakemake@output[["table"]])
log_info("Saving table of frequencies")
write.csv(all_variants_wider, snakemake@output[["table"]])

log_info("Calculating correlations")
cor.mat <- cor(
  all_variants_wider,
  method = snakemake@params$cor_method,
  use = snakemake@params$cor_use
)

log_info("Saving correlation matrix")
write.csv(cor.mat, snakemake@output[["matrix"]])
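
A small sketch (invented numbers) of how `use = "pairwise.complete.obs"` behaves when the frequency trajectories contain `NA`s:

```r
# Each pair of columns is correlated over the samples where both are non-missing;
# pairs with too few shared observations yield NA, which the report zeroes out
# only for clustering, not for the displayed heatmap values.
m <- cbind(
  a = c(0.1, 0.5, 0.9, NA),
  b = c(0.2, 0.4, 1.0, 0.8),
  c = c(NA,  0.3, 0.6, 0.9)
)
cor(m, method = "pearson", use = "pairwise.complete.obs")
# a~b uses rows 1-3, a~c uses rows 2-3, b~c uses rows 2-4
```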