From 9e59e596dc331ce76016a339adcd94e833435440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Fri, 21 Nov 2025 15:16:29 +0100 Subject: [PATCH 1/8] chore: add calculation of DP and AD for all sites with coverage --- workflow/envs/var_calling.yaml | 1 + workflow/rules/vaf.smk | 48 ++++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/workflow/envs/var_calling.yaml b/workflow/envs/var_calling.yaml index 49ff305..8ea0b9d 100644 --- a/workflow/envs/var_calling.yaml +++ b/workflow/envs/var_calling.yaml @@ -4,3 +4,4 @@ channels: dependencies: - ivar==1.4.2 - samtools==1.17 + - bcftools==1.17 diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index 2a2560b..3093944 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -14,7 +14,8 @@ rule snps_to_ancestor: bam = get_input_bam, gff = OUTDIR/"reference.gff3" output: - tsv = temp(OUTDIR/"vaf"/"{sample}.tsv") + tsv = temp(OUTDIR/"vaf"/"{sample}.tsv"), + reference_fasta_renamed = temp(OUTDIR/"vaf"/"{sample}.reference.fasta"), log: LOGDIR / "snps_to_ancestor" / "{sample}.log.txt" shell: @@ -27,9 +28,9 @@ rule snps_to_ancestor: echo Reference: $ref echo FASTA before: grep ">" {input.reference_fasta} - sed 's/>.*/>'$ref'/g' {input.reference_fasta} > renamed_reference.fasta + sed 's/>.*/>'$ref'/g' {input.reference_fasta} >{output.reference_fasta_renamed:q} echo FASTA after: - grep ">" renamed_reference.fasta + grep ">" {output.reference_fasta_renamed:q} echo Starting VC samtools mpileup \ @@ -39,7 +40,7 @@ rule snps_to_ancestor: --count-orphans \ --no-BAQ \ -Q {params.mpileup_quality} \ - -f renamed_reference.fasta \ + -f {output.reference_fasta_renamed:q} \ {input.bam} \ | ivar variants \ -p variants \ @@ -47,7 +48,7 @@ rule snps_to_ancestor: -t {params.ivar_freq} \ -m {params.ivar_depth} \ -g {input.gff} \ - -r renamed_reference.fasta + -r {output.reference_fasta_renamed:q} mv variants.tsv {output.tsv:q} """ @@ -205,6 +206,43 @@ use rule concat_vcf_fields as concat_variants with: OUTDIR/f"{OUTPUT_NAME}.variants.tsv", +rule samtools_depth_all_sites: + threads: 1 + conda: "../envs/var_calling.yaml" + params: + min_mq = 0, + min_bq = config["VC"]["MIN_QUALITY"], + input: + bam = get_input_bam, + output: + OUTDIR / "samtools_depth_all_sites" / "{sample}.depth.tsv" + log: + LOGDIR / "samtools_depth_all_sites" / "{sample}.txt", + shell: + "samtools depth --threads {threads} -a -H -J -Q {params.min_mq} -q {params.min_bq} -o {output:q} {input:q} >{log:q} 2>&1" + + +rule bcftools_mpileup_all_sites: + threads: 1 + conda: "../envs/var_calling.yaml" + params: + min_mq = 0, + min_bq = config["VC"]["MIN_QUALITY"], + input: + bam = get_input_bam, + reference = OUTDIR/"vaf"/"{sample}.reference.fasta", + output: + mpileup = OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.mpileup.vcf", + query = OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.query.tsv", + log: + mpileup = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.mpileup.txt", + query = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.query.txt", + shell: + "bcftools mpileup --ignore-RG -a AD,ADF,ADR --fasta-ref {input.reference:q} --threads {threads} -Q {params.min_mq} -q {params.min_bq} -Ov -o {output.mpileup:q} {input.bam:q} >{log.mpileup:q} 2>&1 && " + "echo 'CHROM\tPOS\tREF\tALT\tDP\tAD\tADF\tADR' >{output.query:q} && " + "bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t[ %AD]\t[ %ADF]\t[ %ADR]\n' {output.mpileup:q} >>{output.query:q} 2>{log.query:q}" + + rule pairwise_trajectory_correlation: conda: "../envs/renv.yaml" input: From 3dfb4c36cb3dabfcc27319903b89faa56f8f90df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Fri, 21 Nov 2025 15:17:13 +0100 Subject: [PATCH 2/8] chore: update new VAF environment in Dockerfile --- Dockerfile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7f6ac6a..2bb0e5a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM condaforge/miniforge3:latest LABEL io.github.snakemake.containerized="true" -LABEL io.github.snakemake.conda_env_hash="7071b22b1161190c06be3ac061ab0019a1a8d3038532c9134e070a3875414ef5" +LABEL io.github.snakemake.conda_env_hash="a74592ad8bd4ec05cb4d3f2bde91dd12ba674df1828dc48cc7e2c39b95aed05c" # Step 2: Retrieve conda environments @@ -149,15 +149,16 @@ COPY workflow/envs/snpeff.yaml /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/envi # Conda environment: # source: workflow/envs/var_calling.yaml -# prefix: /conda-envs/5150d0f0a91d7f7a789a06f453d63479 +# prefix: /conda-envs/81e46c677a6cc0618c93963d57d17d3f # channels: # - conda-forge # - bioconda # dependencies: # - ivar==1.4.2 # - samtools==1.17 -RUN mkdir -p /conda-envs/5150d0f0a91d7f7a789a06f453d63479 -COPY workflow/envs/var_calling.yaml /conda-envs/5150d0f0a91d7f7a789a06f453d63479/environment.yaml +# - bcftools==1.17 +RUN mkdir -p /conda-envs/81e46c677a6cc0618c93963d57d17d3f +COPY workflow/envs/var_calling.yaml /conda-envs/81e46c677a6cc0618c93963d57d17d3f/environment.yaml # Step 3: Generate conda environments @@ -172,7 +173,7 @@ RUN conda env create --prefix /conda-envs/9c24a867826615972cc288081976e7fc --fil conda env create --prefix /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1 --file /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1/environment.yaml && \ conda env create --prefix /conda-envs/8ad6cdcf265d30289788da99d5bf9fff --file /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/environment.yaml && \ conda env create --prefix /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95 --file /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/environment.yaml && \ - conda env create --prefix /conda-envs/5150d0f0a91d7f7a789a06f453d63479 --file /conda-envs/5150d0f0a91d7f7a789a06f453d63479/environment.yaml && \ + conda env create --prefix /conda-envs/81e46c677a6cc0618c93963d57d17d3f --file /conda-envs/81e46c677a6cc0618c93963d57d17d3f/environment.yaml && \ conda clean --all -y # Step 4: Run post-deploy scripts From c03a4ddf4e2a158e8f43834c43bb2cdb40ab1c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Tue, 2 Dec 2025 12:07:19 +0100 Subject: [PATCH 3/8] chore: update mpileup commands quality options --- workflow/rules/vaf.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index 3093944..bb8f63d 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -219,7 +219,7 @@ rule samtools_depth_all_sites: log: LOGDIR / "samtools_depth_all_sites" / "{sample}.txt", shell: - "samtools depth --threads {threads} -a -H -J -Q {params.min_mq} -q {params.min_bq} -o {output:q} {input:q} >{log:q} 2>&1" + "samtools depth --threads {threads} -a -H -J -Q {params.min_bq} -q {params.min_mq} -o {output:q} {input:q} >{log:q} 2>&1" rule bcftools_mpileup_all_sites: @@ -238,7 +238,7 @@ rule bcftools_mpileup_all_sites: mpileup = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.mpileup.txt", query = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.query.txt", shell: - "bcftools mpileup --ignore-RG -a AD,ADF,ADR --fasta-ref {input.reference:q} --threads {threads} -Q {params.min_mq} -q {params.min_bq} -Ov -o {output.mpileup:q} {input.bam:q} >{log.mpileup:q} 2>&1 && " + "bcftools mpileup --no-BAQ -a AD,ADF,ADR --fasta-ref {input.reference:q} --threads {threads} -Q {params.min_bq} -q {params.min_mq} -Ov -o {output.mpileup:q} {input.bam:q} >{log.mpileup:q} 2>&1 && " "echo 'CHROM\tPOS\tREF\tALT\tDP\tAD\tADF\tADR' >{output.query:q} && " "bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t[ %AD]\t[ %ADF]\t[ %ADR]\n' {output.mpileup:q} >>{output.query:q} 2>{log.query:q}" From bd911f190da290204bdef0765f1e8fc68b1fbd9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Tue, 2 Dec 2025 12:15:43 +0100 Subject: [PATCH 4/8] chore: add AD filter for all sites --- workflow/rules/vaf.smk | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index bb8f63d..b051090 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -243,6 +243,32 @@ rule bcftools_mpileup_all_sites: "bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t[ %AD]\t[ %ADF]\t[ %ADR]\n' {output.mpileup:q} >>{output.query:q} 2>{log.query:q}" +rule filter_mpileup_all_sites: + threads: 1 + params: + min_total_AD = config["VC"]["MIN_DEPTH"], + min_total_ADF = 0, + min_total_ADR = 0, + input: + OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.query.tsv", + output: + OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.filtered.tsv", + log: + LOGDIR / "filter_mpileup_all_sites" / "{sample}.txt" + run: + import pandas as pd + df = pd.read_csv(input[0], sep="\t") + df["ref_AD"] = df.AD.str.split(",").apply(lambda values: int(values[0])) + df["total_AD"] = df.AD.str.split(",").apply(lambda values: sum(int(n) for n in values)) + df["total_ADF"] = df.ADF.str.split(",").apply(lambda values: sum(int(n) for n in values)) + df["total_ADR"] = df.ADR.str.split(",").apply(lambda values: sum(int(n) for n in values)) + df[ + (df.total_AD >= params.min_total_AD) & + (df.total_ADF >= params.min_total_ADF) & + (df.total_ADR >= params.min_total_ADR) + ].to_csv(output[0], sep="\t", index=False) + + rule pairwise_trajectory_correlation: conda: "../envs/renv.yaml" input: From d3bae8afe5b113dc3b5f7c3edef2f6365ad32737 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Tue, 23 Dec 2025 13:24:06 +0100 Subject: [PATCH 5/8] refactor: extract extra BCFtools options to param --- workflow/rules/vaf.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index b051090..ad9b576 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -228,6 +228,7 @@ rule bcftools_mpileup_all_sites: params: min_mq = 0, min_bq = config["VC"]["MIN_QUALITY"], + mpileup_extra = "--no-BAQ" input: bam = get_input_bam, reference = OUTDIR/"vaf"/"{sample}.reference.fasta", @@ -238,7 +239,7 @@ rule bcftools_mpileup_all_sites: mpileup = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.mpileup.txt", query = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.query.txt", shell: - "bcftools mpileup --no-BAQ -a AD,ADF,ADR --fasta-ref {input.reference:q} --threads {threads} -Q {params.min_bq} -q {params.min_mq} -Ov -o {output.mpileup:q} {input.bam:q} >{log.mpileup:q} 2>&1 && " + "bcftools mpileup {params.mpileup_extra} -a AD,ADF,ADR --fasta-ref {input.reference:q} --threads {threads} -Q {params.min_bq} -q {params.min_mq} -Ov -o {output.mpileup:q} {input.bam:q} >{log.mpileup:q} 2>&1 && " "echo 'CHROM\tPOS\tREF\tALT\tDP\tAD\tADF\tADR' >{output.query:q} && " "bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t[ %AD]\t[ %ADF]\t[ %ADR]\n' {output.mpileup:q} >>{output.query:q} 2>{log.query:q}" From 02a78e4bb0678a7cda9a821ca0bdf1e6530669a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Tue, 23 Dec 2025 19:06:15 +0100 Subject: [PATCH 6/8] feat: add low-confidence site filtering Ensures missing variants are distinguished from zero frequency values by applying depth and quality filters. Prevents low-confidence sites from biasing allele frequency analyses. --- template.qmd | 22 +++++--- workflow/rules/report.smk | 17 +++++- workflow/rules/vaf.smk | 42 +++++++++------ workflow/scripts/fill_all_sites.R | 54 +++++++++++++++++++ ...=> pairwise_trajectory_correlation_data.R} | 18 +++---- .../scripts/report/af_time_correlation_data.R | 4 +- 6 files changed, 120 insertions(+), 37 deletions(-) create mode 100644 workflow/scripts/fill_all_sites.R rename workflow/scripts/{pairwise_trajectory_correlation.R => pairwise_trajectory_correlation_data.R} (70%) diff --git a/template.qmd b/template.qmd index 3c2d044..edd048d 100644 --- a/template.qmd +++ b/template.qmd @@ -92,8 +92,7 @@ n.samples <- table %>% pull(Sample) %>% unique() %>% length() heatmap.df <- read.csv(params$heat_tab) row.names(heatmap.df) <- heatmap.df$`...1` heatmap.df[1] <- NULL -cor.mat <- cor(heatmap.df, method = params$cor_method) -cor.mat[is.na(cor.mat)] <- 0 +cor.mat <- cor(heatmap.df, method = params$cor_method, use = "pairwise.complete.obs") if (params$cor_method == "pearson") { cor.method.name <- "Pearson's" } else if (params$cor_method == "spearman") { @@ -247,10 +246,11 @@ Labeled dots represent nucleotide variants whose allele frequency is correlated Significantly correlated nucleotide variants are described in more detail in @fig-panel. -![Time series of relative allele frequencies. The shown positions include -nucleotide variants with a significant correlation with time and sites with more -than two possible states. Each subplot depicts the progression of the allele -frequencies in time for a given genome position.](`r params$panel`){#fig-panel} +![Time series of relative allele frequencies. Nucleotide variants with a significant +correlation with time and sites with more than two possible states are included. Each +subplot depicts the progression of the allele frequencies in time for a given genome +position; lines connecting points indicate data from contiguous time points, while +unconnected points denote intervals with missing data.](`r params$panel`){#fig-panel} A `r dist.tree.algo` tree has been constructed using pairwise distances between target samples (@fig-tree), based on the allele frequencies measured from @@ -277,8 +277,18 @@ The heatmap is interactive and allows zooming in on specific regions. ```{r fig-heatmap, echo = F, message = F, warning = F, fig.align = 'center'} #| fig-cap: "Interactive heatmap with hierarchical clustering of the pairwise correlation coefficients between the time series of allele frequencies in the case study." +# Calculate dendrograms using data with no NAs +cor.mat.clustering <- cor.mat +cor.mat.clustering[is.na(cor.mat.clustering)] <- 0 +dist.rows <- dist(cor.mat.clustering) +dend.rows <- hclust(dist.rows) +dend.cols <- hclust(dist(t(cor.mat.clustering))) + +# Display custom dendrograms over real data heatmaply_cor( cor.mat, + Rowv = as.dendrogram(dend.rows), + Colv = as.dendrogram(dend.cols), grid_gap = 0.25, fontsize_row = 7, fontsize_col = 7, diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 77aa6aa..a112e62 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -345,7 +345,7 @@ rule af_time_correlation_data: cor_method = config["COR"]["METHOD"], cor_exact = config["COR"]["EXACT"], input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", + variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv", metadata = config["METADATA"], output: fmt_variants = temp(REPORT_DIR_TABLES/"variants.filled.dated.tsv"), @@ -392,6 +392,19 @@ rule af_trajectory_panel_plot: "../scripts/report/af_trajectory_panel_plot.R" +rule pairwise_trajectory_correlation_data: + conda: "../envs/renv.yaml" + input: + variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv", + metadata = config["METADATA"], + output: + table = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation.csv"), + log: + LOGDIR / "pairwise_trajectory_correlation_data" / "log.txt" + script: + "../scripts/pairwise_trajectory_correlation_data.R" + + rule summary_table: conda: "../envs/renv.yaml" input: @@ -423,7 +436,7 @@ rule report: temest = report(REPORT_DIR_PLOTS/"time_signal.png"), evo = report(REPORT_DIR_PLOTS/"dn_and_ds.png"), omega_plot = report(REPORT_DIR_PLOTS/"dnds.png"), - heat_table = report(OUTDIR/"vaf"/"pairwise_trajectory_correlation.csv"), + heat_table = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation.csv"), freyja_ts = OUTDIR/"demixing"/"freyja_data"/"last_barcode_update.txt", value = REPORT_DIR_TABLES/"diversity.json", stats_lm = REPORT_DIR_TABLES/"time_signal.json", diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index ad9b576..134f088 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -233,8 +233,8 @@ rule bcftools_mpileup_all_sites: bam = get_input_bam, reference = OUTDIR/"vaf"/"{sample}.reference.fasta", output: - mpileup = OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.mpileup.vcf", - query = OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.query.tsv", + mpileup = temp(OUTDIR / "all_sites" / "{sample}.mpileup.vcf"), + query = temp(OUTDIR / "all_sites" / "{sample}.query.tsv"), log: mpileup = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.mpileup.txt", query = LOGDIR / "bcftools_mpileup_all_sites" / "{sample}.query.txt", @@ -251,33 +251,41 @@ rule filter_mpileup_all_sites: min_total_ADF = 0, min_total_ADR = 0, input: - OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.query.tsv", + OUTDIR / "all_sites" / "{sample}.query.tsv", output: - OUTDIR / "bcftools_mpileup_all_sites" / "{sample}.filtered.tsv", + temp(OUTDIR / "all_sites" / "{sample}.filtered_sites.tsv"), log: LOGDIR / "filter_mpileup_all_sites" / "{sample}.txt" run: import pandas as pd df = pd.read_csv(input[0], sep="\t") - df["ref_AD"] = df.AD.str.split(",").apply(lambda values: int(values[0])) - df["total_AD"] = df.AD.str.split(",").apply(lambda values: sum(int(n) for n in values)) - df["total_ADF"] = df.ADF.str.split(",").apply(lambda values: sum(int(n) for n in values)) - df["total_ADR"] = df.ADR.str.split(",").apply(lambda values: sum(int(n) for n in values)) + df["SAMPLE"] = wildcards.sample + df["REF_AD"] = df.AD.str.split(",").apply(lambda values: int(values[0])) + df["TOTAL_AD"] = df.AD.str.split(",").apply(lambda values: sum(int(n) for n in values)) + df["TOTAL_ADF"] = df.ADF.str.split(",").apply(lambda values: sum(int(n) for n in values)) + df["TOTAL_ADR"] = df.ADR.str.split(",").apply(lambda values: sum(int(n) for n in values)) df[ - (df.total_AD >= params.min_total_AD) & - (df.total_ADF >= params.min_total_ADF) & - (df.total_ADR >= params.min_total_ADR) + (df.TOTAL_AD >= params.min_total_AD) & + (df.TOTAL_ADF >= params.min_total_ADF) & + (df.TOTAL_ADR >= params.min_total_ADR) ].to_csv(output[0], sep="\t", index=False) -rule pairwise_trajectory_correlation: +use rule concat_vcf_fields as merge_filtered_mpileup_all_sites with: + input: + expand(OUTDIR / "all_sites" / "{sample}.filtered_sites.tsv", sample=iter_samples()), + output: + OUTDIR / f"{OUTPUT_NAME}.filtered_sites.tsv", + + +rule fill_all_sites: conda: "../envs/renv.yaml" input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", - metadata = config["METADATA"], + variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", + sites = OUTDIR / f"{OUTPUT_NAME}.filtered_sites.tsv", output: - table = report(OUTDIR/"vaf"/"pairwise_trajectory_correlation.csv"), + variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv", log: - LOGDIR / "pairwise_trajectory_correlation" / "log.txt" + LOGDIR / "fill_all_sites" / "log.txt" script: - "../scripts/pairwise_trajectory_correlation.R" + "../scripts/fill_all_sites.R" diff --git a/workflow/scripts/fill_all_sites.R b/workflow/scripts/fill_all_sites.R new file mode 100644 index 0000000..3cb8d13 --- /dev/null +++ b/workflow/scripts/fill_all_sites.R @@ -0,0 +1,54 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(dplyr) +library(readr) +library(tidyr) +library(logger) + +log_threshold(INFO) + +log_info("Reading variants") +variants <- read_tsv(snakemake@input[["variants"]]) + +# Create a mapping of variant names to their genomic position +variant_coords <- variants %>% + select(VARIANT_NAME, REGION, POS) %>% + distinct() + +log_info("Reading filtered sites") +sites <- read_tsv(snakemake@input[["sites"]]) %>% + select(SAMPLE, POS) %>% # TODO: consider region/chrom + distinct() %>% + mutate(FILTER_PASS = TRUE) + +log_info("Processing variants") +all_variants <- variants %>% + # Select minimal columns + select(VARIANT_NAME, REGION, SAMPLE, ALT_FREQ) %>% + # Handle duplicates + group_by(SAMPLE, VARIANT_NAME, REGION) %>% + summarise(ALT_FREQ = sum(ALT_FREQ, na.rm = TRUE), .groups = "drop") %>% + # Complete with NA + complete(SAMPLE, VARIANT_NAME, REGION) %>% + # Assign genomic positions for all combinations + left_join(variant_coords, by = c("REGION", "VARIANT_NAME")) %>% + # Merge filtered sites + # TODO: consider region/chrom + left_join(sites, by = c("SAMPLE", "POS")) %>% + replace_na(list(FILTER_PASS = FALSE)) %>% + # Fill missing frequencies conditionally + mutate( + ALT_FREQ = case_when( + !is.na(ALT_FREQ) ~ ALT_FREQ, # variant was called, keep the frequency + FILTER_PASS ~ 0, # missing but site is covered: 0 (reference) + TRUE ~ NA # missing and not covered: NA (unknown) + ) + ) + +log_info("Saving table") +write_tsv(all_variants, snakemake@output[["variants"]]) diff --git a/workflow/scripts/pairwise_trajectory_correlation.R b/workflow/scripts/pairwise_trajectory_correlation_data.R similarity index 70% rename from workflow/scripts/pairwise_trajectory_correlation.R rename to workflow/scripts/pairwise_trajectory_correlation_data.R index 27a8dfc..9fe7cc6 100644 --- a/workflow/scripts/pairwise_trajectory_correlation.R +++ b/workflow/scripts/pairwise_trajectory_correlation_data.R @@ -14,6 +14,7 @@ library(logger) log_threshold(INFO) +log_info("Reading variants") variants <- read_tsv(snakemake@input[["variants"]]) # Obtain sample names ordered by CollectionDate @@ -22,20 +23,17 @@ date_order <- read_csv(snakemake@input[["metadata"]]) %>% pull(ID) %>% unique() -# Create SNP variable and select useful variables -variants <- variants %>% select(VARIANT_NAME, SAMPLE, ALT_FREQ) - -variants <- variants %>% +all_variants_wider <- variants %>% + select(SAMPLE, VARIANT_NAME, ALT_FREQ) %>% pivot_wider( names_from = VARIANT_NAME, - values_from = ALT_FREQ, - values_fill = 0, - values_fn = sum + values_from = ALT_FREQ ) %>% + # Apply chronological ordering arrange(factor(SAMPLE, levels = date_order)) %>% - # Removes "|"-separated annotations, keeping the first one + ellipsis (clarifies heatmap) + # Removes "|"-separated annotations, keeping the first one + ellipsis rename_with(~ str_replace(., "^([^|]+)\\|.*$", "\\1(...)"), -SAMPLE) %>% column_to_rownames(var = "SAMPLE") -log_info("Saving table to create heatmap") -write.csv(variants, snakemake@output[["table"]]) +log_info("Saving table") +write.csv(all_variants_wider, snakemake@output[["table"]]) diff --git a/workflow/scripts/report/af_time_correlation_data.R b/workflow/scripts/report/af_time_correlation_data.R index d3415c0..90daf30 100644 --- a/workflow/scripts/report/af_time_correlation_data.R +++ b/workflow/scripts/report/af_time_correlation_data.R @@ -24,11 +24,11 @@ variants <- read_delim( "POS" ) ) %>% - # Fill positions without alt frequency with 0 + # Fill positions without alt frequency with NA complete( nesting(REGION, VARIANT_NAME, POS), SAMPLE, - fill = list(ALT_FREQ = 0) + fill = list(ALT_FREQ = NA) ) log_info("Reading metadata") From 6dfb966da454341b81fac0b00ee51896853e09f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Tue, 23 Dec 2025 19:18:00 +0100 Subject: [PATCH 7/8] refactor: calculate AF correlation matrix before rendering report --- template.qmd | 8 ++++---- workflow/rules/report.smk | 10 +++++++--- .../pairwise_trajectory_correlation_data.R | 13 ++++++++++++- 3 files changed, 23 insertions(+), 8 deletions(-) rename workflow/scripts/{ => report}/pairwise_trajectory_correlation_data.R (76%) diff --git a/template.qmd b/template.qmd index edd048d..8e3b6c3 100644 --- a/template.qmd +++ b/template.qmd @@ -65,6 +65,7 @@ library(tidyr) library(dplyr) library(heatmaply) library(readr) +library(tibble) # Diversity div_values <- fromJSON(params$div_value) @@ -89,10 +90,9 @@ table <- read.csv(params$table) n.samples <- table %>% pull(Sample) %>% unique() %>% length() # Heatmap -heatmap.df <- read.csv(params$heat_tab) -row.names(heatmap.df) <- heatmap.df$`...1` -heatmap.df[1] <- NULL -cor.mat <- cor(heatmap.df, method = params$cor_method, use = "pairwise.complete.obs") +cor.mat <- read_csv(params$heat_tab) %>% + column_to_rownames("...1") %>% + as.matrix() if (params$cor_method == "pearson") { cor.method.name <- "Pearson's" } else if (params$cor_method == "spearman") { diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index a112e62..1d709d1 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -394,15 +394,19 @@ rule af_trajectory_panel_plot: rule pairwise_trajectory_correlation_data: conda: "../envs/renv.yaml" + params: + cor_method = config["COR"]["METHOD"], + cor_use = "pairwise.complete.obs", input: variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv", metadata = config["METADATA"], output: - table = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation.csv"), + table = REPORT_DIR_TABLES/"pairwise_trajectory_frequency_data.csv", + matrix = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation_matrix.csv"), log: LOGDIR / "pairwise_trajectory_correlation_data" / "log.txt" script: - "../scripts/pairwise_trajectory_correlation_data.R" + "../scripts/report/pairwise_trajectory_correlation_data.R" rule summary_table: @@ -436,7 +440,7 @@ rule report: temest = report(REPORT_DIR_PLOTS/"time_signal.png"), evo = report(REPORT_DIR_PLOTS/"dn_and_ds.png"), omega_plot = report(REPORT_DIR_PLOTS/"dnds.png"), - heat_table = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation.csv"), + heat_table = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation_matrix.csv"), freyja_ts = OUTDIR/"demixing"/"freyja_data"/"last_barcode_update.txt", value = REPORT_DIR_TABLES/"diversity.json", stats_lm = REPORT_DIR_TABLES/"time_signal.json", diff --git a/workflow/scripts/pairwise_trajectory_correlation_data.R b/workflow/scripts/report/pairwise_trajectory_correlation_data.R similarity index 76% rename from workflow/scripts/pairwise_trajectory_correlation_data.R rename to workflow/scripts/report/pairwise_trajectory_correlation_data.R index 9fe7cc6..dc51c85 100644 --- a/workflow/scripts/pairwise_trajectory_correlation_data.R +++ b/workflow/scripts/report/pairwise_trajectory_correlation_data.R @@ -23,6 +23,7 @@ date_order <- read_csv(snakemake@input[["metadata"]]) %>% pull(ID) %>% unique() +log_info("Formatting variants") all_variants_wider <- variants %>% select(SAMPLE, VARIANT_NAME, ALT_FREQ) %>% pivot_wider( @@ -35,5 +36,15 @@ all_variants_wider <- variants %>% rename_with(~ str_replace(., "^([^|]+)\\|.*$", "\\1(...)"), -SAMPLE) %>% column_to_rownames(var = "SAMPLE") -log_info("Saving table") +log_info("Saving table of frequencies") write.csv(all_variants_wider, snakemake@output[["table"]]) + +log_info("Calculating correlations") +cor.mat <- cor( + all_variants_wider, + method = snakemake@params$cor_method, + use = snakemake@params$cor_use +) + +log_info("Saving correlation matrix") +write.csv(cor.mat, snakemake@output[["matrix"]]) From 78f7f9dca4dbda1b452af23c4e55cd5447b7b28c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Tue, 23 Dec 2025 19:22:38 +0100 Subject: [PATCH 8/8] chore: remove unused rule --- workflow/rules/vaf.smk | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index 134f088..c42d580 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -206,22 +206,6 @@ use rule concat_vcf_fields as concat_variants with: OUTDIR/f"{OUTPUT_NAME}.variants.tsv", -rule samtools_depth_all_sites: - threads: 1 - conda: "../envs/var_calling.yaml" - params: - min_mq = 0, - min_bq = config["VC"]["MIN_QUALITY"], - input: - bam = get_input_bam, - output: - OUTDIR / "samtools_depth_all_sites" / "{sample}.depth.tsv" - log: - LOGDIR / "samtools_depth_all_sites" / "{sample}.txt", - shell: - "samtools depth --threads {threads} -a -H -J -Q {params.min_bq} -q {params.min_mq} -o {output:q} {input:q} >{log:q} 2>&1" - - rule bcftools_mpileup_all_sites: threads: 1 conda: "../envs/var_calling.yaml"