diff --git a/.dag.png b/.dag.png
index d8806e5..f3e4928 100644
Binary files a/.dag.png and b/.dag.png differ
diff --git a/.github/workflows/test_apptainer.yml b/.github/workflows/test_apptainer.yml
index d70f359..452af0d 100644
--- a/.github/workflows/test_apptainer.yml
+++ b/.github/workflows/test_apptainer.yml
@@ -26,7 +26,7 @@ jobs:
environment-file: .test/environment_apptainer.yaml
- name: Run test pipeline
shell: bash -el {0}
- run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml -c1 --sdm apptainer conda
+ run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml -c1 --sdm apptainer conda --keep-going --notemp
- name: Pack logs
if: success() || failure()
shell: bash -el {0}
diff --git a/.github/workflows/test_v7.yml b/.github/workflows/test_v7.yml
index 93c6a4e..3416131 100644
--- a/.github/workflows/test_v7.yml
+++ b/.github/workflows/test_v7.yml
@@ -26,7 +26,7 @@ jobs:
environment-file: .test/environment_v7.yaml
- name: Run test pipeline
shell: bash -el {0}
- run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml --use-conda -c1 --conda-frontend conda
+ run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml --use-conda -c1 --conda-frontend conda --keep-going --notemp
- name: Pack logs
if: success() || failure()
shell: bash -el {0}
diff --git a/.github/workflows/test_v8.yml b/.github/workflows/test_v8.yml
index 0043c36..8fd912d 100644
--- a/.github/workflows/test_v8.yml
+++ b/.github/workflows/test_v8.yml
@@ -26,7 +26,7 @@ jobs:
environment-file: .test/environment_v8.yaml
- name: Run test pipeline
shell: bash -el {0}
- run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml --use-conda -c1 --conda-frontend conda
+ run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml --use-conda -c1 --conda-frontend conda --keep-going --notemp
- name: Pack logs
if: success() || failure()
shell: bash -el {0}
diff --git a/.github/workflows/test_v9.yml b/.github/workflows/test_v9.yml
index 1f5371a..a3e211c 100644
--- a/.github/workflows/test_v9.yml
+++ b/.github/workflows/test_v9.yml
@@ -26,7 +26,7 @@ jobs:
environment-file: .test/environment_v9.yaml
- name: Run test pipeline
shell: bash -el {0}
- run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml -c1 --sdm conda
+ run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml -c1 --sdm conda --keep-going --notemp
- name: Pack logs
if: success() || failure()
shell: bash -el {0}
diff --git a/.rulegraph.png b/.rulegraph.png
index dc85134..d3bcf3c 100644
Binary files a/.rulegraph.png and b/.rulegraph.png differ
diff --git a/.rulegraph_sv.svg b/.rulegraph_sv.svg
new file mode 100644
index 0000000..d7ff0c7
--- /dev/null
+++ b/.rulegraph_sv.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/.test/Snakefile b/.test/Snakefile
index 66eaf84..d3ff278 100644
--- a/.test/Snakefile
+++ b/.test/Snakefile
@@ -8,7 +8,7 @@ import subprocess
min_version("7.19")
# Workflow version
-__version__ = "1.2.1"
+__version__ = "1.3.0"
# Rules
include: "../workflow/core.smk"
diff --git a/.test/targets.yaml b/.test/targets.yaml
index b9e4784..bac704f 100644
--- a/.test/targets.yaml
+++ b/.test/targets.yaml
@@ -23,11 +23,9 @@ PLOTS:
PROBLEMATIC_VCF:
"https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/da322c32004f7b16bdaa6a8ee7fd24d56e79d9dc/problematic_sites_sarsCov2.vcf"
VC:
- MAX_DEPTH: 0
MIN_QUALITY: 0
- IVAR_QUALITY: 0
- IVAR_FREQ: 0.05
- IVAR_DEPTH: 5
+ MIN_FREQ: 0.05
+ MIN_DEPTH: 5
DEMIX:
MIN_QUALITY: 0
COV_CUTOFF: 0
@@ -40,7 +38,5 @@ PLOT_GENOME_REGIONS:
"config/nsp_annotation.csv"
REPORT_QMD:
"template.qmd"
-FEATURES_JSON:
- "config/sarscov2_features.json"
GENETIC_CODE_JSON:
"config/standard_genetic_code.json"
diff --git a/Dockerfile b/Dockerfile
index 7f39843..7f6ac6a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,23 @@
FROM condaforge/miniforge3:latest
LABEL io.github.snakemake.containerized="true"
-LABEL io.github.snakemake.conda_env_hash="bd9b69b913e7b4e17d7d5ca7169a5815c3145ea775af87163f18c0b92abc1bf8"
+LABEL io.github.snakemake.conda_env_hash="7071b22b1161190c06be3ac061ab0019a1a8d3038532c9134e070a3875414ef5"
# Step 2: Retrieve conda environments
+# Conda environment:
+# source: workflow/envs/afwdist.yaml
+# prefix: /conda-envs/9c24a867826615972cc288081976e7fc
+# channels:
+# - bioconda
+# - conda-forge
+# dependencies:
+# - afwdist==1.0.0
+RUN mkdir -p /conda-envs/9c24a867826615972cc288081976e7fc
+COPY workflow/envs/afwdist.yaml /conda-envs/9c24a867826615972cc288081976e7fc/environment.yaml
+
# Conda environment:
# source: workflow/envs/biopython.yaml
-# prefix: /conda-envs/bd81c49fcb540d7706807c1683ba7200
+# prefix: /conda-envs/162796cecea22d99c8702138f0c48e2f
# channels:
# - conda-forge
# - bioconda
@@ -14,12 +25,8 @@ LABEL io.github.snakemake.conda_env_hash="bd9b69b913e7b4e17d7d5ca7169a5815c3145e
# - python==3.10
# - biopython==1.81
# - pandas==2.0.3
-# - pip==23.2.1
-# - mafft==7.525
-# - pip:
-# - gb2seq==0.2.20
-RUN mkdir -p /conda-envs/bd81c49fcb540d7706807c1683ba7200
-COPY workflow/envs/biopython.yaml /conda-envs/bd81c49fcb540d7706807c1683ba7200/environment.yaml
+RUN mkdir -p /conda-envs/162796cecea22d99c8702138f0c48e2f
+COPY workflow/envs/biopython.yaml /conda-envs/162796cecea22d99c8702138f0c48e2f/environment.yaml
# Conda environment:
# source: workflow/envs/fetch.yaml
@@ -35,14 +42,14 @@ COPY workflow/envs/fetch.yaml /conda-envs/9439457f932a4fbca3665c9ea1ac2f0a/envir
# Conda environment:
# source: workflow/envs/freyja.yaml
-# prefix: /conda-envs/ee7a2e1b4ec9a7a9999f34dddaea0605
+# prefix: /conda-envs/bb4c5f3a509433cc08861582fab4a705
# channels:
# - conda-forge
# - bioconda
# dependencies:
-# - freyja==1.4.2
-RUN mkdir -p /conda-envs/ee7a2e1b4ec9a7a9999f34dddaea0605
-COPY workflow/envs/freyja.yaml /conda-envs/ee7a2e1b4ec9a7a9999f34dddaea0605/environment.yaml
+# - freyja==2.0.1
+RUN mkdir -p /conda-envs/bb4c5f3a509433cc08861582fab4a705
+COPY workflow/envs/freyja.yaml /conda-envs/bb4c5f3a509433cc08861582fab4a705/environment.yaml
# Conda environment:
# source: workflow/envs/gisaidr.yaml
@@ -93,56 +100,52 @@ COPY workflow/envs/pangolin.yaml /conda-envs/fd645c541ee7a3d43fb9167441b77888/en
# Conda environment:
# source: workflow/envs/quarto_render.yaml
-# prefix: /conda-envs/f2a098519cf1f8c4cecb3c13f8c92883
+# prefix: /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1
# channels:
# - conda-forge
-# - bioconda
# dependencies:
-# - r-base==4.3.1
-# - r-gt==0.9.0
-# - quarto==1.3.450
-# - r-jsonlite==1.8.5
+# - r-base==4.5.2
+# - r-gt==1.1.0
+# - quarto==1.8.25
+# - deno==2.3.1
# - r-tidyverse==2.0.0
-# - r-quarto==1.2
-# - r-heatmaply==1.4.2
-# - r-readr==2.1.4
-RUN mkdir -p /conda-envs/f2a098519cf1f8c4cecb3c13f8c92883
-COPY workflow/envs/quarto_render.yaml /conda-envs/f2a098519cf1f8c4cecb3c13f8c92883/environment.yaml
+# - r-heatmaply==1.6.0
+RUN mkdir -p /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1
+COPY workflow/envs/quarto_render.yaml /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1/environment.yaml
# Conda environment:
# source: workflow/envs/renv.yaml
-# prefix: /conda-envs/4b57bfc237ddc217c1f0b04d34dc06ef
+# prefix: /conda-envs/8ad6cdcf265d30289788da99d5bf9fff
# channels:
# - conda-forge
# - bioconda
# dependencies:
-# - r-base=4.1.3
+# - r-base=4.3.2
# - r-tidyverse==2.0.0
# - r-ggrepel==0.9.3
-# - r-stringi==1.7.12
# - r-ggpubr==0.6.0
-# - bioconductor-ggtree==3.2.0
-# - r-ape==5.7
+# - bioconductor-ggtree==3.10.0
+# - r-ape==5.8
# - r-adephylo==1.1_13
# - r-pegas==1.2
# - r-data.table==1.14.8
# - r-future.apply==1.11.0
-# - r-scales==1.2.1
+# - r-scales==1.3.0
# - r-showtext==0.9_6
-# - r-jsonlite==1.8.5
# - r-logger==0.2.2
-RUN mkdir -p /conda-envs/4b57bfc237ddc217c1f0b04d34dc06ef
-COPY workflow/envs/renv.yaml /conda-envs/4b57bfc237ddc217c1f0b04d34dc06ef/environment.yaml
+RUN mkdir -p /conda-envs/8ad6cdcf265d30289788da99d5bf9fff
+COPY workflow/envs/renv.yaml /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/environment.yaml
# Conda environment:
# source: workflow/envs/snpeff.yaml
-# prefix: /conda-envs/1934df0e4df02a7ee33c52f53f9e3c30
+# prefix: /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95
# channels:
# - bioconda
# dependencies:
# - snpeff==5.1d
-RUN mkdir -p /conda-envs/1934df0e4df02a7ee33c52f53f9e3c30
-COPY workflow/envs/snpeff.yaml /conda-envs/1934df0e4df02a7ee33c52f53f9e3c30/environment.yaml
+# - snpsift==5.1d
+RUN mkdir -p /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95
+COPY workflow/envs/snpeff.yaml /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/environment.yaml
# Conda environment:
# source: workflow/envs/var_calling.yaml
@@ -158,16 +161,17 @@ COPY workflow/envs/var_calling.yaml /conda-envs/5150d0f0a91d7f7a789a06f453d63479
# Step 3: Generate conda environments
-RUN conda env create --prefix /conda-envs/bd81c49fcb540d7706807c1683ba7200 --file /conda-envs/bd81c49fcb540d7706807c1683ba7200/environment.yaml && \
+RUN conda env create --prefix /conda-envs/9c24a867826615972cc288081976e7fc --file /conda-envs/9c24a867826615972cc288081976e7fc/environment.yaml && \
+ conda env create --prefix /conda-envs/162796cecea22d99c8702138f0c48e2f --file /conda-envs/162796cecea22d99c8702138f0c48e2f/environment.yaml && \
conda env create --prefix /conda-envs/9439457f932a4fbca3665c9ea1ac2f0a --file /conda-envs/9439457f932a4fbca3665c9ea1ac2f0a/environment.yaml && \
- conda env create --prefix /conda-envs/ee7a2e1b4ec9a7a9999f34dddaea0605 --file /conda-envs/ee7a2e1b4ec9a7a9999f34dddaea0605/environment.yaml && \
+ conda env create --prefix /conda-envs/bb4c5f3a509433cc08861582fab4a705 --file /conda-envs/bb4c5f3a509433cc08861582fab4a705/environment.yaml && \
conda env create --prefix /conda-envs/3fad3c9cdfa40bee9404f6a2e8fda69f --file /conda-envs/3fad3c9cdfa40bee9404f6a2e8fda69f/environment.yaml && \
conda env create --prefix /conda-envs/0a608afb24723cb6fa8aef748f5efbc8 --file /conda-envs/0a608afb24723cb6fa8aef748f5efbc8/environment.yaml && \
conda env create --prefix /conda-envs/04a3347f94ddf7e21c34bc49e5246076 --file /conda-envs/04a3347f94ddf7e21c34bc49e5246076/environment.yaml && \
conda env create --prefix /conda-envs/fd645c541ee7a3d43fb9167441b77888 --file /conda-envs/fd645c541ee7a3d43fb9167441b77888/environment.yaml && \
- conda env create --prefix /conda-envs/f2a098519cf1f8c4cecb3c13f8c92883 --file /conda-envs/f2a098519cf1f8c4cecb3c13f8c92883/environment.yaml && \
- conda env create --prefix /conda-envs/4b57bfc237ddc217c1f0b04d34dc06ef --file /conda-envs/4b57bfc237ddc217c1f0b04d34dc06ef/environment.yaml && \
- conda env create --prefix /conda-envs/1934df0e4df02a7ee33c52f53f9e3c30 --file /conda-envs/1934df0e4df02a7ee33c52f53f9e3c30/environment.yaml && \
+ conda env create --prefix /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1 --file /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1/environment.yaml && \
+ conda env create --prefix /conda-envs/8ad6cdcf265d30289788da99d5bf9fff --file /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/environment.yaml && \
+ conda env create --prefix /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95 --file /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/environment.yaml && \
conda env create --prefix /conda-envs/5150d0f0a91d7f7a789a06f453d63479 --file /conda-envs/5150d0f0a91d7f7a789a06f453d63479/environment.yaml && \
conda clean --all -y
diff --git a/config/README.md b/config/README.md
index 23e0367..f5cc413 100644
--- a/config/README.md
+++ b/config/README.md
@@ -12,16 +12,28 @@ For example, existing [profiles](https://snakemake.readthedocs.io/en/stable/gett
are cross-compatible as well, but note that the `--use-conda` flag is deprecated starting with Snakemake 8.
Instead, use `--software-deployment-method conda`.
+## tl;dr
+
+- **Needs**:
+ - Snakemake 7.19 or later.
+ - One FASTA per sample.
+ - One BAM per sample.
+ - One metadata CSV with columns `ID`, `CollectionDate` (YYYY-MM-DD), `ResidenceCity` and `GISAIDEPI` (can be empty).
+- **Setup**: edit [targets.yaml](/config/targets.yaml) (set `SAMPLES` and `METADATA`, at least) or build it using [`build_targets.py`](/build_targets.py). Leave `CONTEXT_FASTA: null` to auto-download from GISAID (needs `config/gisaid.yaml` with your username and password) or set a local FASTA path if download fails (see GISAID disclaimer).
+- **Run**: `snakemake (--use-conda | --sdm conda) -c4`.
+
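+The GISAID credentials file is plain YAML holding the username and password. A minimal
+sketch (the key names below are an assumption — use the keys expected by the workflow's
+GISAID download rules; the values are placeholders):
+
+```yaml
+# Key names are illustrative; match what the workflow's GISAID rules expect.
+username: "my_gisaid_user"
+password: "my_gisaid_password"
+```
+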
## Inputs and outputs
The workflow requires a set of FASTA files (one per target sample), a corresponding set of
BAM files (also one per target sample), and a metadata table in CSV format with one row per
-sample. The metadata must include the following columns: unique sample identifier (default column `ID`,
-used to match sequencing files with metadata), the date the sample was collected (default `CollectionDate`),
-the location where the sample was collected (default `ResidenceCity`), and GISAID accession (default `GISAIDEPI`).
-The default column names but can be customized if needed via the workflow parameters.
-These parameters are set in two configuration files in YAML format:
+sample. The metadata must include at least the following columns:
+
+- `ID`: unique sample identifier, used to match sequencing files with metadata.
+- `CollectionDate`: the date the sample was collected (YYYY-MM-DD).
+- `ResidenceCity`: the location where the sample was collected.
+- `GISAIDEPI`: GISAID accession identifier (`EPI_ISL_...` or empty).
+
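+For example, a minimal metadata CSV with the default column names (the sample rows
+are illustrative):
+
+```csv
+ID,CollectionDate,ResidenceCity,GISAIDEPI
+sample01,2021-03-01,Valencia,EPI_ISL_1000001
+sample02,2021-03-15,Valencia,
+```
+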
+The paths to these input files are set in two configuration files in YAML format:
[config.yaml](/config/config.yaml) (for general workflow settings) and
[targets.yaml](/config/targets.yaml) (for specific dataset-related settings).
The latter must be modified by the user to point the `SAMPLES` and `METADATA`
@@ -130,24 +142,50 @@ All of the following variables are pre-defined in [config.yaml](/config/config.y
- `ALIGNMENT_REFERENCE`: NCBI accession number of the reference record for sequence alignment.
- `PROBLEMATIC_VCF`: URL or path of a VCF file containing problematic genome positions for masking.
-- `FEATURES_JSON`: path of a JSON file containing name equivalences of genome features for data visualization.
- `GENETIC_CODE_JSON`: path of a JSON file containing a genetic code for gene translation.
- `TREE_MODEL`: substitution model used by IQTREE (see [docs](http://www.iqtree.org/doc/Substitution-Models)).
-- `UFBOOT_REPS`: ultrafast bootstrap replicates for IQTREE (see [UFBoot](https://doi.org/10.1093/molbev/msx281)).
-- `SHALRT_REPS`: Shimodaira–Hasegawa approximate likelihood ratio test bootstrap replicates for IQTREE (see [SH-aLRT](https://doi.org/10.1093/sysbio/syq010)).
+- `UFBOOT` & `SHALRT`: settings for ultrafast bootstrap (see [UFBoot](https://doi.org/10.1093/molbev/msx281))
+ and Shimodaira–Hasegawa approximate likelihood ratio test bootstrap
+ (see [SH-aLRT](https://doi.org/10.1093/sysbio/syq010)) in IQTREE runs:
+ - `REPS`: number of replicates.
+ - `THRESHOLD`: value cutoff for visualization.
- `VC`: variant calling configuration:
- - `MAX_DEPTH`: maximum depth at a position for `samtools mpileup` (option `-d`).
- - `MIN_QUALITY`: minimum base quality for `samtools mpileup` (option `-Q`).
- - `IVAR_QUALITY`: minimum base quality for `ivar variants` (option `-q`).
- - `IVAR_FREQ`: minimum frequency threshold for `ivar variants` (option `-t`).
- - `IVAR_DEPTH`: minimum read depth for `ivar variants` (option `-m`).
-- `DEMIX`: demixing configuration:
+ - `MIN_QUALITY`: minimum base quality for `ivar variants` (option `-q`).
+ - `MIN_FREQ`: minimum frequency threshold for `ivar variants` (option `-t`).
+ - `MIN_DEPTH`: minimum read depth for `ivar variants` (option `-m`).
+ - `MAX_DEPTH`: maximum read depth for `samtools mpileup` (option `-d`).
+- `DEMIX`: demixing configuration (uses [Freyja](https://github.com/andersen-lab/Freyja), see also [its docs](https://andersen-lab.github.io/Freyja/index.html)):
+ - `PATHOGEN`: pathogen of interest for `freyja update` (option `--pathogen`); must be 'SARS-CoV-2'.
- `MIN_QUALITY`: minimum quality for `freyja variants` (option `--minq`).
- - `COV_CUTOFF`: minimum depth for `freyja demix` (option `--covcut`).
+ - `MAX_DEPTH`: maximum read depth for `samtools mpileup` (option `-d`).
+ - `COV_CUTOFF`: minimum depth to calculate the reported "coverage" (percent of sites with that depth) for `freyja demix` (option `--covcut`).
- `MIN_ABUNDANCE`: minimum lineage estimated abundance for `freyja demix` (option `--eps`).
+ - `CONFIRMED_ONLY`: exclude unconfirmed lineages in `freyja demix` (option `--confirmedonly`).
+ - `DEPTH_CUTOFF`: minimum depth on each site for `freyja demix` (option `--depthcutoff`).
+  - `RELAXED_MRCA`: use a relaxed (rather than strict) MRCA for cluster assignment, only relevant together with `DEPTH_CUTOFF`, for `freyja demix` (option `--relaxedmrca`).
+ - `RELAXED_MRCA_THRESH`: `RELAXED_MRCA` threshold for `freyja demix` (option `--relaxedthresh`).
+  - `AUTO_ADAPT`: use error profile to set adaptive lasso penalty parameter for `freyja demix` (option `--autoadapt`).
+  - `SOLVER`: optimization solver used by `freyja demix` (option `--solver`).
- `WINDOW`: sliding window of nucleotide variants per site configuration:
- `WIDTH`: number of sites within windows.
- `STEP`: number of sites between windows.
+- `GB_FEATURES`: optional mapping to filter which features from the GenBank file are used
+  by some analyses (e.g. rules `window_data` and `n_s_sites`).
+ If `GB_FEATURES` is empty or unset, all features are used.
+ Filtering is applied in order:
+ - `INCLUDE`: mapping of qualifier names to sequences of values.
+ If present, only features that match at least one key/value pair in `INCLUDE`
+ are included in the analyses. For example, having `INCLUDE: {gene: [S, N]}` keeps features whose `gene`
+ qualifier equals `S` or `N`.
+ - `EXCLUDE`: mapping of qualifier names to sequences of values.
+ After `INCLUDE` is applied, any feature that matches any key/value pair in `EXCLUDE`
+ is omitted. For example, having `EXCLUDE: {gene: [S, N]}` removes features whose
+ `gene` qualifier equals `S` or `N`.
+- `ANNOTATION`: settings for variant annotation and functional effect prediction using [SnpEff](https://pcingola.github.io/SnpEff/), which uses `ALIGNMENT_REFERENCE` for selecting the database to annotate.
+ - `SNPEFF_COLS`: mapping of column names (which appear in result tables) to VCF fields (extracted after annotation with [SnpSift](https://pcingola.github.io/SnpEff/#snpsift)). Some columns are hard-coded in the code, so removing them is not advised. Additional columns can be added as needed.
+  - `FILTER_INCLUDE` & `FILTER_EXCLUDE`: mappings of column names (from `SNPEFF_COLS`) to lists of values used for filtering the annotated variants table. `FILTER_INCLUDE` is applied first, then `FILTER_EXCLUDE` (see the example at the end of this entry).
+    - `FILTER_INCLUDE`: keeps variants that match at least one listed value.
+    - `FILTER_EXCLUDE`: removes variants that match any listed value.
+ - `VARIANT_NAME_PATTERN`: string template used to build the variant name shown in the results table (column `VARIANT_NAME`). The template is interpreted with [glue](https://glue.tidyverse.org/) and can use any column name from `SNPEFF_COLS` and some R functions. For example, `"{GENE}:{coalesce(HGVS_P, HGVS_C)}"` creates names like `S:p.D614G` (using `HGVS_P` when available, otherwise `HGVS_C`).
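+
+  For example, a fragment of [config.yaml](/config/config.yaml) combining these settings
+  (a sketch drawn from the defaults shipped with the workflow; adjust the values to your
+  filtering needs):
+
+  ```yaml
+  ANNOTATION:
+    FILTER_INCLUDE:
+      IMPACT: [HIGH, MODERATE]
+    FILTER_EXCLUDE:
+      EFFECT: [upstream_gene_variant, downstream_gene_variant]
+    VARIANT_NAME_PATTERN: "{GENE}:{coalesce(HGVS_P, HGVS_C)}"
+  ```
+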
- `GISAID`: automatic context download configuration.
- `CREDENTIALS`: path of the GISAID credentials in YAML format.
- `DATE_COLUMN`: name of the column that contains sampling dates (YYYY-MM-DD) in the input target metadata.
@@ -160,27 +198,35 @@ All of the following variables are pre-defined in [config.yaml](/config/config.y
- `EXACT`: boolean flag indicating whether to compute an exact p-value when possible. This option applies only to certain methods and may be set to `null` (default) to let R decide automatically.
- `LOG_PY_FMT`: logging format string for Python scripts.
- `PLOTS`: path of the R script that sets the design and style of data visualizations.
-- `PLOT_GENOME_REGIONS`: path of a CSV file containing genome regions, e.g. SARS-CoV-2 non-structural protein (NSP) coordinates, for data visualization.
+- `PLOT_GENOME_REGIONS`: path of a CSV file containing genome regions, e.g. SARS-CoV-2 non-structural protein (NSP) coordinates, for data visualization (columns: `region`, `start`, `end`).
- `REPORT_QMD`: path of the report template in Quarto markdown (QMD) format.
+- `REPORT_CSS`: path of the report stylesheet definition in CSS format.
-## Workflow graphs
+## Workflow visualization
-To generate a simplified rule graph, run:
+Snakemake makes it easy to visualize workflows and rule relationships. The `--rulegraph` option prints a graph of the dependencies between rules in DOT format. The example below renders it as an image using Graphviz:
```shell
-snakemake --rulegraph | dot -Tpng > .rulegraph.png
+snakemake --forceall --rulegraph | dot -Tpng >.rulegraph.png
```

-To generate the directed acyclic graph (DAG) of all rules
-to be executed, run:
+The same graph can also be rendered with other tools such as [snakevision](https://github.com/OpenOmics/snakevision) (v0.1.0).
+
+```shell
+snakemake --forceall --rulegraph | snakevision -s all -o .rulegraph_sv.svg
+```
+
+![Rule graph rendered with snakevision](.rulegraph_sv.svg)
+
+The `--dag` option emits the directed acyclic graph (DAG) of the rule instances that would be executed for the current dataset. The example below renders it as an image using Graphviz:
```shell
-snakemake --forceall --dag | dot -Tpng > .dag.png
+snakemake --forceall --dag | dot -Tpng >.dag.png
```
-
+
## Run modes
diff --git a/config/config.yaml b/config/config.yaml
index 7bb2fbe..54d5097 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -2,29 +2,82 @@ ALIGNMENT_REFERENCE:
"NC_045512.2"
PROBLEMATIC_VCF:
"https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/da322c32004f7b16bdaa6a8ee7fd24d56e79d9dc/problematic_sites_sarsCov2.vcf"
-FEATURES_JSON:
- "config/sarscov2_features.json"
GENETIC_CODE_JSON:
"config/standard_genetic_code.json"
TREE_MODEL:
"GTR+F+I+G4"
-UFBOOT_REPS:
- 1000
-SHALRT_REPS:
- 1000
+UFBOOT:
+ REPS: 1000
+ THRESHOLD: 0.95
+SHALRT:
+ REPS: 1000
+ THRESHOLD: 0.80
VC:
- MAX_DEPTH: 0
- MIN_QUALITY: 0
- IVAR_QUALITY: 20
- IVAR_FREQ: 0.05
- IVAR_DEPTH: 30
+ MIN_QUALITY: 20
+ MIN_FREQ: 0.05
+ MIN_DEPTH: 30
+ MAX_DEPTH: 1000000
DEMIX:
+ PATHOGEN: "SARS-CoV-2"
MIN_QUALITY: 20
+ MAX_DEPTH: 1000000
COV_CUTOFF: 30
MIN_ABUNDANCE: 0.0001
+ CONFIRMED_ONLY: false
+ DEPTH_CUTOFF: 0
+ RELAXED_MRCA: false
+ RELAXED_MRCA_THRESH: 0.9
+ AUTO_ADAPT: false
+ SOLVER: "CLARABEL"
WINDOW:
WIDTH: 1000
STEP: 50
+GB_FEATURES:
+  INCLUDE: # keep features matching at least one key/value pair below
+ product:
+ - "ORF1ab polyprotein"
+ - "surface glycoprotein"
+ - "ORF3a protein"
+ - "envelope protein"
+ - "membrane glycoprotein"
+ - "ORF6 protein"
+ - "ORF7a protein"
+ - "ORF7b"
+ - "ORF8 protein"
+ - "nucleocapsid phosphoprotein"
+ - "ORF10 protein"
+  # EXCLUDE: ... # then drop features matching any key/value pair
+ANNOTATION:
+ # see: https://pcingola.github.io/SnpEff/adds/VCFannotationformat_v1.0.pdf
+ SNPEFF_COLS:
+ CHROM: CHROM
+ POS: POS
+ REF: REF
+ ALT: ALT
+ EFFECT: "ANN[*].EFFECT"
+ IMPACT: "ANN[*].IMPACT"
+ BIOTYPE: "ANN[*].BIOTYPE"
+ GENE: "ANN[*].GENE"
+ GENEID: "ANN[*].GENEID"
+ FEATURE: "ANN[*].FEATURE"
+ FEATUREID: "ANN[*].FEATUREID"
+ HGVS_P: "ANN[*].HGVS_P"
+ HGVS_C: "ANN[*].HGVS_C"
+ ERRORS: "ANN[*].ERRORS"
+ FILTER_INCLUDE:
+ # IMPACT: [HIGH, MODERATE, LOW]
+ FILTER_EXCLUDE:
+ EFFECT: [upstream_gene_variant, downstream_gene_variant]
+ ERRORS:
+ - ERROR_CHROMOSOME_NOT_FOUND
+ - ERROR_OUT_OF_CHROMOSOME_RANGE
+ - WARNING_REF_DOES_NOT_MATCH_GENOME
+ - WARNING_SEQUENCE_NOT_AVAILABLE
+ - WARNING_TRANSCRIPT_INCOMPLETE
+ - WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS
+ - WARNING_TRANSCRIPT_NO_START_CODON
+ - WARNING_TRANSCRIPT_NO_STOP_CODON
+ VARIANT_NAME_PATTERN: "{GENE}:{coalesce(HGVS_P, HGVS_C)}"
GISAID:
CREDENTIALS: "config/gisaid.yaml"
DATE_COLUMN: "CollectionDate"
@@ -44,3 +97,5 @@ PLOT_GENOME_REGIONS:
"config/nsp_annotation.csv"
REPORT_QMD:
"template.qmd"
+REPORT_CSS:
+ "config/report.styles.css"
diff --git a/config/design_plots.R b/config/design_plots.R
index 971265c..74e87d8 100644
--- a/config/design_plots.R
+++ b/config/design_plots.R
@@ -1,6 +1,6 @@
# Jordi Sevilla
-library(tidyverse)
+library(ggplot2)
library(showtext)
# Ajustes ####
@@ -8,13 +8,13 @@ showtext_auto(enable = FALSE)
showtext_opts(dpi = 200)
# Tema
-font_add_google("Montserrat", "Montserrat")
+font_add_google("Noto Sans", "Noto Sans")
showtext_auto()
theme_set(theme_minimal())
theme_update(
- text = element_text(size = 16, family = "Montserrat"),
+ text = element_text(size = 16, family = "Noto Sans"),
axis.title = element_text(size = 16),
axis.line = element_line(
linewidth = 0.5,
@@ -25,12 +25,11 @@ theme_update(
panel.grid = element_line(linewidth = 0.17, color = "lightgray")
)
-
-# CONFIG
-gene_colors = c(
+# Gene palette
+GENE_PALETTE <- c(
M = "#B4D4B4",
N = "#B7B7B8",
- orf1ab = "#9CC4DC",
+ ORF1ab = "#9CC4DC",
ORF3a = "#ECB4B7",
ORF8 = "#996D2B",
S = "#F5CC9E",
@@ -40,20 +39,32 @@ gene_colors = c(
ORF10 = "#CACB5D"
)
+# Nucleotide diversity
+DIVERSITY_PALETTE <- c(
+ density_fill = "#fcbf49",
+ density_color = "#ffaa00",
+ value_color = "#D944AA",
+ dnorm_color = "#f77f00"
+)
# M-L tree colors and labels
-tree_colors = c(
+TREE_PALETTE <- c(
tip_label = "#D944AA99",
boot_alrt_pass = "#64ACEEB2"
)
-node.size <- c(
+TREE_NODE_SIZE <- c(
tip_label = 2,
boot_alrt_pass = 0.8
)
+TREE_LEGEND_NAMES <- c(
+ tip_label = "Target samples",
+ boot_alrt_pass = "UFBoot ≥ %s%s & SH-aLRT ≥ %s%s"
+)
+
# Nucleotide variants classification colors and labels
-NV_colors <- c(
+NV_TYPE_PALETTE <- c(
Frameshift = "#568D63",
"In frame" = "black",
Intergenic = "#B27CF9",
@@ -61,7 +72,7 @@ NV_colors <- c(
Yes = "#0248FD"
)
-NV_names <- c(
+NV_TYPE_NAMES <- c(
Frameshift = "Frameshift",
"In frame" = "Inframe",
Intergenic = "Intergenic",
@@ -70,17 +81,23 @@ NV_names <- c(
)
# dn ds colors and labels
-dnds.labels <- c(
+DNDS_LABELS <- c(
dn = "dN",
ds = "dS"
)
-dnds.colors <- c(
- dn = "#E53E47",
- ds = "#2C47F5"
+DNDS_COLORS <- c(
+ dN = "#E53E47",
+ dS = "#2C47F5"
)
-dnds.shapes <- c(
- dn = 2,
- ds = 4
+DNDS_SHAPES <- c(
+ dN = 2,
+ dS = 4
)
+
+# Allele frequency trajectories panel color
+ALL.COLORS <- grDevices::colors()
+TRAJECTORY.PANEL.COLORS <- ALL.COLORS[
+ !grepl("(gray|grey|white|snow|azure|beige)", ALL.COLORS)
+]
diff --git a/config/nsp_annotation.csv b/config/nsp_annotation.csv
index 02126d0..78ab294 100644
--- a/config/nsp_annotation.csv
+++ b/config/nsp_annotation.csv
@@ -1,17 +1,7 @@
region,start,end
-nsp1,266,805
-nsp2,806,2719
-nsp3,2720,8554
-nsp4,8555,10054
-nsp5,10055,10972
-nsp6,10973,11842
-nsp7,11843,12091
-nsp8,12092,12685
-nsp9,12686,13024
-nsp10,13025,13441
-nsp11,13442,13468
-nsp12,13468,16236
-nsp13,16237,18039
-nsp14,18040,19620
-nsp15,19621,20658
-nsp16,20659,21552
+NSP2,806,2719
+NSP3,2720,8554
+NSP4-11,8555,13468
+RdRP,13468,16236
+NSP13,16237,18039
+NSP14-16,18040,21552
diff --git a/config/report.styles.css b/config/report.styles.css
index f81e0a9..9ea2783 100644
--- a/config/report.styles.css
+++ b/config/report.styles.css
@@ -1,35 +1,42 @@
-.quarto-title-block .quarto-title-banner {
- background: #27445C;
- background-image: url(logo_pathogenomics.png);
- background-size: 100px;
- background-position: right;
- background-repeat: no-repeat;
- padding-right: 10px;
- background-origin: content-box;
- }
+html {
+ font-family: "Noto Sans";
+ text-align: justify;
+}
-.title {
- font-family: Montserrat;
- color: #FFFFFF
+body {
+ color: black;
}
-.quarto-title p {
- font-family: Montserrat;
- color: #FFFFFF
+
+.quarto-title-block .quarto-title-banner {
+ background-color: #27445C;
+ background-image: url("logo_pathogenomics.png");
+ background-size: 100px;
+ background-position: right;
+ background-repeat: no-repeat;
+ background-origin: content-box;
+ padding-right: 10px;
+ background-blend-mode: normal;
+ filter: none;
+ mix-blend-mode: normal;
+ color: white;
}
-html {
- font-family: Montserrat;
- color: #000000;
- text-align: justify;
+.quarto-title-block .quarto-title-banner .title,
+.quarto-title-block .quarto-title-banner p {
+ color: inherit;
+ -webkit-text-fill-color: inherit;
+ opacity: 1;
}
h1, h2, h3 {
text-align: left;
font-weight: bold;
+ color: #27445C
}
+
#quarto-sidebar-toc-left {
text-align: left;
}
.header-section-number {
- color:#000000
-}
\ No newline at end of file
+ color: #27445C
+}
diff --git a/config/sarscov2_features.json b/config/sarscov2_features.json
deleted file mode 100644
index 2031f32..0000000
--- a/config/sarscov2_features.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{
- "ORF1ab polyprotein": "orf1ab",
- "ORF1a polyprotein": "orf1ab",
- "surface glycoprotein": "S",
- "ORF3a protein": "ORF3a",
- "envelope protein": "E",
- "membrane glycoprotein": "M",
- "ORF6 protein": "ORF6",
- "ORF7a protein": "ORF7",
- "ORF7b": "ORF7",
- "ORF8 protein": "ORF8",
- "nucleocapsid phosphoprotein": "N",
- "ORF10 protein": "ORF10"
-}
diff --git a/template.qmd b/template.qmd
index eb5b82b..3c2d044 100644
--- a/template.qmd
+++ b/template.qmd
@@ -4,8 +4,8 @@ subtitle: "Workflow version: `r params$workflow_version`"
date: last-modified
date-format: "YYYY-MM-DD"
title-block-banner: true
-format:
- html:
+format:
+ html:
page-layout: article
embed-resources: true
smooth-scroll: true
@@ -15,41 +15,48 @@ format:
toc-location: left
toc-title: Summary
number-sections: true
-css: config/report.styles.css
-editor: visual
+css: __CSSPLACEHOLDER__
+editor: source
params:
- ufboot_reps: ""
- shalrt_reps: ""
- min_ivar_freq: ""
- workflow_version: ""
- use_bionj: ""
- cor_method: ""
- div: ""
- freyja: ""
- tree: ""
- tempest: ""
- SNV: ""
- SNV_s: ""
- evo: ""
- div_value: ""
- panel: ""
- volcano: ""
- tree_ml: ""
- fig_cor_snp: ""
- stats_lm: ""
- table: ""
- sum_nv: ""
- heat_tab: ""
- omega_plot: ""
- name: ""
+ ufboot_reps:
+ shalrt_reps:
+ min_ivar_freq:
+ workflow_version:
+ use_bionj:
+ cor_method:
+ div:
+ demix:
+ tree:
+ tempest:
+ SNV:
+ SNV_s:
+ evo:
+ div_value:
+ panel:
+ volcano:
+ tree_ml:
+ fig_cor_snp:
+ stats_lm:
+ stats_ml:
+ table:
+ sum_nv:
+ heat_tab:
+ omega_plot:
+ name:
+ freyja_ts:
output-file: report.html
---
-
+
+```{r setup, echo = F, message = F, include = F}
+display.num <- function(n, digits) {
+ format(n, digits = digits, nsmall = digits)
+}
+```
```{r read, echo = F, message = F, include = F}
library(jsonlite)
@@ -62,11 +69,14 @@ library(readr)
# Diversity
div_values <- fromJSON(params$div_value)
-# Temporal signal
-stats <- fromJSON(params$stats_lm)
+# Temporal signal and context tree
+stats <- append(
+ fromJSON(params$stats_lm),
+ fromJSON(params$stats_ml)
+)
correlation <- stats[["r2"]]
sub_rate <- stats[["sub_rate"]]
-sub_rate <- round(sub_rate, digits = 2)
+sub_rate <- display.num(sub_rate, 2)
p_value_lm <- stats[["pvalue"]]
# NV counts
@@ -79,10 +89,10 @@ table <- read.csv(params$table)
n.samples <- table %>% pull(Sample) %>% unique() %>% length()
# Heatmap
-vcf <- read_csv(params$heat_tab)
-row.names(vcf) <- vcf$`...1`
-vcf <- vcf[-1]
-cor.mat <- cor(vcf, method = params$cor_method)
+heatmap.df <- read.csv(params$heat_tab)
+row.names(heatmap.df) <- heatmap.df[[1]] # read.csv does not use readr's `...1` auto-name
+heatmap.df[1] <- NULL
+cor.mat <- cor(heatmap.df, method = params$cor_method)
cor.mat[is.na(cor.mat)] <- 0
if (params$cor_method == "pearson") {
cor.method.name <- "Pearson's"
@@ -100,12 +110,16 @@ if (params$use_bionj) {
} else {
dist.tree.algo <- "neighbor-joining (NJ)"
}
+
+# Freyja timestamp
+freyja.timestamp <- read_file(params$freyja_ts) %>%
+ as.POSIXct(., format = "%m_%d_%Y-%H-%M") %>%
+ strftime(., format = "%Y-%m-%d at %H:%M")
```
## Summary of the target samples dataset
-```{r summary_table, echo = F,message = F}
-#| label: tbl-summary
+```{r tbl-summary, echo = F, message = F}
#| tbl-cap: Target dataset summary
table %>%
@@ -133,7 +147,7 @@ table %>%
columns = everything()
) %>%
opt_table_font(
- font = google_font("Montserrat")
+ font = google_font("Noto Sans")
)
```
@@ -141,11 +155,11 @@ table %>%
### Lineage admixture
-The estimated lineage admixture for each sample has been calculated
-using [Freyja](https://github.com/andersen-lab/Freyja).
+The lineage admixture for each sample has been estimated
+using [Freyja](https://github.com/andersen-lab/Freyja) (@fig-demix).
-{#fig-freyja}
+{#fig-demix}
### Phylogenetic reconstruction
@@ -159,26 +173,25 @@ $`r stats[["boot"]]`$% and a SH-aLRT score of $`r stats[["alrt"]]`$% (@fig-tree_
{#fig-tree_ml}
+samples is squared.](`r params$tree_ml`){#fig-tree_ml}
### Nucleotide diversity comparison
Nucleotide diversity (π) has been calculated for $`r div_values[["boot.reps"]]`$ random
-sample subsets of size $`r div_values[["sample.size"]]`$, extracted
-from the context dataset. The distribution of the nuclotide diversity is assumed to
-`r div_values[["norm.text"]]` be normal after performing a Shapiro-Wilk test
-(p-value of $`r div_values[["normal.pvalue"]]`$).
-
-The nucleotide diversity of the target samples is $`r div_values[["diversity"]] `$ (red line in @fig-div).
-Assuming the independence of the context samples, the `r div_values[["type.test"]]`
-p-value of the context samples having a nucleotide diversity (in orange in @fig-div)
-as low as that of the target dataset is $`r div_values[["p.value"]]`$.
-
-![Analysis of the nucleotide diversity (π). The orange line describes
-a normal distribution with the same mean and standard deviation as the distribution
-of π from $`r div_values[["boot.reps"]]`$ subsets of $`r div_values[["sample.size"]]`$
-sequences from the context. The red vertical line indicates the π value of the
-target samples.](`r params$div`){#fig-div}
+subsets of size $`r div_values[["sample.size"]]`$ drawn from the context dataset.
+A Shapiro–Wilk test indicates the distribution is `r div_values[["norm.text"]]`
+normal (p-value of $`r div_values[["normal.pvalue"]]`$).
+
+The nucleotide diversity of the target samples is $`r display.num(div_values[["diversity"]], 3)`$
+(vertical line in @fig-div). Assuming the context subsets are independent, the
+`r div_values[["type.test"]]` test gives a p-value of $`r display.num(div_values[["p.value"]], 3)`$
+for observing a nucleotide diversity as low as that of the target samples.
+
+![Distribution of nucleotide diversity (π) calculated from $`r div_values[["boot.reps"]]`$
+random subsets of $`r div_values[["sample.size"]]`$ sequences from the context dataset.
+The shaded area shows the empirical density; the overlaid curve is the normal distribution
+with the same mean and standard deviation.
+The vertical line marks the π value of the target samples.](`r params$div`){#fig-div}
## Evolutionary trajectory of the serially-sampled SARS-CoV-2 infection
@@ -186,7 +199,7 @@ target samples.](`r params$div`){#fig-div}
Sites with minor allele frequency $> `r params$min_ivar_freq`$ are considered polymorphic.
The linear association between the collection date of the samples and the number of
-polymorphic sites has an $R^2$ of $`r nv.counts[["r2"]]`$ and a p-value of
+polymorphic sites has an $R^2$ of $`r display.num(nv.counts[["r2"]], 4)`$ and a p-value of
$`r nv.counts[["value"]]`$ (@fig-fig_cor_snp).
{#fig-fig_cor_snp}
### Description of intra-host nucleotide variants
-A total of $`r n_SNV`$ different single nucleotide variants (SNV) and $`r n_INDELS`$
+A total of $`r n_SNV`$ different nucleotide variants (NV) and $`r n_INDELS`$
insertions and deletions (indels) have been detected along the genome (@fig-SNV).
::: {.panel-tabset}
@@ -203,7 +216,7 @@ insertions and deletions (indels) have been detected along the genome (@fig-SNV)
![Summary of the intra-host accumulation of nucleotide variants,
using the reconstructed dataset ancestor as reference. A) Nucleotide
-variants per site along the SARS-CoV-2 genome. Relative abundance of NVs is calculated
+variants per site along the SARS-CoV-2 genome. Relative abundance of variants is calculated
with a sliding window of width $`r nv.counts[["window"]]`$ nucleotides and a step of
$`r nv.counts[["step"]]`$. Labels indicate the coding regions of the non structural
proteins (NSP) within ORF1ab. B) Genome variation along the genome for each sample.
@@ -214,7 +227,7 @@ at the bottom, and the latest, at the top.](`r params$SNV`){#fig-SNV}
![Summary of the intra-host accumulation of nucleotide variants
in the spike sequence, using the reconstructed dataset ancestor as reference. A) Nucleotide
-variants per site along the S gene. Relative abundance of NVs is calculated
+variants per site along the S gene. Relative abundance of variants is calculated
with a sliding window of width $`r nv.counts[["window"]]`$ nucleotides and a step of
$`r nv.counts[["step"]]`$. B) Genome variation along the S gene for each sample.
The Y-axis displays samples in chronological order, with the earliest collection date
@@ -224,12 +237,12 @@ at the bottom, and the latest, at the top.](`r params$SNV_s`){#fig-SNV_s}
### Temporal signal of the intra-host mutations
-The correlation of the allele frequency of each NV with the time since the
+`r cor.method.name` correlation of the allele frequency of each variant with the time since the
initial sampling has been calculated (@fig-volcano).
{#fig-volcano}
Significantly correlated nucleotide variants are described in more detail in @fig-panel.
@@ -248,28 +261,31 @@ frequency-weighted distances.](`r params$tree`){#fig-tree}
To estimate the evolutionary rate, root-to-tip distances measured on the previous
tree (@fig-tree) have been correlated with time, obtaining a $R^2$ of
-$`r correlation`$ and a p-value of $`r p_value_lm`$. The estimated evolutionary
+$`r display.num(correlation, 4)`$ and a p-value of $`r p_value_lm`$. The estimated evolutionary
rate is $`r sub_rate`$ number of changes per year (@fig-tempest).
{#fig-tempest}
+distances and the number of days passed since the first sample. The solid
+line represents the linear model fit.](`r params$tempest`){#fig-tempest}
### Correlation between alternative alleles
-To detect possible interactions between mutations, pairwise correlation between allele
-frequencies are calculated (@fig-heatmap). The heatmap is an interactive figure that allows
-zooming in on specific regions.
+`r cor.method.name` correlation coefficients of allele frequencies between pairs
+of variants were calculated to detect interactions between them (@fig-heatmap).
+The heatmap is interactive and allows zooming in on specific regions.
-```{r heatmap, echo = F, message = F, fig.align = 'center'}
-#| label: fig-heatmap
-#| fig-cap: "Interactive hierarchically clustered heatmap of the pairwise correlation coefficients between the time series of allele frequencies in the case study."
+```{r fig-heatmap, echo = F, message = F, warning = F, fig.align = 'center'}
+#| fig-cap: "Interactive heatmap with hierarchical clustering of the pairwise correlation coefficients between the time series of allele frequencies in the case study."
heatmaply_cor(
cor.mat,
- fontsize_row = 8,
- fontsize_col = 8,
- column_text_angle = 90
+ grid_gap = 0.25,
+ fontsize_row = 7,
+ fontsize_col = 7,
+ column_text_angle = 45,
+ label_names = c("row", "column", "coefficient"),
+ width = 600,
+ height = 600
)
```
diff --git a/workflow/Snakefile b/workflow/Snakefile
index e8456e1..97381f5 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -6,7 +6,7 @@ import subprocess
min_version("7.19")
-__version__ = "1.2.2"
+__version__ = "1.3.0"
containerized: "docker://ahmig/vipera:v" + __version__
diff --git a/workflow/envs/afwdist.yaml b/workflow/envs/afwdist.yaml
new file mode 100644
index 0000000..8e29d0d
--- /dev/null
+++ b/workflow/envs/afwdist.yaml
@@ -0,0 +1,5 @@
+channels:
+ - bioconda
+ - conda-forge
+dependencies:
+ - afwdist==1.0.0
diff --git a/workflow/envs/biopython.yaml b/workflow/envs/biopython.yaml
index 172f7bf..aaafd09 100644
--- a/workflow/envs/biopython.yaml
+++ b/workflow/envs/biopython.yaml
@@ -5,7 +5,3 @@ dependencies:
- python==3.10
- biopython==1.81
- pandas==2.0.3
- - pip==23.2.1
- - mafft==7.525
- - pip:
- - gb2seq==0.2.20
diff --git a/workflow/envs/freyja.yaml b/workflow/envs/freyja.yaml
index 94d9be6..6673ab1 100644
--- a/workflow/envs/freyja.yaml
+++ b/workflow/envs/freyja.yaml
@@ -2,4 +2,4 @@ channels:
- conda-forge
- bioconda
dependencies:
- - freyja==1.4.2
+ - freyja==2.0.1
diff --git a/workflow/envs/quarto_render.yaml b/workflow/envs/quarto_render.yaml
index 71fa4cb..a1ab3ba 100644
--- a/workflow/envs/quarto_render.yaml
+++ b/workflow/envs/quarto_render.yaml
@@ -1,12 +1,9 @@
channels:
- conda-forge
- - bioconda
dependencies:
- - r-base==4.3.1
- - r-gt==0.9.0
- - quarto==1.3.450
- - r-jsonlite==1.8.5
+ - r-base==4.5.2
+ - r-gt==1.1.0
+ - quarto==1.8.25
+ - deno==2.3.1
- r-tidyverse==2.0.0
- - r-quarto==1.2
- - r-heatmaply==1.4.2
- - r-readr==2.1.4
+ - r-heatmaply==1.6.0
diff --git a/workflow/envs/renv.yaml b/workflow/envs/renv.yaml
index 43568d8..ceb6a99 100644
--- a/workflow/envs/renv.yaml
+++ b/workflow/envs/renv.yaml
@@ -5,7 +5,6 @@ dependencies:
- r-base=4.3.2
- r-tidyverse==2.0.0
- r-ggrepel==0.9.3
- - r-stringi==1.7.12
- r-ggpubr==0.6.0
- bioconductor-ggtree==3.10.0
- r-ape==5.8
@@ -15,5 +14,4 @@ dependencies:
- r-future.apply==1.11.0
- r-scales==1.3.0
- r-showtext==0.9_6
- - r-jsonlite==1.8.5
- r-logger==0.2.2
diff --git a/workflow/envs/snpeff.yaml b/workflow/envs/snpeff.yaml
index 1423a3c..612ab76 100644
--- a/workflow/envs/snpeff.yaml
+++ b/workflow/envs/snpeff.yaml
@@ -2,3 +2,4 @@ channels:
- bioconda
dependencies:
- snpeff==5.1d
+ - snpsift==5.1d
diff --git a/workflow/rules/asr.smk b/workflow/rules/asr.smk
index 5acaded..d20c6d4 100644
--- a/workflow/rules/asr.smk
+++ b/workflow/rules/asr.smk
@@ -2,6 +2,7 @@ rule reconstruct_ancestral_sequence:
threads: 4
conda: "../envs/iqtree.yaml"
params:
+ seed = 7291,
seqtype = "DNA",
name = OUTPUT_NAME,
outgroup = config["ALIGNMENT_REFERENCE"],
@@ -14,13 +15,11 @@ rule reconstruct_ancestral_sequence:
log:
LOGDIR / "reconstruct_ancestral_sequence" / "log.txt"
shell:
- """
- mkdir -p {output.folder}
- iqtree2 \
- -asr \
- -o {params.outgroup} -T AUTO --threads-max {threads} -s {input.fasta} \
- --seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/{params.name} >{log} 2>&1
- """
+ "mkdir -p {output.folder} && "
+ "iqtree2 -seed {params.seed} "
+ "-asr "
+ "-o {params.outgroup} -T AUTO --threads-max {threads} -s {input.fasta} "
+ "--seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/{params.name} >{log} 2>&1"
rule ancestor_fasta:
@@ -29,7 +28,7 @@ rule ancestor_fasta:
params:
node_id = "Node1",
indeterminate_char = "N",
- name = OUTPUT_NAME
+ name = "case_ancestor",
input:
state_file = OUTDIR/"tree"/f"{OUTPUT_NAME}.state"
output:
diff --git a/workflow/rules/context.smk b/workflow/rules/context.smk
index 623060c..9f8fa05 100644
--- a/workflow/rules/context.smk
+++ b/workflow/rules/context.smk
@@ -74,12 +74,14 @@ rule ml_context_tree:
conda: "../envs/iqtree.yaml"
shadow: "shallow"
params:
+ seed = 7291,
seqtype = "DNA",
name = OUTPUT_NAME,
- ufboot = config["UFBOOT_REPS"],
- alrt = config["SHALRT_REPS"],
+ ufboot = config["UFBOOT"]["REPS"],
+ alrt = config["SHALRT"]["REPS"],
outgroup = config["ALIGNMENT_REFERENCE"],
- model = config["TREE_MODEL"]
+ model = config["TREE_MODEL"],
+ max_iterations_convergence = 1000,
input:
fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.masked.fasta",
outgroup_aln = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.masked.fasta"
@@ -89,14 +91,11 @@ rule ml_context_tree:
log:
LOGDIR / "ml_context_tree" / "log.txt"
shell:
- """
- exec >{log}
- exec 2>&1
-
- awk '/^>/{{p=seen[$0]++}}!p' {input.fasta} {input.outgroup_aln} > aln.fasta
- mkdir -p {output.folder}
- iqtree2 \
- -B {params.ufboot} -alrt {params.alrt} \
- -o {params.outgroup} -T AUTO --threads-max {threads} -s aln.fasta \
- --seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/{params.name}
- """
+        "exec >{log} 2>&1; "
+ "awk '/^>/{{p=seen[$0]++}}!p' {input.fasta} {input.outgroup_aln} >aln.fasta && "
+ "mkdir -p {output.folder} && "
+ "iqtree2 -seed {params.seed} "
+ "-bb {params.ufboot} -alrt {params.alrt} "
+ "-o {params.outgroup} -T AUTO --threads-max {threads} -s aln.fasta "
+ "-nm {params.max_iterations_convergence} "
+ "--seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/{params.name}"
diff --git a/workflow/rules/demix.smk b/workflow/rules/demix.smk
index db6f6c2..4973b3e 100644
--- a/workflow/rules/demix.smk
+++ b/workflow/rules/demix.smk
@@ -1,26 +1,47 @@
+rule demix_barcode_update:
+ threads: 1
+ shadow: "shallow"
+ conda:
+ "../envs/freyja.yaml"
+ params:
+ pathogen = config["DEMIX"]["PATHOGEN"]
+ output:
+ folder = directory(OUTDIR/"demixing"/"freyja_data"),
+ curated_lineages = OUTDIR/"demixing"/"freyja_data"/"curated_lineages.json",
+ last_barcode_update = OUTDIR/"demixing"/"freyja_data"/"last_barcode_update.txt",
+ lineage_mutations = OUTDIR/"demixing"/"freyja_data"/"lineage_mutations.json",
+ lineage_yml = OUTDIR/"demixing"/"freyja_data"/"lineages.yml",
+ pathogens = OUTDIR/"demixing"/"freyja_data"/"pathogen_config.yml",
+ usher_barcodes = OUTDIR/"demixing"/"freyja_data"/"usher_barcodes.feather"
+ log:
+ LOGDIR / "demix_barcode_update" / "log.txt"
+ shell:
+ "mkdir -p {output.folder:q} && "
+ "freyja update --outdir {output.folder:q} --pathogen {params.pathogen:q} >{log} 2>&1"
+
+
rule demix_preprocessing:
threads: 1
- conda: "../envs/freyja.yaml"
+ conda: "../envs/var_calling.yaml"
shadow: "minimal"
input:
bam = get_input_bam,
ref_fasta = lambda wildcards: select_mapping_references_fasta()
params:
- minq = config["DEMIX"]["MIN_QUALITY"]
+ max_depth = config["DEMIX"]["MAX_DEPTH"],
+ minq = config["DEMIX"]["MIN_QUALITY"],
output:
depth_file = OUTDIR/"demixing"/"{sample}/{sample}_depth.txt",
- variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv"
+ variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv",
log:
- LOGDIR / "demix_preprocessing" / "{sample}.log.txt"
+ pileup = LOGDIR / "demix_preprocessing" / "{sample}_pileup.log.txt",
+ ivar = LOGDIR / "demix_preprocessing" / "{sample}_ivar.log.txt",
shell:
- """
- freyja variants \
- "{input.bam}" \
- --variants {output.variants_file} \
- --depths {output.depth_file} \
- --minq {params.minq} \
- --ref {input.ref_fasta} >{log} 2>&1
- """
+        "set -euo pipefail && "
+        "samtools mpileup -aa -A -d {params.max_depth} -Q {params.minq} -q 0 -B -f {input.ref_fasta:q} {input.bam:q} >sample.pileup 2>{log.pileup:q} && "
+        "ivar variants -p variants -q {params.minq} -r {input.ref_fasta:q} <sample.pileup >{log.ivar:q} 2>&1 && "
+        "cut -f1-4 sample.pileup >{output.depth_file:q} && "
+        "mv variants.tsv {output.variants_file:q}"
rule demix:
@@ -29,34 +50,52 @@ rule demix:
shadow: "minimal"
input:
depth_file = OUTDIR/"demixing"/"{sample}/{sample}_depth.txt",
- variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv"
+ variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv",
+ barcodes = OUTDIR/"demixing"/"freyja_data"/"usher_barcodes.feather",
+ curated_lineages = OUTDIR/"demixing"/"freyja_data"/"curated_lineages.json",
+ lineage_yml = OUTDIR/"demixing"/"freyja_data"/"lineages.yml",
params:
coverage_cutoff = config["DEMIX"]["COV_CUTOFF"],
- minimum_abundance = config["DEMIX"]["MIN_ABUNDANCE"]
+ minimum_abundance = config["DEMIX"]["MIN_ABUNDANCE"],
+ confirmed_only = "--confirmedonly " if config["DEMIX"]["CONFIRMED_ONLY"] else "",
+ depth_cutoff = config["DEMIX"]["DEPTH_CUTOFF"],
+ auto_adapt = "--autoadapt " if config["DEMIX"]["AUTO_ADAPT"] else "",
+ relaxed_mrca = "--relaxedmrca " if config["DEMIX"]["RELAXED_MRCA"] else "",
+ relaxed_mrca_thresh = config["DEMIX"]["RELAXED_MRCA_THRESH"],
+ solver = config["DEMIX"]["SOLVER"],
output:
- demix_file = OUTDIR/"demixing"/"{sample}/{sample}_demixed.tsv"
+ demix_file = OUTDIR/"demixing"/"samples"/"{sample}/{sample}_demixed.tsv"
log:
LOGDIR / "demix" / "{sample}.log.txt"
shell:
- """
- freyja demix \
- "{input.variants_file}" \
- "{input.depth_file}" \
- --eps {params.minimum_abundance} \
- --covcut {params.coverage_cutoff} \
- --output {output.demix_file} >{log} 2>&1
- """
+ "freyja demix "
+ "{input.variants_file:q} "
+ "{input.depth_file:q} "
+ "--barcodes {input.barcodes:q} "
+ "--meta {input.curated_lineages:q} "
+ "--lineageyml {input.lineage_yml:q} "
+ "--eps {params.minimum_abundance} "
+ "--covcut {params.coverage_cutoff} "
+ "--depthcutoff {params.depth_cutoff} "
+ "{params.confirmed_only}"
+ "{params.auto_adapt}"
+ "{params.relaxed_mrca}"
+ "--relaxedthresh {params.relaxed_mrca_thresh} "
+ "--solver {params.solver} "
+ "--output {output.demix_file} "
+ "--max-solver-threads {threads} "
+ ">{log} 2>&1"
-rule summarise_demixing:
+rule summarise_demix:
threads: 1
conda: "../envs/renv.yaml"
shadow: "shallow"
input:
- tables = expand(OUTDIR/"demixing"/"{sample}/{sample}_demixed.tsv", sample=iter_samples())
+ tables = expand(OUTDIR/"demixing"/"samples"/"{sample}/{sample}_demixed.tsv", sample=iter_samples())
output:
- summary_df = report(OUTDIR/"summary_freyja_demixing.csv")
+ summary_df = report(OUTDIR/"demixing"/"summary.csv")
log:
- LOGDIR / "summarise_demixing" / "log.txt"
+ LOGDIR / "summarise_demix" / "log.txt"
script:
- "../scripts/summary_demixing.R"
+ "../scripts/summarise_demix.R"
diff --git a/workflow/rules/distances.smk b/workflow/rules/distances.smk
index 4161df9..11fae38 100644
--- a/workflow/rules/distances.smk
+++ b/workflow/rules/distances.smk
@@ -1,17 +1,52 @@
-rule weighted_distances:
- threads: 1
+rule extract_afwdist_variants:
conda: "../envs/biopython.yaml"
params:
- samples = expand("{sample}", sample = iter_samples()),
- mask_class = ["mask"]
+ sample_col = "SAMPLE",
+ position_col = "POS",
+ sequence_col = "ALT",
+ frequency_col = "ALT_FREQ",
+ mask_class = ["mask"],
input:
- tsv = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
- vcf = lambda wildcards: select_problematic_vcf(),
+ variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+ mask_vcf = lambda wildcards: select_problematic_vcf(),
ancestor = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
- reference = OUTDIR/"reference.fasta"
+ reference = OUTDIR/"reference.fasta",
output:
- distances = REPORT_DIR_TABLES/f"figure_8.csv"
+ variants = temp(OUTDIR/f"{OUTPUT_NAME}.variants.afwdist.csv"),
log:
- LOGDIR / "weighted_distances" / "log.txt"
+ LOGDIR/"extract_afwdist_variants"/"log.txt"
script:
- "../scripts/weighted_distances.py"
+ "../scripts/extract_afwdist_variants.py"
+
+
+rule afwdist_weighted_distances:
+ conda: "../envs/afwdist.yaml"
+ params:
+ extra_args = "",
+ input:
+ variants = OUTDIR/f"{OUTPUT_NAME}.variants.afwdist.csv",
+ reference = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
+ output:
+ distances = temp(OUTDIR/f"{OUTPUT_NAME}.distances.raw.csv"),
+ log:
+ LOGDIR/"afwdist_weighted_distances"/"log.txt"
+ shell:
+ "afwdist "
+ "-i {input.variants:q} "
+ "-r {input.reference:q} "
+ "-o {output.distances:q} "
+ "{params.extra_args} >{log:q} 2>&1"
+
+
+rule format_afwdist_results:
+ conda: "../envs/biopython.yaml"
+ params:
+ samples = sorted(iter_samples()) + [config["ALIGNMENT_REFERENCE"]],
+ input:
+ distances = OUTDIR/f"{OUTPUT_NAME}.distances.raw.csv",
+ output:
+ distances = OUTDIR/f"{OUTPUT_NAME}.distances.csv",
+ log:
+ LOGDIR/"format_afwdist_results"/"log.txt"
+ script:
+ "../scripts/format_afwdist_results.py"
diff --git a/workflow/rules/evolution.smk b/workflow/rules/evolution.smk
index e89d358..0380940 100644
--- a/workflow/rules/evolution.smk
+++ b/workflow/rules/evolution.smk
@@ -1,14 +1,45 @@
-rule N_S_sites:
+rule filter_genbank_features:
threads: 1
conda: "../envs/biopython.yaml"
+ params:
+ included = config.get("GB_FEATURES", {}).get("INCLUDE", {}),
+ excluded = config.get("GB_FEATURES", {}).get("EXCLUDE", {}),
input:
- fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
gb = OUTDIR/"reference.gb",
- features = Path(config["FEATURES_JSON"]).resolve(),
- genetic_code = Path(config["GENETIC_CODE_JSON"]).resolve()
output:
- csv = temp(OUTDIR/f"{OUTPUT_NAME}.ancestor.N_S.sites.csv")
+ gb = OUTDIR/"reference.cds.gb",
+ log:
+ LOGDIR / "filter_genbank_features" / "log.txt"
+ script:
+ "../scripts/filter_genbank_features.py"
+
+
+rule n_s_sites:
+ threads: 1
+ conda: "../envs/biopython.yaml"
+ params:
+ gb_qualifier_display = "gene",
+ input:
+ fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
+ gb = OUTDIR/"reference.cds.gb",
+ genetic_code = Path(config["GENETIC_CODE_JSON"]).resolve(),
+ output:
+ csv = temp(OUTDIR/f"{OUTPUT_NAME}.ancestor.n_s.sites.csv"),
+ log:
+ LOGDIR / "n_s_sites" / "log.txt"
+ script:
+ "../scripts/n_s_sites_from_fasta.py"
+
+
+rule calculate_dnds:
+ conda: "../envs/renv.yaml"
+ input:
+ n_s_sites = OUTDIR/f"{OUTPUT_NAME}.ancestor.n_s.sites.csv",
+ variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+ metadata = config["METADATA"]
+ output:
+ table = OUTDIR/f"{OUTPUT_NAME}.dnds.csv",
log:
- LOGDIR / "N_S_sites" / "log.txt"
+ LOGDIR / "calculate_dnds" / "log.txt"
script:
- "../scripts/N_S_sites_from_fasta.py"
+ "../scripts/calculate_dnds.R"
diff --git a/workflow/rules/fetch.smk b/workflow/rules/fetch.smk
index 9f290be..3e4860c 100644
--- a/workflow/rules/fetch.smk
+++ b/workflow/rules/fetch.smk
@@ -66,3 +66,8 @@ rule fetch_problematic_vcf:
select_problematic_vcf()
shell:
"curl {params.url} -o {output} -s 2> {log}"
+
+
+rule create_empty_file:
+ output:
+ temp(touch(OUTDIR/"empty.txt"))
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
index df45ea1..77aa6aa 100644
--- a/workflow/rules/report.smk
+++ b/workflow/rules/report.smk
@@ -1,160 +1,395 @@
-rule heatmap:
+rule demix_plot_data:
conda: "../envs/renv.yaml"
input:
- vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
+ summary_demixing = OUTDIR/"demixing"/"summary.csv",
metadata = config["METADATA"]
output:
- table = report(REPORT_DIR_TABLES/"figure_10.csv")
+ data = REPORT_DIR_TABLES/"demix.csv"
log:
- LOGDIR / "heatmap" / "log.txt"
+ LOGDIR / "demix_plot_data" / "log.txt"
script:
- "../scripts/report/heatmap.R"
+ "../scripts/report/demix_plot_data.R"
-rule window:
- conda: "../envs/biopython.yaml"
+rule demix_plot:
+ conda: "../envs/renv.yaml"
params:
- window = config["WINDOW"]["WIDTH"],
- step = config["WINDOW"]["STEP"]
+ design = config["PLOTS"],
+ plot_width_mm = 159.2,
+ plot_height_mm = 119.4,
input:
- vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
- gb = OUTDIR/"reference.gb",
- features = config["FEATURES_JSON"]
+ data = REPORT_DIR_TABLES/"demix.csv"
output:
- window_df = temp(OUTDIR/f"{OUTPUT_NAME}.window.csv"),
+ plot = report(REPORT_DIR_PLOTS/"demix.png")
log:
- LOGDIR / "window" / "log.txt"
+ LOGDIR / "demix_plot" / "log.txt"
script:
- "../scripts/window.py"
+ "../scripts/report/demix_plot.R"
-rule diversity:
+rule diversity_data:
threads: 4
conda: "../envs/renv.yaml"
params:
- design = config["PLOTS"],
bootstrap_reps = config["DIVERSITY_REPS"],
- plot_width = 159.2,
- plot_height = 119.4
+ aln_reference = config["ALIGNMENT_REFERENCE"],
+ seed = 7291,
input:
study_fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.masked.fasta",
- context_fasta = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.masked.fasta"
+ context_fasta = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.masked.fasta",
+ output:
+ divs = REPORT_DIR_TABLES/"diversity.txt",
+ json = REPORT_DIR_TABLES/"diversity.json",
+ log:
+ LOGDIR / "diversity_data" / "log.txt"
+ script:
+ "../scripts/report/diversity_data.R"
+
+
+rule diversity_plot:
+ threads: 1
+ conda: "../envs/renv.yaml"
+ params:
+ design = config["PLOTS"],
+ plot_width_mm = 159.2,
+ plot_height_mm = 119.4,
+ input:
+ divs = REPORT_DIR_TABLES/"diversity.txt",
+ json = REPORT_DIR_TABLES/"diversity.json",
output:
- fig = report(REPORT_DIR_PLOTS/"figure_3.png"),
- json = temp(OUTDIR/"diversity.json"),
- table = REPORT_DIR_TABLES/"figure_3.csv"
+ plot = report(REPORT_DIR_PLOTS/"diversity.png"),
log:
- LOGDIR / "diversity" / "log.txt"
+ LOGDIR / "diversity_plot" / "log.txt"
script:
"../scripts/report/diversity_plot.R"
-rule freyja_plot:
+rule extract_genbank_regions:
+ conda: "../envs/biopython.yaml"
+ params:
+ gb_qualifier = "gene",
+ input:
+ gb = OUTDIR/"reference.cds.gb",
+ output:
+ regions = temp(REPORT_DIR_TABLES/"genbank_regions.json"),
+ log:
+ LOGDIR / "extract_genbank_regions" / "log.txt"
+ script:
+ "../scripts/report/extract_genbank_regions.py"
+
+
+rule polymorphic_sites_over_time_data:
conda: "../envs/renv.yaml"
params:
- design = config["PLOTS"]
+ max_alt_freq = 1.0 - config["VC"]["MIN_FREQ"],
input:
- summary_demixing = OUTDIR/"summary_freyja_demixing.csv",
- metadata = config["METADATA"]
+ variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+ metadata = config["METADATA"],
output:
- fig = report(REPORT_DIR_PLOTS/"figure_1.png"),
- table = report(REPORT_DIR_TABLES/"figure_1.csv")
+        table = REPORT_DIR_TABLES/"polymorphic_sites_over_time.csv",
+ json = temp(REPORT_DIR_TABLES/"polymorphic_sites_over_time.json"),
log:
- LOGDIR / "freyja_plot" / "log.txt"
+ LOGDIR / "polymorphic_sites_over_time_data" / "log.txt"
script:
- "../scripts/report/freyja_plot.R"
+ "../scripts/report/polymorphic_sites_over_time_data.R"
-rule general_NV_description:
+rule polymorphic_sites_over_time_plot:
conda: "../envs/renv.yaml"
params:
- samples = expand("{sample}", sample = iter_samples()),
design = config["PLOTS"],
- regions = config["PLOT_GENOME_REGIONS"],
+ plot_width_mm = 159.2,
+ plot_height_mm = 119.4,
+ input:
+ table = REPORT_DIR_TABLES/"polymorphic_sites_over_time.csv",
+ output:
+ plot = report(REPORT_DIR_PLOTS/"polymorphic_sites_over_time.png"),
+ log:
+ LOGDIR / "polymorphic_sites_over_time_plot" / "log.txt"
+ script:
+ "../scripts/report/polymorphic_sites_over_time_plot.R"
+
+
+rule window_data:
+ conda: "../envs/biopython.yaml"
+ params:
window = config["WINDOW"]["WIDTH"],
step = config["WINDOW"]["STEP"],
- max_alt_freq = 1.0 - config["VC"]["IVAR_FREQ"]
+ features = config.get("GB_FEATURES", {}),
+ gb_qualifier_display = "gene"
input:
- window = OUTDIR/f"{OUTPUT_NAME}.window.csv",
- vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
- metadata = config["METADATA"]
+ variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+ gb = OUTDIR/"reference.gb",
output:
- fig = report(REPORT_DIR_PLOTS/"figure_5a.png"),
- fig_s = report(REPORT_DIR_PLOTS/"figure_5b.png"),
- fig_cor = report(REPORT_DIR_PLOTS/"figure_4.png"),
- json = temp(OUTDIR/"summary_nv.json"),
- table_1 = report(REPORT_DIR_TABLES/"figure_5a.csv"),
- table_2 = report(REPORT_DIR_TABLES/"figure_5b.csv"),
- table_3 = report(REPORT_DIR_TABLES/"figure_4.csv")
+ window_df = REPORT_DIR_TABLES/"window.csv",
+ json = temp(REPORT_DIR_TABLES/"window.json"),
log:
- LOGDIR / "general_NV_description" / "log.txt"
+ LOGDIR / "window_data" / "log.txt"
script:
- "../scripts/report/NV_description.R"
+ "../scripts/report/window_data.py"
-rule phylo_plots:
+rule nv_panel_data:
+ conda: "../envs/renv.yaml"
+ input:
+ variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+ metadata = config["METADATA"],
+ output:
+ table = REPORT_DIR_TABLES/"nv_panel.csv",
+ json = temp(REPORT_DIR_TABLES/"nv_panel.json"),
+ log:
+ LOGDIR / "nv_panel_data" / "log.txt"
+ script:
+ "../scripts/report/nv_panel_data.R"
+
+
+rule nv_panel_zoom_on_feature_data:
+ input:
+ table = REPORT_DIR_TABLES/"nv_panel.csv",
+ regions = REPORT_DIR_TABLES/"genbank_regions.json",
+ output:
+ table = temp(REPORT_DIR_TABLES/"nv_panel.{region_name}.csv"),
+ log:
+ LOGDIR / "nv_panel_zoom_on_feature_data" / "{region_name}.log.txt"
+ script:
+ "../scripts/report/nv_panel_zoom_on_feature_data.py"
+
+
+rule window_zoom_on_feature_data:
+ input:
+ table = REPORT_DIR_TABLES/"window.csv",
+ regions = REPORT_DIR_TABLES/"genbank_regions.json",
+ output:
+ table = temp(REPORT_DIR_TABLES/"window.{region_name}.csv"),
+ log:
+ LOGDIR / "window_zoom_on_feature_data" / "{region_name}.log.txt"
+ script:
+ "../scripts/report/window_zoom_on_feature_data.py"
+
+
+rule nv_panel_plot:
+ conda: "../envs/renv.yaml"
+ params:
+ design = config["PLOTS"],
+ window_step = config["WINDOW"]["STEP"],
+ plot_height_mm = 250.0,
+ plot_width_mm = 240.0,
+ input:
+ panel = REPORT_DIR_TABLES/"nv_panel.csv",
+ window = REPORT_DIR_TABLES/"window.csv",
+ regions = REPORT_DIR_TABLES/"genbank_regions.json",
+ highlight_window_regions = config["PLOT_GENOME_REGIONS"],
+ output:
+ plot = report(REPORT_DIR_PLOTS/"nv_panel.png"),
+ log:
+ LOGDIR / "nv_panel_plot" / "log.txt"
+ script:
+ "../scripts/report/nv_panel_plot.R"
+
+
+use rule nv_panel_plot as nv_panel_plot_S with:
+ input:
+ panel = REPORT_DIR_TABLES/"nv_panel.S.csv",
+ window = REPORT_DIR_TABLES/"window.S.csv",
+ regions = REPORT_DIR_TABLES/"genbank_regions.json",
+ highlight_window_regions = OUTDIR/"empty.txt",
+ output:
+ plot = report(REPORT_DIR_PLOTS/"nv_panel.S.png"),
+ log:
+ LOGDIR / "nv_panel_plot_S" / "log.txt"
+
+
+rule merge_json_files:
+ input:
+ REPORT_DIR_TABLES/"nv_panel.json",
+ REPORT_DIR_TABLES/"polymorphic_sites_over_time.json",
+ REPORT_DIR_TABLES/"window.json",
+ output:
+ json = REPORT_DIR_TABLES/"nv_panel_summary.json",
+ run:
+ import json
+ result = {}
+ for path in input:
+ with open(path) as f:
+ d = json.load(f)
+ result |= d  # dict union (Python 3.9+): keys from later files override earlier ones
+ with open(output.json, "w") as fw:
+ json.dump(result, fw, indent=2)
+
+
+rule context_phylogeny_data:
conda: "../envs/renv.yaml"
params:
design = config["PLOTS"],
ref_name = config["ALIGNMENT_REFERENCE"],
- boot_th = 95,
- alrt_th = 80,
+ boot_th = config["UFBOOT"]["THRESHOLD"],
+ alrt_th = config["SHALRT"]["THRESHOLD"],
+ input:
+ target_fasta = OUTDIR/f"{OUTPUT_NAME}.fasta",
+ tree = OUTDIR/f"tree_context/{OUTPUT_NAME}.treefile",
+ output:
+ json = REPORT_DIR_TABLES/"context_phylogeny.json",
+ annotation = REPORT_DIR_TABLES/"context_phylogeny.csv",
+ log:
+ LOGDIR / "context_phylogeny_data" / "log.txt"
+ script:
+ "../scripts/report/context_phylogeny_data.R"
+
+
+rule context_phylogeny_plot:
+ conda: "../envs/renv.yaml"
+ params:
+ design = config["PLOTS"],
plot_height_mm = 119.4,
plot_width_mm = 159.2,
- use_bionj = config["USE_BIONJ"]
+ boot_th = config["UFBOOT"]["THRESHOLD"],
+ alrt_th = config["SHALRT"]["THRESHOLD"],
+ input:
+ tree = OUTDIR/f"tree_context/{OUTPUT_NAME}.treefile",
+ json = REPORT_DIR_TABLES/"context_phylogeny.json",
+ annotation = REPORT_DIR_TABLES/"context_phylogeny.csv"
+ output:
+ plot = report(REPORT_DIR_PLOTS/"context_phylogeny.png"),
+ log:
+ LOGDIR / "context_phylogeny_plot" / "log.txt"
+ script:
+ "../scripts/report/context_phylogeny_plot.R"
+
+
+rule allele_freq_tree_data:
+ conda: "../envs/renv.yaml"
+ params:
+ use_bionj = config["USE_BIONJ"],
+ outgroup_id = config["ALIGNMENT_REFERENCE"],
input:
- dist = REPORT_DIR_TABLES/f"figure_8.csv",
+ dist = OUTDIR/f"{OUTPUT_NAME}.distances.csv",
+ output:
+ tree = REPORT_DIR_TABLES/"allele_freq_tree.nwk",
+ log:
+ LOGDIR / "allele_freq_tree_data" / "log.txt"
+ script:
+ "../scripts/report/allele_freq_tree_data.R"
+
+
+rule allele_freq_tree_plot:
+ conda: "../envs/renv.yaml"
+ params:
+ design = config["PLOTS"],
+ outgroup_id = config["ALIGNMENT_REFERENCE"],
+ plot_height_mm = 119.4,
+ plot_width_mm = 159.2,
+ input:
+ tree = report(REPORT_DIR_TABLES/"allele_freq_tree.nwk"),
study_fasta = OUTDIR/f"{OUTPUT_NAME}.fasta",
- ml = OUTDIR/f"tree_context/{OUTPUT_NAME}.treefile",
- metadata = config["METADATA"]
+ metadata = config["METADATA"],
+ output:
+ plot = report(REPORT_DIR_PLOTS/"allele_freq_tree.png"),
+ log:
+ LOGDIR / "allele_freq_tree_plot" / "log.txt"
+ script:
+ "../scripts/report/allele_freq_tree_plot.R"
+
+
+rule time_signal_data:
+ conda: "../envs/renv.yaml"
+ params:
+ outgroup_id = config["ALIGNMENT_REFERENCE"],
+ input:
+ tree = report(REPORT_DIR_TABLES/"allele_freq_tree.nwk"),
+ metadata = config["METADATA"],
+ output:
+ table = report(REPORT_DIR_TABLES/"time_signal.csv"),
+ json = REPORT_DIR_TABLES/"time_signal.json",
+ log:
+ LOGDIR / "time_signal_data" / "log.txt"
+ script:
+ "../scripts/report/time_signal_data.R"
+
+
+rule time_signal_plot:
+ conda: "../envs/renv.yaml"
+ params:
+ design = config["PLOTS"],
+ plot_height_mm = 119.4,
+ plot_width_mm = 159.2,
+ input:
+ table = report(REPORT_DIR_TABLES/"time_signal.csv"),
output:
- temest = report(REPORT_DIR_PLOTS/"figure_9.png"),
- tree = report(REPORT_DIR_PLOTS/"figure_8.png"),
- tree_ml = report(REPORT_DIR_PLOTS/"figure_2.png"),
- table = report(REPORT_DIR_TABLES/"figure_9.csv"),
- json = temp(OUTDIR/"stats.lm.json")
+ plot = report(REPORT_DIR_PLOTS/"time_signal.png"),
log:
- LOGDIR / "phylo_plots" / "log.txt"
+ LOGDIR / "time_signal_plot" / "log.txt"
script:
- "../scripts/report/phylo_plots.R"
+ "../scripts/report/time_signal_plot.R"
-rule evo_plots:
+rule dnds_plots:
conda: "../envs/renv.yaml"
params:
- design = config["PLOTS"]
+ design = config["PLOTS"],
+ plot_height_mm = 119.4,
+ plot_width_mm = 159.2,
input:
- N_S = OUTDIR/f"{OUTPUT_NAME}.ancestor.N_S.sites.csv",
- vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
- metadata = config["METADATA"]
+ table = OUTDIR/f"{OUTPUT_NAME}.dnds.csv",
output:
- plot = report(REPORT_DIR_PLOTS/"figure_11.png"),
- plot_omega = report(REPORT_DIR_PLOTS/"figure_12.png"),
- table = report(REPORT_DIR_TABLES/"figure_11.csv")
+ plot_dn_ds = report(REPORT_DIR_PLOTS/"dn_and_ds.png"),
+ plot_omega = report(REPORT_DIR_PLOTS/"dnds.png"),
log:
LOGDIR / "evo_plots" / "log.txt"
script:
- "../scripts/report/evo_plots.R"
+ "../scripts/report/dnds_plots.R"
-rule snp_plots:
+rule af_time_correlation_data:
conda: "../envs/renv.yaml"
params:
- design = config["PLOTS"],
cor_method = config["COR"]["METHOD"],
- cor_exact = config["COR"]["EXACT"]
+ cor_exact = config["COR"]["EXACT"],
input:
- vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
- metadata = config["METADATA"]
+ variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+ metadata = config["METADATA"],
output:
- pseudovolcano = report(REPORT_DIR_PLOTS/"figure_6.png"),
- snp_panel = report(REPORT_DIR_PLOTS/"figure_7.png"),
- table_1 = report(REPORT_DIR_TABLES/"figure_6.csv"),
- table_2 = report(REPORT_DIR_TABLES/"figure_7.csv")
+ fmt_variants = temp(REPORT_DIR_TABLES/"variants.filled.dated.tsv"),
+ correlations = report(REPORT_DIR_TABLES/"af_time_correlation.csv"),
+ subset = REPORT_DIR_TABLES/"af_time_correlation.subset.txt",
log:
- LOGDIR / "snp_plots" / "log.txt"
+ LOGDIR / "af_time_correlation_data" / "log.txt"
script:
- "../scripts/report/snp_plots.R"
+ "../scripts/report/af_time_correlation_data.R"
+
+
+rule af_time_correlation_plot:
+ conda: "../envs/renv.yaml"
+ params:
+ design = config["PLOTS"],
+ plot_height_mm = 119.4,
+ plot_width_mm = 159.2,
+ input:
+ correlations = REPORT_DIR_TABLES/"af_time_correlation.csv",
+ output:
+ plot = report(REPORT_DIR_PLOTS/"af_time_correlation.png"),
+ log:
+ LOGDIR / "af_time_correlation_plot" / "log.txt"
+ script:
+ "../scripts/report/af_time_correlation_plot.R"
+
+
+rule af_trajectory_panel_plot:
+ conda: "../envs/renv.yaml"
+ params:
+ design = config["PLOTS"],
+ n_plot_columns = 3,
+ plot_row_height_mm = 35,
+ plot_width_mm = 159.2,
+ random_color_seed = 7291,
+ input:
+ fmt_variants = REPORT_DIR_TABLES/"variants.filled.dated.tsv",
+ subset = REPORT_DIR_TABLES/"af_time_correlation.subset.txt"
+ output:
+ plot = report(REPORT_DIR_PLOTS/"af_trajectory_panel.png"),
+ log:
+ LOGDIR / "af_trajectory_panel_plot" / "log.txt"
+ script:
+ "../scripts/report/af_trajectory_panel_plot.R"
rule summary_table:
@@ -163,7 +398,7 @@ rule summary_table:
report = report(OUTDIR/f"{OUTPUT_NAME}.lineage_report.csv"),
metadata = config["METADATA"]
output:
- table = temp(OUTDIR/"summary_table.csv")
+ table = REPORT_DIR_TABLES/"summary_table.csv"
log:
LOGDIR / "summary_table" / "log.txt"
script:
@@ -172,67 +407,77 @@ rule summary_table:
rule report:
conda: "../envs/quarto_render.yaml"
- shadow: "shallow"
+ shadow: "minimal"
input:
qmd = Path(config["REPORT_QMD"]).resolve(),
- freyja = report(REPORT_DIR_PLOTS/"figure_1.png"),
- tree_ml = report(REPORT_DIR_PLOTS/"figure_2.png"),
- diversity = report(REPORT_DIR_PLOTS/"figure_3.png"),
- fig_cor = report(REPORT_DIR_PLOTS/"figure_4.png"),
- SNV = report(REPORT_DIR_PLOTS/"figure_5a.png"),
- SNV_spike = report(REPORT_DIR_PLOTS/"figure_5b.png"),
- volcano = report(REPORT_DIR_PLOTS/"figure_6.png"),
- panel = report(REPORT_DIR_PLOTS/"figure_7.png"),
- tree = report(REPORT_DIR_PLOTS/"figure_8.png"),
- temest = report(REPORT_DIR_PLOTS/"figure_9.png"),
- heat_table = report(REPORT_DIR_TABLES/"figure_10.csv"),
- evo = report(REPORT_DIR_PLOTS/"figure_11.png"),
- omega_plot = report(REPORT_DIR_PLOTS/"figure_12.png"),
- value = OUTDIR/"diversity.json",
- stats_lm = OUTDIR/"stats.lm.json",
- table = OUTDIR/"summary_table.csv",
- sum_nv = OUTDIR/"summary_nv.json"
+ css = Path(config["REPORT_CSS"]).resolve(),
+ demix = report(REPORT_DIR_PLOTS/"demix.png"),
+ tree_ml = report(REPORT_DIR_PLOTS/"context_phylogeny.png"),
+ diversity = report(REPORT_DIR_PLOTS/"diversity.png"),
+ fig_cor = report(REPORT_DIR_PLOTS/"polymorphic_sites_over_time.png"),
+ SNV = report(REPORT_DIR_PLOTS/"nv_panel.png"),
+ SNV_spike = report(REPORT_DIR_PLOTS/"nv_panel.S.png"),
+ volcano = report(REPORT_DIR_PLOTS/"af_time_correlation.png"),
+ panel = report(REPORT_DIR_PLOTS/"af_trajectory_panel.png"),
+ tree = report(REPORT_DIR_PLOTS/"allele_freq_tree.png"),
+ temest = report(REPORT_DIR_PLOTS/"time_signal.png"),
+ evo = report(REPORT_DIR_PLOTS/"dn_and_ds.png"),
+ omega_plot = report(REPORT_DIR_PLOTS/"dnds.png"),
+ heat_table = report(OUTDIR/"vaf"/"pairwise_trajectory_correlation.csv"),
+ freyja_ts = OUTDIR/"demixing"/"freyja_data"/"last_barcode_update.txt",
+ value = REPORT_DIR_TABLES/"diversity.json",
+ stats_lm = REPORT_DIR_TABLES/"time_signal.json",
+ stats_ml = REPORT_DIR_TABLES/"context_phylogeny.json",
+ table = REPORT_DIR_TABLES/"summary_table.csv",
+ sum_nv = REPORT_DIR_TABLES/"nv_panel_summary.json",
params:
workflow_version = get_repo_version(BASE_PATH.as_posix(), __version__),
- min_ivar_freq = config["VC"]["IVAR_FREQ"],
- ufboot_reps = config["UFBOOT_REPS"],
- shalrt_reps = config["SHALRT_REPS"],
+ min_ivar_freq = config["VC"]["MIN_FREQ"],
+ ufboot_reps = config["UFBOOT"]["REPS"],
+ shalrt_reps = config["SHALRT"]["REPS"],
name = config["OUTPUT_NAME"],
use_bionj = config["USE_BIONJ"],
- cor_method = config["COR"]["METHOD"]
+ cor_method = config["COR"]["METHOD"],
output:
html = report(OUTDIR/f"{OUTPUT_NAME}.report.html")
log:
LOGDIR / "report" / "log.txt"
shell:
- "set +o pipefail; "
- "Rscript -e 'library(quarto)' "
- "-e \"quarto_render("
- "input = '{input.qmd}', "
- "execute_params=list("
- "ufboot_reps='{params.ufboot_reps}', "
- "shalrt_reps='{params.shalrt_reps}', "
- "min_ivar_freq='{params.min_ivar_freq}', "
- "workflow_version='{params.workflow_version}', "
- "use_bionj='{params.use_bionj}', "
- "cor_method='{params.cor_method}', "
- "div='{input.diversity}', "
- "freyja ='{input.freyja}', "
- "tree = '{input.tree}', "
- "tempest = '{input.temest}', "
- "SNV = '{input.SNV}', "
- "SNV_s = '{input.SNV_spike}', "
- "evo = '{input.evo}', "
- "div_value = '{input.value}', "
- "panel = '{input.panel}', "
- "volcano = '{input.volcano}', "
- "tree_ml = '{input.tree_ml}', "
- "fig_cor_snp = '{input.fig_cor}', "
- "stats_lm = '{input.stats_lm}', "
- "table = '{input.table}', "
- "sum_nv = '{input.sum_nv}', "
- "heat_tab = '{input.heat_table}', "
- "omega_plot = '{input.omega_plot}', "
- "name = '{params.name}'))\" "
- ">{log} 2>&1 && "
- 'mv "$(dirname {input.qmd:q})/report.html" {output.html:q}'
+ """
+ set +o pipefail
+ exec >{log:q} 2>&1
+
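+ # Build a flat YAML params file for quarto render; values are written as
+ # single-quoted scalars, so the interpolated params/paths must not contain "'"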
+ printf "%s\n" \
+ "ufboot_reps: '{params.ufboot_reps}'" \
+ "shalrt_reps: '{params.shalrt_reps}'" \
+ "min_ivar_freq: '{params.min_ivar_freq}'" \
+ "workflow_version: '{params.workflow_version}'" \
+ "use_bionj: '{params.use_bionj}'" \
+ "cor_method: '{params.cor_method}'" \
+ "div: '{input.diversity}'" \
+ "demix: '{input.demix}'" \
+ "tree: '{input.tree}'" \
+ "tempest: '{input.temest}'" \
+ "SNV: '{input.SNV}'" \
+ "SNV_s: '{input.SNV_spike}'" \
+ "evo: '{input.evo}'" \
+ "div_value: '{input.value}'" \
+ "panel: '{input.panel}'" \
+ "volcano: '{input.volcano}'" \
+ "tree_ml: '{input.tree_ml}'" \
+ "fig_cor_snp: '{input.fig_cor}'" \
+ "stats_lm: '{input.stats_lm}'" \
+ "stats_ml: '{input.stats_ml}'" \
+ "table: '{input.table}'" \
+ "sum_nv: '{input.sum_nv}'" \
+ "heat_tab: '{input.heat_table}'" \
+ "omega_plot: '{input.omega_plot}'" \
+ "freyja_ts: '{input.freyja_ts}'" \
+ "name: '{params.name}'" \
+ >params.yaml
+
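+ # Inject the stylesheet path; the qmd template is expected to contain the
+ # literal token __CSSPLACEHOLDER__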
+ sed "s|__CSSPLACEHOLDER__|{input.css}|g" {input.qmd:q} >report.qmd
+
+ quarto render report.qmd --execute-params params.yaml
+ mv report.html {output.html:q}
+ """
diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk
index bd16219..2a2560b 100644
--- a/workflow/rules/vaf.smk
+++ b/workflow/rules/vaf.smk
@@ -1,20 +1,20 @@
rule snps_to_ancestor:
- threads: 1
+ threads: 2
retries: 3
shadow: "minimal"
conda: "../envs/var_calling.yaml"
params:
- max_depth = config["VC"]["MAX_DEPTH"],
- min_quality = config["VC"]["MIN_QUALITY"],
- ivar_quality = config["VC"]["IVAR_QUALITY"],
- ivar_freq = config["VC"]["IVAR_FREQ"],
- ivar_depth = config["VC"]["IVAR_DEPTH"]
+ mpileup_depth = config["VC"]["MAX_DEPTH"],
+ mpileup_quality = 0,
+ ivar_quality = config["VC"]["MIN_QUALITY"],
+ ivar_freq = config["VC"]["MIN_FREQ"],
+ ivar_depth = config["VC"]["MIN_DEPTH"],
input:
reference_fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
bam = get_input_bam,
gff = OUTDIR/"reference.gff3"
output:
- tsv = temp(OUTDIR/"{sample}.tsv")
+ tsv = temp(OUTDIR/"vaf"/"{sample}.tsv")
log:
LOGDIR / "snps_to_ancestor" / "{sample}.log.txt"
shell:
@@ -35,58 +35,20 @@ rule snps_to_ancestor:
samtools mpileup \
-aa \
--ignore-overlaps \
- -d {params.max_depth} \
+ -d {params.mpileup_depth} \
--count-orphans \
--no-BAQ \
- -Q {params.min_quality} \
+ -Q {params.mpileup_quality} \
-f renamed_reference.fasta \
{input.bam} \
| ivar variants \
- -p {wildcards.sample} \
+ -p variants \
-q {params.ivar_quality} \
-t {params.ivar_freq} \
-m {params.ivar_depth} \
-g {input.gff} \
-r renamed_reference.fasta
-
- sed 's/'$ref'/'{wildcards.sample}'/g' {wildcards.sample}.tsv | cat > {output.tsv}
- """
-
-
-rule annotation:
- threads:1
- conda: "../envs/biopython.yaml"
- shadow: "shallow"
- input:
- gb = OUTDIR/"reference.gb",
- ref = OUTDIR/"reference.fasta",
- features = Path(config["FEATURES_JSON"]).resolve()
- output:
- df = temp(OUTDIR/"annotation.csv")
- log:
- LOGDIR / "annotation" / "log.txt"
- script:
- "../scripts/report/get_annotation.py"
-
-
-rule format_tsv:
- threads:1
- shadow: "shallow"
- input:
- expand(OUTDIR/"{sample}.tsv", sample = iter_samples())
- output:
- tsv = OUTDIR/f"{OUTPUT_NAME}.tsv"
- log:
- LOGDIR / "format_tsv" / "log.txt"
- shell:
- """
- path=`echo {input} | awk '{{print $1}}'`
- grep "^REGION" "$path" > header
- for tsv in {input}; do
- tail -n +2 "$tsv" >> body
- done
- cat header body > "{output.tsv}"
- rm header
+ mv variants.tsv {output.tsv:q}
"""
@@ -96,12 +58,12 @@ rule mask_tsv:
params:
mask_class = ["mask"]
input:
- tsv = OUTDIR/f"{OUTPUT_NAME}.tsv",
+ tsv = OUTDIR/"vaf"/"{sample}.tsv",
vcf = lambda wildcards: select_problematic_vcf()
output:
- masked_tsv = temp(OUTDIR/f"{OUTPUT_NAME}.masked.tsv")
+ masked_tsv = temp(OUTDIR/"vaf"/"{sample}.masked.tsv")
log:
- LOGDIR / "mask_tsv" / "log.txt"
+ LOGDIR / "mask_tsv" / "{sample}.log.txt"
script:
"../scripts/mask_tsv.py"
@@ -109,13 +71,16 @@ rule mask_tsv:
rule filter_tsv:
threads: 1
conda: "../envs/renv.yaml"
+ params:
+ min_depth = 20,
+ min_alt_rv = 2,
+ min_alt_dp = 2,
input:
- tsv = OUTDIR/f"{OUTPUT_NAME}.masked.tsv",
- annotation = OUTDIR/"annotation.csv"
+ tsv = OUTDIR/"vaf"/"{sample}.masked.tsv"
output:
- filtered_tsv = temp(OUTDIR/f"{OUTPUT_NAME}.masked.prefiltered.tsv")
+ filtered_tsv = temp(OUTDIR/"vaf"/"{sample}.masked.prefiltered.tsv")
log:
- LOGDIR / "filter_tsv" / "log.txt"
+ LOGDIR / "filter_tsv" / "{sample}.log.txt"
script:
"../scripts/filter_tsv.R"
@@ -123,12 +88,14 @@ rule filter_tsv:
rule tsv_to_vcf:
threads: 1
conda: "../envs/biopython.yaml"
+ params:
+ ref_name = config["ALIGNMENT_REFERENCE"],
input:
- tsv = OUTDIR/f"{OUTPUT_NAME}.masked.prefiltered.tsv",
+ tsv = OUTDIR/"vaf"/"{sample}.masked.prefiltered.tsv",
output:
- vcf = temp(OUTDIR/f"{OUTPUT_NAME}.vcf")
+ vcf = temp(OUTDIR/"vaf"/"{sample}.vcf")
log:
- LOGDIR / "tsv_to_vcf" / "log.txt"
+ LOGDIR / "tsv_to_vcf" / "{sample}.log.txt"
script:
"../scripts/tsv_to_vcf.py"
@@ -141,11 +108,11 @@ rule variants_effect:
ref_name = config["ALIGNMENT_REFERENCE"],
snpeff_data_dir = (BASE_PATH / "config" / "snpeff").resolve()
input:
- vcf = OUTDIR/f"{OUTPUT_NAME}.vcf"
+ vcf = OUTDIR/"vaf"/"{sample}.vcf"
output:
- ann_vcf = temp(OUTDIR/f"{OUTPUT_NAME}.annotated.vcf")
+ ann_vcf = OUTDIR/"vaf"/"{sample}.annotated.vcf"
log:
- LOGDIR / "variants_effect" / "log.txt"
+ LOGDIR / "variants_effect" / "{sample}.log.txt"
retries: 2
shell:
"""
@@ -159,19 +126,93 @@ rule variants_effect:
echo "Local database not found at '{params.snpeff_data_dir}', downloading from repository"
fi
- snpEff eff -dataDir {params.snpeff_data_dir} -noStats {params.ref_name} {input.vcf} > {output.ann_vcf}
+ snpEff eff -dataDir {params.snpeff_data_dir} -noStats {params.ref_name} {input.vcf} >{output.ann_vcf}
"""
-rule vcf_to_tsv:
+rule extract_vcf_fields:
threads: 1
+ conda: "../envs/snpeff.yaml"
+ params:
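+ # single-quote each field name so bracketed names like ANN[*].EFFECT pass
+ # through the shell unglobbed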
+ extract_columns = [f"'{col}'" for col in config["ANNOTATION"]["SNPEFF_COLS"].values()],
+ sep = ",",
+ input:
+ vcf = OUTDIR/"vaf"/"{sample}.annotated.vcf"
+ output:
+ tsv = OUTDIR/"vaf"/"{sample}.vcf_fields.tsv"
+ log:
+ LOGDIR / "tsv_to_vcf" / "{sample}.log.txt"
+ shell:
+ "SnpSift extractFields -e 'NA' -s {params.sep:q} {input.vcf:q} {params.extract_columns} >{output.tsv:q} 2>{log:q}"
+
+
+rule format_vcf_fields_longer:
+ conda: "../envs/renv.yaml"
+ params:
+ sample = "{sample}",
+ colnames_mapping = config["ANNOTATION"]["SNPEFF_COLS"],
+ filter_include = config["ANNOTATION"]["FILTER_INCLUDE"],
+ filter_exclude = config["ANNOTATION"]["FILTER_EXCLUDE"],
+ variant_name_pattern = lambda wildcards: config["ANNOTATION"]["VARIANT_NAME_PATTERN"], # lambda to deactivate automatic wildcard expansion in pattern
+ sep = ",",
+ input:
+ tsv = OUTDIR/"vaf"/"{sample}.vcf_fields.tsv",
+ output:
+ tsv = OUTDIR/"vaf"/"{sample}.vcf_fields.longer.tsv",
+ log:
+ LOGDIR / "format_vcf_fields_longer" / "{sample}.log.txt"
+ script:
+ "../scripts/format_vcf_fields_longer.R"
+
+
+rule concat_vcf_fields:
+ params:
+ sep = "\t",
+ input:
+ expand(OUTDIR/"vaf"/"{sample}.vcf_fields.longer.tsv", sample=iter_samples()),
+ output:
+ OUTDIR/f"{OUTPUT_NAME}.vcf_fields.longer.tsv",
+ run:
+ import pandas as pd
+ # Row-concatenate all per-sample tables in a single pass
+ pd.concat(
+ (pd.read_csv(path, sep=params.sep) for path in input),
+ ignore_index=True
+ ).to_csv(output[0], sep=params.sep, index=False)
+
+
+rule merge_annotation:
+ threads: 1
+ conda: "../envs/renv.yaml"
+ params:
+ sample = "{sample}",
+ ref_name = config["ALIGNMENT_REFERENCE"],
+ input:
+ tsv = OUTDIR/"vaf"/"{sample}.masked.prefiltered.tsv",
+ annot = OUTDIR/"vaf"/"{sample}.vcf_fields.longer.tsv",
+ output:
+ tsv = OUTDIR/"vaf"/"{sample}.variants.tsv"
+ log:
+ LOGDIR / "merge_annotation" / "{sample}.log.txt"
+ script:
+ "../scripts/merge_annotation.R"
+
+
+use rule concat_vcf_fields as concat_variants with:
+ input:
+ expand(OUTDIR/"vaf"/"{sample}.variants.tsv", sample=iter_samples()),
+ output:
+ OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+
+
+rule pairwise_trajectory_correlation:
conda: "../envs/renv.yaml"
input:
- ann_vcf = OUTDIR/f"{OUTPUT_NAME}.annotated.vcf",
- pre_tsv = OUTDIR/f"{OUTPUT_NAME}.masked.prefiltered.tsv"
+ variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+ metadata = config["METADATA"],
output:
- tsv = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv"
+ table = report(OUTDIR/"vaf"/"pairwise_trajectory_correlation.csv"),
log:
- LOGDIR / "vcf_to_tsv" / "log.txt"
+ LOGDIR / "pairwise_trajectory_correlation" / "log.txt"
script:
- "../scripts/vcf_to_tsv.R"
+ "../scripts/pairwise_trajectory_correlation.R"
diff --git a/workflow/scripts/N_S_sites_from_fasta.py b/workflow/scripts/N_S_sites_from_fasta.py
deleted file mode 100644
index e98ac94..0000000
--- a/workflow/scripts/N_S_sites_from_fasta.py
+++ /dev/null
@@ -1,92 +0,0 @@
-#!/usr/bin/env python3
-
-import logging
-import json
-import itertools as it
-import pandas as pd
-from gb2seq.alignment import Gb2Alignment
-from gb2seq.features import Features
-from dark.fasta import FastaReads
-
-
-def split_into_codons(seq: str) -> list:
- """Split the complete CDS feature in to a list of codons"""
- codons = [seq[i:i + 3] for i in range(0, len(seq), 3) if "N" not in seq[i:i + 3]]
- return codons
-
-
-def calculate_potential_changes(genetic_code: dict) -> dict:
- """Generate a dictionary with S and N pre-calculated for all possible codons"""
- # Initialize structure
- nts = set(["A", "G", "T", "C"])
- potential_changes = {"S": {}, "N": {}}
- for codon in it.product(nts, repeat=3):
- potential_changes["S"]["".join(codon)] = 0.
- potential_changes["N"]["".join(codon)] = 0.
- # Mutate (substitutions) all possible codons in the given genetic code
- # and count proportions of mutations that are synonymous and non-synonmyous
- for codon in genetic_code.keys():
- for codon_p in range(0, 3):
- nts = ["A", "G", "T", "C"]
- # Do not consider self substitutions, e.g. A->A
- nts.remove(codon[codon_p])
- for nt in nts:
- codon_mutated = list(codon)
- # Mutate the basepair
- codon_mutated[codon_p] = nt
- # Count how many of them are synonymous
- if genetic_code[codon] == genetic_code["".join(codon_mutated)]:
- potential_changes["S"][codon] += 1/3.
- else:
- potential_changes["N"][codon] += 1/3.
- return potential_changes
-
-
-def get_feature_codons(alignment: Gb2Alignment, annotation: list) -> dict:
- dct = {key:alignment.ntSequences(key)[1].sequence for key in annotation}
- return {key:split_into_codons(item) for key,item in dct.items()}
-
-
-def get_df(codons: dict, genetic_code: dict) -> pd.DataFrame:
- keys = []
- N_sites = []
- S_sites = []
- values = calculate_potential_changes(genetic_code)
- for key, item in codons.items():
- keys.append(key)
- N = sum([values["N"][x] for x in item if x in values["N"].keys()])
- S = sum([values["S"][x] for x in item if x in values["S"].keys()])
- N_sites.append(N)
- S_sites.append(S)
- return pd.DataFrame({"gene": keys, "N": N_sites, "S": S_sites})
-
-
-def main():
-
- logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
-
- logging.info("Reading features")
- with open(snakemake.input.features) as f:
- feature_list = list(json.load(f).keys())
-
- logging.info("Reading genetic code")
- with open(snakemake.input.genetic_code) as f:
- genetic_code = json.load(f)
-
- logging.info("Create alignment object")
- features = Features(snakemake.input.gb)
- seq = list(FastaReads(snakemake.input.fasta))[0]
- aln = Gb2Alignment(seq, features)
-
- logging.info("Splitting ancestral sequence into codons")
- codons_dict = get_feature_codons(aln, feature_list)
-
- logging.info("Calculating synonymous and non synonymous sites")
- df = get_df(codons_dict, genetic_code)
-
- logging.info("Saving results")
- df.to_csv(snakemake.output.csv,index= False)
-
-
-if __name__ == "__main__":
- main()
diff --git a/workflow/scripts/calculate_dnds.R b/workflow/scripts/calculate_dnds.R
new file mode 100644
index 0000000..f545c27
--- /dev/null
+++ b/workflow/scripts/calculate_dnds.R
@@ -0,0 +1,71 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(tidyr)
+library(logger)
+
+log_threshold(INFO)
+
+# Read inputs
+log_info("Reading variants table")
+variants <- read_delim(
+ snakemake@input[["variants"]],
+ col_select = c(
+ "SAMPLE",
+ "VARIANT_NAME",
+ "REGION",
+ "ALT_FREQ",
+ "SYNONYMOUS",
+ "POS"
+ )
+)
+
+log_info("Reading metadata table")
+metadata <- read_delim(snakemake@input[["metadata"]]) %>%
+ mutate(
+ interval = as.numeric(
+ as.Date(CollectionDate) - min(as.Date(CollectionDate))
+ )
+ ) %>%
+ select(ID, interval) %>%
+ rename(SAMPLE = ID)
+
+log_debug("Adding metadata to variants table")
+variants <- left_join(variants, metadata)
+
+log_info("Reading N/S sites")
+n_s_position <- read_delim(snakemake@input[["n_s_sites"]])
+
+log_info("Computing dN/dS over time (NG86)")
+dn.ds <- variants %>%
+ group_by(SAMPLE, SYNONYMOUS) %>%
+ summarise(
+ Freq = sum(ALT_FREQ, na.rm = TRUE)
+ ) %>%
+ pivot_wider(
+ names_from = SYNONYMOUS,
+ values_from = Freq,
+ values_fill = 0
+ ) %>%
+ transmute(
+ dn = No / sum(n_s_position$N),
+ ds = Yes / sum(n_s_position$S)
+ ) %>%
+ ungroup() %>%
+ left_join(unique(select(variants, SAMPLE, interval))) %>%
+ transmute(
+ sample = SAMPLE,
+ day = interval,
+ dN = dn,
+ dS = ds,
+ w = dn / ds
+ )
+
+log_info("Writing results")
+write_csv(dn.ds, snakemake@output[["table"]])
diff --git a/workflow/scripts/download_context.R b/workflow/scripts/download_context.R
index fefc7f6..8f0e119 100644
--- a/workflow/scripts/download_context.R
+++ b/workflow/scripts/download_context.R
@@ -6,9 +6,12 @@ sink(log, type = "message")
sink(log, type = "output")
library(GISAIDR)
-library(tidyverse)
+library(dplyr)
+library(readr)
library(glue)
+library(lubridate)
library(logger)
+
log_threshold(INFO)
CHKPT.ERROR.MSG <- paste(
@@ -34,9 +37,7 @@ needed.columns <- c(
)
needed.columns.mask <- needed.columns %in% colnames(sample.metadata)
if (!all(needed.columns.mask)) {
- print(glue(
- "Please ensure column '{needed.columns[!needed.columns.mask]}' is present"
- ))
+ log_error("Please ensure column '{needed.columns[!needed.columns.mask]}' is present")
stop(glue(
"Missing columns in '{snakemake@input[['metadata']]}'. Alternatively:\n{CHKPT.ERROR.MSG}"
))
@@ -51,24 +52,20 @@ log_info("Getting time windows for context samples")
# Get time windows
dates <- sample.metadata %>%
pull(snakemake@params[["date_column"]]) %>%
- as.numeric
+ as.numeric()
window.quantile.offset <- (1 - snakemake@params[["date_window_span"]]) / 2
min.date <- as_date(quantile(dates, window.quantile.offset, na.rm = TRUE))
max.date <- as_date(quantile(dates, 1 - window.quantile.offset, na.rm = TRUE))
padded.min.date <- min.date - snakemake@params[["date_window_paddding_days"]]
padded.max.date <- max.date + snakemake@params[["date_window_paddding_days"]]
-print(glue(
- "Time window (span={snakemake@params[['date_window_span']]}): {round(interval(min.date, max.date) / days(1))} days (from {min.date} to {max.date})"
-))
-print(glue(
- "Padded time window (padding={snakemake@params[['date_window_paddding_days']]} days): {round(interval(padded.min.date, padded.max.date) / days(1))} days (from {padded.min.date} to {padded.max.date})"
-))
+log_info("Time window (span={snakemake@params[['date_window_span']]}): {round(interval(min.date, max.date) / days(1))} days (from {min.date} to {max.date})")
+log_info("Padded time window (padding={snakemake@params[['date_window_paddding_days']]} days): {round(interval(padded.min.date, padded.max.date) / days(1))} days (from {padded.min.date} to {padded.max.date})")
log_info("Getting locations for context samples")
# Get locations (if there are multiple, sample from all of them)
locations <- sample.metadata %>%
pull(snakemake@params[["location_column"]]) %>%
- unique
+ unique()
# GISAID login
creds.list <- yaml::read_yaml(snakemake@params[["gisaid_creds"]])
@@ -105,7 +102,7 @@ samples.accids <- sample.metadata %>%
pull(snakemake@params[["samples_gisaid_accession_column"]])
filtered.accids <- metadata %>% filter(accession_id %in% samples.accids)
metadata <- metadata %>% filter(!accession_id %in% samples.accids)
-print(glue("{nrow(metadata)} accession_ids remaining after GISAID ID filter"))
+log_info("{nrow(metadata)} accession_ids remaining after GISAID ID filter")
# Checkpoint: enforce a minimum number of samples to have at least
# as many possible combinations as random subsample replicates.
diff --git a/workflow/scripts/extract_afwdist_variants.py b/workflow/scripts/extract_afwdist_variants.py
new file mode 100644
index 0000000..6a7bf3f
--- /dev/null
+++ b/workflow/scripts/extract_afwdist_variants.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+import logging
+from typing import List
+
+import pandas as pd
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+
+
+def read_monofasta(path: str) -> SeqRecord:
+ fasta = SeqIO.parse(path, "fasta")
+ record = next(fasta)
+ if next(fasta, None) is not None:
+ logging.warning(f"There are unread records left in '{path}'")
+ return record
+
+
+def read_masked_sites(vcf_path: str, mask_classes: List[str]) -> List[int]:
+ """
+ Parse a VCF containing positions for masking. Assumes the VCF file is
+ formatted as in:
+ github.com/W-L/ProblematicSites_SARS-CoV2/blob/master/problematic_sites_sarsCov2.vcf
+ with a "mask" or "caution" recommendation in column 7.
+ Masked sites are specified with params.
+ """
+ vcf = pd.read_csv(
+ vcf_path,
+ sep="\s+",
+ comment="#",
+ names=("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")
+ )
+ return vcf.loc[vcf.FILTER.isin(mask_classes), "POS"].tolist()
+
+
+def build_ancestor_variant_table(ancestor: Seq, reference: Seq, reference_name: str, masked_positions: List[int]) -> pd.DataFrame:
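+ """
+ List 1-based positions where the reconstructed ancestor differs from the
+ alignment reference (masked sites skipped) and record the reference base
+ as a fixed-frequency (1.0) pseudo-variant, so the reference itself can
+ enter the allele-frequency-weighted distance computation.
+ """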
+ pos = []
+ alt = []
+ for i in range(1, len(ancestor) + 1):
+ if i not in masked_positions and ancestor[i-1] != reference[i-1]:
+ pos.append(i)
+ alt.append(reference[i-1])
+ df = pd.DataFrame({snakemake.params.position_col: pos, snakemake.params.sequence_col: alt})
+ df[snakemake.params.frequency_col] = 1 # As a reference genome, we assume all positions have fixed alleles
+ df[snakemake.params.sample_col] = reference_name
+ return df
+
+
+DTYPES = {
+ "sample": "object",
+ "position": "int64",
+ "sequence": "object",
+ "frequency": "float64"
+}
+
+
+if __name__ == "__main__":
+
+ logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
+
+ colnames = {
+ snakemake.params.sample_col: "sample",
+ snakemake.params.position_col: "position",
+ snakemake.params.sequence_col: "sequence",
+ snakemake.params.frequency_col: "frequency"
+ }
+
+ logging.info("Reading input tables")
+ # Variants
+ variants = pd.read_table(snakemake.input.variants, sep="\t")
+ logging.info(f"Read {len(variants)} variant records")
+ # VCF with sites to mask
+ masked_sites = read_masked_sites(snakemake.input.mask_vcf, snakemake.params.mask_class)
+ logging.info(f"Read {len(masked_sites)} masked positions")
+
+ logging.info("Reading input FASTA files")
+ # Case ancestor
+ ancestor = read_monofasta(snakemake.input.ancestor)
+ logging.info(f"Ancestor: '{ancestor.description}', length={len(ancestor.seq)}")
+ # Alignment reference
+ reference = read_monofasta(snakemake.input.reference)
+ logging.info(f"Reference: '{reference.description}', length={len(reference.seq)}")
+
+ logging.info("Processing ancestor variants")
+ ancestor_table = build_ancestor_variant_table(ancestor.seq, reference.seq, reference.id, masked_sites)
+ logging.info(f"Ancestor has {len(ancestor_table)} variants")
+ all_variants = pd.concat([variants, ancestor_table], ignore_index=True)
+ logging.info(f"Combined table has {len(all_variants)} variants")
+
+ logging.info("Renaming and selecting columns")
+ output = (
+ all_variants
+ .rename(columns=colnames)
+ [list(colnames.values())]
+ .astype(DTYPES)
+ )
+ logging.info("Filtering sites")
+ output = output[~output.position.isin(masked_sites)]
+ logging.info(f"There are {len(output)} rows left")
+
+ logging.info("Writing results")
+ output.to_csv(snakemake.output.variants, index=False)
diff --git a/workflow/scripts/filter_genbank_features.py b/workflow/scripts/filter_genbank_features.py
new file mode 100644
index 0000000..959a24e
--- /dev/null
+++ b/workflow/scripts/filter_genbank_features.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+import logging
+from typing import Dict, Iterable
+
+from Bio import SeqIO
+from Bio.SeqFeature import SeqFeature
+
+
+def iter_features_filtering(features: Iterable[SeqFeature], included: Dict[str, str], excluded: Dict[str, str]) -> Iterable[SeqFeature]:
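+ """
+ Lazily yield features passing the qualifier filters. Both filters map
+ qualifier keys to value lists, e.g. included={"gene": ["S"]} keeps
+ features whose 'gene' qualifier contains any listed value; excluded
+ then removes features matching any forbidden value.
+ """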
+ # No filters
+ if len(included) == 0 and len(excluded) == 0:
+ logging.debug("Selecting all features")
+ return iter(features)
+ # Only exclusion filter
+ elif len(included) == 0 and len(excluded) != 0:
+ logging.debug(f"Selecting features excluding all of {excluded}")
+ return (
+ feature for feature in features
+ if all(
+ (qualifier_value not in excluded.get(qualifier_key, []))
+ for qualifier_key in excluded.keys()
+ for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+ )
+ )
+ # Only inclusion filter
+ elif len(included) != 0 and len(excluded) == 0:
+ logging.debug(f"Selecting features including any of {included}")
+ return (
+ feature for feature in features
+ if any(
+ (qualifier_value in included.get(qualifier_key, []))
+ for qualifier_key in included.keys()
+ for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+ )
+ )
+ # Inclusion then exclusion filter
+ else:
+ logging.debug(f"Selecting features including any of {included} and then excluding all of {excluded}")
+ included_features = (
+ feature for feature in features
+ if any(
+ (qualifier_value in included.get(qualifier_key, []))
+ for qualifier_key in included.keys()
+ for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+ )
+ )
+ return (
+ feature for feature in included_features
+ if all(
+ (qualifier_value not in excluded.get(qualifier_key, []))
+ for qualifier_key in excluded.keys()
+ for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+ )
+ )
+
+
+if __name__ == "__main__":
+ logging.basicConfig(
+ filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"],
+ level=logging.INFO
+ )
+
+ logging.info("Reading GenBank file")
+ gb = SeqIO.read(snakemake.input.gb, format="gb")
+
+ logging.info("Extracting CDS")
+ features = []
+ for feature in iter_features_filtering(gb.features, snakemake.params.included, snakemake.params.excluded):
+ logging.debug(
+ "Processing SeqFeature: "
+ f"ID={feature.id} type={feature.type} location={feature.location} "
+ f"gene={feature.qualifiers.get('gene', [])} "
+ f"locus_tag={feature.qualifiers.get('locus_tag', [])} "
+ f"product={feature.qualifiers.get('product', [])}"
+ )
+ features.append(feature)
+
+ logging.info("Replacing features")
+ gb.features = features
+
+ logging.info("Writing filtered GenBank files")
+ SeqIO.write(gb, snakemake.output.gb, "gb")
diff --git a/workflow/scripts/filter_tsv.R b/workflow/scripts/filter_tsv.R
index 55dc216..502ed78 100644
--- a/workflow/scripts/filter_tsv.R
+++ b/workflow/scripts/filter_tsv.R
@@ -5,38 +5,39 @@ log <- file(snakemake@log[[1]], open = "wt")
sink(log, type = "message")
sink(log, type = "output")
-library(tidyverse)
+library(dplyr)
+library(readr)
+library(stringr)
library(logger)
+
log_threshold(INFO)
# Read inputs
data <- read_tsv(snakemake@input[["tsv"]])
-# Filtering criteria:
-# - P-value < 0.05
-# - Depth >= 20
-# - For SNP more than 2 reads in each strand
+# Filtering criteria (thresholds now come from Snakemake params):
+# - ivar PASS is TRUE
+# - ALT_RV + ALT_DP >= min_depth
+# - SNVs require ALT_RV > min_alt_rv and ALT_DP > min_alt_dp;
+#   deletions are exempt from this strand check
is.deletion <- str_detect(
data$ALT,
"^[A-Z]",
negate = TRUE
)
-inBothStrands <- data$ALT_RV > 2 & data$ALT_DP > 2
-Depth <- (data$ALT_RV + data$ALT_DP) >= 20
+strand.mask <- data$ALT_RV > snakemake@params$min_alt_rv &
+ data$ALT_DP > snakemake@params$min_alt_dp
+depth.mask <- (data$ALT_RV + data$ALT_DP) >= snakemake@params$min_depth
log_info("Filtering variants")
data <- filter(
data,
as.logical(PASS),
- Depth,
- inBothStrands | is.deletion
+ depth.mask,
+ strand.mask | is.deletion
)
log_info("Finding synonymous and non synonymous variants")
# Adding synonymous variable
data <- mutate(
data,
- synonimous = case_when(
+ SYNONYMOUS = case_when(
REF_AA == ALT_AA ~ "Yes",
TRUE ~ "No"
)
@@ -45,14 +46,6 @@ data <- mutate(
# Remove duplicated features
data <- distinct(data, pick(!GFF_FEATURE), .keep_all = TRUE)
-# Change annotation to gb2seq annotation
-features <- read_csv(snakemake@input[["annotation"]])
-
-data <- data %>%
- select(!GFF_FEATURE) %>%
- left_join(features) %>%
- rename(GFF_FEATURE = GEN)
-
log_info("Saving results")
write_tsv(
data,
diff --git a/workflow/scripts/format_afwdist_results.py b/workflow/scripts/format_afwdist_results.py
new file mode 100644
index 0000000..2f42ca6
--- /dev/null
+++ b/workflow/scripts/format_afwdist_results.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+import logging
+import pandas as pd
+
+
+if __name__ == "__main__":
+
+ logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
+
+ logging.info("Read pairwise distances")
+ df = pd.read_csv(snakemake.input.distances)
+
+ logging.info("Initializing formatted output")
+ output = pd.DataFrame(
+ data=0.0,
+ columns=snakemake.params.samples,
+ index=snakemake.params.samples,
+ dtype="float64"
+ )
+
+ logging.info("Filling table")
+ for i, row in df.iterrows():
+ output.loc[row.sample_m, row.sample_n] = row.distance
+ output.loc[row.sample_n, row.sample_m] = row.distance
+
+ logging.info("Writing formatted results")
+ output.to_csv(snakemake.output.distances)
diff --git a/workflow/scripts/format_vcf_fields_longer.R b/workflow/scripts/format_vcf_fields_longer.R
new file mode 100644
index 0000000..26c5512
--- /dev/null
+++ b/workflow/scripts/format_vcf_fields_longer.R
@@ -0,0 +1,75 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(readr)
+library(dplyr)
+library(tidyr)
+library(stringr)
+library(purrr)
+library(rlang)
+library(glue)
+library(logger)
+
+empty.to.na <- function(x) {
+ x[x == ""] <- NA
+ x
+}
+
+# Replace "" values with NA in R filter list
+# Snakemake passes filters like: list(ERRORS = c(""))
+log_info("Preprocessing data")
+filter.include <- lapply(snakemake@params$filter_include, empty.to.na)
+filter.exclude <- lapply(snakemake@params$filter_exclude, empty.to.na)
+
+# Process input table
+log_info("Applying filters and writing results")
+read_tsv(snakemake@input$tsv) %>%
+
+ # Split the sep-delimited "...[*]..." columns (e.g. ANN[*].EFFECT) into one row per value
+ separate_rows(
+ contains("[*]"),
+ sep = snakemake@params$sep,
+ convert = TRUE
+ ) %>%
+
+ # Rename "...[*]..." columns using the provided lookup via Snakemake config
+ rename(all_of(unlist(snakemake@params$colnames_mapping))) %>%
+
+ # Separate &-delimited error column (more than one error/warning/info message per row is possible)
+ mutate(split_errors = strsplit(ERRORS, "&")) %>%
+ # Keep rows with none of the excluded ERRORS terms, if any
+ filter(map_lgl(split_errors, ~ !any(. %in% filter.exclude[["ERRORS"]]))) %>%
+ select(-split_errors) %>%
+
+ # Apply filters
+ filter(
+ # Keep variants that include required values in each field
+ !!!map2(
+ names(filter.include),
+ filter.include,
+ ~ expr(.data[[!!.x]] %in% !!.y)
+ ),
+ # Drop variants that contain any excluded value in each field
+ !!!map2(
+ names(filter.exclude),
+ filter.exclude,
+ ~ expr(!(.data[[!!.x]] %in% !!.y))
+ )
+ ) %>%
+
+ # Keep unique rows
+ distinct() %>%
+
+ mutate(
+ # Assign variant name using the pattern defined via Snakemake config
+ VARIANT_NAME = str_glue(snakemake@params$variant_name_pattern),
+ # Assign sample name
+ SAMPLE = snakemake@params$sample
+ ) %>%
+
+ # Write output file
+ write_tsv(snakemake@output$tsv)
diff --git a/workflow/scripts/merge_annotation.R b/workflow/scripts/merge_annotation.R
new file mode 100644
index 0000000..9c322c4
--- /dev/null
+++ b/workflow/scripts/merge_annotation.R
@@ -0,0 +1,84 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(logger)
+
+log_threshold(INFO)
+
+log_info("Reading variants table, replacing REGION with the reference name")
+variants <- read_tsv(
+ snakemake@input$tsv,
+ col_types = list(
+ REGION = col_character(),
+ POS = col_integer(),
+ REF = col_character(),
+ ALT = col_character()
+ )
+) %>%
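+ # Convert ivar-style ALT notation to VCF-style REF/ALT for joining with
+ # the SnpSift annotation, e.g. at a site with REF "A":
+ #   insertion "+CT" -> REF_VCF "A",   ALT_VCF "ACT"
+ #   deletion  "-CT" -> REF_VCF "ACT", ALT_VCF "A"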
+ mutate(
+ REGION = snakemake@params$ref_name,
+ is_insertion = startsWith(ALT, "+"),
+ is_deletion = startsWith(ALT, "-"),
+ REF_VCF = case_when(
+ is_deletion ~ paste0(REF, substring(ALT, 2)),
+ TRUE ~ REF
+ ),
+ ALT_VCF = case_when(
+ is_insertion ~ paste0(REF, substring(ALT, 2)),
+ is_deletion ~ REF,
+ TRUE ~ ALT
+ )
+ ) %>%
+ select(-is_insertion, -is_deletion)
+
+log_info("Reading annotation table")
+annotation <- read_tsv(
+ snakemake@input$annot,
+ col_select = c(
+ "CHROM",
+ "POS",
+ "REF",
+ "ALT",
+ "EFFECT",
+ "IMPACT",
+ "VARIANT_NAME"
+ ),
+ col_types = list(
+ CHROM = col_character(),
+ POS = col_integer(),
+ REF = col_character(),
+ ALT = col_character(),
+ VARIANT_NAME = col_character()
+ )
+) %>%
+ distinct() %>%
+ group_by(CHROM, POS, REF, ALT) %>%
+ mutate(VARIANT_NAME = paste(VARIANT_NAME, collapse = "|")) %>%
+ ungroup()
+
+log_info("Merging tables")
+merged <- left_join(
+ variants,
+ annotation,
+ by = c(
+ "REGION" = "CHROM",
+ "POS" = "POS",
+ "REF_VCF" = "REF",
+ "ALT_VCF" = "ALT"
+ )
+) %>%
+ mutate(
+ SAMPLE = snakemake@params$sample
+ )
+
+log_info("Saving results")
+write_tsv(
+ merged,
+ snakemake@output$tsv
+)
diff --git a/workflow/scripts/n_s_sites_from_fasta.py b/workflow/scripts/n_s_sites_from_fasta.py
new file mode 100644
index 0000000..cb7fa6f
--- /dev/null
+++ b/workflow/scripts/n_s_sites_from_fasta.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+import logging
+import json
+import itertools as it
+from typing import Dict
+
+import pandas as pd
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+
+
+NTS = ("A", "C", "G", "T")
+
+
+def read_monofasta(path: str) -> SeqRecord:
+ return SeqIO.read(path, format="fasta")
+
+
+def split_into_codons(seq: Seq) -> list:
+ """Split the complete CDS feature in to a list of codons"""
+ return [
+ seq[i:i + 3] for i in range(0, len(seq)-2, 3) if all(char in NTS for char in seq[i:i + 3])
+ ]
+
+
+def calculate_potential_changes(genetic_code: dict) -> dict:
+ """Generate a dictionary with S and N pre-calculated for all possible codons"""
+ # Initialize structure
+ potential_changes = {"S": {}, "N": {}}
+ for codon in it.product(NTS, repeat=3):
+ potential_changes["S"]["".join(codon)] = 0.0
+ potential_changes["N"]["".join(codon)] = 0.0
+ # Mutate (substitutions) all possible codons in the given genetic code
+ # and count proportions of mutations that are synonymous and non-synonymous
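+ # e.g. with the standard code, "TTT" (Phe) has one synonymous single-nt
+ # neighbour (TTC, Phe) and eight nonsynonymous ones, so the 1/3 increments
+ # below yield S("TTT") = 1/3 and N("TTT") = 8/3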
+ for codon in genetic_code.keys():
+ for codon_p in range(0, 3):
+ nts = list(NTS)
+ # Do not consider self substitutions, e.g. A->A
+ nts.remove(codon[codon_p])
+ for nt in nts:
+ codon_mutated = list(codon)
+ # Mutate the basepair
+ codon_mutated[codon_p] = nt
+ # Count how many of them are synonymous
+ if genetic_code[codon] == genetic_code["".join(codon_mutated)]:
+ potential_changes["S"][codon] += 1 / 3.0
+ else:
+ potential_changes["N"][codon] += 1 / 3.0
+ return potential_changes
+
+
+def get_feature_codons(coding_records: Dict[str, SeqRecord]) -> dict:
+ return {name: split_into_codons(record.seq) for name, record in coding_records.items()}
+
+
+def calculate_ns_sites(codons: dict, genetic_code: dict) -> pd.DataFrame:
+ features = []
+ N_sites, S_sites = [], []
+ values = calculate_potential_changes(genetic_code)
+ for key, item in codons.items():
+ features.append(key)
+ N = sum([values["N"][x] for x in item if x in values["N"].keys()])
+ S = sum([values["S"][x] for x in item if x in values["S"].keys()])
+ N_sites.append(N)
+ S_sites.append(S)
+ return pd.DataFrame({"feature": features, "N": N_sites, "S": S_sites})
+
+
+def main():
+
+ logging.basicConfig(
+ filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"],
+ level=logging.INFO
+ )
+
+ logging.info("Reading genetic code")
+ with open(snakemake.input.genetic_code) as f:
+ genetic_code = json.load(f)
+
+ logging.info("Reading GenBank file")
+ gb = SeqIO.read(snakemake.input.gb, format="gb")
+
+ logging.info("Reading input FASTA")
+ record = read_monofasta(snakemake.input.fasta)
+
+ logging.info("Extracting CDS")
+ coding_records = {}
+ for feature in gb.features:
+ identifier = "|".join(feature.qualifiers.get(snakemake.params.gb_qualifier_display, []))
+ if identifier == "":
+ logging.error(f"Feature at {feature.location} has no qualifier '{snakemake.params.gb_qualifier_display}'")
+ elif identifier in coding_records:
+ logging.warning(f"Identifier '{identifier}' is already among coding records and will not be replaced by feature at {feature.location}")
+ else:
+ logging.debug(f"Adding feature")
+ coding_records[identifier] = feature.extract(record)
+
+ logging.info(f"Splitting {len(coding_records)} records into codons")
+ codons = get_feature_codons(coding_records)
+
+ logging.info("Calculating synonymous and non synonymous sites")
+ df = calculate_ns_sites(codons, genetic_code)
+
+ logging.info("Saving results")
+ df.to_csv(snakemake.output.csv, index=False)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/workflow/scripts/pairwise_trajectory_correlation.R b/workflow/scripts/pairwise_trajectory_correlation.R
new file mode 100644
index 0000000..27a8dfc
--- /dev/null
+++ b/workflow/scripts/pairwise_trajectory_correlation.R
@@ -0,0 +1,41 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(tidyr)
+library(tibble)
+library(stringr)
+library(logger)
+
+log_threshold(INFO)
+
+variants <- read_tsv(snakemake@input[["variants"]])
+
+# Obtain sample names ordered by CollectionDate
+date_order <- read_csv(snakemake@input[["metadata"]]) %>%
+ arrange(CollectionDate) %>%
+ pull(ID) %>%
+ unique()
+
+# Select the variables needed for the sample-by-variant frequency matrix
+variants <- variants %>% select(VARIANT_NAME, SAMPLE, ALT_FREQ)
+
+variants <- variants %>%
+ pivot_wider(
+ names_from = VARIANT_NAME,
+ values_from = ALT_FREQ,
+ values_fill = 0,
+ values_fn = sum
+ ) %>%
+ arrange(factor(SAMPLE, levels = date_order)) %>%
+ # Removes "|"-separated annotations, keeping the first one + ellipsis (clarifies heatmap)
+ rename_with(~ str_replace(., "^([^|]+)\\|.*$", "\\1(...)"), -SAMPLE) %>%
+ column_to_rownames(var = "SAMPLE")
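+# The result is a samples-by-variants matrix of alt frequencies with rows in
+# collection-date order, ready for pairwise trajectory correlation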
+
+log_info("Saving table to create heatmap")
+write.csv(variants, snakemake@output[["table"]])
diff --git a/workflow/scripts/report/NV_description.R b/workflow/scripts/report/NV_description.R
deleted file mode 100644
index d3e72fd..0000000
--- a/workflow/scripts/report/NV_description.R
+++ /dev/null
@@ -1,505 +0,0 @@
-#!/usr/bin/env Rscript
-
-# Write stdout and stderr to log file
-log <- file(snakemake@log[[1]], open = "wt")
-sink(log, type = "message")
-sink(log, type = "output")
-
-library(tidyverse)
-library(stringi)
-library(ggpubr)
-library(jsonlite)
-library(logger)
-log_threshold(INFO)
-
-
-# Import file with plots style
-source(snakemake@params[["design"]])
-
-# SARS-CoV-2 anotation for genome scheme
-SCov2_annotation <- list(
- "five_prime_UTR" = c(1:265),
- "orf1ab" = c(266:21555),
- "Intergenic_1" = c(21556:21562),
- "S" = c(21563:25384),
- "Intergenic_2" = c(25385:25392),
- "ORF3a" = c(25393:26220),
- "Intergenic_3" = c(26221:26244),
- "E" = c(26245:26472),
- "Intergenic_4" = c(26473:26522),
- "M" = c(26523:27191),
- "Intergenic_5" = c(27192:27201),
- "ORF6" = c(27202:27387),
- "Intergenic_6" = c(27388:27393),
- "ORF7" = c(27394:27759),
- "Intergenic_7" = c(27760:27893),
- "ORF8" = c(27894:28259),
- "Intergenic_8" = c(28260:28273),
- "N" = c(28274:29533),
- "Intergenic_9" = c(29534:29557),
- "ORF10" = c(29558:29674),
- "three_prime_UTR" = c(29675:29903)
-)
-
-vcf <- read_delim(snakemake@input[["vcf"]])
-vcf_snp <- vcf
-window <- read_csv(snakemake@input[["window"]])
-metadata <- read_csv(snakemake@input[["metadata"]])
-
-# DATA PROCESSING
-# Obtain sample names ordered by CollectionDate
-date_order <- metadata %>%
- filter(ID %in% snakemake@params[["samples"]]) %>%
- arrange(CollectionDate) %>%
- pull(ID) %>%
- unique()
-
-empty_vcf <- tibble(
- REGION = date_order,
- variant = as.character(NA),
- ALT_FREQ = as.numeric(NA),
- GFF_FEATURE = as.character(NA),
- synonimous = as.character(NA),
- POS = as.numeric(NA),
- ALT = as.character(NA),
- NV_class = as.character(NA),
- group = as.character(NA)
-)
-
-# Create SNP variable and select useful variables
-vcf <- vcf %>%
- dplyr::select(
- variant,
- REGION,
- ALT_FREQ,
- GFF_FEATURE,
- synonimous,
- POS,
- ALT
- )
-
-# Df with gene length for scheme
-notation_empty <- data.frame(
- gene = "",
- len = 0
-) %>%
- filter(len != 0)
-
-notation <- lapply(
- names(SCov2_annotation),
- function(x) {
- add_row(
- notation_empty,
- gene = x,
- len = length(SCov2_annotation[[x]])
- )
- }
-) %>%
- bind_rows()
-
-# Classifying variants
-log_info("Classifying variants")
-vcf <- vcf %>%
- mutate(
- NV_class = case_when(
- str_detect(ALT, fixed("-")) |
- str_detect(ALT, fixed("+")) ~
- "INDEL",
- TRUE ~ "SNP"
- ),
- Class = case_when(
- GFF_FEATURE == "Intergenic" ~ "Intergenic",
- TRUE ~ synonimous
- ),
- POS = as.numeric(POS)
- ) %>%
- rowwise() %>%
- mutate(
- indel_len = case_when(
- NV_class == "INDEL" ~ str_length(ALT) - 1
- ),
- indel_class = case_when(
- GFF_FEATURE == "Intergenic" ~ "Intergenic",
- NV_class == "INDEL" &
- indel_len %% 3 == 0 ~
- "In frame",
- NV_class == "INDEL" &
- indel_len %% 3 > 0 ~
- "Frameshift"
- )
- ) %>%
- ungroup() %>%
- mutate(
- group = case_when(
- GFF_FEATURE == "Intergenic" ~ "Intergenic",
- NV_class == "SNP" ~ Class,
- NV_class == "INDEL" ~ indel_class
- )
- )
-
-# NSP data
-npc <- read_csv(snakemake@params[["regions"]]) %>%
- mutate(
- summary_nsp = case_when(
- region %in% paste("nsp", seq(4, 12, 1), sep = "") ~ "nsp4-12",
- region %in% paste("nsp", seq(14, 16, 1), sep = "") ~ "nsp14-16",
- TRUE ~ region
- ),
- summary_start = case_when(
- region %in% paste("nsp", seq(4, 12, 1), sep = "") ~ 8555,
- region %in% paste("nsp", seq(14, 16, 1), sep = "") ~ 18040,
- TRUE ~ start
- ),
- summary_end = case_when(
- region %in% paste("nsp", seq(4, 12, 1), sep = "") ~ 16236,
- region %in% paste("nsp", seq(14, 16, 1), sep = "") ~ 21552,
- TRUE ~ end
- )
- ) %>%
- filter(region != "nsp1")
-
-
-# PLOTS
-## SUMMARY FIGURE FOR WHOLE GENOME
-log_info("Plotting summary figure for whole genome")
-
-if (nrow(vcf) == 0) {
- log_warn("Whole-genome VCF has no rows")
- vcf <- empty_vcf
-}
-
-variants <- vcf %>%
- filter(ALT_FREQ > 0) %>%
- ggplot() +
- aes(
- x = POS,
- y = factor(REGION, date_order),
- shape = factor(NV_class, c("SNP", "INDEL")),
- color = group,
- alpha = ALT_FREQ
- ) +
- geom_point(size = 3) +
- geom_col(
- data = notation,
- aes(
- x = len,
- y = 0.3,
- fill = factor(gene, rev(names(SCov2_annotation)))
- ),
- inherit.aes = FALSE,
- width = 0.3
- ) +
- scale_fill_manual(values = gene_colors) +
- xlim(c(0, 29903)) +
- scale_color_manual(
- labels = NV_names,
- values = NV_colors
- ) +
- scale_y_discrete(drop = FALSE) +
- labs(
- x = "SARS-CoV-2 genome position",
- y = "Sample",
- shape = "Variant class",
- color = "Classification",
- alpha = "Frequency",
- fill = "Region"
- ) +
- guides(
- fill = guide_legend(reverse = TRUE)
- )
-
-# Window plot
-window_plot <- window %>%
- ggplot() +
- aes(
- x = position,
- y = fractions,
- color = gen
- ) +
- geom_point() +
- geom_line(
- aes(group = 1),
- colour = "black",
- alpha = 0.3
- ) +
- scale_y_continuous(
- label = scales::percent,
- limits = c(0, max(window$fractions) + 0.005)
- ) +
- xlim(c(0, 29903)) +
- scale_color_manual(values = gene_colors) +
- labs(
- y = "Proportion of \n sites with NV",
- x = "",
- color = "Gen"
- )
-
-# Add nsp info
-window_plot_nsp <- window_plot +
- geom_vline(
- data = npc,
- aes(xintercept = summary_start),
- color = "red"
- ) +
- geom_vline(
- data = npc,
- aes(xintercept = summary_end),
- color = "red"
- ) +
- geom_text(
- data = npc,
- aes(
- x = (summary_start + summary_end) / 2,
- y = max(window$fractions) + 0.002,
- label = summary_nsp
- ),
- inherit.aes = FALSE,
- size = 5,
- angle = 60
- )
-
-figura <- ggarrange(
- window_plot_nsp,
- variants,
- nrow = 2,
- align = "v",
- legend.grob = get_legend(variants),
- heights = c(2, 6),
- legend = "right",
- labels = c("A", "B")
-)
-
-ggsave(
- filename = snakemake@output[["fig"]],
- plot = figura,
- width = 250,
- height = 240,
- units = "mm",
- dpi = 250
-)
-
-
-# Zoom in in spike
-log_info("Plotting summary for variants in the spike")
-
-spike_pos <- window %>%
- filter(gen == "S") %>%
- pull(position)
-
-vcf_spike <- vcf %>%
- filter(
- ALT_FREQ > 0,
- POS %in% c(min(spike_pos):max(spike_pos))
- )
-
-window_plot_spike <- window %>%
- filter(gen == "S") %>%
- ggplot() +
- aes(
- x = position,
- y = fractions,
- color = gen
- ) +
- geom_point() +
- geom_line(
- aes(group = 1),
- colour = "black",
- alpha = 0.3
- ) +
- scale_y_continuous(
- label = scales::percent,
- limits = c(0, max(window$fractions) + 0.005)
- ) +
- xlim(c(min(spike_pos), max(spike_pos))) +
- scale_color_manual(
- values = gene_colors,
- guide = "none"
- ) +
- labs(
- y = "Proportion of \n sites with NV",
- x = ""
- )
-
-if (nrow(vcf_spike) == 0) {
- log_warn("Spike VCF has no rows")
- vcf_spike <- empty_vcf
-}
-
-variants_spike <- vcf_spike %>%
- ggplot() +
- aes(
- x = POS,
- y = factor(REGION, date_order),
- shape = factor(NV_class, c("SNP", "INDEL")),
- color = group,
- alpha = ALT_FREQ
- ) +
- geom_point(size = 3) +
- xlim(c(min(spike_pos), max(spike_pos))) +
- scale_color_manual(
- labels = NV_names,
- values = NV_colors
- ) +
- scale_y_discrete(drop = FALSE) +
- labs(
- x = "SARS-CoV-2 genome position",
- y = "Sample",
- shape = "Variant class",
- color = "Classification",
- alpha = "Frequency",
- fill = "Region"
- ) +
- guides(
- fill = guide_legend(reverse = TRUE)
- )
-
-figura_spike <- ggarrange(
- window_plot_spike,
- variants_spike,
- nrow = 2,
- align = "v",
- legend.grob = get_legend(variants),
- heights = c(2, 6),
- legend = "right",
- labels = c("A", "B")
-)
-
-ggsave(
- filename = snakemake@output[["fig_s"]],
- plot = figura_spike,
- width = 250,
- height = 240,
- units = "mm",
- dpi = 250
-)
-
-
-# Figure for no of heterozygous sites for each sample
-log_info("Plotting no. of heterozygous sites for each sample")
-figur_SNP_table <- vcf_snp %>%
- filter(ALT_FREQ <= snakemake@params$max_alt_freq) %>%
- left_join(
- metadata,
- by = c("REGION" = "ID")
- ) %>%
- group_by(REGION) %>%
- summarise(
- CollectionDate = min(as.Date(CollectionDate)),
- n = n_distinct(POS)
- ) %>%
- ungroup()
-
-if (nrow(figur_SNP_table) == 0) {
- log_warn("Filtered SNP table has no rows")
- figur_SNP_table <- empty_vcf
-}
-
-figur_SNP_time <- figur_SNP_table %>%
- ggplot() +
- aes(
- x = CollectionDate,
- y = n
- ) +
- geom_smooth(
- method = "lm",
- fill = "gray95",
- alpha = 0.6
- ) +
- geom_point() +
- labs(
- x = "Collection date",
- y = "No. of polimorphic sites"
- )
-
-ggsave(
- filename = snakemake@output[["fig_cor"]],
- plot = figur_SNP_time,
- width = 250,
- height = 119.4,
- units = "mm",
- dpi = 250
-)
-
-
-# PLOT TABLES
-log_info("Saving plot tables")
-
-# Variants summary table
-vcf %>%
- select(
- REGION,
- POS,
- variant,
- ALT_FREQ,
- NV_class,
- group
- ) %>%
- rename(
- sample = REGION,
- Variant = variant,
- Class = group
- ) %>%
- filter(ALT_FREQ > 0) %>%
- mutate(
- Class = case_when(
- Class == "Yes" ~ "synonymous",
- Class == "No" ~ "non_synonymous",
- TRUE ~ Class
- )
- ) %>%
- write.csv(snakemake@output[["table_2"]], row.names = FALSE)
-
-# Window plot table
-window %>%
- transmute(
- POS = position,
- feature = gen,
- prop_PolymorphicSites = fractions
- ) %>%
- write.csv(snakemake@output[["table_1"]], row.names = FALSE)
-
-# Heterozygous sites per sample table
-vcf_snp %>%
- filter(ALT_FREQ <= snakemake@params$max_alt_freq) %>%
- select(!GFF_FEATURE) %>%
- left_join(
- metadata,
- by = c("REGION" = "ID")
- ) %>%
- group_by(REGION) %>%
- summarise(
- CollectionDate = min(as.Date(CollectionDate)),
- n_PolymorphicSites = n_distinct(POS)
- ) %>%
- ungroup() %>%
- rename(sample = REGION) %>%
- unique() %>%
- write.csv(snakemake@output[["table_3"]], row.names = FALSE)
-
-# Stats for reporting
-n_indels <- vcf %>%
- filter(NV_class == "INDEL") %>%
- pull(variant) %>%
- unique() %>%
- length()
-
-n_snv <- length(unique(vcf$variant)) - n_indels
-model <- lm(n ~ CollectionDate, data = figur_SNP_table)
-
-# Calculate correlation, if possible
-if (nrow(figur_SNP_table) > 2) {
- p.value <- summary(model)$coefficients[2,4]
-} else {
- p.value <- NA
-}
-
-list(
- "INDELS" = n_indels,
- "SNV" = n_snv,
- "window" = snakemake@params[["window"]],
- "step" = snakemake@params[["step"]],
- "r2" = summary(model)$r.squared[[1]],
- "value" = ifelse(p.value < 0.001, "< 0.001", p.value)
-) %>%
- toJSON() %>%
- write(snakemake@output[["json"]])
diff --git a/workflow/scripts/report/af_time_correlation_data.R b/workflow/scripts/report/af_time_correlation_data.R
new file mode 100644
index 0000000..d3415c0
--- /dev/null
+++ b/workflow/scripts/report/af_time_correlation_data.R
@@ -0,0 +1,141 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(tidyr)
+library(tibble)
+library(logger)
+log_threshold(INFO)
+
+# Read data
+log_info("Reading variants table")
+variants <- read_delim(
+ snakemake@input[["variants"]],
+ col_select = c(
+ "VARIANT_NAME",
+ "SAMPLE",
+ "REGION",
+ "ALT_FREQ",
+ "POS"
+ )
+) %>%
+ # Fill positions without alt frequency with 0
+ complete(
+ nesting(REGION, VARIANT_NAME, POS),
+ SAMPLE,
+ fill = list(ALT_FREQ = 0)
+ )
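+# Illustrative sketch (hypothetical values): if variant C100T was only called
+# in sample S1, `complete(nesting(REGION, VARIANT_NAME, POS), SAMPLE, ...)`
+# adds the missing (C100T, S2) row with ALT_FREQ = 0, so every variant gets one
+# observation per sample before correlations are computed:
+#   REGION  VARIANT_NAME  POS  SAMPLE  ALT_FREQ
+#   chr     C100T         100  S1      0.40
+#   chr     C100T         100  S2      0.00   <- filled in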
+
+log_info("Reading metadata")
+metadata <- read_csv(snakemake@input[["metadata"]]) %>%
+ filter(
+ ID %in% variants$SAMPLE
+ ) %>%
+ select(
+ ID,
+ CollectionDate
+ )
+
+log_info("Dating variants")
+variants <- left_join(variants, metadata, by = c("SAMPLE" = "ID")) %>%
+ # Calculate days since first sample
+ arrange(
+ CollectionDate
+ ) %>%
+ mutate(
+ interval = as.numeric(difftime(CollectionDate, min(CollectionDate), units = "days"))
+ )
+
+# Save processed input
+log_info("Writing dated and frequency-filled variants")
+write_csv(variants, snakemake@output$fmt_variants)
+
+log_info("Calculating correlations")
+log_debug("Calculating unique SNPs")
+# Get list with all different polymorphisms
+unique.snps <- unique(variants$VARIANT_NAME)
+
+# Create an empty dataframe to be filled
+cor.df <- tibble(
+  variant = character(),
+  coefficient = numeric(),
+  p.value = numeric(),
+  p.value.adj = numeric()
+)
+
+log_debug("Calculating correlation using method = {snakemake@params$cor_method} and exact p-value = {snakemake@params$cor_exact}")
+correlations <- lapply(
+ unique.snps,
+ function(snp) {
+ # Select SNP
+ df <- filter(
+ variants,
+ VARIANT_NAME == snp
+ )
+ # Perform calculation
+ test <- cor.test(
+ df$ALT_FREQ,
+ df$interval,
+ method = snakemake@params$cor_method,
+ exact = snakemake@params$cor_exact
+ )
+ # Adjust p-value
+ p.value.adj <- p.adjust(
+ test$p.value,
+ method = "BH",
+ n = length(unique.snps)
+ )
+ # Add row to dataframe
+ add_row(
+ cor.df,
+ variant = snp,
+ coefficient = test$estimate,
+ p.value = test$p.value,
+ p.value.adj = p.value.adj
+ )
+ }
+) %>%
+ bind_rows()
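+# Note on the adjustment above: each p-value is corrected on its own with
+# n = length(unique.snps), so it is treated as rank 1 and the result equals
+# min(1, p * n) -- a Bonferroni-style (conservative) bound rather than a joint
+# BH over all ranks. For example, p.adjust(0.01, method = "BH", n = 10) == 0.1.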
+
+log_info("Writing correlations table")
+write_csv(
+ correlations,
+ snakemake@output[["correlations"]]
+)
+
+log_info("Selecting variants whose allele frequency is significantly correlated with time")
+significant.variants <- correlations %>%
+ filter(p.value.adj < 0.05) %>%
+ pull(variant) %>%
+ unique()
+
+log_info("Significant: {significant.variants}")
+
+log_info("Selecting variants in positions with more than one alternative allele")
+mult.alt.variants <- variants %>%
+ select(
+ VARIANT_NAME,
+ POS
+ ) %>%
+ distinct() %>%
+ group_by(POS) %>%
+ filter(n() > 1) %>%
+ ungroup() %>%
+ pull(VARIANT_NAME) %>%
+ unique()
+
+log_info("Mult all: {mult.alt.variants}")
+
+# Build selected subset to represent
+variant.selection <- unique(c(significant.variants, mult.alt.variants))
+
+log_info("Selection: {variant.selection}")
+
+log_info("Writing selected variants subset")
+write_lines(variant.selection, snakemake@output$subset)
diff --git a/workflow/scripts/report/af_time_correlation_plot.R b/workflow/scripts/report/af_time_correlation_plot.R
new file mode 100644
index 0000000..42f5b75
--- /dev/null
+++ b/workflow/scripts/report/af_time_correlation_plot.R
@@ -0,0 +1,57 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(ggplot2)
+library(ggrepel)
+library(logger)
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+log_info("Reading correlation data")
+correlations <- read_csv(snakemake@input$correlations)
+
+log_info("Plotting coorrelation coefficients and p-values of each variants")
+p <- correlations %>%
+ mutate(
+ trans.p = -log10(p.value.adj),
+ label = ifelse(p.value.adj < 0.05, variant, NA)
+ ) %>%
+ ggplot() +
+ aes(
+ x = coefficient,
+ y = trans.p
+ ) +
+ geom_text_repel(aes(label = label), max.overlaps = 10000, direction = "x") +
+ geom_point(
+ data = function(x) subset(x, !is.na(label)),
+ color = "orange",
+ size = 2
+ ) +
+ geom_point(size = 2, shape = 1) +
+ xlim(c(-1, 1)) +
+ geom_hline(
+ aes(yintercept = -log10(0.05)),
+ linetype = 2,
+ color = "orange"
+ ) +
+ labs(
+ x = "Correlation coefficient",
+ y = "-log10(p-value)"
+ )
+
+ggsave(
+ filename = snakemake@output$plot,
+ plot = p,
+ width = snakemake@params$plot_width_mm,
+ height = snakemake@params$plot_height_mm,
+ units = "mm",
+ dpi = 250
+)
diff --git a/workflow/scripts/report/af_trajectory_panel_plot.R b/workflow/scripts/report/af_trajectory_panel_plot.R
new file mode 100644
index 0000000..61e7190
--- /dev/null
+++ b/workflow/scripts/report/af_trajectory_panel_plot.R
@@ -0,0 +1,79 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+set.seed(snakemake@params$random_color_seed) # seed for sampling colors
+
+library(dplyr)
+library(readr)
+library(ggplot2)
+library(glue)
+library(logger)
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+log_info("Reading formatted variants table")
+variants <- read_csv(snakemake@input$fmt_variants)
+
+log_info("Reading subset of variants to represent")
+selected.variants <- read_lines(snakemake@input$subset)
+
+# Set plot height from the number of SNPs and the configured number of plot columns
+plot.rows <- ceiling(
+ length(selected.variants) / snakemake@params$n_plot_columns
+)
+plot.height <- max(100, plot.rows * snakemake@params$plot_row_height_mm)
+log_debug("Setting total plot height to {plot.height} mm with {plot.rows} rows")
+
+log_info("Plotting {length(selected.variants)} SNPs allele frequency trajectories in time")
+selected.colors <- sample(TRAJECTORY.PANEL.COLORS, length(selected.variants))
+log_debug("Selected color: {selected.colors}")
+p <- variants %>%
+ filter(VARIANT_NAME %in% selected.variants) %>%
+ mutate(
+ gPOS = paste0("g.", POS),
+ gPOS = reorder(gPOS, POS)
+ ) %>%
+ ggplot() +
+ aes(
+ x = interval,
+ y = ALT_FREQ,
+ color = reorder(glue("{gPOS}\n{VARIANT_NAME}"), POS)
+ ) +
+ scale_color_manual(values = selected.colors) +
+ geom_point() +
+ geom_line() +
+ theme(
+ legend.position = "bottom",
+ legend.text = element_text(size = 9),
+ legend.title = element_blank(),
+ legend.spacing.y = unit(3, "mm")
+ ) +
+ labs(
+ x = "Days since first sample",
+ y = "Frequency"
+ ) +
+ guides(color = guide_legend(ncol = 3))
+
+if (length(selected.variants) > 1) {
+ p <- p +
+ facet_wrap(
+ vars(gPOS),
+ ncol = snakemake@params$n_plot_columns
+ )
+}
+
+ggsave(
+ filename = snakemake@output[["plot"]],
+ plot = p,
+ width = snakemake@params$plot_width_mm,
+ height = plot.height,
+ units = "mm",
+ dpi = 250
+)
diff --git a/workflow/scripts/report/allele_freq_tree_data.R b/workflow/scripts/report/allele_freq_tree_data.R
new file mode 100644
index 0000000..9b20ce1
--- /dev/null
+++ b/workflow/scripts/report/allele_freq_tree_data.R
@@ -0,0 +1,56 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(tibble)
+library(data.table)
+library(ape)
+library(logger)
+
+fix_negative_edge_length <- function(nj.tree) {
+ edge_infos <- cbind(
+ nj.tree$edge,
+ nj.tree$edge.length
+ ) %>%
+ as.data.table()
+ colnames(edge_infos) <- c("from", "to", "length")
+ nega_froms <- edge_infos[length < 0, sort(unique(from))]
+ for (nega_from in nega_froms) {
+ minus_length <- edge_infos[from == nega_from, ][order(length)][1, length]
+ edge_infos[from == nega_from, length := length - minus_length]
+ edge_infos[to == nega_from, length := length + minus_length]
+ }
+ nj.tree$edge.length <- edge_infos$length
+ nj.tree
+}
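+# Worked example (hypothetical lengths): for child edges (5->1) = -0.1 and
+# (5->2) = 0.4 with parent edge (4->5) = 0.3, the most negative length (-0.1)
+# is subtracted from the children and added to the parent, giving
+# (5->1) = 0, (5->2) = 0.5 and (4->5) = 0.2. Root-to-tip path lengths are
+# preserved: tip 1 stays at 0.2 and tip 2 at 0.7.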
+
+log_threshold(INFO)
+
+matrix <- read_csv(snakemake@input$dist)
+
+# Tree construction
+if (snakemake@params$use_bionj) {
+ log_info("Constructing BIONJ tree based on distances")
+ tree_method <- bionj
+} else {
+ log_info("Constructing NJ tree based on distances")
+ tree_method <- nj
+}
+tree <- matrix %>%
+ column_to_rownames(var = "...1") %>%
+ as.matrix() %>%
+ as.dist() %>%
+ tree_method() %>%
+ root(snakemake@params$outgroup_id, resolve.root = TRUE)
+
+# Resolve possible negative edge lengths in tree
+log_info("Resolving negative edge lengths")
+tree <- fix_negative_edge_length(tree)
+
+log_info("Saving tree")
+write.tree(tree, snakemake@output$tree)
diff --git a/workflow/scripts/report/allele_freq_tree_plot.R b/workflow/scripts/report/allele_freq_tree_plot.R
new file mode 100644
index 0000000..2a08851
--- /dev/null
+++ b/workflow/scripts/report/allele_freq_tree_plot.R
@@ -0,0 +1,89 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(ape)
+library(ggplot2)
+library(ggtree)
+library(logger)
+
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+log_info("Building tree annotation")
+
+# Read metadata
+log_debug("Reading metadata")
+metadata <- read_csv(snakemake@input$metadata)
+
+# Extract names from records
+log_debug("Reading FASTA")
+study_records <- read.dna(
+ snakemake@input$study_fasta,
+ format = "fasta",
+  as.matrix = FALSE
+)
+log_debug("Extracting names")
+study_records <- study_records[
+ !startsWith(names(study_records), snakemake@params$outgroup_id)
+]
+study_names <- names(study_records)
+
+# Obtain sample names ordered by CollectionDate
+log_debug("Sorting names by collection date")
+date_order <- metadata %>%
+ arrange(CollectionDate) %>%
+ filter(ID %in% study_names) %>%
+ pull(ID) %>%
+ unique()
+
+log_debug("Building annotation dataframe")
+tree_tiplab <- data.frame(
+ ID = date_order,
+ order = seq(1, length(date_order), 1)
+) %>%
+ rowwise() %>%
+ mutate(
+ tip_label = sprintf(
+ "(%s)-%s",
+ order,
+ ID
+ )
+ ) %>%
+ ungroup() %>%
+ add_row(
+ ID = snakemake@params$outgroup_id,
+ order = 0,
+ tip_label = snakemake@params$outgroup_id
+ )
+
+# Read distance tree and root
+log_info("Reading tree")
+tree <- read.tree(snakemake@input$tree) %>%
+ root(snakemake@params$outgroup_id, resolve.root = TRUE)
+
+# Plot
+log_info("Plotting distance tree")
+max.tip.length <- max(
+  node.depth.edgelength(tree)[seq_along(tree$tip.label)]
+)
+p <- ggtree(tree) %<+% tree_tiplab +
+ geom_tiplab(aes(label = tip_label)) +
+ geom_treescale(1.1 * max.tip.length) +
+ geom_rootedge(0.05 * max.tip.length)
+
+ggsave(
+ filename = snakemake@output$plot,
+ plot = p,
+ width = snakemake@params$plot_width_mm,
+ height = snakemake@params$plot_height_mm,
+ units = "mm",
+ dpi = 250
+)
diff --git a/workflow/scripts/report/context_phylogeny_data.R b/workflow/scripts/report/context_phylogeny_data.R
new file mode 100644
index 0000000..a23587d
--- /dev/null
+++ b/workflow/scripts/report/context_phylogeny_data.R
@@ -0,0 +1,116 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(tibble)
+library(jsonlite)
+library(ape)
+library(logger)
+
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+# Format tree label for well supported nodes
+TREE_LEGEND_NAMES["boot_alrt_pass"] <- sprintf(
+ TREE_LEGEND_NAMES["boot_alrt_pass"],
+ 100 * snakemake@params[["boot_th"]], "%",
+ 100 * snakemake@params[["alrt_th"]], "%"
+)
+
+log_info("Reading tree")
+tree_ml <- read.tree(snakemake@input[["tree"]]) %>%
+ root(
+ snakemake@params[["ref_name"]],
+ resolve.root = TRUE
+ )
+log_debug("Read tree with {length(tree_ml$node.label)} labels")
+
+log_info("Reading target names from FASTA")
+target_names <- read.dna(
+ snakemake@input[["target_fasta"]],
+ format = "fasta",
+  as.matrix = FALSE
+)
+log_debug("Read {length(target_names)} records")
+
+log_info("Processing target names")
+target_names <- target_names[
+ !startsWith(names(target_names), snakemake@config[["ALIGNMENT_REFERENCE"]])
+]
+target_names <- names(target_names)
+log_debug("{length(target_names)} records remaining after processing")
+
+# ML tree with context data
+# Internal nodes color
+# Node labels contain SH-aLRT/UFboot values
+log_info("Reading support values from labels")
+labels <- strsplit(tree_ml$node.label, "/")
+aLRT.values <- sapply(
+ labels,
+ function(x) as.numeric(x[1])
+)
+bootstrap.values <- sapply(
+ labels,
+ function(x) as.numeric(x[2])
+)
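+# IQ-TREE style node labels are assumed here: a label such as "85.3/97" is
+# split on "/" into an SH-aLRT support of 85.3 and a UFboot support of 97.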
+
+log_info("Calculating support mask for the given thresholds")
+aLRT.mask <- aLRT.values >= snakemake@params[["alrt_th"]]
+boot.mask <- bootstrap.values >= snakemake@params[["boot_th"]]
+
+# MRCA for target samples
+log_info("Calculating MRCA of target samples")
+target.mrca <- getMRCA(tree_ml, target_names)
+target.mrca.clade <- extract.clade(tree_ml, target.mrca)
+target.mrca.clade.ntips <- Ntip(target.mrca.clade)
+
+log_info("Building M-L tree annotation")
+tree_ml.ntips <- length(tree_ml$tip.label)
+tree_ml.nodes <- tree_ml$Nnode + tree_ml.ntips
+tree_ml.labels <- tree_ml$tip.label[1:tree_ml.nodes]
+tree_ml.node.pass <- c(rep(FALSE, tree_ml.ntips), aLRT.mask & boot.mask)
+
+ml.tree.annot <- tibble(
+ node = 1:tree_ml.nodes,
+) %>%
+ mutate(
+ Class = case_when(
+ tree_ml.labels %in% target_names ~ TREE_LEGEND_NAMES["tip_label"],
+ tree_ml.node.pass ~ TREE_LEGEND_NAMES["boot_alrt_pass"],
+ TRUE ~ NA
+ )
+ )
+
+# Write output files
+log_info("Writing tree annotation")
+write_csv(ml.tree.annot, snakemake@output$annotation)
+
+log_info("Writing JSON data")
+target.node <- tree_ml$node.label[target.mrca - length(tree_ml$tip.label)]
+list(
+ "boot" = strsplit(target.node, "/")[[1]][2] %>% as.numeric(),
+ "alrt" = strsplit(target.node, "/")[[1]][1] %>% as.numeric(),
+ "monophyly" = ifelse(
+ is.monophyletic(tree_ml, target_names),
+ "are",
+ "are not"
+ ),
+ "target_mrca" = target.mrca,
+ "clade_tips" = target.mrca.clade.ntips,
+ "max_tip_length" = max(node.depth.edgelength(tree_ml)[
+ 1:length(tree_ml$tip.label)
+ ]),
+ "root" = snakemake@params[["ref_name"]]
+) %>%
+ write_json(
+ snakemake@output$json,
+ auto_unbox = TRUE,
+ digits = NA
+ )
diff --git a/workflow/scripts/report/context_phylogeny_plot.R b/workflow/scripts/report/context_phylogeny_plot.R
new file mode 100644
index 0000000..742bb24
--- /dev/null
+++ b/workflow/scripts/report/context_phylogeny_plot.R
@@ -0,0 +1,71 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(jsonlite)
+library(ape)
+library(ggplot2)
+library(ggtree)
+library(logger)
+
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+# Format tree label for well supported nodes
+TREE_LEGEND_NAMES["boot_alrt_pass"] <- sprintf(
+ TREE_LEGEND_NAMES["boot_alrt_pass"],
+ 100 * snakemake@params[["boot_th"]], "%",
+ 100 * snakemake@params[["alrt_th"]], "%"
+)
+
+log_info("Reading JSON data")
+json <- read_json(snakemake@input$json)
+
+log_info("Reading tree")
+tree_ml <- read.tree(snakemake@input[["tree"]]) %>%
+ root(
+ json$root,
+ resolve.root = TRUE
+ )
+
+log_info("Reading tree annotation")
+annotation <- read_csv(snakemake@input$annotation)
+
+log_info("Plotting M-L tree with context samples")
+p <- ggtree(tree_ml, layout = "circular") %<+%
+ annotation +
+ geom_highlight(node = json$target_mrca, colour = "orange", alpha = 0) +
+ geom_point(aes(color = Class, size = Class)) +
+ geom_treescale(1.05 * json$max_tip_length) +
+ geom_rootedge(0.05 * json$max_tip_length) +
+ xlim(-json$max_tip_length / 3, NA) +
+ scale_color_manual(
+ values = setNames(
+ TREE_PALETTE[names(TREE_LEGEND_NAMES)],
+ TREE_LEGEND_NAMES
+ ),
+ na.translate = FALSE
+ ) +
+ scale_size_manual(
+ values = setNames(
+ TREE_NODE_SIZE[names(TREE_LEGEND_NAMES)],
+ TREE_LEGEND_NAMES
+ ),
+ na.translate = FALSE
+ )
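+# The setNames() calls above re-key the palettes (assumed to be named vectors
+# defined in the sourced design file) by the human-readable legend text, e.g.
+# setNames(c(tip_label = "red"), c(tip_label = "Target samples")) yields
+# c("Target samples" = "red"), matching the Class values in the annotation.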
+
+ggsave(
+ filename = snakemake@output[["plot"]],
+ plot = p,
+ width = snakemake@params[["plot_width_mm"]],
+ height = snakemake@params[["plot_height_mm"]],
+ units = "mm",
+ dpi = 250
+)
diff --git a/workflow/scripts/report/demix_plot.R b/workflow/scripts/report/demix_plot.R
new file mode 100644
index 0000000..1cf1c3d
--- /dev/null
+++ b/workflow/scripts/report/demix_plot.R
@@ -0,0 +1,56 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(ggplot2)
+library(logger)
+
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+# Read inputs
+log_info("Plotting")
+p <- read_csv(snakemake@input$data) %>%
+ mutate(
+ sample = reorder(sample, CollectionDate),
+ lineage = ifelse(is_main, lineage, NA)
+ ) %>%
+ ggplot() +
+ aes(
+ x = sample,
+ y = abundance,
+ fill = lineage
+ ) +
+ scale_fill_viridis_d(
+ na.value = "gray50",
+ labels = function(x) {
+ ifelse(is.na(x), "Other", x)
+ }
+ ) +
+ geom_col() +
+ theme(
+ axis.text.x = element_text(angle = 60, hjust = 1),
+ legend.position = "bottom"
+ ) +
+ labs(
+ x = "Sample",
+ y = "Relative abundance",
+ fill = "Lineage"
+ )
+
+log_info("Saving plot")
+ggsave(
+ filename = snakemake@output[["plot"]],
+ plot = p,
+ width = snakemake@params$plot_width_mm,
+ height = snakemake@params$plot_height_mm,
+ units = "mm",
+ dpi = 250
+)
diff --git a/workflow/scripts/report/demix_plot_data.R b/workflow/scripts/report/demix_plot_data.R
new file mode 100644
index 0000000..cb0c08b
--- /dev/null
+++ b/workflow/scripts/report/demix_plot_data.R
@@ -0,0 +1,58 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(logger)
+
+log_threshold(INFO)
+
+# Read inputs
+demix <- read_csv(snakemake@input[["summary_demixing"]])
+
+# Data processing
+log_info("Obtaining main lineages")
+main_lineages <- demix %>%
+ group_by(sample) %>%
+ top_n(1, abundance) %>%
+ ungroup() %>%
+ pull(lineage) %>%
+ unique()
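+# Note: top_n(1, abundance) keeps ties, so a sample whose two most abundant
+# lineages are exactly equal contributes both of them to main_lineages.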
+
+# Obtain sample names ordered by CollectionDate
+log_info("Sorting dates")
+metadata <- read_csv(snakemake@input[["metadata"]])
+date_order <- metadata %>%
+ arrange(CollectionDate) %>%
+ filter(ID %in% demix$sample) %>%
+ pull(ID) %>%
+ unique()
+
+# Build plot data
+log_info("Building plot data")
+plot.data <- demix %>%
+ group_by(lineage, sample) %>%
+ summarize(abundance = sum(abundance)) %>%
+ ungroup() %>%
+ left_join(
+ select(
+ metadata,
+ ID,
+ CollectionDate
+ ),
+ by = c("sample" = "ID")
+ ) %>%
+ mutate(
+ is_main = lineage %in% main_lineages
+ )
+
+# Save plot data
+log_info("Saving plot data")
+write_csv(
+ plot.data,
+ snakemake@output[["data"]]
+)
diff --git a/workflow/scripts/report/diversity_data.R b/workflow/scripts/report/diversity_data.R
new file mode 100644
index 0000000..9ca5918
--- /dev/null
+++ b/workflow/scripts/report/diversity_data.R
@@ -0,0 +1,116 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+set.seed(snakemake@params$seed)
+
+library(ape)
+library(pegas)
+library(future)
+library(future.apply)
+library(readr)
+library(jsonlite)
+library(logger)
+
+log_threshold(INFO)
+
+# Pi calculation
+nucleotide.diversity <- function(dna_object, record.names, sample.size) {
+ sample <- sample(record.names, sample.size, replace = FALSE)
+ dna_subset <- dna_object[record.names %in% sample]
+ nuc.div(dna_subset)
+}
+
+# Parallel bootstrapping
+boot.nd.parallel <- function(aln, sample.size, reps) {
+ record.names <- names(aln)
+ future_sapply(
+ 1:reps,
+ function(x) {
+ nucleotide.diversity(aln, record.names, sample.size)
+ },
+ future.seed = TRUE
+ )
+}
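+# Usage sketch (hypothetical numbers): boot.nd.parallel(aln, 12, 1000) draws
+# 1000 random subsets of 12 context sequences and returns the nucleotide
+# diversity (pi) of each, giving a null distribution to compare the target
+# samples against.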
+
+# Read outgroup/context alignment
+log_info("Reading context")
+gene_ex <- read.dna(
+ snakemake@input[["context_fasta"]],
+ format = "fasta",
+ as.matrix = FALSE
+)
+gene_ex <- gene_ex[
+ !startsWith(names(gene_ex), snakemake@params$aln_reference)
+]
+
+# Read target (study) alignment
+log_info("Reading target alignment")
+study_aln <- read.dna(
+ snakemake@input[["study_fasta"]],
+ format = "fasta",
+ as.matrix = FALSE
+)
+study_aln <- study_aln[
+ !startsWith(names(study_aln), snakemake@params$aln_reference)
+]
+
+# Diversity value for our samples
+log_info("Calculating diversity value for studied samples")
+diversity <- nuc.div(study_aln)
+
+# Perform bootstrap
+log_info("Performing calculation for nucleotide diversity in context samples")
+plan(multisession, workers = snakemake@threads)
+divs <- boot.nd.parallel(
+ gene_ex,
+ length(study_aln),
+ snakemake@params[["bootstrap_reps"]]
+)
+plan(sequential)
+
+# Test normality
+log_info("Normality test for nucleotide diversity values")
+st <- shapiro.test(divs)
+
+# Calculate p-value (assuming normal distribution)
+log_info("Calculating p-value (assuming normal distribution)")
+test <- t.test(
+ divs,
+ alternative = "greater",
+ mu = diversity,
+ conf.level = 0.95
+)
+pvalue.norm <- test$p.value
+
+# Estimate p-value empirically
+log_info("Estimating p-value empirically")
+empirical.probs <- ecdf(divs)
+pvalue.emp <- empirical.probs(diversity)
+
+# Data for JSON file
+log_info("Building JSON data")
+p.value <- ifelse(st$p.value >= 0.05, pvalue.norm, pvalue.emp)
+list.div <- list(
+ "diversity" = diversity,
+ "p.value" = ifelse(p.value >= 0.001, p.value, "< 0.001"),
+ "normal.pvalue" = ifelse(st$p.value >= 0.001, p.value, "< 0.001"),
+ "norm.text" = ifelse(st$p.value >= 0.05, "", "not"),
+ "type.test" = ifelse(st$p.value >= 0.05, "", "empirical"),
+ "boot.reps" = snakemake@params[["bootstrap_reps"]],
+ "sample.size" = length(study_aln)
+)
+
+log_info("Writing diversity distribution")
+write_lines(divs, snakemake@output[["divs"]])
+
+log_info("Writing results JSON")
+write_json(
+ list.div,
+ snakemake@output[["json"]],
+ auto_unbox = TRUE,
+ digits = NA # maximum precision
+)
diff --git a/workflow/scripts/report/diversity_plot.R b/workflow/scripts/report/diversity_plot.R
index 3ad0131..0c42118 100644
--- a/workflow/scripts/report/diversity_plot.R
+++ b/workflow/scripts/report/diversity_plot.R
@@ -5,145 +5,57 @@ log <- file(snakemake@log[[1]], open = "wt")
sink(log, type = "message")
sink(log, type = "output")
-library(ape)
-library(pegas)
-library(future.apply)
-library(tidyverse)
+library(dplyr)
+library(readr)
+library(ggplot2)
library(jsonlite)
library(logger)
-log_threshold(INFO)
-
-# Pi calculation
-nucleotide.diversity <- function(dna_object, record.names, sample.size) {
- sample <- sample(record.names, sample.size, replace = FALSE)
- dna_subset <- dna_object[record.names %in% sample]
- nuc.div(dna_subset)
-}
-# Parallel bootstrapping
-boot.nd.parallel <- function(aln, sample.size = 12, reps = 100) {
- record.names <- names(aln)
- future_sapply(
- 1:reps,
- function(x) {
- nucleotide.diversity(aln, record.names, sample.size)
- },
- future.seed = TRUE
- )
-}
+log_threshold(INFO)
# Import file with plots style
source(snakemake@params[["design"]])
-# Outgroup/context alignment
-gene_ex <- read.dna(
- snakemake@input[["context_fasta"]],
- format = "fasta",
- as.matrix = FALSE
-)
-gene_ex <- gene_ex[
- !startsWith(names(gene_ex), snakemake@config[["ALIGNMENT_REFERENCE"]])
-]
-
-# Study alignment
-study_aln <- read.dna(
- snakemake@input[["study_fasta"]],
- format = "fasta",
- as.matrix = FALSE
-)
-study_aln <- study_aln[
- !startsWith(names(study_aln), snakemake@config[["ALIGNMENT_REFERENCE"]])
-]
-
-# Diversity value for our samples
-log_info("Calculating diversity value for studied samples")
-diversity <- nuc.div(study_aln)
-
-
-# Perform bootstrap
-log_info("Performing calculation for nucleotide diversity in context samples")
-plan(multisession, workers = snakemake@threads)
-divs <- boot.nd.parallel(
- gene_ex,
- length(study_aln),
- snakemake@params[["bootstrap_reps"]]
-)
-plan(sequential)
-
-# Test normality
-log_info("Normality test for nucleotide diversity values")
-st <- shapiro.test(divs)
-
-# Calculate p-value (assuming normal distribution)
-
-log_info("Calculating p-value (assuming normal distribution)")
-
-test <- t.test(
- divs,
- alternative = "greater",
- mu = diversity,
- conf.level = 0.95
-)
-pvalue.norm <- test$p.value
-
-# Estimate p-value empirically
-log_info("Estimating p-value empirically")
-empirical.probs <- ecdf(divs)
-pvalue.emp <- empirical.probs(diversity)
+# Read data
+log_info("Reading diversity results")
+divs <- read_lines(snakemake@input$divs) %>%
+ as.numeric()
+json <- read_json(snakemake@input$json)
# Plot and save
-log_info("Plotting diversity plot")
+log_info("Plotting")
p <- data.frame(pi = divs) %>%
ggplot() +
geom_density(
aes(x = pi),
- fill = "#fcbf49",
+ fill = DIVERSITY_PALETTE["density_fill"],
alpha = 0.7,
bw = 0.000001,
- color = "#eae2b7"
- ) +
- geom_vline(
- aes(xintercept = diversity),
- color = "#d62828"
+ linewidth = 0.5,
+ color = DIVERSITY_PALETTE["density_color"]
) +
stat_function(
fun = dnorm,
args = list(mean = mean(divs), sd = sd(divs)),
- color = "#f77f00"
+ color = DIVERSITY_PALETTE["dnorm_color"],
+ linewidth = 1
+ ) +
+ geom_vline(
+ xintercept = json$diversity,
+ color = DIVERSITY_PALETTE["value_color"],
+ linewidth = 1
) +
labs(
x = "π",
y = "Density"
- )
+ ) +
+ xlim(0, NA)
ggsave(
- filename = snakemake@output[["fig"]],
+ filename = snakemake@output[["plot"]],
plot = p,
- width = snakemake@params[["plot_width"]],
- height = snakemake@params[["plot_height"]],
+ width = snakemake@params[["plot_width_mm"]],
+ height = snakemake@params[["plot_height_mm"]],
units = "mm",
dpi = 250
)
-
-# DATA JSON #####
-p.value <- ifelse(st$p.value >= 0.05, pvalue.norm, pvalue.emp)
-
-list.div <- list(
- "diversity" = format(diversity, scientific = TRUE),
- "p.value" = ifelse(p.value >= 0.001, p.value, "< 0.001"),
- "normal.pvalue" = ifelse(st$p.value >= 0.001, p.value, "< 0.001"),
- "norm.text" = ifelse(st$p.value >= 0.05, "", "not"),
- "type.test" = ifelse(st$p.value >= 0.05, "", "empirical"),
- "boot.reps" = snakemake@params[["bootstrap_reps"]],
- "sample.size" = length(study_aln)
-)
-
-json <- toJSON(list.div)
-write(json, snakemake@output[["json"]])
-
-# PLOT TABLES
-data.frame(
- pi = divs,
- prop.value = diversity
-) %>%
- write.csv(snakemake@output[["table"]], row.names = FALSE)
diff --git a/workflow/scripts/report/dnds_plots.R b/workflow/scripts/report/dnds_plots.R
new file mode 100644
index 0000000..62b2e41
--- /dev/null
+++ b/workflow/scripts/report/dnds_plots.R
@@ -0,0 +1,89 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(tidyr)
+library(ggplot2)
+library(logger)
+
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+# Read inputs
+log_info("Reading plot data")
+plot.df <- read_delim(snakemake@input$table) %>%
+ filter(w != Inf) %>%
+ pivot_longer(
+ c("dN", "dS", "w"),
+ values_to = "value",
+ names_to = "ratio"
+ )
+
+log_info("Plotting dN and dS")
+p.dn.ds <- plot.df %>%
+ filter(ratio != "w") %>%
+ ggplot() +
+ aes(
+ x = day,
+ y = value,
+ color = ratio,
+ shape = ratio
+ ) +
+ geom_point(size = 3) +
+ geom_line() +
+ scale_color_manual(
+ name = "",
+ labels = DNDS_LABELS,
+ values = DNDS_COLORS
+ ) +
+ scale_shape_manual(
+ name = "",
+ values = DNDS_SHAPES,
+ labels = DNDS_LABELS
+ ) +
+ labs(
+ y = "Substitution rate",
+ x = "Days since the initial sampling",
+ color = ""
+ )
+
+ggsave(
+ filename = snakemake@output$plot_dn_ds,
+ plot = p.dn.ds,
+ width = snakemake@params$plot_width_mm,
+ height = snakemake@params$plot_height_mm,
+ units = "mm",
+ dpi = 250
+)
+
+log_info("Plotting w (dN/dS)")
+p.omega <- plot.df %>%
+ filter(ratio == "w") %>%
+ ggplot() +
+ aes(
+ x = day,
+ y = value,
+ ) +
+ geom_point(color = "black") +
+ geom_line(color = "black") +
+ labs(
+ y = "w (dN/dS)",
+ x = "Days since the initial sampling",
+ color = ""
+ )
+
+ggsave(
+ filename = snakemake@output$plot_omega,
+ plot = p.omega,
+ width = snakemake@params$plot_width_mm,
+ height = snakemake@params$plot_height_mm,
+ units = "mm",
+ dpi = 250
+)
diff --git a/workflow/scripts/report/evo_plots.R b/workflow/scripts/report/evo_plots.R
deleted file mode 100644
index e0ed984..0000000
--- a/workflow/scripts/report/evo_plots.R
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/usr/bin/env Rscript
-
-# Write stdout and stderr to log file
-log <- file(snakemake@log[[1]], open = "wt")
-sink(log, type = "message")
-sink(log, type = "output")
-
-library(tidyverse)
-library(logger)
-log_threshold(INFO)
-
-# Import file with plots style
-source(snakemake@params[["design"]])
-
-# Read inputs
-vcf <- read_delim(snakemake@input[["vcf"]])
-metadata <- read_delim(snakemake@input[["metadata"]])
-N_S_position <- read_delim(snakemake@input[["N_S"]])
-
-# DATA PROCESSING
-# Create SNP variable and select useful variables
-vcf <- vcf %>%
- dplyr::select(
- variant,
- REGION,
- ALT_FREQ,
- GFF_FEATURE,
- synonimous,
- POS
- )
-
-# Create variable for days sins first sample in metadata
-metadata <- metadata %>%
- mutate(
- interval = as.numeric(
- as.Date(CollectionDate) - min(as.Date(CollectionDate))
- )
- ) %>%
- select(ID, interval) %>%
- rename(REGION = ID)
-
-vcf <- left_join(vcf, metadata)
-
-# PLOT
-log_info("Ploting dN and dS over time")
-plot_df <- vcf %>%
- group_by(REGION, synonimous) %>%
- summarise(
- Freq = sum(ALT_FREQ, na.rm = TRUE)
- ) %>%
- pivot_wider(
- names_from = synonimous,
- values_from = Freq,
- values_fill = 0
- ) %>%
- transmute(
- dn = No / sum(N_S_position$N),
- ds = Yes / sum(N_S_position$S)
- ) %>%
- ungroup() %>%
- mutate(
- w = dn / ds,
- ) %>%
- filter(w != Inf) %>%
- pivot_longer(
- c("dn", "ds", "w"),
- values_to = "value",
- names_to = "d"
- ) %>%
- left_join(unique(select(vcf, REGION, interval)))
-
-plot <- plot_df %>%
- filter(d != "w") %>%
- ggplot() +
- aes(
- x = interval,
- y = value,
- color = d,
- shape = d
- ) +
- geom_point(size = 3) +
- geom_line() +
- scale_color_manual(
- name = "Parameter",
- labels = dnds.labels,
- values = dnds.colors
- ) +
- scale_shape_manual(
- name = "Parameter",
- values = dnds.shapes,
- labels = dnds.labels
- ) +
- labs(
- y = "Substitution rate",
- x = "Days since the initial sampling",
- color = "Parameter"
- )
-
-ggsave(
- filename = snakemake@output[["plot"]],
- plot = plot,
- width = 159.2,
- height = 119.4,
- units = "mm",
- dpi = 250
-)
-
-# Plot for omega
-
-plot_omega <- plot_df %>%
- filter(d == "w") %>%
- ggplot() +
- aes(
- x = interval,
- y = value,
- ) +
- geom_point(color = "black") +
- geom_line(color = "black") +
- labs(
- y = "w (dN/dS)",
- x = "Days since the initial sampling",
- color = "Parameter"
- )
-
-ggsave(
- filename = snakemake@output[["plot_omega"]],
- plot = plot_omega,
- width = 159.2,
- height = 119.4,
- units = "mm",
- dpi = 250
-)
-
-# PLOT TABLES
-log_info("Saving plot table")
-vcf %>%
- group_by(REGION, synonimous) %>%
- summarise(
- Freq = sum(ALT_FREQ, na.rm = TRUE)
- ) %>%
- pivot_wider(
- names_from = synonimous,
- values_from = Freq,
- values_fill = 0
- ) %>%
- transmute(
- dn = No / sum(N_S_position$N),
- ds = Yes / sum(N_S_position$S)
- ) %>%
- ungroup() %>%
- left_join(unique(select(vcf, REGION, interval))) %>%
- transmute(
- sample = REGION,
- DaysSinceFirst = interval,
- dN = dn,
- dS = ds,
- w = dn / ds
- ) %>%
- write.csv(snakemake@output[["table"]], row.names = FALSE)
diff --git a/workflow/scripts/report/extract_genbank_regions.py b/workflow/scripts/report/extract_genbank_regions.py
new file mode 100644
index 0000000..61f3a33
--- /dev/null
+++ b/workflow/scripts/report/extract_genbank_regions.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+
+import logging
+import json
+
+from Bio import SeqIO
+from Bio.SeqFeature import ExactPosition
+
+
+if __name__ == "__main__":
+ logging.basicConfig(
+ filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"],
+ level=logging.INFO
+ )
+
+ logging.info("Reading GenBank file")
+ gb = SeqIO.read(snakemake.input.gb, format="gb")
+
+ logging.info("Calculating CDS regions")
+ regions = {}
+ for feature in gb.features:
+ identifier = "|".join(feature.qualifiers.get(snakemake.params.gb_qualifier, []))
+ if identifier == "":
+ logging.error(f"Feature at {feature.location} has no qualifier '{snakemake.params.gb_qualifier_display}'")
+ elif identifier in regions:
+ logging.warning(f"Identifier '{identifier}' is already among coding records and will not be replaced by feature at {feature.location}")
+ else:
+ logging.debug(f"Adding feature")
+ if type(feature.location.start) is not ExactPosition or type(feature.location.start) is not ExactPosition:
+ logging.warning(f"Feature '{identifier}' location is not exact but will be treated as such: {feature.location}")
+ regions[identifier] = (
+ int(feature.location.start.real + 1),
+ int(feature.location.end.real)
+ )
+
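+    # Note on the conversion above: Biopython locations are 0-based and
+    # half-open, so `start + 1` / `end` yield 1-based inclusive coordinates,
+    # e.g. FeatureLocation(0, 10) becomes (1, 10).
+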
+ logging.info("Calculating intergenic regions")
+ cds_names = tuple(regions.keys())
+ intergenic_count = 0
+ for i in range(1, len(cds_names)):
+ start1, end1 = regions[cds_names[i-1]]
+ start2, end2 = regions[cds_names[i]]
+ if (start2 - end1) > 1:
+ intergenic_count += 1
+ regions[f"Intergenic_{intergenic_count}"] = (end1 + 1, start2 - 1)
+
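+    # Worked example (hypothetical regions): consecutive CDSs at (1, 100) and
+    # (151, 300) leave a 50-base gap, recorded as Intergenic_1 = (101, 150).
+    # Neighbouring features are compared in insertion order, so this assumes
+    # the GenBank features are sorted by position.
+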
+ logging.info("Writing regions to JSON file")
+ with open(snakemake.output.regions, "w") as fw:
+ json.dump(regions, fw, indent=2)
diff --git a/workflow/scripts/report/freyja_plot.R b/workflow/scripts/report/freyja_plot.R
deleted file mode 100644
index e698091..0000000
--- a/workflow/scripts/report/freyja_plot.R
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env Rscript
-
-# Write stdout and stderr to log file
-log <- file(snakemake@log[[1]], open = "wt")
-sink(log, type = "message")
-sink(log, type = "output")
-
-library(tidyverse)
-library(logger)
-log_threshold(INFO)
-
-# Import file with plots style
-source(snakemake@params[["design"]])
-
-# Read inputs
-demix <- read_csv(snakemake@input[["summary_demixing"]])
-
-# DATA PROCESSING
-log_info("Obtaining main lineages")
-main_lineages <- demix %>%
- group_by(sample) %>%
- top_n(1, abundances) %>%
- ungroup() %>%
- pull(lineages) %>%
- unique()
-
-# Obtain sample names ordered by CollectionDate
-metadata <- read_csv(snakemake@input[["metadata"]])
-date_order <- metadata %>%
- arrange(CollectionDate) %>%
- filter(ID %in% demix$sample) %>%
- pull(ID) %>%
- unique()
-
-# PLOT
-log_info("Plotting summary demixing")
-demix_plot <- demix %>%
- mutate(
- lineages = case_when(
- lineages %in% main_lineages ~ lineages
- )
- ) %>%
- group_by(lineages, sample) %>%
- mutate(
- abundances = sum(abundances)
- ) %>%
- unique() %>%
- ggplot() +
- aes(
- x = factor(sample, date_order),
- y = as.numeric(abundances),
- fill = lineages
- ) +
- scale_fill_viridis_d(
- na.value = "gray50",
- labels = function(x) {
- ifelse(is.na(x), "Other", x)
- }
- ) +
- geom_col() +
- theme(
- axis.text.x = element_text(angle = 60, hjust = 1),
- legend.position = "bottom"
- ) +
- labs(
- x = "Sample",
- y = "Relative abundance",
- fill = "Lineage"
- )
-
-ggsave(
- filename = snakemake@output[["fig"]],
- plot = demix_plot,
- width = 159.2,
- height = 119.4,
- units = "mm",
- dpi = 250
-)
-
-
-# PLOT TABLES
-log_info("Saving plot table")
-demix %>%
- mutate(
- lineages = case_when(
- lineages %in% main_lineages ~ lineages,
- TRUE ~ "Other"
- )
- ) %>%
- group_by(sample, lineages) %>%
- summarise(abundances = sum(abundances)) %>%
- ungroup() %>%
- left_join(
- select(
- metadata,
- ID,
- CollectionDate
- ),
- by = c("sample" = "ID")
- ) %>%
- write.csv(snakemake@output[["table"]], row.names = FALSE)
diff --git a/workflow/scripts/report/get_annotation.py b/workflow/scripts/report/get_annotation.py
deleted file mode 100644
index 552d70d..0000000
--- a/workflow/scripts/report/get_annotation.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env python3
-
-import json
-import logging
-import pandas as pd
-from gb2seq.features import Features
-from Bio import SeqIO
-
-
-def main():
- logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
-
- logging.info("Reading features")
- ft = Features(snakemake.input.gb)
- with open(snakemake.input.features) as f:
- feature_key = json.load(f)
-
- logging.info("Reading reference")
- reference = str(next(SeqIO.parse(snakemake.input.ref, "fasta")).seq)
- positions = [x for x in range(len(reference))]
-
- logging.info("Building annotation")
- genes = []
- for pos in positions:
- if len(ft.getFeatureNames(pos)) == 0:
- genes.append("Intergenic")
- else:
- genes.append(feature_key[list(ft.getFeatureNames(pos))[0]])
-
- logging.info("Writing table")
- df = pd.DataFrame({"POS": positions, "GEN": genes})
- df.to_csv(snakemake.output.df, index= False)
-
-
-if __name__ == "__main__":
- main()
diff --git a/workflow/scripts/report/heatmap.R b/workflow/scripts/report/heatmap.R
deleted file mode 100644
index 08a2d73..0000000
--- a/workflow/scripts/report/heatmap.R
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env Rscript
-
-# Write stdout and stderr to log file
-log <- file(snakemake@log[[1]], open = "wt")
-sink(log, type = "message")
-sink(log, type = "output")
-
-library(tidyverse)
-library(logger)
-log_threshold(INFO)
-
-vcf <- read_tsv(snakemake@input[["vcf"]])
-
-# Obtain sample names ordered by CollectionDate
-date_order <- read_csv(snakemake@input[["metadata"]]) %>%
- arrange(CollectionDate) %>%
- pull(ID) %>%
- unique()
-
-# Create SNP variable and select useful variables from vcf
-vcf <- vcf %>%
- dplyr::select(variant, REGION, ALT_FREQ)
-
-vcf <- vcf %>%
- pivot_wider(
- names_from = variant,
- values_from = ALT_FREQ,
- values_fill = 0,
- values_fn = sum
- ) %>%
- arrange(factor(REGION, levels = date_order)) %>%
- column_to_rownames(var = "REGION")
-
-log_info("Saving table to create heatmap")
-write.csv(vcf, snakemake@output[["table"]])
diff --git a/workflow/scripts/report/nv_panel_data.R b/workflow/scripts/report/nv_panel_data.R
new file mode 100644
index 0000000..5cb6880
--- /dev/null
+++ b/workflow/scripts/report/nv_panel_data.R
@@ -0,0 +1,108 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(stringr)
+library(tibble)
+library(jsonlite)
+library(logger)
+
+log_threshold(INFO)
+
+log_info("Reading variants")
+variants <- read_delim(
+ snakemake@input$variants,
+ col_select = c(
+ "SAMPLE",
+ "REGION",
+ "POS",
+ "ALT",
+ "VARIANT_NAME",
+ "ALT_FREQ",
+ "EFFECT",
+ "SYNONYMOUS"
+ )
+)
+log_debug("Read {nrow(variants)} rows")
+
+log_info("Reading dates from metadata")
+dates <- read_delim(
+ snakemake@input$metadata,
+ col_select = c("ID", "CollectionDate")
+)
+
+log_info("Dating variants")
+variants <- left_join(
+ variants, dates,
+ by = c("SAMPLE" = "ID")
+)
+
+log_info("Classifying variants")
+variants <- variants %>%
+ mutate(
+ NV_class = ifelse(
+ str_detect(ALT, fixed("-")) | str_detect(ALT, fixed("+")),
+ "INDEL",
+ "SNP"
+ ),
+ Class = ifelse(EFFECT == "intergenic_region", "Intergenic", SYNONYMOUS),
+ POS = as.numeric(POS)
+ ) %>%
+ mutate(
+ indel_len = ifelse(NV_class == "INDEL", str_length(ALT) - 1, NA),
+ indel_class = case_when(
+ EFFECT == "intergenic_region" ~ "Intergenic",
+ (NV_class == "INDEL") & (indel_len %% 3 == 0) ~ "In frame",
+ (NV_class == "INDEL") & (indel_len %% 3 != 0) ~ "Frameshift"
+ )
+ ) %>%
+ ungroup() %>%
+ mutate(
+ group = case_when(
+ EFFECT == "intergenic_region" ~ "Intergenic",
+ NV_class == "SNP" ~ Class,
+ NV_class == "INDEL" ~ indel_class
+ )
+ ) %>%
+ filter(ALT_FREQ > 0)
+log_debug("Processed {nrow(variants)} rows")
+
+if (nrow(variants) == 0) {
+  log_warn("No variants found, using an empty table")
+  variants <- tibble(
+    SAMPLE = unique(dates$ID), # placeholder rows, one per sample in metadata
+    CollectionDate = as.Date(NA),
+ REGION = as.character(NA),
+ VARIANT_NAME = as.character(NA),
+ ALT_FREQ = as.numeric(NA),
+ EFFECT = as.character(NA),
+ SYNONYMOUS = as.character(NA),
+ POS = as.numeric(NA),
+ ALT = as.character(NA),
+ NV_class = as.character(NA),
+ group = as.character(NA)
+ )
+}
+
+log_info("Writing processed table")
+write_csv(variants, snakemake@output$table)
+
+log_info("Writing NV summary")
+nv_counts <- variants %>%
+ distinct(VARIANT_NAME, NV_class) %>%
+ count(NV_class) %>%
+ deframe()
+list(
+ "INDELS" = nv_counts["INDEL"],
+ "SNV" = nv_counts["SNP"]
+) %>%
+ write_json(
+ snakemake@output$json,
+ auto_unbox = TRUE,
+ digits = NA
+ )
diff --git a/workflow/scripts/report/nv_panel_plot.R b/workflow/scripts/report/nv_panel_plot.R
new file mode 100644
index 0000000..da7eca7
--- /dev/null
+++ b/workflow/scripts/report/nv_panel_plot.R
@@ -0,0 +1,165 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(ggplot2)
+library(ggpubr)
+library(jsonlite)
+library(tibble)
+library(purrr)
+library(logger)
+
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+# Subplot A: windows ============================
+log_info("Reading variant windows")
+window <- read_csv(snakemake@input$window)
+
+log_debug("Calculating X axis limits")
+xlim_values <- c(
+ max(0, min(window$position) - snakemake@params$window_step),
+ max(window$position) + snakemake@params$window_step
+)
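+# Example (hypothetical values): with window positions spanning 150..29750 and
+# window_step = 50, the limits become c(100, 29800) -- one step of padding on
+# each side, clamped at 0 on the left.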
+
+log_info("Plotting variant windows")
+p1 <- window %>%
+ ggplot() +
+ aes(
+ x = position,
+ y = fraction,
+ color = feature
+ ) +
+ geom_point() +
+ geom_line(
+ aes(group = 1),
+ colour = "black",
+ alpha = 0.3
+ ) +
+ scale_y_continuous(
+ label = scales::percent,
+ limits = c(0, max(window$fraction) + mean(window$fraction))
+ ) +
+ xlim(xlim_values) +
+ # TODO: change GENE_PALETTE to selection of TRAJECTORY.PANEL.COLORS ?
+ scale_color_manual(values = GENE_PALETTE) +
+ labs(
+ y = "Proportion of\nsites with NV",
+ x = "",
+ color = "Gen"
+ )
+
+log_info("Reading regions to highlight")
+highlight <- read_csv(snakemake@input$highlight_window_regions)
+
+if (nrow(highlight) != 0) {
+ log_info("Highlighting {nrow(highlight)} regions")
+ highlighted_sites <- unique(c(highlight$start, highlight$end))
+ p1 <- p1 +
+ geom_vline(xintercept = highlighted_sites, color = "orange") +
+ geom_text(
+ data = highlight,
+ aes(
+ x = (start + end) / 2,
+        y = max(window$fraction) + mean(window$fraction) / 2,
+ label = region
+ ),
+ inherit.aes = FALSE,
+ size = 3,
+ angle = 45
+ )
+} else {
+ log_info("No highlighted regions")
+}
+
+# Subplot B: variant panel ============================
+log_info("Reading panel data")
+panel <- read_csv(snakemake@input$panel)
+
+log_info("Calculating sample chronological order")
+date_order <- panel %>%
+ arrange(CollectionDate) %>%
+ pull(SAMPLE) %>%
+ unique()
+
+log_info("Reading regions")
+regions <- read_json(snakemake@input$regions, simplifyVector = TRUE) %>%
+ enframe(name = "region", value = "coords") %>%
+ mutate(
+ start = map_dbl(coords, 1),
+ end = map_dbl(coords, 2),
+ length = end - start + 1
+ ) %>%
+ select(-coords)
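+# The regions JSON (presumably produced by extract_genbank_regions.py) maps
+# names to (start, end) pairs, e.g. {"S": [21563, 25384]}; enframe() plus
+# map_dbl() unpacks that into one row per region with start, end and length.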
+
+log_info("Plotting variants")
+p2 <- panel %>%
+ ggplot() +
+ aes(
+ x = POS,
+ y = factor(SAMPLE, date_order),
+ shape = factor(NV_class, c("SNP", "INDEL")),
+ color = group,
+ alpha = ALT_FREQ
+ ) +
+ geom_point(size = 3) +
+ geom_col(
+ data = regions,
+ aes(
+ x = length,
+ y = 0.3,
+ # TODO: legend shows as NA those without match in `regions`
+ fill = factor(region, rev(regions$region))
+ ),
+ inherit.aes = FALSE,
+ width = 0.3
+ ) +
+ # TODO: change GENE_PALETTE to selection of TRAJECTORY.PANEL.COLORS ?
+ scale_fill_manual(values = GENE_PALETTE) +
+ xlim(xlim_values) +
+ scale_color_manual(
+ labels = NV_TYPE_NAMES,
+ values = NV_TYPE_PALETTE
+ ) +
+ scale_y_discrete(drop = FALSE) +
+ labs(
+ x = "Genome position",
+ y = "Sample",
+ shape = "Variant class",
+ color = "Classification",
+ alpha = "Frequency",
+ fill = "Region"
+ ) +
+ guides(
+ fill = guide_legend(reverse = TRUE)
+ )
+
+# Plot arrangement ============================
+log_info("Arranging plots")
+p <- ggarrange(
+ p1,
+ p2,
+ nrow = 2,
+ align = "v",
+ legend.grob = get_legend(p2),
+ heights = c(2, 6),
+ legend = "right",
+ labels = c("A", "B")
+)
+
+log_info("Saving plot")
+ggsave(
+ filename = snakemake@output$plot,
+ plot = p,
+ width = snakemake@params$plot_width_mm,
+ height = snakemake@params$plot_height_mm,
+ units = "mm",
+ dpi = 250
+)
diff --git a/workflow/scripts/report/nv_panel_zoom_on_feature_data.py b/workflow/scripts/report/nv_panel_zoom_on_feature_data.py
new file mode 100644
index 0000000..4989e68
--- /dev/null
+++ b/workflow/scripts/report/nv_panel_zoom_on_feature_data.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+import sys
+import logging
+import json
+
+import pandas as pd
+
+
+if __name__ == "__main__":
+
+ logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
+
+ logging.info("Reading regions")
+ with open(snakemake.input.regions) as f:
+ regions = json.load(f)
+
+ if snakemake.wildcards.region_name not in regions:
+ logging.error(f"Region {snakemake.wildcards.region_name} is absent in {snakemake.input.regions}")
+ sys.exit(1)
+
+ start, end = regions[snakemake.wildcards.region_name]
+
+ logging.info("Reading input table")
+ df = pd.read_csv(snakemake.input.table)
+
+ logging.info("Filtering sites and writing results")
+ df[(df.POS >= start) & (df.POS <= end)].to_csv(snakemake.output.table, index=False)
diff --git a/workflow/scripts/report/phylo_plots.R b/workflow/scripts/report/phylo_plots.R
deleted file mode 100644
index 447d873..0000000
--- a/workflow/scripts/report/phylo_plots.R
+++ /dev/null
@@ -1,291 +0,0 @@
-#!/usr/bin/env Rscript
-
-# Write stdout and stderr to log file
-log <- file(snakemake@log[[1]], open = "wt")
-sink(log, type = "message")
-sink(log, type = "output")
-
-library(tidyverse)
-library(stringi)
-library(ape)
-library(ggtree)
-library(data.table)
-library(ggpubr)
-library(pegas)
-library(jsonlite)
-library(logger)
-
-log_threshold(INFO)
-
-# Import file with plots style
-source(snakemake@params[["design"]])
-
-# legend thresholds for ml tree
-legend.names <- c(
- tip_label = "Target samples",
- boot_alrt_pass = sprintf(
- "UFBoot ≥ %s%s & SH-aLRT ≥ %s%s ",
- snakemake@params[["boot_th"]],
- "%",
- snakemake@params[["alrt_th"]],
- "%"
- )
-)
-
-matrix <- read_csv(snakemake@input[["dist"]])
-metadata <- read_csv(snakemake@input[["metadata"]])
-tree_ml <- read.tree(snakemake@input[["ml"]]) %>%
- root(
- snakemake@params[["ref_name"]],
- resolve.root = TRUE
- )
-
-study_names <- read.dna(
- snakemake@input[["study_fasta"]],
- format = "fasta",
- as.matrix = FALSE,
-)
-
-study_names <- study_names[
- !startsWith(names(study_names), snakemake@config[["ALIGNMENT_REFERENCE"]])
-]
-study_names <- names(study_names)
-
-# Obtain sample names ordered by CollectionDate
-date_order <- metadata %>%
- arrange(CollectionDate) %>%
- filter(ID %in% study_names) %>%
- pull(ID) %>%
- unique()
-
-tree_tiplab <- data.frame(
- ID = date_order,
- order = seq(1, length(date_order), 1)
-) %>%
- rowwise() %>%
- mutate(
- tip_label = sprintf(
- "(%s)-%s",
- order,
- ID
- )
- ) %>%
- ungroup() %>%
- add_row(
- ID = snakemake@params[["ref_name"]],
- order = 0,
- tip_label = snakemake@params[["ref_name"]]
- )
-
-
-# DATA PROCESSING ####
-# Resolve possible negative edge lengths in n-j tree
-fix_negative_edge_length <- function(nj.tree) {
- edge_infos <- cbind(
- nj.tree$edge,
- nj.tree$edge.length
- ) %>%
- as.data.table
- colnames(edge_infos) <- c("from", "to", "length")
-
- nega_froms <- edge_infos[length < 0, sort(unique(from))]
-
- for (nega_from in nega_froms) {
- minus_length <- edge_infos[from == nega_from, ][order(length)][1, length]
- edge_infos[from == nega_from, length := length - minus_length]
- edge_infos[to == nega_from, length := length + minus_length]
- }
- nj.tree$edge.length <- edge_infos$length
- nj.tree
-}
-
-# Tree construction
-if (snakemake@params$use_bionj) {
- log_info("Constructing BIONJ tree based on distances")
- tree_method = bionj
-} else {
- log_info("Constructing NJ tree based on distances")
- tree_method = nj
-}
-tree <- matrix %>%
- column_to_rownames(var = "...1") %>%
- as.matrix() %>%
- as.dist() %>%
- tree_method() %>%
- root(snakemake@params[["ref_name"]], resolve.root = TRUE)
-
-tree <- fix_negative_edge_length(tree)
-
-# Get patristic distances to ancestor from n-j tree
-log_info("Getting patristic distances to ancestor from n-j tree")
-tempest <- adephylo::distRoot(
- tree,
- "all",
- method = "patristic"
-) %>%
- as.data.frame() %>%
- rownames_to_column(var = "ID") %>%
- filter(ID != snakemake@params[["ref_name"]]) %>%
- rename(distance = ".") %>%
- left_join(
- select(
- metadata,
- ID,
- CollectionDate
- )
- ) %>%
- mutate(
- date_interval = as.numeric(
- as.Date(CollectionDate) - min(as.Date(CollectionDate))
- )
- )
-
-
-# PLOTS ####
-# (BIO)NJ tree
-log_info("Plotting distance tree")
-tree_plot <- ggtree(tree) %<+%
- tree_tiplab +
- geom_tiplab(aes(label = tip_label)) +
- geom_treescale() +
- geom_rootedge(0.5) +
- xlim(0, max(tempest$distance) + 3.5)
-
-ggsave(
- filename = snakemake@output[["tree"]],
- plot = tree_plot,
- width = snakemake@params[["plot_width_mm"]],
- height = snakemake@params[["plot_height_mm"]],
- units = "mm",
- dpi = 250
-)
-
-# TempEst
-log_info("Plotting temporal signal analysis")
-tempest_fig <- tempest %>%
- ggplot() +
- aes(
- x = date_interval,
- y = distance
- ) +
- geom_smooth(
- method = "lm",
- fill = "gray95",
- alpha = 0.6,
- color = "red"
- ) +
- geom_point() +
- labs(
- y = "Root-to-tip distance",
- x = "Days since the initial sampling"
- )
-
-ggsave(
- filename = snakemake@output[["temest"]],
- plot = tempest_fig,
- width = snakemake@params[["plot_width_mm"]],
- height = snakemake@params[["plot_height_mm"]],
- units = "mm",
- dpi = 250
-)
-
-# ML tree with context data
-
-# Internal nodes color
-# Node labels contain SH-aLRT/UFboot values
-aLRT.values <- sapply(
- strsplit(tree_ml$node.label, "/"),
- function(x) {
- as.numeric(x[1])
- }
-)
-
-bootstrap.values <- sapply(
- strsplit(tree_ml$node.label, "/"),
- function(x) {
- as.numeric(x[2])
- }
-)
-
-aLRT.mask <- aLRT.values >= snakemake@params[["alrt_th"]]
-boot.mask <- bootstrap.values >= snakemake@params[["boot_th"]]
-
-# MRCA for target samples
-log_info("Calculating MRCA of target samples")
-study.mrca <- getMRCA(tree_ml, study_names)
-study.mrca.clade <- extract.clade(tree_ml, study.mrca)
-study.mrca.clade.ntips <- Ntip(study.mrca.clade)
-
-log_info("Building M-L tree annotation")
-tree_ml.max.tip.length <- max(node.depth.edgelength(tree_ml)[1:length(tree$tip.label)])
-tree_ml.ntips <- length(tree_ml$tip.label)
-tree_ml.nodes <- tree_ml$Nnode + tree_ml.ntips
-tree_ml.labels <- tree_ml$tip.label[1:tree_ml.nodes]
-tree_ml.node.pass <- c(rep(FALSE, tree_ml.ntips), aLRT.mask & boot.mask)
-
-ml.tree.annot <- tibble(
- node = 1:tree_ml.nodes,
- ) %>%
- mutate(
- Class = case_when(
- tree_ml.labels %in% study_names ~ legend.names["tip_label"],
- tree_ml.node.pass ~ legend.names["boot_alrt_pass"],
- TRUE ~ NA
- )
- )
-
-log_info("Plotting M-L tree with context samples")
-p <- ggtree(tree_ml, layout = "circular") %<+% ml.tree.annot +
- geom_highlight(node = study.mrca, colour = "red", alpha = 0) +
- geom_point(aes(color = Class, size = Class)) +
- geom_treescale(1.05 * tree_ml.max.tip.length) +
- geom_rootedge(0.05 * tree_ml.max.tip.length) +
- xlim(-tree_ml.max.tip.length / 3, NA) +
- scale_color_manual(
- values = setNames(tree_colors[names(legend.names)], legend.names),
- na.translate = FALSE
- ) +
- scale_size_manual(
- values = setNames(node.size[names(legend.names)], legend.names),
- na.translate = FALSE
- )
-
-ggsave(
- filename = snakemake@output[["tree_ml"]],
- plot = p,
- width = snakemake@params[["plot_width_mm"]],
- height = snakemake@params[["plot_height_mm"]],
- units = "mm",
- dpi = 250
-)
-
-# PLOT TABLES
-log_info("Saving temporal signal table")
-tempest %>%
- transmute(
- sample = ID,
- Distance = distance,
- CollectionDate = CollectionDate,
- DaysSinceFirst = date_interval
- ) %>%
- write.csv(snakemake@output[["table"]], row.names = FALSE)
-
-# TEMPEST STATS
-model <- lm(distance ~ date_interval, data = tempest)
-p.value <- summary(model)$coefficients[2,4]
-
-# TREE STATS
-study.node <- tree_ml$node.label[study.mrca - length(tree_ml$tip.label)]
-monophyletic <- ifelse(is.monophyletic(tree_ml, study_names), "are", "are not")
-
-list(
- "sub_rate" = model$coefficients[[2]] * 365,
- "r2" = summary(model)$r.squared[[1]],
- "pvalue" = ifelse(p.value < 0.001, "< 0.001", p.value),
- "boot" = strsplit(study.node, "/")[[1]][2] %>% as.numeric(),
- "alrt" = strsplit(study.node, "/")[[1]][1] %>% as.numeric(),
- "monophyly" = monophyletic,
- "clade_tips" = study.mrca.clade.ntips
-) %>%
- toJSON() %>%
- write(snakemake@output[["json"]])
diff --git a/workflow/scripts/report/polymorphic_sites_over_time_data.R b/workflow/scripts/report/polymorphic_sites_over_time_data.R
new file mode 100644
index 0000000..1100587
--- /dev/null
+++ b/workflow/scripts/report/polymorphic_sites_over_time_data.R
@@ -0,0 +1,81 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(readr)
+library(dplyr)
+library(jsonlite)
+library(logger)
+
+log_threshold(INFO)
+
+log_info("Reading variants")
+variants <- read_delim(snakemake@input$variants)
+
+log_info("Reading metadata")
+metadata <- read_delim(snakemake@input$metadata)
+
+log_info("Calculating heterozygous sites")
+sites <- variants %>%
+ filter(ALT_FREQ <= snakemake@params$max_alt_freq) %>%
+ left_join(
+ metadata,
+ by = c("SAMPLE" = "ID")
+ ) %>%
+ group_by(SAMPLE) %>%
+ summarise(
+ CollectionDate = min(as.Date(CollectionDate)),
+ n = n_distinct(POS)
+ ) %>%
+ ungroup() %>%
+ arrange(CollectionDate) %>%
+ mutate(
+ Day = as.numeric(
+ difftime(CollectionDate, min(CollectionDate), units = "days")
+ )
+ )
+
+if (nrow(sites) == 0) {
+  log_warn("No heterozygous sites found, using an empty table and no linear regression")
+  sites <- tibble(
+    SAMPLE = as.character(NA),
+    CollectionDate = as.Date(NA),
+    n = as.numeric(NA),
+    Day = as.numeric(NA)
+  ) %>%
+    filter(!is.na(SAMPLE))
+  r_squared <- "none"
+  p_value_string <- "none"
+} else if (nrow(sites) > 2) {
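+  # A meaningful fit and p-value need more than two data points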
+ log_info("Calculating linear regression")
+ model <- lm(n ~ CollectionDate, data = sites)
+ r_squared <- summary(model)$r.squared[[1]]
+ p_value <- summary(model)$coefficients[2, 4]
+ p_value_string <- ifelse(p_value < 0.001, "< 0.001", p_value)
+} else {
+ log_warn("Not enough data points for a linear regression")
+ r_squared <- "none"
+ p_value_string <- "none"
+}
+
+log_info("Writing JSON summary")
+list(
+ "r2" = r_squared,
+ "value" = p_value_string
+) %>%
+ write_json(
+ snakemake@output$json,
+ auto_unbox = TRUE,
+ digits = NA
+ )
+
+log_info("Writing processed table")
+write_csv(sites, snakemake@output$table)
diff --git a/workflow/scripts/report/polymorphic_sites_over_time_plot.R b/workflow/scripts/report/polymorphic_sites_over_time_plot.R
new file mode 100644
index 0000000..2596195
--- /dev/null
+++ b/workflow/scripts/report/polymorphic_sites_over_time_plot.R
@@ -0,0 +1,47 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(readr)
+library(dplyr)
+library(ggplot2)
+library(logger)
+
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+log_info("Reading plot data")
+df <- read_csv(snakemake@input$table)
+
+log_info("Plotting")
+p <- df %>%
+ ggplot() +
+ aes(x = Day, y = n) +
+ geom_smooth(
+ method = "lm",
+ fill = "gray95",
+ alpha = 0.6,
+ colour = "orange"
+ ) +
+ geom_point(size = 2, shape = 1) +
+ labs(
+ x = "Days since the initial sampling",
+ y = "No. of polimorphic sites"
+ )
+
+log_info("Saving plot")
+ggsave(
+ filename = snakemake@output$plot,
+ plot = p,
+ width = snakemake@params$plot_width_mm,
+ height = snakemake@params$plot_height_mm,
+ units = "mm",
+ dpi = 250
+)
diff --git a/workflow/scripts/report/snp_plots.R b/workflow/scripts/report/snp_plots.R
deleted file mode 100644
index 44e2ebb..0000000
--- a/workflow/scripts/report/snp_plots.R
+++ /dev/null
@@ -1,244 +0,0 @@
-#!/usr/bin/env Rscript
-
-# Write stdout and stderr to log file
-log <- file(snakemake@log[[1]], open = "wt")
-sink(log, type = "message")
-sink(log, type = "output")
-
-library(tidyverse)
-library(stringi)
-library(ggrepel)
-library(logger)
-log_threshold(INFO)
-
-# Get colors
-color <- grDevices::colors()[grep(
- "gr(a|e)y",
- grDevices::colors(),
- invert = TRUE
-)]
-color <- color[grep("white", color, invert = TRUE)]
-# Import file with plots style
-source(snakemake@params[["design"]])
-
-# DATA PREPROCESSING #####
-metadata <- read_csv(snakemake@input[["metadata"]])
-
-vcf <- read_delim(snakemake@input[["vcf"]])
-data <- metadata %>%
- filter(
- ID %in% vcf$REGION
- ) %>%
- dplyr::select(
- ID,
- CollectionDate
- )
-
-# Obtain sample names ordered by CollectionDate
-date_order <- metadata %>%
- arrange(CollectionDate) %>%
- pull(ID) %>%
- unique()
-
-# Simplify features names and create SNP variable
-vcf <- vcf %>%
- dplyr::select(
- variant,
- REGION,
- ALT_FREQ,
- POS
- )
-
-# Fill positions without alt frequency with 0
-vcf <- vcf %>%
- complete(nesting(variant, POS), REGION, fill = list(ALT_FREQ = 0))
-
-# Join variants file and metadata file
-vcf <- left_join(vcf, data, by = c("REGION" = "ID"))
-
-# Calculate days since first sample
-vcf <- arrange(
- vcf,
- CollectionDate
-) %>%
- mutate(
- interval = as.numeric(CollectionDate - min(CollectionDate))
- )
-
-# PLOTS ####
-## VOLCANO PLOT ####
-
-# Get list with all different polymorphisms
-SNPs <- pull(
- vcf,
- variant
-) %>%
- unique()
-
-# Create an empty dataframe to be filled
-cor.df <- data.frame(
- snp = "",
- cor = 0,
- p.value = 0
-) %>%
- filter(p.value != 0)
-
-cor.df.fill <- lapply(
- SNPs,
- function(snp) {
- df <- filter(
- vcf,
- variant == snp
- )
-
- test <- cor.test(
- df$ALT_FREQ,
- df$interval,
- method = snakemake@params$cor_method,
- exact = snakemake@params$cor_exact
- )
-
- pvalue <- p.adjust(
- test$p.value,
- method = "BH",
- n = length(SNPs)
- )
- add_row(
- cor.df,
- snp = snp,
- cor = test$estimate,
- p.value = pvalue
- )
- }
-) %>%
- bind_rows()
-
-# Plot a pseudo volcano plot with coorrelation index and p-value
-log_info("Plotting pseudovolcano figure")
-volcano <- cor.df.fill %>%
- mutate(
- trans.p = -log10(p.value),
- label = case_when(p.value < 0.05 ~ snp)
- ) %>%
- ggplot() +
- aes(
- x = cor,
- y = trans.p
- ) +
- geom_point() +
- geom_label_repel(aes(label = label)) +
- xlim(c(-1, 1)) +
- geom_hline(
- aes(yintercept = -log10(0.05)),
- linetype = 2,
- color = "red"
- ) +
- labs(
- x = "Correlation",
- y = "-log10(p-value)"
- )
-
-ggsave(
- filename = snakemake@output[["pseudovolcano"]],
- plot = volcano,
- width = 159.2,
- height = 119.4,
- units = "mm",
- dpi = 250
-)
-
-
-## SNP PANEL ####
-# SNPs significantly correlated with time
-sign <- filter(
- cor.df.fill,
- p.value < 0.05
-) %>%
- pull(snp) %>%
- unique()
-
-# SNPs which are in positions with more than one alternative allele
-dup <- vcf %>%
- select(
- variant,
- POS
- ) %>%
- unique() %>%
- group_by(POS) %>%
- filter(n() > 1) %>%
- ungroup() %>%
- pull(variant) %>%
- unique()
-
-subset <- c(sign, dup) %>%
- unique()
-
-# Plot height depending on the number of SNPs assuming 4 columns in the plot
-plot.height <- ceiling(length(subset) / 4) * 42
-
-log_info("Plotting SNPs trends in time")
-panel <- vcf %>%
- filter(variant %in% subset) %>%
- ggplot() +
- aes(
- x = interval,
- y = ALT_FREQ,
- color = variant
- ) +
- scale_color_manual(values = sample(color, length(subset))) +
- geom_point() +
- geom_line() +
- theme(
- legend.position = "bottom",
- legend.text = element_text(size = 9)
- ) +
- labs(
- x = "Days since first sample",
- y = "Frequency",
- color = "NV"
- ) +
- guides(color = guide_legend(ncol = 3))
-
-if (length(subset) > 1) {
- panel <- panel +
- facet_wrap(
- vars(POS),
- nrow = ceiling(length(subset) / 4),
- ncol = 4
- )
-}
-
-ggsave(
- filename = snakemake@output[["snp_panel"]],
- plot = panel,
- width = 159.2,
- height = max(100, plot.height),
- units = "mm",
- dpi = 250
-)
-
-
-# PLOT TABLES ####
-log_info("Saving coorelation table")
-cor.df.fill %>%
- transmute(
- NV = snp,
- correlation_coef = cor,
- adj_pvalue = p.value
- ) %>%
- write.csv(
- snakemake@output[["table_1"]],
- row.names = FALSE
- )
-
-log_info("Saving SNPs trends table")
-vcf %>%
- filter(variant %in% subset) %>%
- transmute(
- sample = REGION,
- POS = POS,
- NV = variant,
- ALT_FREQ = ALT_FREQ,
- DaysSinceFirst = interval
- ) %>%
- write.csv(snakemake@output[["table_2"]], row.names = FALSE)
diff --git a/workflow/scripts/report/summary_table.R b/workflow/scripts/report/summary_table.R
index b78a103..0f96f7d 100644
--- a/workflow/scripts/report/summary_table.R
+++ b/workflow/scripts/report/summary_table.R
@@ -5,8 +5,9 @@ log <- file(snakemake@log[[1]], open = "wt")
sink(log, type = "message")
sink(log, type = "output")
-library(tidyverse)
+library(dplyr)
library(logger)
+
log_threshold(INFO)
metadata <- read.csv(snakemake@input[["metadata"]])
diff --git a/workflow/scripts/report/time_signal_data.R b/workflow/scripts/report/time_signal_data.R
new file mode 100644
index 0000000..d209528
--- /dev/null
+++ b/workflow/scripts/report/time_signal_data.R
@@ -0,0 +1,73 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(readr)
+library(dplyr)
+library(tibble)
+library(jsonlite)
+library(ape)
+library(adephylo)
+library(logger)
+
+log_threshold(INFO)
+
+# Read distance tree and root
+log_info("Reading tree")
+tree <- read.tree(snakemake@input$tree) %>%
+ root(snakemake@params$outgroup_id, resolve.root = TRUE)
+
+# Read metadata
+log_info("Reading metadata")
+metadata <- read_csv(snakemake@input$metadata)
+
+# Get patristic distances to ancestor in distance tree
+log_info("Calculating patristic distances to ancestor in distance tree")
+time.signal <- distRoot(
+ tree,
+ "all",
+ method = "patristic"
+) %>%
+ as.data.frame() %>%
+ rownames_to_column(var = "ID") %>%
+ filter(ID != snakemake@params$outgroup_id) %>%
+ rename(distance = ".") %>%
+ left_join(
+ select(
+ metadata,
+ ID,
+ CollectionDate
+ ),
+ by = "ID"
+ ) %>%
+ mutate(
+ date_interval = as.numeric(
+ as.Date(CollectionDate) - min(as.Date(CollectionDate))
+ )
+ )
+
+# Save table
+log_info("Saving time signal data")
+log_debug("Saving table")
+write_csv(time.signal, snakemake@output$table)
+
+# Time signal stats
+log_debug("Building linear model")
+model <- lm(distance ~ date_interval, data = time.signal)
+p.value <- summary(model)$coefficients[2, 4]
+
+# Save regression stats as JSON
+log_debug("Saving linear model")
+list(
+ "sub_rate" = model$coefficients[[2]] * 365,
+ "r2" = summary(model)$r.squared[[1]],
+ "pvalue" = ifelse(p.value < 0.001, "< 0.001", p.value)
+) %>%
+ toJSON() %>%
+ write(snakemake@output$json)
diff --git a/workflow/scripts/report/time_signal_plot.R b/workflow/scripts/report/time_signal_plot.R
new file mode 100644
index 0000000..d9af70c
--- /dev/null
+++ b/workflow/scripts/report/time_signal_plot.R
@@ -0,0 +1,51 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(readr)
+library(dplyr)
+library(ggplot2)
+library(logger)
+
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+log_info("Reading time signal data")
+time.signal <- read_csv(snakemake@input$table)
+
+# PLOTS ####
+# TempEst
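+# Root-to-tip distance against sampling day; a positive trend indicates
+# temporal signal in the data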
+log_info("Plotting time signal data")
+p <- time.signal %>%
+ ggplot() +
+ aes(
+ x = date_interval,
+ y = distance
+ ) +
+ geom_smooth(
+ method = "lm",
+ fill = "gray95",
+ alpha = 0.6,
+ color = "orange"
+ ) +
+ geom_point(size = 2, shape = 1) +
+ labs(
+ y = "Root-to-tip distance",
+ x = "Days since the initial sampling"
+ )
+
+ggsave(
+ filename = snakemake@output$plot,
+ plot = p,
+ width = snakemake@params[["plot_width_mm"]],
+ height = snakemake@params[["plot_height_mm"]],
+ units = "mm",
+ dpi = 250
+)
diff --git a/workflow/scripts/report/window_data.py b/workflow/scripts/report/window_data.py
new file mode 100644
index 0000000..73233f8
--- /dev/null
+++ b/workflow/scripts/report/window_data.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+
+import logging
+import json
+from typing import NewType, Dict, Iterable, List
+
+import pandas as pd
+from Bio import SeqIO
+from Bio.SeqFeature import SeqFeature, SimpleLocation, CompoundLocation
+
+
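+# FeatureIndex maps a feature's display identifier (its joined qualifier
+# values) to its genomic location, so positions can be matched to features.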
+FeatureIndex = NewType("FeatureIndex", Dict[str, SimpleLocation | CompoundLocation])
+
+
+def iter_features_filtering(features: Iterable[SeqFeature], included: Dict[str, str], excluded: Dict[str, str]) -> Iterable[SeqFeature]:
+ # No filters
+ if len(included) == 0 and len(excluded) == 0:
+ logging.debug("Selecting all features")
+ return iter(features)
+    # Only exclusion filter
+ elif len(included) == 0 and len(excluded) != 0:
+ logging.debug(f"Selecting features excluding all of {excluded}")
+ return (
+ feature for feature in features
+ if all(
+ (qualifier_value not in excluded.get(qualifier_key, []))
+ for qualifier_key in excluded.keys()
+ for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+ )
+ )
+    # Only inclusion filter
+ elif len(included) != 0 and len(excluded) == 0:
+ logging.debug(f"Selecting features including any of {included}")
+ return (
+ feature for feature in features
+ if any(
+ (qualifier_value in included.get(qualifier_key, []))
+ for qualifier_key in included.keys()
+ for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+ )
+ )
+ # Inclusion then exclusion filter
+ else:
+ logging.debug(f"Selecting features including any of {included} and then excluding all of {excluded}")
+ included_features = (
+ feature for feature in features
+ if any(
+ (qualifier_value in included.get(qualifier_key, []))
+ for qualifier_key in included.keys()
+ for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+ )
+ )
+ return (
+ feature for feature in included_features
+ if all(
+ (qualifier_value not in excluded.get(qualifier_key, []))
+ for qualifier_key in excluded.keys()
+ for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+ )
+ )
+
+
+def index_features(features: Iterable[SeqFeature]) -> FeatureIndex:
+ index = FeatureIndex({})
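+    # Keep the first feature seen per identifier; duplicates are logged and skipped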
+ for feature in features:
+ identifier = "|".join(feature.qualifiers.get(snakemake.params.gb_qualifier_display, []))
+ if identifier == "":
+ logging.error(f"Feature at {feature.location} has no qualifier '{snakemake.params.gb_qualifier_display}' and will be skipped")
+ elif feature.location is None:
+ logging.warning(f"Feature '{identifier}' has no location and will be skipped")
+ elif identifier in index:
+ logging.warning(f"Identifier '{identifier}' is already in feature index and will not be replaced by feature at {feature.location}")
+ else:
+ index[identifier] = feature.location
+ return index
+
+
+def feature_names_at(position: int, feature_index: FeatureIndex) -> List[str]:
+    # Genome positions here are 1-based, while Biopython locations are 0-based
+    # half-open intervals, hence the shift before the membership test
+    return [name for name, location in feature_index.items() if position - 1 in location]
+
+
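+# Slide a trailing window along the genome in steps of `step` bases, reporting
+# the fraction of polymorphic sites in each window and the feature(s) spanning
+# the current position ("Intergenic" if none).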
+def window_calculation(sites: set, window: int, step: int, size: int, feature_index: FeatureIndex) -> pd.DataFrame:
+ positions, fractions, features = [], [], []
+ lim_sup = size + 1
+ for position in range(1, lim_sup, step):
+ feature_names = ";".join(feature_names_at(position, feature_index))
+ if len(feature_names) == 0:
+ features.append("Intergenic")
+ else:
+ # Include all features on site
+ features.append(feature_names)
+        # Record the fraction of polymorphic sites (zero until a full window fits)
+ if position - window not in range(1, lim_sup):
+ fractions.append(0.0)
+ else:
+ # Calculate number of polymorphisms in the window
+ num_snp = sum(
+ 1 for x in sites if x in range(position - window, position + 1)
+ )
+ fractions.append(num_snp / window)
+ positions.append(position)
+ return pd.DataFrame({"position": positions, "fraction": fractions, "feature": features})
+
+
+def main():
+
+ logging.basicConfig(
+ filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"],
+ level=logging.INFO
+ )
+
+ # Read input files
+ logging.info("Reading input variants file")
+ df = pd.read_table(snakemake.input.variants)
+ sites = set(df.POS)
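+    # Each distinct position counts once, however many samples carry a variant there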
+
+ logging.info("Reading GenBank file")
+ gb = SeqIO.read(snakemake.input.gb, format="gb")
+
+ logging.info("Indexing feature locations")
+ included = snakemake.params.features.get("INCLUDE", {})
+ excluded = snakemake.params.features.get("EXCLUDE", {})
+ feature_index = index_features(
+ iter_features_filtering(gb.features, included, excluded)
+ )
+
+ logging.info("Calculating polimorphic sites sliding window")
+ windows = window_calculation(
+ sites,
+ snakemake.params.window,
+ snakemake.params.step,
+ len(gb),
+ feature_index
+ )
+
+ logging.info("Saving results")
+ windows.to_csv(snakemake.output.window_df, index=False)
+
+ logging.info("Saving window settings to JSON file")
+ with open(snakemake.output.json, "w") as fw:
+ json.dump(
+ {"window": snakemake.params.window, "step": snakemake.params.step},
+ fw
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/workflow/scripts/report/window_zoom_on_feature_data.py b/workflow/scripts/report/window_zoom_on_feature_data.py
new file mode 100644
index 0000000..36f1e9f
--- /dev/null
+++ b/workflow/scripts/report/window_zoom_on_feature_data.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import sys
+import logging
+import json
+
+import pandas as pd
+
+
+if __name__ == "__main__":
+
+ logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
+
+ logging.info("Reading regions")
+ with open(snakemake.input.regions) as f:
+ regions = json.load(f)
+
+ if snakemake.wildcards.region_name not in regions:
+ logging.error(f"Region {snakemake.wildcards.region_name} is absent in {snakemake.input.regions}")
+ sys.exit(1)
+
+ start, end = regions[snakemake.wildcards.region_name]
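+    # Region bounds are taken as inclusive on both ends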
+
+ logging.info("Reading input table")
+ df = pd.read_csv(snakemake.input.table)
+
+ logging.info("Filtering sites and writing results")
+ df[(df.position >= start) & (df.position <= end)].to_csv(snakemake.output.table, index=False)
diff --git a/workflow/scripts/summary_demixing.R b/workflow/scripts/summarise_demix.R
similarity index 74%
rename from workflow/scripts/summary_demixing.R
rename to workflow/scripts/summarise_demix.R
index 3ae246f..9792202 100644
--- a/workflow/scripts/summary_demixing.R
+++ b/workflow/scripts/summarise_demix.R
@@ -5,14 +5,18 @@ log <- file(snakemake@log[[1]], open = "wt")
sink(log, type = "message")
sink(log, type = "output")
-library(tidyverse)
+library(dplyr)
+library(readr)
+library(tidyr)
+library(stringr)
library(logger)
+
log_threshold(INFO)
# Empty dataframe to be filled with data
demix <- data.frame(
- "lineages" = NA,
- "abundances" = NA,
+ "lineage" = NA,
+ "abundance" = NA,
"sample" = NA
) %>%
filter(!is.na(sample))
@@ -23,21 +27,25 @@ lapply(
function(tsv_file) {
read_tsv(
tsv_file,
- col_names = c("variable", "valor"),
+ col_names = c("variable", "value"),
show_col_types = FALSE
) %>%
filter(
- row_number() %in% c(3, 4)
+ variable %in% c("lineages", "abundances")
) %>%
pivot_wider(
names_from = variable,
- values_from = valor
+ values_from = value
) %>%
separate_rows(
lineages,
abundances,
sep = " "
) %>%
+ rename(
+ lineage = lineages,
+ abundance = abundances
+ ) %>%
mutate(
sample = str_extract(
basename(tsv_file),
diff --git a/workflow/scripts/tsv_to_vcf.py b/workflow/scripts/tsv_to_vcf.py
index be43d9a..2201582 100644
--- a/workflow/scripts/tsv_to_vcf.py
+++ b/workflow/scripts/tsv_to_vcf.py
@@ -17,7 +17,7 @@ def tsv_to_vcf(tsv_file, vcf_file):
# Process each row in the TSV file
for index, row in tsv_df.iterrows():
# Extract fields from the TSV row
- chrom = 'NC_045512.2'
+ chrom = snakemake.params.ref_name
pos = row['POS']
ref = row['REF']
alt = row['ALT']
diff --git a/workflow/scripts/vcf_to_tsv.R b/workflow/scripts/vcf_to_tsv.R
deleted file mode 100644
index 990f488..0000000
--- a/workflow/scripts/vcf_to_tsv.R
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env Rscript
-
-# Write stdout and stderr to log file
-log <- file(snakemake@log[[1]], open = "wt")
-sink(log, type = "message")
-sink(log, type = "output")
-
-library(tidyverse)
-library(logger)
-log_threshold(INFO)
-
-# read data
-log_info("Reading data")
-vcf <- read_tsv(snakemake@input[["ann_vcf"]], comment = "##")
-tsv <- read_tsv(snakemake@input[["pre_tsv"]])
-
-tsv["variant"] <- str_extract(vcf$INFO, "p\\.([^|]*)", group = 1)
-tsv["nuc_variant"] <- str_extract(vcf$INFO, "c\\.([^|]*)", group = 1)
-
-tsv <- tsv %>%
- mutate(
- variant = case_when(
- is.na(variant) ~ paste(POS, REF, ">", ALT, sep = ""),
- TRUE ~ paste(GFF_FEATURE, ":", variant, sep = "")
- )
- )
-
-log_info("Saving results")
-write_tsv(
- tsv,
- snakemake@output[["tsv"]]
-)
diff --git a/workflow/scripts/weighted_distances.py b/workflow/scripts/weighted_distances.py
deleted file mode 100644
index 2a5c0e7..0000000
--- a/workflow/scripts/weighted_distances.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-# Adapted script from https://github.com/PathoGenOmics-Lab/genetic-distances
-
-import logging
-from typing import List, Tuple
-
-import pandas as pd
-from Bio import SeqIO
-from Bio.SeqRecord import SeqRecord
-from Bio.Seq import Seq
-
-
-def read_monofasta(path: str) -> SeqRecord:
- fasta = SeqIO.parse(path, "fasta")
- record = next(fasta)
- if next(fasta, None) is not None:
- logging.warning(f"There are unread records left in '{path}'")
- return record
-
-
-def read_masked_sites(vcf_path: str, mask_classes: List[str]) -> List[int]:
- """
- Parse a VCF containing positions for masking. Assumes the VCF file is
- formatted as in:
- github.com/W-L/ProblematicSites_SARS-CoV2/blob/master/problematic_sites_sarsCov2.vcf
- with a "mask" or "caution" recommendation in column 7.
- Masked sites are specified with params.
- """
- vcf = pd.read_csv(
- vcf_path,
- sep="\s+",
- comment="#",
- names=("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")
- )
- return vcf.loc[vcf.FILTER.isin(mask_classes), "POS"].tolist()
-
-
-def build_ancestor_variant_table(ancestor: Seq, reference: Seq, reference_name: str, masked_positions: List[int]) -> pd.DataFrame:
- pos = []
- alt = []
- for i in range(1, len(ancestor) + 1):
- if i not in masked_positions and ancestor[i-1] != reference[i-1]:
- pos.append(i)
- alt.append(reference[i-1])
- df = pd.DataFrame({"POS": pos, "ALT": alt})
- df["ALT_FREQ"] = 1 # As a reference genome, we assume all positions are monomorphic
- df["REGION"] = reference_name
- return df
-
-
-def get_frequencies_in_position(variant_table: pd.DataFrame, sample_name: str, position: int, reference: Seq) -> Tuple[float]:
- frequencies = {alt: 0. for alt in variant_table["ALT"].unique()}
- alt_keys = sorted(frequencies.keys())
- sample_variant_table = variant_table[variant_table["REGION"] == sample_name]
- # If the position has polimorphisims, allele frequencies are captured
- if position in sample_variant_table["POS"].values:
- alleles = sample_variant_table[sample_variant_table["POS"] == position]
- for base in alt_keys:
- if base in alleles["ALT"].values:
- frequencies[base] = float(alleles["ALT_FREQ"][alleles["ALT"] == base].iloc[0])
- # Obtain frequency for reference allele
- reference_allele = reference[position-1]
- reference_freq = 1 - sum(frequencies.values())
- if reference_allele in frequencies:
- frequencies[reference[position-1]] += reference_freq
- else:
- frequencies[reference[position-1]] = reference_freq
- return tuple(frequencies[base] for base in alt_keys)
-
-
-def heterozygosity(freqs: Tuple[float]) -> float:
- return 1 - sum([f ** 2 for f in freqs])
-
-
-def calc_fst_weir_cockerham(hs: float, ht: float) -> float:
- return (ht - hs) / ht if ht != 0 else 0
-
-
-def build_cache(variant_table: pd.DataFrame, samples: List[str], reference: Seq):
- cache = {"freq": {}, "hz": {}}
- for sample_name in set(samples):
- for position in variant_table["POS"].astype("Int64").unique():
- if sample_name not in cache["freq"]:
- cache["freq"][sample_name] = {}
- if sample_name not in cache["hz"]:
- cache["hz"][sample_name] = {}
- cache["freq"][sample_name][position] = get_frequencies_in_position(variant_table, sample_name, position, reference)
- logging.debug(f"Frequencies for '{sample_name}':{position} = {cache['freq'][sample_name][position]}")
- cache["hz"][sample_name][position] = heterozygosity(cache["freq"][sample_name][position])
- logging.debug(f"Heterozygosity for '{sample_name}':{position} = {cache['hz'][sample_name][position]}")
- return cache
-
-
-def calc_heterozygosities(sample1_name: str, sample2_name: str, pos: int, cache: dict):
- logging.debug(f"Calculating heterozygosities at position {pos} for '{sample1_name}' and '{sample2_name}'")
- # Retrieve pre-computed values
- freqs1 = cache["freq"][sample1_name][pos]
- freqs2 = cache["freq"][sample2_name][pos]
- hs1 = cache["hz"][sample1_name][pos]
- hs2 = cache["hz"][sample2_name][pos]
- # Calculate pairwise values
- total_freqs = [(f1 + f2) / 2 for f1, f2 in zip(freqs1, freqs2)]
- ht = heterozygosity(total_freqs)
- hs = (hs1 + hs2) / 2
- return hs, ht
-
-
-def calculate_pairwise_distance(positions: List[int], sample1_name: str, sample2_name: str, cache: dict) -> float:
- if len(positions) == 0:
- return 0
- else:
- return sum([calc_fst_weir_cockerham(*calc_heterozygosities(sample1_name, sample2_name, pos, cache)) for pos in positions])
-
-
-def calculate_sample_distances(positions: List[int], sample_name: str, samples: List[str], cache: dict) -> List[float]:
- return [calculate_pairwise_distance(positions, sample_name, other_sample_name, cache) for other_sample_name in samples]
-
-
-def calculate_distance_matrix(variant_table: pd.DataFrame, samples: List[str], reference: Seq) -> pd.DataFrame:
- positions = variant_table["POS"].astype("Int64").unique().tolist()
- cache = build_cache(variant_table, samples, reference)
- distance_matrix = {}
- for sample_name in samples:
- distance_matrix[sample_name] = calculate_sample_distances(positions, sample_name, samples, cache)
- for i in range(len(samples)):
- for j in range(i+1, len(samples)):
- distance_matrix[samples[j]][i] = distance_matrix[samples[i]][j]
- return pd.DataFrame(distance_matrix, index=samples)
-
-
-def main():
-
- logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
-
- logging.info("Reading input FASTA files")
- ancestor = read_monofasta(snakemake.input.ancestor)
- logging.debug(f"Ancestor: '{ancestor.description}', length={len(ancestor.seq)}")
- reference = read_monofasta(snakemake.input.reference)
- logging.debug(f"Reference: '{reference.description}', length={len(reference.seq)}")
-
- logging.info("Reading input tables")
- masked_positions = read_masked_sites(snakemake.input.vcf, snakemake.params.mask_class)
- logging.debug(f"Read {len(masked_positions)} masked positions")
- input_table = pd.read_table(snakemake.input.tsv, sep="\t")
- logging.debug(f"Read {len(input_table)} rows in input TSV")
- ancestor_table = build_ancestor_variant_table(ancestor.seq, reference.seq, reference.id, masked_positions)
- logging.debug(f"Ancestor has {len(ancestor_table)} variants")
- variant_table = pd.concat([input_table, ancestor_table], ignore_index=True)
- logging.debug(f"Combined table has {len(variant_table)} variants")
-
- logging.info(f"Calculating distance matrix")
- sample_names = snakemake.params.samples + [reference.id]
- distances = calculate_distance_matrix(variant_table, sample_names, ancestor.seq)
- logging.debug(f"Distance matrix has shape: {distances.shape}")
-
- logging.info("Writing results")
- distances.to_csv(snakemake.output.distances)
-
-
-if __name__ == "__main__":
- main()
diff --git a/workflow/scripts/window.py b/workflow/scripts/window.py
deleted file mode 100644
index 5ba6a8c..0000000
--- a/workflow/scripts/window.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env python3
-
-import logging
-import json
-import pandas as pd
-from gb2seq.features import Features
-
-
-def window_calculation(sites: set, window: int, step: int, coord: str) -> pd.DataFrame:
- ft = Features(coord) # Create Features object to obtain annotations
- positions = []
- pct = []
- genes = []
- lim_sup = len(ft.reference.sequence) + 1
- for position in range(1, lim_sup, step):
- if len(ft.getFeatureNames(position)) == 0:
- genes.append("Intergenic")
- else:
- genes.append(list(ft.getFeatureNames(position))[0])
- # Add percent (excluding initial and final positions)
- if position - window not in range(1, lim_sup):
- pct.append(0)
- else:
- # Calculate no. of polimorphisms in the window
- num_snp = len([x for x in sites if x in range(position - window, position + 1)])
- pct.append(num_snp / window)
- positions.append(position)
- return pd.DataFrame({"position": positions, "fractions": pct, "gen": genes})
-
-
-def main():
-
- logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
-
- logging.info("Reading input VCF")
- df = pd.read_table(snakemake.input.vcf)
- sites = set(df.POS)
-
- logging.info("Reading features")
- with open(snakemake.input.features) as f:
- features_key = json.load(f)
-
- logging.info("Sliding window calculation of proportion of polimorphic sites")
- frame = window_calculation(sites, snakemake.params.window, snakemake.params.step, snakemake.input.gb)
-
- logging.info("Saving results")
- frame.replace(features_key, inplace = True)
- frame.to_csv(snakemake.output.window_df, index= False)
-
-
-if __name__ == "__main__":
- main()