diff --git a/.dag.png b/.dag.png index d8806e5..f3e4928 100644 Binary files a/.dag.png and b/.dag.png differ diff --git a/.github/workflows/test_apptainer.yml b/.github/workflows/test_apptainer.yml index d70f359..452af0d 100644 --- a/.github/workflows/test_apptainer.yml +++ b/.github/workflows/test_apptainer.yml @@ -26,7 +26,7 @@ jobs: environment-file: .test/environment_apptainer.yaml - name: Run test pipeline shell: bash -el {0} - run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml -c1 --sdm apptainer conda + run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml -c1 --sdm apptainer conda --keep-going --notemp - name: Pack logs if: success() || failure() shell: bash -el {0} diff --git a/.github/workflows/test_v7.yml b/.github/workflows/test_v7.yml index 93c6a4e..3416131 100644 --- a/.github/workflows/test_v7.yml +++ b/.github/workflows/test_v7.yml @@ -26,7 +26,7 @@ jobs: environment-file: .test/environment_v7.yaml - name: Run test pipeline shell: bash -el {0} - run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml --use-conda -c1 --conda-frontend conda + run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml --use-conda -c1 --conda-frontend conda --keep-going --notemp - name: Pack logs if: success() || failure() shell: bash -el {0} diff --git a/.github/workflows/test_v8.yml b/.github/workflows/test_v8.yml index 0043c36..8fd912d 100644 --- a/.github/workflows/test_v8.yml +++ b/.github/workflows/test_v8.yml @@ -26,7 +26,7 @@ jobs: environment-file: .test/environment_v8.yaml - name: Run test pipeline shell: bash -el {0} - run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml --use-conda -c1 --conda-frontend conda + run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml --use-conda -c1 --conda-frontend conda --keep-going --notemp - name: Pack logs if: success() || failure() shell: bash -el {0} diff --git a/.github/workflows/test_v9.yml b/.github/workflows/test_v9.yml index 1f5371a..a3e211c 100644 --- a/.github/workflows/test_v9.yml +++ b/.github/workflows/test_v9.yml @@ -26,7 +26,7 @@ jobs: environment-file: .test/environment_v9.yaml - name: Run test pipeline shell: bash -el {0} - run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml -c1 --sdm conda + run: snakemake --snakefile .test/Snakefile --configfile config/config.yaml .test/targets.yaml -c1 --sdm conda --keep-going --notemp - name: Pack logs if: success() || failure() shell: bash -el {0} diff --git a/.rulegraph.png b/.rulegraph.png index dc85134..d3bcf3c 100644 Binary files a/.rulegraph.png and b/.rulegraph.png differ diff --git a/.rulegraph_sv.svg b/.rulegraph_sv.svg new file mode 100644 index 0000000..d7ff0c7 --- /dev/null +++ b/.rulegraph_sv.svg @@ -0,0 +1 @@ 
+[snakevision rule-graph SVG: single-line XML markup lost in extraction; its text nodes name every workflow rule, from read_bam_refs and demix_barcode_update through report]
\ No newline at end of file
diff --git a/.test/Snakefile b/.test/Snakefile
index 66eaf84..d3ff278 100644
--- a/.test/Snakefile
+++ b/.test/Snakefile
@@ -8,7 +8,7 @@ import subprocess
 min_version("7.19")

 # Workflow version
-__version__ = "1.2.1"
+__version__ = "1.3.0"

 # Rules
 include: "../workflow/core.smk"
diff --git a/.test/targets.yaml b/.test/targets.yaml
index b9e4784..bac704f 100644
--- a/.test/targets.yaml
+++ b/.test/targets.yaml
@@ -23,11 +23,9 @@ PLOTS:
 PROBLEMATIC_VCF:
   "https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/da322c32004f7b16bdaa6a8ee7fd24d56e79d9dc/problematic_sites_sarsCov2.vcf"
 VC:
-  MAX_DEPTH: 0
   MIN_QUALITY: 0
-  IVAR_QUALITY: 0
-  IVAR_FREQ: 0.05
-  IVAR_DEPTH: 5
+  MIN_FREQ: 0.05
+  MIN_DEPTH: 5
 DEMIX:
   MIN_QUALITY: 0
   COV_CUTOFF: 0
@@ -40,7 +38,5 @@ PLOT_GENOME_REGIONS:
   "config/nsp_annotation.csv"
 REPORT_QMD:
   "template.qmd"
-FEATURES_JSON:
-  "config/sarscov2_features.json"
 GENETIC_CODE_JSON:
   "config/standard_genetic_code.json"
diff --git a/Dockerfile b/Dockerfile
index 7f39843..7f6ac6a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,23 @@
 FROM condaforge/miniforge3:latest
 LABEL io.github.snakemake.containerized="true"
-LABEL io.github.snakemake.conda_env_hash="bd9b69b913e7b4e17d7d5ca7169a5815c3145ea775af87163f18c0b92abc1bf8"
+LABEL io.github.snakemake.conda_env_hash="7071b22b1161190c06be3ac061ab0019a1a8d3038532c9134e070a3875414ef5"

 # Step 2: Retrieve conda environments
+# Conda environment:
+#   source: workflow/envs/afwdist.yaml
+#   prefix: /conda-envs/9c24a867826615972cc288081976e7fc
+#   channels:
+#     - bioconda
+#     - conda-forge
+#   dependencies:
+#     - afwdist==1.0.0
+RUN mkdir -p /conda-envs/9c24a867826615972cc288081976e7fc
+COPY workflow/envs/afwdist.yaml /conda-envs/9c24a867826615972cc288081976e7fc/environment.yaml
+
 # Conda environment:
 #   source: workflow/envs/biopython.yaml
-#   prefix: /conda-envs/bd81c49fcb540d7706807c1683ba7200
+#   prefix: /conda-envs/162796cecea22d99c8702138f0c48e2f
 #   channels:
 #     - conda-forge
 #     - bioconda
 #   dependencies:
 #     - python==3.10
 #     - biopython==1.81
 #     - pandas==2.0.3
-#     - pip==23.2.1
-#     - mafft==7.525
-#     - pip:
-#       - gb2seq==0.2.20
-RUN mkdir -p /conda-envs/bd81c49fcb540d7706807c1683ba7200
-COPY workflow/envs/biopython.yaml /conda-envs/bd81c49fcb540d7706807c1683ba7200/environment.yaml
+RUN mkdir -p 
/conda-envs/162796cecea22d99c8702138f0c48e2f +COPY workflow/envs/biopython.yaml /conda-envs/162796cecea22d99c8702138f0c48e2f/environment.yaml # Conda environment: # source: workflow/envs/fetch.yaml @@ -35,14 +42,14 @@ COPY workflow/envs/fetch.yaml /conda-envs/9439457f932a4fbca3665c9ea1ac2f0a/envir # Conda environment: # source: workflow/envs/freyja.yaml -# prefix: /conda-envs/ee7a2e1b4ec9a7a9999f34dddaea0605 +# prefix: /conda-envs/bb4c5f3a509433cc08861582fab4a705 # channels: # - conda-forge # - bioconda # dependencies: -# - freyja==1.4.2 -RUN mkdir -p /conda-envs/ee7a2e1b4ec9a7a9999f34dddaea0605 -COPY workflow/envs/freyja.yaml /conda-envs/ee7a2e1b4ec9a7a9999f34dddaea0605/environment.yaml +# - freyja==2.0.1 +RUN mkdir -p /conda-envs/bb4c5f3a509433cc08861582fab4a705 +COPY workflow/envs/freyja.yaml /conda-envs/bb4c5f3a509433cc08861582fab4a705/environment.yaml # Conda environment: # source: workflow/envs/gisaidr.yaml @@ -93,56 +100,52 @@ COPY workflow/envs/pangolin.yaml /conda-envs/fd645c541ee7a3d43fb9167441b77888/en # Conda environment: # source: workflow/envs/quarto_render.yaml -# prefix: /conda-envs/f2a098519cf1f8c4cecb3c13f8c92883 +# prefix: /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1 # channels: # - conda-forge -# - bioconda # dependencies: -# - r-base==4.3.1 -# - r-gt==0.9.0 -# - quarto==1.3.450 -# - r-jsonlite==1.8.5 +# - r-base==4.5.2 +# - r-gt==1.1.0 +# - quarto==1.8.25 +# - deno==2.3.1 # - r-tidyverse==2.0.0 -# - r-quarto==1.2 -# - r-heatmaply==1.4.2 -# - r-readr==2.1.4 -RUN mkdir -p /conda-envs/f2a098519cf1f8c4cecb3c13f8c92883 -COPY workflow/envs/quarto_render.yaml /conda-envs/f2a098519cf1f8c4cecb3c13f8c92883/environment.yaml +# - r-heatmaply==1.6.0 +RUN mkdir -p /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1 +COPY workflow/envs/quarto_render.yaml /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1/environment.yaml # Conda environment: # source: workflow/envs/renv.yaml -# prefix: /conda-envs/4b57bfc237ddc217c1f0b04d34dc06ef +# prefix: /conda-envs/8ad6cdcf265d30289788da99d5bf9fff # channels: # - conda-forge # - bioconda # dependencies: -# - r-base=4.1.3 +# - r-base=4.3.2 # - r-tidyverse==2.0.0 # - r-ggrepel==0.9.3 -# - r-stringi==1.7.12 # - r-ggpubr==0.6.0 -# - bioconductor-ggtree==3.2.0 -# - r-ape==5.7 +# - bioconductor-ggtree==3.10.0 +# - r-ape==5.8 # - r-adephylo==1.1_13 # - r-pegas==1.2 # - r-data.table==1.14.8 # - r-future.apply==1.11.0 -# - r-scales==1.2.1 +# - r-scales==1.3.0 # - r-showtext==0.9_6 -# - r-jsonlite==1.8.5 # - r-logger==0.2.2 -RUN mkdir -p /conda-envs/4b57bfc237ddc217c1f0b04d34dc06ef -COPY workflow/envs/renv.yaml /conda-envs/4b57bfc237ddc217c1f0b04d34dc06ef/environment.yaml +RUN mkdir -p /conda-envs/8ad6cdcf265d30289788da99d5bf9fff +COPY workflow/envs/renv.yaml /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/environment.yaml # Conda environment: # source: workflow/envs/snpeff.yaml -# prefix: /conda-envs/1934df0e4df02a7ee33c52f53f9e3c30 +# prefix: /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95 # channels: # - bioconda # dependencies: # - snpeff==5.1d -RUN mkdir -p /conda-envs/1934df0e4df02a7ee33c52f53f9e3c30 -COPY workflow/envs/snpeff.yaml /conda-envs/1934df0e4df02a7ee33c52f53f9e3c30/environment.yaml +# - snpsift==5.1d +RUN mkdir -p /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95 +COPY workflow/envs/snpeff.yaml /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/environment.yaml # Conda environment: # source: workflow/envs/var_calling.yaml @@ -158,16 +161,17 @@ COPY workflow/envs/var_calling.yaml /conda-envs/5150d0f0a91d7f7a789a06f453d63479 # Step 3: Generate conda environments -RUN 
conda env create --prefix /conda-envs/bd81c49fcb540d7706807c1683ba7200 --file /conda-envs/bd81c49fcb540d7706807c1683ba7200/environment.yaml && \ +RUN conda env create --prefix /conda-envs/9c24a867826615972cc288081976e7fc --file /conda-envs/9c24a867826615972cc288081976e7fc/environment.yaml && \ + conda env create --prefix /conda-envs/162796cecea22d99c8702138f0c48e2f --file /conda-envs/162796cecea22d99c8702138f0c48e2f/environment.yaml && \ conda env create --prefix /conda-envs/9439457f932a4fbca3665c9ea1ac2f0a --file /conda-envs/9439457f932a4fbca3665c9ea1ac2f0a/environment.yaml && \ - conda env create --prefix /conda-envs/ee7a2e1b4ec9a7a9999f34dddaea0605 --file /conda-envs/ee7a2e1b4ec9a7a9999f34dddaea0605/environment.yaml && \ + conda env create --prefix /conda-envs/bb4c5f3a509433cc08861582fab4a705 --file /conda-envs/bb4c5f3a509433cc08861582fab4a705/environment.yaml && \ conda env create --prefix /conda-envs/3fad3c9cdfa40bee9404f6a2e8fda69f --file /conda-envs/3fad3c9cdfa40bee9404f6a2e8fda69f/environment.yaml && \ conda env create --prefix /conda-envs/0a608afb24723cb6fa8aef748f5efbc8 --file /conda-envs/0a608afb24723cb6fa8aef748f5efbc8/environment.yaml && \ conda env create --prefix /conda-envs/04a3347f94ddf7e21c34bc49e5246076 --file /conda-envs/04a3347f94ddf7e21c34bc49e5246076/environment.yaml && \ conda env create --prefix /conda-envs/fd645c541ee7a3d43fb9167441b77888 --file /conda-envs/fd645c541ee7a3d43fb9167441b77888/environment.yaml && \ - conda env create --prefix /conda-envs/f2a098519cf1f8c4cecb3c13f8c92883 --file /conda-envs/f2a098519cf1f8c4cecb3c13f8c92883/environment.yaml && \ - conda env create --prefix /conda-envs/4b57bfc237ddc217c1f0b04d34dc06ef --file /conda-envs/4b57bfc237ddc217c1f0b04d34dc06ef/environment.yaml && \ - conda env create --prefix /conda-envs/1934df0e4df02a7ee33c52f53f9e3c30 --file /conda-envs/1934df0e4df02a7ee33c52f53f9e3c30/environment.yaml && \ + conda env create --prefix /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1 --file /conda-envs/96f3c1cec4b3ce5d72f708992272e9c1/environment.yaml && \ + conda env create --prefix /conda-envs/8ad6cdcf265d30289788da99d5bf9fff --file /conda-envs/8ad6cdcf265d30289788da99d5bf9fff/environment.yaml && \ + conda env create --prefix /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95 --file /conda-envs/0adafb79cb1bec58ef4c77bf4cca4f95/environment.yaml && \ conda env create --prefix /conda-envs/5150d0f0a91d7f7a789a06f453d63479 --file /conda-envs/5150d0f0a91d7f7a789a06f453d63479/environment.yaml && \ conda clean --all -y diff --git a/config/README.md b/config/README.md index 23e0367..f5cc413 100644 --- a/config/README.md +++ b/config/README.md @@ -12,16 +12,28 @@ For example, existing [profiles](https://snakemake.readthedocs.io/en/stable/gett are cross-compatible as well, but note that the `--use-conda` flag is deprecated starting with Snakemake 8. Instead, use `--software-deployment-method conda`. +## tl;dr + +- **Needs**: + - Snakemake 7.19 or later. + - One FASTA per sample. + - One BAM per sample. + - One metadata CSV with columns `ID`, `CollectionDate` (YYYY-MM-DD), `ResidenceCity` and `GISAIDEPI` (can be empty). +- **Setup**: edit [targets.yaml](/config/targets.yaml) (set `SAMPLES` and `METADATA`, at least) or build it using [`build_targets.py`](/build_targets.py). Leave `CONTEXT_FASTA: null` to auto-download from GISAID (needs `config/gisaid.yaml` with your username and password) or set a local FASTA path if download fails (see GISAID disclaimer). +- **Run**: `snakemake (--use-conda | --sdm conda) -c4`. 
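+
+A minimal metadata CSV (illustrative values only; the sample IDs, city and accession below are placeholders) could look like:
+
+```csv
+ID,CollectionDate,ResidenceCity,GISAIDEPI
+sample01,2021-01-05,Valencia,EPI_ISL_0000001
+sample02,2021-01-19,Valencia,
+```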
+
 ## Inputs and outputs

 The workflow requires a set of FASTA files (one per target sample),
 a corresponding set of BAM files (also one per target sample), and
 a metadata table in CSV format with one row per
-sample. The metadata must include the following columns: unique sample identifier (default column `ID`,
-used to match sequencing files with metadata), the date the sample was collected (default `CollectionDate`),
-the location where the sample was collected (default `ResidenceCity`), and GISAID accession (default `GISAIDEPI`).
-The default column names but can be customized if needed via the workflow parameters.
+sample. The metadata must include at least the following columns:

-These parameters are set in two configuration files in YAML format:
+- `ID`: unique sample identifier, used to match sequencing files with metadata.
+- `CollectionDate`: the date the sample was collected (YYYY-MM-DD).
+- `ResidenceCity`: the location where the sample was collected.
+- `GISAIDEPI`: GISAID accession identifier (`EPI_ISL_...` or empty).
+
+The paths to these input files are set in two configuration files in YAML format:
 [config.yaml](/config/config.yaml) (for general workflow settings)
 and [targets.yaml](/config/targets.yaml) (for specific dataset-related
 settings). The latter must be modified by the user to point the `SAMPLES` and `METADATA`
@@ -130,24 +142,50 @@ All of the following variables are pre-defined in [config.yaml](/config/config.y

 - `ALIGNMENT_REFERENCE`: NCBI accession number of the reference record for sequence alignment.
 - `PROBLEMATIC_VCF`: URL or path of a VCF file containing problematic genome positions for masking.
-- `FEATURES_JSON`: path of a JSON file containing name equivalences of genome features for data visualization.
 - `GENETIC_CODE_JSON`: path of a JSON file containing a genetic code for gene translation.
 - `TREE_MODEL`: substitution model used by IQTREE (see [docs](http://www.iqtree.org/doc/Substitution-Models)).
-- `UFBOOT_REPS`: ultrafast bootstrap replicates for IQTREE (see [UFBoot](https://doi.org/10.1093/molbev/msx281)).
-- `SHALRT_REPS`: Shimodaira–Hasegawa approximate likelihood ratio test bootstrap replicates for IQTREE (see [SH-aLRT](https://doi.org/10.1093/sysbio/syq010)).
+- `UFBOOT` & `SHALRT`: settings for ultrafast bootstrap (see [UFBoot](https://doi.org/10.1093/molbev/msx281))
+  and Shimodaira–Hasegawa approximate likelihood ratio test bootstrap
+  (see [SH-aLRT](https://doi.org/10.1093/sysbio/syq010)) in IQTREE runs:
+  - `REPS`: number of replicates.
+  - `THRESHOLD`: value cutoff for visualization.
 - `VC`: variant calling configuration:
-  - `MAX_DEPTH`: maximum depth at a position for `samtools mpileup` (option `-d`).
-  - `MIN_QUALITY`: minimum base quality for `samtools mpileup` (option `-Q`).
-  - `IVAR_QUALITY`: minimum base quality for `ivar variants` (option `-q`).
-  - `IVAR_FREQ`: minimum frequency threshold for `ivar variants` (option `-t`).
-  - `IVAR_DEPTH`: minimum read depth for `ivar variants` (option `-m`).
-- `DEMIX`: demixing configuration:
+  - `MIN_QUALITY`: minimum base quality for `ivar variants` (option `-q`).
+  - `MIN_FREQ`: minimum frequency threshold for `ivar variants` (option `-t`).
+  - `MIN_DEPTH`: minimum read depth for `ivar variants` (option `-m`).
+  - `MAX_DEPTH`: maximum read depth for `samtools mpileup` (option `-d`).
+- `DEMIX`: demixing configuration (uses [Freyja](https://github.com/andersen-lab/Freyja), see also [its docs](https://andersen-lab.github.io/Freyja/index.html)):
+  - `PATHOGEN`: pathogen of interest for `freyja update` (option `--pathogen`); must be 'SARS-CoV-2'.
-  - `MIN_QUALITY`: minimum quality for `freyja variants` (option `--minq`).
-  - `COV_CUTOFF`: minimum depth for `freyja demix` (option `--covcut`).
+  - `MIN_QUALITY`: minimum base quality for the demixing pileup (`samtools mpileup` option `-Q` and `ivar variants` option `-q`).
+  - `MAX_DEPTH`: maximum read depth for `samtools mpileup` (option `-d`).
+  - `COV_CUTOFF`: minimum depth to calculate the reported "coverage" (percent of sites with that depth) for `freyja demix` (option `--covcut`).
   - `MIN_ABUNDANCE`: minimum lineage estimated abundance for `freyja demix` (option `--eps`).
+  - `CONFIRMED_ONLY`: exclude unconfirmed lineages in `freyja demix` (option `--confirmedonly`).
+  - `DEPTH_CUTOFF`: minimum depth at each site for `freyja demix` (option `--depthcutoff`).
+  - `RELAXED_MRCA`: assign clusters using relaxed (as opposed to strict) MRCA, used with `DEPTH_CUTOFF`, for `freyja demix` (option `--relaxedmrca`).
+  - `RELAXED_MRCA_THRESH`: `RELAXED_MRCA` threshold for `freyja demix` (option `--relaxedthresh`).
+  - `AUTO_ADAPT`: use error profile to set adaptive lasso penalty parameter for `freyja demix` (option `--autoadapt`).
 - `WINDOW`: sliding window of nucleotide variants per site configuration:
   - `WIDTH`: number of sites within windows.
   - `STEP`: number of sites between windows.
+- `GB_FEATURES`: optional mapping to filter which features from the GenBank file are used
+  by some analyses (e.g. rules `window` and `n_s_sites`).
+  If `GB_FEATURES` is empty or unset, all features are used.
+  Filtering is applied in order:
+  - `INCLUDE`: mapping of qualifier names to sequences of values.
+    If present, only features that match at least one key/value pair in `INCLUDE`
+    are included in the analyses. For example, having `INCLUDE: {gene: [S, N]}` keeps features whose `gene`
+    qualifier equals `S` or `N`.
+  - `EXCLUDE`: mapping of qualifier names to sequences of values.
+    After `INCLUDE` is applied, any feature that matches any key/value pair in `EXCLUDE`
+    is omitted. For example, having `EXCLUDE: {gene: [S, N]}` removes features whose
+    `gene` qualifier equals `S` or `N`.
+- `ANNOTATION`: settings for variant annotation and functional effect prediction using [SnpEff](https://pcingola.github.io/SnpEff/), which uses `ALIGNMENT_REFERENCE` to select the annotation database.
+  - `SNPEFF_COLS`: mapping of column names (which appear in result tables) to VCF fields (extracted after annotation with [SnpSift](https://pcingola.github.io/SnpEff/#snpsift)). Some columns are hard-coded in the code, so removing them is not advised. Additional columns can be added as needed.
+  - `FILTER_INCLUDE` & `FILTER_EXCLUDE`: mappings of column names (from `SNPEFF_COLS`) to lists of values used for filtering the annotated variants table. `FILTER_INCLUDE` is applied first, then `FILTER_EXCLUDE`.
+    - `FILTER_INCLUDE`: keeps variants that match at least one listed value.
+    - `FILTER_EXCLUDE`: removes variants that match any listed value.
+  - `VARIANT_NAME_PATTERN`: string template used to build the variant name shown in the results table (column `VARIANT_NAME`). The template is interpreted with [glue](https://glue.tidyverse.org/) and can use any column name from `SNPEFF_COLS` and some R functions. For example, `"{GENE}:{coalesce(HGVS_P, HGVS_C)}"` creates names like `S:p.D614G` (using `HGVS_P` when available, otherwise `HGVS_C`).
 - `GISAID`: automatic context download configuration.
 - `CREDENTIALS`: path of the GISAID credentials in YAML format.
 - `DATE_COLUMN`: name of the column that contains sampling dates (YYYY-MM-DD) in the input target metadata.
@@ -160,27 +198,35 @@ All of the following variables are pre-defined in [config.yaml](/config/config.y
 - `EXACT`: boolean flag indicating whether to compute an exact p-value when possible. This option applies only to certain methods and may be set to `null` (default) to let R decide automatically.
 - `LOG_PY_FMT`: logging format string for Python scripts.
 - `PLOTS`: path of the R script that sets the design and style of data visualizations.
-- `PLOT_GENOME_REGIONS`: path of a CSV file containing genome regions, e.g. SARS-CoV-2 non-structural protein (NSP) coordinates, for data visualization.
+- `PLOT_GENOME_REGIONS`: path of a CSV file containing genome regions, e.g. SARS-CoV-2 non-structural protein (NSP) coordinates, for data visualization (columns: `region`, `start`, `end`).
 - `REPORT_QMD`: path of the report template in Quarto markdown (QMD) format.
+- `REPORT_CSS`: path of the report stylesheet definition in CSS format.

-## Workflow graphs
+## Workflow visualization

-To generate a simplified rule graph, run:
+Snakemake enables easy visualization of workflows and rule relationships. The `--rulegraph` option outputs a DOT file that describes dependencies between rules. The example below produces an image using Graphviz:

 ```shell
-snakemake --rulegraph | dot -Tpng > .rulegraph.png
+snakemake --forceall --rulegraph | dot -Tpng >.rulegraph.png
 ```

 ![Snakemake rule graph](/.rulegraph.png)

-To generate the directed acyclic graph (DAG) of all rules
-to be executed, run:
+The same graph can also be rendered with other tools such as [snakevision](https://github.com/OpenOmics/snakevision) (v0.1.0).
+
+```shell
+snakemake --forceall --rulegraph | snakevision -s all -o .rulegraph_sv.svg
+```
+
+![Snakemake rule graph using snakevision](/.rulegraph_sv.svg)
+
+The `--dag` option emits a directed acyclic graph (DAG) that corresponds to the rule instances that would be executed for the current dataset.
The example below produces an image using Graphviz: ```shell -snakemake --forceall --dag | dot -Tpng > .dag.png +snakemake --forceall --dag | dot -Tpng >.dag.png ``` -![Snakemake rule graph](/.dag.png) +![Snakemake DAG](/.dag.png) ## Run modes diff --git a/config/config.yaml b/config/config.yaml index 7bb2fbe..54d5097 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -2,29 +2,82 @@ ALIGNMENT_REFERENCE: "NC_045512.2" PROBLEMATIC_VCF: "https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/da322c32004f7b16bdaa6a8ee7fd24d56e79d9dc/problematic_sites_sarsCov2.vcf" -FEATURES_JSON: - "config/sarscov2_features.json" GENETIC_CODE_JSON: "config/standard_genetic_code.json" TREE_MODEL: "GTR+F+I+G4" -UFBOOT_REPS: - 1000 -SHALRT_REPS: - 1000 +UFBOOT: + REPS: 1000 + THRESHOLD: 0.95 +SHALRT: + REPS: 1000 + THRESHOLD: 0.80 VC: - MAX_DEPTH: 0 - MIN_QUALITY: 0 - IVAR_QUALITY: 20 - IVAR_FREQ: 0.05 - IVAR_DEPTH: 30 + MIN_QUALITY: 20 + MIN_FREQ: 0.05 + MIN_DEPTH: 30 + MAX_DEPTH: 1000000 DEMIX: + PATHOGEN: "SARS-CoV-2" MIN_QUALITY: 20 + MAX_DEPTH: 1000000 COV_CUTOFF: 30 MIN_ABUNDANCE: 0.0001 + CONFIRMED_ONLY: false + DEPTH_CUTOFF: 0 + RELAXED_MRCA: false + RELAXED_MRCA_THRESH: 0.9 + AUTO_ADAPT: false + SOLVER: "CLARABEL" WINDOW: WIDTH: 1000 STEP: 50 +GB_FEATURES: + INCLUDE: # any + product: + - "ORF1ab polyprotein" + - "surface glycoprotein" + - "ORF3a protein" + - "envelope protein" + - "membrane glycoprotein" + - "ORF6 protein" + - "ORF7a protein" + - "ORF7b" + - "ORF8 protein" + - "nucleocapsid phosphoprotein" + - "ORF10 protein" + # EXCLUDE: ... # all +ANNOTATION: + # see: https://pcingola.github.io/SnpEff/adds/VCFannotationformat_v1.0.pdf + SNPEFF_COLS: + CHROM: CHROM + POS: POS + REF: REF + ALT: ALT + EFFECT: "ANN[*].EFFECT" + IMPACT: "ANN[*].IMPACT" + BIOTYPE: "ANN[*].BIOTYPE" + GENE: "ANN[*].GENE" + GENEID: "ANN[*].GENEID" + FEATURE: "ANN[*].FEATURE" + FEATUREID: "ANN[*].FEATUREID" + HGVS_P: "ANN[*].HGVS_P" + HGVS_C: "ANN[*].HGVS_C" + ERRORS: "ANN[*].ERRORS" + FILTER_INCLUDE: + # IMPACT: [HIGH, MODERATE, LOW] + FILTER_EXCLUDE: + EFFECT: [upstream_gene_variant, downstream_gene_variant] + ERRORS: + - ERROR_CHROMOSOME_NOT_FOUND + - ERROR_OUT_OF_CHROMOSOME_RANGE + - WARNING_REF_DOES_NOT_MATCH_GENOME + - WARNING_SEQUENCE_NOT_AVAILABLE + - WARNING_TRANSCRIPT_INCOMPLETE + - WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS + - WARNING_TRANSCRIPT_NO_START_CODON + - WARNING_TRANSCRIPT_NO_STOP_CODON + VARIANT_NAME_PATTERN: "{GENE}:{coalesce(HGVS_P, HGVS_C)}" GISAID: CREDENTIALS: "config/gisaid.yaml" DATE_COLUMN: "CollectionDate" @@ -44,3 +97,5 @@ PLOT_GENOME_REGIONS: "config/nsp_annotation.csv" REPORT_QMD: "template.qmd" +REPORT_CSS: + "config/report.styles.css" diff --git a/config/design_plots.R b/config/design_plots.R index 971265c..74e87d8 100644 --- a/config/design_plots.R +++ b/config/design_plots.R @@ -1,6 +1,6 @@ # Jordi Sevilla -library(tidyverse) +library(ggplot2) library(showtext) # Ajustes #### @@ -8,13 +8,13 @@ showtext_auto(enable = FALSE) showtext_opts(dpi = 200) # Tema -font_add_google("Montserrat", "Montserrat") +font_add_google("Noto Sans", "Noto Sans") showtext_auto() theme_set(theme_minimal()) theme_update( - text = element_text(size = 16, family = "Montserrat"), + text = element_text(size = 16, family = "Noto Sans"), axis.title = element_text(size = 16), axis.line = element_line( linewidth = 0.5, @@ -25,12 +25,11 @@ theme_update( panel.grid = element_line(linewidth = 0.17, color = "lightgray") ) - -# CONFIG -gene_colors = c( +# Gene palette +GENE_PALETTE <- c( M = "#B4D4B4", N = 
"#B7B7B8", - orf1ab = "#9CC4DC", + ORF1ab = "#9CC4DC", ORF3a = "#ECB4B7", ORF8 = "#996D2B", S = "#F5CC9E", @@ -40,20 +39,32 @@ gene_colors = c( ORF10 = "#CACB5D" ) +# Nucleotide diversity +DIVERSITY_PALETTE <- c( + density_fill = "#fcbf49", + density_color = "#ffaa00", + value_color = "#D944AA", + dnorm_color = "#f77f00" +) # M-L tree colors and labels -tree_colors = c( +TREE_PALETTE <- c( tip_label = "#D944AA99", boot_alrt_pass = "#64ACEEB2" ) -node.size <- c( +TREE_NODE_SIZE <- c( tip_label = 2, boot_alrt_pass = 0.8 ) +TREE_LEGEND_NAMES <- c( + tip_label = "Target samples", + boot_alrt_pass = "UFBoot ≥ %s%s & SH-aLRT ≥ %s%s" +) + # Nucleotide variants classification colors and labels -NV_colors <- c( +NV_TYPE_PALETTE <- c( Frameshift = "#568D63", "In frame" = "black", Intergenic = "#B27CF9", @@ -61,7 +72,7 @@ NV_colors <- c( Yes = "#0248FD" ) -NV_names <- c( +NV_TYPE_NAMES <- c( Frameshift = "Frameshift", "In frame" = "Inframe", Intergenic = "Intergenic", @@ -70,17 +81,23 @@ NV_names <- c( ) # dn ds colors and labels -dnds.labels <- c( +DNDS_LABELS <- c( dn = "dN", ds = "dS" ) -dnds.colors <- c( - dn = "#E53E47", - ds = "#2C47F5" +DNDS_COLORS <- c( + dN = "#E53E47", + dS = "#2C47F5" ) -dnds.shapes <- c( - dn = 2, - ds = 4 +DNDS_SHAPES <- c( + dN = 2, + dS = 4 ) + +# Allele frequency trajectories panel color +ALL.COLORS <- grDevices::colors() +TRAJECTORY.PANEL.COLORS <- ALL.COLORS[ + !grepl("(gray|grey|white|snow|azure|beige)", ALL.COLORS) +] diff --git a/config/nsp_annotation.csv b/config/nsp_annotation.csv index 02126d0..78ab294 100644 --- a/config/nsp_annotation.csv +++ b/config/nsp_annotation.csv @@ -1,17 +1,7 @@ region,start,end -nsp1,266,805 -nsp2,806,2719 -nsp3,2720,8554 -nsp4,8555,10054 -nsp5,10055,10972 -nsp6,10973,11842 -nsp7,11843,12091 -nsp8,12092,12685 -nsp9,12686,13024 -nsp10,13025,13441 -nsp11,13442,13468 -nsp12,13468,16236 -nsp13,16237,18039 -nsp14,18040,19620 -nsp15,19621,20658 -nsp16,20659,21552 +NSP2,806,2719 +NSP3,2720,8554 +NSP4-11,8555,13468 +RdRP,13468,16236 +NSP13,16237,18039 +NSP14-16,18040,21552 diff --git a/config/report.styles.css b/config/report.styles.css index f81e0a9..9ea2783 100644 --- a/config/report.styles.css +++ b/config/report.styles.css @@ -1,35 +1,42 @@ -.quarto-title-block .quarto-title-banner { - background: #27445C; - background-image: url(logo_pathogenomics.png); - background-size: 100px; - background-position: right; - background-repeat: no-repeat; - padding-right: 10px; - background-origin: content-box; - } +html { + font-family: "Noto Sans"; + text-align: justify; +} -.title { - font-family: Montserrat; - color: #FFFFFF +body { + color: black; } -.quarto-title p { - font-family: Montserrat; - color: #FFFFFF + +.quarto-title-block .quarto-title-banner { + background-color: #27445C; + background-image: url("logo_pathogenomics.png"); + background-size: 100px; + background-position: right; + background-repeat: no-repeat; + background-origin: content-box; + padding-right: 10px; + background-blend-mode: normal; + filter: none; + mix-blend-mode: normal; + color: white; } -html { - font-family: Montserrat; - color: #000000; - text-align: justify; +.quarto-title-block .quarto-title-banner .title, +.quarto-title-block .quarto-title-banner p { + color: inherit; + -webkit-text-fill-color: inherit; + opacity: 1; } h1, h2, h3 { text-align: left; font-weight: bold; + color: #27445C } + #quarto-sidebar-toc-left { text-align: left; } .header-section-number { - color:#000000 -} \ No newline at end of file + color: #27445C +} diff --git 
a/config/sarscov2_features.json b/config/sarscov2_features.json deleted file mode 100644 index 2031f32..0000000 --- a/config/sarscov2_features.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "ORF1ab polyprotein": "orf1ab", - "ORF1a polyprotein": "orf1ab", - "surface glycoprotein": "S", - "ORF3a protein": "ORF3a", - "envelope protein": "E", - "membrane glycoprotein": "M", - "ORF6 protein": "ORF6", - "ORF7a protein": "ORF7", - "ORF7b": "ORF7", - "ORF8 protein": "ORF8", - "nucleocapsid phosphoprotein": "N", - "ORF10 protein": "ORF10" -} diff --git a/template.qmd b/template.qmd index eb5b82b..3c2d044 100644 --- a/template.qmd +++ b/template.qmd @@ -4,8 +4,8 @@ subtitle: "Workflow version: `r params$workflow_version`" date: last-modified date-format: "YYYY-MM-DD" title-block-banner: true -format: - html: +format: + html: page-layout: article embed-resources: true smooth-scroll: true @@ -15,41 +15,48 @@ format: toc-location: left toc-title: Summary number-sections: true -css: config/report.styles.css -editor: visual +css: __CSSPLACEHOLDER__ +editor: source params: - ufboot_reps: "" - shalrt_reps: "" - min_ivar_freq: "" - workflow_version: "" - use_bionj: "" - cor_method: "" - div: "" - freyja: "" - tree: "" - tempest: "" - SNV: "" - SNV_s: "" - evo: "" - div_value: "" - panel: "" - volcano: "" - tree_ml: "" - fig_cor_snp: "" - stats_lm: "" - table: "" - sum_nv: "" - heat_tab: "" - omega_plot: "" - name: "" + ufboot_reps: + shalrt_reps: + min_ivar_freq: + workflow_version: + use_bionj: + cor_method: + div: + demix: + tree: + tempest: + SNV: + SNV_s: + evo: + div_value: + panel: + volcano: + tree_ml: + fig_cor_snp: + stats_lm: + stats_ml: + table: + sum_nv: + heat_tab: + omega_plot: + name: + freyja_ts: output-file: report.html --- - + +```{r setup, echo = F, message = F, include = F} +display.num <- function(n, digits) { + format(n, digits = digits, nsmall = digits) +} +``` ```{r read, echo = F, message = F, include = F} library(jsonlite) @@ -62,11 +69,14 @@ library(readr) # Diversity div_values <- fromJSON(params$div_value) -# Temporal signal -stats <- fromJSON(params$stats_lm) +# Temporal signal and context tree +stats <- append( + fromJSON(params$stats_lm), + fromJSON(params$stats_ml) +) correlation <- stats[["r2"]] sub_rate <- stats[["sub_rate"]] -sub_rate <- round(sub_rate, digits = 2) +sub_rate <- display.num(sub_rate, 2) p_value_lm <- stats[["pvalue"]] # NV counts @@ -79,10 +89,10 @@ table <- read.csv(params$table) n.samples <- table %>% pull(Sample) %>% unique() %>% length() # Heatmap -vcf <- read_csv(params$heat_tab) -row.names(vcf) <- vcf$`...1` -vcf <- vcf[-1] -cor.mat <- cor(vcf, method = params$cor_method) +heatmap.df <- read.csv(params$heat_tab) +row.names(heatmap.df) <- heatmap.df$`...1` +heatmap.df[1] <- NULL +cor.mat <- cor(heatmap.df, method = params$cor_method) cor.mat[is.na(cor.mat)] <- 0 if (params$cor_method == "pearson") { cor.method.name <- "Pearson's" @@ -100,12 +110,16 @@ if (params$use_bionj) { } else { dist.tree.algo <- "neighbor-joining (NJ)" } + +# Freyja timestamp +freyja.timestamp <- read_file(params$freyja_ts) %>% + as.POSIXct(., format = "%m_%d_%Y-%H-%M") %>% + strftime(., format = "%Y-%m-%d at %H:%M") ``` ## Summary of the target samples dataset -```{r summary_table, echo = F,message = F} -#| label: tbl-summary +```{r tbl-summary, echo = F,message = F} #| tbl-cap: Target dataset summary table %>% @@ -133,7 +147,7 @@ table %>% columns = everything() ) %>% opt_table_font( - font = google_font("Montserrat") + font = google_font("Noto Sans") ) ``` @@ -141,11 +155,11 @@ 
table %>%

 ### Lineage admixture

-The estimated lineage admixture for each sample has been calculated
-using [Freyja](https://github.com/andersen-lab/Freyja).
+The lineage admixture for each sample has been estimated
+using [Freyja](https://github.com/andersen-lab/Freyja) (@fig-demix).

-![Estimated lineage admixture of each sample.
-Samples in the X-axis are ordered chronologically, from more ancient to newer.](`r params$freyja`){#fig-freyja}
+![Estimated lineage admixture of each sample (Freyja barcode timestamp: `r freyja.timestamp`).
+Samples on the X-axis are ordered chronologically, from oldest to newest.](`r params$demix`){#fig-demix}

 ### Phylogenetic reconstruction

@@ -159,26 +173,25 @@ $`r stats[["boot"]]`$% and a SH-aLRT score of $`r stats[["alrt"]]`$% (@fig-tree_

 ![Maximum-likelihood phylogeny with $`r params$ufboot_reps`$ UFBoot
 and $`r params$shalrt_reps`$ SH-aLRT support replicates of the target
 dataset and its context samples. The clade that contains the target
-samples is squared in red.](`r params$tree_ml`){#fig-tree_ml}
+samples is boxed.](`r params$tree_ml`){#fig-tree_ml}

 ### Nucleotide diversity comparison

 Nucleotide diversity (π) has been calculated for $`r div_values[["boot.reps"]]`$ random
-sample subsets of size $`r div_values[["sample.size"]]`$, extracted
-from the context dataset. The distribution of the nuclotide diversity is assumed to
-`r div_values[["norm.text"]]` be normal after performing a Shapiro-Wilk test
-(p-value of $`r div_values[["normal.pvalue"]]`$).
-
-The nucleotide diversity of the target samples is $`r div_values[["diversity"]] `$ (red line in @fig-div).
-Assuming the independence of the context samples, the `r div_values[["type.test"]]`
-p-value of the context samples having a nucleotide diversity (in orange in @fig-div)
-as low as that of the target dataset is $`r div_values[["p.value"]]`$.
-
-![Analysis of the nucleotide diversity (π). The orange line describes
-a normal distribution with the same mean and standard deviation as the distribution
-of π from $`r div_values[["boot.reps"]]`$ subsets of $`r div_values[["sample.size"]]`$
-sequences from the context. The red vertical line indicates the π value of the
-target samples.](`r params$div`){#fig-div}
+subsets of size $`r div_values[["sample.size"]]`$ drawn from the context dataset.
+A Shapiro–Wilk test indicates the distribution is `r div_values[["norm.text"]]`
+normal (p-value of $`r div_values[["normal.pvalue"]]`$).
+
+The nucleotide diversity of the target samples is $`r display.num(div_values[["diversity"]], 3)`$
+(vertical line in @fig-div). Assuming the context subsets are independent, the
+`r div_values[["type.test"]]` test gives a p-value of $`r display.num(div_values[["p.value"]], 3)`$
+for observing a nucleotide diversity as low as that of the target samples.
+
+![Distribution of nucleotide diversity (π) calculated from $`r div_values[["boot.reps"]]`$
+random subsets of $`r div_values[["sample.size"]]`$ sequences from the context dataset.
+The shaded area shows the empirical density; the overlaid curve is the normal distribution
+with the same mean and standard deviation.
+The vertical line marks the π value of the target samples.](`r params$div`){#fig-div}

 ## Evolutionary trajectory of the serially-sampled SARS-CoV-2 infection

@@ -186,7 +199,7 @@ target samples.](`r params$div`){#fig-div}

 Sites with minor allele frequency $> `r params$min_ivar_freq`$ are considered polymorphic.
The linear association between the collection date of the samples and the number of -polymorphic sites has an $R^2$ of $`r nv.counts[["r2"]]`$ and a p-value of +polymorphic sites has an $R^2$ of $`r display.num(nv.counts[["r2"]], 4)`$ and a p-value of $`r nv.counts[["value"]]`$ (@fig-fig_cor_snp). ![Number of polymorphic sites along time. The @@ -194,7 +207,7 @@ blue line shows the linear model fit.](`r params$fig_cor_snp`){#fig-fig_cor_snp} ### Description of intra-host nucleotide variants -A total of $`r n_SNV`$ different single nucleotide variants (SNV) and $`r n_INDELS`$ +A total of $`r n_SNV`$ different nucleotide variants (NV) and $`r n_INDELS`$ insertions and deletions (indels) have been detected along the genome (@fig-SNV). ::: {.panel-tabset} @@ -203,7 +216,7 @@ insertions and deletions (indels) have been detected along the genome (@fig-SNV) ![Summary of the intra-host accumulation of nucleotide variants, using the reconstructed dataset ancestor as reference. A) Nucleotide -variants per site along the SARS-CoV-2 genome. Relative abundance of NVs is calculated +variants per site along the SARS-CoV-2 genome. Relative abundance of variants is calculated with a sliding window of width $`r nv.counts[["window"]]`$ nucleotides and a step of $`r nv.counts[["step"]]`$. Labels indicate the coding regions of the non structural proteins (NSP) within ORF1ab. B) Genome variation along the genome for each sample. @@ -214,7 +227,7 @@ at the bottom, and the latest, at the top.](`r params$SNV`){#fig-SNV} ![Summary of the intra-host accumulation of nucleotide variants in the spike sequence, using the reconstructed dataset ancestor as reference. A) Nucleotide -variants per site along the S gene. Relative abundance of NVs is calculated +variants per site along the S gene. Relative abundance of variants is calculated with a sliding window of width $`r nv.counts[["window"]]`$ nucleotides and a step of $`r nv.counts[["step"]]`$. B) Genome variation along the S gene for each sample. The Y-axis displays samples in chronological order, with the earliest collection date @@ -224,12 +237,12 @@ at the bottom, and the latest, at the top.](`r params$SNV_s`){#fig-SNV_s} ### Temporal signal of the intra-host mutations -The correlation of the allele frequency of each NV with the time since the +`r cor.method.name` correlation of the allele frequency of each variant with the time since the initial sampling has been calculated (@fig-volcano). ![`r cor.method.name` correlation coefficients and adjusted p-values of -allele frequencies with time. Red dashed line indicates adjusted $p = 0.05$. -Labeled dots represent nucleotide variants correlated with time +allele frequencies with time. The dashed line indicates adjusted (false discovery rate) $p = 0.05$. +Labeled dots represent nucleotide variants whose allele frequency is correlated with time (adjusted $p < 0.05$).](`r params$volcano`){#fig-volcano} Significantly correlated nucleotide variants are described in more detail in @fig-panel. @@ -248,28 +261,31 @@ frequency-weighted distances.](`r params$tree`){#fig-tree} To estimate the evolutionary rate, root-to-tip distances measured on the previous tree (@fig-tree) have been correlated with time, obtaining a $R^2$ of -$`r correlation`$ and a p-value of $`r p_value_lm`$. The estimated evolutionary +$`r display.num(correlation, 4)`$ and a p-value of $`r p_value_lm`$. The estimated evolutionary rate is $`r sub_rate`$ number of changes per year (@fig-tempest). 
![Scatterplot depicting the relationship between root-to-tip -distances and the number of days passed since the first sample. The red -line shows the linear model fit.](`r params$tempest`){#fig-tempest} +distances and the number of days passed since the first sample. The solid +line represents the linear model fit.](`r params$tempest`){#fig-tempest} ### Correlation between alternative alleles -To detect possible interactions between mutations, pairwise correlation between allele -frequencies are calculated (@fig-heatmap). The heatmap is an interactive figure that allows -zooming in on specific regions. +`r cor.method.name` correlation coefficients of allele frequencies between pairs +of variants were calculated to detect interactions between them (@fig-heatmap). +The heatmap is interactive and allows zooming in on specific regions. -```{r heatmap, echo = F, message = F, fig.align = 'center'} -#| label: fig-heatmap -#| fig-cap: "Interactive hierarchically clustered heatmap of the pairwise correlation coefficients between the time series of allele frequencies in the case study." +```{r fig-heatmap, echo = F, message = F, warning = F, fig.align = 'center'} +#| fig-cap: "Interactive heatmap with hierarchical clustering of the pairwise correlation coefficients between the time series of allele frequencies in the case study." heatmaply_cor( cor.mat, - fontsize_row = 8, - fontsize_col = 8, - column_text_angle = 90 + grid_gap = 0.25, + fontsize_row = 7, + fontsize_col = 7, + column_text_angle = 45, + label_names = c("row", "column", "coefficient"), + width = 600, + height = 600 ) ``` diff --git a/workflow/Snakefile b/workflow/Snakefile index e8456e1..97381f5 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -6,7 +6,7 @@ import subprocess min_version("7.19") -__version__ = "1.2.2" +__version__ = "1.3.0" containerized: "docker://ahmig/vipera:v" + __version__ diff --git a/workflow/envs/afwdist.yaml b/workflow/envs/afwdist.yaml new file mode 100644 index 0000000..8e29d0d --- /dev/null +++ b/workflow/envs/afwdist.yaml @@ -0,0 +1,5 @@ +channels: + - bioconda + - conda-forge +dependencies: + - afwdist==1.0.0 diff --git a/workflow/envs/biopython.yaml b/workflow/envs/biopython.yaml index 172f7bf..aaafd09 100644 --- a/workflow/envs/biopython.yaml +++ b/workflow/envs/biopython.yaml @@ -5,7 +5,3 @@ dependencies: - python==3.10 - biopython==1.81 - pandas==2.0.3 - - pip==23.2.1 - - mafft==7.525 - - pip: - - gb2seq==0.2.20 diff --git a/workflow/envs/freyja.yaml b/workflow/envs/freyja.yaml index 94d9be6..6673ab1 100644 --- a/workflow/envs/freyja.yaml +++ b/workflow/envs/freyja.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - freyja==1.4.2 + - freyja==2.0.1 diff --git a/workflow/envs/quarto_render.yaml b/workflow/envs/quarto_render.yaml index 71fa4cb..a1ab3ba 100644 --- a/workflow/envs/quarto_render.yaml +++ b/workflow/envs/quarto_render.yaml @@ -1,12 +1,9 @@ channels: - conda-forge - - bioconda dependencies: - - r-base==4.3.1 - - r-gt==0.9.0 - - quarto==1.3.450 - - r-jsonlite==1.8.5 + - r-base==4.5.2 + - r-gt==1.1.0 + - quarto==1.8.25 + - deno==2.3.1 - r-tidyverse==2.0.0 - - r-quarto==1.2 - - r-heatmaply==1.4.2 - - r-readr==2.1.4 + - r-heatmaply==1.6.0 diff --git a/workflow/envs/renv.yaml b/workflow/envs/renv.yaml index 43568d8..ceb6a99 100644 --- a/workflow/envs/renv.yaml +++ b/workflow/envs/renv.yaml @@ -5,7 +5,6 @@ dependencies: - r-base=4.3.2 - r-tidyverse==2.0.0 - r-ggrepel==0.9.3 - - r-stringi==1.7.12 - r-ggpubr==0.6.0 - bioconductor-ggtree==3.10.0 - r-ape==5.8 @@ -15,5 
+14,4 @@ dependencies: - r-future.apply==1.11.0 - r-scales==1.3.0 - r-showtext==0.9_6 - - r-jsonlite==1.8.5 - r-logger==0.2.2 diff --git a/workflow/envs/snpeff.yaml b/workflow/envs/snpeff.yaml index 1423a3c..612ab76 100644 --- a/workflow/envs/snpeff.yaml +++ b/workflow/envs/snpeff.yaml @@ -2,3 +2,4 @@ channels: - bioconda dependencies: - snpeff==5.1d + - snpsift==5.1d diff --git a/workflow/rules/asr.smk b/workflow/rules/asr.smk index 5acaded..d20c6d4 100644 --- a/workflow/rules/asr.smk +++ b/workflow/rules/asr.smk @@ -2,6 +2,7 @@ rule reconstruct_ancestral_sequence: threads: 4 conda: "../envs/iqtree.yaml" params: + seed = 7291, seqtype = "DNA", name = OUTPUT_NAME, outgroup = config["ALIGNMENT_REFERENCE"], @@ -14,13 +15,11 @@ rule reconstruct_ancestral_sequence: log: LOGDIR / "reconstruct_ancestral_sequence" / "log.txt" shell: - """ - mkdir -p {output.folder} - iqtree2 \ - -asr \ - -o {params.outgroup} -T AUTO --threads-max {threads} -s {input.fasta} \ - --seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/{params.name} >{log} 2>&1 - """ + "mkdir -p {output.folder} && " + "iqtree2 -seed {params.seed} " + "-asr " + "-o {params.outgroup} -T AUTO --threads-max {threads} -s {input.fasta} " + "--seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/{params.name} >{log} 2>&1" rule ancestor_fasta: @@ -29,7 +28,7 @@ rule ancestor_fasta: params: node_id = "Node1", indeterminate_char = "N", - name = OUTPUT_NAME + name = "case_ancestor", input: state_file = OUTDIR/"tree"/f"{OUTPUT_NAME}.state" output: diff --git a/workflow/rules/context.smk b/workflow/rules/context.smk index 623060c..9f8fa05 100644 --- a/workflow/rules/context.smk +++ b/workflow/rules/context.smk @@ -74,12 +74,14 @@ rule ml_context_tree: conda: "../envs/iqtree.yaml" shadow: "shallow" params: + seed = 7291, seqtype = "DNA", name = OUTPUT_NAME, - ufboot = config["UFBOOT_REPS"], - alrt = config["SHALRT_REPS"], + ufboot = config["UFBOOT"]["REPS"], + alrt = config["SHALRT"]["REPS"], outgroup = config["ALIGNMENT_REFERENCE"], - model = config["TREE_MODEL"] + model = config["TREE_MODEL"], + max_iterations_convergence = 1000, input: fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.masked.fasta", outgroup_aln = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.masked.fasta" @@ -89,14 +91,11 @@ rule ml_context_tree: log: LOGDIR / "ml_context_tree" / "log.txt" shell: - """ - exec >{log} - exec 2>&1 - - awk '/^>/{{p=seen[$0]++}}!p' {input.fasta} {input.outgroup_aln} > aln.fasta - mkdir -p {output.folder} - iqtree2 \ - -B {params.ufboot} -alrt {params.alrt} \ - -o {params.outgroup} -T AUTO --threads-max {threads} -s aln.fasta \ - --seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/{params.name} - """ + "exec >{log} && exec 2>&1; " + "awk '/^>/{{p=seen[$0]++}}!p' {input.fasta} {input.outgroup_aln} >aln.fasta && " + "mkdir -p {output.folder} && " + "iqtree2 -seed {params.seed} " + "-bb {params.ufboot} -alrt {params.alrt} " + "-o {params.outgroup} -T AUTO --threads-max {threads} -s aln.fasta " + "-nm {params.max_iterations_convergence} " + "--seqtype {params.seqtype} -m {params.model} --prefix {output.folder}/{params.name}" diff --git a/workflow/rules/demix.smk b/workflow/rules/demix.smk index db6f6c2..4973b3e 100644 --- a/workflow/rules/demix.smk +++ b/workflow/rules/demix.smk @@ -1,26 +1,47 @@ +rule demix_barcode_update: + threads: 1 + shadow: "shallow" + conda: + "../envs/freyja.yaml" + params: + pathogen = config["DEMIX"]["PATHOGEN"] + output: + folder = 
directory(OUTDIR/"demixing"/"freyja_data"), + curated_lineages = OUTDIR/"demixing"/"freyja_data"/"curated_lineages.json", + last_barcode_update = OUTDIR/"demixing"/"freyja_data"/"last_barcode_update.txt", + lineage_mutations = OUTDIR/"demixing"/"freyja_data"/"lineage_mutations.json", + lineage_yml = OUTDIR/"demixing"/"freyja_data"/"lineages.yml", + pathogens = OUTDIR/"demixing"/"freyja_data"/"pathogen_config.yml", + usher_barcodes = OUTDIR/"demixing"/"freyja_data"/"usher_barcodes.feather" + log: + LOGDIR / "demix_barcode_update" / "log.txt" + shell: + "mkdir -p {output.folder:q} && " + "freyja update --outdir {output.folder:q} --pathogen {params.pathogen:q} >{log} 2>&1" + + rule demix_preprocessing: threads: 1 - conda: "../envs/freyja.yaml" + conda: "../envs/var_calling.yaml" shadow: "minimal" input: bam = get_input_bam, ref_fasta = lambda wildcards: select_mapping_references_fasta() params: - minq = config["DEMIX"]["MIN_QUALITY"] + max_depth = config["DEMIX"]["MAX_DEPTH"], + minq = config["DEMIX"]["MIN_QUALITY"], output: depth_file = OUTDIR/"demixing"/"{sample}/{sample}_depth.txt", - variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv" + variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv", log: - LOGDIR / "demix_preprocessing" / "{sample}.log.txt" + pileup = LOGDIR / "demix_preprocessing" / "{sample}_pileup.log.txt", + ivar = LOGDIR / "demix_preprocessing" / "{sample}_ivar.log.txt", shell: - """ - freyja variants \ - "{input.bam}" \ - --variants {output.variants_file} \ - --depths {output.depth_file} \ - --minq {params.minq} \ - --ref {input.ref_fasta} >{log} 2>&1 - """ + "set -euo pipefail && " + "samtools mpileup -aa -A -d {params.max_depth} -Q {params.minq} -q 0 -B -f {input.ref_fasta:q} {input.bam:q} >sample.pileup 2>{log.pileup:q} && " + "ivar variants -p variants -q {params.minq} -r {input.ref_fasta:q} >{log.ivar:q} 2>&1 {output.depth_file:q} && " + "mv variants.tsv {output.variants_file:q}" rule demix: @@ -29,34 +50,52 @@ rule demix: shadow: "minimal" input: depth_file = OUTDIR/"demixing"/"{sample}/{sample}_depth.txt", - variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv" + variants_file = OUTDIR/"demixing"/"{sample}/{sample}_variants.tsv", + barcodes = OUTDIR/"demixing"/"freyja_data"/"usher_barcodes.feather", + curated_lineages = OUTDIR/"demixing"/"freyja_data"/"curated_lineages.json", + lineage_yml = OUTDIR/"demixing"/"freyja_data"/"lineages.yml", params: coverage_cutoff = config["DEMIX"]["COV_CUTOFF"], - minimum_abundance = config["DEMIX"]["MIN_ABUNDANCE"] + minimum_abundance = config["DEMIX"]["MIN_ABUNDANCE"], + confirmed_only = "--confirmedonly " if config["DEMIX"]["CONFIRMED_ONLY"] else "", + depth_cutoff = config["DEMIX"]["DEPTH_CUTOFF"], + auto_adapt = "--autoadapt " if config["DEMIX"]["AUTO_ADAPT"] else "", + relaxed_mrca = "--relaxedmrca " if config["DEMIX"]["RELAXED_MRCA"] else "", + relaxed_mrca_thresh = config["DEMIX"]["RELAXED_MRCA_THRESH"], + solver = config["DEMIX"]["SOLVER"], output: - demix_file = OUTDIR/"demixing"/"{sample}/{sample}_demixed.tsv" + demix_file = OUTDIR/"demixing"/"samples"/"{sample}/{sample}_demixed.tsv" log: LOGDIR / "demix" / "{sample}.log.txt" shell: - """ - freyja demix \ - "{input.variants_file}" \ - "{input.depth_file}" \ - --eps {params.minimum_abundance} \ - --covcut {params.coverage_cutoff} \ - --output {output.demix_file} >{log} 2>&1 - """ + "freyja demix " + "{input.variants_file:q} " + "{input.depth_file:q} " + "--barcodes {input.barcodes:q} " + "--meta {input.curated_lineages:q} " + 
"--lineageyml {input.lineage_yml:q} " + "--eps {params.minimum_abundance} " + "--covcut {params.coverage_cutoff} " + "--depthcutoff {params.depth_cutoff} " + "{params.confirmed_only}" + "{params.auto_adapt}" + "{params.relaxed_mrca}" + "--relaxedthresh {params.relaxed_mrca_thresh} " + "--solver {params.solver} " + "--output {output.demix_file} " + "--max-solver-threads {threads} " + ">{log} 2>&1" -rule summarise_demixing: +rule summarise_demix: threads: 1 conda: "../envs/renv.yaml" shadow: "shallow" input: - tables = expand(OUTDIR/"demixing"/"{sample}/{sample}_demixed.tsv", sample=iter_samples()) + tables = expand(OUTDIR/"demixing"/"samples"/"{sample}/{sample}_demixed.tsv", sample=iter_samples()) output: - summary_df = report(OUTDIR/"summary_freyja_demixing.csv") + summary_df = report(OUTDIR/"demixing"/"summary.csv") log: - LOGDIR / "summarise_demixing" / "log.txt" + LOGDIR / "summarise_demix" / "log.txt" script: - "../scripts/summary_demixing.R" + "../scripts/summarise_demix.R" diff --git a/workflow/rules/distances.smk b/workflow/rules/distances.smk index 4161df9..11fae38 100644 --- a/workflow/rules/distances.smk +++ b/workflow/rules/distances.smk @@ -1,17 +1,52 @@ -rule weighted_distances: - threads: 1 +rule extract_afwdist_variants: conda: "../envs/biopython.yaml" params: - samples = expand("{sample}", sample = iter_samples()), - mask_class = ["mask"] + sample_col = "SAMPLE", + position_col = "POS", + sequence_col = "ALT", + frequency_col = "ALT_FREQ", + mask_class = ["mask"], input: - tsv = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv", - vcf = lambda wildcards: select_problematic_vcf(), + variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", + mask_vcf = lambda wildcards: select_problematic_vcf(), ancestor = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta", - reference = OUTDIR/"reference.fasta" + reference = OUTDIR/"reference.fasta", output: - distances = REPORT_DIR_TABLES/f"figure_8.csv" + variants = temp(OUTDIR/f"{OUTPUT_NAME}.variants.afwdist.csv"), log: - LOGDIR / "weighted_distances" / "log.txt" + LOGDIR/"extract_afwdist_variants"/"log.txt" script: - "../scripts/weighted_distances.py" + "../scripts/extract_afwdist_variants.py" + + +rule afwdist_weighted_distances: + conda: "../envs/afwdist.yaml" + params: + extra_args = "", + input: + variants = OUTDIR/f"{OUTPUT_NAME}.variants.afwdist.csv", + reference = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta", + output: + distances = temp(OUTDIR/f"{OUTPUT_NAME}.distances.raw.csv"), + log: + LOGDIR/"afwdist_weighted_distances"/"log.txt" + shell: + "afwdist " + "-i {input.variants:q} " + "-r {input.reference:q} " + "-o {output.distances:q} " + "{params.extra_args} >{log:q} 2>&1" + + +rule format_afwdist_results: + conda: "../envs/biopython.yaml" + params: + samples = sorted(iter_samples()) + [config["ALIGNMENT_REFERENCE"]], + input: + distances = OUTDIR/f"{OUTPUT_NAME}.distances.raw.csv", + output: + distances = OUTDIR/f"{OUTPUT_NAME}.distances.csv", + log: + LOGDIR/"format_afwdist_results"/"log.txt" + script: + "../scripts/format_afwdist_results.py" diff --git a/workflow/rules/evolution.smk b/workflow/rules/evolution.smk index e89d358..0380940 100644 --- a/workflow/rules/evolution.smk +++ b/workflow/rules/evolution.smk @@ -1,14 +1,45 @@ -rule N_S_sites: +rule filter_genbank_features: threads: 1 conda: "../envs/biopython.yaml" + params: + included = config.get("GB_FEATURES", {}).get("INCLUDE", {}), + excluded = config.get("GB_FEATURES", {}).get("EXCLUDE", {}), input: - fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta", gb = OUTDIR/"reference.gb", - features = 
Path(config["FEATURES_JSON"]).resolve(), - genetic_code = Path(config["GENETIC_CODE_JSON"]).resolve() output: - csv = temp(OUTDIR/f"{OUTPUT_NAME}.ancestor.N_S.sites.csv") + gb = OUTDIR/"reference.cds.gb", + log: + LOGDIR / "filter_genbank_features" / "log.txt" + script: + "../scripts/filter_genbank_features.py" + + +rule n_s_sites: + threads: 1 + conda: "../envs/biopython.yaml" + params: + gb_qualifier_display = "gene", + input: + fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta", + gb = OUTDIR/"reference.cds.gb", + genetic_code = Path(config["GENETIC_CODE_JSON"]).resolve(), + output: + csv = temp(OUTDIR/f"{OUTPUT_NAME}.ancestor.n_s.sites.csv"), + log: + LOGDIR / "n_s_sites" / "log.txt" + script: + "../scripts/n_s_sites_from_fasta.py" + + +rule calculate_dnds: + conda: "../envs/renv.yaml" + input: + n_s_sites = OUTDIR/f"{OUTPUT_NAME}.ancestor.n_s.sites.csv", + variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", + metadata = config["METADATA"] + output: + table = OUTDIR/f"{OUTPUT_NAME}.dnds.csv", log: - LOGDIR / "N_S_sites" / "log.txt" + LOGDIR / "calculate_dnds" / "log.txt" script: - "../scripts/N_S_sites_from_fasta.py" + "../scripts/calculate_dnds.R" diff --git a/workflow/rules/fetch.smk b/workflow/rules/fetch.smk index 9f290be..3e4860c 100644 --- a/workflow/rules/fetch.smk +++ b/workflow/rules/fetch.smk @@ -66,3 +66,8 @@ rule fetch_problematic_vcf: select_problematic_vcf() shell: "curl {params.url} -o {output} -s 2> {log}" + + +rule create_empty_file: + output: + temp(touch(OUTDIR/"empty.txt")) diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index df45ea1..77aa6aa 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -1,160 +1,395 @@ -rule heatmap: +rule demix_plot_data: conda: "../envs/renv.yaml" input: - vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv", + summary_demixing = OUTDIR/"demixing"/"summary.csv", metadata = config["METADATA"] output: - table = report(REPORT_DIR_TABLES/"figure_10.csv") + data = REPORT_DIR_TABLES/"demix.csv" log: - LOGDIR / "heatmap" / "log.txt" + LOGDIR / "demix_plot_data" / "log.txt" script: - "../scripts/report/heatmap.R" + "../scripts/report/demix_plot_data.R" -rule window: - conda: "../envs/biopython.yaml" +rule demix_plot: + conda: "../envs/renv.yaml" params: - window = config["WINDOW"]["WIDTH"], - step = config["WINDOW"]["STEP"] + design = config["PLOTS"], + plot_width_mm = 159.2, + plot_height_mm = 119.4, input: - vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv", - gb = OUTDIR/"reference.gb", - features = config["FEATURES_JSON"] + data = REPORT_DIR_TABLES/"demix.csv" output: - window_df = temp(OUTDIR/f"{OUTPUT_NAME}.window.csv"), + plot = report(REPORT_DIR_PLOTS/"demix.png") log: - LOGDIR / "window" / "log.txt" + LOGDIR / "demix_plot" / "log.txt" script: - "../scripts/window.py" + "../scripts/report/demix_plot.R" -rule diversity: +rule diversity_data: threads: 4 conda: "../envs/renv.yaml" params: - design = config["PLOTS"], bootstrap_reps = config["DIVERSITY_REPS"], - plot_width = 159.2, - plot_height = 119.4 + aln_reference = config["ALIGNMENT_REFERENCE"], + seed = 7291, input: study_fasta = OUTDIR/"nextalign"/f"{OUTPUT_NAME}.aligned.masked.fasta", - context_fasta = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.masked.fasta" + context_fasta = OUTDIR/"context"/"nextalign"/"context_sequences.aligned.masked.fasta", + output: + divs = REPORT_DIR_TABLES/"diversity.txt", + json = REPORT_DIR_TABLES/"diversity.json", + log: + LOGDIR / "diversity_data" / "log.txt" + script: + "../scripts/report/diversity_data.R" + + 
+rule diversity_plot:
+    threads: 1
+    conda: "../envs/renv.yaml"
+    params:
+        design = config["PLOTS"],
+        plot_width_mm = 159.2,
+        plot_height_mm = 119.4,
+    input:
+        divs = REPORT_DIR_TABLES/"diversity.txt",
+        json = REPORT_DIR_TABLES/"diversity.json",
     output:
-        fig = report(REPORT_DIR_PLOTS/"figure_3.png"),
-        json = temp(OUTDIR/"diversity.json"),
-        table = REPORT_DIR_TABLES/"figure_3.csv"
+        plot = report(REPORT_DIR_PLOTS/"diversity.png"),
     log:
-        LOGDIR / "diversity" / "log.txt"
+        LOGDIR / "diversity_plot" / "log.txt"
     script:
         "../scripts/report/diversity_plot.R"
 
 
-rule freyja_plot:
+rule extract_genbank_regions:
+    conda: "../envs/biopython.yaml"
+    params:
+        gb_qualifier = "gene",
+    input:
+        gb = OUTDIR/"reference.cds.gb",
+    output:
+        regions = temp(REPORT_DIR_TABLES/"genbank_regions.json"),
+    log:
+        LOGDIR / "extract_genbank_regions" / "log.txt"
+    script:
+        "../scripts/report/extract_genbank_regions.py"
+
+
+rule polymorphic_sites_over_time_data:
     conda: "../envs/renv.yaml"
     params:
-        design = config["PLOTS"]
+        max_alt_freq = 1.0 - config["VC"]["MIN_FREQ"],
     input:
-        summary_demixing = OUTDIR/"summary_freyja_demixing.csv",
-        metadata = config["METADATA"]
+        variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+        metadata = config["METADATA"],
     output:
-        fig = report(REPORT_DIR_PLOTS/"figure_1.png"),
-        table = report(REPORT_DIR_TABLES/"figure_1.csv")
+        table = REPORT_DIR_TABLES/"polymorphic_sites_over_time.csv",
+        json = temp(REPORT_DIR_TABLES/"polymorphic_sites_over_time.json"),
     log:
-        LOGDIR / "freyja_plot" / "log.txt"
+        LOGDIR / "polymorphic_sites_over_time_data" / "log.txt"
     script:
-        "../scripts/report/freyja_plot.R"
+        "../scripts/report/polymorphic_sites_over_time_data.R"
 
 
-rule general_NV_description:
+rule polymorphic_sites_over_time_plot:
     conda: "../envs/renv.yaml"
     params:
-        samples = expand("{sample}", sample = iter_samples()),
         design = config["PLOTS"],
-        regions = config["PLOT_GENOME_REGIONS"],
+        plot_width_mm = 159.2,
+        plot_height_mm = 119.4,
+    input:
+        table = REPORT_DIR_TABLES/"polymorphic_sites_over_time.csv",
+    output:
+        plot = report(REPORT_DIR_PLOTS/"polymorphic_sites_over_time.png"),
+    log:
+        LOGDIR / "polymorphic_sites_over_time_plot" / "log.txt"
+    script:
+        "../scripts/report/polymorphic_sites_over_time_plot.R"
+
+
+rule window_data:
+    conda: "../envs/biopython.yaml"
+    params:
         window = config["WINDOW"]["WIDTH"],
         step = config["WINDOW"]["STEP"],
-        max_alt_freq = 1.0 - config["VC"]["IVAR_FREQ"]
+        features = config.get("GB_FEATURES", {}),
+        gb_qualifier_display = "gene"
     input:
-        window = OUTDIR/f"{OUTPUT_NAME}.window.csv",
-        vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
-        metadata = config["METADATA"]
+        variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+        gb = OUTDIR/"reference.gb",
     output:
-        fig = report(REPORT_DIR_PLOTS/"figure_5a.png"),
-        fig_s = report(REPORT_DIR_PLOTS/"figure_5b.png"),
-        fig_cor = report(REPORT_DIR_PLOTS/"figure_4.png"),
-        json = temp(OUTDIR/"summary_nv.json"),
-        table_1 = report(REPORT_DIR_TABLES/"figure_5a.csv"),
-        table_2 = report(REPORT_DIR_TABLES/"figure_5b.csv"),
-        table_3 = report(REPORT_DIR_TABLES/"figure_4.csv")
+        window_df = REPORT_DIR_TABLES/"window.csv",
+        json = temp(REPORT_DIR_TABLES/"window.json"),
     log:
-        LOGDIR / "general_NV_description" / "log.txt"
+        LOGDIR / "window_data" / "log.txt"
     script:
-        "../scripts/report/NV_description.R"
+        "../scripts/report/window_data.py"
 
 
-rule phylo_plots:
+rule nv_panel_data:
+    conda: "../envs/renv.yaml"
+    input:
+        variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+        metadata = config["METADATA"],
+    output:
+        table = 
REPORT_DIR_TABLES/"nv_panel.csv", + json = temp(REPORT_DIR_TABLES/"nv_panel.json"), + log: + LOGDIR / "nv_panel_data" / "log.txt" + script: + "../scripts/report/nv_panel_data.R" + + +rule nv_panel_zoom_on_feature_data: + input: + table = REPORT_DIR_TABLES/"nv_panel.csv", + regions = REPORT_DIR_TABLES/"genbank_regions.json", + output: + table = temp(REPORT_DIR_TABLES/"nv_panel.{region_name}.csv"), + log: + LOGDIR / "nv_panel_zoom_on_feature_data" / "{region_name}.log.txt" + script: + "../scripts/report/nv_panel_zoom_on_feature_data.py" + + +rule window_zoom_on_feature_data: + input: + table = REPORT_DIR_TABLES/"window.csv", + regions = REPORT_DIR_TABLES/"genbank_regions.json", + output: + table = temp(REPORT_DIR_TABLES/"window.{region_name}.csv"), + log: + LOGDIR / "window_zoom_on_feature_data" / "{region_name}.log.txt" + script: + "../scripts/report/window_zoom_on_feature_data.py" + + +rule nv_panel_plot: + conda: "../envs/renv.yaml" + params: + design = config["PLOTS"], + window_step = config["WINDOW"]["STEP"], + plot_height_mm = 250.0, + plot_width_mm = 240.0, + input: + panel = REPORT_DIR_TABLES/"nv_panel.csv", + window = REPORT_DIR_TABLES/"window.csv", + regions = REPORT_DIR_TABLES/"genbank_regions.json", + highlight_window_regions = config["PLOT_GENOME_REGIONS"], + output: + plot = report(REPORT_DIR_PLOTS/"nv_panel.png"), + log: + LOGDIR / "nv_panel_plot" / "log.txt" + script: + "../scripts/report/nv_panel_plot.R" + + +use rule nv_panel_plot as nv_panel_plot_S with: + input: + panel = REPORT_DIR_TABLES/"nv_panel.S.csv", + window = REPORT_DIR_TABLES/"window.S.csv", + regions = REPORT_DIR_TABLES/"genbank_regions.json", + highlight_window_regions = OUTDIR/"empty.txt", + output: + plot = report(REPORT_DIR_PLOTS/"nv_panel.S.png"), + log: + LOGDIR / "nv_panel_plot_S" / "log.txt" + + +rule merge_json_files: + input: + REPORT_DIR_TABLES/"nv_panel.json", + REPORT_DIR_TABLES/"polymorphic_sites_over_time.json", + REPORT_DIR_TABLES/"window.json", + output: + json = REPORT_DIR_TABLES/"nv_panel_summary.json", + run: + import json + result = {} + for path in input: + with open(path) as f: + d = json.load(f) + result |= d # will replace existing keys + with open(output.json, "w") as fw: + json.dump(result, fw, indent=2) + + +rule context_phylogeny_data: conda: "../envs/renv.yaml" params: design = config["PLOTS"], ref_name = config["ALIGNMENT_REFERENCE"], - boot_th = 95, - alrt_th = 80, + boot_th = config["UFBOOT"]["THRESHOLD"], + alrt_th = config["SHALRT"]["THRESHOLD"], + input: + target_fasta = OUTDIR/f"{OUTPUT_NAME}.fasta", + tree = OUTDIR/f"tree_context/{OUTPUT_NAME}.treefile", + output: + json = REPORT_DIR_TABLES/"context_phylogeny.json", + annotation = REPORT_DIR_TABLES/"context_phylogeny.csv", + log: + LOGDIR / "context_phylogeny_data" / "log.txt" + script: + "../scripts/report/context_phylogeny_data.R" + + +rule context_phylogeny_plot: + conda: "../envs/renv.yaml" + params: + design = config["PLOTS"], plot_height_mm = 119.4, plot_width_mm = 159.2, - use_bionj = config["USE_BIONJ"] + boot_th = config["UFBOOT"]["THRESHOLD"], + alrt_th = config["SHALRT"]["THRESHOLD"], + input: + tree = OUTDIR/f"tree_context/{OUTPUT_NAME}.treefile", + json = REPORT_DIR_TABLES/"context_phylogeny.json", + annotation = REPORT_DIR_TABLES/"context_phylogeny.csv" + output: + plot = report(REPORT_DIR_PLOTS/"context_phylogeny.png"), + log: + LOGDIR / "context_phylogeny_plot" / "log.txt" + script: + "../scripts/report/context_phylogeny_plot.R" + + +rule allele_freq_tree_data: + conda: "../envs/renv.yaml" + params: + 
use_bionj = config["USE_BIONJ"],
+        outgroup_id = config["ALIGNMENT_REFERENCE"],
     input:
-        dist = REPORT_DIR_TABLES/f"figure_8.csv",
+        dist = OUTDIR/f"{OUTPUT_NAME}.distances.csv",
+    output:
+        tree = REPORT_DIR_TABLES/"allele_freq_tree.nwk",
+    log:
+        LOGDIR / "allele_freq_tree_data" / "log.txt"
+    script:
+        "../scripts/report/allele_freq_tree_data.R"
+
+
+rule allele_freq_tree_plot:
+    conda: "../envs/renv.yaml"
+    params:
+        design = config["PLOTS"],
+        outgroup_id = config["ALIGNMENT_REFERENCE"],
+        plot_height_mm = 119.4,
+        plot_width_mm = 159.2,
+    input:
+        tree = REPORT_DIR_TABLES/"allele_freq_tree.nwk",
         study_fasta = OUTDIR/f"{OUTPUT_NAME}.fasta",
-        ml = OUTDIR/f"tree_context/{OUTPUT_NAME}.treefile",
-        metadata = config["METADATA"]
+        metadata = config["METADATA"],
+    output:
+        plot = report(REPORT_DIR_PLOTS/"allele_freq_tree.png"),
+    log:
+        LOGDIR / "allele_freq_tree_plot" / "log.txt"
+    script:
+        "../scripts/report/allele_freq_tree_plot.R"
+
+
+rule time_signal_data:
+    conda: "../envs/renv.yaml"
+    params:
+        outgroup_id = config["ALIGNMENT_REFERENCE"],
+    input:
+        tree = REPORT_DIR_TABLES/"allele_freq_tree.nwk",
+        metadata = config["METADATA"],
+    output:
+        table = report(REPORT_DIR_TABLES/"time_signal.csv"),
+        json = REPORT_DIR_TABLES/"time_signal.json",
+    log:
+        LOGDIR / "time_signal_data" / "log.txt"
+    script:
+        "../scripts/report/time_signal_data.R"
+
+
+rule time_signal_plot:
+    conda: "../envs/renv.yaml"
+    params:
+        design = config["PLOTS"],
+        plot_height_mm = 119.4,
+        plot_width_mm = 159.2,
+    input:
+        table = REPORT_DIR_TABLES/"time_signal.csv",
     output:
-        temest = report(REPORT_DIR_PLOTS/"figure_9.png"),
-        tree = report(REPORT_DIR_PLOTS/"figure_8.png"),
-        tree_ml = report(REPORT_DIR_PLOTS/"figure_2.png"),
-        table = report(REPORT_DIR_TABLES/"figure_9.csv"),
-        json = temp(OUTDIR/"stats.lm.json")
+        plot = report(REPORT_DIR_PLOTS/"time_signal.png"),
     log:
-        LOGDIR / "phylo_plots" / "log.txt"
+        LOGDIR / "time_signal_plot" / "log.txt"
     script:
-        "../scripts/report/phylo_plots.R"
+        "../scripts/report/time_signal_plot.R"
 
 
-rule evo_plots:
+rule dnds_plots:
     conda: "../envs/renv.yaml"
     params:
-        design = config["PLOTS"]
+        design = config["PLOTS"],
+        plot_height_mm = 119.4,
+        plot_width_mm = 159.2,
     input:
-        N_S = OUTDIR/f"{OUTPUT_NAME}.ancestor.N_S.sites.csv",
-        vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
-        metadata = config["METADATA"]
+        table = OUTDIR/f"{OUTPUT_NAME}.dnds.csv",
     output:
-        plot = report(REPORT_DIR_PLOTS/"figure_11.png"),
-        plot_omega = report(REPORT_DIR_PLOTS/"figure_12.png"),
-        table = report(REPORT_DIR_TABLES/"figure_11.csv")
+        plot_dn_ds = report(REPORT_DIR_PLOTS/"dn_and_ds.png"),
+        plot_omega = report(REPORT_DIR_PLOTS/"dnds.png"),
     log:
-        LOGDIR / "evo_plots" / "log.txt"
+        LOGDIR / "dnds_plots" / "log.txt"
     script:
-        "../scripts/report/evo_plots.R"
+        "../scripts/report/dnds_plots.R"
 
 
-rule snp_plots:
+rule af_time_correlation_data:
     conda: "../envs/renv.yaml"
     params:
-        design = config["PLOTS"],
         cor_method = config["COR"]["METHOD"],
-        cor_exact = config["COR"]["EXACT"]
+        cor_exact = config["COR"]["EXACT"],
     input:
-        vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
-        metadata = config["METADATA"]
+        variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv",
+        metadata = config["METADATA"],
     output:
-        pseudovolcano = report(REPORT_DIR_PLOTS/"figure_6.png"),
-        snp_panel = report(REPORT_DIR_PLOTS/"figure_7.png"),
-        table_1 = report(REPORT_DIR_TABLES/"figure_6.csv"),
-        table_2 = report(REPORT_DIR_TABLES/"figure_7.csv")
+        fmt_variants = temp(REPORT_DIR_TABLES/"variants.filled.dated.tsv"),
+        correlations = 
report(REPORT_DIR_TABLES/"af_time_correlation.csv"), + subset = REPORT_DIR_TABLES/"af_time_correlation.subset.txt", log: - LOGDIR / "snp_plots" / "log.txt" + LOGDIR / "af_time_correlation_data" / "log.txt" script: - "../scripts/report/snp_plots.R" + "../scripts/report/af_time_correlation_data.R" + + +rule af_time_correlation_plot: + conda: "../envs/renv.yaml" + params: + design = config["PLOTS"], + plot_height_mm = 119.4, + plot_width_mm = 159.2, + input: + correlations = REPORT_DIR_TABLES/"af_time_correlation.csv", + output: + plot = report(REPORT_DIR_PLOTS/"af_time_correlation.png"), + log: + LOGDIR / "af_time_correlation_plot" / "log.txt" + script: + "../scripts/report/af_time_correlation_plot.R" + + +rule af_trajectory_panel_plot: + conda: "../envs/renv.yaml" + params: + design = config["PLOTS"], + n_plot_columns = 3, + plot_row_height_mm = 35, + plot_width_mm = 159.2, + random_color_seed = 7291, + input: + fmt_variants = REPORT_DIR_TABLES/"variants.filled.dated.tsv", + subset = REPORT_DIR_TABLES/"af_time_correlation.subset.txt" + output: + plot = report(REPORT_DIR_PLOTS/"af_trajectory_panel.png"), + log: + LOGDIR / "af_trajectory_panel_plot" / "log.txt" + script: + "../scripts/report/af_trajectory_panel_plot.R" rule summary_table: @@ -163,7 +398,7 @@ rule summary_table: report = report(OUTDIR/f"{OUTPUT_NAME}.lineage_report.csv"), metadata = config["METADATA"] output: - table = temp(OUTDIR/"summary_table.csv") + table = REPORT_DIR_TABLES/"summary_table.csv" log: LOGDIR / "summary_table" / "log.txt" script: @@ -172,67 +407,77 @@ rule summary_table: rule report: conda: "../envs/quarto_render.yaml" - shadow: "shallow" + shadow: "minimal" input: qmd = Path(config["REPORT_QMD"]).resolve(), - freyja = report(REPORT_DIR_PLOTS/"figure_1.png"), - tree_ml = report(REPORT_DIR_PLOTS/"figure_2.png"), - diversity = report(REPORT_DIR_PLOTS/"figure_3.png"), - fig_cor = report(REPORT_DIR_PLOTS/"figure_4.png"), - SNV = report(REPORT_DIR_PLOTS/"figure_5a.png"), - SNV_spike = report(REPORT_DIR_PLOTS/"figure_5b.png"), - volcano = report(REPORT_DIR_PLOTS/"figure_6.png"), - panel = report(REPORT_DIR_PLOTS/"figure_7.png"), - tree = report(REPORT_DIR_PLOTS/"figure_8.png"), - temest = report(REPORT_DIR_PLOTS/"figure_9.png"), - heat_table = report(REPORT_DIR_TABLES/"figure_10.csv"), - evo = report(REPORT_DIR_PLOTS/"figure_11.png"), - omega_plot = report(REPORT_DIR_PLOTS/"figure_12.png"), - value = OUTDIR/"diversity.json", - stats_lm = OUTDIR/"stats.lm.json", - table = OUTDIR/"summary_table.csv", - sum_nv = OUTDIR/"summary_nv.json" + css = Path(config["REPORT_CSS"]).resolve(), + demix = report(REPORT_DIR_PLOTS/"demix.png"), + tree_ml = report(REPORT_DIR_PLOTS/"context_phylogeny.png"), + diversity = report(REPORT_DIR_PLOTS/"diversity.png"), + fig_cor = report(REPORT_DIR_PLOTS/"polymorphic_sites_over_time.png"), + SNV = report(REPORT_DIR_PLOTS/"nv_panel.png"), + SNV_spike = report(REPORT_DIR_PLOTS/"nv_panel.S.png"), + volcano = report(REPORT_DIR_PLOTS/"af_time_correlation.png"), + panel = report(REPORT_DIR_PLOTS/"af_trajectory_panel.png"), + tree = report(REPORT_DIR_PLOTS/"allele_freq_tree.png"), + temest = report(REPORT_DIR_PLOTS/"time_signal.png"), + evo = report(REPORT_DIR_PLOTS/"dn_and_ds.png"), + omega_plot = report(REPORT_DIR_PLOTS/"dnds.png"), + heat_table = report(OUTDIR/"vaf"/"pairwise_trajectory_correlation.csv"), + freyja_ts = OUTDIR/"demixing"/"freyja_data"/"last_barcode_update.txt", + value = REPORT_DIR_TABLES/"diversity.json", + stats_lm = REPORT_DIR_TABLES/"time_signal.json", + stats_ml = 
REPORT_DIR_TABLES/"context_phylogeny.json", + table = REPORT_DIR_TABLES/"summary_table.csv", + sum_nv = REPORT_DIR_TABLES/"nv_panel_summary.json", params: workflow_version = get_repo_version(BASE_PATH.as_posix(), __version__), - min_ivar_freq = config["VC"]["IVAR_FREQ"], - ufboot_reps = config["UFBOOT_REPS"], - shalrt_reps = config["SHALRT_REPS"], + min_ivar_freq = config["VC"]["MIN_FREQ"], + ufboot_reps = config["UFBOOT"]["REPS"], + shalrt_reps = config["SHALRT"]["REPS"], name = config["OUTPUT_NAME"], use_bionj = config["USE_BIONJ"], - cor_method = config["COR"]["METHOD"] + cor_method = config["COR"]["METHOD"], output: html = report(OUTDIR/f"{OUTPUT_NAME}.report.html") log: LOGDIR / "report" / "log.txt" shell: - "set +o pipefail; " - "Rscript -e 'library(quarto)' " - "-e \"quarto_render(" - "input = '{input.qmd}', " - "execute_params=list(" - "ufboot_reps='{params.ufboot_reps}', " - "shalrt_reps='{params.shalrt_reps}', " - "min_ivar_freq='{params.min_ivar_freq}', " - "workflow_version='{params.workflow_version}', " - "use_bionj='{params.use_bionj}', " - "cor_method='{params.cor_method}', " - "div='{input.diversity}', " - "freyja ='{input.freyja}', " - "tree = '{input.tree}', " - "tempest = '{input.temest}', " - "SNV = '{input.SNV}', " - "SNV_s = '{input.SNV_spike}', " - "evo = '{input.evo}', " - "div_value = '{input.value}', " - "panel = '{input.panel}', " - "volcano = '{input.volcano}', " - "tree_ml = '{input.tree_ml}', " - "fig_cor_snp = '{input.fig_cor}', " - "stats_lm = '{input.stats_lm}', " - "table = '{input.table}', " - "sum_nv = '{input.sum_nv}', " - "heat_tab = '{input.heat_table}', " - "omega_plot = '{input.omega_plot}', " - "name = '{params.name}'))\" " - ">{log} 2>&1 && " - 'mv "$(dirname {input.qmd:q})/report.html" {output.html:q}' + """ + set +o pipefail + exec >{log} && exec 2>&1 + + printf "%s\n" \ + "ufboot_reps: '{params.ufboot_reps}'" \ + "shalrt_reps: '{params.shalrt_reps}'" \ + "min_ivar_freq: '{params.min_ivar_freq}'" \ + "workflow_version: '{params.workflow_version}'" \ + "use_bionj: '{params.use_bionj}'" \ + "cor_method: '{params.cor_method}'" \ + "div: '{input.diversity}'" \ + "demix: '{input.demix}'" \ + "tree: '{input.tree}'" \ + "tempest: '{input.temest}'" \ + "SNV: '{input.SNV}'" \ + "SNV_s: '{input.SNV_spike}'" \ + "evo: '{input.evo}'" \ + "div_value: '{input.value}'" \ + "panel: '{input.panel}'" \ + "volcano: '{input.volcano}'" \ + "tree_ml: '{input.tree_ml}'" \ + "fig_cor_snp: '{input.fig_cor}'" \ + "stats_lm: '{input.stats_lm}'" \ + "stats_ml: '{input.stats_ml}'" \ + "table: '{input.table}'" \ + "sum_nv: '{input.sum_nv}'" \ + "heat_tab: '{input.heat_table}'" \ + "omega_plot: '{input.omega_plot}'" \ + "freyja_ts: '{input.freyja_ts}'" \ + "name: '{params.name}'" \ + >params.yaml + + sed "s|__CSSPLACEHOLDER__|{input.css}|g" {input.qmd:q} >report.qmd + + quarto render report.qmd --execute-params params.yaml + mv report.html {output.html:q} + """ diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index bd16219..2a2560b 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -1,20 +1,20 @@ rule snps_to_ancestor: - threads: 1 + threads: 2 retries: 3 shadow: "minimal" conda: "../envs/var_calling.yaml" params: - max_depth = config["VC"]["MAX_DEPTH"], - min_quality = config["VC"]["MIN_QUALITY"], - ivar_quality = config["VC"]["IVAR_QUALITY"], - ivar_freq = config["VC"]["IVAR_FREQ"], - ivar_depth = config["VC"]["IVAR_DEPTH"] + mpileup_depth = config["VC"]["MAX_DEPTH"], + mpileup_quality = 0, + ivar_quality = config["VC"]["MIN_QUALITY"], + 
ivar_freq = config["VC"]["MIN_FREQ"], + ivar_depth = config["VC"]["MIN_DEPTH"], input: reference_fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta", bam = get_input_bam, gff = OUTDIR/"reference.gff3" output: - tsv = temp(OUTDIR/"{sample}.tsv") + tsv = temp(OUTDIR/"vaf"/"{sample}.tsv") log: LOGDIR / "snps_to_ancestor" / "{sample}.log.txt" shell: @@ -35,58 +35,20 @@ rule snps_to_ancestor: samtools mpileup \ -aa \ --ignore-overlaps \ - -d {params.max_depth} \ + -d {params.mpileup_depth} \ --count-orphans \ --no-BAQ \ - -Q {params.min_quality} \ + -Q {params.mpileup_quality} \ -f renamed_reference.fasta \ {input.bam} \ | ivar variants \ - -p {wildcards.sample} \ + -p variants \ -q {params.ivar_quality} \ -t {params.ivar_freq} \ -m {params.ivar_depth} \ -g {input.gff} \ -r renamed_reference.fasta - - sed 's/'$ref'/'{wildcards.sample}'/g' {wildcards.sample}.tsv | cat > {output.tsv} - """ - - -rule annotation: - threads:1 - conda: "../envs/biopython.yaml" - shadow: "shallow" - input: - gb = OUTDIR/"reference.gb", - ref = OUTDIR/"reference.fasta", - features = Path(config["FEATURES_JSON"]).resolve() - output: - df = temp(OUTDIR/"annotation.csv") - log: - LOGDIR / "annotation" / "log.txt" - script: - "../scripts/report/get_annotation.py" - - -rule format_tsv: - threads:1 - shadow: "shallow" - input: - expand(OUTDIR/"{sample}.tsv", sample = iter_samples()) - output: - tsv = OUTDIR/f"{OUTPUT_NAME}.tsv" - log: - LOGDIR / "format_tsv" / "log.txt" - shell: - """ - path=`echo {input} | awk '{{print $1}}'` - grep "^REGION" "$path" > header - for tsv in {input}; do - tail -n +2 "$tsv" >> body - done - cat header body > "{output.tsv}" - rm header + mv variants.tsv {output.tsv:q} """ @@ -96,12 +58,12 @@ rule mask_tsv: params: mask_class = ["mask"] input: - tsv = OUTDIR/f"{OUTPUT_NAME}.tsv", + tsv = OUTDIR/"vaf"/"{sample}.tsv", vcf = lambda wildcards: select_problematic_vcf() output: - masked_tsv = temp(OUTDIR/f"{OUTPUT_NAME}.masked.tsv") + masked_tsv = temp(OUTDIR/"vaf"/"{sample}.masked.tsv") log: - LOGDIR / "mask_tsv" / "log.txt" + LOGDIR / "mask_tsv" / "{sample}.log.txt" script: "../scripts/mask_tsv.py" @@ -109,13 +71,16 @@ rule mask_tsv: rule filter_tsv: threads: 1 conda: "../envs/renv.yaml" + params: + min_depth = 20, + min_alt_rv = 2, + min_alt_dp = 2, input: - tsv = OUTDIR/f"{OUTPUT_NAME}.masked.tsv", - annotation = OUTDIR/"annotation.csv" + tsv = OUTDIR/"vaf"/"{sample}.masked.tsv" output: - filtered_tsv = temp(OUTDIR/f"{OUTPUT_NAME}.masked.prefiltered.tsv") + filtered_tsv = temp(OUTDIR/"vaf"/"{sample}.masked.prefiltered.tsv") log: - LOGDIR / "filter_tsv" / "log.txt" + LOGDIR / "filter_tsv" / "{sample}.log.txt" script: "../scripts/filter_tsv.R" @@ -123,12 +88,14 @@ rule filter_tsv: rule tsv_to_vcf: threads: 1 conda: "../envs/biopython.yaml" + params: + ref_name = config["ALIGNMENT_REFERENCE"], input: - tsv = OUTDIR/f"{OUTPUT_NAME}.masked.prefiltered.tsv", + tsv = OUTDIR/"vaf"/"{sample}.masked.prefiltered.tsv", output: - vcf = temp(OUTDIR/f"{OUTPUT_NAME}.vcf") + vcf = temp(OUTDIR/"vaf"/"{sample}.vcf") log: - LOGDIR / "tsv_to_vcf" / "log.txt" + LOGDIR / "tsv_to_vcf" / "{sample}.log.txt" script: "../scripts/tsv_to_vcf.py" @@ -141,11 +108,11 @@ rule variants_effect: ref_name = config["ALIGNMENT_REFERENCE"], snpeff_data_dir = (BASE_PATH / "config" / "snpeff").resolve() input: - vcf = OUTDIR/f"{OUTPUT_NAME}.vcf" + vcf = OUTDIR/"vaf"/"{sample}.vcf" output: - ann_vcf = temp(OUTDIR/f"{OUTPUT_NAME}.annotated.vcf") + ann_vcf = OUTDIR/"vaf"/"{sample}.annotated.vcf" log: - LOGDIR / "variants_effect" / "log.txt" + 
LOGDIR / "variants_effect" / "{sample}.log.txt" retries: 2 shell: """ @@ -159,19 +126,93 @@ rule variants_effect: echo "Local database not found at '{params.snpeff_data_dir}', downloading from repository" fi - snpEff eff -dataDir {params.snpeff_data_dir} -noStats {params.ref_name} {input.vcf} > {output.ann_vcf} + snpEff eff -dataDir {params.snpeff_data_dir} -noStats {params.ref_name} {input.vcf} >{output.ann_vcf} """ -rule vcf_to_tsv: +rule extract_vcf_fields: threads: 1 + conda: "../envs/snpeff.yaml" + params: + extract_columns = [f"'{col}'" for col in config["ANNOTATION"]["SNPEFF_COLS"].values()], + sep = ",", + input: + vcf = OUTDIR/"vaf"/"{sample}.annotated.vcf" + output: + tsv = OUTDIR/"vaf"/"{sample}.vcf_fields.tsv" + log: + LOGDIR / "tsv_to_vcf" / "{sample}.log.txt" + shell: + "SnpSift extractFields -e 'NA' -s {params.sep:q} {input.vcf:q} {params.extract_columns} >{output.tsv:q} 2>{log:q}" + + +rule format_vcf_fields_longer: + conda: "../envs/renv.yaml" + params: + sample = "{sample}", + colnames_mapping = config["ANNOTATION"]["SNPEFF_COLS"], + filter_include = config["ANNOTATION"]["FILTER_INCLUDE"], + filter_exclude = config["ANNOTATION"]["FILTER_EXCLUDE"], + variant_name_pattern = lambda wildcards: config["ANNOTATION"]["VARIANT_NAME_PATTERN"], # lambda to deactivate automatic wildcard expansion in pattern + sep = ",", + input: + tsv = OUTDIR/"vaf"/"{sample}.vcf_fields.tsv", + output: + tsv = OUTDIR/"vaf"/"{sample}.vcf_fields.longer.tsv", + log: + LOGDIR / "format_vcf_fields_longer" / "{sample}.log.txt" + script: + "../scripts/format_vcf_fields_longer.R" + + +rule concat_vcf_fields: + params: + sep = "\t", + input: + expand(OUTDIR/"vaf"/"{sample}.vcf_fields.longer.tsv", sample=iter_samples()), + output: + OUTDIR/f"{OUTPUT_NAME}.vcf_fields.longer.tsv", + run: + import pandas as pd + from functools import reduce + reduce( + lambda a, b: pd.concat((a, b), axis="rows", ignore_index=True), + (pd.read_csv(path, sep=params.sep) for path in input) + ).to_csv(output[0], sep=params.sep, index=False) + + +rule merge_annotation: + threads: 1 + conda: "../envs/renv.yaml" + params: + sample = "{sample}", + ref_name = config["ALIGNMENT_REFERENCE"], + input: + tsv = OUTDIR/"vaf"/"{sample}.masked.prefiltered.tsv", + annot = OUTDIR/"vaf"/"{sample}.vcf_fields.longer.tsv", + output: + tsv = OUTDIR/"vaf"/"{sample}.variants.tsv" + log: + LOGDIR / "merge_annotation" / "{sample}.log.txt" + script: + "../scripts/merge_annotation.R" + + +use rule concat_vcf_fields as concat_variants with: + input: + expand(OUTDIR/"vaf"/"{sample}.variants.tsv", sample=iter_samples()), + output: + OUTDIR/f"{OUTPUT_NAME}.variants.tsv", + + +rule pairwise_trajectory_correlation: conda: "../envs/renv.yaml" input: - ann_vcf = OUTDIR/f"{OUTPUT_NAME}.annotated.vcf", - pre_tsv = OUTDIR/f"{OUTPUT_NAME}.masked.prefiltered.tsv" + variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", + metadata = config["METADATA"], output: - tsv = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv" + table = report(OUTDIR/"vaf"/"pairwise_trajectory_correlation.csv"), log: - LOGDIR / "vcf_to_tsv" / "log.txt" + LOGDIR / "pairwise_trajectory_correlation" / "log.txt" script: - "../scripts/vcf_to_tsv.R" + "../scripts/pairwise_trajectory_correlation.R" diff --git a/workflow/scripts/N_S_sites_from_fasta.py b/workflow/scripts/N_S_sites_from_fasta.py deleted file mode 100644 index e98ac94..0000000 --- a/workflow/scripts/N_S_sites_from_fasta.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 - -import logging -import json -import itertools as it -import pandas as pd 
-from gb2seq.alignment import Gb2Alignment -from gb2seq.features import Features -from dark.fasta import FastaReads - - -def split_into_codons(seq: str) -> list: - """Split the complete CDS feature in to a list of codons""" - codons = [seq[i:i + 3] for i in range(0, len(seq), 3) if "N" not in seq[i:i + 3]] - return codons - - -def calculate_potential_changes(genetic_code: dict) -> dict: - """Generate a dictionary with S and N pre-calculated for all possible codons""" - # Initialize structure - nts = set(["A", "G", "T", "C"]) - potential_changes = {"S": {}, "N": {}} - for codon in it.product(nts, repeat=3): - potential_changes["S"]["".join(codon)] = 0. - potential_changes["N"]["".join(codon)] = 0. - # Mutate (substitutions) all possible codons in the given genetic code - # and count proportions of mutations that are synonymous and non-synonmyous - for codon in genetic_code.keys(): - for codon_p in range(0, 3): - nts = ["A", "G", "T", "C"] - # Do not consider self substitutions, e.g. A->A - nts.remove(codon[codon_p]) - for nt in nts: - codon_mutated = list(codon) - # Mutate the basepair - codon_mutated[codon_p] = nt - # Count how many of them are synonymous - if genetic_code[codon] == genetic_code["".join(codon_mutated)]: - potential_changes["S"][codon] += 1/3. - else: - potential_changes["N"][codon] += 1/3. - return potential_changes - - -def get_feature_codons(alignment: Gb2Alignment, annotation: list) -> dict: - dct = {key:alignment.ntSequences(key)[1].sequence for key in annotation} - return {key:split_into_codons(item) for key,item in dct.items()} - - -def get_df(codons: dict, genetic_code: dict) -> pd.DataFrame: - keys = [] - N_sites = [] - S_sites = [] - values = calculate_potential_changes(genetic_code) - for key, item in codons.items(): - keys.append(key) - N = sum([values["N"][x] for x in item if x in values["N"].keys()]) - S = sum([values["S"][x] for x in item if x in values["S"].keys()]) - N_sites.append(N) - S_sites.append(S) - return pd.DataFrame({"gene": keys, "N": N_sites, "S": S_sites}) - - -def main(): - - logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO) - - logging.info("Reading features") - with open(snakemake.input.features) as f: - feature_list = list(json.load(f).keys()) - - logging.info("Reading genetic code") - with open(snakemake.input.genetic_code) as f: - genetic_code = json.load(f) - - logging.info("Create alignment object") - features = Features(snakemake.input.gb) - seq = list(FastaReads(snakemake.input.fasta))[0] - aln = Gb2Alignment(seq, features) - - logging.info("Splitting ancestral sequence into codons") - codons_dict = get_feature_codons(aln, feature_list) - - logging.info("Calculating synonymous and non synonymous sites") - df = get_df(codons_dict, genetic_code) - - logging.info("Saving results") - df.to_csv(snakemake.output.csv,index= False) - - -if __name__ == "__main__": - main() diff --git a/workflow/scripts/calculate_dnds.R b/workflow/scripts/calculate_dnds.R new file mode 100644 index 0000000..f545c27 --- /dev/null +++ b/workflow/scripts/calculate_dnds.R @@ -0,0 +1,71 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(dplyr) +library(readr) +library(tidyr) +library(logger) + +log_threshold(INFO) + +# Read inputs +log_info("Reading variants table") +variants <- read_delim( + snakemake@input[["variants"]], + col_select = c( + "SAMPLE", + "VARIANT_NAME", + "REGION", 
+ "ALT_FREQ", + "SYNONYMOUS", + "POS" + ) +) + +log_info("Reading metadata table") +metadata <- read_delim(snakemake@input[["metadata"]]) %>% + mutate( + interval = as.numeric( + as.Date(CollectionDate) - min(as.Date(CollectionDate)) + ) + ) %>% + select(ID, interval) %>% + rename(SAMPLE = ID) + +log_debug("Adding metadata to variants table") +variants <- left_join(variants, metadata) + +log_info("Reading N/S sites") +n_s_position <- read_delim(snakemake@input[["n_s_sites"]]) + +log_info("Computing dN/dS over time (NG86)") +dn.ds <- variants %>% + group_by(SAMPLE, SYNONYMOUS) %>% + summarise( + Freq = sum(ALT_FREQ, na.rm = TRUE) + ) %>% + pivot_wider( + names_from = SYNONYMOUS, + values_from = Freq, + values_fill = 0 + ) %>% + transmute( + dn = No / sum(n_s_position$N), + ds = Yes / sum(n_s_position$S) + ) %>% + ungroup() %>% + left_join(unique(select(variants, SAMPLE, interval))) %>% + transmute( + sample = SAMPLE, + day = interval, + dN = dn, + dS = ds, + w = dn / ds + ) + +log_info("Writing results") +write_csv(dn.ds, snakemake@output[["table"]]) diff --git a/workflow/scripts/download_context.R b/workflow/scripts/download_context.R index fefc7f6..8f0e119 100644 --- a/workflow/scripts/download_context.R +++ b/workflow/scripts/download_context.R @@ -6,9 +6,12 @@ sink(log, type = "message") sink(log, type = "output") library(GISAIDR) -library(tidyverse) +library(dplyr) +library(readr) library(glue) +library(lubridate) library(logger) + log_threshold(INFO) CHKPT.ERROR.MSG <- paste( @@ -34,9 +37,7 @@ needed.columns <- c( ) needed.columns.mask <- needed.columns %in% colnames(sample.metadata) if (!all(needed.columns.mask)) { - print(glue( - "Please ensure column '{needed.columns[!needed.columns.mask]}' is present" - )) + log_error("Please ensure column '{needed.columns[!needed.columns.mask]}' is present") stop(glue( "Missing columns in '{snakemake@input[['metadata']]}'. 
Alternatively:\n{CHKPT.ERROR.MSG}" )) @@ -51,24 +52,20 @@ log_info("Getting time windows for context samles") # Get time windows dates <- sample.metadata %>% pull(snakemake@params[["date_column"]]) %>% - as.numeric + as.numeric() window.quantile.offset <- (1 - snakemake@params[["date_window_span"]]) / 2 min.date <- as_date(quantile(dates, window.quantile.offset, na.rm = TRUE)) max.date <- as_date(quantile(dates, 1 - window.quantile.offset, na.rm = TRUE)) padded.min.date <- min.date - snakemake@params[["date_window_paddding_days"]] padded.max.date <- max.date + snakemake@params[["date_window_paddding_days"]] -print(glue( - "Time window (span={snakemake@params[['date_window_span']]}): {round(interval(min.date, max.date) / days(1))} days (from {min.date} to {max.date})" -)) -print(glue( - "Padded time window (padding={snakemake@params[['date_window_paddding_days']]} days): {round(interval(padded.min.date, padded.max.date) / days(1))} days (from {padded.min.date} to {padded.max.date})" -)) +log_info("Time window (span={snakemake@params[['date_window_span']]}): {round(interval(min.date, max.date) / days(1))} days (from {min.date} to {max.date})") +log_info("Padded time window (padding={snakemake@params[['date_window_paddding_days']]} days): {round(interval(padded.min.date, padded.max.date) / days(1))} days (from {padded.min.date} to {padded.max.date})") log_info("Getting locations for context samples") # Get locations (if there are multiple, sample from all of them) locations <- sample.metadata %>% pull(snakemake@params[["location_column"]]) %>% - unique + unique() # GISAID login creds.list <- yaml::read_yaml(snakemake@params[["gisaid_creds"]]) @@ -105,7 +102,7 @@ samples.accids <- sample.metadata %>% pull(snakemake@params[["samples_gisaid_accession_column"]]) filtered.accids <- metadata %>% filter(accession_id %in% samples.accids) metadata <- metadata %>% filter(!accession_id %in% samples.accids) -print(glue("{nrow(metadata)} accession_ids remaining after GISAID ID filter")) +log_info("{nrow(metadata)} accession_ids remaining after GISAID ID filter") # Checkpoint: enforce a minimum number of samples to have at least # as many possible combinations as random subsample replicates. diff --git a/workflow/scripts/extract_afwdist_variants.py b/workflow/scripts/extract_afwdist_variants.py new file mode 100644 index 0000000..6a7bf3f --- /dev/null +++ b/workflow/scripts/extract_afwdist_variants.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 + +import logging +from typing import List + +import pandas as pd +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq + + +def read_monofasta(path: str) -> SeqRecord: + fasta = SeqIO.parse(path, "fasta") + record = next(fasta) + if next(fasta, None) is not None: + logging.warning(f"There are unread records left in '{path}'") + return record + + +def read_masked_sites(vcf_path: str, mask_classes: List[str]) -> List[int]: + """ + Parse a VCF containing positions for masking. Assumes the VCF file is + formatted as in: + github.com/W-L/ProblematicSites_SARS-CoV2/blob/master/problematic_sites_sarsCov2.vcf + with a "mask" or "caution" recommendation in column 7. + Masked sites are specified with params. 
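+    Returns the list of 1-based positions (the VCF POS column) whose
+    FILTER value is one of the given mask classes.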
+    """
+    vcf = pd.read_csv(
+        vcf_path,
+        sep=r"\s+",
+        comment="#",
+        names=("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")
+    )
+    return vcf.loc[vcf.FILTER.isin(mask_classes), "POS"].tolist()
+
+
+def build_ancestor_variant_table(ancestor: Seq, reference: Seq, reference_name: str, masked_positions: List[int]) -> pd.DataFrame:
+    pos = []
+    alt = []
+    for i in range(1, len(ancestor) + 1):
+        if i not in masked_positions and ancestor[i-1] != reference[i-1]:
+            pos.append(i)
+            alt.append(reference[i-1])
+    df = pd.DataFrame({snakemake.params.position_col: pos, snakemake.params.sequence_col: alt})
+    df[snakemake.params.frequency_col] = 1  # As a reference genome, we assume all positions have fixed alleles
+    df[snakemake.params.sample_col] = reference_name
+    return df
+
+
+DTYPES = {
+    "sample": "object",
+    "position": "int64",
+    "sequence": "object",
+    "frequency": "float64"
+}
+
+
+if __name__ == "__main__":
+
+    logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
+
+    colnames = {
+        snakemake.params.sample_col: "sample",
+        snakemake.params.position_col: "position",
+        snakemake.params.sequence_col: "sequence",
+        snakemake.params.frequency_col: "frequency"
+    }
+
+    logging.info("Reading input tables")
+    # Variants
+    variants = pd.read_table(snakemake.input.variants, sep="\t")
+    logging.info(f"Read {len(variants)} variant records")
+    # VCF with sites to mask
+    masked_sites = read_masked_sites(snakemake.input.mask_vcf, snakemake.params.mask_class)
+    logging.info(f"Read {len(masked_sites)} masked positions")
+
+    logging.info("Reading input FASTA files")
+    # Case ancestor
+    ancestor = read_monofasta(snakemake.input.ancestor)
+    logging.info(f"Ancestor: '{ancestor.description}', length={len(ancestor.seq)}")
+    # Alignment reference
+    reference = read_monofasta(snakemake.input.reference)
+    logging.info(f"Reference: '{reference.description}', length={len(reference.seq)}")
+
+    logging.info("Processing ancestor variants")
+    ancestor_table = build_ancestor_variant_table(ancestor.seq, reference.seq, reference.id, masked_sites)
+    logging.info(f"Ancestor has {len(ancestor_table)} variants")
+    all_variants = pd.concat([variants, ancestor_table], ignore_index=True)
+    logging.info(f"Combined table has {len(all_variants)} variants")
+
+    logging.info("Renaming and selecting columns")
+    output = (
+        all_variants
+        .rename(columns=colnames)
+        [list(colnames.values())]
+        .astype(DTYPES)
+    )
+    logging.info("Filtering sites")
+    output = output[~output.position.isin(masked_sites)]
+    logging.info(f"There are {len(output)} rows left")
+
+    logging.info("Writing results")
+    output.to_csv(snakemake.output.variants, index=False)
diff --git a/workflow/scripts/filter_genbank_features.py b/workflow/scripts/filter_genbank_features.py
new file mode 100644
index 0000000..959a24e
--- /dev/null
+++ b/workflow/scripts/filter_genbank_features.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+import logging
+from typing import Dict, Iterable, List
+
+from Bio import SeqIO
+from Bio.SeqFeature import SeqFeature
+
+
+def iter_features_filtering(features: Iterable[SeqFeature], included: Dict[str, List[str]], excluded: Dict[str, List[str]]) -> Iterable[SeqFeature]:
+    # No filters
+    if len(included) == 0 and len(excluded) == 0:
+        logging.debug("Selecting all features")
+        return iter(features)
+    # Only exclusion filter
+    elif len(included) == 0 and len(excluded) != 0:
+        logging.debug(f"Selecting features excluding all of {excluded}")
+        return (
+            feature for feature in features
+            if all(
+                (qualifier_value not in excluded.get(qualifier_key, []))
+                for qualifier_key in excluded.keys()
+                for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+            )
+        )
+    # Only inclusion filter
+    elif len(included) != 0 and len(excluded) == 0:
+        logging.debug(f"Selecting features including any of {included}")
+        return (
+            feature for feature in features
+            if any(
+                (qualifier_value in included.get(qualifier_key, []))
+                for qualifier_key in included.keys()
+                for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+            )
+        )
+    # Inclusion then exclusion filter
+    else:
+        logging.debug(f"Selecting features including any of {included} and then excluding all of {excluded}")
+        included_features = (
+            feature for feature in features
+            if any(
+                (qualifier_value in included.get(qualifier_key, []))
+                for qualifier_key in included.keys()
+                for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+            )
+        )
+        return (
+            feature for feature in included_features
+            if all(
+                (qualifier_value not in excluded.get(qualifier_key, []))
+                for qualifier_key in excluded.keys()
+                for qualifier_value in feature.qualifiers.get(qualifier_key, [])
+            )
+        )
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"],
+        level=logging.INFO
+    )
+
+    logging.info("Reading GenBank file")
+    gb = SeqIO.read(snakemake.input.gb, format="gb")
+
+    logging.info("Extracting CDS")
+    features = []
+    for feature in iter_features_filtering(gb.features, snakemake.params.included, snakemake.params.excluded):
+        logging.debug(
+            "Processing SeqFeature: "
+            f"ID={feature.id} type={feature.type} location={feature.location} "
+            f"gene={feature.qualifiers.get('gene', [])} "
+            f"locus_tag={feature.qualifiers.get('locus_tag', [])} "
+            f"product={feature.qualifiers.get('product', [])}"
+        )
+        features.append(feature)
+
+    logging.info("Replacing features")
+    gb.features = features
+
+    logging.info("Writing filtered GenBank file")
+    SeqIO.write(gb, snakemake.output.gb, "gb")
diff --git a/workflow/scripts/filter_tsv.R b/workflow/scripts/filter_tsv.R
index 55dc216..502ed78 100644
--- a/workflow/scripts/filter_tsv.R
+++ b/workflow/scripts/filter_tsv.R
@@ -5,38 +5,39 @@ log <- file(snakemake@log[[1]], open = "wt")
 sink(log, type = "message")
 sink(log, type = "output")
 
-library(tidyverse)
+library(dplyr)
+library(readr)
+library(stringr)
 library(logger)
+
 log_threshold(INFO)
 
 # Read inputs
 data <- read_tsv(snakemake@input[["tsv"]])
 
-# Filtering criteria:
-# - P-value < 0.05
-# - Depth >= 20
-# - For SNP more than 2 reads in each strand
+# Filtering
 is.deletion <- str_detect(
     data$ALT,
     "^[A-Z]",
     negate = TRUE
 )
-inBothStrands <- data$ALT_RV > 2 & data$ALT_DP > 2
-Depth <- (data$ALT_RV + data$ALT_DP) >= 20
+strand.mask <- data$ALT_RV > snakemake@params$min_alt_rv &
+    data$ALT_DP > snakemake@params$min_alt_dp
+depth.mask <- (data$ALT_RV + data$ALT_DP) >= snakemake@params$min_depth
 
 log_info("Filtering variants")
 data <- filter(
     data,
     as.logical(PASS),
-    Depth,
-    inBothStrands | is.deletion
+    depth.mask,
+    strand.mask | is.deletion
 )
 
 log_info("Finding synonymous and non synonymous variants")
 # Adding synonymous variable
 data <- mutate(
     data,
-    synonimous = case_when(
+    SYNONYMOUS = case_when(
         REF_AA == ALT_AA ~ "Yes",
         TRUE ~ "No"
     )
@@ -45,14 +46,6 @@
 # Remove duplicated features
 data <- distinct(data, pick(!GFF_FEATURE), .keep_all = TRUE)
 
-# Change annotation to gb2seq annotation
-features <- read_csv(snakemake@input[["annotation"]])
-
-data <- data %>%
-    select(!GFF_FEATURE) %>%
-    left_join(features) %>%
-    rename(GFF_FEATURE = GEN)
-
 log_info("Saving results")
 write_tsv(
     data,
diff --git a/workflow/scripts/format_afwdist_results.py b/workflow/scripts/format_afwdist_results.py
new file mode 100644
index 0000000..2f42ca6
--- /dev/null
+++ b/workflow/scripts/format_afwdist_results.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+import logging
+import pandas as pd
+
+
+if __name__ == "__main__":
+
+    logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
+
+    logging.info("Read pairwise distances")
+    df = pd.read_csv(snakemake.input.distances)
+
+    logging.info("Initializing formatted output")
+    output = pd.DataFrame(
+        data=0.0,
+        columns=snakemake.params.samples,
+        index=snakemake.params.samples,
+        dtype="float64"
+    )
+
+    logging.info("Filling table")
+    for _, row in df.iterrows():
+        output.loc[row.sample_m, row.sample_n] = row.distance
+        output.loc[row.sample_n, row.sample_m] = row.distance
+
+    logging.info("Writing formatted results")
+    output.to_csv(snakemake.output.distances)
diff --git a/workflow/scripts/format_vcf_fields_longer.R b/workflow/scripts/format_vcf_fields_longer.R
new file mode 100644
index 0000000..26c5512
--- /dev/null
+++ b/workflow/scripts/format_vcf_fields_longer.R
@@ -0,0 +1,75 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(readr)
+library(dplyr)
+library(tidyr)
+library(stringr)
+library(purrr)
+library(rlang)
+library(glue)
+library(logger)
+
+empty.to.na <- function(x) {
+    x[x == ""] <- NA
+    x
+}
+
+# Replace "" values with NA in R filter list
+# Snakemake passes filters like: list(ERRORS = c(""))
+log_info("Preprocessing data")
+filter.include <- lapply(snakemake@params$filter_include, empty.to.na)
+filter.exclude <- lapply(snakemake@params$filter_exclude, empty.to.na)
+
+# Process input table
+log_info("Applying filters and writing results")
+read_tsv(snakemake@input$tsv) %>%
+
+    # Split sep-delimited "...[*]..." columns (e.g. ANN[*].EFFECT) into rows
+    separate_rows(
+        contains("[*]"),
+        sep = snakemake@params$sep,
+        convert = TRUE
+    ) %>%
+
+    # Rename "...[*]..." columns using the provided lookup via Snakemake config
+    rename(all_of(unlist(snakemake@params$colnames_mapping))) %>%
+
+    # Separate &-delimited error column (more than one error/warning/info message per row is possible)
+    mutate(split_errors = strsplit(ERRORS, "&")) %>%
+    # Keep rows with none of the excluded ERRORS terms, if any
+    filter(map_lgl(split_errors, ~ !any(.
%in% filter.exclude[["ERRORS"]]))) %>% + select(-split_errors) %>% + + # Apply filters + filter( + # Keep variants that include required values in each field + !!!map2( + names(filter.include), + filter.include, + ~ expr(.data[[!!.x]] %in% !!.y) + ), + # Keep variants that exclude required values in each field + !!!map2( + names(filter.exclude), + filter.exclude, + ~ expr(!(.data[[!!.x]] %in% !!.y)) + ) + ) %>% + + # Keep unique rows + distinct() %>% + + mutate( + # Assign variant name using the pattern defined via Snakemake config + VARIANT_NAME = str_glue(snakemake@params$variant_name_pattern), + # Assign sample name + SAMPLE = snakemake@params$sample + ) %>% + + # Write output file + write_tsv(snakemake@output$tsv) diff --git a/workflow/scripts/merge_annotation.R b/workflow/scripts/merge_annotation.R new file mode 100644 index 0000000..9c322c4 --- /dev/null +++ b/workflow/scripts/merge_annotation.R @@ -0,0 +1,84 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(dplyr) +library(readr) +library(logger) + +log_threshold(INFO) + +log_info("Reading variants table, replacing REGION with the reference name") +variants <- read_tsv( + snakemake@input$tsv, + col_types = list( + REGION = col_character(), + POS = col_integer(), + REF = col_character(), + ALT = col_character() + ) +) %>% + mutate( + REGION = snakemake@params$ref_name, + is_insertion = startsWith(ALT, "+"), + is_deletion = startsWith(ALT, "-"), + REF_VCF = case_when( + is_deletion ~ paste0(REF, substring(ALT, 2)), + TRUE ~ REF + ), + ALT_VCF = case_when( + is_insertion ~ paste0(REF, substring(ALT, 2)), + is_deletion ~ REF, + TRUE ~ ALT + ) + ) %>% + select(-is_insertion, -is_deletion) + +log_info("Reading annotation table") +annotation <- read_tsv( + snakemake@input$annot, + col_select = c( + "CHROM", + "POS", + "REF", + "ALT", + "EFFECT", + "IMPACT", + "VARIANT_NAME" + ), + col_types = list( + CHROM = col_character(), + POS = col_integer(), + REF = col_character(), + ALT = col_character(), + VARIANT_NAME = col_character() + ) +) %>% + distinct() %>% + group_by(CHROM, POS, REF, ALT) %>% + mutate(VARIANT_NAME = paste(VARIANT_NAME, collapse = "|")) %>% + ungroup() + +log_info("Merging tables") +merged <- left_join( + variants, + annotation, + by = c( + "REGION" = "CHROM", + "POS" = "POS", + "REF_VCF" = "REF", + "ALT_VCF" = "ALT" + ) +) %>% + mutate( + SAMPLE = snakemake@params$sample + ) + +log_info("Saving results") +write_tsv( + merged, + snakemake@output$tsv +) diff --git a/workflow/scripts/n_s_sites_from_fasta.py b/workflow/scripts/n_s_sites_from_fasta.py new file mode 100644 index 0000000..cb7fa6f --- /dev/null +++ b/workflow/scripts/n_s_sites_from_fasta.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 + +import logging +import json +import itertools as it +from typing import Dict + +import pandas as pd +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq + + +NTS = ("A", "C", "G", "T") + + +def read_monofasta(path: str) -> SeqRecord: + return SeqIO.read(path, format="fasta") + + +def split_into_codons(seq: Seq) -> list: + """Split the complete CDS feature in to a list of codons""" + return [ + seq[i:i + 3] for i in range(0, len(seq)-2, 3) if all(char in NTS for char in seq[i:i + 3]) + ] + + +def calculate_potential_changes(genetic_code: dict) -> dict: + """Generate a dictionary with S and N pre-calculated for all possible codons""" + # Initialize structure + 
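# Note (added): for every codon, the N and S totals below sum to 3 (three positions x three alternative bases, each weighted 1/3).
+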
potential_changes = {"S": {}, "N": {}} + for codon in it.product(NTS, repeat=3): + potential_changes["S"]["".join(codon)] = 0.0 + potential_changes["N"]["".join(codon)] = 0.0 + # Mutate (substitutions) all possible codons in the given genetic code + # and count proportions of mutations that are synonymous and non-synonmyous + for codon in genetic_code.keys(): + for codon_p in range(0, 3): + nts = list(NTS) + # Do not consider self substitutions, e.g. A->A + nts.remove(codon[codon_p]) + for nt in nts: + codon_mutated = list(codon) + # Mutate the basepair + codon_mutated[codon_p] = nt + # Count how many of them are synonymous + if genetic_code[codon] == genetic_code["".join(codon_mutated)]: + potential_changes["S"][codon] += 1 / 3.0 + else: + potential_changes["N"][codon] += 1 / 3.0 + return potential_changes + + +def get_feature_codons(coding_records: Dict[str, SeqRecord]) -> dict: + return {name: split_into_codons(record.seq) for name, record in coding_records.items()} + + +def calculate_ns_sites(codons: dict, genetic_code: dict) -> pd.DataFrame: + features = [] + N_sites, S_sites = [], [] + values = calculate_potential_changes(genetic_code) + for key, item in codons.items(): + features.append(key) + N = sum([values["N"][x] for x in item if x in values["N"].keys()]) + S = sum([values["S"][x] for x in item if x in values["S"].keys()]) + N_sites.append(N) + S_sites.append(S) + return pd.DataFrame({"feature": features, "N": N_sites, "S": S_sites}) + + +def main(): + + logging.basicConfig( + filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], + level=logging.INFO + ) + + logging.info("Reading genetic code") + with open(snakemake.input.genetic_code) as f: + genetic_code = json.load(f) + + logging.info("Reading GenBank file") + gb = SeqIO.read(snakemake.input.gb, format="gb") + + logging.info("Reading input FASTA") + record = read_monofasta(snakemake.input.fasta) + + logging.info("Extracting CDS") + coding_records = {} + for feature in gb.features: + identifier = "|".join(feature.qualifiers.get(snakemake.params.gb_qualifier_display, [])) + if identifier == "": + logging.error(f"Feature at {feature.location} has no qualifier '{snakemake.params.gb_qualifier_display}'") + elif identifier in coding_records: + logging.warning(f"Identifier '{identifier}' is already among coding records and will not be replaced by feature at {feature.location}") + else: + logging.debug(f"Adding feature") + coding_records[identifier] = feature.extract(record) + + logging.info(f"Splitting {len(coding_records)} records into codons") + codons = get_feature_codons(coding_records) + + logging.info("Calculating synonymous and non synonymous sites") + df = calculate_ns_sites(codons, genetic_code) + + logging.info("Saving results") + df.to_csv(snakemake.output.csv, index=False) + + +if __name__ == "__main__": + main() diff --git a/workflow/scripts/pairwise_trajectory_correlation.R b/workflow/scripts/pairwise_trajectory_correlation.R new file mode 100644 index 0000000..27a8dfc --- /dev/null +++ b/workflow/scripts/pairwise_trajectory_correlation.R @@ -0,0 +1,41 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(dplyr) +library(readr) +library(tidyr) +library(tibble) +library(stringr) +library(logger) + +log_threshold(INFO) + +variants <- read_tsv(snakemake@input[["variants"]]) + +# Obtain sample names ordered by CollectionDate +date_order <- read_csv(snakemake@input[["metadata"]]) %>% + 
arrange(CollectionDate) %>% + pull(ID) %>% + unique() + +# Create SNP variable and select useful variables +variants <- variants %>% select(VARIANT_NAME, SAMPLE, ALT_FREQ) + +variants <- variants %>% + pivot_wider( + names_from = VARIANT_NAME, + values_from = ALT_FREQ, + values_fill = 0, + values_fn = sum + ) %>% + arrange(factor(SAMPLE, levels = date_order)) %>% + # Removes "|"-separated annotations, keeping the first one + ellipsis (clarifies heatmap) + rename_with(~ str_replace(., "^([^|]+)\\|.*$", "\\1(...)"), -SAMPLE) %>% + column_to_rownames(var = "SAMPLE") + +log_info("Saving table to create heatmap") +write.csv(variants, snakemake@output[["table"]]) diff --git a/workflow/scripts/report/NV_description.R b/workflow/scripts/report/NV_description.R deleted file mode 100644 index d3e72fd..0000000 --- a/workflow/scripts/report/NV_description.R +++ /dev/null @@ -1,505 +0,0 @@ -#!/usr/bin/env Rscript - -# Write stdout and stderr to log file -log <- file(snakemake@log[[1]], open = "wt") -sink(log, type = "message") -sink(log, type = "output") - -library(tidyverse) -library(stringi) -library(ggpubr) -library(jsonlite) -library(logger) -log_threshold(INFO) - - -# Import file with plots style -source(snakemake@params[["design"]]) - -# SARS-CoV-2 anotation for genome scheme -SCov2_annotation <- list( - "five_prime_UTR" = c(1:265), - "orf1ab" = c(266:21555), - "Intergenic_1" = c(21556:21562), - "S" = c(21563:25384), - "Intergenic_2" = c(25385:25392), - "ORF3a" = c(25393:26220), - "Intergenic_3" = c(26221:26244), - "E" = c(26245:26472), - "Intergenic_4" = c(26473:26522), - "M" = c(26523:27191), - "Intergenic_5" = c(27192:27201), - "ORF6" = c(27202:27387), - "Intergenic_6" = c(27388:27393), - "ORF7" = c(27394:27759), - "Intergenic_7" = c(27760:27893), - "ORF8" = c(27894:28259), - "Intergenic_8" = c(28260:28273), - "N" = c(28274:29533), - "Intergenic_9" = c(29534:29557), - "ORF10" = c(29558:29674), - "three_prime_UTR" = c(29675:29903) -) - -vcf <- read_delim(snakemake@input[["vcf"]]) -vcf_snp <- vcf -window <- read_csv(snakemake@input[["window"]]) -metadata <- read_csv(snakemake@input[["metadata"]]) - -# DATA PROCESSING -# Obtain sample names ordered by CollectionDate -date_order <- metadata %>% - filter(ID %in% snakemake@params[["samples"]]) %>% - arrange(CollectionDate) %>% - pull(ID) %>% - unique() - -empty_vcf <- tibble( - REGION = date_order, - variant = as.character(NA), - ALT_FREQ = as.numeric(NA), - GFF_FEATURE = as.character(NA), - synonimous = as.character(NA), - POS = as.numeric(NA), - ALT = as.character(NA), - NV_class = as.character(NA), - group = as.character(NA) -) - -# Create SNP variable and select useful variables -vcf <- vcf %>% - dplyr::select( - variant, - REGION, - ALT_FREQ, - GFF_FEATURE, - synonimous, - POS, - ALT - ) - -# Df with gene length for scheme -notation_empty <- data.frame( - gene = "", - len = 0 -) %>% - filter(len != 0) - -notation <- lapply( - names(SCov2_annotation), - function(x) { - add_row( - notation_empty, - gene = x, - len = length(SCov2_annotation[[x]]) - ) - } -) %>% - bind_rows() - -# Classifying variants -log_info("Classifying variants") -vcf <- vcf %>% - mutate( - NV_class = case_when( - str_detect(ALT, fixed("-")) | - str_detect(ALT, fixed("+")) ~ - "INDEL", - TRUE ~ "SNP" - ), - Class = case_when( - GFF_FEATURE == "Intergenic" ~ "Intergenic", - TRUE ~ synonimous - ), - POS = as.numeric(POS) - ) %>% - rowwise() %>% - mutate( - indel_len = case_when( - NV_class == "INDEL" ~ str_length(ALT) - 1 - ), - indel_class = case_when( - GFF_FEATURE == 
"Intergenic" ~ "Intergenic", - NV_class == "INDEL" & - indel_len %% 3 == 0 ~ - "In frame", - NV_class == "INDEL" & - indel_len %% 3 > 0 ~ - "Frameshift" - ) - ) %>% - ungroup() %>% - mutate( - group = case_when( - GFF_FEATURE == "Intergenic" ~ "Intergenic", - NV_class == "SNP" ~ Class, - NV_class == "INDEL" ~ indel_class - ) - ) - -# NSP data -npc <- read_csv(snakemake@params[["regions"]]) %>% - mutate( - summary_nsp = case_when( - region %in% paste("nsp", seq(4, 12, 1), sep = "") ~ "nsp4-12", - region %in% paste("nsp", seq(14, 16, 1), sep = "") ~ "nsp14-16", - TRUE ~ region - ), - summary_start = case_when( - region %in% paste("nsp", seq(4, 12, 1), sep = "") ~ 8555, - region %in% paste("nsp", seq(14, 16, 1), sep = "") ~ 18040, - TRUE ~ start - ), - summary_end = case_when( - region %in% paste("nsp", seq(4, 12, 1), sep = "") ~ 16236, - region %in% paste("nsp", seq(14, 16, 1), sep = "") ~ 21552, - TRUE ~ end - ) - ) %>% - filter(region != "nsp1") - - -# PLOTS -## SUMMARY FIGURE FOR WHOLE GENOME -log_info("Plotting summary figure for whole genome") - -if (nrow(vcf) == 0) { - log_warn("Whole-genome VCF has no rows") - vcf <- empty_vcf -} - -variants <- vcf %>% - filter(ALT_FREQ > 0) %>% - ggplot() + - aes( - x = POS, - y = factor(REGION, date_order), - shape = factor(NV_class, c("SNP", "INDEL")), - color = group, - alpha = ALT_FREQ - ) + - geom_point(size = 3) + - geom_col( - data = notation, - aes( - x = len, - y = 0.3, - fill = factor(gene, rev(names(SCov2_annotation))) - ), - inherit.aes = FALSE, - width = 0.3 - ) + - scale_fill_manual(values = gene_colors) + - xlim(c(0, 29903)) + - scale_color_manual( - labels = NV_names, - values = NV_colors - ) + - scale_y_discrete(drop = FALSE) + - labs( - x = "SARS-CoV-2 genome position", - y = "Sample", - shape = "Variant class", - color = "Classification", - alpha = "Frequency", - fill = "Region" - ) + - guides( - fill = guide_legend(reverse = TRUE) - ) - -# Window plot -window_plot <- window %>% - ggplot() + - aes( - x = position, - y = fractions, - color = gen - ) + - geom_point() + - geom_line( - aes(group = 1), - colour = "black", - alpha = 0.3 - ) + - scale_y_continuous( - label = scales::percent, - limits = c(0, max(window$fractions) + 0.005) - ) + - xlim(c(0, 29903)) + - scale_color_manual(values = gene_colors) + - labs( - y = "Proportion of \n sites with NV", - x = "", - color = "Gen" - ) - -# Add nsp info -window_plot_nsp <- window_plot + - geom_vline( - data = npc, - aes(xintercept = summary_start), - color = "red" - ) + - geom_vline( - data = npc, - aes(xintercept = summary_end), - color = "red" - ) + - geom_text( - data = npc, - aes( - x = (summary_start + summary_end) / 2, - y = max(window$fractions) + 0.002, - label = summary_nsp - ), - inherit.aes = FALSE, - size = 5, - angle = 60 - ) - -figura <- ggarrange( - window_plot_nsp, - variants, - nrow = 2, - align = "v", - legend.grob = get_legend(variants), - heights = c(2, 6), - legend = "right", - labels = c("A", "B") -) - -ggsave( - filename = snakemake@output[["fig"]], - plot = figura, - width = 250, - height = 240, - units = "mm", - dpi = 250 -) - - -# Zoom in in spike -log_info("Plotting summary for variants in the spike") - -spike_pos <- window %>% - filter(gen == "S") %>% - pull(position) - -vcf_spike <- vcf %>% - filter( - ALT_FREQ > 0, - POS %in% c(min(spike_pos):max(spike_pos)) - ) - -window_plot_spike <- window %>% - filter(gen == "S") %>% - ggplot() + - aes( - x = position, - y = fractions, - color = gen - ) + - geom_point() + - geom_line( - aes(group = 1), - colour = "black", 
- alpha = 0.3 - ) + - scale_y_continuous( - label = scales::percent, - limits = c(0, max(window$fractions) + 0.005) - ) + - xlim(c(min(spike_pos), max(spike_pos))) + - scale_color_manual( - values = gene_colors, - guide = "none" - ) + - labs( - y = "Proportion of \n sites with NV", - x = "" - ) - -if (nrow(vcf_spike) == 0) { - log_warn("Spike VCF has no rows") - vcf_spike <- empty_vcf -} - -variants_spike <- vcf_spike %>% - ggplot() + - aes( - x = POS, - y = factor(REGION, date_order), - shape = factor(NV_class, c("SNP", "INDEL")), - color = group, - alpha = ALT_FREQ - ) + - geom_point(size = 3) + - xlim(c(min(spike_pos), max(spike_pos))) + - scale_color_manual( - labels = NV_names, - values = NV_colors - ) + - scale_y_discrete(drop = FALSE) + - labs( - x = "SARS-CoV-2 genome position", - y = "Sample", - shape = "Variant class", - color = "Classification", - alpha = "Frequency", - fill = "Region" - ) + - guides( - fill = guide_legend(reverse = TRUE) - ) - -figura_spike <- ggarrange( - window_plot_spike, - variants_spike, - nrow = 2, - align = "v", - legend.grob = get_legend(variants), - heights = c(2, 6), - legend = "right", - labels = c("A", "B") -) - -ggsave( - filename = snakemake@output[["fig_s"]], - plot = figura_spike, - width = 250, - height = 240, - units = "mm", - dpi = 250 -) - - -# Figure for no of heterozygous sites for each sample -log_info("Plotting no. of heterozygous sites for each sample") -figur_SNP_table <- vcf_snp %>% - filter(ALT_FREQ <= snakemake@params$max_alt_freq) %>% - left_join( - metadata, - by = c("REGION" = "ID") - ) %>% - group_by(REGION) %>% - summarise( - CollectionDate = min(as.Date(CollectionDate)), - n = n_distinct(POS) - ) %>% - ungroup() - -if (nrow(figur_SNP_table) == 0) { - log_warn("Filtered SNP table has no rows") - figur_SNP_table <- empty_vcf -} - -figur_SNP_time <- figur_SNP_table %>% - ggplot() + - aes( - x = CollectionDate, - y = n - ) + - geom_smooth( - method = "lm", - fill = "gray95", - alpha = 0.6 - ) + - geom_point() + - labs( - x = "Collection date", - y = "No. 
of polimorphic sites" - ) - -ggsave( - filename = snakemake@output[["fig_cor"]], - plot = figur_SNP_time, - width = 250, - height = 119.4, - units = "mm", - dpi = 250 -) - - -# PLOT TABLES -log_info("Saving plot tables") - -# Variants summary table -vcf %>% - select( - REGION, - POS, - variant, - ALT_FREQ, - NV_class, - group - ) %>% - rename( - sample = REGION, - Variant = variant, - Class = group - ) %>% - filter(ALT_FREQ > 0) %>% - mutate( - Class = case_when( - Class == "Yes" ~ "synonymous", - Class == "No" ~ "non_synonymous", - TRUE ~ Class - ) - ) %>% - write.csv(snakemake@output[["table_2"]], row.names = FALSE) - -# Window plot table -window %>% - transmute( - POS = position, - feature = gen, - prop_PolymorphicSites = fractions - ) %>% - write.csv(snakemake@output[["table_1"]], row.names = FALSE) - -# Heterozygous sites per sample table -vcf_snp %>% - filter(ALT_FREQ <= snakemake@params$max_alt_freq) %>% - select(!GFF_FEATURE) %>% - left_join( - metadata, - by = c("REGION" = "ID") - ) %>% - group_by(REGION) %>% - summarise( - CollectionDate = min(as.Date(CollectionDate)), - n_PolymorphicSites = n_distinct(POS) - ) %>% - ungroup() %>% - rename(sample = REGION) %>% - unique() %>% - write.csv(snakemake@output[["table_3"]], row.names = FALSE) - -# Stats for reporting -n_indels <- vcf %>% - filter(NV_class == "INDEL") %>% - pull(variant) %>% - unique() %>% - length() - -n_snv <- length(unique(vcf$variant)) - n_indels -model <- lm(n ~ CollectionDate, data = figur_SNP_table) - -# Calculate correlation, if possible -if (nrow(figur_SNP_table) > 2) { - p.value <- summary(model)$coefficients[2,4] -} else { - p.value <- NA -} - -list( - "INDELS" = n_indels, - "SNV" = n_snv, - "window" = snakemake@params[["window"]], - "step" = snakemake@params[["step"]], - "r2" = summary(model)$r.squared[[1]], - "value" = ifelse(p.value < 0.001, "< 0.001", p.value) -) %>% - toJSON() %>% - write(snakemake@output[["json"]]) diff --git a/workflow/scripts/report/af_time_correlation_data.R b/workflow/scripts/report/af_time_correlation_data.R new file mode 100644 index 0000000..d3415c0 --- /dev/null +++ b/workflow/scripts/report/af_time_correlation_data.R @@ -0,0 +1,141 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(dplyr) +library(readr) +library(tidyr) +library(tibble) +library(logger) +log_threshold(INFO) + +# Read data +log_info("Reading variants table") +variants <- read_delim( + snakemake@input[["variants"]], + col_select = c( + "VARIANT_NAME", + "SAMPLE", + "REGION", + "ALT_FREQ", + "POS" + ) +) %>% + # Fill positions without alt frequency with 0 + complete( + nesting(REGION, VARIANT_NAME, POS), + SAMPLE, + fill = list(ALT_FREQ = 0) + ) + +log_info("Reading metadata") +metadata <- read_csv(snakemake@input[["metadata"]]) %>% + filter( + ID %in% variants$SAMPLE + ) %>% + select( + ID, + CollectionDate + ) + +log_info("Dating variants") +variants <- left_join(variants, metadata, by = c("SAMPLE" = "ID")) %>% + # Calculate days since first sample + arrange( + CollectionDate + ) %>% + mutate( + interval = as.numeric(difftime(CollectionDate, min(CollectionDate), units = "days")) + ) + +# Save processed input +log_info("Writing dated and frequency-filled variants") +write_csv(variants, snakemake@output$fmt_variants) + +log_info("Calculating correlations") +log_debug("Calculating unique SNPs") +# Get list with all different polymorphisms +unique.snps <- unique(variants$VARIANT_NAME) + 
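+# For each variant, the correlation between alternative allele frequency
+# and sampling time (days since the first sample) is tested below, and
+# p-values are adjusted for multiple testing (Benjamini-Hochberg) across
+# all tested variants.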
+
+# Create an empty dataframe to be filled
+cor.df <- data.frame(
+  variant = "",
+  coefficient = 0,
+  p.value = 0,
+  p.value.adj = 0
+) %>%
+  filter(p.value != 0)
+
+log_debug("Calculating correlation using method = {snakemake@params$cor_method} and exact p-value = {snakemake@params$cor_exact}")
+correlations <- lapply(
+  unique.snps,
+  function(snp) {
+    # Select SNP
+    df <- filter(
+      variants,
+      VARIANT_NAME == snp
+    )
+    # Perform calculation
+    test <- cor.test(
+      df$ALT_FREQ,
+      df$interval,
+      method = snakemake@params$cor_method,
+      exact = snakemake@params$cor_exact
+    )
+    # Adjust p-value
+    p.value.adj <- p.adjust(
+      test$p.value,
+      method = "BH",
+      n = length(unique.snps)
+    )
+    # Add row to dataframe
+    add_row(
+      cor.df,
+      variant = snp,
+      coefficient = test$estimate,
+      p.value = test$p.value,
+      p.value.adj = p.value.adj
+    )
+  }
+) %>%
+  bind_rows()
+
+log_info("Writing correlations table")
+write_csv(
+  correlations,
+  snakemake@output[["correlations"]]
+)
+
+log_info("Selecting variants whose allele frequency is significantly correlated with time")
+significant.variants <- correlations %>%
+  filter(p.value.adj < 0.05) %>%
+  pull(variant) %>%
+  unique()
+
+log_info("Significant: {significant.variants}")
+
+log_info("Selecting variants in positions with more than one alternative allele")
+mult.alt.variants <- variants %>%
+  select(
+    VARIANT_NAME,
+    POS
+  ) %>%
+  distinct() %>%
+  group_by(POS) %>%
+  filter(n() > 1) %>%
+  ungroup() %>%
+  pull(VARIANT_NAME) %>%
+  unique()
+
+log_info("Multi-allelic: {mult.alt.variants}")
+
+# Build selected subset to represent
+variant.selection <- unique(c(significant.variants, mult.alt.variants))
+
+log_info("Selection: {variant.selection}")
+
+log_info("Writing selected variants subset")
+write_lines(variant.selection, snakemake@output$subset)
diff --git a/workflow/scripts/report/af_time_correlation_plot.R b/workflow/scripts/report/af_time_correlation_plot.R
new file mode 100644
index 0000000..42f5b75
--- /dev/null
+++ b/workflow/scripts/report/af_time_correlation_plot.R
@@ -0,0 +1,57 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(ggplot2)
+library(ggrepel)
+library(logger)
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+log_info("Reading correlation data")
+correlations <- read_csv(snakemake@input$correlations)
+
+log_info("Plotting correlation coefficients and p-values for each variant")
+p <- correlations %>%
+  mutate(
+    trans.p = -log10(p.value.adj),
+    label = ifelse(p.value.adj < 0.05, variant, NA)
+  ) %>%
+  ggplot() +
+  aes(
+    x = coefficient,
+    y = trans.p
+  ) +
+  geom_text_repel(aes(label = label), max.overlaps = 10000, direction = "x") +
+  geom_point(
+    data = function(x) subset(x, !is.na(label)),
+    color = "orange",
+    size = 2
+  ) +
+  geom_point(size = 2, shape = 1) +
+  xlim(c(-1, 1)) +
+  geom_hline(
+    aes(yintercept = -log10(0.05)),
+    linetype = 2,
+    color = "orange"
+  ) +
+  labs(
+    x = "Correlation coefficient",
+    y = "-log10(p-value)"
+  )
+
+ggsave(
+  filename = snakemake@output$plot,
+  plot = p,
+  width = snakemake@params$plot_width_mm,
+  height = snakemake@params$plot_height_mm,
+  units = "mm",
+  dpi = 250
+)
diff --git a/workflow/scripts/report/af_trajectory_panel_plot.R b/workflow/scripts/report/af_trajectory_panel_plot.R
new file mode 100644
index 0000000..61e7190
--- /dev/null
+++ b/workflow/scripts/report/af_trajectory_panel_plot.R
@@ -0,0 +1,78 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+set.seed(snakemake@params$random_color_seed)  # seed for sampling colors
+
+library(dplyr)
+library(readr)
+library(ggplot2)
+library(glue)
+library(logger)
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+log_info("Reading formatted variants table")
+variants <- read_csv(snakemake@input$fmt_variants)
+
+log_info("Reading subset of variants to represent")
+selected.variants <- read_lines(snakemake@input$subset)
+
+# Set plot height depending on the number of SNPs and the configured number of plot columns
+plot.rows <- ceiling(
+  length(selected.variants) / snakemake@params$n_plot_columns
+)
+plot.height <- max(100, plot.rows * snakemake@params$plot_row_height_mm)
+log_debug("Setting total plot height to {plot.height} mm with {plot.rows} rows")
+
+log_info("Plotting allele frequency trajectories over time for {length(selected.variants)} SNPs")
+selected.colors <- sample(TRAJECTORY.PANEL.COLORS, length(selected.variants))
+log_debug("Selected colors: {selected.colors}")
+p <- variants %>%
+  filter(VARIANT_NAME %in% selected.variants) %>%
+  mutate(
+    gPOS = paste0("g.", POS),
+    gPOS = reorder(gPOS, POS)
+  ) %>%
+  ggplot() +
+  aes(
+    x = interval,
+    y = ALT_FREQ,
+    color = reorder(glue("{gPOS}\n{VARIANT_NAME}"), POS)
+  ) +
+  scale_color_manual(values = selected.colors) +
+  geom_point() +
+  geom_line() +
+  theme(
+    legend.position = "bottom",
+    legend.text = element_text(size = 9),
+    legend.title = element_blank(),
+    legend.spacing.y = unit(3, "mm")
+  ) +
+  labs(
+    x = "Days since first sample",
+    y = "Frequency"
+  ) +
+  guides(color = guide_legend(ncol = 3))
+
+if (length(selected.variants) > 1) {
+  p <- p +
+    facet_wrap(
+      vars(gPOS),
+      ncol = snakemake@params$n_plot_columns
+    )
+}
+
+ggsave(
+  filename = snakemake@output[["plot"]],
+  plot = p,
+  width = snakemake@params$plot_width_mm,
+  height = plot.height,
+  units = "mm",
+  dpi = 250
+)
diff --git a/workflow/scripts/report/allele_freq_tree_data.R b/workflow/scripts/report/allele_freq_tree_data.R
new file mode 100644
index 0000000..9b20ce1
--- /dev/null
+++ b/workflow/scripts/report/allele_freq_tree_data.R
@@ -0,0 +1,56 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(tibble)
+library(data.table)
+library(ape)
+library(logger)
+
+fix_negative_edge_length <- function(nj.tree) {
+  edge_infos <- cbind(
+    nj.tree$edge,
+    nj.tree$edge.length
+  ) %>%
+    as.data.table()
+  colnames(edge_infos) <- c("from", "to", "length")
+  nega_froms <- edge_infos[length < 0, sort(unique(from))]
+  for (nega_from in nega_froms) {
+    minus_length <- edge_infos[from == nega_from, ][order(length)][1, length]
+    edge_infos[from == nega_from, length := length - minus_length]
+    edge_infos[to == nega_from, length := length + minus_length]
+  }
+  nj.tree$edge.length <- edge_infos$length
+  nj.tree
+}
+
+log_threshold(INFO)
+
+matrix <- read_csv(snakemake@input$dist)
+
+# Tree construction
+if (snakemake@params$use_bionj) {
+  log_info("Constructing BIONJ tree based on distances")
+  tree_method <- bionj
+} else {
+  log_info("Constructing NJ tree based on distances")
+  tree_method <- nj
+}
+tree <- matrix %>%
+  column_to_rownames(var = "...1") %>%
+  as.matrix() %>%
+  as.dist() %>%
+  tree_method() %>%
+  root(snakemake@params$outgroup_id, resolve.root = TRUE)
+
+# Resolve possible negative edge lengths in tree
+log_info("Resolving negative edge lengths")
+tree <- fix_negative_edge_length(tree)
+
+log_info("Saving tree")
+write.tree(tree, snakemake@output$tree)
diff --git a/workflow/scripts/report/allele_freq_tree_plot.R b/workflow/scripts/report/allele_freq_tree_plot.R
new file mode 100644
index 0000000..2a08851
--- /dev/null
+++ b/workflow/scripts/report/allele_freq_tree_plot.R
@@ -0,0 +1,89 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(ape)
+library(ggplot2)
+library(ggtree)
+library(logger)
+
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+log_info("Building tree annotation")
+
+# Read metadata
+log_debug("Reading metadata")
+metadata <- read_csv(snakemake@input$metadata)
+
+# Extract names from records
+log_debug("Reading FASTA")
+study_records <- read.dna(
+  snakemake@input$study_fasta,
+  format = "fasta",
+  as.matrix = FALSE
+)
+log_debug("Extracting names")
+study_records <- study_records[
+  !startsWith(names(study_records), snakemake@params$outgroup_id)
+]
+study_names <- names(study_records)
+
+# Obtain sample names ordered by CollectionDate
+log_debug("Sorting names by collection date")
+date_order <- metadata %>%
+  arrange(CollectionDate) %>%
+  filter(ID %in% study_names) %>%
+  pull(ID) %>%
+  unique()
+
+log_debug("Building annotation dataframe")
+tree_tiplab <- data.frame(
+  ID = date_order,
+  order = seq(1, length(date_order), 1)
+) %>%
+  rowwise() %>%
+  mutate(
+    tip_label = sprintf(
+      "(%s)-%s",
+      order,
+      ID
+    )
+  ) %>%
+  ungroup() %>%
+  add_row(
+    ID = snakemake@params$outgroup_id,
+    order = 0,
+    tip_label = snakemake@params$outgroup_id
+  )
+
+# Read distance tree and root
+log_info("Reading tree")
+tree <- read.tree(snakemake@input$tree) %>%
+  root(snakemake@params$outgroup_id, resolve.root = TRUE)
+
+# Plot
+log_info("Plotting distance tree")
+max.tip.length <- max(
+  node.depth.edgelength(tree)[seq_along(tree$tip.label)]
+)
+p <- ggtree(tree) %<+% tree_tiplab +
+  geom_tiplab(aes(label = tip_label)) +
+  geom_treescale(1.1 * max.tip.length) +
+  geom_rootedge(0.05 * max.tip.length)
+
+ggsave(
+  filename = snakemake@output$plot,
+  plot = p,
+  width = snakemake@params$plot_width_mm,
+  height = snakemake@params$plot_height_mm,
+  units = "mm",
+  dpi = 250
+)
diff --git a/workflow/scripts/report/context_phylogeny_data.R b/workflow/scripts/report/context_phylogeny_data.R
new file mode 100644
index 0000000..a23587d
--- /dev/null
+++ b/workflow/scripts/report/context_phylogeny_data.R
@@ -0,0 +1,116 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(tibble)
+library(jsonlite)
+library(ape)
+library(logger)
+
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+# Format tree label for well supported nodes
+TREE_LEGEND_NAMES["boot_alrt_pass"] <- sprintf(
+  TREE_LEGEND_NAMES["boot_alrt_pass"],
+  100 * snakemake@params[["boot_th"]], "%",
+  100 * snakemake@params[["alrt_th"]], "%"
+)
+
+log_info("Reading tree")
+tree_ml <- read.tree(snakemake@input[["tree"]]) %>%
+  root(
snakemake@params[["ref_name"]], + resolve.root = TRUE + ) +log_debug("Read tree with {length(tree_ml$node.label)} labels") + +log_info("Reading target names from FASTA") +target_names <- read.dna( + snakemake@input[["target_fasta"]], + format = "fasta", + as.matrix = FALSE, +) +log_debug("Read {length(target_names)} records") + +log_info("Processing target names") +target_names <- target_names[ + !startsWith(names(target_names), snakemake@config[["ALIGNMENT_REFERENCE"]]) +] +target_names <- names(target_names) +log_debug("{length(target_names)} records remaining after processing") + +# ML tree with context data +# Internal nodes color +# Node labels contain SH-aLRT/UFboot values +log_info("Reading support values from labels") +labels <- strsplit(tree_ml$node.label, "/") +aLRT.values <- sapply( + labels, + function(x) as.numeric(x[1]) +) +bootstrap.values <- sapply( + labels, + function(x) as.numeric(x[2]) +) + +log_info("Calculating support mask for the given thresholds") +aLRT.mask <- aLRT.values >= snakemake@params[["alrt_th"]] +boot.mask <- bootstrap.values >= snakemake@params[["boot_th"]] + +# MRCA for target samples +log_info("Calculating MRCA of target samples") +target.mrca <- getMRCA(tree_ml, target_names) +target.mrca.clade <- extract.clade(tree_ml, target.mrca) +target.mrca.clade.ntips <- Ntip(target.mrca.clade) + +log_info("Building M-L tree annotation") +tree_ml.ntips <- length(tree_ml$tip.label) +tree_ml.nodes <- tree_ml$Nnode + tree_ml.ntips +tree_ml.labels <- tree_ml$tip.label[1:tree_ml.nodes] +tree_ml.node.pass <- c(rep(FALSE, tree_ml.ntips), aLRT.mask & boot.mask) + +ml.tree.annot <- tibble( + node = 1:tree_ml.nodes, +) %>% + mutate( + Class = case_when( + tree_ml.labels %in% target_names ~ TREE_LEGEND_NAMES["tip_label"], + tree_ml.node.pass ~ TREE_LEGEND_NAMES["boot_alrt_pass"], + TRUE ~ NA + ) + ) + +# Write output files +log_info("Writing tree annotation") +write_csv(ml.tree.annot, snakemake@output$annotation) + +log_info("Writing JSON data") +target.node <- tree_ml$node.label[target.mrca - length(tree_ml$tip.label)] +list( + "boot" = strsplit(target.node, "/")[[1]][2] %>% as.numeric(), + "alrt" = strsplit(target.node, "/")[[1]][1] %>% as.numeric(), + "monophyly" = ifelse( + is.monophyletic(tree_ml, target_names), + "are", + "are not" + ), + "target_mrca" = target.mrca, + "clade_tips" = target.mrca.clade.ntips, + "max_tip_length" = max(node.depth.edgelength(tree_ml)[ + 1:length(tree_ml$tip.label) + ]), + "root" = snakemake@params[["ref_name"]] +) %>% + write_json( + snakemake@output$json, + auto_unbox = TRUE, + digits = NA + ) diff --git a/workflow/scripts/report/context_phylogeny_plot.R b/workflow/scripts/report/context_phylogeny_plot.R new file mode 100644 index 0000000..742bb24 --- /dev/null +++ b/workflow/scripts/report/context_phylogeny_plot.R @@ -0,0 +1,71 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(dplyr) +library(readr) +library(jsonlite) +library(ape) +library(ggplot2) +library(ggtree) +library(logger) + +log_threshold(INFO) + +# Import file with plots style +source(snakemake@params[["design"]]) + +# Format tree label for well supported nodes +TREE_LEGEND_NAMES["boot_alrt_pass"] <- sprintf( + TREE_LEGEND_NAMES["boot_alrt_pass"], + 100 * snakemake@params[["boot_th"]], "%", + 100 * snakemake@params[["alrt_th"]], "%" +) + +log_info("Reading JSON data") +json <- read_json(snakemake@input$json) + +log_info("Reading tree") +tree_ml 
<- read.tree(snakemake@input[["tree"]]) %>% + root( + json$root, + resolve.root = TRUE + ) + +log_info("Reading tree annotation") +annotation <- read_csv(snakemake@input$annotation) + +log_info("Plotting M-L tree with context samples") +p <- ggtree(tree_ml, layout = "circular") %<+% + annotation + + geom_highlight(node = json$target_mrca, colour = "orange", alpha = 0) + + geom_point(aes(color = Class, size = Class)) + + geom_treescale(1.05 * json$max_tip_length) + + geom_rootedge(0.05 * json$max_tip_length) + + xlim(-json$max_tip_length / 3, NA) + + scale_color_manual( + values = setNames( + TREE_PALETTE[names(TREE_LEGEND_NAMES)], + TREE_LEGEND_NAMES + ), + na.translate = FALSE + ) + + scale_size_manual( + values = setNames( + TREE_NODE_SIZE[names(TREE_LEGEND_NAMES)], + TREE_LEGEND_NAMES + ), + na.translate = FALSE + ) + +ggsave( + filename = snakemake@output[["plot"]], + plot = p, + width = snakemake@params[["plot_width_mm"]], + height = snakemake@params[["plot_height_mm"]], + units = "mm", + dpi = 250 +) diff --git a/workflow/scripts/report/demix_plot.R b/workflow/scripts/report/demix_plot.R new file mode 100644 index 0000000..1cf1c3d --- /dev/null +++ b/workflow/scripts/report/demix_plot.R @@ -0,0 +1,56 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(dplyr) +library(readr) +library(ggplot2) +library(logger) + +log_threshold(INFO) + +# Import file with plots style +source(snakemake@params[["design"]]) + +# Read inputs +log_info("Plotting") +p <- read_csv(snakemake@input$data) %>% + mutate( + sample = reorder(sample, CollectionDate), + lineage = ifelse(is_main, lineage, NA) + ) %>% + ggplot() + + aes( + x = sample, + y = abundance, + fill = lineage + ) + + scale_fill_viridis_d( + na.value = "gray50", + labels = function(x) { + ifelse(is.na(x), "Other", x) + } + ) + + geom_col() + + theme( + axis.text.x = element_text(angle = 60, hjust = 1), + legend.position = "bottom" + ) + + labs( + x = "Sample", + y = "Relative abundance", + fill = "Lineage" + ) + +log_info("Saving plot") +ggsave( + filename = snakemake@output[["plot"]], + plot = p, + width = snakemake@params$plot_width_mm, + height = snakemake@params$plot_height_mm, + units = "mm", + dpi = 250 +) diff --git a/workflow/scripts/report/demix_plot_data.R b/workflow/scripts/report/demix_plot_data.R new file mode 100644 index 0000000..cb0c08b --- /dev/null +++ b/workflow/scripts/report/demix_plot_data.R @@ -0,0 +1,58 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(dplyr) +library(readr) +library(logger) + +log_threshold(INFO) + +# Read inputs +demix <- read_csv(snakemake@input[["summary_demixing"]]) + +# Data processing +log_info("Obtaining main lineages") +main_lineages <- demix %>% + group_by(sample) %>% + top_n(1, abundance) %>% + ungroup() %>% + pull(lineage) %>% + unique() + +# Obtain sample names ordered by CollectionDate +log_info("Sorting dates") +metadata <- read_csv(snakemake@input[["metadata"]]) +date_order <- metadata %>% + arrange(CollectionDate) %>% + filter(ID %in% demix$sample) %>% + pull(ID) %>% + unique() + +# Build plot data +log_info("Building plot data") +plot.data <- demix %>% + group_by(lineage, sample) %>% + summarize(abundance = sum(abundance)) %>% + ungroup() %>% + left_join( + select( + metadata, + ID, + CollectionDate + ), + by = c("sample" = "ID") + 
) %>%
+  mutate(
+    is_main = lineage %in% main_lineages
+  )
+
+# Save plot data
+log_info("Saving plot data")
+write_csv(
+  plot.data,
+  snakemake@output[["data"]]
+)
diff --git a/workflow/scripts/report/diversity_data.R b/workflow/scripts/report/diversity_data.R
new file mode 100644
index 0000000..9ca5918
--- /dev/null
+++ b/workflow/scripts/report/diversity_data.R
@@ -0,0 +1,116 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+set.seed(snakemake@params$seed)
+
+library(ape)
+library(pegas)
+library(future)
+library(future.apply)
+library(readr)
+library(jsonlite)
+library(logger)
+
+log_threshold(INFO)
+
+# Pi calculation
+nucleotide.diversity <- function(dna_object, record.names, sample.size) {
+  sample <- sample(record.names, sample.size, replace = FALSE)
+  dna_subset <- dna_object[record.names %in% sample]
+  nuc.div(dna_subset)
+}
+
+# Parallel bootstrapping
+boot.nd.parallel <- function(aln, sample.size, reps) {
+  record.names <- names(aln)
+  future_sapply(
+    1:reps,
+    function(x) {
+      nucleotide.diversity(aln, record.names, sample.size)
+    },
+    future.seed = TRUE
+  )
+}
+
+# Read outgroup/context alignment
+log_info("Reading context")
+gene_ex <- read.dna(
+  snakemake@input[["context_fasta"]],
+  format = "fasta",
+  as.matrix = FALSE
+)
+gene_ex <- gene_ex[
+  !startsWith(names(gene_ex), snakemake@params$aln_reference)
+]
+
+# Read target (study) alignment
+log_info("Reading target alignment")
+study_aln <- read.dna(
+  snakemake@input[["study_fasta"]],
+  format = "fasta",
+  as.matrix = FALSE
+)
+study_aln <- study_aln[
+  !startsWith(names(study_aln), snakemake@params$aln_reference)
+]
+
+# Diversity value for our samples
+log_info("Calculating diversity value for studied samples")
+diversity <- nuc.div(study_aln)
+
+# Perform bootstrap
+log_info("Performing calculation for nucleotide diversity in context samples")
+plan(multisession, workers = snakemake@threads)
+divs <- boot.nd.parallel(
+  gene_ex,
+  length(study_aln),
+  snakemake@params[["bootstrap_reps"]]
+)
+plan(sequential)
+
+# Test normality
+log_info("Normality test for nucleotide diversity values")
+st <- shapiro.test(divs)
+
+# Calculate p-value (assuming normal distribution)
+log_info("Calculating p-value (assuming normal distribution)")
+test <- t.test(
+  divs,
+  alternative = "greater",
+  mu = diversity,
+  conf.level = 0.95
+)
+pvalue.norm <- test$p.value
+
+# Estimate p-value empirically
+log_info("Estimating p-value empirically")
+empirical.probs <- ecdf(divs)
+pvalue.emp <- empirical.probs(diversity)
+
+# Data for JSON file
+log_info("Building JSON data")
+p.value <- ifelse(st$p.value >= 0.05, pvalue.norm, pvalue.emp)
+list.div <- list(
+  "diversity" = diversity,
+  "p.value" = ifelse(p.value >= 0.001, p.value, "< 0.001"),
+  "normal.pvalue" = ifelse(st$p.value >= 0.001, st$p.value, "< 0.001"),
+  "norm.text" = ifelse(st$p.value >= 0.05, "", "not"),
+  "type.test" = ifelse(st$p.value >= 0.05, "", "empirical"),
+  "boot.reps" = snakemake@params[["bootstrap_reps"]],
+  "sample.size" = length(study_aln)
+)
+
+log_info("Writing diversity distribution")
+write_lines(divs, snakemake@output[["divs"]])
+
+log_info("Writing results JSON")
+write_json(
+  list.div,
+  snakemake@output[["json"]],
+  auto_unbox = TRUE,
+  digits = NA # maximum precision
+)
diff --git a/workflow/scripts/report/diversity_plot.R b/workflow/scripts/report/diversity_plot.R
index 3ad0131..0c42118 100644
--- 
a/workflow/scripts/report/diversity_plot.R +++ b/workflow/scripts/report/diversity_plot.R @@ -5,145 +5,57 @@ log <- file(snakemake@log[[1]], open = "wt") sink(log, type = "message") sink(log, type = "output") -library(ape) -library(pegas) -library(future.apply) -library(tidyverse) +library(dplyr) +library(readr) +library(ggplot2) library(jsonlite) library(logger) -log_threshold(INFO) - -# Pi calculation -nucleotide.diversity <- function(dna_object, record.names, sample.size) { - sample <- sample(record.names, sample.size, replace = FALSE) - dna_subset <- dna_object[record.names %in% sample] - nuc.div(dna_subset) -} -# Parallel bootstrapping -boot.nd.parallel <- function(aln, sample.size = 12, reps = 100) { - record.names <- names(aln) - future_sapply( - 1:reps, - function(x) { - nucleotide.diversity(aln, record.names, sample.size) - }, - future.seed = TRUE - ) -} +log_threshold(INFO) # Import file with plots style source(snakemake@params[["design"]]) -# Outgroup/context alignment -gene_ex <- read.dna( - snakemake@input[["context_fasta"]], - format = "fasta", - as.matrix = FALSE -) -gene_ex <- gene_ex[ - !startsWith(names(gene_ex), snakemake@config[["ALIGNMENT_REFERENCE"]]) -] - -# Study alignment -study_aln <- read.dna( - snakemake@input[["study_fasta"]], - format = "fasta", - as.matrix = FALSE -) -study_aln <- study_aln[ - !startsWith(names(study_aln), snakemake@config[["ALIGNMENT_REFERENCE"]]) -] - -# Diversity value for our samples -log_info("Calculating diversity value for studied samples") -diversity <- nuc.div(study_aln) - - -# Perform bootstrap -log_info("Performing calculation for nucleotide diversity in context samples") -plan(multisession, workers = snakemake@threads) -divs <- boot.nd.parallel( - gene_ex, - length(study_aln), - snakemake@params[["bootstrap_reps"]] -) -plan(sequential) - -# Test normality -log_info("Normality test for nucleotide diversity values") -st <- shapiro.test(divs) - -# Calculate p-value (assuming normal distribution) - -log_info("Calculating p-value (assuming normal distribution)") - -test <- t.test( - divs, - alternative = "greater", - mu = diversity, - conf.level = 0.95 -) -pvalue.norm <- test$p.value - -# Estimate p-value empirically -log_info("Estimating p-value empirically") -empirical.probs <- ecdf(divs) -pvalue.emp <- empirical.probs(diversity) +# Read data +log_info("Reading diversity results") +divs <- read_lines(snakemake@input$divs) %>% + as.numeric() +json <- read_json(snakemake@input$json) # Plot and save -log_info("Plotting diversity plot") +log_info("Plotting") p <- data.frame(pi = divs) %>% ggplot() + geom_density( aes(x = pi), - fill = "#fcbf49", + fill = DIVERSITY_PALETTE["density_fill"], alpha = 0.7, bw = 0.000001, - color = "#eae2b7" - ) + - geom_vline( - aes(xintercept = diversity), - color = "#d62828" + linewidth = 0.5, + color = DIVERSITY_PALETTE["density_color"] ) + stat_function( fun = dnorm, args = list(mean = mean(divs), sd = sd(divs)), - color = "#f77f00" + color = DIVERSITY_PALETTE["dnorm_color"], + linewidth = 1 + ) + + geom_vline( + xintercept = json$diversity, + color = DIVERSITY_PALETTE["value_color"], + linewidth = 1 ) + labs( x = "π", y = "Density" - ) + ) + + xlim(0, NA) ggsave( - filename = snakemake@output[["fig"]], + filename = snakemake@output[["plot"]], plot = p, - width = snakemake@params[["plot_width"]], - height = snakemake@params[["plot_height"]], + width = snakemake@params[["plot_width_mm"]], + height = snakemake@params[["plot_height_mm"]], units = "mm", dpi = 250 ) - -# DATA JSON ##### -p.value <- 
ifelse(st$p.value >= 0.05, pvalue.norm, pvalue.emp) - -list.div <- list( - "diversity" = format(diversity, scientific = TRUE), - "p.value" = ifelse(p.value >= 0.001, p.value, "< 0.001"), - "normal.pvalue" = ifelse(st$p.value >= 0.001, p.value, "< 0.001"), - "norm.text" = ifelse(st$p.value >= 0.05, "", "not"), - "type.test" = ifelse(st$p.value >= 0.05, "", "empirical"), - "boot.reps" = snakemake@params[["bootstrap_reps"]], - "sample.size" = length(study_aln) -) - -json <- toJSON(list.div) -write(json, snakemake@output[["json"]]) - -# PLOT TABLES -data.frame( - pi = divs, - prop.value = diversity -) %>% - write.csv(snakemake@output[["table"]], row.names = FALSE) diff --git a/workflow/scripts/report/dnds_plots.R b/workflow/scripts/report/dnds_plots.R new file mode 100644 index 0000000..62b2e41 --- /dev/null +++ b/workflow/scripts/report/dnds_plots.R @@ -0,0 +1,89 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(dplyr) +library(readr) +library(tidyr) +library(ggplot2) +library(logger) + +log_threshold(INFO) + +# Import file with plots style +source(snakemake@params[["design"]]) + +# Read inputs +log_info("Reading plot data") +plot.df <- read_delim(snakemake@input$table) %>% + filter(w != Inf) %>% + pivot_longer( + c("dN", "dS", "w"), + values_to = "value", + names_to = "ratio" + ) + +log_info("Plotting dN and dS") +p.dn.ds <- plot.df %>% + filter(ratio != "w") %>% + ggplot() + + aes( + x = day, + y = value, + color = ratio, + shape = ratio + ) + + geom_point(size = 3) + + geom_line() + + scale_color_manual( + name = "", + labels = DNDS_LABELS, + values = DNDS_COLORS + ) + + scale_shape_manual( + name = "", + values = DNDS_SHAPES, + labels = DNDS_LABELS + ) + + labs( + y = "Substitution rate", + x = "Days since the initial sampling", + color = "" + ) + +ggsave( + filename = snakemake@output$plot_dn_ds, + plot = p.dn.ds, + width = snakemake@params$plot_width_mm, + height = snakemake@params$plot_height_mm, + units = "mm", + dpi = 250 +) + +log_info("Plotting w (dN/dS)") +p.omega <- plot.df %>% + filter(ratio == "w") %>% + ggplot() + + aes( + x = day, + y = value, + ) + + geom_point(color = "black") + + geom_line(color = "black") + + labs( + y = "w (dN/dS)", + x = "Days since the initial sampling", + color = "" + ) + +ggsave( + filename = snakemake@output$plot_omega, + plot = p.omega, + width = snakemake@params$plot_width_mm, + height = snakemake@params$plot_height_mm, + units = "mm", + dpi = 250 +) diff --git a/workflow/scripts/report/evo_plots.R b/workflow/scripts/report/evo_plots.R deleted file mode 100644 index e0ed984..0000000 --- a/workflow/scripts/report/evo_plots.R +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env Rscript - -# Write stdout and stderr to log file -log <- file(snakemake@log[[1]], open = "wt") -sink(log, type = "message") -sink(log, type = "output") - -library(tidyverse) -library(logger) -log_threshold(INFO) - -# Import file with plots style -source(snakemake@params[["design"]]) - -# Read inputs -vcf <- read_delim(snakemake@input[["vcf"]]) -metadata <- read_delim(snakemake@input[["metadata"]]) -N_S_position <- read_delim(snakemake@input[["N_S"]]) - -# DATA PROCESSING -# Create SNP variable and select useful variables -vcf <- vcf %>% - dplyr::select( - variant, - REGION, - ALT_FREQ, - GFF_FEATURE, - synonimous, - POS - ) - -# Create variable for days sins first sample in metadata -metadata <- metadata %>% - mutate( - interval = as.numeric( - 
as.Date(CollectionDate) - min(as.Date(CollectionDate))
-    )
-  ) %>%
-  select(ID, interval) %>%
-  rename(REGION = ID)
-
-vcf <- left_join(vcf, metadata)
-
-# PLOT
-log_info("Ploting dN and dS over time")
-plot_df <- vcf %>%
-  group_by(REGION, synonimous) %>%
-  summarise(
-    Freq = sum(ALT_FREQ, na.rm = TRUE)
-  ) %>%
-  pivot_wider(
-    names_from = synonimous,
-    values_from = Freq,
-    values_fill = 0
-  ) %>%
-  transmute(
-    dn = No / sum(N_S_position$N),
-    ds = Yes / sum(N_S_position$S)
-  ) %>%
-  ungroup() %>%
-  mutate(
-    w = dn / ds,
-  ) %>%
-  filter(w != Inf) %>%
-  pivot_longer(
-    c("dn", "ds", "w"),
-    values_to = "value",
-    names_to = "d"
-  ) %>%
-  left_join(unique(select(vcf, REGION, interval)))
-
-plot <- plot_df %>%
-  filter(d != "w") %>%
-  ggplot() +
-  aes(
-    x = interval,
-    y = value,
-    color = d,
-    shape = d
-  ) +
-  geom_point(size = 3) +
-  geom_line() +
-  scale_color_manual(
-    name = "Parameter",
-    labels = dnds.labels,
-    values = dnds.colors
-  ) +
-  scale_shape_manual(
-    name = "Parameter",
-    values = dnds.shapes,
-    labels = dnds.labels
-  ) +
-  labs(
-    y = "Substitution rate",
-    x = "Days since the initial sampling",
-    color = "Parameter"
-  )
-
-ggsave(
-  filename = snakemake@output[["plot"]],
-  plot = plot,
-  width = 159.2,
-  height = 119.4,
-  units = "mm",
-  dpi = 250
-)
-
-# Plot for omega
-
-plot_omega <- plot_df %>%
-  filter(d == "w") %>%
-  ggplot() +
-  aes(
-    x = interval,
-    y = value,
-  ) +
-  geom_point(color = "black") +
-  geom_line(color = "black") +
-  labs(
-    y = "w (dN/dS)",
-    x = "Days since the initial sampling",
-    color = "Parameter"
-  )
-
-ggsave(
-  filename = snakemake@output[["plot_omega"]],
-  plot = plot_omega,
-  width = 159.2,
-  height = 119.4,
-  units = "mm",
-  dpi = 250
-)
-
-# PLOT TABLES
-log_info("Saving plot table")
-vcf %>%
-  group_by(REGION, synonimous) %>%
-  summarise(
-    Freq = sum(ALT_FREQ, na.rm = TRUE)
-  ) %>%
-  pivot_wider(
-    names_from = synonimous,
-    values_from = Freq,
-    values_fill = 0
-  ) %>%
-  transmute(
-    dn = No / sum(N_S_position$N),
-    ds = Yes / sum(N_S_position$S)
-  ) %>%
-  ungroup() %>%
-  left_join(unique(select(vcf, REGION, interval))) %>%
-  transmute(
-    sample = REGION,
-    DaysSinceFirst = interval,
-    dN = dn,
-    dS = ds,
-    w = dn / ds
-  ) %>%
-  write.csv(snakemake@output[["table"]], row.names = FALSE)
diff --git a/workflow/scripts/report/extract_genbank_regions.py b/workflow/scripts/report/extract_genbank_regions.py
new file mode 100644
index 0000000..61f3a33
--- /dev/null
+++ b/workflow/scripts/report/extract_genbank_regions.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+
+import logging
+import json
+
+from Bio import SeqIO
+from Bio.SeqFeature import ExactPosition
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"],
+        level=logging.INFO
+    )
+
+    logging.info("Reading GenBank file")
+    gb = SeqIO.read(snakemake.input.gb, format="gb")
+
+    logging.info("Calculating CDS regions")
+    regions = {}
+    for feature in gb.features:
+        identifier = "|".join(feature.qualifiers.get(snakemake.params.gb_qualifier, []))
+        if identifier == "":
+            logging.error(f"Feature at {feature.location} has no qualifier '{snakemake.params.gb_qualifier_display}'")
+        elif identifier in regions:
+            logging.warning(f"Identifier '{identifier}' is already among coding records and will not be replaced by feature at {feature.location}")
+        else:
+            logging.debug(f"Adding feature '{identifier}'")
+            if type(feature.location.start) is not ExactPosition or type(feature.location.end) is not ExactPosition:
+                logging.warning(f"Feature 
'{identifier}' location is not exact but will be treated as such: {feature.location}") + regions[identifier] = ( + int(feature.location.start.real + 1), + int(feature.location.end.real) + ) + + logging.info("Calculating intergenic regions") + cds_names = tuple(regions.keys()) + intergenic_count = 0 + for i in range(1, len(cds_names)): + start1, end1 = regions[cds_names[i-1]] + start2, end2 = regions[cds_names[i]] + if (start2 - end1) > 1: + intergenic_count += 1 + regions[f"Intergenic_{intergenic_count}"] = (end1 + 1, start2 - 1) + + logging.info("Writing regions to JSON file") + with open(snakemake.output.regions, "w") as fw: + json.dump(regions, fw, indent=2) diff --git a/workflow/scripts/report/freyja_plot.R b/workflow/scripts/report/freyja_plot.R deleted file mode 100644 index e698091..0000000 --- a/workflow/scripts/report/freyja_plot.R +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env Rscript - -# Write stdout and stderr to log file -log <- file(snakemake@log[[1]], open = "wt") -sink(log, type = "message") -sink(log, type = "output") - -library(tidyverse) -library(logger) -log_threshold(INFO) - -# Import file with plots style -source(snakemake@params[["design"]]) - -# Read inputs -demix <- read_csv(snakemake@input[["summary_demixing"]]) - -# DATA PROCESSING -log_info("Obtaining main lineages") -main_lineages <- demix %>% - group_by(sample) %>% - top_n(1, abundances) %>% - ungroup() %>% - pull(lineages) %>% - unique() - -# Obtain sample names ordered by CollectionDate -metadata <- read_csv(snakemake@input[["metadata"]]) -date_order <- metadata %>% - arrange(CollectionDate) %>% - filter(ID %in% demix$sample) %>% - pull(ID) %>% - unique() - -# PLOT -log_info("Plotting summary demixing") -demix_plot <- demix %>% - mutate( - lineages = case_when( - lineages %in% main_lineages ~ lineages - ) - ) %>% - group_by(lineages, sample) %>% - mutate( - abundances = sum(abundances) - ) %>% - unique() %>% - ggplot() + - aes( - x = factor(sample, date_order), - y = as.numeric(abundances), - fill = lineages - ) + - scale_fill_viridis_d( - na.value = "gray50", - labels = function(x) { - ifelse(is.na(x), "Other", x) - } - ) + - geom_col() + - theme( - axis.text.x = element_text(angle = 60, hjust = 1), - legend.position = "bottom" - ) + - labs( - x = "Sample", - y = "Relative abundance", - fill = "Lineage" - ) - -ggsave( - filename = snakemake@output[["fig"]], - plot = demix_plot, - width = 159.2, - height = 119.4, - units = "mm", - dpi = 250 -) - - -# PLOT TABLES -log_info("Saving plot table") -demix %>% - mutate( - lineages = case_when( - lineages %in% main_lineages ~ lineages, - TRUE ~ "Other" - ) - ) %>% - group_by(sample, lineages) %>% - summarise(abundances = sum(abundances)) %>% - ungroup() %>% - left_join( - select( - metadata, - ID, - CollectionDate - ), - by = c("sample" = "ID") - ) %>% - write.csv(snakemake@output[["table"]], row.names = FALSE) diff --git a/workflow/scripts/report/get_annotation.py b/workflow/scripts/report/get_annotation.py deleted file mode 100644 index 552d70d..0000000 --- a/workflow/scripts/report/get_annotation.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python3 - -import json -import logging -import pandas as pd -from gb2seq.features import Features -from Bio import SeqIO - - -def main(): - logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO) - - logging.info("Reading features") - ft = Features(snakemake.input.gb) - with open(snakemake.input.features) as f: - feature_key = json.load(f) - - logging.info("Reading 
reference") - reference = str(next(SeqIO.parse(snakemake.input.ref, "fasta")).seq) - positions = [x for x in range(len(reference))] - - logging.info("Building annotation") - genes = [] - for pos in positions: - if len(ft.getFeatureNames(pos)) == 0: - genes.append("Intergenic") - else: - genes.append(feature_key[list(ft.getFeatureNames(pos))[0]]) - - logging.info("Writing table") - df = pd.DataFrame({"POS": positions, "GEN": genes}) - df.to_csv(snakemake.output.df, index= False) - - -if __name__ == "__main__": - main() diff --git a/workflow/scripts/report/heatmap.R b/workflow/scripts/report/heatmap.R deleted file mode 100644 index 08a2d73..0000000 --- a/workflow/scripts/report/heatmap.R +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env Rscript - -# Write stdout and stderr to log file -log <- file(snakemake@log[[1]], open = "wt") -sink(log, type = "message") -sink(log, type = "output") - -library(tidyverse) -library(logger) -log_threshold(INFO) - -vcf <- read_tsv(snakemake@input[["vcf"]]) - -# Obtain sample names ordered by CollectionDate -date_order <- read_csv(snakemake@input[["metadata"]]) %>% - arrange(CollectionDate) %>% - pull(ID) %>% - unique() - -# Create SNP variable and select useful variables from vcf -vcf <- vcf %>% - dplyr::select(variant, REGION, ALT_FREQ) - -vcf <- vcf %>% - pivot_wider( - names_from = variant, - values_from = ALT_FREQ, - values_fill = 0, - values_fn = sum - ) %>% - arrange(factor(REGION, levels = date_order)) %>% - column_to_rownames(var = "REGION") - -log_info("Saving table to create heatmap") -write.csv(vcf, snakemake@output[["table"]]) diff --git a/workflow/scripts/report/nv_panel_data.R b/workflow/scripts/report/nv_panel_data.R new file mode 100644 index 0000000..5cb6880 --- /dev/null +++ b/workflow/scripts/report/nv_panel_data.R @@ -0,0 +1,108 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(dplyr) +library(readr) +library(stringr) +library(tibble) +library(jsonlite) +library(logger) + +log_threshold(INFO) + +log_info("Reading variants") +variants <- read_delim( + snakemake@input$variants, + col_select = c( + "SAMPLE", + "REGION", + "POS", + "ALT", + "VARIANT_NAME", + "ALT_FREQ", + "EFFECT", + "SYNONYMOUS" + ) +) +log_debug("Read {nrow(variants)} rows") + +log_info("Reading dates from metadata") +dates <- read_delim( + snakemake@input$metadata, + col_select = c("ID", "CollectionDate") +) + +log_info("Dating variants") +variants <- left_join( + variants, dates, + by = c("SAMPLE" = "ID") +) + +log_info("Classifying variants") +variants <- variants %>% + mutate( + NV_class = ifelse( + str_detect(ALT, fixed("-")) | str_detect(ALT, fixed("+")), + "INDEL", + "SNP" + ), + Class = ifelse(EFFECT == "intergenic_region", "Intergenic", SYNONYMOUS), + POS = as.numeric(POS) + ) %>% + # rowwise() %>% + mutate( + indel_len = ifelse(NV_class == "INDEL", str_length(ALT) - 1, NA), + indel_class = case_when( + EFFECT == "intergenic_region" ~ "Intergenic", + (NV_class == "INDEL") & (indel_len %% 3 == 0) ~ "In frame", + (NV_class == "INDEL") & (indel_len %% 3 != 0) ~ "Frameshift" + ) + ) %>% + ungroup() %>% + mutate( + group = case_when( + EFFECT == "intergenic_region" ~ "Intergenic", + NV_class == "SNP" ~ Class, + NV_class == "INDEL" ~ indel_class + ) + ) %>% + filter(ALT_FREQ > 0) +log_debug("Processed {nrow(variants)} rows") + +if (nrow(variants) == 0) { + log_warning("No variants found, using an empty table") + variants <- tibble( + SAMPLE = 
+    SAMPLE = as.character(NA),
+    REGION = as.character(NA),
+    VARIANT_NAME = as.character(NA),
+    ALT_FREQ = as.numeric(NA),
+    EFFECT = as.character(NA),
+    SYNONYMOUS = as.character(NA),
+    POS = as.numeric(NA),
+    ALT = as.character(NA),
+    NV_class = as.character(NA),
+    group = as.character(NA)
+  )
+}
+
+log_info("Writing processed table")
+write_csv(variants, snakemake@output$table)
+
+log_info("Writing NV summary")
+nv_counts <- variants %>%
+  distinct(VARIANT_NAME, NV_class) %>%
+  count(NV_class) %>%
+  deframe()
+list(
+  "INDELS" = nv_counts["INDEL"],
+  "SNV" = nv_counts["SNP"]
+) %>%
+  write_json(
+    snakemake@output$json,
+    auto_unbox = TRUE,
+    digits = NA
+  )
diff --git a/workflow/scripts/report/nv_panel_plot.R b/workflow/scripts/report/nv_panel_plot.R
new file mode 100644
index 0000000..da7eca7
--- /dev/null
+++ b/workflow/scripts/report/nv_panel_plot.R
@@ -0,0 +1,167 @@
+#!/usr/bin/env Rscript
+
+# Write stdout and stderr to log file
+log <- file(snakemake@log[[1]], open = "wt")
+sink(log, type = "message")
+sink(log, type = "output")
+
+library(dplyr)
+library(readr)
+library(ggplot2)
+library(ggpubr)
+library(jsonlite)
+library(tibble)
+library(purrr)
+library(logger)
+
+log_threshold(INFO)
+
+# Import file with plots style
+source(snakemake@params[["design"]])
+
+# Subplot A: windows ============================
+log_info("Reading variant windows")
+window <- read_csv(snakemake@input$window)
+
+log_debug("Calculating X axis limits")
+xlim_values <- c(
+  max(0, min(window$position) - snakemake@params$window_step),
+  max(window$position) + snakemake@params$window_step
+)
+
+log_info("Plotting variant windows")
+p1 <- window %>%
+  ggplot() +
+  aes(
+    x = position,
+    y = fraction,
+    color = feature
+  ) +
+  geom_point() +
+  geom_line(
+    aes(group = 1),
+    colour = "black",
+    alpha = 0.3
+  ) +
+  scale_y_continuous(
+    label = scales::percent,
+    limits = c(0, max(window$fraction) + mean(window$fraction))
+  ) +
+  xlim(xlim_values) +
+  # TODO: change GENE_PALETTE to selection of TRAJECTORY.PANEL.COLORS ?
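+  # Each point above is one window, colored by the genome feature it
+  # falls in; GENE_PALETTE maps features to colors in the design file.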
+  scale_color_manual(values = GENE_PALETTE) +
+  labs(
+    y = "Proportion of\nsites with NV",
+    x = "",
+    color = "Gene"
+  )
+
+log_info("Reading regions to highlight")
+highlight <- read_csv(snakemake@input$highlight_window_regions)
+
+if (nrow(highlight) != 0) {
+  log_info("Highlighting {nrow(highlight)} regions")
+  highlighted_sites <- unique(c(highlight$start, highlight$end))
+  p1 <- p1 +
+    geom_vline(xintercept = highlighted_sites, color = "orange") +
+    geom_text(
+      data = highlight,
+      aes(
+        x = (start + end) / 2,
+        y = max(window$fraction) + mean(window$fraction) / 2,
+        label = region
+      ),
+      inherit.aes = FALSE,
+      size = 3,
+      angle = 45
+    )
} else {
+  log_info("No highlighted regions")
+}
+
+# Subplot B: variants panel ============================
+log_info("Reading panel data")
+panel <- read_csv(snakemake@input$panel)
+
+log_info("Calculating sample chronological order")
+date_order <- panel %>%
+  arrange(CollectionDate) %>%
+  pull(SAMPLE) %>%
+  unique()
+
+log_info("Reading regions")
+regions <- read_json(snakemake@input$regions, simplifyVector = TRUE) %>%
+  enframe(name = "region", value = "coords") %>%
+  mutate(
+    start = map_dbl(coords, 1),
+    end = map_dbl(coords, 2),
+    length = end - start + 1
+  ) %>%
+  select(-coords)
+
+log_info("Plotting variants")
+p2 <- panel %>%
+  ggplot() +
+  aes(
+    x = POS,
+    y = factor(SAMPLE, date_order),
+    shape = factor(NV_class, c("SNP", "INDEL")),
+    color = group,
+    alpha = ALT_FREQ
+  ) +
+  geom_point(size = 3) +
+  geom_col(
+    data = regions,
+    aes(
+      x = length,
+      y = 0.3,
+      # TODO: legend shows as NA those without match in `regions`
+      fill = factor(region, rev(regions$region))
+    ),
+    inherit.aes = FALSE,
+    width = 0.3
+  ) +
+  # TODO: change GENE_PALETTE to selection of TRAJECTORY.PANEL.COLORS ?
+  scale_fill_manual(values = GENE_PALETTE) +
+  xlim(xlim_values) +
+  scale_color_manual(
+    labels = NV_TYPE_NAMES,
+    values = NV_TYPE_PALETTE
+  ) +
+  scale_y_discrete(drop = FALSE) +
+  labs(
+    x = "Genome position",
+    y = "Sample",
+    shape = "Variant class",
+    color = "Classification",
+    alpha = "Frequency",
+    fill = "Region"
+  ) +
+  guides(
+    fill = guide_legend(reverse = TRUE)
+  )
+
+# Plot arrangement ============================
+log_info("Arranging plots")
+p <- ggarrange(
+  p1,
+  p2,
+  nrow = 2,
+  align = "v",
+  legend.grob = get_legend(p2),
+  heights = c(2, 6),
+  legend = "right",
+  labels = c("A", "B")
+)
+
+log_info("Saving plot")
+ggsave(
+  filename = snakemake@output$plot,
+  plot = p,
+  width = snakemake@params$plot_width_mm,
+  height = snakemake@params$plot_height_mm,
+  units = "mm",
+  dpi = 250
+)
diff --git a/workflow/scripts/report/nv_panel_zoom_on_feature_data.py b/workflow/scripts/report/nv_panel_zoom_on_feature_data.py
new file mode 100644
index 0000000..4989e68
--- /dev/null
+++ b/workflow/scripts/report/nv_panel_zoom_on_feature_data.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+import sys
+import logging
+import json
+
+import pandas as pd
+
+
+if __name__ == "__main__":
+
+    logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
+
+    logging.info("Reading regions")
+    with open(snakemake.input.regions) as f:
+        regions = json.load(f)
+
+    if snakemake.wildcards.region_name not in regions:
+        logging.error(f"Region {snakemake.wildcards.region_name} is absent in {snakemake.input.regions}")
+        sys.exit(1)
+
+    start, end = regions[snakemake.wildcards.region_name]
+
+    logging.info("Reading input table")
+    df = pd.read_csv(snakemake.input.table)
+
+    logging.info("Filtering sites and writing results")
+    
df[(df.POS >= start) & (df.POS <= end)].to_csv(snakemake.output.table, index=False) diff --git a/workflow/scripts/report/phylo_plots.R b/workflow/scripts/report/phylo_plots.R deleted file mode 100644 index 447d873..0000000 --- a/workflow/scripts/report/phylo_plots.R +++ /dev/null @@ -1,291 +0,0 @@ -#!/usr/bin/env Rscript - -# Write stdout and stderr to log file -log <- file(snakemake@log[[1]], open = "wt") -sink(log, type = "message") -sink(log, type = "output") - -library(tidyverse) -library(stringi) -library(ape) -library(ggtree) -library(data.table) -library(ggpubr) -library(pegas) -library(jsonlite) -library(logger) - -log_threshold(INFO) - -# Import file with plots style -source(snakemake@params[["design"]]) - -# legend thresholds for ml tree -legend.names <- c( - tip_label = "Target samples", - boot_alrt_pass = sprintf( - "UFBoot ≥ %s%s & SH-aLRT ≥ %s%s ", - snakemake@params[["boot_th"]], - "%", - snakemake@params[["alrt_th"]], - "%" - ) -) - -matrix <- read_csv(snakemake@input[["dist"]]) -metadata <- read_csv(snakemake@input[["metadata"]]) -tree_ml <- read.tree(snakemake@input[["ml"]]) %>% - root( - snakemake@params[["ref_name"]], - resolve.root = TRUE - ) - -study_names <- read.dna( - snakemake@input[["study_fasta"]], - format = "fasta", - as.matrix = FALSE, -) - -study_names <- study_names[ - !startsWith(names(study_names), snakemake@config[["ALIGNMENT_REFERENCE"]]) -] -study_names <- names(study_names) - -# Obtain sample names ordered by CollectionDate -date_order <- metadata %>% - arrange(CollectionDate) %>% - filter(ID %in% study_names) %>% - pull(ID) %>% - unique() - -tree_tiplab <- data.frame( - ID = date_order, - order = seq(1, length(date_order), 1) -) %>% - rowwise() %>% - mutate( - tip_label = sprintf( - "(%s)-%s", - order, - ID - ) - ) %>% - ungroup() %>% - add_row( - ID = snakemake@params[["ref_name"]], - order = 0, - tip_label = snakemake@params[["ref_name"]] - ) - - -# DATA PROCESSING #### -# Resolve possible negative edge lengths in n-j tree -fix_negative_edge_length <- function(nj.tree) { - edge_infos <- cbind( - nj.tree$edge, - nj.tree$edge.length - ) %>% - as.data.table - colnames(edge_infos) <- c("from", "to", "length") - - nega_froms <- edge_infos[length < 0, sort(unique(from))] - - for (nega_from in nega_froms) { - minus_length <- edge_infos[from == nega_from, ][order(length)][1, length] - edge_infos[from == nega_from, length := length - minus_length] - edge_infos[to == nega_from, length := length + minus_length] - } - nj.tree$edge.length <- edge_infos$length - nj.tree -} - -# Tree construction -if (snakemake@params$use_bionj) { - log_info("Constructing BIONJ tree based on distances") - tree_method = bionj -} else { - log_info("Constructing NJ tree based on distances") - tree_method = nj -} -tree <- matrix %>% - column_to_rownames(var = "...1") %>% - as.matrix() %>% - as.dist() %>% - tree_method() %>% - root(snakemake@params[["ref_name"]], resolve.root = TRUE) - -tree <- fix_negative_edge_length(tree) - -# Get patristic distances to ancestor from n-j tree -log_info("Getting patristic distances to ancestor from n-j tree") -tempest <- adephylo::distRoot( - tree, - "all", - method = "patristic" -) %>% - as.data.frame() %>% - rownames_to_column(var = "ID") %>% - filter(ID != snakemake@params[["ref_name"]]) %>% - rename(distance = ".") %>% - left_join( - select( - metadata, - ID, - CollectionDate - ) - ) %>% - mutate( - date_interval = as.numeric( - as.Date(CollectionDate) - min(as.Date(CollectionDate)) - ) - ) - - -# PLOTS #### -# (BIO)NJ tree -log_info("Plotting 
distance tree") -tree_plot <- ggtree(tree) %<+% - tree_tiplab + - geom_tiplab(aes(label = tip_label)) + - geom_treescale() + - geom_rootedge(0.5) + - xlim(0, max(tempest$distance) + 3.5) - -ggsave( - filename = snakemake@output[["tree"]], - plot = tree_plot, - width = snakemake@params[["plot_width_mm"]], - height = snakemake@params[["plot_height_mm"]], - units = "mm", - dpi = 250 -) - -# TempEst -log_info("Plotting temporal signal analysis") -tempest_fig <- tempest %>% - ggplot() + - aes( - x = date_interval, - y = distance - ) + - geom_smooth( - method = "lm", - fill = "gray95", - alpha = 0.6, - color = "red" - ) + - geom_point() + - labs( - y = "Root-to-tip distance", - x = "Days since the initial sampling" - ) - -ggsave( - filename = snakemake@output[["temest"]], - plot = tempest_fig, - width = snakemake@params[["plot_width_mm"]], - height = snakemake@params[["plot_height_mm"]], - units = "mm", - dpi = 250 -) - -# ML tree with context data - -# Internal nodes color -# Node labels contain SH-aLRT/UFboot values -aLRT.values <- sapply( - strsplit(tree_ml$node.label, "/"), - function(x) { - as.numeric(x[1]) - } -) - -bootstrap.values <- sapply( - strsplit(tree_ml$node.label, "/"), - function(x) { - as.numeric(x[2]) - } -) - -aLRT.mask <- aLRT.values >= snakemake@params[["alrt_th"]] -boot.mask <- bootstrap.values >= snakemake@params[["boot_th"]] - -# MRCA for target samples -log_info("Calculating MRCA of target samples") -study.mrca <- getMRCA(tree_ml, study_names) -study.mrca.clade <- extract.clade(tree_ml, study.mrca) -study.mrca.clade.ntips <- Ntip(study.mrca.clade) - -log_info("Building M-L tree annotation") -tree_ml.max.tip.length <- max(node.depth.edgelength(tree_ml)[1:length(tree$tip.label)]) -tree_ml.ntips <- length(tree_ml$tip.label) -tree_ml.nodes <- tree_ml$Nnode + tree_ml.ntips -tree_ml.labels <- tree_ml$tip.label[1:tree_ml.nodes] -tree_ml.node.pass <- c(rep(FALSE, tree_ml.ntips), aLRT.mask & boot.mask) - -ml.tree.annot <- tibble( - node = 1:tree_ml.nodes, - ) %>% - mutate( - Class = case_when( - tree_ml.labels %in% study_names ~ legend.names["tip_label"], - tree_ml.node.pass ~ legend.names["boot_alrt_pass"], - TRUE ~ NA - ) - ) - -log_info("Plotting M-L tree with context samples") -p <- ggtree(tree_ml, layout = "circular") %<+% ml.tree.annot + - geom_highlight(node = study.mrca, colour = "red", alpha = 0) + - geom_point(aes(color = Class, size = Class)) + - geom_treescale(1.05 * tree_ml.max.tip.length) + - geom_rootedge(0.05 * tree_ml.max.tip.length) + - xlim(-tree_ml.max.tip.length / 3, NA) + - scale_color_manual( - values = setNames(tree_colors[names(legend.names)], legend.names), - na.translate = FALSE - ) + - scale_size_manual( - values = setNames(node.size[names(legend.names)], legend.names), - na.translate = FALSE - ) - -ggsave( - filename = snakemake@output[["tree_ml"]], - plot = p, - width = snakemake@params[["plot_width_mm"]], - height = snakemake@params[["plot_height_mm"]], - units = "mm", - dpi = 250 -) - -# PLOT TABLES -log_info("Saving temporal signal table") -tempest %>% - transmute( - sample = ID, - Distance = distance, - CollectionDate = CollectionDate, - DaysSinceFirst = date_interval - ) %>% - write.csv(snakemake@output[["table"]], row.names = FALSE) - -# TEMPEST STATS -model <- lm(distance ~ date_interval, data = tempest) -p.value <- summary(model)$coefficients[2,4] - -# TREE STATS -study.node <- tree_ml$node.label[study.mrca - length(tree_ml$tip.label)] -monophyletic <- ifelse(is.monophyletic(tree_ml, study_names), "are", "are not") - -list( - "sub_rate" = 
model$coefficients[[2]] * 365, - "r2" = summary(model)$r.squared[[1]], - "pvalue" = ifelse(p.value < 0.001, "< 0.001", p.value), - "boot" = strsplit(study.node, "/")[[1]][2] %>% as.numeric(), - "alrt" = strsplit(study.node, "/")[[1]][1] %>% as.numeric(), - "monophyly" = monophyletic, - "clade_tips" = study.mrca.clade.ntips -) %>% - toJSON() %>% - write(snakemake@output[["json"]]) diff --git a/workflow/scripts/report/polymorphic_sites_over_time_data.R b/workflow/scripts/report/polymorphic_sites_over_time_data.R new file mode 100644 index 0000000..1100587 --- /dev/null +++ b/workflow/scripts/report/polymorphic_sites_over_time_data.R @@ -0,0 +1,76 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(readr) +library(dplyr) +library(jsonlite) +library(logger) + +log_threshold(INFO) + +log_info("Reading variants") +variants <- read_delim(snakemake@input$variants) + +log_info("Reading metadata") +metadata <- read_delim(snakemake@input$metadata) + +log_info("Calculating heterozygous sites") +sites <- variants %>% + filter(ALT_FREQ <= snakemake@params$max_alt_freq) %>% + left_join( + metadata, + by = c("SAMPLE" = "ID") + ) %>% + group_by(SAMPLE) %>% + summarise( + CollectionDate = min(as.Date(CollectionDate)), + n = n_distinct(POS) + ) %>% + ungroup() %>% + arrange(CollectionDate) %>% + mutate( + Day = as.numeric( + difftime(CollectionDate, min(CollectionDate), units = "days") + ) + ) + +if (nrow(sites) == 0) { + log_warn("No heterozygous sites found, using an empty table and no linear regression") + sites <- tibble( + SAMPLE = as.character(NA), + CollectionDate = as.Date(NA), + n = as.numeric(NA), + Day = as.numeric(NA) + ) %>% + filter(!is.na(SAMPLE)) + r_squared <- "none" + p_value_string <- "none" +} else if (nrow(sites) > 2) { + log_info("Calculating linear regression") + model <- lm(n ~ CollectionDate, data = sites) + r_squared <- summary(model)$r.squared[[1]] + p_value <- summary(model)$coefficients[2, 4] + p_value_string <- ifelse(p_value < 0.001, "< 0.001", p_value) +} else { + log_warn("Not enough data points for a linear regression") + r_squared <- "none" + p_value_string <- "none" +} + +log_info("Writing JSON summary") +list( + "r2" = r_squared, + "value" = p_value_string +) %>% + write_json( + snakemake@output$json, + auto_unbox = TRUE, + digits = NA + ) + +log_info("Writing processed table") +write_csv(sites, snakemake@output$table) diff --git a/workflow/scripts/report/polymorphic_sites_over_time_plot.R b/workflow/scripts/report/polymorphic_sites_over_time_plot.R new file mode 100644 index 0000000..2596195 --- /dev/null +++ b/workflow/scripts/report/polymorphic_sites_over_time_plot.R @@ -0,0 +1,45 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(readr) +library(dplyr) +library(ggplot2) +library(logger) + +log_threshold(INFO) + +# Import file with plots style +source(snakemake@params[["design"]]) + +log_info("Reading plot data") +df <- read_csv(snakemake@input$table) + +log_info("Plotting") +p <- df %>% + ggplot() + + aes(x = Day, y = n) + + geom_smooth( + method = "lm", + fill = "gray95", + alpha = 0.6, + colour = "orange" + ) + + geom_point(size = 2, shape = 1) + + labs( + x = "Days since 
the initial sampling", + y = "No. of polimorphic sites" + ) + +log_info("Saving plot") +ggsave( + filename = snakemake@output$plot, + plot = p, + width = snakemake@params$plot_width_mm, + height = snakemake@params$plot_height_mm, + units = "mm", + dpi = 250 +) diff --git a/workflow/scripts/report/snp_plots.R b/workflow/scripts/report/snp_plots.R deleted file mode 100644 index 44e2ebb..0000000 --- a/workflow/scripts/report/snp_plots.R +++ /dev/null @@ -1,244 +0,0 @@ -#!/usr/bin/env Rscript - -# Write stdout and stderr to log file -log <- file(snakemake@log[[1]], open = "wt") -sink(log, type = "message") -sink(log, type = "output") - -library(tidyverse) -library(stringi) -library(ggrepel) -library(logger) -log_threshold(INFO) - -# Get colors -color <- grDevices::colors()[grep( - "gr(a|e)y", - grDevices::colors(), - invert = TRUE -)] -color <- color[grep("white", color, invert = TRUE)] -# Import file with plots style -source(snakemake@params[["design"]]) - -# DATA PREPROCESSING ##### -metadata <- read_csv(snakemake@input[["metadata"]]) - -vcf <- read_delim(snakemake@input[["vcf"]]) -data <- metadata %>% - filter( - ID %in% vcf$REGION - ) %>% - dplyr::select( - ID, - CollectionDate - ) - -# Obtain sample names ordered by CollectionDate -date_order <- metadata %>% - arrange(CollectionDate) %>% - pull(ID) %>% - unique() - -# Simplify features names and create SNP variable -vcf <- vcf %>% - dplyr::select( - variant, - REGION, - ALT_FREQ, - POS - ) - -# Fill positions without alt frequency with 0 -vcf <- vcf %>% - complete(nesting(variant, POS), REGION, fill = list(ALT_FREQ = 0)) - -# Join variants file and metadata file -vcf <- left_join(vcf, data, by = c("REGION" = "ID")) - -# Calculate days since first sample -vcf <- arrange( - vcf, - CollectionDate -) %>% - mutate( - interval = as.numeric(CollectionDate - min(CollectionDate)) - ) - -# PLOTS #### -## VOLCANO PLOT #### - -# Get list with all different polymorphisms -SNPs <- pull( - vcf, - variant -) %>% - unique() - -# Create an empty dataframe to be filled -cor.df <- data.frame( - snp = "", - cor = 0, - p.value = 0 -) %>% - filter(p.value != 0) - -cor.df.fill <- lapply( - SNPs, - function(snp) { - df <- filter( - vcf, - variant == snp - ) - - test <- cor.test( - df$ALT_FREQ, - df$interval, - method = snakemake@params$cor_method, - exact = snakemake@params$cor_exact - ) - - pvalue <- p.adjust( - test$p.value, - method = "BH", - n = length(SNPs) - ) - add_row( - cor.df, - snp = snp, - cor = test$estimate, - p.value = pvalue - ) - } -) %>% - bind_rows() - -# Plot a pseudo volcano plot with coorrelation index and p-value -log_info("Plotting pseudovolcano figure") -volcano <- cor.df.fill %>% - mutate( - trans.p = -log10(p.value), - label = case_when(p.value < 0.05 ~ snp) - ) %>% - ggplot() + - aes( - x = cor, - y = trans.p - ) + - geom_point() + - geom_label_repel(aes(label = label)) + - xlim(c(-1, 1)) + - geom_hline( - aes(yintercept = -log10(0.05)), - linetype = 2, - color = "red" - ) + - labs( - x = "Correlation", - y = "-log10(p-value)" - ) - -ggsave( - filename = snakemake@output[["pseudovolcano"]], - plot = volcano, - width = 159.2, - height = 119.4, - units = "mm", - dpi = 250 -) - - -## SNP PANEL #### -# SNPs significantly correlated with time -sign <- filter( - cor.df.fill, - p.value < 0.05 -) %>% - pull(snp) %>% - unique() - -# SNPs which are in positions with more than one alternative allele -dup <- vcf %>% - select( - variant, - POS - ) %>% - unique() %>% - group_by(POS) %>% - filter(n() > 1) %>% - ungroup() %>% - pull(variant) %>% - 
unique() - -subset <- c(sign, dup) %>% - unique() - -# Plot height depending on the number of SNPs assuming 4 columns in the plot -plot.height <- ceiling(length(subset) / 4) * 42 - -log_info("Plotting SNPs trends in time") -panel <- vcf %>% - filter(variant %in% subset) %>% - ggplot() + - aes( - x = interval, - y = ALT_FREQ, - color = variant - ) + - scale_color_manual(values = sample(color, length(subset))) + - geom_point() + - geom_line() + - theme( - legend.position = "bottom", - legend.text = element_text(size = 9) - ) + - labs( - x = "Days since first sample", - y = "Frequency", - color = "NV" - ) + - guides(color = guide_legend(ncol = 3)) - -if (length(subset) > 1) { - panel <- panel + - facet_wrap( - vars(POS), - nrow = ceiling(length(subset) / 4), - ncol = 4 - ) -} - -ggsave( - filename = snakemake@output[["snp_panel"]], - plot = panel, - width = 159.2, - height = max(100, plot.height), - units = "mm", - dpi = 250 -) - - -# PLOT TABLES #### -log_info("Saving coorelation table") -cor.df.fill %>% - transmute( - NV = snp, - correlation_coef = cor, - adj_pvalue = p.value - ) %>% - write.csv( - snakemake@output[["table_1"]], - row.names = FALSE - ) - -log_info("Saving SNPs trends table") -vcf %>% - filter(variant %in% subset) %>% - transmute( - sample = REGION, - POS = POS, - NV = variant, - ALT_FREQ = ALT_FREQ, - DaysSinceFirst = interval - ) %>% - write.csv(snakemake@output[["table_2"]], row.names = FALSE) diff --git a/workflow/scripts/report/summary_table.R b/workflow/scripts/report/summary_table.R index b78a103..0f96f7d 100644 --- a/workflow/scripts/report/summary_table.R +++ b/workflow/scripts/report/summary_table.R @@ -5,8 +5,9 @@ log <- file(snakemake@log[[1]], open = "wt") sink(log, type = "message") sink(log, type = "output") -library(tidyverse) +library(dplyr) library(logger) + log_threshold(INFO) metadata <- read.csv(snakemake@input[["metadata"]]) diff --git a/workflow/scripts/report/time_signal_data.R b/workflow/scripts/report/time_signal_data.R new file mode 100644 index 0000000..d209528 --- /dev/null +++ b/workflow/scripts/report/time_signal_data.R @@ -0,0 +1,70 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(readr) +library(dplyr) +library(tibble) +library(jsonlite) +library(ape) +library(adephylo) +library(logger) + +log_threshold(INFO) + +# Read distance tree and root +log_info("Reading tree") +tree <- read.tree(snakemake@input$tree) %>% + root(snakemake@params$outgroup_id, resolve.root = TRUE) + +# Read metadata +log_info("Reading metadata") +metadata <- read_csv(snakemake@input$metadata) + +# Get patristic distances to ancestor in distance tree +log_info("Calculating patristic distances to ancestor in distance tree") +time.signal <- distRoot( + tree, + "all", + method = "patristic" +) %>% + as.data.frame() %>% + rownames_to_column(var = "ID") %>% + filter(ID != snakemake@params$outgroup_id) %>% + rename(distance = ".") %>% + left_join( + select( + metadata, + ID, + CollectionDate + ), + by = "ID" + ) %>% + mutate( + date_interval = as.numeric( + as.Date(CollectionDate) - min(as.Date(CollectionDate)) + ) + ) + +# Save table +log_info("Saving time signal data") +log_debug("Saving table") +write_csv(time.signal, snakemake@output$table) + +# Time signal stats +log_debug("Building linear model") +model <- lm(distance ~ date_interval, data = time.signal) +p.value <- summary(model)$coefficients[2, 4] + +# Save regression stats as JSON +log_debug("Saving linear 
model") +list( + "sub_rate" = model$coefficients[[2]] * 365, + "r2" = summary(model)$r.squared[[1]], + "pvalue" = ifelse(p.value < 0.001, "< 0.001", p.value) +) %>% + toJSON() %>% + write(snakemake@output$json) diff --git a/workflow/scripts/report/time_signal_plot.R b/workflow/scripts/report/time_signal_plot.R new file mode 100644 index 0000000..d9af70c --- /dev/null +++ b/workflow/scripts/report/time_signal_plot.R @@ -0,0 +1,49 @@ +#!/usr/bin/env Rscript + +# Write stdout and stderr to log file +log <- file(snakemake@log[[1]], open = "wt") +sink(log, type = "message") +sink(log, type = "output") + +library(readr) +library(dplyr) +library(ggplot2) +library(logger) + +log_threshold(INFO) + +# Import file with plots style +source(snakemake@params[["design"]]) + +log_info("Reading time signal data") +time.signal <- read_csv(snakemake@input$table) + +# PLOTS #### +# TempEst +log_info("Plotting time signal data") +p <- time.signal %>% + ggplot() + + aes( + x = date_interval, + y = distance + ) + + geom_smooth( + method = "lm", + fill = "gray95", + alpha = 0.6, + color = "orange" + ) + + geom_point(size = 2, shape = 1) + + labs( + y = "Root-to-tip distance", + x = "Days since the initial sampling" + ) + +ggsave( + filename = snakemake@output$plot, + plot = p, + width = snakemake@params[["plot_width_mm"]], + height = snakemake@params[["plot_height_mm"]], + units = "mm", + dpi = 250 +) diff --git a/workflow/scripts/report/window_data.py b/workflow/scripts/report/window_data.py new file mode 100644 index 0000000..73233f8 --- /dev/null +++ b/workflow/scripts/report/window_data.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 + +import logging +import json +from typing import NewType, Dict, Iterable, List + +import pandas as pd +from Bio import SeqIO +from Bio.SeqFeature import SeqFeature, SimpleLocation, CompoundLocation + + +FeatureIndex = NewType("FeatureIndex", Dict[str, SimpleLocation | CompoundLocation]) + + +def iter_features_filtering(features: Iterable[SeqFeature], included: Dict[str, str], excluded: Dict[str, str]) -> Iterable[SeqFeature]: + # No filters + if len(included) == 0 and len(excluded) == 0: + logging.debug("Selecting all features") + return iter(features) + # Only inclusion filter + elif len(included) == 0 and len(excluded) != 0: + logging.debug(f"Selecting features excluding all of {excluded}") + return ( + feature for feature in features + if all( + (qualifier_value not in excluded.get(qualifier_key, [])) + for qualifier_key in excluded.keys() + for qualifier_value in feature.qualifiers.get(qualifier_key, []) + ) + ) + # Only exclusion filter + elif len(included) != 0 and len(excluded) == 0: + logging.debug(f"Selecting features including any of {included}") + return ( + feature for feature in features + if any( + (qualifier_value in included.get(qualifier_key, [])) + for qualifier_key in included.keys() + for qualifier_value in feature.qualifiers.get(qualifier_key, []) + ) + ) + # Inclusion then exclusion filter + else: + logging.debug(f"Selecting features including any of {included} and then excluding all of {excluded}") + included_features = ( + feature for feature in features + if any( + (qualifier_value in included.get(qualifier_key, [])) + for qualifier_key in included.keys() + for qualifier_value in feature.qualifiers.get(qualifier_key, []) + ) + ) + return ( + feature for feature in included_features + if all( + (qualifier_value not in excluded.get(qualifier_key, [])) + for qualifier_key in excluded.keys() + for qualifier_value in feature.qualifiers.get(qualifier_key, []) + 
) + ) + + +def index_features(features: Iterable[SeqFeature]) -> FeatureIndex: + index = FeatureIndex({}) + for feature in features: + identifier = "|".join(feature.qualifiers.get(snakemake.params.gb_qualifier_display, [])) + if identifier == "": + logging.error(f"Feature at {feature.location} has no qualifier '{snakemake.params.gb_qualifier_display}' and will be skipped") + elif feature.location is None: + logging.warning(f"Feature '{identifier}' has no location and will be skipped") + elif identifier in index: + logging.warning(f"Identifier '{identifier}' is already in feature index and will not be replaced by feature at {feature.location}") + else: + index[identifier] = feature.location + return index + + +def feature_names_at(position: int, feature_index: FeatureIndex) -> List[str]: + return [name for name, location in feature_index.items() if position in location] + + +def window_calculation(sites: set, window: int, step: int, size: int, feature_index: FeatureIndex) -> pd.DataFrame: + positions, fractions, features = [], [], [] + lim_sup = size + 1 + for position in range(1, lim_sup, step): + feature_names = ";".join(feature_names_at(position, feature_index)) + if len(feature_names) == 0: + features.append("Intergenic") + else: + # Include all features overlapping the site + features.append(feature_names) + # Add fraction (zero when the window extends past the sequence start) + if position - window not in range(1, lim_sup): + fractions.append(0.0) + else: + # Calculate number of polymorphisms in the window + num_snp = sum( + 1 for x in sites if x in range(position - window, position + 1) + ) + fractions.append(num_snp / window) + positions.append(position) + return pd.DataFrame({"position": positions, "fraction": fractions, "feature": features}) + + +def main(): + + logging.basicConfig( + filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], + level=logging.INFO + ) + + # Read input files + logging.info("Reading input variants file") + df = pd.read_table(snakemake.input.variants) + sites = set(df.POS) + + logging.info("Reading GenBank file") + gb = SeqIO.read(snakemake.input.gb, format="gb") + + logging.info("Indexing feature locations") + included = snakemake.params.features.get("INCLUDE", {}) + excluded = snakemake.params.features.get("EXCLUDE", {}) + feature_index = index_features( + iter_features_filtering(gb.features, included, excluded) + ) + + logging.info("Calculating polymorphic sites sliding window") + windows = window_calculation( + sites, + snakemake.params.window, + snakemake.params.step, + len(gb), + feature_index + ) + + logging.info("Saving results") + windows.to_csv(snakemake.output.window_df, index=False) + + logging.info("Saving window settings to JSON file") + with open(snakemake.output.json, "w") as fw: + json.dump( + {"window": snakemake.params.window, "step": snakemake.params.step}, + fw + ) + + +if __name__ == "__main__": + main() diff --git a/workflow/scripts/report/window_zoom_on_feature_data.py b/workflow/scripts/report/window_zoom_on_feature_data.py new file mode 100644 index 0000000..36f1e9f --- /dev/null +++ b/workflow/scripts/report/window_zoom_on_feature_data.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import sys +import logging +import json + +import pandas as pd + + +if __name__ == "__main__": + + logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO) + + logging.info("Reading regions") + with open(snakemake.input.regions) as f: + regions = json.load(f) + + if snakemake.wildcards.region_name not in regions: 
logging.error(f"Region {snakemake.wildcards.region_name} is absent in {snakemake.input.regions}") + sys.exit(1) + + start, end = regions[snakemake.wildcards.region_name] + + logging.info("Reading input table") + df = pd.read_csv(snakemake.input.table) + + logging.info("Filtering sites and writing results") + df[(df.position >= start) & (df.position <= end)].to_csv(snakemake.output.table, index=False) diff --git a/workflow/scripts/summary_demixing.R b/workflow/scripts/summarise_demix.R similarity index 74% rename from workflow/scripts/summary_demixing.R rename to workflow/scripts/summarise_demix.R index 3ae246f..9792202 100644 --- a/workflow/scripts/summary_demixing.R +++ b/workflow/scripts/summarise_demix.R @@ -5,14 +5,18 @@ log <- file(snakemake@log[[1]], open = "wt") sink(log, type = "message") sink(log, type = "output") -library(tidyverse) +library(dplyr) +library(readr) +library(tidyr) +library(stringr) library(logger) + log_threshold(INFO) # Empty dataframe to be filled with data demix <- data.frame( - "lineages" = NA, - "abundances" = NA, + "lineage" = NA, + "abundance" = NA, "sample" = NA ) %>% filter(!is.na(sample)) @@ -23,21 +27,25 @@ lapply( function(tsv_file) { read_tsv( tsv_file, - col_names = c("variable", "valor"), + col_names = c("variable", "value"), show_col_types = FALSE ) %>% filter( - row_number() %in% c(3, 4) + variable %in% c("lineages", "abundances") ) %>% pivot_wider( names_from = variable, - values_from = valor + values_from = value ) %>% separate_rows( lineages, abundances, sep = " " ) %>% + rename( + lineage = lineages, + abundance = abundances + ) %>% mutate( sample = str_extract( basename(tsv_file), diff --git a/workflow/scripts/tsv_to_vcf.py b/workflow/scripts/tsv_to_vcf.py index be43d9a..2201582 100644 --- a/workflow/scripts/tsv_to_vcf.py +++ b/workflow/scripts/tsv_to_vcf.py @@ -17,7 +17,7 @@ def tsv_to_vcf(tsv_file, vcf_file): # Process each row in the TSV file for index, row in tsv_df.iterrows(): # Extract fields from the TSV row - chrom = 'NC_045512.2' + chrom = snakemake.params.ref_name pos = row['POS'] ref = row['REF'] alt = row['ALT'] diff --git a/workflow/scripts/vcf_to_tsv.R b/workflow/scripts/vcf_to_tsv.R deleted file mode 100644 index 990f488..0000000 --- a/workflow/scripts/vcf_to_tsv.R +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env Rscript - -# Write stdout and stderr to log file -log <- file(snakemake@log[[1]], open = "wt") -sink(log, type = "message") -sink(log, type = "output") - -library(tidyverse) -library(logger) -log_threshold(INFO) - -# read data -log_info("Reading data") -vcf <- read_tsv(snakemake@input[["ann_vcf"]], comment = "##") -tsv <- read_tsv(snakemake@input[["pre_tsv"]]) - -tsv["variant"] <- str_extract(vcf$INFO, "p\\.([^|]*)", group = 1) -tsv["nuc_variant"] <- str_extract(vcf$INFO, "c\\.([^|]*)", group = 1) - -tsv <- tsv %>% - mutate( - variant = case_when( - is.na(variant) ~ paste(POS, REF, ">", ALT, sep = ""), - TRUE ~ paste(GFF_FEATURE, ":", variant, sep = "") - ) - ) - -log_info("Saving results") -write_tsv( - tsv, - snakemake@output[["tsv"]] -) diff --git a/workflow/scripts/weighted_distances.py b/workflow/scripts/weighted_distances.py deleted file mode 100644 index 2a5c0e7..0000000 --- a/workflow/scripts/weighted_distances.py +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Adapted script from https://github.com/PathoGenOmics-Lab/genetic-distances - -import logging -from typing import List, Tuple - -import pandas as pd -from Bio import SeqIO -from Bio.SeqRecord import SeqRecord -from 
Bio.Seq import Seq - - -def read_monofasta(path: str) -> SeqRecord: - fasta = SeqIO.parse(path, "fasta") - record = next(fasta) - if next(fasta, None) is not None: - logging.warning(f"There are unread records left in '{path}'") - return record - - -def read_masked_sites(vcf_path: str, mask_classes: List[str]) -> List[int]: - """ - Parse a VCF containing positions for masking. Assumes the VCF file is - formatted as in: - github.com/W-L/ProblematicSites_SARS-CoV2/blob/master/problematic_sites_sarsCov2.vcf - with a "mask" or "caution" recommendation in column 7. - Masked sites are specified with params. - """ - vcf = pd.read_csv( - vcf_path, - sep="\s+", - comment="#", - names=("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO") - ) - return vcf.loc[vcf.FILTER.isin(mask_classes), "POS"].tolist() - - -def build_ancestor_variant_table(ancestor: Seq, reference: Seq, reference_name: str, masked_positions: List[int]) -> pd.DataFrame: - pos = [] - alt = [] - for i in range(1, len(ancestor) + 1): - if i not in masked_positions and ancestor[i-1] != reference[i-1]: - pos.append(i) - alt.append(reference[i-1]) - df = pd.DataFrame({"POS": pos, "ALT": alt}) - df["ALT_FREQ"] = 1 # As a reference genome, we assume all positions are monomorphic - df["REGION"] = reference_name - return df - - -def get_frequencies_in_position(variant_table: pd.DataFrame, sample_name: str, position: int, reference: Seq) -> Tuple[float]: - frequencies = {alt: 0. for alt in variant_table["ALT"].unique()} - alt_keys = sorted(frequencies.keys()) - sample_variant_table = variant_table[variant_table["REGION"] == sample_name] - # If the position has polimorphisims, allele frequencies are captured - if position in sample_variant_table["POS"].values: - alleles = sample_variant_table[sample_variant_table["POS"] == position] - for base in alt_keys: - if base in alleles["ALT"].values: - frequencies[base] = float(alleles["ALT_FREQ"][alleles["ALT"] == base].iloc[0]) - # Obtain frequency for reference allele - reference_allele = reference[position-1] - reference_freq = 1 - sum(frequencies.values()) - if reference_allele in frequencies: - frequencies[reference[position-1]] += reference_freq - else: - frequencies[reference[position-1]] = reference_freq - return tuple(frequencies[base] for base in alt_keys) - - -def heterozygosity(freqs: Tuple[float]) -> float: - return 1 - sum([f ** 2 for f in freqs]) - - -def calc_fst_weir_cockerham(hs: float, ht: float) -> float: - return (ht - hs) / ht if ht != 0 else 0 - - -def build_cache(variant_table: pd.DataFrame, samples: List[str], reference: Seq): - cache = {"freq": {}, "hz": {}} - for sample_name in set(samples): - for position in variant_table["POS"].astype("Int64").unique(): - if sample_name not in cache["freq"]: - cache["freq"][sample_name] = {} - if sample_name not in cache["hz"]: - cache["hz"][sample_name] = {} - cache["freq"][sample_name][position] = get_frequencies_in_position(variant_table, sample_name, position, reference) - logging.debug(f"Frequencies for '{sample_name}':{position} = {cache['freq'][sample_name][position]}") - cache["hz"][sample_name][position] = heterozygosity(cache["freq"][sample_name][position]) - logging.debug(f"Heterozygosity for '{sample_name}':{position} = {cache['hz'][sample_name][position]}") - return cache - - -def calc_heterozygosities(sample1_name: str, sample2_name: str, pos: int, cache: dict): - logging.debug(f"Calculating heterozygosities at position {pos} for '{sample1_name}' and '{sample2_name}'") - # Retrieve pre-computed values - freqs1 = 
cache["freq"][sample1_name][pos] - freqs2 = cache["freq"][sample2_name][pos] - hs1 = cache["hz"][sample1_name][pos] - hs2 = cache["hz"][sample2_name][pos] - # Calculate pairwise values - total_freqs = [(f1 + f2) / 2 for f1, f2 in zip(freqs1, freqs2)] - ht = heterozygosity(total_freqs) - hs = (hs1 + hs2) / 2 - return hs, ht - - -def calculate_pairwise_distance(positions: List[int], sample1_name: str, sample2_name: str, cache: dict) -> float: - if len(positions) == 0: - return 0 - else: - return sum([calc_fst_weir_cockerham(*calc_heterozygosities(sample1_name, sample2_name, pos, cache)) for pos in positions]) - - -def calculate_sample_distances(positions: List[int], sample_name: str, samples: List[str], cache: dict) -> List[float]: - return [calculate_pairwise_distance(positions, sample_name, other_sample_name, cache) for other_sample_name in samples] - - -def calculate_distance_matrix(variant_table: pd.DataFrame, samples: List[str], reference: Seq) -> pd.DataFrame: - positions = variant_table["POS"].astype("Int64").unique().tolist() - cache = build_cache(variant_table, samples, reference) - distance_matrix = {} - for sample_name in samples: - distance_matrix[sample_name] = calculate_sample_distances(positions, sample_name, samples, cache) - for i in range(len(samples)): - for j in range(i+1, len(samples)): - distance_matrix[samples[j]][i] = distance_matrix[samples[i]][j] - return pd.DataFrame(distance_matrix, index=samples) - - -def main(): - - logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO) - - logging.info("Reading input FASTA files") - ancestor = read_monofasta(snakemake.input.ancestor) - logging.debug(f"Ancestor: '{ancestor.description}', length={len(ancestor.seq)}") - reference = read_monofasta(snakemake.input.reference) - logging.debug(f"Reference: '{reference.description}', length={len(reference.seq)}") - - logging.info("Reading input tables") - masked_positions = read_masked_sites(snakemake.input.vcf, snakemake.params.mask_class) - logging.debug(f"Read {len(masked_positions)} masked positions") - input_table = pd.read_table(snakemake.input.tsv, sep="\t") - logging.debug(f"Read {len(input_table)} rows in input TSV") - ancestor_table = build_ancestor_variant_table(ancestor.seq, reference.seq, reference.id, masked_positions) - logging.debug(f"Ancestor has {len(ancestor_table)} variants") - variant_table = pd.concat([input_table, ancestor_table], ignore_index=True) - logging.debug(f"Combined table has {len(variant_table)} variants") - - logging.info(f"Calculating distance matrix") - sample_names = snakemake.params.samples + [reference.id] - distances = calculate_distance_matrix(variant_table, sample_names, ancestor.seq) - logging.debug(f"Distance matrix has shape: {distances.shape}") - - logging.info("Writing results") - distances.to_csv(snakemake.output.distances) - - -if __name__ == "__main__": - main() diff --git a/workflow/scripts/window.py b/workflow/scripts/window.py deleted file mode 100644 index 5ba6a8c..0000000 --- a/workflow/scripts/window.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python3 - -import logging -import json -import pandas as pd -from gb2seq.features import Features - - -def window_calculation(sites: set, window: int, step: int, coord: str) -> pd.DataFrame: - ft = Features(coord) # Create Features object to obtain annotations - positions = [] - pct = [] - genes = [] - lim_sup = len(ft.reference.sequence) + 1 - for position in range(1, lim_sup, step): - if len(ft.getFeatureNames(position)) == 0: - 
genes.append("Intergenic") - else: - genes.append(list(ft.getFeatureNames(position))[0]) - # Add percent (excluding initial and final positions) - if position - window not in range(1, lim_sup): - pct.append(0) - else: - # Calculate no. of polimorphisms in the window - num_snp = len([x for x in sites if x in range(position - window, position + 1)]) - pct.append(num_snp / window) - positions.append(position) - return pd.DataFrame({"position": positions, "fractions": pct, "gen": genes}) - - -def main(): - - logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO) - - logging.info("Reading input VCF") - df = pd.read_table(snakemake.input.vcf) - sites = set(df.POS) - - logging.info("Reading features") - with open(snakemake.input.features) as f: - features_key = json.load(f) - - logging.info("Sliding window calculation of proportion of polimorphic sites") - frame = window_calculation(sites, snakemake.params.window, snakemake.params.step, snakemake.input.gb) - - logging.info("Saving results") - frame.replace(features_key, inplace = True) - frame.to_csv(snakemake.output.window_df, index= False) - - -if __name__ == "__main__": - main()