diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index bc1d054..f1d5e40 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [main, dev] + branches: [main] pull_request: - branches: [main, dev] + branches: [main] jobs: Formatting: @@ -47,7 +47,7 @@ jobs: with: directory: .test snakefile: workflow/Snakefile - args: "--sdm conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache -n" + args: "--sdm conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache" - name: Test report uses: snakemake/snakemake-github-action@v2.0.0 diff --git a/.test/config/config.yml b/.test/config/config.yml index 290a42e..773cadc 100644 --- a/.test/config/config.yml +++ b/.test/config/config.yml @@ -1,5 +1,5 @@ samplesheet: "config/samples.csv" -outdir: "results" +tool: ["prokka"] pgap: bin: "path/to/pgap.py" @@ -7,3 +7,23 @@ pgap: prepare_yaml_files: generic: "config/generic.yaml" submol: "config/submol.yaml" + +prokka: + center: "" + extra: "--addgenes" + +bakta: + download_db: "light" + existing_db: "" + extra: "--keep-contig-headers --compliant" + +quast: + reference_fasta: "" + reference_gff: "" + extra: "" + +panaroo: + skip: False + remove_source: "cmsearch" + remove_feature: "tRNA|rRNA|ncRNA|exon|sequence_feature" + extra: "--clean-mode strict --remove-invalid-genes" diff --git a/.test/config/samples.csv b/.test/config/samples.csv index 4f368e3..5a46dd9 100644 --- a/.test/config/samples.csv +++ b/.test/config/samples.csv @@ -1,2 +1,2 @@ sample,species,strain,id_prefix,file -EC2224,"Streptococcus pyogenes",SF370,SPY,"data/assembly.fasta" \ No newline at end of file +EC2224,"Streptococcus pyogenes",SF370,SPY,"data/assembly.fasta" diff --git a/README.md b/README.md index 05db0da..1368e45 100644 --- a/README.md +++ b/README.md @@ -4,23 +4,39 @@ [![GitHub actions status](https://github.com/MPUSP/snakemake-assembly-postprocessing/actions/workflows/main.yml/badge.svg)](https://github.com/MPUSP/snakemake-assembly-postprocessing/actions/workflows/main.yml) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with apptainer](https://img.shields.io/badge/run%20with-apptainer-1D355C.svg?labelColor=000000)](https://apptainer.org/) +[![workflow catalog](https://img.shields.io/badge/Snakemake%20workflow%20catalog-darkgreen)](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/MPUSP/snakemake-assembly-postprocessing) A Snakemake workflow for the post-processing of microbial genome assemblies. +- [snakemake-assembly-postprocessing](#snakemake-assembly-postprocessing) + - [Usage](#usage) + - [Workflow overview](#workflow-overview) + - [Installation](#installation) + - [Deployment options](#deployment-options) + - [Authors](#authors) + - [References](#references) + ## Usage The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/MPUSP/snakemake-assembly-postprocessing). +Detailed information about input data and workflow configuration can also be found in the [`config/README.md`](config/README.md). + If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this repository. -## Workflow overview +_Workflow overview:_ -1. Parse `samples.csv` table containing the samples's meta data (`python`) -2. Annotate assemblies using NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)) + -## Requirements +## Workflow overview -- [PGAP](https://github.com/ncbi/pgap) +1. Parse `samples.csv` table containing the samples's meta data (`python`) +2. Annotate assemblies using one of the following tools: + 1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)). Note: needs to be installed manually + 2. [prokka](https://github.com/tseemann/prokka), a fast and light-weight prokaryotic annotation tool + 3. [bakta](https://github.com/oschwengers/bakta), a fast, alignment-free annotation tool. Note: Bakta will automatically download its companion database from zenodo (light: 1.5 GB, full: 40 GB) +3. Create a QC report for the assemblies using [Quast](https://github.com/ablab/quast) +4. Create a pangenome analysis (orthologs/homologs) using [Panaroo](https://gthlab.au/panaroo/) ## Installation @@ -46,9 +62,37 @@ conda activate snakemake-assembly-postprocessing **Step 4: Install PGAP** +- if you want to use [PGAP](https://github.com/ncbi/pgap) for annotation, it needs to be installed separately - PGAP can be downloaded from https://github.com/ncbi/pgap. Please follow the installation instructions there. - Define the path to the `pgap.py` script (located in the `scripts` folder) in the `config` file (recommended: `./resources`) +## Deployment options + +To run the workflow from command line, change the working directory. + +```bash +cd snakemake-assembly-postprocessing +``` + +Adjust options in the default config file `config/config.yml`. +Before running the complete workflow, you can perform a dry run using: + +```bash +snakemake --cores 1 --dry-run +``` + +To run the workflow with test files using **conda**: + +```bash +snakemake --cores 2 --sdm conda --directory .test +``` + +To run the workflow with test files using **apptainer**: + +```bash +snakemake --cores 2 --sdm conda apptainer --directory .test +``` + ## Authors - Dr. Rina Ahmed-Begrich @@ -61,6 +105,14 @@ conda activate snakemake-assembly-postprocessing ## References +> Seemann T. _Prokka: rapid prokaryotic genome annotation_. Bioinformatics. **2014** Jul 15;30(14):2068-9. PMID: 24642063. https://doi.org/10.1093/bioinformatics/btu153. + +> Schwengers O, Jelonek L, Dieckmann MA, Beyvers S, Blom J, Goesmann A. _Bakta: rapid and standardized annotation of bacterial genomes via alignment-free sequence identification_. Microb Genom, 7(11):000685 **2021**. PMID: 34739369. https://doi.org/10.1099/mgen.0.000685. + > Li W, O'Neill KR, Haft DH, DiCuccio M, Chetvernin V, Badretdin A, Coulouris G, Chitsaz F, Derbyshire MK, Durkin AS, Gonzales NR, Gwadz M, Lanczycki CJ, Song JS, Thanki N, Wang J, Yamashita RA, Yang M, Zheng C, Marchler-Bauer A, Thibaud-Nissen F. _RefSeq: Expanding the Prokaryotic Genome Annotation Pipeline reach with protein family model curation._ Nucleic Acids Res, **2021** Jan 8;49(D1):D1020-D1028. https://doi.org/10.1093/nar/gkaa1105 +> Gurevich A, Saveliev V, Vyahhi N, Tesler G. _QUAST: quality assessment tool for genome assemblies_. Bioinformatics. 29(8):1072-5, **2013**. PMID: 23422339. https://doi.org/10.1093/bioinformatics/btt086. + +> Tonkin-Hill G, MacAlasdair N, Ruis C, Weimann A, Horesh G, Lees JA, Gladstone RA, Lo S, Beaudoin C, Floto RA, Frost SDW, Corander J, Bentley SD, Parkhill J. _Producing polished prokaryotic pangenomes with the Panaroo pipeline_. Genome Biol. 21(1):180, **2020**. PMID: 32698896. https://doi.org/10.1186/s13059-020-02090-4. + > Köster, J., Mölder, F., Jablonski, K. P., Letcher, B., Hall, M. B., Tomkins-Tinch, C. H., Sochat, V., Forster, J., Lee, S., Twardziok, S. O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., & Nahnsen, S. _Sustainable data analysis with Snakemake_. F1000Research, 10:33, 10, 33, **2021**. https://doi.org/10.12688/f1000research.29032.2. diff --git a/config/README.md b/config/README.md index 3326ef0..d864e26 100644 --- a/config/README.md +++ b/config/README.md @@ -1,3 +1,15 @@ +## Workflow overview + +A Snakemake workflow for the post-processing of microbial genome assemblies. + +1. Parse `samples.csv` table containing the samples's meta data (`python`) +2. Annotate assemblies using one of the following tools: + 1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)). Note: needs to be installed manually + 2. [prokka](https://github.com/tseemann/prokka), a fast and light-weight prokaryotic annotation tool + 3. [bakta](https://github.com/oschwengers/bakta), a fast, alignment-free annotation tool. Note: Bakta will automatically download its companion database from zenodo (light: 1.5 GB, full: 40 GB) +3. Create a QC report for the assemblies using [Quast](https://github.com/ablab/quast) +4. Create a pangenome analysis (orthologs/homologs) using [Panaroo](https://gthlab.au/panaroo/) + ## Running the workflow ### Input data @@ -5,28 +17,39 @@ This workflow requires `fasta` input data. The samplesheet table has the following layout: -| sample | species | strain | id_prefix | file | -| ----------- | ------------ | ------------- | ------------- | ------------- | -| EC2224 | "Streptococcus pyogenes" | SF370 | Spy | assembly.fasta | - -### Execution - -To run the workflow from command line, change to the working directory and activate the conda environment. - -```bash -cd snakemake-assembly-postprocessing -conda activate snakemake-assembly-postprocessing -``` - -Adjust options in the default config file `config/config.yml`. -Before running the entire workflow, perform a dry run using: - -```bash -snakemake --cores 1 --sdm conda --directory .test --dry-run -``` - -To run the workflow with test files using **conda**: - -```bash -snakemake --cores 1 --sdm conda --directory .test -``` +| sample | species | strain | id_prefix | file | +| ------ | ------------------------ | ------ | --------- | -------------- | +| EC2224 | "Streptococcus pyogenes" | SF370 | SPY | assembly.fasta | +| ... | ... | ... | ... | ... | + +**Note:** Pangenome analysis with `Panaroo` requires at least two samples. + +### Parameters + +This table lists all parameters that can be used to run the workflow. + +| Parameter | Type | Details | Default | +|:---|:---|:---|:---| +| **samplesheet** | string | Path to the sample sheet file in csv format | | +| **tool** | array[string] | Annotation tool to use (one of `prokka`, `pgap`, `bakta`) | | +| **pgap** | | PGAP configuration object | | +| bin | string | Path to the PGAP script | | +| use_yaml_config | boolean | Whether to use YAML configuration for PGAP | `False` | +| _prepare_yaml_files_ | | Paths to YAML templates for PGAP | | +| generic | string | Path to the generic YAML configuration file | | +| submol | string | Path to the submol YAML configuration file | | +| **prokka** | | Prokka configuration object | | +| center | string | Center name for Prokka annotation (used in sequence IDs) | | +| extra | string | Extra command-line arguments for Prokka | `--addgenes` | +| **bakta** | | Bakta configuration object | | +| download_db | string | Bakta database type (`full`, `light`, or `none`) | `light` | +| existing_db | string | Path to an existing Bakta database (optional). Needs to be combined with `download_db='none'` | `--keep-contig-headers --compliant` | +| extra | string | Extra command-line arguments for Bakta | | +| **quast** | | QUAST configuration object | | +| reference_fasta | string | Path to the reference genome for QUAST | | +| reference_gff | string | Path to the reference annotation for QUAST | +| extra | string | Extra command-line arguments for QUAST | | +| **panaroo** | | Panaroo configuration object | | +| remove_source | string | Source types to remove in Panaroo (regex supported) | `cmsearch` | +| remove_feature | string | Feature types to remove in Panaroo (regex supported) | `tRNA\|rRNA\|ncRNA\|exon\|sequence_feature` | +| extra | string | Extra command-line arguments for Panaroo | `--clean-mode strict --remove-invalid-genes` | diff --git a/config/config.yml b/config/config.yml index 290a42e..773cadc 100644 --- a/config/config.yml +++ b/config/config.yml @@ -1,5 +1,5 @@ samplesheet: "config/samples.csv" -outdir: "results" +tool: ["prokka"] pgap: bin: "path/to/pgap.py" @@ -7,3 +7,23 @@ pgap: prepare_yaml_files: generic: "config/generic.yaml" submol: "config/submol.yaml" + +prokka: + center: "" + extra: "--addgenes" + +bakta: + download_db: "light" + existing_db: "" + extra: "--keep-contig-headers --compliant" + +quast: + reference_fasta: "" + reference_gff: "" + extra: "" + +panaroo: + skip: False + remove_source: "cmsearch" + remove_feature: "tRNA|rRNA|ncRNA|exon|sequence_feature" + extra: "--clean-mode strict --remove-invalid-genes" diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml index 7dead6d..33215e4 100644 --- a/config/schemas/config.schema.yml +++ b/config/schemas/config.schema.yml @@ -6,9 +6,15 @@ properties: samplesheet: type: string description: Path to the sample sheet file - outdir: - type: string - description: Output directory for results + tool: + type: array + description: Annotation tool to use + items: + type: string + enum: + - prokka + - pgap + - bakta pgap: type: object properties: @@ -34,7 +40,66 @@ properties: - bin - use_yaml_config - prepare_yaml_files + prokka: + type: object + properties: + center: + type: string + description: Center name for Prokka annotation (used in sequence IDs) + extra: + type: string + description: Extra command-line arguments for Prokka + required: + - center + - extra + bakta: + type: object + properties: + download_db: + type: string + description: Bakta database type, one of 'full', 'light', or 'none' if existing is used + existing_db: + type: string + description: Path to an existing Bakta database (optional) + extra: + type: string + description: Extra command-line arguments for Bakta + required: + - download_db + - existing_db + - extra + quast: + type: object + properties: + reference_fasta: + type: string + description: Path to the reference genome for QUAST + reference_gff: + type: string + description: Path to the reference annotation for QUAST + extra: + type: string + description: Extra command-line arguments for QUAST + panaroo: + type: object + properties: + skip: + type: boolean + description: Whether to skip Panaroo analysis + remove_source: + type: string + description: Source types to remove in Panaroo (regex supported) + remove_feature: + type: string + description: Feature types to remove in Panaroo (regex supported) + extra: + type: string + description: Extra command-line arguments for Panaroo required: - samplesheet + - tool - pgap + - prokka + - bakta + - quast diff --git a/resources/.gitignore b/resources/.gitignore new file mode 100644 index 0000000..9abb766 --- /dev/null +++ b/resources/.gitignore @@ -0,0 +1 @@ +.* \ No newline at end of file diff --git a/resources/images/dag.svg b/resources/images/dag.svg new file mode 100644 index 0000000..344f877 --- /dev/null +++ b/resources/images/dag.svg @@ -0,0 +1,157 @@ + + + + + + +snakemake_dag + + + +0 + +all + + + +1 + +quast + + + +1->0 + + + + + +2 + +annotate_prokka + + + +2->1 + + + + + +9 + +prepare_panaroo + + + +2->9 + + + + + +3 + +get_fasta + + + +3->2 + + + + + +5 + +prepare_yaml_files + + + +3->5 + + + + + +6 + +annotate_bakta + + + +3->6 + + + + + +4 + +annotate_pgap + + + +4->1 + + + + + +4->9 + + + + + +5->4 + + + + + +6->1 + + + + + +6->9 + + + + + +7 + +get_bakta_db + + + +7->6 + + + + + +8 + +panaroo + + + +8->0 + + + + + +9->8 + + + + + diff --git a/workflow/Snakefile b/workflow/Snakefile index 6ae7d4d..aafcb6a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -19,18 +19,12 @@ container: "oras://ghcr.io/mpusp/snakemake-assembly-postprocessing:latest" configfile: "config/config.yml" -# set outout directory -OUTDIR = config.get("outdir", "") - -if not OUTDIR: - OUTDIR = f"{os.getcwd()}/results" - - # ----------------------------------------------------- # load rules # ----------------------------------------------------- include: "rules/common.smk" include: "rules/annotate.smk" +include: "rules/qc.smk" # ----------------------------------------------------- @@ -44,7 +38,7 @@ end = "\033[0m" msg = f"""\nSnakemake-assembly-postprocessing: A Snakemake workflow for the post-processing of microbial genome assemblies.""" -prolog = f"Output directory: {OUTDIR}" +prolog = f"Output directory: ./results" year = datetime.today().year epilog = f""" @@ -73,8 +67,5 @@ onerror: # ----------------------------------------------------- rule all: input: - expand( - os.path.join(OUTDIR, "annotation/pgap/{sample}/{sample}.gff"), - sample=samples.index, - ), + get_final_input, default_target: True diff --git a/workflow/envs/bakta.yml b/workflow/envs/bakta.yml new file mode 100644 index 0000000..bd2d4d2 --- /dev/null +++ b/workflow/envs/bakta.yml @@ -0,0 +1,7 @@ +name: bakta +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - bakta=1.11.4 diff --git a/workflow/envs/base.yml b/workflow/envs/base.yml index fb1867a..fcd1565 100644 --- a/workflow/envs/base.yml +++ b/workflow/envs/base.yml @@ -4,6 +4,6 @@ channels: - bioconda - nodefaults dependencies: - - python==3.13.0 - - pandas==2.3.3 - - apptainer==1.4.5 + - python=3.13.0 + - pandas=2.3.3 + - apptainer=1.4.5 diff --git a/workflow/envs/panaroo.yml b/workflow/envs/panaroo.yml new file mode 100644 index 0000000..43ee14a --- /dev/null +++ b/workflow/envs/panaroo.yml @@ -0,0 +1,9 @@ +name: panaroo +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - numpy=1.26.4 + - scipy=1.11.4 + - panaroo=1.5.2 diff --git a/workflow/envs/prokka.yml b/workflow/envs/prokka.yml new file mode 100644 index 0000000..643ba6b --- /dev/null +++ b/workflow/envs/prokka.yml @@ -0,0 +1,7 @@ +name: prokka +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - prokka=1.14.6 diff --git a/workflow/envs/quast.yml b/workflow/envs/quast.yml new file mode 100644 index 0000000..9d5f823 --- /dev/null +++ b/workflow/envs/quast.yml @@ -0,0 +1,7 @@ +name: quast +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - quast=5.3.0 diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index f695f80..4b9aab0 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -2,15 +2,11 @@ rule get_fasta: input: get_fasta, output: - fasta=os.path.join( - OUTDIR, "annotation/pgap/prepare_files/{sample}/genome.fasta" - ), + fasta="results/annotation/pgap/prepare_files/{sample}/genome.fasta", conda: "../envs/base.yml" log: - os.path.join( - OUTDIR, "annotation/pgap/prepare_files/logs/{sample}_get_fasta.log" - ), + "results/annotation/pgap/prepare_files/logs/{sample}_get_fasta.log", shell: "INPUT=$(realpath {input}); " "ln -s ${{INPUT}} {output}; " @@ -21,12 +17,8 @@ rule prepare_yaml_files: input: fasta=rules.get_fasta.output.fasta, output: - input_yaml=os.path.join( - OUTDIR, "annotation/pgap/prepare_files/{sample}/input.yaml" - ), - submol_yaml=os.path.join( - OUTDIR, "annotation/pgap/prepare_files/{sample}/submol.yaml" - ), + input_yaml="results/annotation/pgap/prepare_files/{sample}/input.yaml", + submol_yaml="results/annotation/pgap/prepare_files/{sample}/submol.yaml", conda: "../envs/base.yml" params: @@ -37,10 +29,7 @@ rule prepare_yaml_files: sample="{sample}", pd_samples=samples, log: - os.path.join( - OUTDIR, - "annotation/pgap/prepare_files/logs/{sample}_prepare_yaml_files.log", - ), + "results/annotation/pgap/prepare_files/logs/{sample}_prepare_yaml_files.log", script: "../scripts/prepare_yaml_files.py" @@ -53,11 +42,12 @@ rule annotate_pgap: otherwise=rules.get_fasta.output.fasta, ), output: - os.path.join(OUTDIR, "annotation/pgap/{sample}/{sample}.gff"), + gff="results/annotation/pgap/{sample}/{sample}.gff", + fasta="results/annotation/pgap/{sample}/{sample}.fna", conda: "../envs/base.yml" message: - """--- Run PGAP annotation for sample {wildcards.sample} ---""" + """--- Running PGAP annotation for sample {wildcards.sample} ---""" params: pgap=config["pgap"]["bin"], use_yaml_config=config["pgap"]["use_yaml_config"], @@ -65,7 +55,7 @@ rule annotate_pgap: outdir=lambda wc, output: os.path.dirname(output[0]), threads: 1 log: - os.path.join(OUTDIR, "annotation/pgap/logs/{sample}_pgap.log"), + "results/annotation/pgap/logs/{sample}_pgap.log", shell: "rm -rf {params.outdir}; " "if [ {params.use_yaml_config} == 'True' ]; then " @@ -85,3 +75,112 @@ rule annotate_pgap: "--no-self-update " "-g {input} -s '{params.species}' &>> {log}; " "fi; " + + +rule annotate_prokka: + input: + fasta=rules.get_fasta.output.fasta, + output: + gff="results/annotation/prokka/{sample}/{sample}.gff", + fasta="results/annotation/prokka/{sample}/{sample}.fna", + conda: + "../envs/prokka.yml" + message: + """--- Running PROKKA annotation for sample {wildcards.sample} ---""" + params: + prefix=lambda wc: wc.sample, + locustag=lambda wc: samples.loc[wc.sample]["id_prefix"], + genus=lambda wc: samples.loc[wc.sample]["species"].split(" ")[0], + species=lambda wc: samples.loc[wc.sample]["species"].split(" ")[1], + strain=lambda wc: samples.loc[wc.sample]["strain"], + outdir=lambda wc, output: os.path.dirname(output[0]), + extra=config["prokka"]["extra"], + threads: workflow.cores * 0.25 + log: + "results/annotation/prokka/logs/{sample}_prokka.log", + shell: + """ + prokka \ + --locustag {params.locustag} \ + --genus {params.genus} \ + --species {params.species} \ + --strain {params.strain} \ + --prefix {params.prefix} \ + --outdir {params.outdir} \ + --force {params.extra} \ + --cpus {threads} \ + {input.fasta} &> {log} + """ + + +rule get_bakta_db: + output: + db=branch( + lookup(dpath="bakta/download_db", within=config), + cases={ + "full": directory("results/annotation/bakta/database/db"), + "light": directory("results/annotation/bakta/database/db-light"), + "none": directory("results/annotation/bakta/database/custom"), + }, + ), + conda: + "../envs/bakta.yml" + message: + """--- Getting BAKTA database for annotation ---""" + params: + download_db=config["bakta"]["download_db"], + existing_db=config["bakta"]["existing_db"], + outdir=lambda wc, output: os.path.dirname(output[0]), + threads: workflow.cores * 0.25 + log: + "results/annotation/bakta/database/db.log", + shell: + """ + if [ {params.download_db} != 'none' ]; then + echo 'The most recent of the following available Bakta DBs is downloaded:' > {log}; + bakta_db list &>> {log}; + bakta_db download --output {params.outdir} --type {params.download_db} &>> {log}; + else + echo 'Using Bakta DB from supplied input dir: {params.existing_db}' > {log}; + ln -s {params.existing_db} {output.db}; + echo 'Update ARMFinderPlus DB using supplied input dir: {params.existing_db}' >> {log}; + amrfinder_update --force_update --database {params.existing_db}/amrfinderplus-db &>> {log} + fi + """ + + +rule annotate_bakta: + input: + fasta=rules.get_fasta.output.fasta, + db=rules.get_bakta_db.output.db, + output: + gff="results/annotation/bakta/{sample}/{sample}.gff", + fasta="results/annotation/bakta/{sample}/{sample}.fna", + conda: + "../envs/bakta.yml" + message: + """--- Running BAKTA annotation for sample {wildcards.sample} ---""" + params: + prefix=lambda wc: wc.sample, + locustag=lambda wc: format_bakta_locustag(samples.loc[wc.sample]["id_prefix"]), + species=lambda wc: samples.loc[wc.sample]["species"], + strain=lambda wc: samples.loc[wc.sample]["strain"], + outdir=lambda wc, output: os.path.dirname(output[0]), + extra=config["bakta"]["extra"], + threads: workflow.cores * 0.25 + log: + "results/annotation/bakta/logs/{sample}_bakta.log", + shell: + """ + bakta \ + --db {input.db} \ + --prefix {params.prefix} \ + --output {params.outdir} \ + --locus-tag {params.locustag} \ + --species '{params.species}' \ + --strain {params.strain} \ + --threads {threads} \ + --force {params.extra} \ + {input.fasta} &> {log}; + mv {output.gff}3 {output.gff} + """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 72684e1..47364a7 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1,5 +1,7 @@ # import basic packages import pandas as pd +import re +from snakemake import logging from snakemake.utils import validate @@ -23,6 +25,66 @@ def get_fasta(wildcards): sample = wildcards.sample if sample not in samples.index: raise ValueError(f"Sample {sample} not found in samplesheet.") - - # return the fasta file path return samples.loc[sample, "file"] + + +def get_quast_fasta(wildcards): + return expand( + "results/annotation/{tool}/{sample}/{sample}.fna", + tool=wildcards.tool, + sample=samples.index, + ) + + +def get_panaroo_gff(wildcards): + return expand( + "results/qc/panaroo/{tool}/prepare/{sample}.gff", + tool=wildcards.tool, + sample=samples.index, + ) + + +def get_panaroo_fasta(wildcards): + return expand( + "results/qc/panaroo/{tool}/prepare/{sample}.fna", + tool=wildcards.tool, + sample=samples.index, + ) + + +def get_final_input(wildcards): + inputs = [] + inputs += expand( + "results/qc/quast/{tool}/report.txt", + tool=config["tool"], + ) + if len(samples.index) > 1 and not config["panaroo"]["skip"]: + inputs += expand( + "results/qc/panaroo/{tool}/summary_statistics.txt", + tool=config["tool"], + ) + return inputs + + +# ----------------------------------------------------- +# helper functions +# ----------------------------------------------------- +def format_bakta_locustag(raw): + """Format locustag for BAKTA annotation.""" + tag = str(raw) + # uppercase for BAKTA + tag_up = tag.upper() + # keep only A-Z0-9 + cleaned = re.sub(r"[^A-Z0-9]", "", tag_up) + if len(cleaned) < 3 or len(cleaned) > 12: + raise ValueError( + f"locustag '{raw}' -> '{cleaned}' must contain between 3-12 alphanumeric uppercase characters\n" + ) + if not re.match(r"^[A-Z]", cleaned): + raise ValueError(f"locustag '{raw}' -> '{cleaned}' must start with a letter") + # warn if cleaned tag is different from original + if cleaned != tag: + logger.warning( + f"\nlocustag '{raw}' converted to '{cleaned}' to meet BAKTA requirements (between 3 and 12 alphanumeric uppercase characters, start with a letter)\n" + ) + return cleaned diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk new file mode 100644 index 0000000..6135aa1 --- /dev/null +++ b/workflow/rules/qc.smk @@ -0,0 +1,94 @@ +rule quast: + input: + fasta=get_quast_fasta, + output: + report="results/qc/quast/{tool}/report.txt", + conda: + "../envs/quast.yml" + message: + """--- Running QUAST quality check for all assemblies ---""" + params: + outdir=lambda wc, output: os.path.dirname(output.report), + ref_fasta=( + " ".join(["-r", config["quast"]["reference_fasta"]]) + if config["quast"]["reference_fasta"] + else [] + ), + ref_gff=( + " ".join(["-g", config["quast"]["reference_gff"]]) + if config["quast"]["reference_gff"] + else [] + ), + extra=config["quast"]["extra"], + threads: 4 + log: + "results/qc/quast/{tool}/quast.log", + shell: + """ + quast \ + --output-dir {params.outdir} \ + --threads {threads} \ + {params.ref_fasta} \ + {params.ref_gff} \ + {params.extra} \ + {input.fasta} \ + > {log} 2>&1 + """ + + +rule prepare_panaroo: + input: + fasta="results/annotation/{tool}/{sample}/{sample}.fna", + gff="results/annotation/{tool}/{sample}/{sample}.gff", + output: + fasta="results/qc/panaroo/{tool}/prepare/{sample}.fna", + gff="results/qc/panaroo/{tool}/prepare/{sample}.gff", + conda: + "../envs/panaroo.yml" + message: + """--- Prepare input files for pan-genome alignment ---""" + params: + remove_source=config["panaroo"]["remove_source"], + remove_feature=config["panaroo"]["remove_feature"], + log: + "results/qc/panaroo/{tool}/prepare/{sample}.log", + shell: + """ + echo 'Preparing annotation for Panaroo:' > {log}; + echo ' - formatting seqnames in FASTA files' >> {log}; + awk '{{ sub(/>.*\\|/, ">"); sub(/[[:space:]].*$/, ""); print }}' \ + {input.fasta} > {output.fasta} 2>> {log}; + echo ' - removing sequences and selected features in GFF files' >> {log}; + awk ' /^##FASTA/ {{exit}} $2 !~ /{params.remove_source}/ && $3 !~ /{params.remove_feature}/ {{print}}' \ + {input.gff} > {output.gff} 2>> {log} + """ + + +rule panaroo: + input: + gff=get_panaroo_gff, + fasta=get_panaroo_fasta, + output: + stats="results/qc/panaroo/{tool}/summary_statistics.txt", + conda: + "../envs/panaroo.yml" + message: + """--- Running PANAROO to create pangenome from all annotations ---""" + params: + outdir=lambda wc, output: os.path.dirname(output.stats), + extra=config["panaroo"]["extra"], + threads: 4 + log: + "results/qc/panaroo/{tool}/panaroo.log", + shell: + """ + printf '%s\n' {input.gff} | \ + paste -d ' ' - <(printf '%s\n' {input.fasta}) \ + > {params.outdir}/input_files.txt; + panaroo \ + -i {params.outdir}/input_files.txt \ + -o {params.outdir} \ + -t {threads} \ + {params.extra} \ + > {log} 2>&1 + """ diff --git a/workflow/scripts/prepare_yaml_files.py b/workflow/scripts/prepare_yaml_files.py index 22d4f0a..5f1914f 100644 --- a/workflow/scripts/prepare_yaml_files.py +++ b/workflow/scripts/prepare_yaml_files.py @@ -21,7 +21,7 @@ # define helper functions -def read_yaml_to_dict(filepath): +def read_yaml_to_dict(filepath: str) -> None: try: with open(filepath, "r") as f: return yaml.safe_load(f) @@ -35,32 +35,31 @@ def read_yaml_to_dict(filepath): return None -def write_dict_to_yaml(dic, filepath): +def write_dict_to_yaml(outdict: dict, filepath: str) -> None: try: with open(filepath, "w") as f: - yaml.safe_dump(dic, f, default_flow_style=False) + yaml.safe_dump(outdict, f, default_flow_style=False) sys.stderr.write(f"Wrote YAML file: {filepath}\n") except Exception as e: sys.stderr.write(f"Error writing YAML file {filepath}: {e}\n") -generic_dic = read_yaml_to_dict(generic_template) -submol_dic = read_yaml_to_dict(submol_template) +generic_dict = read_yaml_to_dict(generic_template) +submol_dict = read_yaml_to_dict(submol_template) -generic_dic["fasta"]["location"] = os.path.basename(fasta) -generic_dic["submol"]["location"] = submol_yaml +generic_dict["fasta"]["location"] = os.path.basename(fasta) +generic_dict["submol"]["location"] = os.path.abspath(submol_yaml) if not organism: organism = samples.loc[sample]["species"] -submol_dic["organism"]["genus_species"] = organism - -submol_dic["organism"]["strain"] = samples.loc[sample]["strain"] +submol_dict["organism"]["genus_species"] = organism +submol_dict["organism"]["strain"] = samples.loc[sample]["strain"] if not locus_tag: locus_tag = "Spy" -submol_dic["locus_tag_prefix"] = locus_tag +submol_dict["locus_tag_prefix"] = locus_tag -write_dict_to_yaml(generic_dic, input_yaml) -write_dict_to_yaml(submol_dic, submol_yaml) +write_dict_to_yaml(generic_dict, input_yaml) +write_dict_to_yaml(submol_dict, submol_yaml) sys.stderr.write(f"Module 'prepare yaml files' finished successfully!")