From 04542596282eec63d5d54dfdce18b260b1590172 Mon Sep 17 00:00:00 2001 From: Rina Ahmed-Begrich Date: Fri, 5 Dec 2025 09:57:24 +0100 Subject: [PATCH 01/18] fix: remove option to define output dir in config. --- .test/config/config.yml | 1 - config/config.yml | 1 - config/schemas/config.schema.yml | 3 --- workflow/Snakefile | 5 +---- 4 files changed, 1 insertion(+), 9 deletions(-) diff --git a/.test/config/config.yml b/.test/config/config.yml index 290a42e..f2d1cc9 100644 --- a/.test/config/config.yml +++ b/.test/config/config.yml @@ -1,5 +1,4 @@ samplesheet: "config/samples.csv" -outdir: "results" pgap: bin: "path/to/pgap.py" diff --git a/config/config.yml b/config/config.yml index 290a42e..f2d1cc9 100644 --- a/config/config.yml +++ b/config/config.yml @@ -1,5 +1,4 @@ samplesheet: "config/samples.csv" -outdir: "results" pgap: bin: "path/to/pgap.py" diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml index 7dead6d..e636fbd 100644 --- a/config/schemas/config.schema.yml +++ b/config/schemas/config.schema.yml @@ -6,9 +6,6 @@ properties: samplesheet: type: string description: Path to the sample sheet file - outdir: - type: string - description: Output directory for results pgap: type: object properties: diff --git a/workflow/Snakefile b/workflow/Snakefile index 6ae7d4d..0cd35ac 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -20,10 +20,7 @@ configfile: "config/config.yml" # set outout directory -OUTDIR = config.get("outdir", "") - -if not OUTDIR: - OUTDIR = f"{os.getcwd()}/results" +OUTDIR = f"{os.getcwd()}/results" # ----------------------------------------------------- From 1579c874494d406216f77c15ccd3a300e89d4859 Mon Sep 17 00:00:00 2001 From: m-jahn Date: Fri, 5 Dec 2025 11:41:33 +0100 Subject: [PATCH 02/18] feat: added prokka for annotation --- .test/config/config.yml | 5 +++++ README.md | 4 +++- config/README.md | 6 +++--- resources/.gitignore | 1 + workflow/Snakefile | 5 +++-- workflow/envs/prokka.yml | 7 +++++++ workflow/rules/annotate.smk | 35 ++++++++++++++++++++++++++++++++++- 7 files changed, 56 insertions(+), 7 deletions(-) create mode 100644 resources/.gitignore create mode 100644 workflow/envs/prokka.yml diff --git a/.test/config/config.yml b/.test/config/config.yml index 290a42e..932a2f0 100644 --- a/.test/config/config.yml +++ b/.test/config/config.yml @@ -1,5 +1,6 @@ samplesheet: "config/samples.csv" outdir: "results" +tool: "prokka" pgap: bin: "path/to/pgap.py" @@ -7,3 +8,7 @@ pgap: prepare_yaml_files: generic: "config/generic.yaml" submol: "config/submol.yaml" + +prokka: + center: "" + extra: "--addgenes" diff --git a/README.md b/README.md index 05db0da..4493195 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,9 @@ If you use this workflow in a paper, don't forget to give credits to the authors ## Workflow overview 1. Parse `samples.csv` table containing the samples's meta data (`python`) -2. Annotate assemblies using NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)) +2. Annotate assemblies using one of the following tools: + 1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)) + 2. [prokka](https://github.com/tseemann/prokka) ## Requirements diff --git a/config/README.md b/config/README.md index 3326ef0..17692b8 100644 --- a/config/README.md +++ b/config/README.md @@ -5,9 +5,9 @@ This workflow requires `fasta` input data. The samplesheet table has the following layout: -| sample | species | strain | id_prefix | file | -| ----------- | ------------ | ------------- | ------------- | ------------- | -| EC2224 | "Streptococcus pyogenes" | SF370 | Spy | assembly.fasta | +| sample | species | strain | id_prefix | file | +| ------ | ------------------------ | ------ | --------- | -------------- | +| EC2224 | "Streptococcus pyogenes" | SF370 | Spy | assembly.fasta | ### Execution diff --git a/resources/.gitignore b/resources/.gitignore new file mode 100644 index 0000000..9abb766 --- /dev/null +++ b/resources/.gitignore @@ -0,0 +1 @@ +.* \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 6ae7d4d..21d0c82 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -19,7 +19,7 @@ container: "oras://ghcr.io/mpusp/snakemake-assembly-postprocessing:latest" configfile: "config/config.yml" -# set outout directory +# set output directory OUTDIR = config.get("outdir", "") if not OUTDIR: @@ -74,7 +74,8 @@ onerror: rule all: input: expand( - os.path.join(OUTDIR, "annotation/pgap/{sample}/{sample}.gff"), + os.path.join(OUTDIR, "annotation/{tool}/{sample}/{sample}.gff"), + tool=config["tool"], sample=samples.index, ), default_target: True diff --git a/workflow/envs/prokka.yml b/workflow/envs/prokka.yml new file mode 100644 index 0000000..2859a3a --- /dev/null +++ b/workflow/envs/prokka.yml @@ -0,0 +1,7 @@ +name: base +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - prokka=1.14.6 diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index f695f80..31c3ced 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -57,7 +57,7 @@ rule annotate_pgap: conda: "../envs/base.yml" message: - """--- Run PGAP annotation for sample {wildcards.sample} ---""" + """--- Running PGAP annotation for sample {wildcards.sample} ---""" params: pgap=config["pgap"]["bin"], use_yaml_config=config["pgap"]["use_yaml_config"], @@ -85,3 +85,36 @@ rule annotate_pgap: "--no-self-update " "-g {input} -s '{params.species}' &>> {log}; " "fi; " + + +rule annotate_prokka: + input: + fasta=rules.get_fasta.output.fasta, + output: + os.path.join(OUTDIR, "annotation/prokka/{sample}/{sample}.gff"), + conda: + "../envs/prokka.yml" + message: + """--- Running PROKKA annotation for sample {wildcards.sample} ---""" + params: + prefix=lambda wc: wc.sample, + locustag=lambda wc: samples.loc[wc.sample]["id_prefix"], + species=lambda wc: samples.loc[wc.sample]["species"], + strain=lambda wc: samples.loc[wc.sample]["strain"], + outdir=lambda wc, output: os.path.dirname(output[0]), + extra=config["prokka"]["extra"], + threads: workflow.cores * 0.25 + log: + os.path.join(OUTDIR, "annotation/prokka/logs/{sample}_prokka.log"), + shell: + """ + prokka \ + --cpus {threads} \ + --locustag {params.locustag} \ + --species '{params.species}' \ + --strain {params.strain} \ + --prefix {params.prefix} \ + --outdir {params.outdir} \ + --force {params.extra} \ + {input.fasta} &>> {log} + """ From 7e9dfc0fb89daaa7ad6c961eb7dc083c56cbbc94 Mon Sep 17 00:00:00 2001 From: m-jahn Date: Fri, 5 Dec 2025 11:44:12 +0100 Subject: [PATCH 03/18] fix: updated schema --- config/config.yml | 5 +++++ config/schemas/config.schema.yml | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/config/config.yml b/config/config.yml index 290a42e..932a2f0 100644 --- a/config/config.yml +++ b/config/config.yml @@ -1,5 +1,6 @@ samplesheet: "config/samples.csv" outdir: "results" +tool: "prokka" pgap: bin: "path/to/pgap.py" @@ -7,3 +8,7 @@ pgap: prepare_yaml_files: generic: "config/generic.yaml" submol: "config/submol.yaml" + +prokka: + center: "" + extra: "--addgenes" diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml index 7dead6d..17eaa39 100644 --- a/config/schemas/config.schema.yml +++ b/config/schemas/config.schema.yml @@ -9,6 +9,12 @@ properties: outdir: type: string description: Output directory for results + tool: + type: string + description: Annotation tool to use + enum: + - prokka + - pgap pgap: type: object properties: @@ -34,7 +40,21 @@ properties: - bin - use_yaml_config - prepare_yaml_files + prokka: + type: object + properties: + center: + type: string + description: Center name for Prokka annotation (used in sequence IDs) + extra: + type: string + description: Extra command-line arguments for Prokka + required: + - center + - extra required: - samplesheet + - tool - pgap + - prokka From 4a88b34ec0f871fea7c3d1ae12b22b5858a3ff63 Mon Sep 17 00:00:00 2001 From: m-jahn Date: Fri, 5 Dec 2025 14:07:23 +0100 Subject: [PATCH 04/18] feat: added bakta for annotation --- .github/workflows/main.yml | 6 +-- .test/config/config.yml | 4 ++ README.md | 5 ++- config/config.yml | 4 ++ config/schemas/config.schema.yml | 14 +++++++ workflow/envs/bakta.yml | 7 ++++ workflow/rules/annotate.smk | 67 ++++++++++++++++++++++++++++++-- 7 files changed, 98 insertions(+), 9 deletions(-) create mode 100644 workflow/envs/bakta.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index bc1d054..3991806 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [main, dev] + branches: [main] pull_request: - branches: [main, dev] + branches: [main] jobs: Formatting: @@ -47,7 +47,7 @@ jobs: with: directory: .test snakefile: workflow/Snakefile - args: "--sdm conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache -n" + args: "--sdm conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache" - name: Test report uses: snakemake/snakemake-github-action@v2.0.0 diff --git a/.test/config/config.yml b/.test/config/config.yml index 932a2f0..46c3512 100644 --- a/.test/config/config.yml +++ b/.test/config/config.yml @@ -12,3 +12,7 @@ pgap: prokka: center: "" extra: "--addgenes" + +bakta: + db: "light" + extra: "--keep-contig-headers --compliant" diff --git a/README.md b/README.md index 4493195..dbf4310 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,9 @@ If you use this workflow in a paper, don't forget to give credits to the authors 1. Parse `samples.csv` table containing the samples's meta data (`python`) 2. Annotate assemblies using one of the following tools: - 1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)) - 2. [prokka](https://github.com/tseemann/prokka) + 1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)). Note: needs to be installed manually + 2. [prokka](https://github.com/tseemann/prokka), a fast and light-weight prokaryotic annotation tool + 3. [bakta](https://github.com/oschwengers/bakta), a fast, alignment-free annotation tool. Note: Bakta will automatically download its companion database from zenodo (light: 1.5 GB, full: 40 GB) ## Requirements diff --git a/config/config.yml b/config/config.yml index 932a2f0..3d59fb6 100644 --- a/config/config.yml +++ b/config/config.yml @@ -12,3 +12,7 @@ pgap: prokka: center: "" extra: "--addgenes" + +bakta: + db: "light" + extra: "" diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml index 17eaa39..f0c5247 100644 --- a/config/schemas/config.schema.yml +++ b/config/schemas/config.schema.yml @@ -15,6 +15,7 @@ properties: enum: - prokka - pgap + - bakta pgap: type: object properties: @@ -52,9 +53,22 @@ properties: required: - center - extra + bakta: + type: object + properties: + db: + type: string + description: Bakta database type, one of 'full', 'light' + extra: + type: string + description: Extra command-line arguments for Bakta + required: + - db + - extra required: - samplesheet - tool - pgap - prokka + - bakta diff --git a/workflow/envs/bakta.yml b/workflow/envs/bakta.yml new file mode 100644 index 0000000..1c833f7 --- /dev/null +++ b/workflow/envs/bakta.yml @@ -0,0 +1,7 @@ +name: base +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - bakta=1.9.4 diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index 31c3ced..75a0790 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -99,7 +99,8 @@ rule annotate_prokka: params: prefix=lambda wc: wc.sample, locustag=lambda wc: samples.loc[wc.sample]["id_prefix"], - species=lambda wc: samples.loc[wc.sample]["species"], + genus=lambda wc: samples.loc[wc.sample]["species"].split(" ")[0], + species=lambda wc: samples.loc[wc.sample]["species"].split(" ")[1], strain=lambda wc: samples.loc[wc.sample]["strain"], outdir=lambda wc, output: os.path.dirname(output[0]), extra=config["prokka"]["extra"], @@ -109,12 +110,70 @@ rule annotate_prokka: shell: """ prokka \ - --cpus {threads} \ --locustag {params.locustag} \ - --species '{params.species}' \ + --genus {params.genus} \ + --species {params.species} \ --strain {params.strain} \ --prefix {params.prefix} \ --outdir {params.outdir} \ --force {params.extra} \ - {input.fasta} &>> {log} + --cpus {threads} \ + {input.fasta} &> {log} + """ + + +rule get_bakta_db: + output: + db=directory(os.path.join(OUTDIR, "annotation/bakta/database/db")), + conda: + "../envs/bakta.yml" + message: + """--- Getting BAKTA database for annotation ---""" + params: + db=config["bakta"]["db"], + threads: workflow.cores * 0.25 + log: + os.path.join(OUTDIR, "annotation/bakta/database/db.log"), + shell: + """ + echo 'The most recent of the following available bakta DBs is downloaded:' > {log}; + bakta_db list > {log}; + bakta_db download --output {output.db} --type {params.db} &> {log} + """ + + +rule annotate_bakta: + input: + fasta=rules.get_fasta.output.fasta, + db=rules.get_bakta_db.output.db, + output: + os.path.join(OUTDIR, "annotation/bakta/{sample}/{sample}.gff"), + conda: + "../envs/bakta.yml" + message: + """--- Running BAKTA annotation for sample {wildcards.sample} ---""" + params: + prefix=lambda wc: wc.sample, + locustag=lambda wc: samples.loc[wc.sample]["id_prefix"], + species=lambda wc: samples.loc[wc.sample]["species"], + strain=lambda wc: samples.loc[wc.sample]["strain"], + outdir=lambda wc, output: os.path.dirname(output[0]), + subdir="db" if config["bakta"]["db"] == "full" else "db-light", + extra=config["bakta"]["extra"], + threads: workflow.cores * 0.25 + log: + os.path.join(OUTDIR, "annotation/bakta/logs/{sample}_bakta.log"), + shell: + """ + bakta \ + --db {input.db}/{params.subdir} \ + --prefix {params.prefix} \ + --output {params.outdir} \ + --locus-tag {params.locustag} \ + --species '{params.species}' \ + --strain {params.strain} \ + --threads {threads} \ + --force {params.extra} \ + {input.fasta} &> {log}; + mv {output}3 {output} """ From 1b6695f502b40e88d5e43cdd38be1b06662a2d88 Mon Sep 17 00:00:00 2001 From: m-jahn Date: Fri, 5 Dec 2025 14:26:10 +0100 Subject: [PATCH 05/18] fix: use results as standard output dir --- workflow/Snakefile | 8 ++------ workflow/rules/annotate.smk | 37 +++++++++++++------------------------ 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index f048102..fb2a11d 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -19,10 +19,6 @@ container: "oras://ghcr.io/mpusp/snakemake-assembly-postprocessing:latest" configfile: "config/config.yml" -# set outout directory -OUTDIR = f"{os.getcwd()}/results" - - # ----------------------------------------------------- # load rules # ----------------------------------------------------- @@ -41,7 +37,7 @@ end = "\033[0m" msg = f"""\nSnakemake-assembly-postprocessing: A Snakemake workflow for the post-processing of microbial genome assemblies.""" -prolog = f"Output directory: {OUTDIR}" +prolog = f"Output directory: ./results" year = datetime.today().year epilog = f""" @@ -71,7 +67,7 @@ onerror: rule all: input: expand( - os.path.join(OUTDIR, "annotation/{tool}/{sample}/{sample}.gff"), + "results/annotation/{tool}/{sample}/{sample}.gff", tool=config["tool"], sample=samples.index, ), diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index 75a0790..10407f8 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -2,15 +2,11 @@ rule get_fasta: input: get_fasta, output: - fasta=os.path.join( - OUTDIR, "annotation/pgap/prepare_files/{sample}/genome.fasta" - ), + fasta="results/annotation/pgap/prepare_files/{sample}/genome.fasta", conda: "../envs/base.yml" log: - os.path.join( - OUTDIR, "annotation/pgap/prepare_files/logs/{sample}_get_fasta.log" - ), + "results/annotation/pgap/prepare_files/logs/{sample}_get_fasta.log", shell: "INPUT=$(realpath {input}); " "ln -s ${{INPUT}} {output}; " @@ -21,12 +17,8 @@ rule prepare_yaml_files: input: fasta=rules.get_fasta.output.fasta, output: - input_yaml=os.path.join( - OUTDIR, "annotation/pgap/prepare_files/{sample}/input.yaml" - ), - submol_yaml=os.path.join( - OUTDIR, "annotation/pgap/prepare_files/{sample}/submol.yaml" - ), + input_yaml="results/annotation/pgap/prepare_files/{sample}/input.yaml", + submol_yaml="results/annotation/pgap/prepare_files/{sample}/submol.yaml", conda: "../envs/base.yml" params: @@ -37,10 +29,7 @@ rule prepare_yaml_files: sample="{sample}", pd_samples=samples, log: - os.path.join( - OUTDIR, - "annotation/pgap/prepare_files/logs/{sample}_prepare_yaml_files.log", - ), + "results/annotation/pgap/prepare_files/logs/{sample}_prepare_yaml_files.log", script: "../scripts/prepare_yaml_files.py" @@ -53,7 +42,7 @@ rule annotate_pgap: otherwise=rules.get_fasta.output.fasta, ), output: - os.path.join(OUTDIR, "annotation/pgap/{sample}/{sample}.gff"), + "results/annotation/pgap/{sample}/{sample}.gff", conda: "../envs/base.yml" message: @@ -65,7 +54,7 @@ rule annotate_pgap: outdir=lambda wc, output: os.path.dirname(output[0]), threads: 1 log: - os.path.join(OUTDIR, "annotation/pgap/logs/{sample}_pgap.log"), + "results/annotation/pgap/logs/{sample}_pgap.log", shell: "rm -rf {params.outdir}; " "if [ {params.use_yaml_config} == 'True' ]; then " @@ -91,7 +80,7 @@ rule annotate_prokka: input: fasta=rules.get_fasta.output.fasta, output: - os.path.join(OUTDIR, "annotation/prokka/{sample}/{sample}.gff"), + "results/annotation/prokka/{sample}/{sample}.gff", conda: "../envs/prokka.yml" message: @@ -106,7 +95,7 @@ rule annotate_prokka: extra=config["prokka"]["extra"], threads: workflow.cores * 0.25 log: - os.path.join(OUTDIR, "annotation/prokka/logs/{sample}_prokka.log"), + "results/annotation/prokka/logs/{sample}_prokka.log", shell: """ prokka \ @@ -124,7 +113,7 @@ rule annotate_prokka: rule get_bakta_db: output: - db=directory(os.path.join(OUTDIR, "annotation/bakta/database/db")), + db=directory("results/annotation/bakta/database/db"), conda: "../envs/bakta.yml" message: @@ -133,7 +122,7 @@ rule get_bakta_db: db=config["bakta"]["db"], threads: workflow.cores * 0.25 log: - os.path.join(OUTDIR, "annotation/bakta/database/db.log"), + "results/annotation/bakta/database/db.log", shell: """ echo 'The most recent of the following available bakta DBs is downloaded:' > {log}; @@ -147,7 +136,7 @@ rule annotate_bakta: fasta=rules.get_fasta.output.fasta, db=rules.get_bakta_db.output.db, output: - os.path.join(OUTDIR, "annotation/bakta/{sample}/{sample}.gff"), + "results/annotation/bakta/{sample}/{sample}.gff", conda: "../envs/bakta.yml" message: @@ -162,7 +151,7 @@ rule annotate_bakta: extra=config["bakta"]["extra"], threads: workflow.cores * 0.25 log: - os.path.join(OUTDIR, "annotation/bakta/logs/{sample}_bakta.log"), + "results/annotation/bakta/logs/{sample}_bakta.log", shell: """ bakta \ From 7d9f6cdbc1dc78b00f4d33344d245c70a375b7fe Mon Sep 17 00:00:00 2001 From: m-jahn Date: Mon, 8 Dec 2025 10:47:54 +0100 Subject: [PATCH 06/18] feat: added quast, allow multiple annotation tools, allow local bakta db --- .test/config/config.yml | 9 ++++++-- .test/config/samples.csv | 2 +- config/config.yml | 9 ++++++-- config/schemas/config.schema.yml | 32 +++++++++++++++++++++-------- workflow/Snakefile | 4 ++-- workflow/envs/quast.yml | 7 +++++++ workflow/rules/annotate.smk | 35 +++++++++++++++++++++++--------- workflow/rules/common.smk | 17 +++++++++++++++- workflow/rules/qc.smk | 32 +++++++++++++++++++++++++++++ 9 files changed, 121 insertions(+), 26 deletions(-) create mode 100644 workflow/envs/quast.yml create mode 100644 workflow/rules/qc.smk diff --git a/.test/config/config.yml b/.test/config/config.yml index 595142c..fa5a8d5 100644 --- a/.test/config/config.yml +++ b/.test/config/config.yml @@ -1,5 +1,5 @@ samplesheet: "config/samples.csv" -tool: "prokka" +tool: ["prokka"] pgap: bin: "path/to/pgap.py" @@ -13,5 +13,10 @@ prokka: extra: "--addgenes" bakta: - db: "light" + download_db: "light" + existing_db: "" extra: "--keep-contig-headers --compliant" + +quast: + reference: "" + extra: "" diff --git a/.test/config/samples.csv b/.test/config/samples.csv index 4f368e3..5a46dd9 100644 --- a/.test/config/samples.csv +++ b/.test/config/samples.csv @@ -1,2 +1,2 @@ sample,species,strain,id_prefix,file -EC2224,"Streptococcus pyogenes",SF370,SPY,"data/assembly.fasta" \ No newline at end of file +EC2224,"Streptococcus pyogenes",SF370,SPY,"data/assembly.fasta" diff --git a/config/config.yml b/config/config.yml index 595142c..fa5a8d5 100644 --- a/config/config.yml +++ b/config/config.yml @@ -1,5 +1,5 @@ samplesheet: "config/samples.csv" -tool: "prokka" +tool: ["prokka"] pgap: bin: "path/to/pgap.py" @@ -13,5 +13,10 @@ prokka: extra: "--addgenes" bakta: - db: "light" + download_db: "light" + existing_db: "" extra: "--keep-contig-headers --compliant" + +quast: + reference: "" + extra: "" diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml index b913caa..ac37e43 100644 --- a/config/schemas/config.schema.yml +++ b/config/schemas/config.schema.yml @@ -7,12 +7,14 @@ properties: type: string description: Path to the sample sheet file tool: - type: string + type: array description: Annotation tool to use - enum: - - prokka - - pgap - - bakta + items: + type: string + enum: + - prokka + - pgap + - bakta pgap: type: object properties: @@ -53,15 +55,28 @@ properties: bakta: type: object properties: - db: + download_db: + type: string + description: Bakta database type, one of 'full', 'light', or 'none' if existing is used + existing_db: type: string - description: Bakta database type, one of 'full', 'light' + description: Path to an existing Bakta database (optional) extra: type: string description: Extra command-line arguments for Bakta required: - - db + - download_db + - existing_db - extra + quast: + type: object + properties: + reference: + type: string + description: Path to the reference genome for QUAST + extra: + type: string + description: Extra command-line arguments for QUAST required: - samplesheet @@ -69,3 +84,4 @@ required: - pgap - prokka - bakta + - quast diff --git a/workflow/Snakefile b/workflow/Snakefile index fb2a11d..9ae255e 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -24,6 +24,7 @@ configfile: "config/config.yml" # ----------------------------------------------------- include: "rules/common.smk" include: "rules/annotate.smk" +include: "rules/qc.smk" # ----------------------------------------------------- @@ -67,8 +68,7 @@ onerror: rule all: input: expand( - "results/annotation/{tool}/{sample}/{sample}.gff", + "results/qc/quast/{tool}/report.txt", tool=config["tool"], - sample=samples.index, ), default_target: True diff --git a/workflow/envs/quast.yml b/workflow/envs/quast.yml new file mode 100644 index 0000000..0fbc104 --- /dev/null +++ b/workflow/envs/quast.yml @@ -0,0 +1,7 @@ +name: base +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - quast=5.3.0 diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index 10407f8..c92f09a 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -80,7 +80,8 @@ rule annotate_prokka: input: fasta=rules.get_fasta.output.fasta, output: - "results/annotation/prokka/{sample}/{sample}.gff", + gff="results/annotation/prokka/{sample}/{sample}.gff", + fasta="results/annotation/prokka/{sample}/{sample}.fna", conda: "../envs/prokka.yml" message: @@ -113,21 +114,35 @@ rule annotate_prokka: rule get_bakta_db: output: - db=directory("results/annotation/bakta/database/db"), + db=branch( + lookup(dpath="bakta/download_db", within=config), + cases={ + "full": directory("results/annotation/bakta/database/db"), + "light": directory("results/annotation/bakta/database/db-light"), + "none": directory("results/annotation/bakta/database/custom"), + }, + ), conda: "../envs/bakta.yml" message: """--- Getting BAKTA database for annotation ---""" params: - db=config["bakta"]["db"], + download_db=config["bakta"]["download_db"], + existing_db=config["bakta"]["existing_db"], + outdir=lambda wc, output: os.path.dirname(output[0]), threads: workflow.cores * 0.25 log: "results/annotation/bakta/database/db.log", shell: """ - echo 'The most recent of the following available bakta DBs is downloaded:' > {log}; - bakta_db list > {log}; - bakta_db download --output {output.db} --type {params.db} &> {log} + if [ {params.download_db} != 'none' ]; then + echo 'The most recent of the following available Bakta DBs is downloaded:' > {log}; + bakta_db list >> {log}; + bakta_db download --output {params.outdir} --type {params.download_db} &>> {log}; + else + echo 'Using Bakta DB from supplied input dir: {params.existing_db}' > {log}; + ln -s {params.existing_db} {output.db} + fi """ @@ -136,7 +151,8 @@ rule annotate_bakta: fasta=rules.get_fasta.output.fasta, db=rules.get_bakta_db.output.db, output: - "results/annotation/bakta/{sample}/{sample}.gff", + gff="results/annotation/bakta/{sample}/{sample}.gff", + fasta="results/annotation/bakta/{sample}/{sample}.fna", conda: "../envs/bakta.yml" message: @@ -147,7 +163,6 @@ rule annotate_bakta: species=lambda wc: samples.loc[wc.sample]["species"], strain=lambda wc: samples.loc[wc.sample]["strain"], outdir=lambda wc, output: os.path.dirname(output[0]), - subdir="db" if config["bakta"]["db"] == "full" else "db-light", extra=config["bakta"]["extra"], threads: workflow.cores * 0.25 log: @@ -155,7 +170,7 @@ rule annotate_bakta: shell: """ bakta \ - --db {input.db}/{params.subdir} \ + --db {input.db} \ --prefix {params.prefix} \ --output {params.outdir} \ --locus-tag {params.locustag} \ @@ -164,5 +179,5 @@ rule annotate_bakta: --threads {threads} \ --force {params.extra} \ {input.fasta} &> {log}; - mv {output}3 {output} + mv {output.gff}3 {output.gff} """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 72684e1..9300f94 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -23,6 +23,21 @@ def get_fasta(wildcards): sample = wildcards.sample if sample not in samples.index: raise ValueError(f"Sample {sample} not found in samplesheet.") - # return the fasta file path return samples.loc[sample, "file"] + + +def get_quast_fasta(wildcards): + return expand( + "results/annotation/{tool}/{sample}/{sample}.fna", + tool=wildcards.tool, + sample=samples.index, + ) + + +def get_quast_gff(wildcards): + return expand( + "results/annotation/{tool}/{sample}/{sample}.gff", + tool=wildcards.tool, + sample=samples.index, + ) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk new file mode 100644 index 0000000..1403c1b --- /dev/null +++ b/workflow/rules/qc.smk @@ -0,0 +1,32 @@ +rule quast: + input: + fasta=get_quast_fasta, + gff=get_quast_gff, + output: + report="results/qc/quast/{tool}/report.txt", + conda: + "../envs/quast.yml" + message: + """--- Running QUAST quality check for all assemblies ---""" + params: + outdir=lambda wc, output: os.path.dirname(output.report), + ref=( + " ".join(["-r", config["quast"]["reference"]]) + if config["quast"]["reference"] + else [] + ), + extra=config["quast"]["extra"], + threads: workflow.cores * 0.25 + log: + "results/qc/quast/{tool}/quast.log", + shell: + """ + quast \ + --output-dir {params.outdir} \ + --threads {threads} \ + --features {input.gff} \ + {params.ref} \ + {params.extra} \ + {input.fasta} \ + > {log} 2>&1 + """ From f4db893567c9375d62bb2bd6a419ab86eed4b0b6 Mon Sep 17 00:00:00 2001 From: jahn Date: Mon, 8 Dec 2025 14:35:34 +0100 Subject: [PATCH 07/18] fix: use absolute input paths for pgap helpers --- workflow/rules/annotate.smk | 3 ++- workflow/scripts/prepare_yaml_files.py | 25 ++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index c92f09a..602e607 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -42,7 +42,8 @@ rule annotate_pgap: otherwise=rules.get_fasta.output.fasta, ), output: - "results/annotation/pgap/{sample}/{sample}.gff", + gff="results/annotation/pgap/{sample}/{sample}.gff", + fasta="results/annotation/pgap/{sample}/{sample}.fna", conda: "../envs/base.yml" message: diff --git a/workflow/scripts/prepare_yaml_files.py b/workflow/scripts/prepare_yaml_files.py index 22d4f0a..c8846de 100644 --- a/workflow/scripts/prepare_yaml_files.py +++ b/workflow/scripts/prepare_yaml_files.py @@ -21,7 +21,7 @@ # define helper functions -def read_yaml_to_dict(filepath): +def read_yaml_to_dict(filepath: str) -> None: try: with open(filepath, "r") as f: return yaml.safe_load(f) @@ -35,32 +35,31 @@ def read_yaml_to_dict(filepath): return None -def write_dict_to_yaml(dic, filepath): +def write_dict_to_yaml(outdict: dict, filepath: str) -> None: try: with open(filepath, "w") as f: - yaml.safe_dump(dic, f, default_flow_style=False) + yaml.safe_dump(outdict, f, default_flow_style=False) sys.stderr.write(f"Wrote YAML file: {filepath}\n") except Exception as e: sys.stderr.write(f"Error writing YAML file {filepath}: {e}\n") -generic_dic = read_yaml_to_dict(generic_template) -submol_dic = read_yaml_to_dict(submol_template) +generic_dict = read_yaml_to_dict(generic_template) +submol_dict = read_yaml_to_dict(submol_template) -generic_dic["fasta"]["location"] = os.path.basename(fasta) -generic_dic["submol"]["location"] = submol_yaml +generic_dict["fasta"]["location"] = os.path.abspath(fasta) +generic_dict["submol"]["location"] = os.path.abspath(submol_yaml) if not organism: organism = samples.loc[sample]["species"] -submol_dic["organism"]["genus_species"] = organism - -submol_dic["organism"]["strain"] = samples.loc[sample]["strain"] +submol_dict["organism"]["genus_species"] = organism +submol_dict["organism"]["strain"] = samples.loc[sample]["strain"] if not locus_tag: locus_tag = "Spy" -submol_dic["locus_tag_prefix"] = locus_tag +submol_dict["locus_tag_prefix"] = locus_tag -write_dict_to_yaml(generic_dic, input_yaml) -write_dict_to_yaml(submol_dic, submol_yaml) +write_dict_to_yaml(generic_dict, input_yaml) +write_dict_to_yaml(submol_dict, submol_yaml) sys.stderr.write(f"Module 'prepare yaml files' finished successfully!") From 84c07a67fb10053b1a435afe2923652a75a50101 Mon Sep 17 00:00:00 2001 From: m-jahn Date: Tue, 9 Dec 2025 11:51:25 +0100 Subject: [PATCH 08/18] fix: use correct input for quast reference, closes #7 --- .test/config/config.yml | 3 ++- config/config.yml | 3 ++- config/schemas/config.schema.yml | 5 ++++- workflow/rules/qc.smk | 16 ++++++++++------ 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/.test/config/config.yml b/.test/config/config.yml index fa5a8d5..6650ef0 100644 --- a/.test/config/config.yml +++ b/.test/config/config.yml @@ -18,5 +18,6 @@ bakta: extra: "--keep-contig-headers --compliant" quast: - reference: "" + reference_fasta: "" + reference_gff: "" extra: "" diff --git a/config/config.yml b/config/config.yml index fa5a8d5..6650ef0 100644 --- a/config/config.yml +++ b/config/config.yml @@ -18,5 +18,6 @@ bakta: extra: "--keep-contig-headers --compliant" quast: - reference: "" + reference_fasta: "" + reference_gff: "" extra: "" diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml index ac37e43..b461fc1 100644 --- a/config/schemas/config.schema.yml +++ b/config/schemas/config.schema.yml @@ -71,9 +71,12 @@ properties: quast: type: object properties: - reference: + reference_fasta: type: string description: Path to the reference genome for QUAST + reference_gff: + type: string + description: Path to the reference annotation for QUAST extra: type: string description: Extra command-line arguments for QUAST diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 1403c1b..1e8a9ce 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -1,7 +1,6 @@ rule quast: input: fasta=get_quast_fasta, - gff=get_quast_gff, output: report="results/qc/quast/{tool}/report.txt", conda: @@ -10,9 +9,14 @@ rule quast: """--- Running QUAST quality check for all assemblies ---""" params: outdir=lambda wc, output: os.path.dirname(output.report), - ref=( - " ".join(["-r", config["quast"]["reference"]]) - if config["quast"]["reference"] + ref_fasta=( + " ".join(["-r", config["quast"]["reference_fasta"]]) + if config["quast"]["reference_fasta"] + else [] + ), + ref_gff=( + " ".join(["-g", config["quast"]["reference_gff"]]) + if config["quast"]["reference_gff"] else [] ), extra=config["quast"]["extra"], @@ -24,8 +28,8 @@ rule quast: quast \ --output-dir {params.outdir} \ --threads {threads} \ - --features {input.gff} \ - {params.ref} \ + {params.ref_fasta} \ + {params.ref_gff} \ {params.extra} \ {input.fasta} \ > {log} 2>&1 From db385f0fce62cad5b188d865b8f84cab1e13b270 Mon Sep 17 00:00:00 2001 From: Rina Ahmed-Begrich Date: Tue, 9 Dec 2025 13:10:26 +0100 Subject: [PATCH 09/18] fix: update armdfinder db when local db is used. update bakta version. --- workflow/envs/bakta.yml | 2 +- workflow/rules/annotate.smk | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/workflow/envs/bakta.yml b/workflow/envs/bakta.yml index 1c833f7..526650b 100644 --- a/workflow/envs/bakta.yml +++ b/workflow/envs/bakta.yml @@ -4,4 +4,4 @@ channels: - bioconda - nodefaults dependencies: - - bakta=1.9.4 + - bakta=1.11.4 diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index 602e607..9e3b4d3 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -142,7 +142,9 @@ rule get_bakta_db: bakta_db download --output {params.outdir} --type {params.download_db} &>> {log}; else echo 'Using Bakta DB from supplied input dir: {params.existing_db}' > {log}; - ln -s {params.existing_db} {output.db} + ln -s {params.existing_db} {output.db}; + echo 'Update ARMFinderPlus DB using supplied input dir: {params.existing_db}' > {log}; + amrfinder_update --force_update --database {params.existing_db}/amrfinderplus-db fi """ From a85d20bf22201ee00cffb5ae22047207fc07ad80 Mon Sep 17 00:00:00 2001 From: Rina Ahmed-Begrich Date: Tue, 9 Dec 2025 13:14:00 +0100 Subject: [PATCH 10/18] fix: pipe armfinder_update to log. --- workflow/rules/annotate.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index 9e3b4d3..5a8a372 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -144,7 +144,7 @@ rule get_bakta_db: echo 'Using Bakta DB from supplied input dir: {params.existing_db}' > {log}; ln -s {params.existing_db} {output.db}; echo 'Update ARMFinderPlus DB using supplied input dir: {params.existing_db}' > {log}; - amrfinder_update --force_update --database {params.existing_db}/amrfinderplus-db + amrfinder_update --force_update --database {params.existing_db}/amrfinderplus-db >> {log} fi """ From 938ac49de7be1258b910dc8167138fa17f8b39e3 Mon Sep 17 00:00:00 2001 From: jahn Date: Tue, 9 Dec 2025 13:32:14 +0100 Subject: [PATCH 11/18] fix: correct env names, use relative path for pgap fasta --- workflow/envs/bakta.yml | 2 +- workflow/envs/base.yml | 6 +++--- workflow/envs/prokka.yml | 2 +- workflow/envs/quast.yml | 2 +- workflow/scripts/prepare_yaml_files.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/workflow/envs/bakta.yml b/workflow/envs/bakta.yml index 526650b..bd2d4d2 100644 --- a/workflow/envs/bakta.yml +++ b/workflow/envs/bakta.yml @@ -1,4 +1,4 @@ -name: base +name: bakta channels: - conda-forge - bioconda diff --git a/workflow/envs/base.yml b/workflow/envs/base.yml index fb1867a..fcd1565 100644 --- a/workflow/envs/base.yml +++ b/workflow/envs/base.yml @@ -4,6 +4,6 @@ channels: - bioconda - nodefaults dependencies: - - python==3.13.0 - - pandas==2.3.3 - - apptainer==1.4.5 + - python=3.13.0 + - pandas=2.3.3 + - apptainer=1.4.5 diff --git a/workflow/envs/prokka.yml b/workflow/envs/prokka.yml index 2859a3a..643ba6b 100644 --- a/workflow/envs/prokka.yml +++ b/workflow/envs/prokka.yml @@ -1,4 +1,4 @@ -name: base +name: prokka channels: - conda-forge - bioconda diff --git a/workflow/envs/quast.yml b/workflow/envs/quast.yml index 0fbc104..9d5f823 100644 --- a/workflow/envs/quast.yml +++ b/workflow/envs/quast.yml @@ -1,4 +1,4 @@ -name: base +name: quast channels: - conda-forge - bioconda diff --git a/workflow/scripts/prepare_yaml_files.py b/workflow/scripts/prepare_yaml_files.py index c8846de..5f1914f 100644 --- a/workflow/scripts/prepare_yaml_files.py +++ b/workflow/scripts/prepare_yaml_files.py @@ -47,7 +47,7 @@ def write_dict_to_yaml(outdict: dict, filepath: str) -> None: generic_dict = read_yaml_to_dict(generic_template) submol_dict = read_yaml_to_dict(submol_template) -generic_dict["fasta"]["location"] = os.path.abspath(fasta) +generic_dict["fasta"]["location"] = os.path.basename(fasta) generic_dict["submol"]["location"] = os.path.abspath(submol_yaml) if not organism: From 20e70b24062e621950812144af5600d21a0f7efe Mon Sep 17 00:00:00 2001 From: jahn Date: Wed, 10 Dec 2025 10:24:42 +0100 Subject: [PATCH 12/18] feat: added panaroo as a genome comparison tool, closes #6 --- .test/config/config.yml | 5 +++ README.md | 11 ++++++ config/config.yml | 5 +++ config/schemas/config.schema.yml | 12 +++++++ workflow/Snakefile | 5 +-- workflow/envs/panaroo.yml | 9 +++++ workflow/rules/annotate.smk | 6 ++-- workflow/rules/common.smk | 27 ++++++++++++-- workflow/rules/qc.smk | 60 +++++++++++++++++++++++++++++++- 9 files changed, 129 insertions(+), 11 deletions(-) create mode 100644 workflow/envs/panaroo.yml diff --git a/.test/config/config.yml b/.test/config/config.yml index 6650ef0..b203c5d 100644 --- a/.test/config/config.yml +++ b/.test/config/config.yml @@ -21,3 +21,8 @@ quast: reference_fasta: "" reference_gff: "" extra: "" + +panaroo: + remove_source: "cmsearch" + remove_feature: "tRNA|rRNA|ncRNA|exon|sequence_feature" + extra: "--clean-mode strict --remove-invalid-genes" diff --git a/README.md b/README.md index dbf4310..0d30436 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![GitHub actions status](https://github.com/MPUSP/snakemake-assembly-postprocessing/actions/workflows/main.yml/badge.svg)](https://github.com/MPUSP/snakemake-assembly-postprocessing/actions/workflows/main.yml) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with apptainer](https://img.shields.io/badge/run%20with-apptainer-1D355C.svg?labelColor=000000)](https://apptainer.org/) +[![workflow catalog](https://img.shields.io/badge/Snakemake%20workflow%20catalog-darkgreen)](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/MPUSP/snakemake-assembly-postprocessing) A Snakemake workflow for the post-processing of microbial genome assemblies. @@ -20,6 +21,8 @@ If you use this workflow in a paper, don't forget to give credits to the authors 1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)). Note: needs to be installed manually 2. [prokka](https://github.com/tseemann/prokka), a fast and light-weight prokaryotic annotation tool 3. [bakta](https://github.com/oschwengers/bakta), a fast, alignment-free annotation tool. Note: Bakta will automatically download its companion database from zenodo (light: 1.5 GB, full: 40 GB) +3. Create a QC report for the assemblies using [Quast](https://github.com/ablab/quast) +4. Create a pangenome analysis (orthologs/homologs) using [Panaroo](https://gthlab.au/panaroo/) ## Requirements @@ -64,6 +67,14 @@ conda activate snakemake-assembly-postprocessing ## References +> Seemann T. _Prokka: rapid prokaryotic genome annotation_. Bioinformatics. **2014** Jul 15;30(14):2068-9. PMID: 24642063. https://doi.org/10.1093/bioinformatics/btu153. + +> Schwengers O, Jelonek L, Dieckmann MA, Beyvers S, Blom J, Goesmann A. _Bakta: rapid and standardized annotation of bacterial genomes via alignment-free sequence identification_. Microb Genom, 7(11):000685 **2021**. PMID: 34739369. https://doi.org/10.1099/mgen.0.000685. + > Li W, O'Neill KR, Haft DH, DiCuccio M, Chetvernin V, Badretdin A, Coulouris G, Chitsaz F, Derbyshire MK, Durkin AS, Gonzales NR, Gwadz M, Lanczycki CJ, Song JS, Thanki N, Wang J, Yamashita RA, Yang M, Zheng C, Marchler-Bauer A, Thibaud-Nissen F. _RefSeq: Expanding the Prokaryotic Genome Annotation Pipeline reach with protein family model curation._ Nucleic Acids Res, **2021** Jan 8;49(D1):D1020-D1028. https://doi.org/10.1093/nar/gkaa1105 +> Gurevich A, Saveliev V, Vyahhi N, Tesler G. _QUAST: quality assessment tool for genome assemblies_. Bioinformatics. 29(8):1072-5, **2013**. PMID: 23422339. https://doi.org/10.1093/bioinformatics/btt086. + +> Tonkin-Hill G, MacAlasdair N, Ruis C, Weimann A, Horesh G, Lees JA, Gladstone RA, Lo S, Beaudoin C, Floto RA, Frost SDW, Corander J, Bentley SD, Parkhill J. _Producing polished prokaryotic pangenomes with the Panaroo pipeline_. Genome Biol. 21(1):180, **2020**. PMID: 32698896. https://doi.org/10.1186/s13059-020-02090-4. + > Köster, J., Mölder, F., Jablonski, K. P., Letcher, B., Hall, M. B., Tomkins-Tinch, C. H., Sochat, V., Forster, J., Lee, S., Twardziok, S. O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., & Nahnsen, S. _Sustainable data analysis with Snakemake_. F1000Research, 10:33, 10, 33, **2021**. https://doi.org/10.12688/f1000research.29032.2. diff --git a/config/config.yml b/config/config.yml index 6650ef0..b203c5d 100644 --- a/config/config.yml +++ b/config/config.yml @@ -21,3 +21,8 @@ quast: reference_fasta: "" reference_gff: "" extra: "" + +panaroo: + remove_source: "cmsearch" + remove_feature: "tRNA|rRNA|ncRNA|exon|sequence_feature" + extra: "--clean-mode strict --remove-invalid-genes" diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml index b461fc1..d52ee03 100644 --- a/config/schemas/config.schema.yml +++ b/config/schemas/config.schema.yml @@ -80,6 +80,18 @@ properties: extra: type: string description: Extra command-line arguments for QUAST + panaroo: + type: object + properties: + remove_source: + type: string + description: Source types to remove in Panaroo (regex supported) + remove_feature: + type: string + description: Feature types to remove in Panaroo (regex supported) + extra: + type: string + description: Extra command-line arguments for Panaroo required: - samplesheet diff --git a/workflow/Snakefile b/workflow/Snakefile index 9ae255e..aafcb6a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -67,8 +67,5 @@ onerror: # ----------------------------------------------------- rule all: input: - expand( - "results/qc/quast/{tool}/report.txt", - tool=config["tool"], - ), + get_final_input, default_target: True diff --git a/workflow/envs/panaroo.yml b/workflow/envs/panaroo.yml new file mode 100644 index 0000000..43ee14a --- /dev/null +++ b/workflow/envs/panaroo.yml @@ -0,0 +1,9 @@ +name: panaroo +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - numpy=1.26.4 + - scipy=1.11.4 + - panaroo=1.5.2 diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index 5a8a372..9fc66b9 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -138,13 +138,13 @@ rule get_bakta_db: """ if [ {params.download_db} != 'none' ]; then echo 'The most recent of the following available Bakta DBs is downloaded:' > {log}; - bakta_db list >> {log}; + bakta_db list &>> {log}; bakta_db download --output {params.outdir} --type {params.download_db} &>> {log}; else echo 'Using Bakta DB from supplied input dir: {params.existing_db}' > {log}; ln -s {params.existing_db} {output.db}; - echo 'Update ARMFinderPlus DB using supplied input dir: {params.existing_db}' > {log}; - amrfinder_update --force_update --database {params.existing_db}/amrfinderplus-db >> {log} + echo 'Update ARMFinderPlus DB using supplied input dir: {params.existing_db}' >> {log}; + amrfinder_update --force_update --database {params.existing_db}/amrfinderplus-db &>> {log} fi """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 9300f94..64bed83 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -23,7 +23,6 @@ def get_fasta(wildcards): sample = wildcards.sample if sample not in samples.index: raise ValueError(f"Sample {sample} not found in samplesheet.") - # return the fasta file path return samples.loc[sample, "file"] @@ -35,9 +34,31 @@ def get_quast_fasta(wildcards): ) -def get_quast_gff(wildcards): +def get_panaroo_gff(wildcards): return expand( - "results/annotation/{tool}/{sample}/{sample}.gff", + "results/qc/panaroo/{tool}/prepare/{sample}.gff", tool=wildcards.tool, sample=samples.index, ) + + +def get_panaroo_fasta(wildcards): + return expand( + "results/qc/panaroo/{tool}/prepare/{sample}.fna", + tool=wildcards.tool, + sample=samples.index, + ) + + +def get_final_input(wildcards): + inputs = [] + inputs += expand( + "results/qc/quast/{tool}/report.txt", + tool=config["tool"], + ) + if len(samples.index) > 1: + inputs += expand( + "results/qc/panaroo/{tool}/summary_statistics.txt", + tool=config["tool"], + ) + return inputs diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 1e8a9ce..6135aa1 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -20,7 +20,7 @@ rule quast: else [] ), extra=config["quast"]["extra"], - threads: workflow.cores * 0.25 + threads: 4 log: "results/qc/quast/{tool}/quast.log", shell: @@ -34,3 +34,61 @@ rule quast: {input.fasta} \ > {log} 2>&1 """ + + +rule prepare_panaroo: + input: + fasta="results/annotation/{tool}/{sample}/{sample}.fna", + gff="results/annotation/{tool}/{sample}/{sample}.gff", + output: + fasta="results/qc/panaroo/{tool}/prepare/{sample}.fna", + gff="results/qc/panaroo/{tool}/prepare/{sample}.gff", + conda: + "../envs/panaroo.yml" + message: + """--- Prepare input files for pan-genome alignment ---""" + params: + remove_source=config["panaroo"]["remove_source"], + remove_feature=config["panaroo"]["remove_feature"], + log: + "results/qc/panaroo/{tool}/prepare/{sample}.log", + shell: + """ + echo 'Preparing annotation for Panaroo:' > {log}; + echo ' - formatting seqnames in FASTA files' >> {log}; + awk '{{ sub(/>.*\\|/, ">"); sub(/[[:space:]].*$/, ""); print }}' \ + {input.fasta} > {output.fasta} 2>> {log}; + echo ' - removing sequences and selected features in GFF files' >> {log}; + awk ' /^##FASTA/ {{exit}} $2 !~ /{params.remove_source}/ && $3 !~ /{params.remove_feature}/ {{print}}' \ + {input.gff} > {output.gff} 2>> {log} + """ + + +rule panaroo: + input: + gff=get_panaroo_gff, + fasta=get_panaroo_fasta, + output: + stats="results/qc/panaroo/{tool}/summary_statistics.txt", + conda: + "../envs/panaroo.yml" + message: + """--- Running PANAROO to create pangenome from all annotations ---""" + params: + outdir=lambda wc, output: os.path.dirname(output.stats), + extra=config["panaroo"]["extra"], + threads: 4 + log: + "results/qc/panaroo/{tool}/panaroo.log", + shell: + """ + printf '%s\n' {input.gff} | \ + paste -d ' ' - <(printf '%s\n' {input.fasta}) \ + > {params.outdir}/input_files.txt; + panaroo \ + -i {params.outdir}/input_files.txt \ + -o {params.outdir} \ + -t {threads} \ + {params.extra} \ + > {log} 2>&1 + """ From 6048f72a5c0ad16ca09f365aeb37bc2175c398f4 Mon Sep 17 00:00:00 2001 From: Rina Ahmed-Begrich Date: Wed, 10 Dec 2025 10:55:50 +0100 Subject: [PATCH 13/18] fix: validate locus tag for bakta annotation; closes #8 --- workflow/rules/annotate.smk | 2 +- workflow/rules/common.smk | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index 9fc66b9..4b9aab0 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -162,7 +162,7 @@ rule annotate_bakta: """--- Running BAKTA annotation for sample {wildcards.sample} ---""" params: prefix=lambda wc: wc.sample, - locustag=lambda wc: samples.loc[wc.sample]["id_prefix"], + locustag=lambda wc: format_bakta_locustag(samples.loc[wc.sample]["id_prefix"]), species=lambda wc: samples.loc[wc.sample]["species"], strain=lambda wc: samples.loc[wc.sample]["strain"], outdir=lambda wc, output: os.path.dirname(output[0]), diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 64bed83..266fbdc 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1,5 +1,7 @@ # import basic packages import pandas as pd +import re +from snakemake import logging from snakemake.utils import validate @@ -62,3 +64,27 @@ def get_final_input(wildcards): tool=config["tool"], ) return inputs + + +# ----------------------------------------------------- +# helper functions +# ----------------------------------------------------- +def format_bakta_locustag(raw): + """Format locustag for BAKTA annotation.""" + tag = str(raw) + # uppercase for BAKTA + tag_up = tag.upper() + # keep only A-Z0-9 + cleaned = re.sub(r"[^A-Z0-9]", "", tag_up) + if len(cleaned) < 3 or len(cleaned) > 12: + raise ValueError( + f"locustag '{raw}' -> '{cleaned}' must contain between 3-12 alphanumeric uppercase characters\n" + ) + if not re.match(r"^[A-Z]", cleaned): + raise ValueError(f"locustag '{raw}' -> '{cleaned}' must start with a letter") + # warn if cleaned tag is different from original + if cleaned != tag: + logger.warning( + f"\nlocustag '{raw}' converted to '{cleaned}' to meet BAKTA requirements (between 3 and 12 alphanumeric uppercase characters, start with a letter)\n" + ) + return cleaned From 0e829f6ca1746221565b90648ed712e5d23018e0 Mon Sep 17 00:00:00 2001 From: jahn Date: Wed, 10 Dec 2025 11:36:42 +0100 Subject: [PATCH 14/18] fix: added option to skip pangenome analysis --- .test/config/config.yml | 1 + config/config.yml | 1 + config/schemas/config.schema.yml | 3 +++ workflow/rules/common.smk | 2 +- 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.test/config/config.yml b/.test/config/config.yml index b203c5d..773cadc 100644 --- a/.test/config/config.yml +++ b/.test/config/config.yml @@ -23,6 +23,7 @@ quast: extra: "" panaroo: + skip: False remove_source: "cmsearch" remove_feature: "tRNA|rRNA|ncRNA|exon|sequence_feature" extra: "--clean-mode strict --remove-invalid-genes" diff --git a/config/config.yml b/config/config.yml index b203c5d..773cadc 100644 --- a/config/config.yml +++ b/config/config.yml @@ -23,6 +23,7 @@ quast: extra: "" panaroo: + skip: False remove_source: "cmsearch" remove_feature: "tRNA|rRNA|ncRNA|exon|sequence_feature" extra: "--clean-mode strict --remove-invalid-genes" diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml index d52ee03..33215e4 100644 --- a/config/schemas/config.schema.yml +++ b/config/schemas/config.schema.yml @@ -83,6 +83,9 @@ properties: panaroo: type: object properties: + skip: + type: boolean + description: Whether to skip Panaroo analysis remove_source: type: string description: Source types to remove in Panaroo (regex supported) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 266fbdc..47364a7 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -58,7 +58,7 @@ def get_final_input(wildcards): "results/qc/quast/{tool}/report.txt", tool=config["tool"], ) - if len(samples.index) > 1: + if len(samples.index) > 1 and not config["panaroo"]["skip"]: inputs += expand( "results/qc/panaroo/{tool}/summary_statistics.txt", tool=config["tool"], From a4a147aa1fe1c18994d46bfd279e9b32afd7633e Mon Sep 17 00:00:00 2001 From: m-jahn Date: Wed, 10 Dec 2025 12:43:20 +0100 Subject: [PATCH 15/18] fix: update readmes --- README.md | 47 +++++++++++++++++++++++++++++++++++++---- config/README.md | 54 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 79 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 0d30436..1af2ba6 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,27 @@ A Snakemake workflow for the post-processing of microbial genome assemblies. +- [snakemake-assembly-postprocessing](#snakemake-assembly-postprocessing) + - [Usage](#usage) + - [Workflow overview](#workflow-overview) + - [Installation](#installation) + - [Deployment options](#deployment-options) + - [Authors](#authors) + - [References](#references) + ## Usage The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/MPUSP/snakemake-assembly-postprocessing). +Detailed information about input data and workflow configuration can also be found in the [`config/README.md`](config/README.md). + If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this repository. +_Workflow overview:_ + + + + ## Workflow overview 1. Parse `samples.csv` table containing the samples's meta data (`python`) @@ -24,10 +39,6 @@ If you use this workflow in a paper, don't forget to give credits to the authors 3. Create a QC report for the assemblies using [Quast](https://github.com/ablab/quast) 4. Create a pangenome analysis (orthologs/homologs) using [Panaroo](https://gthlab.au/panaroo/) -## Requirements - -- [PGAP](https://github.com/ncbi/pgap) - ## Installation **Step 1: Clone this repository** @@ -52,9 +63,37 @@ conda activate snakemake-assembly-postprocessing **Step 4: Install PGAP** +- if you want to use [PGAP](https://github.com/ncbi/pgap) for annotation, it needs to be installed separately - PGAP can be downloaded from https://github.com/ncbi/pgap. Please follow the installation instructions there. - Define the path to the `pgap.py` script (located in the `scripts` folder) in the `config` file (recommended: `./resources`) +## Deployment options + +To run the workflow from command line, change the working directory. + +```bash +cd path/to/snakemake-simple-mapping +``` + +Adjust options in the default config file `config/config.yml`. +Before running the complete workflow, you can perform a dry run using: + +```bash +snakemake --dry-run +``` + +To run the workflow with test files using **conda**: + +```bash +snakemake --cores 2 --sdm conda --directory .test +``` + +To run the workflow with test files using **apptainer**: + +```bash +snakemake --cores 2 --sdm conda apptainer --directory .test +``` + ## Authors - Dr. Rina Ahmed-Begrich diff --git a/config/README.md b/config/README.md index 17692b8..05ae5ee 100644 --- a/config/README.md +++ b/config/README.md @@ -1,32 +1,50 @@ -## Running the workflow - -### Input data +## Workflow overview -This workflow requires `fasta` input data. -The samplesheet table has the following layout: +A Snakemake workflow for the post-processing of microbial genome assemblies. -| sample | species | strain | id_prefix | file | -| ------ | ------------------------ | ------ | --------- | -------------- | -| EC2224 | "Streptococcus pyogenes" | SF370 | Spy | assembly.fasta | +1. Parse `samples.csv` table containing the samples's meta data (`python`) +2. Annotate assemblies using one of the following tools: + 1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)). Note: needs to be installed manually + 2. [prokka](https://github.com/tseemann/prokka), a fast and light-weight prokaryotic annotation tool + 3. [bakta](https://github.com/oschwengers/bakta), a fast, alignment-free annotation tool. Note: Bakta will automatically download its companion database from zenodo (light: 1.5 GB, full: 40 GB) +3. Create a QC report for the assemblies using [Quast](https://github.com/ablab/quast) +4. Create a pangenome analysis (orthologs/homologs) using [Panaroo](https://gthlab.au/panaroo/) -### Execution +## Installation -To run the workflow from command line, change to the working directory and activate the conda environment. +**Step 1: Clone this repository** ```bash +git clone https://github.com/MPUSP/snakemake-assembly-postprocessing.git cd snakemake-assembly-postprocessing -conda activate snakemake-assembly-postprocessing ``` -Adjust options in the default config file `config/config.yml`. -Before running the entire workflow, perform a dry run using: +**Step 2: Install dependencies** -```bash -snakemake --cores 1 --sdm conda --directory .test --dry-run -``` +It is recommended to install snakemake and run the workflow with `conda` or `mamba`. [Miniforge](https://conda-forge.org/download/) is the preferred conda-forge installer and includes `conda`, `mamba` and their dependencies. + +**Step 3: Create snakemake environment** -To run the workflow with test files using **conda**: +This step creates a new conda environment called `snakemake-assembly-postprocessing`. ```bash -snakemake --cores 1 --sdm conda --directory .test +mamba create -c conda-forge -c bioconda -n snakemake-assembly-postprocessing snakemake pandas +conda activate snakemake-assembly-postprocessing ``` + +**Step 4: Install PGAP** + +- if you want to use [PGAP](https://github.com/ncbi/pgap) for annotation, it needs to be installed separately +- PGAP can be downloaded from https://github.com/ncbi/pgap. Please follow the installation instructions there. +- Define the path to the `pgap.py` script (located in the `scripts` folder) in the `config` file (recommended: `./resources`) + +## Running the workflow + +### Input data + +This workflow requires `fasta` input data. +The samplesheet table has the following layout: + +| sample | species | strain | id_prefix | file | +| ------ | ------------------------ | ------ | --------- | -------------- | +| EC2224 | "Streptococcus pyogenes" | SF370 | SPY | assembly.fasta | From b1d0d34f43934bcc91db506a95c1358db282c36b Mon Sep 17 00:00:00 2001 From: Rina Ahmed-Begrich Date: Wed, 10 Dec 2025 14:54:43 +0100 Subject: [PATCH 16/18] fix: added missing dag. --- README.md | 5 +- config/README.md | 30 +------- resources/images/dag.svg | 157 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 31 deletions(-) create mode 100644 resources/images/dag.svg diff --git a/README.md b/README.md index 1af2ba6..1368e45 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,6 @@ If you use this workflow in a paper, don't forget to give credits to the authors _Workflow overview:_ - ## Workflow overview @@ -72,14 +71,14 @@ conda activate snakemake-assembly-postprocessing To run the workflow from command line, change the working directory. ```bash -cd path/to/snakemake-simple-mapping +cd snakemake-assembly-postprocessing ``` Adjust options in the default config file `config/config.yml`. Before running the complete workflow, you can perform a dry run using: ```bash -snakemake --dry-run +snakemake --cores 1 --dry-run ``` To run the workflow with test files using **conda**: diff --git a/config/README.md b/config/README.md index 05ae5ee..a5c8d18 100644 --- a/config/README.md +++ b/config/README.md @@ -10,34 +10,6 @@ A Snakemake workflow for the post-processing of microbial genome assemblies. 3. Create a QC report for the assemblies using [Quast](https://github.com/ablab/quast) 4. Create a pangenome analysis (orthologs/homologs) using [Panaroo](https://gthlab.au/panaroo/) -## Installation - -**Step 1: Clone this repository** - -```bash -git clone https://github.com/MPUSP/snakemake-assembly-postprocessing.git -cd snakemake-assembly-postprocessing -``` - -**Step 2: Install dependencies** - -It is recommended to install snakemake and run the workflow with `conda` or `mamba`. [Miniforge](https://conda-forge.org/download/) is the preferred conda-forge installer and includes `conda`, `mamba` and their dependencies. - -**Step 3: Create snakemake environment** - -This step creates a new conda environment called `snakemake-assembly-postprocessing`. - -```bash -mamba create -c conda-forge -c bioconda -n snakemake-assembly-postprocessing snakemake pandas -conda activate snakemake-assembly-postprocessing -``` - -**Step 4: Install PGAP** - -- if you want to use [PGAP](https://github.com/ncbi/pgap) for annotation, it needs to be installed separately -- PGAP can be downloaded from https://github.com/ncbi/pgap. Please follow the installation instructions there. -- Define the path to the `pgap.py` script (located in the `scripts` folder) in the `config` file (recommended: `./resources`) - ## Running the workflow ### Input data @@ -48,3 +20,5 @@ The samplesheet table has the following layout: | sample | species | strain | id_prefix | file | | ------ | ------------------------ | ------ | --------- | -------------- | | EC2224 | "Streptococcus pyogenes" | SF370 | SPY | assembly.fasta | + +**Note:** Pangenome analysis with `Panaroo` requires at least two samples. \ No newline at end of file diff --git a/resources/images/dag.svg b/resources/images/dag.svg new file mode 100644 index 0000000..344f877 --- /dev/null +++ b/resources/images/dag.svg @@ -0,0 +1,157 @@ + + + + + + +snakemake_dag + + + +0 + +all + + + +1 + +quast + + + +1->0 + + + + + +2 + +annotate_prokka + + + +2->1 + + + + + +9 + +prepare_panaroo + + + +2->9 + + + + + +3 + +get_fasta + + + +3->2 + + + + + +5 + +prepare_yaml_files + + + +3->5 + + + + + +6 + +annotate_bakta + + + +3->6 + + + + + +4 + +annotate_pgap + + + +4->1 + + + + + +4->9 + + + + + +5->4 + + + + + +6->1 + + + + + +6->9 + + + + + +7 + +get_bakta_db + + + +7->6 + + + + + +8 + +panaroo + + + +8->0 + + + + + +9->8 + + + + + From 8fa11b19776d0415e9cdf2c8e50b7911558572d1 Mon Sep 17 00:00:00 2001 From: Rina Ahmed-Begrich Date: Wed, 10 Dec 2025 15:39:28 +0100 Subject: [PATCH 17/18] chore: define parameters in config readme. --- config/README.md | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/config/README.md b/config/README.md index a5c8d18..4d77e2c 100644 --- a/config/README.md +++ b/config/README.md @@ -20,5 +20,36 @@ The samplesheet table has the following layout: | sample | species | strain | id_prefix | file | | ------ | ------------------------ | ------ | --------- | -------------- | | EC2224 | "Streptococcus pyogenes" | SF370 | SPY | assembly.fasta | +| ... | ... | ... | ... | ... | -**Note:** Pangenome analysis with `Panaroo` requires at least two samples. \ No newline at end of file +**Note:** Pangenome analysis with `Panaroo` requires at least two samples. + +### Parameters + +This table lists all parameters that can be used to run the workflow. + +| Parameter | Type | Details | Default | +|:---|:---|:---|:---| +| **samplesheet** | string | Path to the sample sheet file in csv format | | +| **tool** | array[string] | Annotation tool to use (one of `prokka`, `pgap`, `bakta`) | | +| **pgap** | | PGAP configuration object | | +| bin | string | Path to the PGAP script | | +| use_yaml_config | boolean | Whether to use YAML configuration for PGAP | `False` | +| _prepare_yaml_files_ | | Paths to YAML templates for PGAP | | +| generic | string | Path to the generic YAML configuration file | | +| submol | string | Path to the submol YAML configuration file | | +| **prokka** | | Prokka configuration object | | +| center | string | Center name for Prokka annotation (used in sequence IDs) | | +| extra | string | Extra command-line arguments for Prokka | | +| **bakta** | | Bakta configuration object | | +| download_db | string | Bakta database type (`full`, `light`, or `none`) | `light` | +| existing_db | string | Path to an existing Bakta database (optional). Needs to be combined with `download_db='none'` | | +| extra | string | Extra command-line arguments for Bakta | | +| **quast** | | QUAST configuration object | | +| reference_fasta | string | Path to the reference genome for QUAST | | +| reference_gff | string | Path to the reference annotation for QUAST | +| extra | string | Extra command-line arguments for QUAST | | +| **panaroo** | | Panaroo configuration object | | +| remove_source | string | Source types to remove in Panaroo (regex supported) | `cmsearch` | +| remove_feature | string | Feature types to remove in Panaroo (regex supported) | `tRNA\|rRNA\|ncRNA\|exon\|sequence_feature` | +| extra | string | Extra command-line arguments for Panaroo | | From dcbd2f49b0db125f941d8ba5634350908284bc98 Mon Sep 17 00:00:00 2001 From: Rina Ahmed-Begrich Date: Wed, 10 Dec 2025 15:49:57 +0100 Subject: [PATCH 18/18] chore: update defaults in config readme --- config/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config/README.md b/config/README.md index 4d77e2c..d864e26 100644 --- a/config/README.md +++ b/config/README.md @@ -40,10 +40,10 @@ This table lists all parameters that can be used to run the workflow. | submol | string | Path to the submol YAML configuration file | | | **prokka** | | Prokka configuration object | | | center | string | Center name for Prokka annotation (used in sequence IDs) | | -| extra | string | Extra command-line arguments for Prokka | | +| extra | string | Extra command-line arguments for Prokka | `--addgenes` | | **bakta** | | Bakta configuration object | | | download_db | string | Bakta database type (`full`, `light`, or `none`) | `light` | -| existing_db | string | Path to an existing Bakta database (optional). Needs to be combined with `download_db='none'` | | +| existing_db | string | Path to an existing Bakta database (optional). Needs to be combined with `download_db='none'` | `--keep-contig-headers --compliant` | | extra | string | Extra command-line arguments for Bakta | | | **quast** | | QUAST configuration object | | | reference_fasta | string | Path to the reference genome for QUAST | | @@ -52,4 +52,4 @@ This table lists all parameters that can be used to run the workflow. | **panaroo** | | Panaroo configuration object | | | remove_source | string | Source types to remove in Panaroo (regex supported) | `cmsearch` | | remove_feature | string | Feature types to remove in Panaroo (regex supported) | `tRNA\|rRNA\|ncRNA\|exon\|sequence_feature` | -| extra | string | Extra command-line arguments for Panaroo | | +| extra | string | Extra command-line arguments for Panaroo | `--clean-mode strict --remove-invalid-genes` |