From 04542596282eec63d5d54dfdce18b260b1590172 Mon Sep 17 00:00:00 2001
From: Rina Ahmed-Begrich <begrich@mpusp.mpg.de>
Date: Fri, 5 Dec 2025 09:57:24 +0100
Subject: [PATCH 01/18] fix: remove option to define output dir in config.

---
 .test/config/config.yml          | 1 -
 config/config.yml                | 1 -
 config/schemas/config.schema.yml | 3 ---
 workflow/Snakefile               | 5 +----
 4 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/.test/config/config.yml b/.test/config/config.yml
index 290a42e..f2d1cc9 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -1,5 +1,4 @@
 samplesheet: "config/samples.csv"
-outdir: "results"
 
 pgap:
   bin: "path/to/pgap.py"
diff --git a/config/config.yml b/config/config.yml
index 290a42e..f2d1cc9 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -1,5 +1,4 @@
 samplesheet: "config/samples.csv"
-outdir: "results"
 
 pgap:
   bin: "path/to/pgap.py"
diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml
index 7dead6d..e636fbd 100644
--- a/config/schemas/config.schema.yml
+++ b/config/schemas/config.schema.yml
@@ -6,9 +6,6 @@ properties:
   samplesheet:
     type: string
     description: Path to the sample sheet file
-  outdir:
-    type: string
-    description: Output directory for results
   pgap:
     type: object
     properties:
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6ae7d4d..0cd35ac 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -20,10 +20,7 @@ configfile: "config/config.yml"
 
 
 # set outout directory
-OUTDIR = config.get("outdir", "")
-
-if not OUTDIR:
-    OUTDIR = f"{os.getcwd()}/results"
+OUTDIR = f"{os.getcwd()}/results"
 
 
 # -----------------------------------------------------

From 1579c874494d406216f77c15ccd3a300e89d4859 Mon Sep 17 00:00:00 2001
From: m-jahn <jahn@mpusp.mpg.de>
Date: Fri, 5 Dec 2025 11:41:33 +0100
Subject: [PATCH 02/18] feat: added prokka for annotation

---
 .test/config/config.yml     |  5 +++++
 README.md                   |  4 +++-
 config/README.md            |  6 +++---
 resources/.gitignore        |  1 +
 workflow/Snakefile          |  5 +++--
 workflow/envs/prokka.yml    |  7 +++++++
 workflow/rules/annotate.smk | 35 ++++++++++++++++++++++++++++++++++-
 7 files changed, 56 insertions(+), 7 deletions(-)
 create mode 100644 resources/.gitignore
 create mode 100644 workflow/envs/prokka.yml

diff --git a/.test/config/config.yml b/.test/config/config.yml
index 290a42e..932a2f0 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -1,5 +1,6 @@
 samplesheet: "config/samples.csv"
 outdir: "results"
+tool: "prokka"
 
 pgap:
   bin: "path/to/pgap.py"
@@ -7,3 +8,7 @@ pgap:
   prepare_yaml_files:
     generic: "config/generic.yaml"
     submol: "config/submol.yaml"
+
+prokka:
+  center: ""
+  extra: "--addgenes"
diff --git a/README.md b/README.md
index 05db0da..4493195 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,9 @@ If you use this workflow in a paper, don't forget to give credits to the authors
 ## Workflow overview
 
 1. Parse `samples.csv` table containing the samples's meta data (`python`)
-2. Annotate assemblies using NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap))
+2. Annotate assemblies using one of the following tools:
+   1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap))
+   2. [prokka](https://github.com/tseemann/prokka)
 
 ## Requirements
 
diff --git a/config/README.md b/config/README.md
index 3326ef0..17692b8 100644
--- a/config/README.md
+++ b/config/README.md
@@ -5,9 +5,9 @@
 This workflow requires `fasta` input data.
 The samplesheet table has the following layout:
 
-| sample | species | strain | id_prefix | file |
-| ----------- | ------------ | ------------- | ------------- | ------------- |
-| EC2224 | "Streptococcus pyogenes" | SF370 | Spy | assembly.fasta |
+| sample | species                  | strain | id_prefix | file           |
+| ------ | ------------------------ | ------ | --------- | -------------- |
+| EC2224 | "Streptococcus pyogenes" | SF370  | Spy       | assembly.fasta |
 
 ### Execution
 
diff --git a/resources/.gitignore b/resources/.gitignore
new file mode 100644
index 0000000..9abb766
--- /dev/null
+++ b/resources/.gitignore
@@ -0,0 +1 @@
+.*
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6ae7d4d..21d0c82 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -19,7 +19,7 @@ container: "oras://ghcr.io/mpusp/snakemake-assembly-postprocessing:latest"
 configfile: "config/config.yml"
 
 
-# set outout directory
+# set output directory
 OUTDIR = config.get("outdir", "")
 
 if not OUTDIR:
@@ -74,7 +74,8 @@ onerror:
 rule all:
     input:
         expand(
-            os.path.join(OUTDIR, "annotation/pgap/{sample}/{sample}.gff"),
+            os.path.join(OUTDIR, "annotation/{tool}/{sample}/{sample}.gff"),
+            tool=config["tool"],
             sample=samples.index,
         ),
     default_target: True
diff --git a/workflow/envs/prokka.yml b/workflow/envs/prokka.yml
new file mode 100644
index 0000000..2859a3a
--- /dev/null
+++ b/workflow/envs/prokka.yml
@@ -0,0 +1,7 @@
+name: base
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - prokka=1.14.6
diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk
index f695f80..31c3ced 100644
--- a/workflow/rules/annotate.smk
+++ b/workflow/rules/annotate.smk
@@ -57,7 +57,7 @@ rule annotate_pgap:
     conda:
         "../envs/base.yml"
     message:
-        """--- Run PGAP annotation for sample {wildcards.sample} ---"""
+        """--- Running PGAP annotation for sample {wildcards.sample} ---"""
     params:
         pgap=config["pgap"]["bin"],
         use_yaml_config=config["pgap"]["use_yaml_config"],
@@ -85,3 +85,36 @@ rule annotate_pgap:
         "--no-self-update "
         "-g {input} -s '{params.species}' &>> {log}; "
         "fi; "
+
+
+rule annotate_prokka:
+    input:
+        fasta=rules.get_fasta.output.fasta,
+    output:
+        os.path.join(OUTDIR, "annotation/prokka/{sample}/{sample}.gff"),
+    conda:
+        "../envs/prokka.yml"
+    message:
+        """--- Running PROKKA annotation for sample {wildcards.sample} ---"""
+    params:
+        prefix=lambda wc: wc.sample,
+        locustag=lambda wc: samples.loc[wc.sample]["id_prefix"],
+        species=lambda wc: samples.loc[wc.sample]["species"],
+        strain=lambda wc: samples.loc[wc.sample]["strain"],
+        outdir=lambda wc, output: os.path.dirname(output[0]),
+        extra=config["prokka"]["extra"],
+    threads: workflow.cores * 0.25
+    log:
+        os.path.join(OUTDIR, "annotation/prokka/logs/{sample}_prokka.log"),
+    shell:
+        """
+        prokka \
+          --cpus {threads} \
+          --locustag {params.locustag} \
+          --species '{params.species}' \
+          --strain {params.strain} \
+          --prefix {params.prefix} \
+          --outdir {params.outdir} \
+          --force {params.extra} \
+          {input.fasta} &>> {log}
+        """

From 7e9dfc0fb89daaa7ad6c961eb7dc083c56cbbc94 Mon Sep 17 00:00:00 2001
From: m-jahn <jahn@mpusp.mpg.de>
Date: Fri, 5 Dec 2025 11:44:12 +0100
Subject: [PATCH 03/18] fix: updated schema

---
 config/config.yml                |  5 +++++
 config/schemas/config.schema.yml | 20 ++++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/config/config.yml b/config/config.yml
index 290a42e..932a2f0 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -1,5 +1,6 @@
 samplesheet: "config/samples.csv"
 outdir: "results"
+tool: "prokka"
 
 pgap:
   bin: "path/to/pgap.py"
@@ -7,3 +8,7 @@ pgap:
   prepare_yaml_files:
     generic: "config/generic.yaml"
     submol: "config/submol.yaml"
+
+prokka:
+  center: ""
+  extra: "--addgenes"
diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml
index 7dead6d..17eaa39 100644
--- a/config/schemas/config.schema.yml
+++ b/config/schemas/config.schema.yml
@@ -9,6 +9,12 @@ properties:
   outdir:
     type: string
     description: Output directory for results
+  tool:
+    type: string
+    description: Annotation tool to use
+    enum:
+      - prokka
+      - pgap
   pgap:
     type: object
     properties:
@@ -34,7 +40,21 @@ properties:
       - bin
       - use_yaml_config
       - prepare_yaml_files
+  prokka:
+    type: object
+    properties:
+      center:
+        type: string
+        description: Center name for Prokka annotation (used in sequence IDs)
+      extra:
+        type: string
+        description: Extra command-line arguments for Prokka
+    required:
+      - center
+      - extra
 
 required:
   - samplesheet
+  - tool
   - pgap
+  - prokka

From 4a88b34ec0f871fea7c3d1ae12b22b5858a3ff63 Mon Sep 17 00:00:00 2001
From: m-jahn <jahn@mpusp.mpg.de>
Date: Fri, 5 Dec 2025 14:07:23 +0100
Subject: [PATCH 04/18] feat: added bakta for annotation

---
 .github/workflows/main.yml       |  6 +--
 .test/config/config.yml          |  4 ++
 README.md                        |  5 ++-
 config/config.yml                |  4 ++
 config/schemas/config.schema.yml | 14 +++++++
 workflow/envs/bakta.yml          |  7 ++++
 workflow/rules/annotate.smk      | 67 ++++++++++++++++++++++++++++++--
 7 files changed, 98 insertions(+), 9 deletions(-)
 create mode 100644 workflow/envs/bakta.yml

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index bc1d054..3991806 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -2,9 +2,9 @@ name: CI
 
 on:
   push:
-    branches: [main, dev]
+    branches: [main]
   pull_request:
-    branches: [main, dev]
+    branches: [main]
 
 jobs:
   Formatting:
@@ -47,7 +47,7 @@ jobs:
         with:
           directory: .test
           snakefile: workflow/Snakefile
-          args: "--sdm conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache -n"
+          args: "--sdm conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache"
 
       - name: Test report
         uses: snakemake/snakemake-github-action@v2.0.0
diff --git a/.test/config/config.yml b/.test/config/config.yml
index 932a2f0..46c3512 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -12,3 +12,7 @@ pgap:
 prokka:
   center: ""
   extra: "--addgenes"
+
+bakta:
+  db: "light"
+  extra: "--keep-contig-headers --compliant"
diff --git a/README.md b/README.md
index 4493195..dbf4310 100644
--- a/README.md
+++ b/README.md
@@ -17,8 +17,9 @@ If you use this workflow in a paper, don't forget to give credits to the authors
 
 1. Parse `samples.csv` table containing the samples's meta data (`python`)
 2. Annotate assemblies using one of the following tools:
-   1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap))
-   2. [prokka](https://github.com/tseemann/prokka)
+   1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)). Note: needs to be installed manually
+   2. [prokka](https://github.com/tseemann/prokka), a fast and light-weight prokaryotic annotation tool
+   3. [bakta](https://github.com/oschwengers/bakta), a fast, alignment-free annotation tool. Note: Bakta will automatically download its companion database from zenodo (light: 1.5 GB, full: 40 GB)
 
 ## Requirements
 
diff --git a/config/config.yml b/config/config.yml
index 932a2f0..3d59fb6 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -12,3 +12,7 @@ pgap:
 prokka:
   center: ""
   extra: "--addgenes"
+
+bakta:
+  db: "light"
+  extra: ""
diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml
index 17eaa39..f0c5247 100644
--- a/config/schemas/config.schema.yml
+++ b/config/schemas/config.schema.yml
@@ -15,6 +15,7 @@ properties:
     enum:
       - prokka
       - pgap
+      - bakta
   pgap:
     type: object
     properties:
@@ -52,9 +53,22 @@ properties:
     required:
       - center
       - extra
+  bakta:
+    type: object
+    properties:
+      db:
+        type: string
+        description: Bakta database type, one of 'full', 'light'
+      extra:
+        type: string
+        description: Extra command-line arguments for Bakta
+    required:
+      - db
+      - extra
 
 required:
   - samplesheet
   - tool
   - pgap
   - prokka
+  - bakta
diff --git a/workflow/envs/bakta.yml b/workflow/envs/bakta.yml
new file mode 100644
index 0000000..1c833f7
--- /dev/null
+++ b/workflow/envs/bakta.yml
@@ -0,0 +1,7 @@
+name: base
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - bakta=1.9.4
diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk
index 31c3ced..75a0790 100644
--- a/workflow/rules/annotate.smk
+++ b/workflow/rules/annotate.smk
@@ -99,7 +99,8 @@ rule annotate_prokka:
     params:
         prefix=lambda wc: wc.sample,
         locustag=lambda wc: samples.loc[wc.sample]["id_prefix"],
-        species=lambda wc: samples.loc[wc.sample]["species"],
+        genus=lambda wc: samples.loc[wc.sample]["species"].split(" ")[0],
+        species=lambda wc: samples.loc[wc.sample]["species"].split(" ")[1],
         strain=lambda wc: samples.loc[wc.sample]["strain"],
         outdir=lambda wc, output: os.path.dirname(output[0]),
         extra=config["prokka"]["extra"],
@@ -109,12 +110,70 @@ rule annotate_prokka:
     shell:
         """
         prokka \
-          --cpus {threads} \
           --locustag {params.locustag} \
-          --species '{params.species}' \
+          --genus {params.genus} \
+          --species {params.species} \
           --strain {params.strain} \
           --prefix {params.prefix} \
           --outdir {params.outdir} \
           --force {params.extra} \
-          {input.fasta} &>> {log}
+          --cpus {threads} \
+          {input.fasta} &> {log}
+        """
+
+
+rule get_bakta_db:
+    output:
+        db=directory(os.path.join(OUTDIR, "annotation/bakta/database/db")),
+    conda:
+        "../envs/bakta.yml"
+    message:
+        """--- Getting BAKTA database for annotation ---"""
+    params:
+        db=config["bakta"]["db"],
+    threads: workflow.cores * 0.25
+    log:
+        os.path.join(OUTDIR, "annotation/bakta/database/db.log"),
+    shell:
+        """
+        echo 'The most recent of the following available bakta DBs is downloaded:' > {log};
+        bakta_db list > {log};
+        bakta_db download --output {output.db} --type {params.db} &> {log}
+        """
+
+
+rule annotate_bakta:
+    input:
+        fasta=rules.get_fasta.output.fasta,
+        db=rules.get_bakta_db.output.db,
+    output:
+        os.path.join(OUTDIR, "annotation/bakta/{sample}/{sample}.gff"),
+    conda:
+        "../envs/bakta.yml"
+    message:
+        """--- Running BAKTA annotation for sample {wildcards.sample} ---"""
+    params:
+        prefix=lambda wc: wc.sample,
+        locustag=lambda wc: samples.loc[wc.sample]["id_prefix"],
+        species=lambda wc: samples.loc[wc.sample]["species"],
+        strain=lambda wc: samples.loc[wc.sample]["strain"],
+        outdir=lambda wc, output: os.path.dirname(output[0]),
+        subdir="db" if config["bakta"]["db"] == "full" else "db-light",
+        extra=config["bakta"]["extra"],
+    threads: workflow.cores * 0.25
+    log:
+        os.path.join(OUTDIR, "annotation/bakta/logs/{sample}_bakta.log"),
+    shell:
+        """
+        bakta \
+          --db {input.db}/{params.subdir} \
+          --prefix {params.prefix} \
+          --output {params.outdir} \
+          --locus-tag {params.locustag} \
+          --species '{params.species}' \
+          --strain {params.strain} \
+          --threads {threads} \
+          --force {params.extra} \
+          {input.fasta} &> {log};
+          mv {output}3 {output}
         """

From 1b6695f502b40e88d5e43cdd38be1b06662a2d88 Mon Sep 17 00:00:00 2001
From: m-jahn <jahn@mpusp.mpg.de>
Date: Fri, 5 Dec 2025 14:26:10 +0100
Subject: [PATCH 05/18] fix: use results as standard output dir

---
 workflow/Snakefile          |  8 ++------
 workflow/rules/annotate.smk | 37 +++++++++++++------------------------
 2 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index f048102..fb2a11d 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -19,10 +19,6 @@ container: "oras://ghcr.io/mpusp/snakemake-assembly-postprocessing:latest"
 configfile: "config/config.yml"
 
 
-# set outout directory
-OUTDIR = f"{os.getcwd()}/results"
-
-
 # -----------------------------------------------------
 # load rules
 # -----------------------------------------------------
@@ -41,7 +37,7 @@ end = "\033[0m"
 msg = f"""\nSnakemake-assembly-postprocessing: A Snakemake workflow 
 for the post-processing of microbial genome assemblies."""
 
-prolog = f"Output directory: {OUTDIR}"
+prolog = f"Output directory: ./results"
 year = datetime.today().year
 
 epilog = f"""
@@ -71,7 +67,7 @@ onerror:
 rule all:
     input:
         expand(
-            os.path.join(OUTDIR, "annotation/{tool}/{sample}/{sample}.gff"),
+            "results/annotation/{tool}/{sample}/{sample}.gff",
             tool=config["tool"],
             sample=samples.index,
         ),
diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk
index 75a0790..10407f8 100644
--- a/workflow/rules/annotate.smk
+++ b/workflow/rules/annotate.smk
@@ -2,15 +2,11 @@ rule get_fasta:
     input:
         get_fasta,
     output:
-        fasta=os.path.join(
-            OUTDIR, "annotation/pgap/prepare_files/{sample}/genome.fasta"
-        ),
+        fasta="results/annotation/pgap/prepare_files/{sample}/genome.fasta",
     conda:
         "../envs/base.yml"
     log:
-        os.path.join(
-            OUTDIR, "annotation/pgap/prepare_files/logs/{sample}_get_fasta.log"
-        ),
+        "results/annotation/pgap/prepare_files/logs/{sample}_get_fasta.log",
     shell:
         "INPUT=$(realpath {input}); "
         "ln -s ${{INPUT}} {output}; "
@@ -21,12 +17,8 @@ rule prepare_yaml_files:
     input:
         fasta=rules.get_fasta.output.fasta,
     output:
-        input_yaml=os.path.join(
-            OUTDIR, "annotation/pgap/prepare_files/{sample}/input.yaml"
-        ),
-        submol_yaml=os.path.join(
-            OUTDIR, "annotation/pgap/prepare_files/{sample}/submol.yaml"
-        ),
+        input_yaml="results/annotation/pgap/prepare_files/{sample}/input.yaml",
+        submol_yaml="results/annotation/pgap/prepare_files/{sample}/submol.yaml",
     conda:
         "../envs/base.yml"
     params:
@@ -37,10 +29,7 @@ rule prepare_yaml_files:
         sample="{sample}",
         pd_samples=samples,
     log:
-        os.path.join(
-            OUTDIR,
-            "annotation/pgap/prepare_files/logs/{sample}_prepare_yaml_files.log",
-        ),
+        "results/annotation/pgap/prepare_files/logs/{sample}_prepare_yaml_files.log",
     script:
         "../scripts/prepare_yaml_files.py"
 
@@ -53,7 +42,7 @@ rule annotate_pgap:
             otherwise=rules.get_fasta.output.fasta,
         ),
     output:
-        os.path.join(OUTDIR, "annotation/pgap/{sample}/{sample}.gff"),
+        "results/annotation/pgap/{sample}/{sample}.gff",
     conda:
         "../envs/base.yml"
     message:
@@ -65,7 +54,7 @@ rule annotate_pgap:
         outdir=lambda wc, output: os.path.dirname(output[0]),
     threads: 1
     log:
-        os.path.join(OUTDIR, "annotation/pgap/logs/{sample}_pgap.log"),
+        "results/annotation/pgap/logs/{sample}_pgap.log",
     shell:
         "rm -rf {params.outdir}; "
         "if [ {params.use_yaml_config} == 'True' ]; then "
@@ -91,7 +80,7 @@ rule annotate_prokka:
     input:
         fasta=rules.get_fasta.output.fasta,
     output:
-        os.path.join(OUTDIR, "annotation/prokka/{sample}/{sample}.gff"),
+        "results/annotation/prokka/{sample}/{sample}.gff",
     conda:
         "../envs/prokka.yml"
     message:
@@ -106,7 +95,7 @@ rule annotate_prokka:
         extra=config["prokka"]["extra"],
     threads: workflow.cores * 0.25
     log:
-        os.path.join(OUTDIR, "annotation/prokka/logs/{sample}_prokka.log"),
+        "results/annotation/prokka/logs/{sample}_prokka.log",
     shell:
         """
         prokka \
@@ -124,7 +113,7 @@ rule annotate_prokka:
 
 rule get_bakta_db:
     output:
-        db=directory(os.path.join(OUTDIR, "annotation/bakta/database/db")),
+        db=directory("results/annotation/bakta/database/db"),
     conda:
         "../envs/bakta.yml"
     message:
@@ -133,7 +122,7 @@ rule get_bakta_db:
         db=config["bakta"]["db"],
     threads: workflow.cores * 0.25
     log:
-        os.path.join(OUTDIR, "annotation/bakta/database/db.log"),
+        "results/annotation/bakta/database/db.log",
     shell:
         """
         echo 'The most recent of the following available bakta DBs is downloaded:' > {log};
@@ -147,7 +136,7 @@ rule annotate_bakta:
         fasta=rules.get_fasta.output.fasta,
         db=rules.get_bakta_db.output.db,
     output:
-        os.path.join(OUTDIR, "annotation/bakta/{sample}/{sample}.gff"),
+        "results/annotation/bakta/{sample}/{sample}.gff",
     conda:
         "../envs/bakta.yml"
     message:
@@ -162,7 +151,7 @@ rule annotate_bakta:
         extra=config["bakta"]["extra"],
     threads: workflow.cores * 0.25
     log:
-        os.path.join(OUTDIR, "annotation/bakta/logs/{sample}_bakta.log"),
+        "results/annotation/bakta/logs/{sample}_bakta.log",
     shell:
         """
         bakta \

From 7d9f6cdbc1dc78b00f4d33344d245c70a375b7fe Mon Sep 17 00:00:00 2001
From: m-jahn <jahn@mpusp.mpg.de>
Date: Mon, 8 Dec 2025 10:47:54 +0100
Subject: [PATCH 06/18] feat: added quast, allow multiple annotation tools,
 allow local bakta db

---
 .test/config/config.yml          |  9 ++++++--
 .test/config/samples.csv         |  2 +-
 config/config.yml                |  9 ++++++--
 config/schemas/config.schema.yml | 32 +++++++++++++++++++++--------
 workflow/Snakefile               |  4 ++--
 workflow/envs/quast.yml          |  7 +++++++
 workflow/rules/annotate.smk      | 35 +++++++++++++++++++++++---------
 workflow/rules/common.smk        | 17 +++++++++++++++-
 workflow/rules/qc.smk            | 32 +++++++++++++++++++++++++++++
 9 files changed, 121 insertions(+), 26 deletions(-)
 create mode 100644 workflow/envs/quast.yml
 create mode 100644 workflow/rules/qc.smk

diff --git a/.test/config/config.yml b/.test/config/config.yml
index 595142c..fa5a8d5 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -1,5 +1,5 @@
 samplesheet: "config/samples.csv"
-tool: "prokka"
+tool: ["prokka"]
 
 pgap:
   bin: "path/to/pgap.py"
@@ -13,5 +13,10 @@ prokka:
   extra: "--addgenes"
 
 bakta:
-  db: "light"
+  download_db: "light"
+  existing_db: ""
   extra: "--keep-contig-headers --compliant"
+
+quast:
+  reference: ""
+  extra: ""
diff --git a/.test/config/samples.csv b/.test/config/samples.csv
index 4f368e3..5a46dd9 100644
--- a/.test/config/samples.csv
+++ b/.test/config/samples.csv
@@ -1,2 +1,2 @@
 sample,species,strain,id_prefix,file
-EC2224,"Streptococcus pyogenes",SF370,SPY,"data/assembly.fasta"
\ No newline at end of file
+EC2224,"Streptococcus pyogenes",SF370,SPY,"data/assembly.fasta"
diff --git a/config/config.yml b/config/config.yml
index 595142c..fa5a8d5 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -1,5 +1,5 @@
 samplesheet: "config/samples.csv"
-tool: "prokka"
+tool: ["prokka"]
 
 pgap:
   bin: "path/to/pgap.py"
@@ -13,5 +13,10 @@ prokka:
   extra: "--addgenes"
 
 bakta:
-  db: "light"
+  download_db: "light"
+  existing_db: ""
   extra: "--keep-contig-headers --compliant"
+
+quast:
+  reference: ""
+  extra: ""
diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml
index b913caa..ac37e43 100644
--- a/config/schemas/config.schema.yml
+++ b/config/schemas/config.schema.yml
@@ -7,12 +7,14 @@ properties:
     type: string
     description: Path to the sample sheet file
   tool:
-    type: string
+    type: array
     description: Annotation tool to use
-    enum:
-      - prokka
-      - pgap
-      - bakta
+    items:
+      type: string
+      enum:
+        - prokka
+        - pgap
+        - bakta
   pgap:
     type: object
     properties:
@@ -53,15 +55,28 @@ properties:
   bakta:
     type: object
     properties:
-      db:
+      download_db:
+        type: string
+        description: Bakta database type, one of 'full', 'light', or 'none' if existing is used
+      existing_db:
         type: string
-        description: Bakta database type, one of 'full', 'light'
+        description: Path to an existing Bakta database (optional)
       extra:
         type: string
         description: Extra command-line arguments for Bakta
     required:
-      - db
+      - download_db
+      - existing_db
       - extra
+  quast:
+    type: object
+    properties:
+      reference:
+        type: string
+        description: Path to the reference genome for QUAST
+      extra:
+        type: string
+        description: Extra command-line arguments for QUAST
 
 required:
   - samplesheet
@@ -69,3 +84,4 @@ required:
   - pgap
   - prokka
   - bakta
+  - quast
diff --git a/workflow/Snakefile b/workflow/Snakefile
index fb2a11d..9ae255e 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -24,6 +24,7 @@ configfile: "config/config.yml"
 # -----------------------------------------------------
 include: "rules/common.smk"
 include: "rules/annotate.smk"
+include: "rules/qc.smk"
 
 
 # -----------------------------------------------------
@@ -67,8 +68,7 @@ onerror:
 rule all:
     input:
         expand(
-            "results/annotation/{tool}/{sample}/{sample}.gff",
+            "results/qc/quast/{tool}/report.txt",
             tool=config["tool"],
-            sample=samples.index,
         ),
     default_target: True
diff --git a/workflow/envs/quast.yml b/workflow/envs/quast.yml
new file mode 100644
index 0000000..0fbc104
--- /dev/null
+++ b/workflow/envs/quast.yml
@@ -0,0 +1,7 @@
+name: base
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - quast=5.3.0
diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk
index 10407f8..c92f09a 100644
--- a/workflow/rules/annotate.smk
+++ b/workflow/rules/annotate.smk
@@ -80,7 +80,8 @@ rule annotate_prokka:
     input:
         fasta=rules.get_fasta.output.fasta,
     output:
-        "results/annotation/prokka/{sample}/{sample}.gff",
+        gff="results/annotation/prokka/{sample}/{sample}.gff",
+        fasta="results/annotation/prokka/{sample}/{sample}.fna",
     conda:
         "../envs/prokka.yml"
     message:
@@ -113,21 +114,35 @@ rule annotate_prokka:
 
 rule get_bakta_db:
     output:
-        db=directory("results/annotation/bakta/database/db"),
+        db=branch(
+            lookup(dpath="bakta/download_db", within=config),
+            cases={
+                "full": directory("results/annotation/bakta/database/db"),
+                "light": directory("results/annotation/bakta/database/db-light"),
+                "none": directory("results/annotation/bakta/database/custom"),
+            },
+        ),
     conda:
         "../envs/bakta.yml"
     message:
         """--- Getting BAKTA database for annotation ---"""
     params:
-        db=config["bakta"]["db"],
+        download_db=config["bakta"]["download_db"],
+        existing_db=config["bakta"]["existing_db"],
+        outdir=lambda wc, output: os.path.dirname(output[0]),
     threads: workflow.cores * 0.25
     log:
         "results/annotation/bakta/database/db.log",
     shell:
         """
-        echo 'The most recent of the following available bakta DBs is downloaded:' > {log};
-        bakta_db list > {log};
-        bakta_db download --output {output.db} --type {params.db} &> {log}
+        if [ {params.download_db} != 'none' ]; then
+          echo 'The most recent of the following available Bakta DBs is downloaded:' > {log};
+          bakta_db list >> {log};
+          bakta_db download --output {params.outdir} --type {params.download_db} &>> {log};
+        else
+          echo 'Using Bakta DB from supplied input dir: {params.existing_db}' > {log};
+          ln -s {params.existing_db} {output.db}
+        fi
         """
 
 
@@ -136,7 +151,8 @@ rule annotate_bakta:
         fasta=rules.get_fasta.output.fasta,
         db=rules.get_bakta_db.output.db,
     output:
-        "results/annotation/bakta/{sample}/{sample}.gff",
+        gff="results/annotation/bakta/{sample}/{sample}.gff",
+        fasta="results/annotation/bakta/{sample}/{sample}.fna",
     conda:
         "../envs/bakta.yml"
     message:
@@ -147,7 +163,6 @@ rule annotate_bakta:
         species=lambda wc: samples.loc[wc.sample]["species"],
         strain=lambda wc: samples.loc[wc.sample]["strain"],
         outdir=lambda wc, output: os.path.dirname(output[0]),
-        subdir="db" if config["bakta"]["db"] == "full" else "db-light",
         extra=config["bakta"]["extra"],
     threads: workflow.cores * 0.25
     log:
@@ -155,7 +170,7 @@ rule annotate_bakta:
     shell:
         """
         bakta \
-          --db {input.db}/{params.subdir} \
+          --db {input.db} \
           --prefix {params.prefix} \
           --output {params.outdir} \
           --locus-tag {params.locustag} \
@@ -164,5 +179,5 @@ rule annotate_bakta:
           --threads {threads} \
           --force {params.extra} \
           {input.fasta} &> {log};
-          mv {output}3 {output}
+          mv {output.gff}3 {output.gff}
         """
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 72684e1..9300f94 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -23,6 +23,21 @@ def get_fasta(wildcards):
     sample = wildcards.sample
     if sample not in samples.index:
         raise ValueError(f"Sample {sample} not found in samplesheet.")
-
     # return the fasta file path
     return samples.loc[sample, "file"]
+
+
+def get_quast_fasta(wildcards):
+    return expand(
+        "results/annotation/{tool}/{sample}/{sample}.fna",
+        tool=wildcards.tool,
+        sample=samples.index,
+    )
+
+
+def get_quast_gff(wildcards):
+    return expand(
+        "results/annotation/{tool}/{sample}/{sample}.gff",
+        tool=wildcards.tool,
+        sample=samples.index,
+    )
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
new file mode 100644
index 0000000..1403c1b
--- /dev/null
+++ b/workflow/rules/qc.smk
@@ -0,0 +1,32 @@
+rule quast:
+    input:
+        fasta=get_quast_fasta,
+        gff=get_quast_gff,
+    output:
+        report="results/qc/quast/{tool}/report.txt",
+    conda:
+        "../envs/quast.yml"
+    message:
+        """--- Running QUAST quality check for all assemblies ---"""
+    params:
+        outdir=lambda wc, output: os.path.dirname(output.report),
+        ref=(
+            " ".join(["-r", config["quast"]["reference"]])
+            if config["quast"]["reference"]
+            else []
+        ),
+        extra=config["quast"]["extra"],
+    threads: workflow.cores * 0.25
+    log:
+        "results/qc/quast/{tool}/quast.log",
+    shell:
+        """
+        quast \
+        --output-dir {params.outdir} \
+        --threads {threads} \
+        --features {input.gff} \
+        {params.ref} \
+        {params.extra} \
+        {input.fasta} \
+        > {log} 2>&1
+        """

From f4db893567c9375d62bb2bd6a419ab86eed4b0b6 Mon Sep 17 00:00:00 2001
From: jahn <jahn@mpusp.mpg.de>
Date: Mon, 8 Dec 2025 14:35:34 +0100
Subject: [PATCH 07/18] fix: use absolute input paths for pgap helpers

---
 workflow/rules/annotate.smk            |  3 ++-
 workflow/scripts/prepare_yaml_files.py | 25 ++++++++++++-------------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk
index c92f09a..602e607 100644
--- a/workflow/rules/annotate.smk
+++ b/workflow/rules/annotate.smk
@@ -42,7 +42,8 @@ rule annotate_pgap:
             otherwise=rules.get_fasta.output.fasta,
         ),
     output:
-        "results/annotation/pgap/{sample}/{sample}.gff",
+        gff="results/annotation/pgap/{sample}/{sample}.gff",
+        fasta="results/annotation/pgap/{sample}/{sample}.fna",
     conda:
         "../envs/base.yml"
     message:
diff --git a/workflow/scripts/prepare_yaml_files.py b/workflow/scripts/prepare_yaml_files.py
index 22d4f0a..c8846de 100644
--- a/workflow/scripts/prepare_yaml_files.py
+++ b/workflow/scripts/prepare_yaml_files.py
@@ -21,7 +21,7 @@
 
 
 # define helper functions
-def read_yaml_to_dict(filepath):
+def read_yaml_to_dict(filepath: str) -> None:
     try:
         with open(filepath, "r") as f:
             return yaml.safe_load(f)
@@ -35,32 +35,31 @@ def read_yaml_to_dict(filepath):
     return None
 
 
-def write_dict_to_yaml(dic, filepath):
+def write_dict_to_yaml(outdict: dict, filepath: str) -> None:
     try:
         with open(filepath, "w") as f:
-            yaml.safe_dump(dic, f, default_flow_style=False)
+            yaml.safe_dump(outdict, f, default_flow_style=False)
         sys.stderr.write(f"Wrote YAML file: {filepath}\n")
     except Exception as e:
         sys.stderr.write(f"Error writing YAML file {filepath}: {e}\n")
 
 
-generic_dic = read_yaml_to_dict(generic_template)
-submol_dic = read_yaml_to_dict(submol_template)
+generic_dict = read_yaml_to_dict(generic_template)
+submol_dict = read_yaml_to_dict(submol_template)
 
-generic_dic["fasta"]["location"] = os.path.basename(fasta)
-generic_dic["submol"]["location"] = submol_yaml
+generic_dict["fasta"]["location"] = os.path.abspath(fasta)
+generic_dict["submol"]["location"] = os.path.abspath(submol_yaml)
 
 if not organism:
     organism = samples.loc[sample]["species"]
 
-submol_dic["organism"]["genus_species"] = organism
-
-submol_dic["organism"]["strain"] = samples.loc[sample]["strain"]
+submol_dict["organism"]["genus_species"] = organism
+submol_dict["organism"]["strain"] = samples.loc[sample]["strain"]
 
 if not locus_tag:
     locus_tag = "Spy"
-submol_dic["locus_tag_prefix"] = locus_tag
+submol_dict["locus_tag_prefix"] = locus_tag
 
-write_dict_to_yaml(generic_dic, input_yaml)
-write_dict_to_yaml(submol_dic, submol_yaml)
+write_dict_to_yaml(generic_dict, input_yaml)
+write_dict_to_yaml(submol_dict, submol_yaml)
 sys.stderr.write(f"Module 'prepare yaml files' finished successfully!")

From 84c07a67fb10053b1a435afe2923652a75a50101 Mon Sep 17 00:00:00 2001
From: m-jahn <jahn@mpusp.mpg.de>
Date: Tue, 9 Dec 2025 11:51:25 +0100
Subject: [PATCH 08/18] fix: use correct input for quast reference, closes #7

---
 .test/config/config.yml          |  3 ++-
 config/config.yml                |  3 ++-
 config/schemas/config.schema.yml |  5 ++++-
 workflow/rules/qc.smk            | 16 ++++++++++------
 4 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/.test/config/config.yml b/.test/config/config.yml
index fa5a8d5..6650ef0 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -18,5 +18,6 @@ bakta:
   extra: "--keep-contig-headers --compliant"
 
 quast:
-  reference: ""
+  reference_fasta: ""
+  reference_gff: ""
   extra: ""
diff --git a/config/config.yml b/config/config.yml
index fa5a8d5..6650ef0 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -18,5 +18,6 @@ bakta:
   extra: "--keep-contig-headers --compliant"
 
 quast:
-  reference: ""
+  reference_fasta: ""
+  reference_gff: ""
   extra: ""
diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml
index ac37e43..b461fc1 100644
--- a/config/schemas/config.schema.yml
+++ b/config/schemas/config.schema.yml
@@ -71,9 +71,12 @@ properties:
   quast:
     type: object
     properties:
-      reference:
+      reference_fasta:
         type: string
         description: Path to the reference genome for QUAST
+      reference_gff:
+        type: string
+        description: Path to the reference annotation for QUAST
       extra:
         type: string
         description: Extra command-line arguments for QUAST
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index 1403c1b..1e8a9ce 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -1,7 +1,6 @@
 rule quast:
     input:
         fasta=get_quast_fasta,
-        gff=get_quast_gff,
     output:
         report="results/qc/quast/{tool}/report.txt",
     conda:
@@ -10,9 +9,14 @@ rule quast:
         """--- Running QUAST quality check for all assemblies ---"""
     params:
         outdir=lambda wc, output: os.path.dirname(output.report),
-        ref=(
-            " ".join(["-r", config["quast"]["reference"]])
-            if config["quast"]["reference"]
+        ref_fasta=(
+            " ".join(["-r", config["quast"]["reference_fasta"]])
+            if config["quast"]["reference_fasta"]
+            else []
+        ),
+        ref_gff=(
+            " ".join(["-g", config["quast"]["reference_gff"]])
+            if config["quast"]["reference_gff"]
             else []
         ),
         extra=config["quast"]["extra"],
@@ -24,8 +28,8 @@ rule quast:
         quast \
         --output-dir {params.outdir} \
         --threads {threads} \
-        --features {input.gff} \
-        {params.ref} \
+        {params.ref_fasta} \
+        {params.ref_gff} \
         {params.extra} \
         {input.fasta} \
         > {log} 2>&1

From db385f0fce62cad5b188d865b8f84cab1e13b270 Mon Sep 17 00:00:00 2001
From: Rina Ahmed-Begrich <begrich@mpusp.mpg.de>
Date: Tue, 9 Dec 2025 13:10:26 +0100
Subject: [PATCH 09/18] fix: update armdfinder db when local db is used. update
 bakta version.

---
 workflow/envs/bakta.yml     | 2 +-
 workflow/rules/annotate.smk | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/workflow/envs/bakta.yml b/workflow/envs/bakta.yml
index 1c833f7..526650b 100644
--- a/workflow/envs/bakta.yml
+++ b/workflow/envs/bakta.yml
@@ -4,4 +4,4 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - bakta=1.9.4
+  - bakta=1.11.4
diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk
index 602e607..9e3b4d3 100644
--- a/workflow/rules/annotate.smk
+++ b/workflow/rules/annotate.smk
@@ -142,7 +142,9 @@ rule get_bakta_db:
           bakta_db download --output {params.outdir} --type {params.download_db} &>> {log};
         else
           echo 'Using Bakta DB from supplied input dir: {params.existing_db}' > {log};
-          ln -s {params.existing_db} {output.db}
+          ln -s {params.existing_db} {output.db};
+          echo 'Update ARMFinderPlus DB using supplied input dir: {params.existing_db}' > {log};
+          amrfinder_update --force_update --database {params.existing_db}/amrfinderplus-db
         fi
         """
 

From a85d20bf22201ee00cffb5ae22047207fc07ad80 Mon Sep 17 00:00:00 2001
From: Rina Ahmed-Begrich <begrich@mpusp.mpg.de>
Date: Tue, 9 Dec 2025 13:14:00 +0100
Subject: [PATCH 10/18] fix: pipe armfinder_update to log.

---
 workflow/rules/annotate.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk
index 9e3b4d3..5a8a372 100644
--- a/workflow/rules/annotate.smk
+++ b/workflow/rules/annotate.smk
@@ -144,7 +144,7 @@ rule get_bakta_db:
           echo 'Using Bakta DB from supplied input dir: {params.existing_db}' > {log};
           ln -s {params.existing_db} {output.db};
           echo 'Update ARMFinderPlus DB using supplied input dir: {params.existing_db}' > {log};
-          amrfinder_update --force_update --database {params.existing_db}/amrfinderplus-db
+          amrfinder_update --force_update --database {params.existing_db}/amrfinderplus-db >> {log}
         fi
         """
 

From 938ac49de7be1258b910dc8167138fa17f8b39e3 Mon Sep 17 00:00:00 2001
From: jahn <jahn@mpusp.mpg.de>
Date: Tue, 9 Dec 2025 13:32:14 +0100
Subject: [PATCH 11/18] fix: correct env names, use relative path for pgap
 fasta

---
 workflow/envs/bakta.yml                | 2 +-
 workflow/envs/base.yml                 | 6 +++---
 workflow/envs/prokka.yml               | 2 +-
 workflow/envs/quast.yml                | 2 +-
 workflow/scripts/prepare_yaml_files.py | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/workflow/envs/bakta.yml b/workflow/envs/bakta.yml
index 526650b..bd2d4d2 100644
--- a/workflow/envs/bakta.yml
+++ b/workflow/envs/bakta.yml
@@ -1,4 +1,4 @@
-name: base
+name: bakta
 channels:
   - conda-forge
   - bioconda
diff --git a/workflow/envs/base.yml b/workflow/envs/base.yml
index fb1867a..fcd1565 100644
--- a/workflow/envs/base.yml
+++ b/workflow/envs/base.yml
@@ -4,6 +4,6 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - python==3.13.0
-  - pandas==2.3.3
-  - apptainer==1.4.5
+  - python=3.13.0
+  - pandas=2.3.3
+  - apptainer=1.4.5
diff --git a/workflow/envs/prokka.yml b/workflow/envs/prokka.yml
index 2859a3a..643ba6b 100644
--- a/workflow/envs/prokka.yml
+++ b/workflow/envs/prokka.yml
@@ -1,4 +1,4 @@
-name: base
+name: prokka
 channels:
   - conda-forge
   - bioconda
diff --git a/workflow/envs/quast.yml b/workflow/envs/quast.yml
index 0fbc104..9d5f823 100644
--- a/workflow/envs/quast.yml
+++ b/workflow/envs/quast.yml
@@ -1,4 +1,4 @@
-name: base
+name: quast
 channels:
   - conda-forge
   - bioconda
diff --git a/workflow/scripts/prepare_yaml_files.py b/workflow/scripts/prepare_yaml_files.py
index c8846de..5f1914f 100644
--- a/workflow/scripts/prepare_yaml_files.py
+++ b/workflow/scripts/prepare_yaml_files.py
@@ -47,7 +47,7 @@ def write_dict_to_yaml(outdict: dict, filepath: str) -> None:
 generic_dict = read_yaml_to_dict(generic_template)
 submol_dict = read_yaml_to_dict(submol_template)
 
-generic_dict["fasta"]["location"] = os.path.abspath(fasta)
+generic_dict["fasta"]["location"] = os.path.basename(fasta)
 generic_dict["submol"]["location"] = os.path.abspath(submol_yaml)
 
 if not organism:

From 20e70b24062e621950812144af5600d21a0f7efe Mon Sep 17 00:00:00 2001
From: jahn <jahn@mpusp.mpg.de>
Date: Wed, 10 Dec 2025 10:24:42 +0100
Subject: [PATCH 12/18] feat: added panaroo as a genome comparison tool, closes
 #6

---
 .test/config/config.yml          |  5 +++
 README.md                        | 11 ++++++
 config/config.yml                |  5 +++
 config/schemas/config.schema.yml | 12 +++++++
 workflow/Snakefile               |  5 +--
 workflow/envs/panaroo.yml        |  9 +++++
 workflow/rules/annotate.smk      |  6 ++--
 workflow/rules/common.smk        | 27 ++++++++++++--
 workflow/rules/qc.smk            | 60 +++++++++++++++++++++++++++++++-
 9 files changed, 129 insertions(+), 11 deletions(-)
 create mode 100644 workflow/envs/panaroo.yml

diff --git a/.test/config/config.yml b/.test/config/config.yml
index 6650ef0..b203c5d 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -21,3 +21,8 @@ quast:
   reference_fasta: ""
   reference_gff: ""
   extra: ""
+
+panaroo:
+  remove_source: "cmsearch"
+  remove_feature: "tRNA|rRNA|ncRNA|exon|sequence_feature"
+  extra: "--clean-mode strict --remove-invalid-genes"
diff --git a/README.md b/README.md
index dbf4310..0d30436 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@
 [![GitHub actions status](https://github.com/MPUSP/snakemake-assembly-postprocessing/actions/workflows/main.yml/badge.svg)](https://github.com/MPUSP/snakemake-assembly-postprocessing/actions/workflows/main.yml)
 [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
 [![run with apptainer](https://img.shields.io/badge/run%20with-apptainer-1D355C.svg?labelColor=000000)](https://apptainer.org/)
+[![workflow catalog](https://img.shields.io/badge/Snakemake%20workflow%20catalog-darkgreen)](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/MPUSP/snakemake-assembly-postprocessing)
 
 A Snakemake workflow for the post-processing of microbial genome assemblies.
 
@@ -20,6 +21,8 @@ If you use this workflow in a paper, don't forget to give credits to the authors
    1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)). Note: needs to be installed manually
    2. [prokka](https://github.com/tseemann/prokka), a fast and light-weight prokaryotic annotation tool
    3. [bakta](https://github.com/oschwengers/bakta), a fast, alignment-free annotation tool. Note: Bakta will automatically download its companion database from zenodo (light: 1.5 GB, full: 40 GB)
+3. Create a QC report for the assemblies using [Quast](https://github.com/ablab/quast)
+4. Create a pangenome analysis (orthologs/homologs) using [Panaroo](https://gthlab.au/panaroo/)
 
 ## Requirements
 
@@ -64,6 +67,14 @@ conda activate snakemake-assembly-postprocessing
 
 ## References
 
+> Seemann T. _Prokka: rapid prokaryotic genome annotation_. Bioinformatics. **2014** Jul 15;30(14):2068-9. PMID: 24642063. https://doi.org/10.1093/bioinformatics/btu153.
+
+> Schwengers O, Jelonek L, Dieckmann MA, Beyvers S, Blom J, Goesmann A. _Bakta: rapid and standardized annotation of bacterial genomes via alignment-free sequence identification_. Microb Genom, 7(11):000685 **2021**. PMID: 34739369. https://doi.org/10.1099/mgen.0.000685.
+
 > Li W, O'Neill KR, Haft DH, DiCuccio M, Chetvernin V, Badretdin A, Coulouris G, Chitsaz F, Derbyshire MK, Durkin AS, Gonzales NR, Gwadz M, Lanczycki CJ, Song JS, Thanki N, Wang J, Yamashita RA, Yang M, Zheng C, Marchler-Bauer A, Thibaud-Nissen F. _RefSeq: Expanding the Prokaryotic Genome Annotation Pipeline reach with protein family model curation._ Nucleic Acids Res, **2021** Jan 8;49(D1):D1020-D1028. https://doi.org/10.1093/nar/gkaa1105
 
+> Gurevich A, Saveliev V, Vyahhi N, Tesler G. _QUAST: quality assessment tool for genome assemblies_. Bioinformatics. 29(8):1072-5, **2013**. PMID: 23422339. https://doi.org/10.1093/bioinformatics/btt086.
+
+> Tonkin-Hill G, MacAlasdair N, Ruis C, Weimann A, Horesh G, Lees JA, Gladstone RA, Lo S, Beaudoin C, Floto RA, Frost SDW, Corander J, Bentley SD, Parkhill J. _Producing polished prokaryotic pangenomes with the Panaroo pipeline_. Genome Biol. 21(1):180, **2020**. PMID: 32698896. https://doi.org/10.1186/s13059-020-02090-4.
+
 > Köster, J., Mölder, F., Jablonski, K. P., Letcher, B., Hall, M. B., Tomkins-Tinch, C. H., Sochat, V., Forster, J., Lee, S., Twardziok, S. O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., & Nahnsen, S. _Sustainable data analysis with Snakemake_. F1000Research, 10:33, 10, 33, **2021**. https://doi.org/10.12688/f1000research.29032.2.
diff --git a/config/config.yml b/config/config.yml
index 6650ef0..b203c5d 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -21,3 +21,8 @@ quast:
   reference_fasta: ""
   reference_gff: ""
   extra: ""
+
+panaroo:
+  remove_source: "cmsearch"
+  remove_feature: "tRNA|rRNA|ncRNA|exon|sequence_feature"
+  extra: "--clean-mode strict --remove-invalid-genes"
diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml
index b461fc1..d52ee03 100644
--- a/config/schemas/config.schema.yml
+++ b/config/schemas/config.schema.yml
@@ -80,6 +80,18 @@ properties:
       extra:
         type: string
         description: Extra command-line arguments for QUAST
+  panaroo:
+    type: object
+    properties:
+      remove_source:
+        type: string
+        description: Source types to remove in Panaroo (regex supported)
+      remove_feature:
+        type: string
+        description: Feature types to remove in Panaroo (regex supported)
+      extra:
+        type: string
+        description: Extra command-line arguments for Panaroo
 
 required:
   - samplesheet
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 9ae255e..aafcb6a 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -67,8 +67,5 @@ onerror:
 # -----------------------------------------------------
 rule all:
     input:
-        expand(
-            "results/qc/quast/{tool}/report.txt",
-            tool=config["tool"],
-        ),
+        get_final_input,
     default_target: True
diff --git a/workflow/envs/panaroo.yml b/workflow/envs/panaroo.yml
new file mode 100644
index 0000000..43ee14a
--- /dev/null
+++ b/workflow/envs/panaroo.yml
@@ -0,0 +1,9 @@
+name: panaroo
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - numpy=1.26.4
+  - scipy=1.11.4
+  - panaroo=1.5.2
diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk
index 5a8a372..9fc66b9 100644
--- a/workflow/rules/annotate.smk
+++ b/workflow/rules/annotate.smk
@@ -138,13 +138,13 @@ rule get_bakta_db:
         """
         if [ {params.download_db} != 'none' ]; then
           echo 'The most recent of the following available Bakta DBs is downloaded:' > {log};
-          bakta_db list >> {log};
+          bakta_db list &>> {log};
           bakta_db download --output {params.outdir} --type {params.download_db} &>> {log};
         else
           echo 'Using Bakta DB from supplied input dir: {params.existing_db}' > {log};
           ln -s {params.existing_db} {output.db};
-          echo 'Update ARMFinderPlus DB using supplied input dir: {params.existing_db}' > {log};
-          amrfinder_update --force_update --database {params.existing_db}/amrfinderplus-db >> {log}
+          echo 'Update ARMFinderPlus DB using supplied input dir: {params.existing_db}' >> {log};
+          amrfinder_update --force_update --database {params.existing_db}/amrfinderplus-db &>> {log}
         fi
         """
 
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 9300f94..64bed83 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -23,7 +23,6 @@ def get_fasta(wildcards):
     sample = wildcards.sample
     if sample not in samples.index:
         raise ValueError(f"Sample {sample} not found in samplesheet.")
-    # return the fasta file path
     return samples.loc[sample, "file"]
 
 
@@ -35,9 +34,31 @@ def get_quast_fasta(wildcards):
     )
 
 
-def get_quast_gff(wildcards):
+def get_panaroo_gff(wildcards):
     return expand(
-        "results/annotation/{tool}/{sample}/{sample}.gff",
+        "results/qc/panaroo/{tool}/prepare/{sample}.gff",
         tool=wildcards.tool,
         sample=samples.index,
     )
+
+
+def get_panaroo_fasta(wildcards):
+    return expand(
+        "results/qc/panaroo/{tool}/prepare/{sample}.fna",
+        tool=wildcards.tool,
+        sample=samples.index,
+    )
+
+
+def get_final_input(wildcards):
+    inputs = []
+    inputs += expand(
+        "results/qc/quast/{tool}/report.txt",
+        tool=config["tool"],
+    )
+    if len(samples.index) > 1:
+        inputs += expand(
+            "results/qc/panaroo/{tool}/summary_statistics.txt",
+            tool=config["tool"],
+        )
+    return inputs
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index 1e8a9ce..6135aa1 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -20,7 +20,7 @@ rule quast:
             else []
         ),
         extra=config["quast"]["extra"],
-    threads: workflow.cores * 0.25
+    threads: 4
     log:
         "results/qc/quast/{tool}/quast.log",
     shell:
@@ -34,3 +34,61 @@ rule quast:
         {input.fasta} \
         > {log} 2>&1
         """
+
+
+rule prepare_panaroo:
+    input:
+        fasta="results/annotation/{tool}/{sample}/{sample}.fna",
+        gff="results/annotation/{tool}/{sample}/{sample}.gff",
+    output:
+        fasta="results/qc/panaroo/{tool}/prepare/{sample}.fna",
+        gff="results/qc/panaroo/{tool}/prepare/{sample}.gff",
+    conda:
+        "../envs/panaroo.yml"
+    message:
+        """--- Prepare input files for pan-genome alignment ---"""
+    params:
+        remove_source=config["panaroo"]["remove_source"],
+        remove_feature=config["panaroo"]["remove_feature"],
+    log:
+        "results/qc/panaroo/{tool}/prepare/{sample}.log",
+    shell:
+        """
+        echo 'Preparing annotation for Panaroo:' > {log};
+        echo '  - formatting seqnames in FASTA files' >> {log};
+        awk '{{ sub(/>.*\\|/, ">"); sub(/[[:space:]].*$/, ""); print }}' \
+          {input.fasta} > {output.fasta} 2>> {log};
+        echo '  - removing sequences and selected features in GFF files' >> {log};
+        awk ' /^##FASTA/ {{exit}} $2 !~ /{params.remove_source}/ && $3 !~ /{params.remove_feature}/ {{print}}' \
+          {input.gff} > {output.gff} 2>> {log}
+        """
+
+
+rule panaroo:
+    input:
+        gff=get_panaroo_gff,
+        fasta=get_panaroo_fasta,
+    output:
+        stats="results/qc/panaroo/{tool}/summary_statistics.txt",
+    conda:
+        "../envs/panaroo.yml"
+    message:
+        """--- Running PANAROO to create pangenome from all annotations ---"""
+    params:
+        outdir=lambda wc, output: os.path.dirname(output.stats),
+        extra=config["panaroo"]["extra"],
+    threads: 4
+    log:
+        "results/qc/panaroo/{tool}/panaroo.log",
+    shell:
+        """
+        printf '%s\n' {input.gff} | \
+          paste -d ' ' - <(printf '%s\n' {input.fasta}) \
+          > {params.outdir}/input_files.txt;
+        panaroo \
+          -i {params.outdir}/input_files.txt \
+          -o {params.outdir} \
+          -t {threads} \
+          {params.extra} \
+          > {log} 2>&1
+        """

From 6048f72a5c0ad16ca09f365aeb37bc2175c398f4 Mon Sep 17 00:00:00 2001
From: Rina Ahmed-Begrich <begrich@mpusp.mpg.de>
Date: Wed, 10 Dec 2025 10:55:50 +0100
Subject: [PATCH 13/18] fix: validate locus tag for bakta annotation; closes #8

---
 workflow/rules/annotate.smk |  2 +-
 workflow/rules/common.smk   | 26 ++++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk
index 9fc66b9..4b9aab0 100644
--- a/workflow/rules/annotate.smk
+++ b/workflow/rules/annotate.smk
@@ -162,7 +162,7 @@ rule annotate_bakta:
         """--- Running BAKTA annotation for sample {wildcards.sample} ---"""
     params:
         prefix=lambda wc: wc.sample,
-        locustag=lambda wc: samples.loc[wc.sample]["id_prefix"],
+        locustag=lambda wc: format_bakta_locustag(samples.loc[wc.sample]["id_prefix"]),
         species=lambda wc: samples.loc[wc.sample]["species"],
         strain=lambda wc: samples.loc[wc.sample]["strain"],
         outdir=lambda wc, output: os.path.dirname(output[0]),
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 64bed83..266fbdc 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,5 +1,7 @@
 # import basic packages
 import pandas as pd
+import re
+from snakemake import logging
 from snakemake.utils import validate
 
 
@@ -62,3 +64,27 @@ def get_final_input(wildcards):
             tool=config["tool"],
         )
     return inputs
+
+
+# -----------------------------------------------------
+# helper functions
+# -----------------------------------------------------
+def format_bakta_locustag(raw):
+    """Format locustag for BAKTA annotation."""
+    tag = str(raw)
+    # uppercase for BAKTA
+    tag_up = tag.upper()
+    # keep only A-Z0-9
+    cleaned = re.sub(r"[^A-Z0-9]", "", tag_up)
+    if len(cleaned) < 3 or len(cleaned) > 12:
+        raise ValueError(
+            f"locustag '{raw}' -> '{cleaned}' must contain between 3-12 alphanumeric uppercase characters\n"
+        )
+    if not re.match(r"^[A-Z]", cleaned):
+        raise ValueError(f"locustag '{raw}' -> '{cleaned}' must start with a letter")
+    # warn if cleaned tag is different from original
+    if cleaned != tag:
+        logger.warning(
+            f"\nlocustag '{raw}' converted to '{cleaned}' to meet BAKTA requirements (between 3 and 12 alphanumeric uppercase characters, start with a letter)\n"
+        )
+    return cleaned

From 0e829f6ca1746221565b90648ed712e5d23018e0 Mon Sep 17 00:00:00 2001
From: jahn <jahn@mpusp.mpg.de>
Date: Wed, 10 Dec 2025 11:36:42 +0100
Subject: [PATCH 14/18] fix: added option to skip pangenome analysis

---
 .test/config/config.yml          | 1 +
 config/config.yml                | 1 +
 config/schemas/config.schema.yml | 3 +++
 workflow/rules/common.smk        | 2 +-
 4 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.test/config/config.yml b/.test/config/config.yml
index b203c5d..773cadc 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -23,6 +23,7 @@ quast:
   extra: ""
 
 panaroo:
+  skip: False
   remove_source: "cmsearch"
   remove_feature: "tRNA|rRNA|ncRNA|exon|sequence_feature"
   extra: "--clean-mode strict --remove-invalid-genes"
diff --git a/config/config.yml b/config/config.yml
index b203c5d..773cadc 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -23,6 +23,7 @@ quast:
   extra: ""
 
 panaroo:
+  skip: False
   remove_source: "cmsearch"
   remove_feature: "tRNA|rRNA|ncRNA|exon|sequence_feature"
   extra: "--clean-mode strict --remove-invalid-genes"
diff --git a/config/schemas/config.schema.yml b/config/schemas/config.schema.yml
index d52ee03..33215e4 100644
--- a/config/schemas/config.schema.yml
+++ b/config/schemas/config.schema.yml
@@ -83,6 +83,9 @@ properties:
   panaroo:
     type: object
     properties:
+      skip:
+        type: boolean
+        description: Whether to skip Panaroo analysis
       remove_source:
         type: string
         description: Source types to remove in Panaroo (regex supported)
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 266fbdc..47364a7 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -58,7 +58,7 @@ def get_final_input(wildcards):
         "results/qc/quast/{tool}/report.txt",
         tool=config["tool"],
     )
-    if len(samples.index) > 1:
+    if len(samples.index) > 1 and not config["panaroo"]["skip"]:
         inputs += expand(
             "results/qc/panaroo/{tool}/summary_statistics.txt",
             tool=config["tool"],

From a4a147aa1fe1c18994d46bfd279e9b32afd7633e Mon Sep 17 00:00:00 2001
From: m-jahn <jahn@mpusp.mpg.de>
Date: Wed, 10 Dec 2025 12:43:20 +0100
Subject: [PATCH 15/18] fix: update readmes

---
 README.md        | 47 +++++++++++++++++++++++++++++++++++++----
 config/README.md | 54 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 79 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 0d30436..1af2ba6 100644
--- a/README.md
+++ b/README.md
@@ -8,12 +8,27 @@
 
 A Snakemake workflow for the post-processing of microbial genome assemblies.
 
+- [snakemake-assembly-postprocessing](#snakemake-assembly-postprocessing)
+  - [Usage](#usage)
+  - [Workflow overview](#workflow-overview)
+  - [Installation](#installation)
+  - [Deployment options](#deployment-options)
+  - [Authors](#authors)
+  - [References](#references)
+
 ## Usage
 
 The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/docs/workflows/MPUSP/snakemake-assembly-postprocessing).
 
+Detailed information about input data and workflow configuration can also be found in the [`config/README.md`](config/README.md).
+
 If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this repository.
 
+_Workflow overview:_
+
+<!-- include overview-->
+<img src="resources/images/dag.svg" align="center" />
+
 ## Workflow overview
 
 1. Parse `samples.csv` table containing the samples's meta data (`python`)
@@ -24,10 +39,6 @@ If you use this workflow in a paper, don't forget to give credits to the authors
 3. Create a QC report for the assemblies using [Quast](https://github.com/ablab/quast)
 4. Create a pangenome analysis (orthologs/homologs) using [Panaroo](https://gthlab.au/panaroo/)
 
-## Requirements
-
-- [PGAP](https://github.com/ncbi/pgap)
-
 ## Installation
 
 **Step 1: Clone this repository**
@@ -52,9 +63,37 @@ conda activate snakemake-assembly-postprocessing
 
 **Step 4: Install PGAP**
 
+- if you want to use [PGAP](https://github.com/ncbi/pgap) for annotation, it needs to be installed separately
 - PGAP can be downloaded from https://github.com/ncbi/pgap. Please follow the installation instructions there.
 - Define the path to the `pgap.py` script (located in the `scripts` folder) in the `config` file (recommended: `./resources`)
 
+## Deployment options
+
+To run the workflow from command line, change the working directory.
+
+```bash
+cd path/to/snakemake-simple-mapping
+```
+
+Adjust options in the default config file `config/config.yml`.
+Before running the complete workflow, you can perform a dry run using:
+
+```bash
+snakemake --dry-run
+```
+
+To run the workflow with test files using **conda**:
+
+```bash
+snakemake --cores 2 --sdm conda --directory .test
+```
+
+To run the workflow with test files using **apptainer**:
+
+```bash
+snakemake --cores 2 --sdm conda apptainer --directory .test
+```
+
 ## Authors
 
 - Dr. Rina Ahmed-Begrich
diff --git a/config/README.md b/config/README.md
index 17692b8..05ae5ee 100644
--- a/config/README.md
+++ b/config/README.md
@@ -1,32 +1,50 @@
-## Running the workflow
-
-### Input data
+## Workflow overview
 
-This workflow requires `fasta` input data.
-The samplesheet table has the following layout:
+A Snakemake workflow for the post-processing of microbial genome assemblies.
 
-| sample | species                  | strain | id_prefix | file           |
-| ------ | ------------------------ | ------ | --------- | -------------- |
-| EC2224 | "Streptococcus pyogenes" | SF370  | Spy       | assembly.fasta |
+1. Parse `samples.csv` table containing the samples's meta data (`python`)
+2. Annotate assemblies using one of the following tools:
+   1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)). Note: needs to be installed manually
+   2. [prokka](https://github.com/tseemann/prokka), a fast and light-weight prokaryotic annotation tool
+   3. [bakta](https://github.com/oschwengers/bakta), a fast, alignment-free annotation tool. Note: Bakta will automatically download its companion database from zenodo (light: 1.5 GB, full: 40 GB)
+3. Create a QC report for the assemblies using [Quast](https://github.com/ablab/quast)
+4. Create a pangenome analysis (orthologs/homologs) using [Panaroo](https://gthlab.au/panaroo/)
 
-### Execution
+## Installation
 
-To run the workflow from command line, change to the working directory and activate the conda environment.
+**Step 1: Clone this repository**
 
 ```bash
+git clone https://github.com/MPUSP/snakemake-assembly-postprocessing.git
 cd snakemake-assembly-postprocessing
-conda activate snakemake-assembly-postprocessing
 ```
 
-Adjust options in the default config file `config/config.yml`.
-Before running the entire workflow, perform a dry run using:
+**Step 2: Install dependencies**
 
-```bash
-snakemake --cores 1 --sdm conda --directory .test --dry-run
-```
+It is recommended to install snakemake and run the workflow with `conda` or `mamba`. [Miniforge](https://conda-forge.org/download/) is the preferred conda-forge installer and includes `conda`, `mamba` and their dependencies.
+
+**Step 3: Create snakemake environment**
 
-To run the workflow with test files using **conda**:
+This step creates a new conda environment called `snakemake-assembly-postprocessing`.
 
 ```bash
-snakemake --cores 1 --sdm conda --directory .test
+mamba create -c conda-forge -c bioconda -n snakemake-assembly-postprocessing snakemake pandas
+conda activate snakemake-assembly-postprocessing
 ```
+
+**Step 4: Install PGAP**
+
+- if you want to use [PGAP](https://github.com/ncbi/pgap) for annotation, it needs to be installed separately
+- PGAP can be downloaded from https://github.com/ncbi/pgap. Please follow the installation instructions there.
+- Define the path to the `pgap.py` script (located in the `scripts` folder) in the `config` file (recommended: `./resources`)
+
+## Running the workflow
+
+### Input data
+
+This workflow requires `fasta` input data.
+The samplesheet table has the following layout:
+
+| sample | species                  | strain | id_prefix | file           |
+| ------ | ------------------------ | ------ | --------- | -------------- |
+| EC2224 | "Streptococcus pyogenes" | SF370  | SPY       | assembly.fasta |

From b1d0d34f43934bcc91db506a95c1358db282c36b Mon Sep 17 00:00:00 2001
From: Rina Ahmed-Begrich <begrich@mpusp.mpg.de>
Date: Wed, 10 Dec 2025 14:54:43 +0100
Subject: [PATCH 16/18] fix: added missing dag.

---
 README.md                |   5 +-
 config/README.md         |  30 +-------
 resources/images/dag.svg | 157 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 161 insertions(+), 31 deletions(-)
 create mode 100644 resources/images/dag.svg

diff --git a/README.md b/README.md
index 1af2ba6..1368e45 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,6 @@ If you use this workflow in a paper, don't forget to give credits to the authors
 
 _Workflow overview:_
 
-<!-- include overview-->
 <img src="resources/images/dag.svg" align="center" />
 
 ## Workflow overview
@@ -72,14 +71,14 @@ conda activate snakemake-assembly-postprocessing
 To run the workflow from command line, change the working directory.
 
 ```bash
-cd path/to/snakemake-simple-mapping
+cd snakemake-assembly-postprocessing
 ```
 
 Adjust options in the default config file `config/config.yml`.
 Before running the complete workflow, you can perform a dry run using:
 
 ```bash
-snakemake --dry-run
+snakemake --cores 1 --dry-run
 ```
 
 To run the workflow with test files using **conda**:
diff --git a/config/README.md b/config/README.md
index 05ae5ee..a5c8d18 100644
--- a/config/README.md
+++ b/config/README.md
@@ -10,34 +10,6 @@ A Snakemake workflow for the post-processing of microbial genome assemblies.
 3. Create a QC report for the assemblies using [Quast](https://github.com/ablab/quast)
 4. Create a pangenome analysis (orthologs/homologs) using [Panaroo](https://gthlab.au/panaroo/)
 
-## Installation
-
-**Step 1: Clone this repository**
-
-```bash
-git clone https://github.com/MPUSP/snakemake-assembly-postprocessing.git
-cd snakemake-assembly-postprocessing
-```
-
-**Step 2: Install dependencies**
-
-It is recommended to install snakemake and run the workflow with `conda` or `mamba`. [Miniforge](https://conda-forge.org/download/) is the preferred conda-forge installer and includes `conda`, `mamba` and their dependencies.
-
-**Step 3: Create snakemake environment**
-
-This step creates a new conda environment called `snakemake-assembly-postprocessing`.
-
-```bash
-mamba create -c conda-forge -c bioconda -n snakemake-assembly-postprocessing snakemake pandas
-conda activate snakemake-assembly-postprocessing
-```
-
-**Step 4: Install PGAP**
-
-- if you want to use [PGAP](https://github.com/ncbi/pgap) for annotation, it needs to be installed separately
-- PGAP can be downloaded from https://github.com/ncbi/pgap. Please follow the installation instructions there.
-- Define the path to the `pgap.py` script (located in the `scripts` folder) in the `config` file (recommended: `./resources`)
-
 ## Running the workflow
 
 ### Input data
@@ -48,3 +20,5 @@ The samplesheet table has the following layout:
 | sample | species                  | strain | id_prefix | file           |
 | ------ | ------------------------ | ------ | --------- | -------------- |
 | EC2224 | "Streptococcus pyogenes" | SF370  | SPY       | assembly.fasta |
+
+**Note:** Pangenome analysis with `Panaroo` requires at least two samples.
\ No newline at end of file
diff --git a/resources/images/dag.svg b/resources/images/dag.svg
new file mode 100644
index 0000000..344f877
--- /dev/null
+++ b/resources/images/dag.svg
@@ -0,0 +1,157 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: snakemake_dag Pages: 1 -->
+<svg width="329pt" height="404pt"
+ viewBox="0.00 0.00 329.03 404.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 400)">
+<title>snakemake_dag</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-400 325.03,-400 325.03,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<path fill="none" stroke="#d85656" stroke-width="2" d="M175,-36C175,-36 145,-36 145,-36 139,-36 133,-30 133,-24 133,-24 133,-12 133,-12 133,-6 139,0 145,0 145,0 175,0 175,0 181,0 187,-6 187,-12 187,-12 187,-24 187,-24 187,-30 181,-36 175,-36"/>
+<text text-anchor="middle" x="160" y="-15.5" font-family="sans" font-size="10.00">all</text>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<path fill="none" stroke="#568ad8" stroke-width="2" d="M128,-180C128,-180 98,-180 98,-180 92,-180 86,-174 86,-168 86,-168 86,-156 86,-156 86,-150 92,-144 98,-144 98,-144 128,-144 128,-144 134,-144 140,-150 140,-156 140,-156 140,-168 140,-168 140,-174 134,-180 128,-180"/>
+<text text-anchor="middle" x="113" y="-159.5" font-family="sans" font-size="10.00">quast</text>
+</g>
+<!-- 1&#45;&gt;0 -->
+<g id="edge2" class="edge">
+<title>1&#45;&gt;0</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M118.67,-143.87C126.71,-119.56 141.52,-74.82 151.06,-46.01"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="154.49,-46.78 154.31,-36.19 147.85,-44.58 154.49,-46.78"/>
+</g>
+<!-- 2 -->
+<g id="node3" class="node">
+<title>2</title>
+<path fill="none" stroke="#bed856" stroke-width="2" d="M86,-252C86,-252 12,-252 12,-252 6,-252 0,-246 0,-240 0,-240 0,-228 0,-228 0,-222 6,-216 12,-216 12,-216 86,-216 86,-216 92,-216 98,-222 98,-228 98,-228 98,-240 98,-240 98,-246 92,-252 86,-252"/>
+<text text-anchor="middle" x="49" y="-231.5" font-family="sans" font-size="10.00">annotate_prokka</text>
+</g>
+<!-- 2&#45;&gt;1 -->
+<g id="edge5" class="edge">
+<title>2&#45;&gt;1</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M64.82,-215.7C72.57,-207.22 82.04,-196.86 90.52,-187.58"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="93.2,-189.85 97.36,-180.1 88.03,-185.12 93.2,-189.85"/>
+</g>
+<!-- 9 -->
+<g id="node10" class="node">
+<title>9</title>
+<path fill="none" stroke="#56d8c1" stroke-width="2" d="M245.5,-180C245.5,-180 170.5,-180 170.5,-180 164.5,-180 158.5,-174 158.5,-168 158.5,-168 158.5,-156 158.5,-156 158.5,-150 164.5,-144 170.5,-144 170.5,-144 245.5,-144 245.5,-144 251.5,-144 257.5,-150 257.5,-156 257.5,-156 257.5,-168 257.5,-168 257.5,-174 251.5,-180 245.5,-180"/>
+<text text-anchor="middle" x="208" y="-159.5" font-family="sans" font-size="10.00">prepare_panaroo</text>
+</g>
+<!-- 2&#45;&gt;9 -->
+<g id="edge14" class="edge">
+<title>2&#45;&gt;9</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M87.9,-215.88C109.63,-206.31 136.9,-194.3 159.96,-184.15"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="161.49,-187.3 169.23,-180.07 158.67,-180.89 161.49,-187.3"/>
+</g>
+<!-- 3 -->
+<g id="node4" class="node">
+<title>3</title>
+<path fill="none" stroke="#59d856" stroke-width="2" d="M168,-396C168,-396 130,-396 130,-396 124,-396 118,-390 118,-384 118,-384 118,-372 118,-372 118,-366 124,-360 130,-360 130,-360 168,-360 168,-360 174,-360 180,-366 180,-372 180,-372 180,-384 180,-384 180,-390 174,-396 168,-396"/>
+<text text-anchor="middle" x="149" y="-375.5" font-family="sans" font-size="10.00">get_fasta</text>
+</g>
+<!-- 3&#45;&gt;2 -->
+<g id="edge6" class="edge">
+<title>3&#45;&gt;2</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M122.66,-359.85C109.93,-350.49 95.22,-337.97 85,-324 71.3,-305.27 61.85,-280.59 56.08,-261.97"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="59.43,-260.94 53.26,-252.33 52.71,-262.91 59.43,-260.94"/>
+</g>
+<!-- 5 -->
+<g id="node6" class="node">
+<title>5</title>
+<path fill="none" stroke="#56c1d8" stroke-width="2" d="M192,-324C192,-324 106,-324 106,-324 100,-324 94,-318 94,-312 94,-312 94,-300 94,-300 94,-294 100,-288 106,-288 106,-288 192,-288 192,-288 198,-288 204,-294 204,-300 204,-300 204,-312 204,-312 204,-318 198,-324 192,-324"/>
+<text text-anchor="middle" x="149" y="-303.5" font-family="sans" font-size="10.00">prepare_yaml_files</text>
+</g>
+<!-- 3&#45;&gt;5 -->
+<g id="edge8" class="edge">
+<title>3&#45;&gt;5</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M149,-359.7C149,-351.98 149,-342.71 149,-334.11"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="152.5,-334.1 149,-324.1 145.5,-334.1 152.5,-334.1"/>
+</g>
+<!-- 6 -->
+<g id="node7" class="node">
+<title>6</title>
+<path fill="none" stroke="#d88d56" stroke-width="2" d="M304.5,-252C304.5,-252 235.5,-252 235.5,-252 229.5,-252 223.5,-246 223.5,-240 223.5,-240 223.5,-228 223.5,-228 223.5,-222 229.5,-216 235.5,-216 235.5,-216 304.5,-216 304.5,-216 310.5,-216 316.5,-222 316.5,-228 316.5,-228 316.5,-240 316.5,-240 316.5,-246 310.5,-252 304.5,-252"/>
+<text text-anchor="middle" x="270" y="-231.5" font-family="sans" font-size="10.00">annotate_bakta</text>
+</g>
+<!-- 3&#45;&gt;6 -->
+<g id="edge10" class="edge">
+<title>3&#45;&gt;6</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M180.3,-377.23C219.17,-375.61 284.13,-366.26 315,-324 329.66,-303.94 313.74,-278.4 296.92,-259.83"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="299.19,-257.14 289.75,-252.34 294.14,-261.98 299.19,-257.14"/>
+</g>
+<!-- 4 -->
+<g id="node5" class="node">
+<title>4</title>
+<path fill="none" stroke="#d8bc56" stroke-width="2" d="M193.5,-252C193.5,-252 128.5,-252 128.5,-252 122.5,-252 116.5,-246 116.5,-240 116.5,-240 116.5,-228 116.5,-228 116.5,-222 122.5,-216 128.5,-216 128.5,-216 193.5,-216 193.5,-216 199.5,-216 205.5,-222 205.5,-228 205.5,-228 205.5,-240 205.5,-240 205.5,-246 199.5,-252 193.5,-252"/>
+<text text-anchor="middle" x="161" y="-231.5" font-family="sans" font-size="10.00">annotate_pgap</text>
+</g>
+<!-- 4&#45;&gt;1 -->
+<g id="edge4" class="edge">
+<title>4&#45;&gt;1</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M149.13,-215.7C143.5,-207.47 136.65,-197.48 130.43,-188.42"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="133.27,-186.37 124.73,-180.1 127.5,-190.33 133.27,-186.37"/>
+</g>
+<!-- 4&#45;&gt;9 -->
+<g id="edge13" class="edge">
+<title>4&#45;&gt;9</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M172.62,-215.7C178.14,-207.47 184.85,-197.48 190.93,-188.42"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="193.85,-190.36 196.52,-180.1 188.04,-186.46 193.85,-190.36"/>
+</g>
+<!-- 5&#45;&gt;4 -->
+<g id="edge7" class="edge">
+<title>5&#45;&gt;4</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M151.97,-287.7C153.29,-279.98 154.88,-270.71 156.35,-262.11"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="159.83,-262.55 158.07,-252.1 152.93,-261.37 159.83,-262.55"/>
+</g>
+<!-- 6&#45;&gt;1 -->
+<g id="edge3" class="edge">
+<title>6&#45;&gt;1</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M230.37,-215.98C208.71,-206.6 181.25,-194.58 149.24,-180.06"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="150.64,-176.85 140.09,-175.89 147.74,-183.22 150.64,-176.85"/>
+</g>
+<!-- 6&#45;&gt;9 -->
+<g id="edge12" class="edge">
+<title>6&#45;&gt;9</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M254.67,-215.7C247.24,-207.3 238.18,-197.07 230.02,-187.86"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="232.4,-185.27 223.15,-180.1 227.16,-189.91 232.4,-185.27"/>
+</g>
+<!-- 7 -->
+<g id="node8" class="node">
+<title>7</title>
+<path fill="none" stroke="#88d856" stroke-width="2" d="M293.5,-324C293.5,-324 234.5,-324 234.5,-324 228.5,-324 222.5,-318 222.5,-312 222.5,-312 222.5,-300 222.5,-300 222.5,-294 228.5,-288 234.5,-288 234.5,-288 293.5,-288 293.5,-288 299.5,-288 305.5,-294 305.5,-300 305.5,-300 305.5,-312 305.5,-312 305.5,-318 299.5,-324 293.5,-324"/>
+<text text-anchor="middle" x="264" y="-303.5" font-family="sans" font-size="10.00">get_bakta_db</text>
+</g>
+<!-- 7&#45;&gt;6 -->
+<g id="edge9" class="edge">
+<title>7&#45;&gt;6</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M265.48,-287.7C266.14,-279.98 266.94,-270.71 267.68,-262.11"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="271.17,-262.37 268.53,-252.1 264.19,-261.77 271.17,-262.37"/>
+</g>
+<!-- 8 -->
+<g id="node9" class="node">
+<title>8</title>
+<path fill="none" stroke="#56d88a" stroke-width="2" d="M214,-108C214,-108 182,-108 182,-108 176,-108 170,-102 170,-96 170,-96 170,-84 170,-84 170,-78 176,-72 182,-72 182,-72 214,-72 214,-72 220,-72 226,-78 226,-84 226,-84 226,-96 226,-96 226,-102 220,-108 214,-108"/>
+<text text-anchor="middle" x="198" y="-87.5" font-family="sans" font-size="10.00">panaroo</text>
+</g>
+<!-- 8&#45;&gt;0 -->
+<g id="edge1" class="edge">
+<title>8&#45;&gt;0</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M188.61,-71.7C184.24,-63.64 178.94,-53.89 174.1,-44.98"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="177.13,-43.22 169.29,-36.1 170.98,-46.56 177.13,-43.22"/>
+</g>
+<!-- 9&#45;&gt;8 -->
+<g id="edge11" class="edge">
+<title>9&#45;&gt;8</title>
+<path fill="none" stroke="grey" stroke-width="2" d="M205.53,-143.7C204.43,-135.98 203.1,-126.71 201.87,-118.11"/>
+<polygon fill="grey" stroke="grey" stroke-width="2" points="205.32,-117.51 200.44,-108.1 198.39,-118.5 205.32,-117.51"/>
+</g>
+</g>
+</svg>

From 8fa11b19776d0415e9cdf2c8e50b7911558572d1 Mon Sep 17 00:00:00 2001
From: Rina Ahmed-Begrich <begrich@mpusp.mpg.de>
Date: Wed, 10 Dec 2025 15:39:28 +0100
Subject: [PATCH 17/18] chore: define parameters in config readme.

---
 config/README.md | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/config/README.md b/config/README.md
index a5c8d18..4d77e2c 100644
--- a/config/README.md
+++ b/config/README.md
@@ -20,5 +20,36 @@ The samplesheet table has the following layout:
 | sample | species                  | strain | id_prefix | file           |
 | ------ | ------------------------ | ------ | --------- | -------------- |
 | EC2224 | "Streptococcus pyogenes" | SF370  | SPY       | assembly.fasta |
+| ...    | ...                      | ...    | ...       | ...            |
 
-**Note:** Pangenome analysis with `Panaroo` requires at least two samples.
\ No newline at end of file
+**Note:** Pangenome analysis with `Panaroo` requires at least two samples.
+
+### Parameters
+
+This table lists all parameters that can be used to run the workflow.
+
+| Parameter | Type | Details | Default |
+|:---|:---|:---|:---|
+| **samplesheet** | string | Path to the sample sheet file in csv format | |
+| **tool** | array[string] | Annotation tool to use (one of `prokka`, `pgap`, `bakta`) | |
+| **pgap** | | PGAP configuration object |  |
+| bin | string | Path to the PGAP script | |
+| use_yaml_config | boolean | Whether to use YAML configuration for PGAP | `False` |
+| _prepare_yaml_files_ | | Paths to YAML templates for PGAP | |
+| generic | string | Path to the generic YAML configuration file | |
+| submol | string | Path to the submol YAML configuration file | |
+| **prokka** | | Prokka configuration object | |
+| center | string | Center name for Prokka annotation (used in sequence IDs) | |
+| extra | string | Extra command-line arguments for Prokka | |
+| **bakta** | | Bakta configuration object | |
+| download_db | string | Bakta database type (`full`, `light`, or `none`) | `light` |
+| existing_db | string | Path to an existing Bakta database (optional). Needs to be combined with `download_db='none'` | |
+| extra | string | Extra command-line arguments for Bakta | |
+| **quast** | | QUAST configuration object | |
+| reference_fasta | string | Path to the reference genome for QUAST | |
+| reference_gff | string | Path to the reference annotation for QUAST |
+| extra | string | Extra command-line arguments for QUAST | |
+| **panaroo** | | Panaroo configuration object | |
+| remove_source | string | Source types to remove in Panaroo (regex supported) | `cmsearch` |
+| remove_feature | string | Feature types to remove in Panaroo (regex supported) | `tRNA\|rRNA\|ncRNA\|exon\|sequence_feature` |
+| extra | string | Extra command-line arguments for Panaroo | |

From dcbd2f49b0db125f941d8ba5634350908284bc98 Mon Sep 17 00:00:00 2001
From: Rina Ahmed-Begrich <begrich@mpusp.mpg.de>
Date: Wed, 10 Dec 2025 15:49:57 +0100
Subject: [PATCH 18/18] chore: update defaults in config readme

---
 config/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/config/README.md b/config/README.md
index 4d77e2c..d864e26 100644
--- a/config/README.md
+++ b/config/README.md
@@ -40,10 +40,10 @@ This table lists all parameters that can be used to run the workflow.
 | submol | string | Path to the submol YAML configuration file | |
 | **prokka** | | Prokka configuration object | |
 | center | string | Center name for Prokka annotation (used in sequence IDs) | |
-| extra | string | Extra command-line arguments for Prokka | |
+| extra | string | Extra command-line arguments for Prokka | `--addgenes` |
 | **bakta** | | Bakta configuration object | |
 | download_db | string | Bakta database type (`full`, `light`, or `none`) | `light` |
-| existing_db | string | Path to an existing Bakta database (optional). Needs to be combined with `download_db='none'` | |
+| existing_db | string | Path to an existing Bakta database (optional). Needs to be combined with `download_db='none'` | `--keep-contig-headers --compliant` |
 | extra | string | Extra command-line arguments for Bakta | |
 | **quast** | | QUAST configuration object | |
 | reference_fasta | string | Path to the reference genome for QUAST | |
@@ -52,4 +52,4 @@ This table lists all parameters that can be used to run the workflow.
 | **panaroo** | | Panaroo configuration object | |
 | remove_source | string | Source types to remove in Panaroo (regex supported) | `cmsearch` |
 | remove_feature | string | Feature types to remove in Panaroo (regex supported) | `tRNA\|rRNA\|ncRNA\|exon\|sequence_feature` |
-| extra | string | Extra command-line arguments for Panaroo | |
+| extra | string | Extra command-line arguments for Panaroo | `--clean-mode strict --remove-invalid-genes` |