From 0f12e7ee8ab9a63c2f2c2cdf05700c6f8d217f3f Mon Sep 17 00:00:00 2001
From: Villon CHEN <villon.chen@oppida.fr>
Date: Fri, 14 Nov 2025 18:16:25 +0100
Subject: [PATCH 1/5] feat(sasts): do not run analysis from temporary directory
 as it breaks filepath parsing

---
 codesectools/sasts/core/sast/__init__.py | 79 +++++-------------------
 1 file changed, 17 insertions(+), 62 deletions(-)

diff --git a/codesectools/sasts/core/sast/__init__.py b/codesectools/sasts/core/sast/__init__.py
index 35d07c8..51559ac 100644
--- a/codesectools/sasts/core/sast/__init__.py
+++ b/codesectools/sasts/core/sast/__init__.py
@@ -9,13 +9,11 @@
 import os
 import random
 import shutil
-import tempfile
 import time
 from abc import ABC
 from pathlib import Path
 from typing import Any, Literal, Union
 
-import git
 from rich import print
 from rich.panel import Panel
 from rich.progress import Progress
@@ -165,7 +163,7 @@ def save_results(self, project_dir: Path, output_dir: Path, extra: dict) -> None
 
         """
         output_dir.mkdir(exist_ok=True, parents=True)
-        json.dump(extra, (output_dir / "cstools_output.json").open("w"))
+        json.dump(extra, (output_dir / "cstools_output.json").open("w"), indent=4)
 
         missing_files = []
         for path_from_root, required in self.output_files:
@@ -175,7 +173,7 @@ def save_results(self, project_dir: Path, output_dir: Path, extra: dict) -> None
                 filepath = project_dir / parent_dir / filename
                 if filepath.is_file():
                     if not filepath == output_dir / filename:
-                        shutil.copy2(filepath, output_dir / filename)
+                        filepath.rename(output_dir / filename)
                 else:
                     if required:
                         missing_files.append(filename)
@@ -184,7 +182,7 @@ def save_results(self, project_dir: Path, output_dir: Path, extra: dict) -> None
                 if filepaths:
                     for filepath in filepaths:
                         if not filepath == output_dir / filename:
-                            shutil.copy2(filepath, output_dir / filepath.name)
+                            filepath.rename(output_dir / filepath.name)
                 else:
                     if required:
                         missing_files.append(filename)
@@ -218,25 +216,7 @@ def analyze_files(
                 )
                 return
 
-        # Create temporary directory for the project
-        temp_dir = tempfile.TemporaryDirectory()
-        temp_path = Path(temp_dir.name)
-
-        # Copy files into the temporary directory
-        if testing:
-            random.seed(os.environ.get("CONSTANT_RANDOM", os.urandom(16)))
-            files = random.sample(dataset.files, k=2)
-        else:
-            files = dataset.files
-
-        for file in files:
-            file.save(temp_path)
-
-        # Run analysis
-        self.run_analysis(dataset.lang, temp_path, result_path)
-
-        # Clear temporary directory
-        temp_dir.cleanup()
+        self.run_analysis(dataset.lang, dataset.directory, result_path)
 
     def analyze_repos(
         self, dataset: GitRepoDataset, overwrite: bool = False, testing: bool = False
@@ -252,8 +232,8 @@ def analyze_repos(
             testing: If True, run analysis on a sample of two small random repositories for testing purposes.
 
         """
-        base_result_path = self.output_dir / dataset.full_name
-        base_result_path.mkdir(exist_ok=True, parents=True)
+        result_path = self.output_dir / dataset.full_name
+        result_path.mkdir(exist_ok=True, parents=True)
 
         if testing:
             random.seed(os.environ.get("CONSTANT_RANDOM", os.urandom(16)))
@@ -263,27 +243,22 @@ def analyze_repos(
             repos = dataset.repos
 
         for repo in repos:
-            result_path = base_result_path / repo.name
-            if result_path.is_dir():
-                if list(result_path.iterdir()) and not overwrite:
+            repo_result_path = result_path / repo.name
+            if repo_result_path.is_dir():
+                if list(repo_result_path.iterdir()) and not overwrite:
                     print(f"Results already exist for {repo.name}, skipping...")
                     print("Please use --overwrite to analyze again")
+                    continue
 
-            # Create temporary directory for the project
-            temp_dir = tempfile.TemporaryDirectory()
-            repo_path = Path(temp_dir.name)
+            repo_source_path = dataset.directory / repo.name
 
-            # Clone and checkout to the vulnerable commit
-            try:
-                repo.save(repo_path)
-            except git.GitCommandError:
-                continue
+            if repo_source_path.is_dir():
+                shutil.rmtree(repo_source_path)
 
-            # Run analysis
-            self.run_analysis(dataset.lang, repo_path, result_path)
+            repo_source_path.mkdir()
+            repo.save(repo_source_path)
 
-            # Clear temporary directory
-            temp_dir.cleanup()
+            self.run_analysis(dataset.lang, repo_source_path, repo_result_path)
 
     @property
     def supported_dataset_full_names(self) -> list[str]:
@@ -399,27 +374,7 @@ def analyze_files(
                 )
                 return
 
-        # Create temporary directory for the project
-        temp_dir = tempfile.TemporaryDirectory()
-        temp_path = Path(temp_dir.name)
-
-        # Copy files into the temporary directory
-        if testing:
-            random.seed(os.environ.get("CONSTANT_RANDOM", os.urandom(16)))
-            prebuilt_files = random.sample(dataset.list_prebuilt_files(), k=2)
-        else:
-            prebuilt_files = dataset.list_prebuilt_files()
-
-        for prebuilt_file in prebuilt_files:
-            shutil.copy2(prebuilt_file, temp_path / prebuilt_file.name)
-
-        # Run analysis
-        self.run_analysis(
-            dataset.lang, dataset.directory, result_path, artifacts=temp_path
-        )
-
-        # Clear temporary directory
-        temp_dir.cleanup()
+        self.run_analysis(dataset.lang, dataset.directory, result_path)
 
 
 class PrebuiltBuildlessSAST(PrebuiltSAST, BuildlessSAST):

From e9c20c2977266aa4dcc088a4b714a70936ad77a3 Mon Sep 17 00:00:00 2001
From: Villon CHEN <villon.chen@oppida.fr>
Date: Fri, 14 Nov 2025 18:23:55 +0100
Subject: [PATCH 2/5] feat(datasets): pass artifact arguments to benchmark
 commands

---
 .../datasets/BenchmarkJava/dataset.py         |  6 ++++++
 codesectools/datasets/core/dataset.py         | 21 ++++++++++---------
 codesectools/sasts/core/sast/__init__.py      |  7 ++++++-
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/codesectools/datasets/BenchmarkJava/dataset.py b/codesectools/datasets/BenchmarkJava/dataset.py
index c7e2f61..ec25095 100644
--- a/codesectools/datasets/BenchmarkJava/dataset.py
+++ b/codesectools/datasets/BenchmarkJava/dataset.py
@@ -54,6 +54,11 @@ class BenchmarkJava(PrebuiltFileDataset):
     Attributes:
         name (str): The name of the dataset, "BenchmarkJava".
         supported_languages (list[str]): A list of supported programming languages.
+        license (str): The license under which the dataset is distributed.
+        license_url (str): A URL to the full text of the license.
+        build_command (str): The command to build the Java project.
+        prebuilt_expected (tuple): A tuple defining the path and glob pattern for expected build artifacts.
+        artefacts_arg (str): The argument to specify the location of build artifacts for SAST tools.
 
     """
 
@@ -64,6 +69,7 @@ class BenchmarkJava(PrebuiltFileDataset):
 
     build_command = "mvn clean compile"
     prebuilt_expected = (Path("target/classes/org/owasp/benchmark/testcode"), "*.class")
+    artefacts_arg = "."
 
     def __init__(self, lang: None | str = None) -> None:
         """Initialize the BenchmarkJava dataset.
diff --git a/codesectools/datasets/core/dataset.py b/codesectools/datasets/core/dataset.py
index 1ba6145..97ed03e 100644
--- a/codesectools/datasets/core/dataset.py
+++ b/codesectools/datasets/core/dataset.py
@@ -150,10 +150,19 @@ def list_dataset_full_names(cls) -> list[str]:
 
 
 class PrebuiltDatasetMixin:
-    """Provide functionality for datasets that require a build step."""
+    """Provide functionality for datasets that require a build step.
+
+    Attributes:
+        build_command (str): The command required to build the dataset.
+        prebuilt_expected (tuple[Path, str]): A tuple containing the path and glob pattern
+            to find the built artifacts.
+        artefacts_arg (str): The argument to pass to the SAST tool command template.
+
+    """
 
     build_command: str
     prebuilt_expected: tuple[Path, str]
+    artefacts_arg: str
 
     def is_built(self) -> bool:
         """Check if the dataset has been built."""
@@ -271,15 +280,7 @@ def save(self, dir: Path) -> None:
 
 
 class FileDataset(Dataset):
-    """Abstract base class for datasets composed of individual files.
-
-    Attributes:
-        directory (Path): The directory path for the dataset.
-        lang (str): The programming language of the dataset.
-        full_name (str): The full name of the dataset, including the language.
-        files (list[File]): A list of `File` objects loaded from the dataset.
-
-    """
+    """Abstract base class for datasets composed of individual files."""
 
     def __init__(self, lang: str) -> None:
         """Initialize a FileDataset instance.
diff --git a/codesectools/sasts/core/sast/__init__.py b/codesectools/sasts/core/sast/__init__.py
index 51559ac..bc8a9bf 100644
--- a/codesectools/sasts/core/sast/__init__.py
+++ b/codesectools/sasts/core/sast/__init__.py
@@ -374,7 +374,12 @@ def analyze_files(
                 )
                 return
 
-        self.run_analysis(dataset.lang, dataset.directory, result_path)
+        self.run_analysis(
+            dataset.lang,
+            dataset.directory,
+            result_path,
+            artifacts=dataset.artefacts_arg,
+        )
 
 
 class PrebuiltBuildlessSAST(PrebuiltSAST, BuildlessSAST):

From 1ff8d9c8998c6a205a125be144f94d743dab8c7c Mon Sep 17 00:00:00 2001
From: Villon CHEN <villon.chen@oppida.fr>
Date: Fri, 14 Nov 2025 18:25:22 +0100
Subject: [PATCH 3/5] feat(sasts): provide temporary directory for analysis
 tools

---
 codesectools/sasts/core/sast/__init__.py  | 7 +++++++
 codesectools/sasts/tools/Cppcheck/sast.py | 1 +
 2 files changed, 8 insertions(+)

diff --git a/codesectools/sasts/core/sast/__init__.py b/codesectools/sasts/core/sast/__init__.py
index bc8a9bf..c874570 100644
--- a/codesectools/sasts/core/sast/__init__.py
+++ b/codesectools/sasts/core/sast/__init__.py
@@ -9,6 +9,7 @@
 import os
 import random
 import shutil
+import tempfile
 import time
 from abc import ABC
 from pathlib import Path
@@ -121,9 +122,15 @@ def run_analysis(
                 render_variables[to_replace] = v
             elif isinstance(v, Path):
                 render_variables[to_replace] = str(v.resolve())
+            elif isinstance(v, list):
+                render_variables[to_replace] = v
             else:
                 raise NotImplementedError(k, v)
 
+        # Make temporary directory available to command
+        temp_dir = tempfile.TemporaryDirectory()
+        render_variables["{tempdir}"] = temp_dir.name
+
         with Progress() as progress:
             progress.add_task(
                 f"[b][{self.name}][/b] analyzing: [i]{project_dir.name}[/i]",
diff --git a/codesectools/sasts/tools/Cppcheck/sast.py b/codesectools/sasts/tools/Cppcheck/sast.py
index 611d7ff..003492f 100644
--- a/codesectools/sasts/tools/Cppcheck/sast.py
+++ b/codesectools/sasts/tools/Cppcheck/sast.py
@@ -50,6 +50,7 @@ class CppcheckSAST(PrebuiltBuildlessSAST):
             "--enable=all",
             "--xml",
             "--output-file=cppcheck_output.xml",
+            "--cppcheck-build-dir={tempdir}",
         ]
     ]
     valid_codes = [0]

From 59955ea5e4fb7bf811aeec0ee020ee851caccf76 Mon Sep 17 00:00:00 2001
From: Villon CHEN <villon.chen@oppida.fr>
Date: Mon, 17 Nov 2025 11:21:08 +0100
Subject: [PATCH 4/5] feat(utils): render list of arguments

---
 codesectools/utils.py | 56 +++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/codesectools/utils.py b/codesectools/utils.py
index 7ac12fb..0ccdb64 100644
--- a/codesectools/utils.py
+++ b/codesectools/utils.py
@@ -6,6 +6,7 @@
 """
 
 import os
+import re
 import subprocess
 from collections.abc import Sequence
 from importlib.resources import files
@@ -39,31 +40,60 @@ def DEBUG() -> bool:
 
 
 # Subprocess wrapper
-def render_command(command: list[str], map: dict[str, str]) -> list[str]:
+def get_pattern(arg: str, mapping: dict[str, str]) -> str | None:
+    """Find the first mapped placeholder like '{placeholder}' in an argument.
+
+    Args:
+        arg: The string to search for a pattern.
+        mapping: A dictionary of placeholders to values; only placeholders
+                 present in it are reported, so callers can index it safely.
+
+    Returns:
+        The found pattern string (e.g., '{placeholder}') or None if not found.
+
+    """
+    found = re.findall(r"\{[^{}]*\}", arg)
+    return next((p for p in found if p in mapping), None)
+
+
+def render_command(command: list, mapping: dict[str, str]) -> list[str]:
     """Render a command template by replacing placeholders with values.
 
+    Substitutes placeholders in a command list from a given map. It handles
+    simple string arguments and optional arguments represented as tuples.
+    If a mapped value is a list, the argument is expanded.
+
     Args:
-        command: The command template as a list of strings.
-        map: A dictionary of placeholders to their replacement values.
+        command: The command template, which can contain strings and tuples
+            of the form `(default, optional_template)`.
+        mapping: A dictionary of placeholders to their replacement values.
 
     Returns:
         The rendered command as a list of strings.
 
     """
     _command = command.copy()
-    for pattern, value in map.items():
-        for i, arg in enumerate(_command):
-            # Check if optional argument can be used
-            if isinstance(arg, tuple):
-                default_arg, optional_arg = arg
-                if pattern in optional_arg:
-                    _command[i] = arg.replace(pattern, value)
+    for i, arg in enumerate(_command):
+        # Check if optional argument can be used
+        if isinstance(arg, tuple):
+            default_arg, optional_arg = arg
+
+            if pattern := get_pattern(optional_arg, mapping):
+                _command[i] = optional_arg.replace(pattern, mapping[pattern])
+            elif pattern := get_pattern(default_arg, mapping):
+                _command[i] = default_arg.replace(pattern, mapping[pattern])
+        else:
+            if pattern := get_pattern(arg, mapping):
+                value = mapping[pattern]
+                if isinstance(value, list):
+                    _command[i] = " ".join(
+                        arg.replace(pattern, subvalue) for subvalue in value
+                    )
                 else:
-                    _command[i] = default_arg
-            else:
-                if pattern in arg:
                     _command[i] = arg.replace(pattern, value)
 
+    _command = " ".join(_command).split(" ")
+
     # Remove not rendered part of the command:
     __command = []
     for part in _command:

From 78e0a0cd1107d94fead9c4c11ed7d93f6b084513 Mon Sep 17 00:00:00 2001
From: Villon CHEN <villon.chen@oppida.fr>
Date: Mon, 17 Nov 2025 11:22:10 +0100
Subject: [PATCH 5/5] feat(sasts): provide `CPU_COUNT` for SAST tool's command

---
 codesectools/sasts/tools/Cppcheck/sast.py | 2 ++
 codesectools/utils.py                     | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/codesectools/sasts/tools/Cppcheck/sast.py b/codesectools/sasts/tools/Cppcheck/sast.py
index 003492f..518a820 100644
--- a/codesectools/sasts/tools/Cppcheck/sast.py
+++ b/codesectools/sasts/tools/Cppcheck/sast.py
@@ -13,6 +13,7 @@
     SASTRequirements,
 )
 from codesectools.sasts.tools.Cppcheck.parser import CppcheckAnalysisResult
+from codesectools.utils import CPU_COUNT
 
 
 class CppcheckSAST(PrebuiltBuildlessSAST):
@@ -51,6 +52,7 @@ class CppcheckSAST(PrebuiltBuildlessSAST):
             "--xml",
             "--output-file=cppcheck_output.xml",
             "--cppcheck-build-dir={tempdir}",
+            f"-j{CPU_COUNT}",
         ]
     ]
     valid_codes = [0]
diff --git a/codesectools/utils.py b/codesectools/utils.py
index 0ccdb64..1ae8b0e 100644
--- a/codesectools/utils.py
+++ b/codesectools/utils.py
@@ -223,3 +223,6 @@ def shorten_path(p: str) -> str:
     if len(path.parts) > 3:
         return str(Path("...") / path.parts[-2] / path.parts[-1])
     return p
+
+
+CPU_COUNT = os.cpu_count() or 1  # cpu_count() may return None; never emit "-jNone"