From 0f12e7ee8ab9a63c2f2c2cdf05700c6f8d217f3f Mon Sep 17 00:00:00 2001 From: Villon CHEN Date: Fri, 14 Nov 2025 18:16:25 +0100 Subject: [PATCH 1/5] feat(sasts): do not run analysis from temporary directory as it breaks filepath parsing --- codesectools/sasts/core/sast/__init__.py | 79 +++++------------------- 1 file changed, 17 insertions(+), 62 deletions(-) diff --git a/codesectools/sasts/core/sast/__init__.py b/codesectools/sasts/core/sast/__init__.py index 35d07c8..51559ac 100644 --- a/codesectools/sasts/core/sast/__init__.py +++ b/codesectools/sasts/core/sast/__init__.py @@ -9,13 +9,11 @@ import os import random import shutil -import tempfile import time from abc import ABC from pathlib import Path from typing import Any, Literal, Union -import git from rich import print from rich.panel import Panel from rich.progress import Progress @@ -165,7 +163,7 @@ def save_results(self, project_dir: Path, output_dir: Path, extra: dict) -> None """ output_dir.mkdir(exist_ok=True, parents=True) - json.dump(extra, (output_dir / "cstools_output.json").open("w")) + json.dump(extra, (output_dir / "cstools_output.json").open("w"), indent=4) missing_files = [] for path_from_root, required in self.output_files: @@ -175,7 +173,7 @@ def save_results(self, project_dir: Path, output_dir: Path, extra: dict) -> None filepath = project_dir / parent_dir / filename if filepath.is_file(): if not filepath == output_dir / filename: - shutil.copy2(filepath, output_dir / filename) + filepath.rename(output_dir / filename) else: if required: missing_files.append(filename) @@ -184,7 +182,7 @@ def save_results(self, project_dir: Path, output_dir: Path, extra: dict) -> None if filepaths: for filepath in filepaths: if not filepath == output_dir / filename: - shutil.copy2(filepath, output_dir / filepath.name) + filepath.rename(output_dir / filepath.name) else: if required: missing_files.append(filename) @@ -218,25 +216,7 @@ def analyze_files( ) return - # Create temporary directory for the project - temp_dir = tempfile.TemporaryDirectory() - temp_path = Path(temp_dir.name) - - # Copy files into the temporary directory - if testing: - random.seed(os.environ.get("CONSTANT_RANDOM", os.urandom(16))) - files = random.sample(dataset.files, k=2) - else: - files = dataset.files - - for file in files: - file.save(temp_path) - - # Run analysis - self.run_analysis(dataset.lang, temp_path, result_path) - - # Clear temporary directory - temp_dir.cleanup() + self.run_analysis(dataset.lang, dataset.directory, result_path) def analyze_repos( self, dataset: GitRepoDataset, overwrite: bool = False, testing: bool = False @@ -252,8 +232,8 @@ def analyze_repos( testing: If True, run analysis on a sample of two small random repositories for testing purposes. """ - base_result_path = self.output_dir / dataset.full_name - base_result_path.mkdir(exist_ok=True, parents=True) + result_path = self.output_dir / dataset.full_name + result_path.mkdir(exist_ok=True, parents=True) if testing: random.seed(os.environ.get("CONSTANT_RANDOM", os.urandom(16))) @@ -263,27 +243,22 @@ def analyze_repos( repos = dataset.repos for repo in repos: - result_path = base_result_path / repo.name - if result_path.is_dir(): - if list(result_path.iterdir()) and not overwrite: + repo_result_path = result_path / repo.name + if repo_result_path.is_dir(): + if list(repo_result_path.iterdir()) and not overwrite: print(f"Results already exist for {repo.name}, skipping...") print("Please use --overwrite to analyze again") + continue - # Create temporary directory for the project - temp_dir = tempfile.TemporaryDirectory() - repo_path = Path(temp_dir.name) + repo_source_path = dataset.directory / repo.name - # Clone and checkout to the vulnerable commit - try: - repo.save(repo_path) - except git.GitCommandError: - continue + if repo_source_path.is_dir(): + shutil.rmtree(repo_source_path) - # Run analysis - self.run_analysis(dataset.lang, repo_path, result_path) + repo_source_path.mkdir() + repo.save(repo_source_path) - # Clear temporary directory - temp_dir.cleanup() + self.run_analysis(dataset.lang, repo_source_path, repo_result_path) @property def supported_dataset_full_names(self) -> list[str]: @@ -399,27 +374,7 @@ def analyze_files( ) return - # Create temporary directory for the project - temp_dir = tempfile.TemporaryDirectory() - temp_path = Path(temp_dir.name) - - # Copy files into the temporary directory - if testing: - random.seed(os.environ.get("CONSTANT_RANDOM", os.urandom(16))) - prebuilt_files = random.sample(dataset.list_prebuilt_files(), k=2) - else: - prebuilt_files = dataset.list_prebuilt_files() - - for prebuilt_file in prebuilt_files: - shutil.copy2(prebuilt_file, temp_path / prebuilt_file.name) - - # Run analysis - self.run_analysis( - dataset.lang, dataset.directory, result_path, artifacts=temp_path - ) - - # Clear temporary directory - temp_dir.cleanup() + self.run_analysis(dataset.lang, dataset.directory, result_path) class PrebuiltBuildlessSAST(PrebuiltSAST, BuildlessSAST): From e9c20c2977266aa4dcc088a4b714a70936ad77a3 Mon Sep 17 00:00:00 2001 From: Villon CHEN Date: Fri, 14 Nov 2025 18:23:55 +0100 Subject: [PATCH 2/5] feat(datasets): pass artifact arguments to benchmark commands --- .../datasets/BenchmarkJava/dataset.py | 6 ++++++ codesectools/datasets/core/dataset.py | 21 ++++++++++--------- codesectools/sasts/core/sast/__init__.py | 7 ++++++- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/codesectools/datasets/BenchmarkJava/dataset.py b/codesectools/datasets/BenchmarkJava/dataset.py index c7e2f61..ec25095 100644 --- a/codesectools/datasets/BenchmarkJava/dataset.py +++ b/codesectools/datasets/BenchmarkJava/dataset.py @@ -54,6 +54,11 @@ class BenchmarkJava(PrebuiltFileDataset): Attributes: name (str): The name of the dataset, "BenchmarkJava". supported_languages (list[str]): A list of supported programming languages. + license (str): The license under which the dataset is distributed. + license_url (str): A URL to the full text of the license. + build_command (str): The command to build the Java project. + prebuilt_expected (tuple): A tuple defining the path and glob pattern for expected build artifacts. + artefacts_arg (str): The argument to specify the location of build artifacts for SAST tools. """ @@ -64,6 +69,7 @@ class BenchmarkJava(PrebuiltFileDataset): build_command = "mvn clean compile" prebuilt_expected = (Path("target/classes/org/owasp/benchmark/testcode"), "*.class") + artefacts_arg = "." def __init__(self, lang: None | str = None) -> None: """Initialize the BenchmarkJava dataset. diff --git a/codesectools/datasets/core/dataset.py b/codesectools/datasets/core/dataset.py index 1ba6145..97ed03e 100644 --- a/codesectools/datasets/core/dataset.py +++ b/codesectools/datasets/core/dataset.py @@ -150,10 +150,19 @@ def list_dataset_full_names(cls) -> list[str]: class PrebuiltDatasetMixin: - """Provide functionality for datasets that require a build step.""" + """Provide functionality for datasets that require a build step. + + Attributes: + build_command (str): The command required to build the dataset. + prebuilt_expected (tuple[Path, str]): A tuple containing the path and glob pattern + to find the built artifacts. + artefacts_arg (str): The argument to pass to the SAST tool command template. + + """ build_command: str prebuilt_expected: tuple[Path, str] + artefacts_arg: str def is_built(self) -> bool: """Check if the dataset has been built.""" @@ -271,15 +280,7 @@ def save(self, dir: Path) -> None: class FileDataset(Dataset): - """Abstract base class for datasets composed of individual files. - - Attributes: - directory (Path): The directory path for the dataset. - lang (str): The programming language of the dataset. - full_name (str): The full name of the dataset, including the language. - files (list[File]): A list of `File` objects loaded from the dataset. - - """ + """Abstract base class for datasets composed of individual files.""" def __init__(self, lang: str) -> None: """Initialize a FileDataset instance. diff --git a/codesectools/sasts/core/sast/__init__.py b/codesectools/sasts/core/sast/__init__.py index 51559ac..bc8a9bf 100644 --- a/codesectools/sasts/core/sast/__init__.py +++ b/codesectools/sasts/core/sast/__init__.py @@ -374,7 +374,12 @@ def analyze_files( ) return - self.run_analysis(dataset.lang, dataset.directory, result_path) + self.run_analysis( + dataset.lang, + dataset.directory, + result_path, + artifacts=dataset.artefacts_arg, + ) class PrebuiltBuildlessSAST(PrebuiltSAST, BuildlessSAST): From 1ff8d9c8998c6a205a125be144f94d743dab8c7c Mon Sep 17 00:00:00 2001 From: Villon CHEN Date: Fri, 14 Nov 2025 18:25:22 +0100 Subject: [PATCH 3/5] feat(sasts): provide temporary directory for analysis tools --- codesectools/sasts/core/sast/__init__.py | 7 +++++++ codesectools/sasts/tools/Cppcheck/sast.py | 1 + 2 files changed, 8 insertions(+) diff --git a/codesectools/sasts/core/sast/__init__.py b/codesectools/sasts/core/sast/__init__.py index bc8a9bf..c874570 100644 --- a/codesectools/sasts/core/sast/__init__.py +++ b/codesectools/sasts/core/sast/__init__.py @@ -9,6 +9,7 @@ import os import random import shutil +import tempfile import time from abc import ABC from pathlib import Path @@ -121,9 +122,15 @@ def run_analysis( render_variables[to_replace] = v elif isinstance(v, Path): render_variables[to_replace] = str(v.resolve()) + elif isinstance(v, list): + render_variables[to_replace] = v else: raise NotImplementedError(k, v) + # Make temporary directory available to command + temp_dir = tempfile.TemporaryDirectory() + render_variables["{tempdir}"] = temp_dir.name + with Progress() as progress: progress.add_task( f"[b][{self.name}][/b] analyzing: [i]{project_dir.name}[/i]", diff --git a/codesectools/sasts/tools/Cppcheck/sast.py b/codesectools/sasts/tools/Cppcheck/sast.py index 611d7ff..003492f 100644 --- a/codesectools/sasts/tools/Cppcheck/sast.py +++ b/codesectools/sasts/tools/Cppcheck/sast.py @@ -50,6 +50,7 @@ class CppcheckSAST(PrebuiltBuildlessSAST): "--enable=all", "--xml", "--output-file=cppcheck_output.xml", + "--cppcheck-build-dir={tempdir}", ] ] valid_codes = [0] From 59955ea5e4fb7bf811aeec0ee020ee851caccf76 Mon Sep 17 00:00:00 2001 From: Villon CHEN Date: Mon, 17 Nov 2025 11:21:08 +0100 Subject: [PATCH 4/5] feat(utils): render list of arguments --- codesectools/utils.py | 56 +++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/codesectools/utils.py b/codesectools/utils.py index 7ac12fb..0ccdb64 100644 --- a/codesectools/utils.py +++ b/codesectools/utils.py @@ -6,6 +6,7 @@ """ import os +import re import subprocess from collections.abc import Sequence from importlib.resources import files @@ -39,31 +40,60 @@ def DEBUG() -> bool: # Subprocess wrapper -def render_command(command: list[str], map: dict[str, str]) -> list[str]: +def get_pattern(arg: str, mapping: dict[str, str]) -> str | None: + """Find a placeholder pattern like '{placeholder}' in an argument string. + + Args: + arg: The string to search for a pattern. + mapping: A dictionary of placeholders, kept for contextual consistency + with `render_command`. + + Returns: + The found pattern string (e.g., '{placeholder}') or None if not found. + + """ + if m := re.search(r"\{.*\}", arg): + return m.group(0) + + +def render_command(command: list, mapping: dict[str, str]) -> list[str]: """Render a command template by replacing placeholders with values. + Substitutes placeholders in a command list from a given map. It handles + simple string arguments and optional arguments represented as tuples. + If a mapped value is a list, the argument is expanded. + Args: - command: The command template as a list of strings. - map: A dictionary of placeholders to their replacement values. + command: The command template, which can contain strings and tuples + of the form `(default, optional_template)`. + mapping: A dictionary of placeholders to their replacement values. Returns: The rendered command as a list of strings. """ _command = command.copy() - for pattern, value in map.items(): - for i, arg in enumerate(_command): - # Check if optional argument can be used - if isinstance(arg, tuple): - default_arg, optional_arg = arg - if pattern in optional_arg: - _command[i] = arg.replace(pattern, value) + for i, arg in enumerate(_command): + # Check if optional argument can be used + if isinstance(arg, tuple): + default_arg, optional_arg = arg + + if pattern := get_pattern(optional_arg, mapping): + _command[i] = optional_arg.replace(pattern, mapping[pattern]) + elif pattern := get_pattern(default_arg, mapping): + _command[i] = default_arg.replace(pattern, mapping[pattern]) + else: + if pattern := get_pattern(arg, mapping): + value = mapping[pattern] + if isinstance(value, list): + _command[i] = " ".join( + arg.replace(pattern, subvalue) for subvalue in value + ) else: - _command[i] = default_arg - else: - if pattern in arg: _command[i] = arg.replace(pattern, value) + _command = " ".join(_command).split(" ") + # Remove not rendered part of the command: __command = [] for part in _command: From 78e0a0cd1107d94fead9c4c11ed7d93f6b084513 Mon Sep 17 00:00:00 2001 From: Villon CHEN Date: Mon, 17 Nov 2025 11:22:10 +0100 Subject: [PATCH 5/5] feat(sasts): provide `CPU_COUNT` for SAST tool's command --- codesectools/sasts/tools/Cppcheck/sast.py | 2 ++ codesectools/utils.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/codesectools/sasts/tools/Cppcheck/sast.py b/codesectools/sasts/tools/Cppcheck/sast.py index 003492f..518a820 100644 --- a/codesectools/sasts/tools/Cppcheck/sast.py +++ b/codesectools/sasts/tools/Cppcheck/sast.py @@ -13,6 +13,7 @@ SASTRequirements, ) from codesectools.sasts.tools.Cppcheck.parser import CppcheckAnalysisResult +from codesectools.utils import CPU_COUNT class CppcheckSAST(PrebuiltBuildlessSAST): @@ -51,6 +52,7 @@ class CppcheckSAST(PrebuiltBuildlessSAST): "--xml", "--output-file=cppcheck_output.xml", "--cppcheck-build-dir={tempdir}", + f"-j{CPU_COUNT}", ] ] valid_codes = [0] diff --git a/codesectools/utils.py b/codesectools/utils.py index 0ccdb64..1ae8b0e 100644 --- a/codesectools/utils.py +++ b/codesectools/utils.py @@ -223,3 +223,6 @@ def shorten_path(p: str) -> str: if len(path.parts) > 3: return str(Path("...") / path.parts[-2] / path.parts[-1]) return p + + +CPU_COUNT = os.cpu_count()