diff --git a/codesectools/datasets/JulietTestSuiteC/__init__.py b/codesectools/datasets/JulietTestSuiteC/__init__.py
new file mode 100644
index 0000000..69592f3
--- /dev/null
+++ b/codesectools/datasets/JulietTestSuiteC/__init__.py
@@ -0,0 +1 @@
+"""Initializes the JulietTestSuiteC dataset module."""
diff --git a/codesectools/datasets/JulietTestSuiteC/dataset.py b/codesectools/datasets/JulietTestSuiteC/dataset.py
new file mode 100644
index 0000000..78e578b
--- /dev/null
+++ b/codesectools/datasets/JulietTestSuiteC/dataset.py
@@ -0,0 +1,178 @@
+"""Defines the JulietTestSuiteC dataset for evaluating SAST tools on C code.
+
+This module provides the classes and logic to load the Juliet Test Suite for C/C++,
+which consists of C test files with known vulnerabilities. It downloads the source code
+from the NIST Software Assurance Reference Dataset (SARD) and parses an XML manifest
+to associate test files with expected results.
+"""
+
+import io
+import re
+import shutil
+import zipfile
+from pathlib import Path
+from typing import Self
+
+import requests
+from lxml import etree
+
+from codesectools.datasets.core.dataset import File, PrebuiltFileDataset
+from codesectools.shared.cwe import CWE, CWEs
+from codesectools.utils import CPU_COUNT
+
+# Archive of the Juliet Test Suite for C/C++ v1.3 published on NIST SARD.
+ARCHIVE_URL = (
+    "https://samate.nist.gov/SARD/downloads/test-suites/"
+    "2017-10-01-juliet-test-suite-for-c-cplusplus-v1-3.zip"
+)
+# (connect, read) timeouts in seconds; without them requests waits indefinitely.
+DOWNLOAD_TIMEOUT = (10, 60)
+# Extracts the numeric CWE identifier from a manifest flaw name.
+CWE_ID_PATTERN = re.compile(r"CWE-(\d+)")
+
+
+class TestCode(File):
+    """Represents a single test file in the JulietTestSuiteC dataset."""
+
+    def __init__(
+        self,
+        filepath: Path,
+        content: str | bytes,
+        cwes: list[CWE],
+        has_vuln: bool,
+    ) -> None:
+        """Initialize a TestCode instance.
+
+        Args:
+            filepath: The path to the file.
+            content: The content of the file, as a string or bytes.
+            cwes: A list of CWEs associated with the file.
+            has_vuln: A boolean indicating if the vulnerability is real or a false positive test case.
+
+        """
+        super().__init__(
+            filepath=filepath, content=content, cwes=cwes, has_vuln=has_vuln
+        )
+
+
+class JulietTestSuiteC(PrebuiltFileDataset):
+    """Represents the Juliet Test Suite for C/C++.
+
+    This class handles downloading, extracting, and loading the C/C++ test cases
+    from the Juliet Test Suite.
+    """
+
+    name = "JulietTestSuiteC"
+    supported_languages = ["c"]
+    license = "CC0 1.0 Universal"
+    license_url = "https://data.niaid.nih.gov/resources?id=zenodo_4701386#description"
+
+    build_command = f"bear -- make -C ./C individuals -j{CPU_COUNT}"
+    prebuilt_expected = (Path("."), "compile_commands.json")
+    artefacts_arg = "compile_commands.json"
+
+    def __init__(self, lang: None | str = None) -> None:
+        """Initialize the JulietTestSuiteC dataset.
+
+        Args:
+            lang: The programming language of the dataset files.
+                Must be one of the supported languages.
+
+        """
+        super().__init__(lang)
+
+    def __eq__(self, other: str | Self) -> bool:
+        """Compare this dataset with another object for equality.
+
+        Args:
+            other: The object to compare with. Can be a string (dataset name)
+                or another JulietTestSuiteC instance.
+
+        Returns:
+            True if the names are equal, False otherwise.
+
+        """
+        if isinstance(other, str):
+            return self.name == other
+        elif isinstance(other, self.__class__):
+            return self.name == other.name
+        else:
+            return False
+
+    def download_files(self: Self, test: bool = False) -> None:
+        """Download and extract the dataset from the NIST SARD website.
+
+        Downloads the zip archive, extracts its contents, and prunes the test cases
+        to a smaller subset for faster processing. If in test mode, it further
+        reduces the dataset to only a single CWE.
+
+        Args:
+            test: If True, reduce the number of test files for faster testing.
+
+        Raises:
+            requests.HTTPError: If the server answers with an error status.
+            requests.Timeout: If the server does not respond in time.
+
+        """
+        response = requests.get(ARCHIVE_URL, timeout=DOWNLOAD_TIMEOUT)
+        # Fail on an HTTP error here rather than passing an error page to zipfile.
+        response.raise_for_status()
+        with zipfile.ZipFile(io.BytesIO(response.content), "r") as zip_ref:
+            zip_ref.extractall(self.directory)
+
+        # Limit to one set for each CWE: every set other than s01 is renamed with
+        # a leading underscore, which no longer matches the "s*" pattern.
+        testcases = self.directory / "C" / "testcases"
+        for set_dir in list(testcases.glob("CWE*/s*")):
+            if set_dir.name != "s01":
+                shutil.move(set_dir, set_dir.parent / f"_{set_dir.name}")
+
+        if test:
+            # Test mode keeps only the CWE835 test cases.
+            for cwe_dir in list(testcases.glob("CWE*")):
+                if not cwe_dir.name.startswith("CWE835"):
+                    shutil.rmtree(cwe_dir)
+
+    def load_dataset(self) -> list[TestCode]:
+        """Load the JulietTestSuiteC dataset from the source files.
+
+        Parses the `manifest.xml` file to identify vulnerabilities in the C/C++
+        source files and creates a `TestCode` object for each file containing a flaw.
+        Files without a flaw entry, or whose flaw name carries no CWE identifier,
+        are skipped.
+
+        Returns:
+            A list of `TestCode` objects representing the dataset.
+
+        """
+        files = []
+        testcode_dir = self.directory / "C" / "testcases"
+        # NOTE(review): rglob is recursive, so it also finds files in the "_sNN"
+        # directories renamed by download_files - confirm that this is intended.
+        testcode_paths = {
+            path.name: path
+            for pattern in ("CWE*.c", "CWE*.cpp")
+            for path in testcode_dir.rglob(pattern)
+        }
+        manifest = etree.parse(self.directory / "C" / "manifest.xml")
+        for file_tree in manifest.xpath("/container/testcase/file"):
+            file_obj = testcode_paths.get(file_tree.get("path"))
+            if file_obj is None:
+                continue
+            flaws = file_tree.xpath("flaw")
+            if not flaws:
+                continue
+            # Only the first flaw of a file is used to label it. A missing name
+            # attribute is treated as "no CWE" instead of raising a TypeError.
+            m = CWE_ID_PATTERN.search(flaws[0].get("name") or "")
+            if m is None:
+                continue
+            files.append(
+                TestCode(
+                    filepath=file_obj.relative_to(self.directory),
+                    content=file_obj.read_bytes(),
+                    cwes=[CWEs.from_id(int(m.group(1)))],
+                    has_vuln=True,
+                )
+            )
+        return files
diff --git a/codesectools/sasts/tools/Cppcheck/sast.py b/codesectools/sasts/tools/Cppcheck/sast.py
index 518a820..485ac94 100644
--- a/codesectools/sasts/tools/Cppcheck/sast.py
+++ b/codesectools/sasts/tools/Cppcheck/sast.py
@@ -36,7 +36,7 @@ class CppcheckSAST(PrebuiltBuildlessSAST):
 
     name = "Cppcheck"
     supported_languages = ["c"]
-    supported_dataset_names = []
+    supported_dataset_names = ["JulietTestSuiteC"]
     properties = SASTProperties(free=True, offline=True)
     requirements = SASTRequirements(
         full_reqs=[
diff --git a/codesectools/sasts/tools/SemgrepCE/sast.py b/codesectools/sasts/tools/SemgrepCE/sast.py
index 6e5764e..c4a8600 100644
--- a/codesectools/sasts/tools/SemgrepCE/sast.py
+++ b/codesectools/sasts/tools/SemgrepCE/sast.py
@@ -37,7 +37,7 @@ class SemgrepCESAST(BuildlessSAST):
 
     name = "SemgrepCE"
     supported_languages = ["java", "c"]
-    supported_dataset_names = ["BenchmarkJava", "CVEfixes"]
+    supported_dataset_names = ["BenchmarkJava", "CVEfixes", "JulietTestSuiteC"]
     properties = SASTProperties(free=True, offline=True)
     requirements = SASTRequirements(
         full_reqs=[
diff --git a/codesectools/sasts/tools/SnykCode/sast.py b/codesectools/sasts/tools/SnykCode/sast.py
index c0301ca..0cbf304 100644
--- a/codesectools/sasts/tools/SnykCode/sast.py
+++ b/codesectools/sasts/tools/SnykCode/sast.py
@@ -33,7 +33,7 @@ class SnykCodeSAST(BuildlessSAST):
 
     name = "SnykCode"
     supported_languages = ["java", "c"]
-    supported_dataset_names = ["BenchmarkJava", "CVEfixes"]
+    supported_dataset_names = ["BenchmarkJava", "CVEfixes", "JulietTestSuiteC"]
     properties = SASTProperties(free=False, offline=False)
     requirements = SASTRequirements(
         full_reqs=[
diff --git a/docs/dataset/profiles/juliettestsuitec.yaml b/docs/dataset/profiles/juliettestsuitec.yaml
new file mode 100644
index 0000000..8dadcad
--- /dev/null
+++ b/docs/dataset/profiles/juliettestsuitec.yaml
@@ -0,0 +1,20 @@
+name: Juliet Test Suite for C/C++ v1.3
+description: A collection of test cases in the C/C++ language. It contains examples organized under 118 different CWEs. Version 1.3 adds test cases for increment and decrement and fixes some dozen systematic problems in 1.2 cases.
+type: File
+url: https://data.niaid.nih.gov/resources?id=zenodo_4701386#description
+supported_version: Latest
+supported_languages:
+  - C/C++
+legal:
+  license: CC0 1.0 Universal
+  license_type: Public Domain
+  license_url: https://data.niaid.nih.gov/resources?id=zenodo_4701386#description
+requirements:
+  - An internet connection is required **only** to download the dataset.
+extra: |
+  !!! info "Dataset content"
+
+      - Test files: `C/testcases/CWE*/**`
+      - Labeled data: `C/manifest.xml`
+
+      *Downloaded from [NIST SARD](https://samate.nist.gov/SARD/downloads/test-suites/2017-10-01-juliet-test-suite-for-c-cplusplus-v1-3.zip).*
diff --git a/pyproject.toml b/pyproject.toml
index a997f14..a9d8dbd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "CodeSecTools"
-version = "0.12.4"
+version = "0.13.0"
 description = "A framework for code security that provides abstractions for static analysis tools and datasets to support their integration, testing, and evaluation."
 readme = "README.md"
 license = "AGPL-3.0-only"
diff --git a/uv.lock b/uv.lock
index de3b98f..a964ef6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -221,7 +221,7 @@ wheels = [
 
 [[package]]
 name = "codesectools"
-version = "0.12.4"
+version = "0.13.0"
 source = { editable = "." }
 dependencies = [
     { name = "gitpython" },