From 73bc0a3308171b39cebd836db84d9eb3dc3576f3 Mon Sep 17 00:00:00 2001 From: Villon CHEN Date: Thu, 20 Nov 2025 12:30:06 +0100 Subject: [PATCH 1/3] feat(CWE): add parent and children --- codesectools/shared/cwe.py | 93 ++++++++++++++++++++++++++++++++------ 1 file changed, 78 insertions(+), 15 deletions(-) diff --git a/codesectools/shared/cwe.py b/codesectools/shared/cwe.py index 321176c..21ed93d 100644 --- a/codesectools/shared/cwe.py +++ b/codesectools/shared/cwe.py @@ -20,18 +20,31 @@ class CWE: id (int): The CWE identifier. name (str): The name of the weakness. description (str): A description of the weakness. + parent (CWE | None): The parent CWE weakness, if any. + children (set[CWE]): A set of child CWE weaknesses. """ - def __init__(self, id: int, name: str, description: str) -> None: + def __init__( + self, + id: int, + name: str, + description: str, + parent: Self | None = None, + children: set[Self] | None = None, + ) -> None: """Initialize a CWE instance. Args: id: The CWE identifier. name: The name of the weakness. description: A description of the weakness. + parent: The parent CWE weakness, if any. + children: A set of child CWE weaknesses, if any. """ + if children is None: + children = set() self.id = id if r := re.search(r"\('(.*)'\)", name): self.name = r.group(1) @@ -40,6 +53,8 @@ def __init__(self, id: int, name: str, description: str) -> None: self.name = self.full_name = name self.description = description + self.parent = parent + self.children = children or set() def __eq__(self, other: Self | int) -> bool: """Compare this CWE with another object for equality. @@ -72,6 +87,31 @@ def __repr__(self) -> str: """ return f"{self.__class__.__name__}(id={self.id})" + def extend(self, distance: int = 1) -> set[Self]: + """Retrieve the set of related CWEs within a specified distance in the hierarchy. + + Recursively finds parent and child CWEs up to the given distance level. + Includes the current CWE in the returned set. + + Args: + distance: The number of levels to traverse up (parents) and down (children). + Defaults to 1. + + Returns: + A set of CWE objects including the self and related weaknesses. + + """ + cwes: set[Self] = set([self]) + for _ in range(distance): + new_cwes = cwes.copy() + for cwe in cwes: + if cwe.parent: + new_cwes.add(cwe.parent) + for child in cwe.children: + new_cwes.add(child) + cwes = new_cwes.copy() + return cwes + class CWEsCollection: """Manage the collection of all CWEs. @@ -136,24 +176,50 @@ def download(self) -> None: ) progress.update(task, advance=25) - def load(self) -> list[CWE]: - """Load CWE data from the CSV file. + def load(self) -> dict[int, CWE]: + """Load and parse CWE data from cached CSV files. + + Reads the CSV files defined in `cwes_data`, instantiates `CWE` objects, + and establishes parent-child relationships based on the "Related Weaknesses" field. Returns: - A list of CWE objects. + A dictionary mapping CWE IDs (int) to `CWE` objects. """ - cwes = [] + cwes = {} + cwes_parent = {} + cwes_children = {} for filename in self.cwes_data.values(): reader = csv.DictReader((self.directory / filename).open(encoding="utf-8")) for cwe_dict in reader: - cwes.append( - CWE( - id=int(cwe_dict["CWE-ID"]), - name=cwe_dict["Name"], - description=cwe_dict["Description"], - ) + cwe_id = int(cwe_dict["CWE-ID"]) + + cwes[cwe_id] = CWE( + id=cwe_id, + name=cwe_dict["Name"], + description=cwe_dict["Description"], ) + + for related in cwe_dict["Related Weaknesses"].split("::"): + if m := re.search(r"NATURE:ChildOf:CWE ID:(\d+):", related): + parent_id = int(m.group(1)) + + cwes_parent[cwe_id] = parent_id + + if cwes_children.get(parent_id): + cwes_children[parent_id].add(cwe_id) + else: + cwes_children[parent_id] = {cwe_id} + + break + + for cwe_id, cwe in cwes.items(): + if p_id := cwes_parent.get(cwe_id): + cwe.parent = cwes.get(p_id, None) + for c_id in cwes_children.get(cwe_id, []): + if child_cwe := cwes.get(c_id): + cwe.children.add(child_cwe) + return cwes def from_string(self, cwe_string: str) -> CWE: @@ -181,10 +247,7 @@ def from_id(self, cwe_id: int) -> CWE: The CWE object if found, otherwise a default CWE object with ID -1. """ - try: - return self.cwes[self.cwes.index(cwe_id)] - except ValueError: - return self.NOCWE + return self.cwes.get(cwe_id, self.NOCWE) CWEs = CWEsCollection() From 5d24cee15a7c5bdbffcce8d61ecb3057754c1600 Mon Sep 17 00:00:00 2001 From: Villon CHEN Date: Thu, 20 Nov 2025 12:33:41 +0100 Subject: [PATCH 2/3] feat(datasets): support parent/children relationships in CWE comparison to reduce false positives --- codesectools/datasets/core/dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/codesectools/datasets/core/dataset.py b/codesectools/datasets/core/dataset.py index 9a63ddf..ffef70d 100644 --- a/codesectools/datasets/core/dataset.py +++ b/codesectools/datasets/core/dataset.py @@ -34,6 +34,8 @@ class Dataset(ABC): name (str): The name of the dataset. supported_languages (list[str]): A list of programming languages supported by the dataset. + license (str): The license under which the dataset is distributed. + license_url (str): A URL to the full text of the license. """ @@ -333,8 +335,7 @@ def validate(self, analysis_result: AnalysisResult) -> FileDatasetData: for (filepath, cwe), defect in unique_reported_defects.items(): has_vuln, expected_cwes = ground_truth.get(filepath, (False, set())) - - if has_vuln and cwe in expected_cwes: + if has_vuln and bool(cwe.extend() & expected_cwes): # Correctly identified a vulnerability tp_defects_map[(filepath, cwe)] = defect else: @@ -609,7 +610,7 @@ def validate(self, analysis_results: list[AnalysisResult]) -> GitRepoDatasetData for (filename, cwe), defect in unique_reported_defects.items(): # A reported defect is a TP if it's in a known vulnerable file # with a known CWE for that repo. - if filename in repo.files and cwe in repo.cwes: + if filename in repo.files and bool(cwe.extend() & set(repo.cwes)): tp_defects_map[(filename, cwe)] = defect else: fp_defects_map[(filename, cwe)] = defect From 7e7c3ec1e19a4a998978ab28b074cbe7da78fdfd Mon Sep 17 00:00:00 2001 From: Villon CHEN Date: Thu, 20 Nov 2025 14:13:03 +0100 Subject: [PATCH 3/3] chore(release): bump project version --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4d4ac2e..0852bd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "CodeSecTools" -version = "0.13.3" +version = "0.13.4" description = "A framework for code security that provides abstractions for static analysis tools and datasets to support their integration, testing, and evaluation." readme = "README.md" license = "AGPL-3.0-only" diff --git a/uv.lock b/uv.lock index 62510f1..a09eb17 100644 --- a/uv.lock +++ b/uv.lock @@ -221,7 +221,7 @@ wheels = [ [[package]] name = "codesectools" -version = "0.13.3" +version = "0.13.4" source = { editable = "." } dependencies = [ { name = "gitpython" },