Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions codesectools/datasets/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class Dataset(ABC):
name (str): The name of the dataset.
supported_languages (list[str]): A list of programming languages supported
by the dataset.
license (str): The license under which the dataset is distributed.
license_url (str): A URL to the full text of the license.

"""

Expand Down Expand Up @@ -333,8 +335,7 @@ def validate(self, analysis_result: AnalysisResult) -> FileDatasetData:

for (filepath, cwe), defect in unique_reported_defects.items():
has_vuln, expected_cwes = ground_truth.get(filepath, (False, set()))

if has_vuln and cwe in expected_cwes:
if has_vuln and bool(cwe.extend() & expected_cwes):
# Correctly identified a vulnerability
tp_defects_map[(filepath, cwe)] = defect
else:
Expand Down Expand Up @@ -609,7 +610,7 @@ def validate(self, analysis_results: list[AnalysisResult]) -> GitRepoDatasetData
for (filename, cwe), defect in unique_reported_defects.items():
# A reported defect is a TP if it's in a known vulnerable file
# with a known CWE for that repo.
if filename in repo.files and cwe in repo.cwes:
if filename in repo.files and bool(cwe.extend() & set(repo.cwes)):
tp_defects_map[(filename, cwe)] = defect
else:
fp_defects_map[(filename, cwe)] = defect
Expand Down
93 changes: 78 additions & 15 deletions codesectools/shared/cwe.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,31 @@ class CWE:
id (int): The CWE identifier.
name (str): The name of the weakness.
description (str): A description of the weakness.
parent (CWE | None): The parent CWE weakness, if any.
children (set[CWE]): A set of child CWE weaknesses.

"""

def __init__(self, id: int, name: str, description: str) -> None:
def __init__(
self,
id: int,
name: str,
description: str,
parent: Self | None = None,
children: set[Self] | None = None,
) -> None:
"""Initialize a CWE instance.

Args:
id: The CWE identifier.
name: The name of the weakness.
description: A description of the weakness.
parent: The parent CWE weakness, if any.
children: A set of child CWE weaknesses, if any.

"""
if children is None:
children = set()
self.id = id
if r := re.search(r"\('(.*)'\)", name):
self.name = r.group(1)
Expand All @@ -40,6 +53,8 @@ def __init__(self, id: int, name: str, description: str) -> None:
self.name = self.full_name = name

self.description = description
self.parent = parent
self.children = children or set()

def __eq__(self, other: Self | int) -> bool:
"""Compare this CWE with another object for equality.
Expand Down Expand Up @@ -72,6 +87,31 @@ def __repr__(self) -> str:
"""
return f"{self.__class__.__name__}(id={self.id})"

def extend(self, distance: int = 1) -> set[Self]:
"""Retrieve the set of related CWEs within a specified distance in the hierarchy.

Recursively finds parent and child CWEs up to the given distance level.
Includes the current CWE in the returned set.

Args:
distance: The number of levels to traverse up (parents) and down (children).
Defaults to 1.

Returns:
A set of CWE objects including the self and related weaknesses.

"""
cwes: set[Self] = set([self])
for _ in range(distance):
new_cwes = cwes.copy()
for cwe in cwes:
if cwe.parent:
new_cwes.add(cwe.parent)
for child in cwe.children:
new_cwes.add(child)
cwes = new_cwes.copy()
return cwes


class CWEsCollection:
"""Manage the collection of all CWEs.
Expand Down Expand Up @@ -136,24 +176,50 @@ def download(self) -> None:
)
progress.update(task, advance=25)

def load(self) -> dict[int, CWE]:
    """Load and parse CWE data from cached CSV files.

    Reads the CSV files defined in `cwes_data`, instantiates `CWE` objects,
    and establishes parent-child relationships based on the
    "Related Weaknesses" field. Only the first "ChildOf" relation of each
    row is used as that CWE's parent.

    Returns:
        A dictionary mapping CWE IDs (int) to `CWE` objects.

    """
    child_of = re.compile(r"NATURE:ChildOf:CWE ID:(\d+):")

    cwes: dict[int, CWE] = {}
    cwes_parent: dict[int, int] = {}
    cwes_children: dict[int, set[int]] = {}
    for filename in self.cwes_data.values():
        # Use a context manager so the CSV file handle is closed as soon as
        # the file has been consumed.
        with (self.directory / filename).open(encoding="utf-8") as csv_file:
            for cwe_dict in csv.DictReader(csv_file):
                cwe_id = int(cwe_dict["CWE-ID"])

                cwes[cwe_id] = CWE(
                    id=cwe_id,
                    name=cwe_dict["Name"],
                    description=cwe_dict["Description"],
                )

                for related in cwe_dict["Related Weaknesses"].split("::"):
                    if m := child_of.search(related):
                        parent_id = int(m.group(1))

                        cwes_parent[cwe_id] = parent_id
                        cwes_children.setdefault(parent_id, set()).add(cwe_id)

                        # Keep only the first ChildOf relation.
                        break

    # Resolve IDs to objects in a second pass: a parent or child may be
    # defined later in the same file or in another file.
    for cwe_id, cwe in cwes.items():
        if p_id := cwes_parent.get(cwe_id):
            cwe.parent = cwes.get(p_id, None)
        for c_id in cwes_children.get(cwe_id, []):
            if child_cwe := cwes.get(c_id):
                cwe.children.add(child_cwe)

    return cwes

def from_string(self, cwe_string: str) -> CWE:
Expand Down Expand Up @@ -181,10 +247,7 @@ def from_id(self, cwe_id: int) -> CWE:
The CWE object if found, otherwise a default CWE object with ID -1.

"""
try:
return self.cwes[self.cwes.index(cwe_id)]
except ValueError:
return self.NOCWE
return self.cwes.get(cwe_id, self.NOCWE)


# CWEsCollection instance bound to the name `CWEs`; presumably the shared
# entry point for CWE lookups — confirm against callers.
CWEs = CWEsCollection()
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "CodeSecTools"
version = "0.13.3"
version = "0.13.4"
description = "A framework for code security that provides abstractions for static analysis tools and datasets to support their integration, testing, and evaluation."
readme = "README.md"
license = "AGPL-3.0-only"
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.