From ea5c9de20422ecf67b85f1d2b0964d51c5e60c3c Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Mon, 2 Dec 2024 15:49:42 -0600 Subject: [PATCH 1/3] Deduplicate SPDX IDs with hash suffixes --- sbom.py | 16 +++++++++++++--- tests/sbom/sbom-with-pip-removed.json | 4 ++-- tests/sbom/sbom-with-pip.json | 14 +++++++------- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/sbom.py b/sbom.py index c7122139..5b3a2fd7 100644 --- a/sbom.py +++ b/sbom.py @@ -25,7 +25,7 @@ import typing import zipfile from pathlib import Path -from typing import Any, NotRequired, TypedDict, cast +from typing import Any, LiteralString, NotRequired, TypedDict, cast from urllib.request import urlopen @@ -90,9 +90,19 @@ class CreationInfo(TypedDict): licenseListVersion: str -def spdx_id(value: str) -> str: +# Cache of values that we've seen already. We use this +# to de-duplicate values and their corresponding SPDX ID. +_SPDX_IDS_TO_VALUES = {} + + +def spdx_id(value: LiteralString) -> str: """Encode a value into characters that are valid in an SPDX ID""" - return re.sub(r"[^a-zA-Z0-9.\-]+", "-", value) + spdx_id = re.sub(r"[^a-zA-Z0-9.\-]+", "-", value) + # To avoid collisions we append a hash suffix. + suffix = hashlib.sha256(value.encode()).hexdigest()[:8] + spdx_id = f"{spdx_id}-{suffix}" + assert _SPDX_IDS_TO_VALUES.setdefault(spdx_id, value) == value + return spdx_id def calculate_package_verification_codes(sbom: SBOM) -> None: diff --git a/tests/sbom/sbom-with-pip-removed.json b/tests/sbom/sbom-with-pip-removed.json index 0ba2d4c0..b2646f23 100644 --- a/tests/sbom/sbom-with-pip-removed.json +++ b/tests/sbom/sbom-with-pip-removed.json @@ -13,9 +13,9 @@ "packages": [], "relationships": [ { - "relatedSpdxElement": "SPDXRef-FILE-Modules-expat-COPYING", + "relatedSpdxElement": "SPDXRef-FILE-Modules-expat-COPYING-497fb0c3", "relationshipType": "CONTAINS", - "spdxElementId": "SPDXRef-PACKAGE-expat" + "spdxElementId": "SPDXRef-PACKAGE-expat-83b93528" } ] } diff --git a/tests/sbom/sbom-with-pip.json b/tests/sbom/sbom-with-pip.json index e758c182..d2ee1c6f 100644 --- a/tests/sbom/sbom-with-pip.json +++ b/tests/sbom/sbom-with-pip.json @@ -12,7 +12,7 @@ "files": [], "packages": [ { - "SPDXID": "SPDXRef-PACKAGE-pip", + "SPDXID": "SPDXRef-PACKAGE-pip-ced959c1", "name": "pip", "versionInfo": "24.0", "licenseConcluded": "MIT", @@ -38,19 +38,19 @@ ], "relationships": [ { - "relatedSpdxElement": "SPDXRef-FILE-Modules-expat-COPYING", + "relatedSpdxElement": "SPDXRef-FILE-Modules-expat-COPYING-497fb0c3", "relationshipType": "CONTAINS", - "spdxElementId": "SPDXRef-PACKAGE-expat" + "spdxElementId": "SPDXRef-PACKAGE-expat-83b93528" }, { - "relatedSpdxElement": "SPDXRef-PACKAGE-urllib3", + "relatedSpdxElement": "SPDXRef-PACKAGE-urllib3-b7a198af", "relationshipType": "DEPENDS_ON", - "spdxElementId": "SPDXRef-PACKAGE-pip" + "spdxElementId": "SPDXRef-PACKAGE-pip-ced959c1" }, { - "relatedSpdxElement": "SPDXRef-PACKAGE-pip", + "relatedSpdxElement": "SPDXRef-PACKAGE-pip-ced959c1", "relationshipType": "DEPENDS_ON", - "spdxElementId": "SPDXRef-PACKAGE-cpython" + "spdxElementId": "SPDXRef-PACKAGE-cpython-608f998c" } ] } From c6763d62357e173c475a628bb1fe58edbbfcf630 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Tue, 3 Dec 2024 10:27:24 -0600 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Hugo van Kemenade <1324225+hugovk@users.noreply.github.com> --- sbom.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sbom.py b/sbom.py index 5b3a2fd7..25b8def2 100644 --- a/sbom.py +++ b/sbom.py @@ -92,9 +92,10 @@ class CreationInfo(TypedDict): # Cache of values that we've seen already. We use this # to de-duplicate values and their corresponding SPDX ID. -_SPDX_IDS_TO_VALUES = {} +_SPDX_IDS_TO_VALUES: dict[str, Any] = {} +@cache def spdx_id(value: LiteralString) -> str: """Encode a value into characters that are valid in an SPDX ID""" spdx_id = re.sub(r"[^a-zA-Z0-9.\-]+", "-", value) From 76278e9475bf9912d26408dcabfc75d517b60ab5 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Tue, 3 Dec 2024 10:30:04 -0600 Subject: [PATCH 3/3] Add tests, don't shadow name --- sbom.py | 9 +++++---- tests/test_sbom.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/sbom.py b/sbom.py index 25b8def2..9a3cfa00 100644 --- a/sbom.py +++ b/sbom.py @@ -24,6 +24,7 @@ import tarfile import typing import zipfile +from functools import cache from pathlib import Path from typing import Any, LiteralString, NotRequired, TypedDict, cast from urllib.request import urlopen @@ -98,12 +99,12 @@ class CreationInfo(TypedDict): @cache def spdx_id(value: LiteralString) -> str: """Encode a value into characters that are valid in an SPDX ID""" - spdx_id = re.sub(r"[^a-zA-Z0-9.\-]+", "-", value) + value_as_spdx_id = re.sub(r"[^a-zA-Z0-9.\-]+", "-", value) # To avoid collisions we append a hash suffix. suffix = hashlib.sha256(value.encode()).hexdigest()[:8] - spdx_id = f"{spdx_id}-{suffix}" - assert _SPDX_IDS_TO_VALUES.setdefault(spdx_id, value) == value - return spdx_id + value_as_spdx_id = f"{value_as_spdx_id}-{suffix}" + assert _SPDX_IDS_TO_VALUES.setdefault(value_as_spdx_id, value) == value + return value_as_spdx_id def calculate_package_verification_codes(sbom: SBOM) -> None: diff --git a/tests/test_sbom.py b/tests/test_sbom.py index 45e66041..8e7713dc 100644 --- a/tests/test_sbom.py +++ b/tests/test_sbom.py @@ -11,6 +11,22 @@ import sbom +@pytest.mark.parametrize( + ["value", "expected"], + [ + ("abc", "abc-ba7816bf"), + ("def", "def-cb8379ac"), + ("SPDXRef-PACKAGE-pip", "SPDXRef-PACKAGE-pip-ced959c1"), + ("SPDXRef-PACKAGE-cpython", "SPDXRef-PACKAGE-cpython-79ab18d2"), + ("SPDXRef-PACKAGE-urllib3", "SPDXRef-PACKAGE-urllib3-b8ab4751"), + ], +) +def test_spdx_id(value: str, expected: str) -> None: + assert sbom.spdx_id(value) == expected + # Check we get the same value next time + assert sbom.spdx_id(value) == expected + + @pytest.mark.parametrize( ["package_sha1s", "package_verification_code"], [