Skip to content

Commit 80b43fe

Browse files
committed
Add ProjectKBv2 importer
Add a test for the ProjectKB importer and collect fix commits pipeline. Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent f3d45ca commit 80b43fe

File tree

12 files changed

+2807
-75
lines changed

12 files changed

+2807
-75
lines changed

vulnerabilities/importers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
from vulnerabilities.pipelines.v2_importers import nvd_importer as nvd_importer_v2
5757
from vulnerabilities.pipelines.v2_importers import oss_fuzz as oss_fuzz_v2
5858
from vulnerabilities.pipelines.v2_importers import postgresql_importer as postgresql_importer_v2
59+
from vulnerabilities.pipelines.v2_importers import project_kb_importer as project_kb_importer_v2
5960
from vulnerabilities.pipelines.v2_importers import pypa_importer as pypa_importer_v2
6061
from vulnerabilities.pipelines.v2_importers import pysec_importer as pysec_importer_v2
6162
from vulnerabilities.pipelines.v2_importers import redhat_importer as redhat_importer_v2
@@ -83,6 +84,7 @@
8384
github_osv_importer_v2.GithubOSVImporterPipeline,
8485
redhat_importer_v2.RedHatImporterPipeline,
8586
aosp_importer_v2.AospImporterPipeline,
87+
project_kb_importer_v2.ProjectKBPipeline,
8688
nvd_importer.NVDImporterPipeline,
8789
github_importer.GitHubAPIImporterPipeline,
8890
gitlab_importer.GitLabImporterPipeline,
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import json
11+
from pathlib import Path
12+
from typing import Iterable
13+
14+
import saneyaml
15+
from fetchcode.vcs import fetch_via_vcs
16+
from packageurl import PackageURL
17+
from univers.maven import VersionRange
18+
19+
from vulnerabilities.importer import AdvisoryData
20+
from vulnerabilities.importer import AffectedPackageV2
21+
from vulnerabilities.importer import ReferenceV2
22+
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
23+
from vulnerabilities.utils import get_advisory_url
24+
25+
26+
class ProjectKBPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    ProjectKB Importer Pipeline

    Collect advisories from the ProjectKB YAML statements:
    - https://github.com/SAP/project-kb/blob/vulnerability-data/statements/*/*.yaml
    """

    pipeline_id = "project-kb_v2"
    spdx_license_expression = "Apache-2.0"
    license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
    repo_url = "git+https://github.com/SAP/project-kb@vulnerability-data"

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps: clone, import, then clean up."""
        return (cls.clone_repo, cls.collect_and_store_advisories, cls.clean_downloads)

    def clone_repo(self):
        """Fetch the ``vulnerability-data`` branch and keep the VCS response on ``self``."""
        self.log("Processing ProjectKB advisory data...")
        self.vcs_response = fetch_via_vcs(self.repo_url)

    def advisories_count(self):
        """Return the number of ``statement.yaml`` files found under ``statements/``."""
        base_path = Path(self.vcs_response.dest_dir) / "statements"
        # Count only the files that collect_advisories() actually processes.
        count = sum(
            1 for yaml_file in base_path.rglob("*.yaml") if yaml_file.name == "statement.yaml"
        )
        self.log(f"Estimated advisories to process: {count}")
        return count

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """
        Yield one AdvisoryData for each ``statement.yaml`` under ``statements/``.

        Statements that are empty, are not a mapping, or carry no
        ``vulnerability_id`` are skipped. Artifacts whose ``id`` is not a valid
        versioned package URL are logged and skipped, so that a single bad
        entry does not abort the whole import.
        """
        base_path = Path(self.vcs_response.dest_dir) / "statements"

        for yaml_file in base_path.rglob("*.yaml"):
            if yaml_file.name != "statement.yaml":
                continue

            with open(yaml_file, encoding="utf-8") as f:
                yaml_data = saneyaml.load(f)

            # An empty file loads as None; anything but a mapping has no fields to read.
            if not isinstance(yaml_data, dict):
                continue

            vulnerability_id = yaml_data.get("vulnerability_id")
            if not vulnerability_id:
                continue

            # "or []" also covers keys that are present with an explicit null value.
            note_texts = [
                note_entry.get("text")
                for note_entry in yaml_data.get("notes") or []
                if note_entry.get("text")
            ]
            description = "\n".join(note_texts)

            references = []
            for fix in yaml_data.get("fixes") or []:
                for commit in fix.get("commits") or []:
                    commit_id = commit.get("id")
                    repo_url = commit.get("repository")
                    if not commit_id or not repo_url:
                        continue

                    # Strip only a trailing ".git": str.replace() would also mangle
                    # repository names that merely contain it, e.g. "user.github.io".
                    repo_url = repo_url.rstrip("/").removesuffix(".git")
                    references.append(ReferenceV2.from_url(f"{repo_url}/commit/{commit_id}"))

            affected_packages = []
            for artifact in yaml_data.get("artifacts") or []:
                if not artifact.get("affected"):
                    continue

                purl_str = artifact.get("id")
                try:
                    purl = PackageURL.from_string(purl_str)
                except ValueError as e:
                    self.log(f"Skipping invalid artifact id {purl_str!r} in {yaml_file}: {e}")
                    continue

                if not purl.version:
                    self.log(f"Skipping artifact without a version {purl_str!r} in {yaml_file}")
                    continue

                # NOTE(review): only artifacts flagged "affected" reach this point, yet
                # their version is stored as the *fixed* range. Confirm this is intended
                # and that the VersionRange in scope is the type AffectedPackageV2 expects.
                affected_packages.append(
                    AffectedPackageV2(
                        package=PackageURL(
                            type=purl.type, namespace=purl.namespace, name=purl.name
                        ),
                        fixed_version_range=VersionRange.from_version(purl.version),
                    )
                )

            advisory_url = get_advisory_url(
                file=yaml_file,
                base_path=base_path,
                url="https://github.com/SAP/project-kb/blob/vulnerability-data/statements/",
            )

            yield AdvisoryData(
                advisory_id=vulnerability_id,
                aliases=[],
                summary=description,
                affected_packages=affected_packages,
                references_v2=references,
                url=advisory_url,
                original_advisory_text=json.dumps(yaml_data, indent=2, ensure_ascii=False),
            )

    def clean_downloads(self):
        """Remove the cloned repository from disk, if a clone was made."""
        # on_failure() may run before clone_repo() has set the attribute.
        vcs_response = getattr(self, "vcs_response", None)
        if vcs_response:
            self.log("Removing cloned repository...")
            vcs_response.delete()

    def on_failure(self):
        """Ensure cleanup happens on pipeline failure."""
        self.clean_downloads()

vulnerabilities/pipelines/v2_improvers/collect_commits_project_kb.py

Lines changed: 45 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
9+
910
import csv
1011
from pathlib import Path
1112

12-
import saneyaml
1313
from fetchcode.vcs import fetch_via_vcs
1414

1515
from vulnerabilities.models import AdvisoryV2
@@ -21,100 +21,71 @@ class CollectFixCommitsProjectKBPipeline(VulnerableCodePipeline):
2121
"""
2222
Pipeline to collect fix commits from Project KB:
2323
https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv
24-
https://github.com/SAP/project-kb/blob/vulnerability-data/statements/*/*.yaml
2524
"""
2625

2726
pipeline_id = "kb_project_fix_commits"
2827
spdx_license_expression = "Apache-2.0"
2928
license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
30-
importer_name = "Project KB Importer"
3129
qualified_name = "kb_project_fix_commits"
32-
repo_url_vulnerability_data = "git+https://github.com/SAP/project-kb@vulnerability-data"
33-
repo_url_main = "git+https://github.com/SAP/project-kb"
30+
repo_url = "git+https://github.com/SAP/project-kb"
3431

3532
@classmethod
3633
def steps(cls):
37-
return (cls.collect_fix_commits,)
34+
return (
35+
cls.clone,
36+
cls.collect_fix_commits,
37+
)
38+
39+
def clone(self):
40+
self.log("Cloning repositories for ProjectKB fix commits from CSV...")
41+
self.vcs_response = fetch_via_vcs(self.repo_url)
3842

3943
def collect_fix_commits(self):
40-
self.vcs_response_main = fetch_via_vcs(self.repo_url_main)
41-
self.vcs_response_vuln_data = fetch_via_vcs(self.repo_url_vulnerability_data)
44+
self.log("Collecting fix commits from ProjectKB...")
4245

43-
self.log(f"Processing ProjectKBP fix commits.")
44-
csv_database_filepath = (
45-
Path(self.vcs_response_main.dest_dir) / "MSR2019/dataset/vulas_db_msr2019_release.csv"
46-
)
47-
try:
48-
with open(csv_database_filepath, mode="r", newline="", encoding="utf-8") as f:
49-
reader = csv.reader(f)
50-
next(reader, None) # Skip header row
51-
for row in reader:
52-
if len(row) != 4:
53-
continue
54-
vulnerability_id, repo_url, commit_hash, label = row
55-
56-
if not vulnerability_id:
57-
continue
58-
59-
try:
60-
advisory = AdvisoryV2.objects.get(advisory_id=vulnerability_id)
61-
except AdvisoryV2.DoesNotExist:
62-
self.log(f"Can't find vulnerability_id: {vulnerability_id}")
63-
continue
64-
65-
self.create_codefix_entries(advisory, repo_url, commit_hash, vulnerability_id)
66-
except FileNotFoundError:
67-
self.log(f"CSV file not found: {csv_database_filepath}")
68-
69-
base_path = Path(self.vcs_response_vuln_data.dest_dir) / "statements"
70-
for file_path in base_path.rglob("*.yaml"):
71-
if file_path.name != "statement.yaml":
72-
continue
46+
csv_path = Path(self.vcs_response.dest_dir) / "MSR2019/dataset/vulas_db_msr2019_release.csv"
7347

74-
with open(file_path) as f:
75-
vulnerability_fixes_data = saneyaml.load(f)
48+
with open(csv_path, newline="", encoding="utf-8") as f:
49+
reader = csv.reader(f)
50+
next(reader, None) # skip header
51+
rows = [r for r in reader if len(r) == 4 and r[0]]
7652

77-
vulnerability_id = vulnerability_fixes_data.get("vulnerability_id")
78-
if not vulnerability_id:
79-
continue
53+
vuln_ids = {r[0] for r in rows}
54+
advisories = AdvisoryV2.objects.filter(advisory_id__in=vuln_ids).prefetch_related(
55+
"impacted_packages__affecting_packages"
56+
)
57+
advisory_map = {a.advisory_id: a for a in advisories}
8058

81-
try:
82-
advisory = AdvisoryV2.objects.get(advisory_id=vulnerability_id)
83-
except AdvisoryV2.DoesNotExist:
84-
self.log(f"Can't find vulnerability_id: {vulnerability_id}")
59+
codefixes = []
60+
for vuln_id, repo_url, commit, _ in rows:
61+
advisory = advisory_map.get(vuln_id)
62+
if not advisory:
8563
continue
8664

87-
for commit_data in vulnerability_fixes_data.get("fixes", []):
88-
for commit in commit_data.get("commits", []):
89-
commit_id = commit.get("id")
90-
repo_url = commit.get("repository")
91-
92-
if not commit_id or not repo_url:
93-
continue
94-
95-
self.create_codefix_entries(advisory, repo_url, commit_id, vulnerability_id)
96-
97-
def create_codefix_entries(self, advisory, repo_url, commit_id, vulnerability_id):
98-
repo_url = repo_url.rstrip("/").removesuffix(".git")
99-
vcs_url = f"{repo_url}/commit/{commit_id}"
100-
101-
for impact in advisory.impacted_packages.all():
102-
for package in impact.affecting_packages.all():
103-
code_fix, created = CodeFixV2.objects.get_or_create(
104-
commits=[vcs_url],
105-
advisory=advisory,
106-
affected_package=package,
107-
)
108-
if created:
109-
self.log(
110-
f"Created CodeFix entry for vulnerability_id: {vulnerability_id} with VCS URL {vcs_url}"
65+
repo_url = repo_url.rstrip("/").removesuffix(".git")
66+
vcs_url = f"{repo_url}/commit/{commit}"
67+
68+
for impact in advisory.impacted_packages.all():
69+
for pkg in impact.affecting_packages.all():
70+
codefixes.append(
71+
CodeFixV2(
72+
commits=[vcs_url],
73+
advisory=advisory,
74+
affected_package=pkg,
75+
)
11176
)
11277

78+
if codefixes:
79+
CodeFixV2.objects.bulk_create(codefixes, ignore_conflicts=True)
80+
self.log(f"Created {len(codefixes)} CodeFix entries.")
81+
else:
82+
self.log("No CodeFix entries created.")
83+
11384
def clean_downloads(self):
114-
if self.vcs_response_main or self.vcs_response_vuln_data:
85+
"""Remove the cloned repository from disk."""
86+
if self.vcs_response:
11587
self.log(f"Removing cloned repository")
116-
self.vcs_response_main.delete()
117-
self.vcs_response_vuln_data.delete()
88+
self.vcs_response.delete()
11889

11990
def on_failure(self):
12091
self.clean_downloads()
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
from datetime import datetime
10+
from datetime import timezone
11+
from pathlib import Path
12+
from types import SimpleNamespace
13+
from unittest import TestCase
14+
from unittest.mock import patch
15+
16+
import pytest
17+
18+
from vulnerabilities.models import AdvisoryV2
19+
from vulnerabilities.models import CodeFixV2
20+
from vulnerabilities.models import ImpactedPackage
21+
from vulnerabilities.models import PackageV2
22+
from vulnerabilities.pipelines.v2_importers.project_kb_importer import ProjectKBPipeline
23+
from vulnerabilities.pipelines.v2_improvers.collect_commits_project_kb import (
24+
CollectFixCommitsProjectKBPipeline,
25+
)
26+
from vulnerabilities.tests import util_tests
27+
28+
TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "kbmsr2019"
29+
30+
31+
class TestProjectKbImporterPipeline(TestCase):
    """
    Integration-style tests for the ProjectKB importer and the fix-commit
    improver. They run against the fixture files under TEST_DATA and stand in
    for the cloned repository with a SimpleNamespace, so no network access or
    git checkout is needed.
    """

    @patch(
        "vulnerabilities.pipelines.v2_importers.project_kb_importer.get_advisory_url",
        return_value="https://mocked.url/advisory",
    )
    def test_project_kb_collect_advisories_v2(self, mock_get_advisory_url):
        """Each fixture statement must convert to its expected advisory JSON."""
        pipeline = ProjectKBPipeline()
        pipeline.vcs_response = SimpleNamespace(dest_dir=TEST_DATA)

        for idx in range(1, 4):
            yaml_file = TEST_DATA / str(idx) / "statement.yaml"
            expected_file = TEST_DATA / f"statement-{idx}-expected.json"

            # subTest names the failing fixture where the runner supports it.
            with self.subTest(statement=idx):
                # Restrict discovery to a single statement so that each expected
                # file is compared against exactly one advisory.
                with patch(
                    "vulnerabilities.pipelines.v2_importers.project_kb_importer.Path.rglob",
                    return_value=[yaml_file],
                ):
                    result = [adv.to_dict() for adv in pipeline.collect_advisories()]

                util_tests.check_results_against_json(result, expected_file)

    @pytest.mark.django_db
    def test_collect_fix_commits_uses_existing_csv(self):
        """
        Test that CollectFixCommitsProjectKBPipeline.collect_fix_commits()
        reads an existing ProjectKB CSV file and creates CodeFixV2 entries.
        """

        advisory = AdvisoryV2.objects.create(
            advisory_id="CVE-2018-8034",
            datasource_id="test-datasource",
            avid="TEST-1234",
            unique_content_id="unique-test-id",
            url="https://example.com/advisory/CVE-2018-8034",
            date_collected=datetime.now(timezone.utc),
        )

        pkg1 = PackageV2.objects.create(name="test_name1", type="test")
        pkg2 = PackageV2.objects.create(name="test_name2", type="test")

        impacted = ImpactedPackage.objects.create(advisory=advisory)
        impacted.affecting_packages.set([pkg1, pkg2])

        pipeline = CollectFixCommitsProjectKBPipeline()
        pipeline.vcs_response = SimpleNamespace(dest_dir=TEST_DATA)

        pipeline.collect_fix_commits()

        # Materialise the queryset once; one CodeFixV2 is expected per affecting package.
        fixes = list(CodeFixV2.objects.all())
        assert len(fixes) == 2
        assert [fix.commits for fix in fixes] == [
            ["https://github.com/apache/tomcat/commit/2835bb4e030c1c741ed0847bb3b9c3822e4fbc8a"],
            ["https://github.com/apache/tomcat/commit/2835bb4e030c1c741ed0847bb3b9c3822e4fbc8a"],
        ]

0 commit comments

Comments
 (0)