66# See https://github.com/aboutcode-org/vulnerablecode for support or download.
77# See https://aboutcode.org for more information about nexB OSS projects.
88#
9+
910import csv
1011from pathlib import Path
1112
12- import saneyaml
1313from fetchcode .vcs import fetch_via_vcs
1414
1515from vulnerabilities .models import AdvisoryV2
@@ -21,100 +21,71 @@ class CollectFixCommitsProjectKBPipeline(VulnerableCodePipeline):
2121 """
2222 Pipeline to collect fix commits from Project KB:
2323 https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv
24- https://github.com/SAP/project-kb/blob/vulnerability-data/statements/*/*.yaml
2524 """
2625
2726 pipeline_id = "kb_project_fix_commits"
2827 spdx_license_expression = "Apache-2.0"
2928 license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
30- importer_name = "Project KB Importer"
3129 qualified_name = "kb_project_fix_commits"
32- repo_url_vulnerability_data = "git+https://github.com/SAP/project-kb@vulnerability-data"
33- repo_url_main = "git+https://github.com/SAP/project-kb"
30+ repo_url = "git+https://github.com/SAP/project-kb"
3431
@classmethod
def steps(cls):
    """
    Return the ordered pipeline steps.

    The repository is cloned, the fix commits are collected, and the
    checkout is then removed.
    """
    return (
        cls.clone,
        cls.collect_fix_commits,
        # Otherwise the checkout is only removed by on_failure, so a
        # successful run would leave the cloned repository on disk.
        cls.clean_downloads,
    )
38+
def clone(self):
    """
    Fetch the repository at ``self.repo_url`` with ``fetch_via_vcs`` and
    keep the response on ``self.vcs_response`` for the following steps.
    """
    self.log("Cloning repositories for ProjectKB fix commits from CSV...")
    response = fetch_via_vcs(self.repo_url)
    self.vcs_response = response
3842
def collect_fix_commits(self):
    """
    Create CodeFixV2 entries from the Project KB MSR2019 CSV dataset.

    Read ``MSR2019/dataset/vulas_db_msr2019_release.csv`` from the checkout
    referenced by ``self.vcs_response``. A usable row has four columns:
    vulnerability id, repository URL, commit hash, and a fourth column that
    is ignored here. For every row whose vulnerability id matches the
    ``advisory_id`` of an existing AdvisoryV2, one CodeFixV2 is queued per
    package reachable through ``impacted_packages`` and then
    ``affecting_packages``. All queued entries are inserted with a single
    ``bulk_create``.

    Any error raised by ``open`` (for example FileNotFoundError when the CSV
    is absent from the checkout) is not caught here.
    """
    self.log("Collecting fix commits from ProjectKB...")

    csv_path = Path(self.vcs_response.dest_dir) / "MSR2019/dataset/vulas_db_msr2019_release.csv"

    with open(csv_path, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header
        # Keep only well-formed rows: an empty repository URL or commit
        # hash would otherwise yield a meaningless ".../commit/" URL.
        rows = [r for r in reader if len(r) == 4 and all(r[:3])]

    # Resolve every advisory in one query instead of one lookup per row.
    vuln_ids = {r[0] for r in rows}
    advisories = AdvisoryV2.objects.filter(advisory_id__in=vuln_ids).prefetch_related(
        "impacted_packages__affecting_packages"
    )
    advisory_map = {a.advisory_id: a for a in advisories}

    # A package can be reached through several impacts of one advisory and
    # a CSV row can be repeated; queue each combination only once.
    seen = set()
    codefixes = []
    for vuln_id, repo_url, commit, _ in rows:
        advisory = advisory_map.get(vuln_id)
        if not advisory:
            continue

        repo_url = repo_url.rstrip("/").removesuffix(".git")
        vcs_url = f"{repo_url}/commit/{commit}"

        for impact in advisory.impacted_packages.all():
            for pkg in impact.affecting_packages.all():
                key = (vcs_url, advisory.pk, pkg.pk)
                if key in seen:
                    continue
                seen.add(key)
                codefixes.append(
                    CodeFixV2(
                        commits=[vcs_url],
                        advisory=advisory,
                        affected_package=pkg,
                    )
                )

    if codefixes:
        # NOTE(review): ignore_conflicts only skips rows that violate a
        # database constraint; confirm CodeFixV2 declares one, otherwise
        # re-running this pipeline inserts the same entries again.
        CodeFixV2.objects.bulk_create(codefixes, ignore_conflicts=True)
        # With ignore_conflicts the number of rows actually inserted is
        # unknown, so report how many were submitted rather than "created".
        self.log(f"Submitted {len(codefixes)} CodeFix entries for creation.")
    else:
        self.log("No CodeFix entries created.")
83+
def clean_downloads(self):
    """
    Remove the cloned repository from disk.

    Does nothing when no clone is available. ``on_failure`` calls this
    method, and ``vcs_response`` may never have been assigned if the
    failure happened while cloning.
    """
    vcs_response = getattr(self, "vcs_response", None)
    if not vcs_response:
        return
    self.log("Removing cloned repository")
    vcs_response.delete()
11889
def on_failure(self):
    """
    Failure hook: delegate to ``clean_downloads`` so the cloned repository
    is removed (presumably invoked by the pipeline framework when a step
    fails; confirm against the base class).
    """
    self.clean_downloads()
0 commit comments