Skip to content

Commit dab40d5

Browse files
committed
Keep the oldest advisory while deduping
Signed-off-by: Keshav Priyadarshi <git@keshav.space>
1 parent b3b43ab commit dab40d5

File tree

1 file changed

+31
-27
lines changed

1 file changed

+31
-27
lines changed

vulnerabilities/pipelines/remove_duplicate_advisories.py

Lines changed: 31 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -32,15 +32,16 @@ def remove_duplicates(self):
3232
self.log(f"Computing new content id for {advisories_count} and removing duplicates.")
3333

3434
update_batch_size = 500
35-
delete_batch_size = 1000
35+
delete_batch_size = 5000
3636
chunk_size = 50000
37-
deleted_advisory_count = 0
38-
updated_advisory_count = 0
39-
duplicate_advisory_id = []
40-
updated_advisory = []
37+
deleted_advisories_count = 0
38+
updated_advisories_count = 0
39+
duplicate_advisory_ids = []
40+
advisories_to_update = []
4141
content_ids = set()
4242

43-
advisories = Advisory.objects.all().order_by("-id").paginated(per_page=chunk_size)
43+
advisories = Advisory.objects.all().order_by("id").paginated(per_page=chunk_size)
44+
4445
progress = LoopProgress(
4546
total_iterations=advisories_count,
4647
logger=self.log,
@@ -49,48 +50,52 @@ def remove_duplicates(self):
4950

5051
for advisory in progress.iter(advisories):
5152
content_id = compute_content_id(advisory.to_advisory_data())
53+
5254
if content_id in content_ids:
53-
duplicate_advisory_id.append(advisory.id)
55+
duplicate_advisory_ids.append(advisory.id)
5456
else:
57+
content_ids.add(content_id)
5558
if advisory.unique_content_id != content_id:
5659
advisory.unique_content_id = content_id
57-
updated_advisory.append(advisory)
58-
content_ids.add(content_id)
59-
if len(duplicate_advisory_id) > delete_batch_size:
60-
deleted_advisory_count += delete_advisories(
61-
advisory_ids=duplicate_advisory_id,
60+
advisories_to_update.append(advisory)
61+
62+
if len(duplicate_advisory_ids) > delete_batch_size:
63+
deleted_advisories_count += delete_advisories(
64+
advisory_ids=duplicate_advisory_ids,
6265
logger=self.log,
6366
)
64-
if len(updated_advisory) > update_batch_size:
65-
updated_advisory_count += bulk_update_advisory(
66-
items=updated_advisory,
67+
duplicate_advisory_ids.clear()
68+
69+
if len(advisories_to_update) > update_batch_size:
70+
updated_advisories_count += bulk_update_advisories(
71+
advisories=advisories_to_update,
6772
fields=["unique_content_id"],
6873
logger=self.log,
6974
)
75+
advisories_to_update.clear()
7076

71-
deleted_advisory_count += delete_advisories(
72-
advisory_ids=duplicate_advisory_id,
77+
deleted_advisories_count += delete_advisories(
78+
advisory_ids=duplicate_advisory_ids,
7379
logger=self.log,
7480
)
75-
updated_advisory_count += bulk_update_advisory(
76-
items=updated_advisory,
81+
updated_advisories_count += bulk_update_advisories(
82+
advisories=advisories_to_update,
7783
fields=["unique_content_id"],
7884
logger=self.log,
7985
)
8086

81-
self.log(f"Removed {deleted_advisory_count} duplicates advisories.")
82-
self.log(f"Updated content id for {deleted_advisory_count} advisories.")
87+
self.log(f"Removed {deleted_advisories_count} duplicates advisories.")
88+
self.log(f"Updated content id for {deleted_advisories_count} advisories.")
8389

8490

85-
def bulk_update_advisory(items, fields, logger):
91+
def bulk_update_advisories(advisories, fields, logger):
8692
item_count = 0
87-
if items:
93+
if advisories:
8894
try:
89-
Advisory.objects.bulk_update(objs=items, fields=fields)
90-
item_count += len(items)
95+
Advisory.objects.bulk_update(objs=advisories, fields=fields)
96+
item_count += len(advisories)
9197
except Exception as e:
9298
logger(f"Error updating Advisory: {e}")
93-
items.clear()
9499
return item_count
95100

96101

@@ -102,5 +107,4 @@ def delete_advisories(advisory_ids, logger):
102107
item_count += len(advisory_ids)
103108
except Exception as e:
104109
logger(f"Error deleting Advisory: {e}")
105-
advisory_ids.clear()
106110
return item_count

0 commit comments

Comments
 (0)