@@ -32,15 +32,16 @@ def remove_duplicates(self):
3232 self .log (f"Computing new content id for { advisories_count } and removing duplicates." )
3333
3434 update_batch_size = 500
35- delete_batch_size = 1000
35+ delete_batch_size = 5000
3636 chunk_size = 50000
37- deleted_advisory_count = 0
38- updated_advisory_count = 0
39- duplicate_advisory_id = []
40- updated_advisory = []
37+ deleted_advisories_count = 0
38+ updated_advisories_count = 0
39+ duplicate_advisory_ids = []
40+ advisories_to_update = []
4141 content_ids = set ()
4242
43- advisories = Advisory .objects .all ().order_by ("-id" ).paginated (per_page = chunk_size )
43+ advisories = Advisory .objects .all ().order_by ("id" ).paginated (per_page = chunk_size )
44+
4445 progress = LoopProgress (
4546 total_iterations = advisories_count ,
4647 logger = self .log ,
@@ -49,48 +50,52 @@ def remove_duplicates(self):
4950
5051 for advisory in progress .iter (advisories ):
5152 content_id = compute_content_id (advisory .to_advisory_data ())
53+
5254 if content_id in content_ids :
53- duplicate_advisory_id .append (advisory .id )
55+ duplicate_advisory_ids .append (advisory .id )
5456 else :
57+ content_ids .add (content_id )
5558 if advisory .unique_content_id != content_id :
5659 advisory .unique_content_id = content_id
57- updated_advisory .append (advisory )
58- content_ids . add ( content_id )
59- if len (duplicate_advisory_id ) > delete_batch_size :
60- deleted_advisory_count += delete_advisories (
61- advisory_ids = duplicate_advisory_id ,
60+ advisories_to_update .append (advisory )
61+
62+ if len (duplicate_advisory_ids ) > delete_batch_size :
63+ deleted_advisories_count += delete_advisories (
64+ advisory_ids = duplicate_advisory_ids ,
6265 logger = self .log ,
6366 )
64- if len (updated_advisory ) > update_batch_size :
65- updated_advisory_count += bulk_update_advisory (
66- items = updated_advisory ,
67+ duplicate_advisory_ids .clear ()
68+
69+ if len (advisories_to_update ) > update_batch_size :
70+ updated_advisories_count += bulk_update_advisories (
71+ advisories = advisories_to_update ,
6772 fields = ["unique_content_id" ],
6873 logger = self .log ,
6974 )
75+ advisories_to_update .clear ()
7076
71- deleted_advisory_count += delete_advisories (
72- advisory_ids = duplicate_advisory_id ,
77+ deleted_advisories_count += delete_advisories (
78+ advisory_ids = duplicate_advisory_ids ,
7379 logger = self .log ,
7480 )
75- updated_advisory_count += bulk_update_advisory (
76- items = updated_advisory ,
81+ updated_advisories_count += bulk_update_advisories (
82+ advisories = advisories_to_update ,
7783 fields = ["unique_content_id" ],
7884 logger = self .log ,
7985 )
8086
81- self .log (f"Removed { deleted_advisory_count } duplicates advisories." )
82- self .log (f"Updated content id for { deleted_advisory_count } advisories." )
87+ self .log (f"Removed { deleted_advisories_count } duplicates advisories." )
88+ self .log (f"Updated content id for { deleted_advisories_count } advisories." )
8389
8490
85- def bulk_update_advisory ( items , fields , logger ):
91+ def bulk_update_advisories ( advisories , fields , logger ):
8692 item_count = 0
87- if items :
93+ if advisories :
8894 try :
89- Advisory .objects .bulk_update (objs = items , fields = fields )
90- item_count += len (items )
95+ Advisory .objects .bulk_update (objs = advisories , fields = fields )
96+ item_count += len (advisories )
9197 except Exception as e :
9298 logger (f"Error updating Advisory: { e } " )
93- items .clear ()
9499 return item_count
95100
96101
@@ -102,5 +107,4 @@ def delete_advisories(advisory_ids, logger):
102107 item_count += len (advisory_ids )
103108 except Exception as e :
104109 logger (f"Error deleting Advisory: { e } " )
105- advisory_ids .clear ()
106110 return item_count
0 commit comments