@@ -485,9 +485,10 @@ gc_maybe_untrack(PyObject *op)
 // enough time between the enqueue and dequeue so that the needed memory
 // for the object, most importantly ob_gc_bits and ob_type words, will
 // already be in the CPU cache.
-#define BUFFER_SIZE 256
+#define BUFFER_SIZE 256  // this must be a power of 2
 #define BUFFER_HI 16
 #define BUFFER_LO 8
+#define BUFFER_MASK (BUFFER_SIZE - 1)
 
 // Prefetch instructions will fetch the line of data from memory that
 // contains the byte specified with the source operand to a location in
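
The power-of-2 requirement exists because the buffer is indexed by masking
with BUFFER_MASK rather than with a modulo: x & (N - 1) equals x % N only
when N is a power of 2, and the AND is cheaper than a division. A standalone
sketch (not part of this patch) illustrating the identity:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        // For a power-of-2 N, N - 1 is a mask of all the low bits, so
        // x & (N - 1) extracts exactly the remainder of x / N.
        unsigned int n = 256;
        for (unsigned int x = 0; x < 4096; x++) {
            assert((x & (n - 1)) == (x % n));
        }
        // For a non-power-of-2 size the identity does not hold.
        unsigned int m = 100;
        printf("300 %% 100 = %u but 300 & 99 = %u\n", 300u % m, 300u & (m - 1));
        return 0;
    }
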
@@ -554,15 +555,63 @@ typedef struct {
 } gc_span_stack_t;
 
 typedef struct {
-    Py_ssize_t enqueued;
-    Py_ssize_t dequeued;
+    unsigned int in;
+    unsigned int out;
     _PyObjectStack stack;
     gc_span_stack_t spans;
     PyObject *buffer[BUFFER_SIZE];
+    bool use_prefetch;
 } gc_mark_args_t;
 
-// Called when we run out of space in the buffer. The object will be added
-// to gc_mark_args.stack instead.
+
+// Returns the number of entries in the buffer.
+static inline unsigned int
+gc_mark_buffer_len(gc_mark_args_t *args)
+{
+    return args->in - args->out;
+}
+
+// Returns the number of free entry slots in the buffer.
+static inline unsigned int
+gc_mark_buffer_avail(gc_mark_args_t *args)
+{
+    return BUFFER_SIZE - gc_mark_buffer_len(args);
+}
+
+static inline bool
+gc_mark_buffer_is_empty(gc_mark_args_t *args)
+{
+    return args->in == args->out;
+}
+
+static inline bool
+gc_mark_buffer_is_full(gc_mark_args_t *args)
+{
+    return gc_mark_buffer_len(args) == BUFFER_SIZE;
+}
+
+static inline PyObject *
+gc_mark_buffer_pop(gc_mark_args_t *args)
+{
+    assert(!gc_mark_buffer_is_empty(args));
+    PyObject *op = args->buffer[args->out & BUFFER_MASK];
+    args->out++;
+    return op;
+}
+
+// Called when there is space in the buffer for the object. Issue the
+// prefetch instruction and add it to the end of the buffer.
+static inline void
+gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
+{
+    assert(!gc_mark_buffer_is_full(args));
+    prefetch(op);
+    args->buffer[args->in & BUFFER_MASK] = op;
+    args->in++;
+}
+
+// Called when we run out of space in the buffer or if prefetching is
+// disabled. The object will be pushed onto gc_mark_args.stack.
 static int
 gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
 {
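
Note that the in and out counters above run freely and are only reduced with
BUFFER_MASK when the buffer array is indexed; because unsigned arithmetic is
modular, in - out still yields the correct length even after the counters
wrap around UINT_MAX. A minimal standalone demonstration of that property
(the counter values are made up for illustration):

    #include <assert.h>
    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
        // Place the counters just below the unsigned overflow boundary.
        unsigned int in = UINT_MAX - 1;
        unsigned int out = UINT_MAX - 5;
        assert(in - out == 4);   // four entries currently buffered
        // Pushing three more wraps `in` past UINT_MAX (modulo 2^N)...
        in += 3;
        assert(in == 1);
        // ...but the computed length remains correct.
        assert(in - out == 7);
        printf("length after wraparound: %u\n", in - out);
        return 0;
    }
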
@@ -575,6 +624,9 @@ gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
 static int
 gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
 {
+    if (start == end) {
+        return 0;
+    }
     if (ss->size >= ss->capacity) {
         if (ss->capacity == 0) {
             ss->capacity = 256;
@@ -594,27 +646,36 @@ gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
     return 0;
 }
 
-// Called when there is space in the buffer for the object. Add it to the end
-// of the buffer and issue the prefetch instruction.
-static void
-gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
+static int
+gc_mark_enqueue_no_buffer(PyObject *op, gc_mark_args_t *args)
 {
-#ifdef Py_DEBUG
-    Py_ssize_t buf_used = args->enqueued - args->dequeued;
-    assert(buf_used < BUFFER_SIZE);
-#endif
-    prefetch(op);
-    args->buffer[args->enqueued % BUFFER_SIZE] = op;
-    args->enqueued++;
+    if (op == NULL) {
+        return 0;
+    }
+    if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
+        return 0;
+    }
+    if (gc_is_alive(op)) {
+        return 0;  // already visited this object
+    }
+    if (gc_maybe_untrack(op)) {
+        return 0;  // was untracked, don't visit it
+    }
+
+    // Need to call tp_traverse on this object. Add it to the stack and
+    // mark it alive so we don't traverse it a second time.
+    gc_set_alive(op);
+    if (_PyObjectStack_Push(&args->stack, op) < 0) {
+        return -1;
+    }
+    return 0;
 }
 
-// Called when we find an object that needs to be marked alive (either from a
-// root or from calling tp_traverse).
 static int
-gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
+gc_mark_enqueue_buffer(PyObject *op, gc_mark_args_t *args)
 {
     assert(op != NULL);
-    if (args->enqueued - args->dequeued < BUFFER_SIZE) {
+    if (!gc_mark_buffer_is_full(args)) {
         gc_mark_buffer_push(op, args);
         return 0;
     }
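
Both enqueue variants are used as visitproc callbacks handed to tp_traverse,
so their int return value follows the visitproc contract: zero to continue
the traversal, nonzero to abort it. A hypothetical container type (purely
illustrative, not part of this patch) would surface its references to them
like this:

    #include <Python.h>

    // Hypothetical container holding two object references.
    typedef struct {
        PyObject_HEAD
        PyObject *first;
        PyObject *second;
    } ExampleObject;

    // The GC calls this with one of the enqueue functions as `visit`.
    // Py_VISIT skips NULL fields and propagates a nonzero return, which
    // is how an allocation failure in the enqueue aborts the traversal.
    static int
    example_traverse(PyObject *op, visitproc visit, void *arg)
    {
        ExampleObject *self = (ExampleObject *)op;
        Py_VISIT(self->first);
        Py_VISIT(self->second);
        return 0;
    }
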
@@ -623,12 +684,31 @@ gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
     }
 }
 
+// Called when we find an object that needs to be marked alive (either from a
+// root or from calling tp_traverse).
+static int
+gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
+{
+    if (args->use_prefetch) {
+        return gc_mark_enqueue_buffer(op, args);
+    }
+    else {
+        return gc_mark_enqueue_no_buffer(op, args);
+    }
+}
+
+// Called when we have a contiguous sequence of PyObject pointers, from
+// either a tuple or a list object. This will add the items to the buffer
+// if there is space for them all; otherwise it pushes a new "span" on the
+// span stack. Using spans has the advantage of not creating a deep
+// _PyObjectStack stack when dealing with long sequences. Those sequences
+// will be processed in smaller chunks by the gc_prime_from_spans() function.
 static int
 gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args)
 {
-    Py_ssize_t used = args->enqueued - args->dequeued;
+    Py_ssize_t used = gc_mark_buffer_len(args);
     Py_ssize_t free = BUFFER_SIZE - used;
-    if (free > size) {
+    if (free >= size) {
         for (Py_ssize_t i = 0; i < size; i++) {
             PyObject *op = item[i];
             if (op == NULL) {
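
To make the span mechanism concrete: a list's backing array is exactly such
a contiguous block of PyObject pointers, so all of its items can be handed
over in one call instead of being visited individually through tp_traverse.
A hypothetical call site (mark_list_items is invented for illustration; the
patch itself decides when to take this path):

    // Enqueue every item of a list as a single span. ob_item points at
    // Py_SIZE(list) contiguous PyObject pointers, some possibly NULL.
    static int
    mark_list_items(PyListObject *list, gc_mark_args_t *args)
    {
        return gc_mark_enqueue_span(list->ob_item,
                                    Py_SIZE((PyObject *)list), args);
    }
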
@@ -694,9 +774,9 @@ gc_abort_mark_alive(PyInterpreterState *interp,
                     struct collection_state *state,
                     gc_mark_args_t *args)
 {
-    // We failed to allocate memory for "stack" while doing the "mark
-    // alive" phase. In that case, free the object stack and make sure
-    // that no objects have the alive bit set.
+    // We failed to allocate memory while doing the "mark alive" phase.
+    // In that case, free the memory used for marking state and make
+    // sure that no objects have the alive bit set.
     _PyObjectStack_Clear(&args->stack);
     if (args->spans.stack != NULL) {
         PyMem_Free(args->spans.stack);
@@ -1089,24 +1169,26 @@ move_legacy_finalizer_reachable(struct collection_state *state);
 static void
 gc_prime_from_spans(gc_mark_args_t *args)
 {
-    Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued);
-    assert(space >= 1);  // needed to make progress
+    Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
+    // the buffer should always have at least this much free space
+    assert(space <= gc_mark_buffer_avail(args));
+    assert(space > 0);
     gc_span_t entry = args->spans.stack[--args->spans.size];
-    while (entry.start < entry.end) {
+    // spans on the stack should always have one or more elements
+    assert(entry.start < entry.end);
+    do {
         PyObject *op = *entry.start;
+        entry.start++;
         if (op != NULL) {
-            if (space > 0) {
-                gc_mark_buffer_push(op, args);
-                space--;
-            }
-            else {
-                // no more space in buffer, push remaining
+            gc_mark_buffer_push(op, args);
+            space--;
+            if (space == 0) {
+                // buffer is as full as we want and the span isn't done
                 gc_mark_span_push(&args->spans, entry.start, entry.end);
-                break;
+                return;
             }
         }
-        entry.start++;
-    }
+    } while (entry.start < entry.end);
 }
@@ -1120,36 +1202,36 @@ gc_prime_buffer(gc_mark_args_t *args)
         // likely cause the stack to be used shortly after when it
         // fills. We want to use the buffer as much as possible and so
         // we only fill to BUFFER_HI, not BUFFER_SIZE.
-        Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued);
-        while (space > 0) {
+        Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
+        assert(space > 0);
+        do {
             PyObject *op = _PyObjectStack_Pop(&args->stack);
             if (op == NULL) {
                 return;
             }
             gc_mark_buffer_push(op, args);
             space--;
-        }
+        } while (space > 0);
     }
 }
 
 static int
-gc_propagate_alive(gc_mark_args_t *args)
+gc_propagate_alive_prefetch(gc_mark_args_t *args)
 {
     for (;;) {
-        Py_ssize_t buf_used = args->enqueued - args->dequeued;
+        Py_ssize_t buf_used = gc_mark_buffer_len(args);
         if (buf_used <= BUFFER_LO) {
             // The mark buffer is getting empty. If it's too empty
             // then there will not be enough delay between issuing
             // the prefetch and when the object is actually accessed.
             // Prime the buffer with object pointers from the stack or
             // from the spans, if there are any available.
             gc_prime_buffer(args);
-            if (args->enqueued == args->dequeued) {
-                return 0;  // buffer empty, done
+            if (gc_mark_buffer_is_empty(args)) {
+                return 0;
             }
         }
-        PyObject *op = args->buffer[args->dequeued % BUFFER_SIZE];
-        args->dequeued++;
+        PyObject *op = gc_mark_buffer_pop(args);
 
         if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
             continue;
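
The BUFFER_LO/BUFFER_HI pair forms a refill hysteresis: the consumer drains
the FIFO until only BUFFER_LO entries remain, then priming tops it back up
to BUFFER_HI, so several prefetches stay in flight between the prefetch
(push) and the dereference (pop). A standalone toy model of that
drain/refill rhythm (counts only, no real objects):

    #include <stdio.h>

    #define BUFFER_HI 16
    #define BUFFER_LO 8

    int main(void)
    {
        int len = 0;         // entries currently in the buffer
        int pending = 100;   // objects still waiting on the mark stack
        while (pending > 0 || len > 0) {
            if (len <= BUFFER_LO && pending > 0) {
                // Refill only up to BUFFER_HI, not BUFFER_SIZE, so the
                // buffer keeps room for newly discovered objects.
                int space = BUFFER_HI - len;
                int take = space < pending ? space : pending;
                len += take;
                pending -= take;
                printf("primed %2d entries, buffer now %2d\n", take, len);
            }
            len--;   // pop one entry and process it
        }
        return 0;
    }
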
@@ -1174,12 +1256,35 @@ gc_propagate_alive(gc_mark_args_t *args)
                 return -1;
             }
         }
-        else if (traverse(op, (visitproc)&gc_mark_enqueue, args) < 0) {
+        else if (traverse(op, (visitproc)&gc_mark_enqueue_buffer, args) < 0) {
             return -1;
         }
     }
 }
 
+static int
+gc_propagate_alive(gc_mark_args_t *args)
+{
+    if (args->use_prefetch) {
+        return gc_propagate_alive_prefetch(args);
+    }
+    else {
+        for (;;) {
+            PyObject *op = _PyObjectStack_Pop(&args->stack);
+            if (op == NULL) {
+                break;
+            }
+            assert(_PyObject_GC_IS_TRACKED(op));
+            assert(gc_is_alive(op));
+            traverseproc traverse = Py_TYPE(op)->tp_traverse;
+            if (traverse(op, (visitproc)&gc_mark_enqueue_no_buffer, args) < 0) {
+                return -1;
+            }
+        }
+        return 0;
+    }
+}
+
 // Using tp_traverse, mark everything reachable from known root objects
 // (which must be non-garbage) as alive (_PyGC_BITS_ALIVE is set). In
 // most programs, this marks nearly all objects that are not actually
@@ -1206,6 +1311,14 @@ gc_mark_alive_from_roots(PyInterpreterState *interp,
 #endif
     gc_mark_args_t mark_args = { 0 };
 
+    // Using prefetch instructions is only a win if the set of objects being
+    // examined by the GC does not fit into CPU caches. Otherwise, using the
+    // buffer and prefetch instructions is just overhead. The count of
+    // long-lived objects seems a good estimate of whether things will fit
+    // in the cache. On 64-bit platforms, the minimum object size is 32
+    // bytes, so a 4 MB L2 cache would hold about 130k objects.
+    mark_args.use_prefetch = interp->gc.long_lived_total > 200000;
+
 #define MARK_ENQUEUE(op) \
     if (op != NULL) { \
         if (gc_mark_enqueue(op, &mark_args) < 0) { \
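
The 200000 threshold is a heuristic rather than a measured constant; the
arithmetic behind the comment, with its assumed figures spelled out:

    #include <stdio.h>

    int main(void)
    {
        // Assumptions from the comment above: 32-byte minimum object
        // size on 64-bit platforms and a 4 MB L2 cache.
        long cache_bytes = 4L * 1024 * 1024;
        long min_obj_size = 32;
        // Roughly 131072 objects fit, so a count well above that
        // (200k long-lived objects alone) suggests the working set
        // exceeds the cache and prefetching will pay off.
        printf("objects per 4 MB cache: %ld\n", cache_bytes / min_obj_size);
        return 0;
    }
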