@@ -482,8 +482,8 @@ gc_maybe_untrack(PyObject *op)
482482#define BUFFER_LO 8
483483
484484#if defined(__GNUC__ ) || defined(__clang__ )
485- #define PREFETCH_L1 (ptr ) __builtin_prefetch(ptr, 1 , 3)
486- #define PREFETCH_L2 (ptr ) __builtin_prefetch(ptr, 1 , 2)
485+ #define PREFETCH_L1 (ptr ) __builtin_prefetch(ptr, 0 , 3)
486+ #define PREFETCH_L2 (ptr ) __builtin_prefetch(ptr, 0 , 2)
487487#elif defined(_MSC_VER ) && (defined(_M_X64 ) || defined(_M_I86 )) && !defined(_M_ARM64EC )
488488 #include <mmintrin.h>
489489 #define PREFETCH_L1 (ptr ) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
@@ -497,17 +497,30 @@ gc_maybe_untrack(PyObject *op)
497497#endif
498498
499499#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS
500- #define prefetch (ptr ) PREFETCH_L2 (ptr)
500+ #define prefetch (ptr ) PREFETCH_L1 (ptr)
501501#else
502502#define prefetch (ptr )
503503#endif
504504
505- struct gc_mark_args {
505+ // a contiguous sequence of PyObject pointers
506+ typedef struct {
507+ PyObject * * start ; // first slot of the span
508+ PyObject * * end ; // one past the last slot; the span is the half-open range [start, end)
509+ } gc_span_t ;
510+
511+ typedef struct {
512+ Py_ssize_t size ; // number of spans currently stored in `stack`
513+ Py_ssize_t capacity ; // number of gc_span_t slots allocated for `stack`
514+ gc_span_t * stack ; // array grown with PyMem_Realloc(); NULL while zero-initialized and nothing has been pushed
515+ } gc_span_stack_t ;
516+
517+ typedef struct {
506518 Py_ssize_t enqueued ; // running count of objects stored into `buffer`
507519 Py_ssize_t dequeued ; // running count of objects read from `buffer`; enqueued - dequeued is the number of slots in use
508520 _PyObjectStack stack ; // overflow storage for objects when `buffer` has no room
521+ gc_span_stack_t spans ; // pointer spans recorded for later, consumed when the buffer is primed
509522 PyObject * buffer [BUFFER_SIZE ]; // ring buffer; slots are indexed modulo BUFFER_SIZE
510- };
523+ } gc_mark_args_t ;
511524
512525// Called when we run out of space in the buffer. The object will be added
513526// to gc_mark_args.stack instead.
@@ -520,24 +533,45 @@ gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
520533 return 0 ;
521534}
522535
536+ static int
537+ gc_mark_span_push (gc_span_stack_t * ss , PyObject * * start , PyObject * * end )
538+ {
539+ if (ss -> size >= ss -> capacity ) {
540+ if (ss -> capacity == 0 ) {
541+ ss -> capacity = 256 ;
542+ }
543+ else {
544+ ss -> capacity *= 2 ;
545+ }
546+ ss -> stack = (gc_span_t * )PyMem_Realloc (ss -> stack , ss -> capacity * sizeof (gc_span_t ));
547+ if (ss -> stack == NULL ) {
548+ return -1 ;
549+ }
550+ }
551+ ss -> stack [ss -> size ].start = start ;
552+ ss -> stack [ss -> size ].end = end ;
553+ ss -> size ++ ;
554+ return 0 ;
555+ }
556+
523557// Called when there is space in the buffer for the object. Add it to the end
524558// of the buffer and issue the prefetch instruction.
525- static inline void
526- gc_mark_buffer_push (PyObject * op , struct gc_mark_args * args )
559+ static void
560+ gc_mark_buffer_push (PyObject * op , gc_mark_args_t * args )
527561{
528562#ifdef Py_DEBUG
529563 Py_ssize_t buf_used = args -> enqueued - args -> dequeued ;
530564 assert (buf_used < BUFFER_SIZE ); // the caller must have made sure a slot is free
531565#endif
566+ prefetch (op ); // issued before the store; expands to nothing unless GC_ENABLE_PREFETCH_INSTRUCTIONS is defined
532567 args -> buffer [args -> enqueued % BUFFER_SIZE ] = op ; // ring buffer: the slot index wraps modulo BUFFER_SIZE
533568 args -> enqueued ++ ;
534- prefetch (op );
535569}
536570
537571// Called when we find an object that needs to be marked alive (either from a
538572// root or from calling tp_traverse).
539573static int
540- gc_mark_enqueue (PyObject * op , struct gc_mark_args * args )
574+ gc_mark_enqueue (PyObject * op , gc_mark_args_t * args )
541575{
542576 assert (op != NULL );
543577 if (args -> enqueued - args -> dequeued < BUFFER_SIZE ) {
@@ -549,6 +583,25 @@ gc_mark_enqueue(PyObject *op, struct gc_mark_args *args)
549583 }
550584}
551585
586+ static int
587+ gc_mark_enqueue_span (PyObject * * item , Py_ssize_t size , gc_mark_args_t * args )
588+ {
589+ Py_ssize_t used = args -> enqueued - args -> dequeued ;
590+ Py_ssize_t free = BUFFER_SIZE - used ;
591+ if (free > size ) {
592+ for (Py_ssize_t i = 0 ; i < size ; i ++ ) {
593+ gc_mark_buffer_push (item [i ], args );
594+ }
595+ }
596+ else {
597+ PyObject * * end = & item [size ];
598+ if (gc_mark_span_push (& args -> spans , item , end ) < 0 ) {
599+ return -1 ;
600+ }
601+ }
602+ return 0 ;
603+ }
604+
552605static bool
553606gc_clear_alive_bits (const mi_heap_t * heap , const mi_heap_area_t * area ,
554607 void * block , size_t block_size , void * args )
@@ -570,10 +623,8 @@ gc_mark_traverse_list(PyObject *self, void *args)
570623 if (list -> ob_item == NULL ) {
571624 return 0 ;
572625 }
573- for (Py_ssize_t i = 0 ; i < Py_SIZE (list ); i ++ ) {
574- if (gc_mark_enqueue (list -> ob_item [i ], args ) < 0 ) {
575- return -1 ;
576- }
626+ if (gc_mark_enqueue_span (list -> ob_item , PyList_GET_SIZE (list ), args ) < 0 ) {
627+ return -1 ;
577628 }
578629 return 0 ;
579630}
@@ -586,33 +637,30 @@ gc_mark_traverse_tuple(PyObject *self, void *args)
586637 return 0 ;
587638 }
588639 PyTupleObject * tuple = _PyTuple_CAST (self );
589- for (Py_ssize_t i = Py_SIZE (tuple ); -- i >= 0 ; ) {
590- PyObject * item = tuple -> ob_item [i ];
591- if (item == NULL ) {
592- continue ;
593- }
594- if (gc_mark_enqueue (tuple -> ob_item [i ], args ) < 0 ) {
595- return -1 ;
596- }
640+ if (gc_mark_enqueue_span (tuple -> ob_item , Py_SIZE (tuple ), args ) < 0 ) {
641+ return -1 ;
597642 }
598643 return 0 ;
599644}
600645
601646static void
602647gc_abort_mark_alive (PyInterpreterState * interp ,
603648 struct collection_state * state ,
604- struct gc_mark_args * args )
649+ gc_mark_args_t * args )
605650{
606651 // We failed to allocate memory for "stack" while doing the "mark
607652 // alive" phase. In that case, free the object stack and make sure
608653 // that no objects have the alive bit set.
609654 _PyObjectStack_Clear (& args -> stack );
655+ if (args -> spans .stack != NULL ) {
656+ PyMem_Free (args -> spans .stack );
657+ }
610658 gc_visit_heaps (interp , & gc_clear_alive_bits , & state -> base );
611659}
612660
613661#ifdef GC_MARK_ALIVE_STACKS
614662static int
615- gc_visit_stackref_mark_alive (struct gc_mark_args * args , _PyStackRef stackref )
663+ gc_visit_stackref_mark_alive (gc_mark_args_t * args , _PyStackRef stackref )
616664{
617665 if (!PyStackRef_IsNull (stackref )) {
618666 PyObject * op = PyStackRef_AsPyObjectBorrow (stackref );
@@ -624,7 +672,7 @@ gc_visit_stackref_mark_alive(struct gc_mark_args *args, _PyStackRef stackref)
624672}
625673
626674static int
627- gc_visit_thread_stacks_mark_alive (PyInterpreterState * interp , struct gc_mark_args * args )
675+ gc_visit_thread_stacks_mark_alive (PyInterpreterState * interp , gc_mark_args_t * args )
628676{
629677 _Py_FOR_EACH_TSTATE_BEGIN (interp , p ) {
630678 for (_PyInterpreterFrame * f = p -> current_frame ; f != NULL ; f = f -> previous ) {
@@ -974,39 +1022,65 @@ move_legacy_finalizer_reachable(struct collection_state *state);
9741022#ifdef GC_ENABLE_MARK_ALIVE
9751023
9761024static void
977- gc_mark_buffer_prime (struct gc_mark_args * args )
978- {
979- for (;;) {
980- Py_ssize_t buf_used = args -> enqueued - args -> dequeued ;
981- if (buf_used >= BUFFER_HI ) {
982- // When priming, don't fill the buffer since that would
983- // likely cause the stack to be used shortly after when it
984- // fills. We want to use the buffer as much as possible and
985- // so we only fill to BUFFER_HI, not BUFFER_SIZE.
986- return ;
1025+ gc_prime_from_spans (gc_mark_args_t * args )
1026+ {
1027+ Py_ssize_t space = BUFFER_HI - (args -> enqueued - args -> dequeued ); // slots that may be filled before the buffer reaches BUFFER_HI
1028+ assert (space >= 1 ); // needed to make progress
1029+ gc_span_t entry = args -> spans .stack [-- args -> spans .size ]; // pop the top span; the caller checks spans.size > 0 first
1030+ while (entry .start < entry .end ) {
1031+ PyObject * op = * entry .start ;
1032+ if (op != NULL ) { // NULL slots are skipped without using buffer space
1033+ if (space > 0 ) {
1034+ gc_mark_buffer_push (op , args );
1035+ space -- ;
1036+ }
1037+ else {
1038+ // no more space in buffer, push remaining (starting at the current, not yet enqueued, slot)
1039+ gc_mark_span_push (& args -> spans , entry .start , entry .end ); // result ignored: the pop above freed a slot, so this push does not need to grow the array
1040+ break ;
1041+ }
9871042 }
988- PyObject * op = _PyObjectStack_Pop (& args -> stack );
989- if (op == NULL ) {
990- break ;
1043+ entry .start ++ ;
1044+ }
1045+ }
1046+
1047+ static void
1048+ gc_prime_buffer (gc_mark_args_t * args )
1049+ {
1050+ if (args -> spans .size > 0 ) {
1051+ gc_prime_from_spans (args );
1052+ }
1053+ else {
1054+ // When priming, don't fill the buffer too full since that would
1055+ // likely cause the stack to be used shortly after when it
1056+ // fills. We want to use the buffer as much as possible and so
1057+ // we only fill to BUFFER_HI, not BUFFER_SIZE.
1058+ Py_ssize_t space = BUFFER_HI - (args -> enqueued - args -> dequeued );
1059+ while (space > 0 ) {
1060+ PyObject * op = _PyObjectStack_Pop (& args -> stack );
1061+ if (op == NULL ) {
1062+ return ;
1063+ }
1064+ gc_mark_buffer_push (op , args );
1065+ space -- ;
9911066 }
992- gc_mark_buffer_push (op , args );
9931067 }
9941068}
9951069
9961070static int
997- gc_propagate_alive (struct gc_mark_args * args )
1071+ gc_propagate_alive (gc_mark_args_t * args )
9981072{
9991073 for (;;) {
10001074 Py_ssize_t buf_used = args -> enqueued - args -> dequeued ;
10011075 if (buf_used <= BUFFER_LO ) {
10021076 // The mark buffer is getting empty. If it's too empty
10031077 // then there will not be enough delay between issuing
1004- // the prefetch vs when the object is actually accessed.
1005- // Prime the buffer with object pointers from the stack,
1006- // if there are any available.
1007- gc_mark_buffer_prime (args );
1078+ // the prefetch and when the object is actually accessed.
1079+ // Prime the buffer with object pointers from the stack or
1080+ // from the spans, if there are any available.
1081+ gc_prime_buffer (args );
10081082 if (args -> enqueued == args -> dequeued ) {
1009- return 0 ; // stack and buffer are both empty
1083+ return 0 ; // buffer empty, done
10101084 }
10111085 }
10121086 PyObject * op = args -> buffer [args -> dequeued % BUFFER_SIZE ];
@@ -1065,7 +1139,7 @@ gc_mark_alive_from_roots(PyInterpreterState *interp,
10651139 // Check that all objects don't have alive bit set
10661140 gc_visit_heaps (interp , & validate_alive_bits , & state -> base );
10671141#endif
1068- struct gc_mark_args mark_args = { 0 };
1142+ gc_mark_args_t mark_args = { 0 };
10691143
10701144 #define MARK_ENQUEUE (op ) \
10711145 if (op != NULL ) { \
@@ -1102,6 +1176,10 @@ gc_mark_alive_from_roots(PyInterpreterState *interp,
11021176 return -1 ;
11031177 }
11041178
1179+ assert (mark_args .spans .size == 0 );
1180+ if (mark_args .spans .stack != NULL ) {
1181+ PyMem_Free (mark_args .spans .stack );
1182+ }
11051183 assert (mark_args .stack .head == NULL );
11061184
11071185 return 0 ;
0 commit comments