@@ -2415,6 +2415,298 @@ insert_superinstructions(cfg_builder *g)
24152415 return res ;
24162416}
24172417
// A single entry on the simulated evaluation stack: records which
// instruction pushed the value and, if it was loaded from a local,
// which local it came from.
typedef struct {
    // Index of instruction that produced the reference or -1.
    int instr;

    // The local to which the reference refers or -1.
    int local;
} ref;

// `local` value for stack entries that do not refer to any local.
#define NOT_LOCAL -1

// Placeholder entry for stack slots whose producer is unknown (values
// pushed by a predecessor basic block).
#define DUMMY_REF (ref){-1, NOT_LOCAL}
2429+
// Growable stack of `ref`s mirroring the evaluation stack of the basic
// block currently being analyzed.
typedef struct {
    ref *refs;            // heap-allocated backing array (PyMem)
    Py_ssize_t size;      // number of entries currently on the stack
    Py_ssize_t capacity;  // allocated length of `refs`
} ref_stack;
2435+
2436+ static bool
2437+ ref_stack_has_refs_from_instr (ref_stack * stack , int instr )
2438+ {
2439+ for (Py_ssize_t i = 0 ; i < stack -> size ; i ++ ) {
2440+ if (stack -> refs [i ].instr == instr ) {
2441+ return true;
2442+ }
2443+ }
2444+ return false;
2445+ }
2446+
2447+ static int
2448+ ref_stack_push (ref_stack * stack , ref r )
2449+ {
2450+ if (stack -> size == stack -> capacity ) {
2451+ Py_ssize_t new_cap = Py_MAX (32 , stack -> capacity * 2 );
2452+ ref * refs = PyMem_Realloc (stack -> refs , sizeof (* stack -> refs ) * new_cap );
2453+ if (refs == NULL ) {
2454+ PyErr_NoMemory ();
2455+ return -1 ;
2456+ }
2457+ stack -> refs = refs ;
2458+ stack -> capacity = new_cap ;
2459+ }
2460+ stack -> refs [stack -> size ] = r ;
2461+ stack -> size ++ ;
2462+ return 0 ;
2463+ }
2464+
2465+ static ref
2466+ ref_stack_pop (ref_stack * stack )
2467+ {
2468+ assert (stack -> size > 0 );
2469+ stack -> size -- ;
2470+ ref r = stack -> refs [stack -> size ];
2471+ return r ;
2472+ }
2473+
2474+ static void
2475+ ref_stack_swap_top (ref_stack * stack , Py_ssize_t off )
2476+ {
2477+ Py_ssize_t idx = stack -> size - off ;
2478+ assert (idx >= 0 && idx < stack -> size );
2479+ ref tmp = stack -> refs [idx ];
2480+ stack -> refs [idx ] = stack -> refs [stack -> size - 1 ];
2481+ stack -> refs [stack -> size - 1 ] = tmp ;
2482+ }
2483+
2484+ static ref
2485+ ref_stack_at (ref_stack * stack , Py_ssize_t idx )
2486+ {
2487+ assert (idx >= 0 && idx < stack -> size );
2488+ return stack -> refs [idx ];
2489+ }
2490+
// Discard all entries without releasing the backing storage, so the
// allocation can be reused for the next basic block.
static void
ref_stack_clear(ref_stack *stack)
{
    stack->size = 0;
}
2496+
2497+ static void
2498+ ref_stack_fini (ref_stack * stack )
2499+ {
2500+ if (stack -> refs != NULL ) {
2501+ PyMem_Free (stack -> refs );
2502+ }
2503+ stack -> refs = NULL ;
2504+ stack -> capacity = 0 ;
2505+ stack -> size = 0 ;
2506+ }
2507+
2508+ static void
2509+ kill_local (bool * has_killed_refs , Py_ssize_t size , ref_stack * refs , int local )
2510+ {
2511+ for (Py_ssize_t i = 0 ; i < refs -> size ; i ++ ) {
2512+ ref r = ref_stack_at (refs , i );
2513+ if (r .local == local ) {
2514+ assert (r .instr >= 0 );
2515+ has_killed_refs [r .instr ] = true;
2516+ }
2517+ }
2518+ }
2519+
2520+ static void
2521+ load_fast_push_block (basicblock * * * sp , basicblock * target , int start_depth )
2522+ {
2523+ assert (!target -> b_visited || (target -> b_startdepth == start_depth ));
2524+ if (!target -> b_visited ) {
2525+ assert (target -> b_startdepth == -1 );
2526+ target -> b_startdepth = start_depth ;
2527+ target -> b_visited = 1 ;
2528+ * (* sp )++ = target ;
2529+ }
2530+ }
2531+
2532+ static int
2533+ optimize_load_fast (cfg_builder * g )
2534+ {
2535+ int status ;
2536+ ref_stack refs = {0 };
2537+ bool * has_killed_refs = NULL ;
2538+ basicblock * entryblock = g -> g_entryblock ;
2539+ for (basicblock * b = entryblock ; b != NULL ; b = b -> b_next ) {
2540+ b -> b_startdepth = -1 ;
2541+ }
2542+ basicblock * * blocks = make_cfg_traversal_stack (entryblock );
2543+ if (blocks == NULL ) {
2544+ status = ERROR ;
2545+ goto done ;
2546+ }
2547+ basicblock * * sp = blocks ;
2548+ * sp = entryblock ;
2549+ sp ++ ;
2550+ entryblock -> b_startdepth = 0 ;
2551+ entryblock -> b_visited = 1 ;
2552+
2553+ while (sp != blocks ) {
2554+ basicblock * block = * -- sp ;
2555+ assert (block -> b_startdepth > -1 );
2556+
2557+ // Reset state that tracks which instructions produce references to
2558+ // locals that are on the stack while the local is overwritten.
2559+ int size = sizeof (* has_killed_refs ) * block -> b_iused ;
2560+ bool * p = PyMem_Realloc (has_killed_refs , size );
2561+ if (p == NULL ) {
2562+ PyErr_NoMemory ();
2563+ status = ERROR ;
2564+ goto done ;
2565+ }
2566+ else {
2567+ has_killed_refs = p ;
2568+ }
2569+ memset (has_killed_refs , 0 , size );
2570+
2571+ // Reset the stack of refs. We don't track references on the stack
2572+ // across basic blocks, but the bytecode will expect their
2573+ // presence. Add dummy references as necessary.
2574+ ref_stack_clear (& refs );
2575+ for (int i = 0 ; i < block -> b_startdepth ; i ++ ) {
2576+ ref_stack_push (& refs , DUMMY_REF );
2577+ }
2578+
2579+ for (int i = 0 ; i < block -> b_iused ; i ++ ) {
2580+ cfg_instr * instr = & block -> b_instr [i ];
2581+ int opcode = instr -> i_opcode ;
2582+ int oparg = instr -> i_oparg ;
2583+ assert (opcode != EXTENDED_ARG );
2584+ switch (opcode ) {
2585+ case COPY : {
2586+ Py_ssize_t idx = refs .size - oparg ;
2587+ ref r = ref_stack_at (& refs , idx );
2588+ if (ref_stack_push (& refs , r ) < 0 ) {
2589+ status = ERROR ;
2590+ goto done ;
2591+ }
2592+ break ;
2593+ }
2594+
2595+ case LOAD_FAST : {
2596+ if (ref_stack_push (& refs , (ref ){i , oparg }) < 0 ) {
2597+ status = ERROR ;
2598+ goto done ;
2599+ }
2600+ break ;
2601+ }
2602+
2603+ case LOAD_FAST_LOAD_FAST : {
2604+ if (ref_stack_push (& refs , (ref ){i , oparg >> 4 }) < 0 ) {
2605+ status = ERROR ;
2606+ goto done ;
2607+ }
2608+ if (ref_stack_push (& refs , (ref ){i , oparg & 15 }) < 0 ) {
2609+ status = ERROR ;
2610+ goto done ;
2611+ }
2612+ break ;
2613+ }
2614+
2615+ case RETURN_VALUE : {
2616+ // We need to return a new reference so there is no point
2617+ // optimizing the instruction that produced the returned
2618+ // reference.
2619+ ref r = ref_stack_pop (& refs );
2620+ if (r .local != NOT_LOCAL ) {
2621+ assert (r .instr >= 0 );
2622+ has_killed_refs [r .instr ] = true;
2623+ }
2624+ break ;
2625+ }
2626+
2627+ case STORE_FAST : {
2628+ kill_local (has_killed_refs , block -> b_iused , & refs , oparg );
2629+ ref_stack_pop (& refs );
2630+ break ;
2631+ }
2632+
2633+ case STORE_FAST_STORE_FAST : {
2634+ kill_local (has_killed_refs , block -> b_iused , & refs , oparg >> 4 );
2635+ kill_local (has_killed_refs , block -> b_iused , & refs , oparg & 15 );
2636+ ref_stack_pop (& refs );
2637+ ref_stack_pop (& refs );
2638+ break ;
2639+ }
2640+
2641+ case SWAP : {
2642+ ref_stack_swap_top (& refs , oparg );
2643+ break ;
2644+ }
2645+
2646+ default : {
2647+ int num_popped = _PyOpcode_num_popped (opcode , oparg );
2648+ int num_pushed = _PyOpcode_num_pushed (opcode , oparg );
2649+ if (HAS_TARGET (instr -> i_opcode )) {
2650+ load_fast_push_block (& sp , instr -> i_target , refs .size - num_popped + num_pushed );
2651+ }
2652+ if (!IS_BLOCK_PUSH_OPCODE (instr -> i_opcode )) {
2653+ // Block push opcodes only affect the stack when jumping
2654+ // to the target.
2655+ for (int j = 0 ; j < num_popped ; j ++ ) {
2656+ ref_stack_pop (& refs );
2657+ }
2658+ for (int j = 0 ; j < num_pushed ; j ++ ) {
2659+ if (ref_stack_push (& refs , (ref ){i , NOT_LOCAL }) < 0 ) {
2660+ status = ERROR ;
2661+ goto done ;
2662+ }
2663+ }
2664+ }
2665+ break ;
2666+ }
2667+ }
2668+ }
2669+
2670+ // Optimize instructions
2671+ for (int i = 0 ; i < block -> b_iused ; i ++ ) {
2672+ if (!has_killed_refs [i ] && !ref_stack_has_refs_from_instr (& refs , i )) {
2673+ cfg_instr * instr = & block -> b_instr [i ];
2674+ switch (instr -> i_opcode ) {
2675+ case LOAD_FAST :
2676+ instr -> i_opcode = LOAD_FAST_BORROW ;
2677+ break ;
2678+ case LOAD_FAST_LOAD_FAST :
2679+ instr -> i_opcode = LOAD_FAST_BORROW_LOAD_FAST_BORROW ;
2680+ break ;
2681+ default :
2682+ break ;
2683+ }
2684+ }
2685+ }
2686+
2687+ // Push fallthrough block
2688+ cfg_instr * term = basicblock_last_instr (block );
2689+ if (term != NULL && block -> b_next != NULL &&
2690+ !(IS_UNCONDITIONAL_JUMP_OPCODE (term -> i_opcode ) ||
2691+ IS_SCOPE_EXIT_OPCODE (term -> i_opcode ))) {
2692+ assert (BB_HAS_FALLTHROUGH (block ));
2693+ load_fast_push_block (& sp , block -> b_next , refs .size );
2694+ }
2695+ }
2696+
2697+ status = SUCCESS ;
2698+
2699+ done :
2700+ ref_stack_fini (& refs );
2701+ if (has_killed_refs != NULL ) {
2702+ PyMem_Free (has_killed_refs );
2703+ }
2704+ if (blocks != NULL ) {
2705+ PyMem_Free (blocks );
2706+ }
2707+ return status ;
2708+ }
2709+
24182710// helper functions for add_checks_for_loads_of_unknown_variables
24192711static inline void
24202712maybe_push (basicblock * b , uint64_t unsafe_mask , basicblock * * * sp )
@@ -3028,6 +3320,7 @@ _PyCfg_OptimizeCodeUnit(cfg_builder *g, PyObject *consts, PyObject *const_cache,
30283320 add_checks_for_loads_of_uninitialized_variables (
30293321 g -> g_entryblock , nlocals , nparams ));
30303322 RETURN_IF_ERROR (insert_superinstructions (g ));
3323+ RETURN_IF_ERROR (optimize_load_fast (g ));
30313324
30323325 RETURN_IF_ERROR (push_cold_blocks_to_end (g ));
30333326 RETURN_IF_ERROR (resolve_line_numbers (g , firstlineno ));
0 commit comments