Skip to content

Conversation

@jakobbotsch
Copy link
Member

@jakobbotsch jakobbotsch commented Jan 29, 2026

Previously, resumption inside OSR methods looked like a normal OSR transition going through the patchpoint helper. However, the patchpoint helper is not cheap, and going through it adds an overhead of around 10-20x.

This PR optimizes resumption inside OSR functions by executing a direct non-local jump from the tier0 code into the OSR code. This completely bypasses the patchpoint helper. To do so:

  • Introduce GT_FTN_ENTRY which represents the entry point of the current function being compiled. Switch the OSR IL offset stored by OSR methods in the continuation to be this address instead.
  • Introduce GT_NONLOCAL_JMP, a unary node which represents a jump to a specified address. Change the async transformation to generate this node in tier0 codegen, so that when an OSR continuation is passed, we just jump to the OSR address.
  • Move the responsibility of simulating a call instruction from JIT_Patchpoint to the OSR function itself by simply pushing an arbitrary value onto the stack on entry.

Currently this only works for x64, but the same approach should work on other platforms once #123645 is merged.
The same approach can also be used to remove the transitioning responsibility from JIT_Patchpoint. The idea would be that JIT_Patchpoint returns a function address and the tier0 codegen just executes a nonlocal jump to it.

Example:

using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Threading.Tasks;

namespace OSRPerf;

/// <summary>
/// Micro-benchmark: runs an async method that awaits a never-completed awaiter
/// 10,000,000 times while the caller drives every resumption by hand, and prints
/// how long the await loop took.
/// </summary>
public class Program
{
    /// <summary>
    /// Starts <see cref="Foo"/> and repeatedly invokes the continuation held by the
    /// awaiter until the returned task reports completion.
    /// </summary>
    static void Main()
    {
        NullAwaiter na = new NullAwaiter();

        Task t = Foo(10_000_000, na);
        // Each call invokes the delegate currently stored in na.Continue
        // (assigned by NullAwaiter.UnsafeOnCompleted).
        while (!t.IsCompleted)
        {
            na.Continue();
        }
    }

    // Accumulator written by the first loop in Foo.
    static int s_value;

    /// <summary>
    /// Runs a synchronous loop of <paramref name="n"/> iterations, then awaits
    /// <paramref name="na"/> 10,000,000 times and writes the elapsed time of the
    /// await loop to the console.
    /// </summary>
    /// <param name="n">Iteration count of the initial synchronous loop.</param>
    /// <param name="na">Awaiter that is awaited on every iteration of the timed loop.</param>
    static async Task Foo(int n, NullAwaiter na)
    {
        // NOTE(review): presumably this loop runs long enough to move the method
        // into OSR code before the timed awaits start - confirm.
        for (int i = 0; i < n; i++)
        {
            s_value += i;
        }

        // Only the await loop below is timed; the loop above is excluded.
        Stopwatch timer = Stopwatch.StartNew();
        for (int i = 0; i < 10_000_000; i++)
        {
            await na;
        }
        Console.WriteLine("Took {0:F1} ms", timer.Elapsed.TotalMilliseconds);
    }

    /// <summary>
    /// Awaitable that acts as its own awaiter, always reports that it is not
    /// completed, and stores the continuation it is handed in <see cref="Continue"/>
    /// without invoking it.
    /// </summary>
    private class NullAwaiter : ICriticalNotifyCompletion
    {
        // Last continuation passed to UnsafeOnCompleted; invoked by Main.
        public Action Continue;

        /// <summary>Returns this instance, so the object is both awaitable and awaiter.</summary>
        public NullAwaiter GetAwaiter() => this;

        /// <summary>Always <c>false</c>.</summary>
        public bool IsCompleted => false;

        /// <summary>Does nothing and produces no result.</summary>
        public void GetResult()
        {
        }

        /// <summary>Stores <paramref name="continuation"/> in <see cref="Continue"/>.</summary>
        /// <param name="continuation">Delegate to store for a later manual invocation.</param>
        public void UnsafeOnCompleted(Action continuation)
        {
            Continue = continuation;
        }

        /// <summary>Not implemented by this awaiter.</summary>
        /// <param name="continuation">Unused.</param>
        /// <exception cref="NotImplementedException">Always thrown.</exception>
        public void OnCompleted(Action continuation)
        {
            throw new NotImplementedException();
        }
    }
}

Before: Took 19462.2 ms
After: Took 799.1 ms

Tier0 codegen diff
@@ -22,7 +22,7 @@
 ;  V12 tmp4         [V12    ] (  1,  1   )     ref  ->  [rbp-0x48]  do-not-enreg[X] must-init addr-exposed ld-addr-op "Async SynchronizationContext"
 ;  V13 tmp5         [V13    ] (  1,  1   )     ref  ->  [rbp-0x80]  do-not-enreg[] must-init "returned continuation"
 ;  V14 tmp6         [V14    ] (  1,  1   )     ref  ->  [rbp-0x88]  do-not-enreg[] must-init "new continuation"
-;  V15 tmp7         [V15    ] (  1,  1   )     int  ->  [rbp-0x8C]  do-not-enreg[] must-init "IL offset for tier0 OSR method"
+;  V15 tmp7         [V15    ] (  1,  1   )    long  ->  [rbp-0x90]  do-not-enreg[] must-init "OSR address for tier0 OSR method"
 ;  TEMP_01                                   byref  ->  [rbp-0x98]
 ;
 ; Lcl frame size = 192
@@ -178,7 +178,8 @@ G_M39061_IG15:
        mov      rax, gword ptr [rbp-0x88]
        mov      dword ptr [rax+0x18], 1
        mov      rax, gword ptr [rbp-0x88]
-       mov      dword ptr [rax+0x20], -1
+       xor      ecx, ecx
+       mov      qword ptr [rax+0x20], rcx
        mov      rax, gword ptr [rbp-0x88]
        lea      rcx, bword ptr [rax+0x30]
        mov      rdx, gword ptr [rbp+0x20]
@@ -223,9 +224,9 @@ G_M39061_IG15:
        mov      bword ptr [rbp-0x98], rcx
        call     [System.Runtime.CompilerServices.AsyncHelpers:CaptureExecutionContext():System.Threading.ExecutionContext]
        mov      rcx, bword ptr [rbp-0x98]
-       mov      rdx, rax
-						;; size=309 bbWeight=0 PerfScore 0.00
+						;; size=305 bbWeight=0 PerfScore 0.00
 G_M39061_IG16:
+       mov      rdx, rax
        call     CORINFO_HELP_ASSIGN_REF
        cmp      gword ptr [rbp+0x10], 0
        setne    cl
@@ -235,7 +236,7 @@ G_M39061_IG16:
        call     [System.Runtime.CompilerServices.AsyncHelpers:RestoreContextsOnSuspension(bool,System.Threading.ExecutionContext,System.Threading.SynchronizationContext)]
        mov      rax, gword ptr [rbp-0x88]
        mov      rcx, rax
-						;; size=40 bbWeight=0 PerfScore 0.00
+						;; size=43 bbWeight=0 PerfScore 0.00
 G_M39061_IG17:
        add      rsp, 192
        pop      rbp
@@ -243,10 +244,10 @@ G_M39061_IG17:
 						;; size=9 bbWeight=0 PerfScore 0.00
 G_M39061_IG18:
        mov      rax, gword ptr [rbp+0x10]
-       mov      eax, dword ptr [rax+0x20]
-       mov      dword ptr [rbp-0x8C], eax
-       cmp      dword ptr [rbp-0x8C], 0
-       jge      G_M39061_IG19
+       mov      rax, qword ptr [rax+0x20]
+       mov      qword ptr [rbp-0x90], rax
+       cmp      qword ptr [rbp-0x90], 0
+       jne      G_M39061_IG19
        mov      rax, gword ptr [rbp+0x10]
        mov      rcx, gword ptr [rax+0x28]
        call     [System.Runtime.CompilerServices.AsyncHelpers:RestoreExecutionContext(System.Threading.ExecutionContext)]
@@ -284,12 +285,11 @@ G_M39061_IG18:
        mov      eax, dword ptr [rax+0x74]
        mov      dword ptr [rbp-0x78], eax
        jmp      G_M39061_IG09
-						;; size=172 bbWeight=0 PerfScore 0.00
+						;; size=175 bbWeight=0 PerfScore 0.00
 G_M39061_IG19:
-       mov      ecx, dword ptr [rbp-0x8C]
-       call     CORINFO_HELP_PATCHPOINT_FORCED
+       jmp      qword ptr [rbp-0x90]
        int3     
-						;; size=12 bbWeight=0 PerfScore 0.00
+						;; size=7 bbWeight=0 PerfScore 0.00
 G_M39061_IG20:
        sub      rsp, 40
 						;; size=4 bbWeight=0 PerfScore 0.00
@@ -310,5 +310,5 @@ RWD00  	dq	(dynamicClass):IL_STUB_AsyncResume_Foo_Tier0(System.Object,byref):Sys
 	dq	G_M39061_IG15
 
 
-; Total bytes of code 1050, prolog size 62, PerfScore 126.37, instruction count 235, allocated bytes for code 1050 (MethodHash=01c8676a) for method OSRPerf.Program:Foo(int,OSRPerf.Program+NullAwaiter) (Instrumented Tier0)
+; Total bytes of code 1047, prolog size 62, PerfScore 126.37, instruction count 235, allocated bytes for code 1047 (MethodHash=01c8676a) for method OSRPerf.Program:Foo(int,OSRPerf.Program+NullAwaiter) (Instrumented Tier0)
 ; ============================================================
OSR method diff
@@ -62,6 +62,7 @@
 ; Lcl frame size = 80
 
 G_M39061_IG01:
+       push     rax
        mov      rax, qword ptr [rbp]
        push     rax
        sub      rsp, 112
@@ -78,7 +79,7 @@ G_M39061_IG01:
        mov      rax, gword ptr [rbp+0xE0]
        mov      ecx, dword ptr [rbp+0xE8]
        mov      edx, dword ptr [rbp+0x7C]
-						;; size=81 bbWeight=1 PerfScore 19.00
+						;; size=82 bbWeight=1 PerfScore 20.00
 G_M39061_IG02:
        test     rax, rax
        jne      G_M39061_IG49
@@ -343,7 +344,8 @@ G_M39061_IG47:
        lea      rcx, [reloc @RWD32]
        mov      qword ptr [r14+0x10], rcx
        mov      qword ptr [r14+0x18], 1
-       mov      dword ptr [r14+0x20], 20
+       lea      rcx, G_M39061_IG01
+       mov      qword ptr [r14+0x20], rcx
        lea      rcx, bword ptr [r14+0x30]
        mov      rdx, rbx
        call     CORINFO_HELP_ASSIGN_REF
@@ -366,7 +368,7 @@ G_M39061_IG47:
        mov      r8, gword ptr [rbp+0x88]
        call     [System.Runtime.CompilerServices.AsyncHelpers:RestoreContextsOnSuspension(bool,System.Threading.ExecutionContext,System.Threading.SynchronizationContext)]
        mov      rcx, r14
-						;; size=145 bbWeight=0 PerfScore 0.00
+						;; size=148 bbWeight=0 PerfScore 0.00
 G_M39061_IG48:
        vmovaps  xmm6, xmmword ptr [rsp+0x40]
        add      rsp, 288
@@ -378,9 +380,9 @@ G_M39061_IG48:
        ret      
 						;; size=20 bbWeight=0 PerfScore 0.00
 G_M39061_IG49:
-       cmp      dword ptr [rax+0x20], 0
+       cmp      qword ptr [rax+0x20], 0
        mov      rax, gword ptr [rbp+0xE0]
-       jl       G_M39061_IG03
+       je       G_M39061_IG03
        mov      rcx, gword ptr [rax+0x28]
        call     [System.Runtime.CompilerServices.AsyncHelpers:RestoreExecutionContext(System.Threading.ExecutionContext)]
        mov      rax, gword ptr [rbp+0xE0]
@@ -392,7 +394,7 @@ G_M39061_IG49:
        mov      qword ptr [rbp-0x40], r8
        mov      edi, dword ptr [rax+0x50]
        jmp      G_M39061_IG12
-						;; size=66 bbWeight=0 PerfScore 0.00
+						;; size=67 bbWeight=0 PerfScore 0.00
 G_M39061_IG50:
        sub      rsp, 40
        vzeroupper 
@@ -418,5 +420,5 @@ RWD32  	dq	(dynamicClass):IL_STUB_AsyncResume_Foo_Tier1OSR(System.Object,byref):
 	dq	G_M39061_IG47

Fix #120865

@github-actions github-actions bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Jan 29, 2026
@dotnet-policy-service
Copy link
Contributor

Tagging subscribers to this area: @JulieLeeMSFT, @dotnet/jit-contrib
See info in area-owners.md if you want to be subscribed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI

Projects

None yet

Development

Successfully merging this pull request may close these issues.

Optimize runtime async OSR resumption performance

1 participant