diff --git a/.github/workflows/zjit-macos.yml b/.github/workflows/zjit-macos.yml
index 8e5ca954cb8a43..30024a6d0c9d1b 100644
--- a/.github/workflows/zjit-macos.yml
+++ b/.github/workflows/zjit-macos.yml
@@ -40,6 +40,7 @@ jobs:
 
           - test_task: 'zjit-test-all'
             configure: '--enable-zjit=dev'
+            testopts: '--seed=11831'
 
           - test_task: 'btest'
             configure: '--enable-zjit=dev'
@@ -48,6 +49,7 @@ jobs:
       GITPULLOPTIONS: --no-tags origin ${{ github.ref }}
       RUN_OPTS: ${{ matrix.zjit_opts }}
       SPECOPTS: ${{ matrix.specopts }}
+      TESTOPTS: ${{ matrix.testopts }}
 
     runs-on: macos-14
 
@@ -146,6 +148,7 @@ jobs:
           make -s ${{ matrix.test_task }} ${TESTS:+TESTS="$TESTS"}
           RUN_OPTS="$RUN_OPTS"
           SPECOPTS="$SPECOPTS"
+          TESTOPTS="$TESTOPTS"
         timeout-minutes: 60
         env:
           RUBY_TESTOPTS: '-q --tty=no'
diff --git a/.github/workflows/zjit-ubuntu.yml b/.github/workflows/zjit-ubuntu.yml
index ecfd0ddeded21f..3e2db58f72b959 100644
--- a/.github/workflows/zjit-ubuntu.yml
+++ b/.github/workflows/zjit-ubuntu.yml
@@ -42,6 +42,7 @@ jobs:
 
           - test_task: 'zjit-test-all'
             configure: '--enable-zjit=dev'
+            testopts: '--seed=11831'
 
           - test_task: 'btest'
             configure: '--enable-zjit=dev'
@@ -51,6 +52,7 @@ jobs:
       RUN_OPTS: ${{ matrix.zjit_opts }}
       YJIT_BENCH_OPTS: ${{ matrix.yjit_bench_opts }}
       SPECOPTS: ${{ matrix.specopts }}
+      TESTOPTS: ${{ matrix.testopts }}
       RUBY_DEBUG: ci
       BUNDLE_JOBS: 8 # for yjit-bench
       RUST_BACKTRACE: 1
@@ -167,6 +169,7 @@ jobs:
         run: >-
           make -s ${{ matrix.test_task }} ${TESTS:+TESTS="$TESTS"}
           RUN_OPTS="$RUN_OPTS" MSPECOPT=--debug SPECOPTS="$SPECOPTS"
+          TESTOPTS="$TESTOPTS"
           ZJIT_BINDGEN_DIFF_OPTS="$ZJIT_BINDGEN_DIFF_OPTS"
         timeout-minutes: 90
         env:
diff --git a/Cargo.toml b/Cargo.toml
index 6ab7e3a45f7c7d..6f61e5f95cfd35 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,7 +22,8 @@ path = "jit.rs"
 
 [features]
 disasm = ["yjit?/disasm", "zjit?/disasm"]
-runtime_checks = []
+# TODO(GH-642) Turning this on trips a btest failure.
+runtime_checks = [] # ["yjit?/runtime_checks", "zjit?/runtime_checks"]
 yjit = [ "dep:yjit" ]
 zjit = [ "dep:zjit" ]
 
diff --git a/configure.ac b/configure.ac
index 4766b8813cae21..8e08b91edbbc32 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4014,10 +4014,16 @@ AS_IF([test x"$JIT_CARGO_SUPPORT" != "xno" -o \( x"$YJIT_SUPPORT" != "xno" -a x"
     YJIT_LIBS=
     ZJIT_LIBS=
 
+    # There's more processing below to get the feature set for the
+    # top-level crate, so capture at this point for feature set of
+    # just the zjit crate.
+    ZJIT_TEST_FEATURES="${rb_cargo_features}"
+
     AS_IF([test x"${YJIT_SUPPORT}" != x"no"], [
         rb_cargo_features="$rb_cargo_features,yjit"
     ])
     AS_IF([test x"${ZJIT_SUPPORT}" != x"no"], [
+        AC_SUBST(ZJIT_TEST_FEATURES)
         rb_cargo_features="$rb_cargo_features,zjit"
     ])
     # if YJIT and ZJIT release mode
diff --git a/prism/config.yml b/prism/config.yml
index cb154500b3fa09..257bd389ed02ab 100644
--- a/prism/config.yml
+++ b/prism/config.yml
@@ -101,6 +101,8 @@ errors:
   - EXPECT_FOR_DELIMITER
   - EXPECT_IDENT_REQ_PARAMETER
   - EXPECT_IN_DELIMITER
+  - EXPECT_LPAREN_AFTER_NOT_LPAREN
+  - EXPECT_LPAREN_AFTER_NOT_OTHER
   - EXPECT_LPAREN_REQ_PARAMETER
   - EXPECT_MESSAGE
   - EXPECT_RBRACKET
diff --git a/prism/prism.c b/prism/prism.c
index ffe874fe08b475..85647020d832cb 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -13142,14 +13142,6 @@ match8(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2,
     return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5) || match1(parser, type6) || match1(parser, type7) || match1(parser, type8);
 }
 
-/**
- * Returns true if the current token is any of the nine given types.
- */
-static inline bool
-match9(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5, pm_token_type_t type6, pm_token_type_t type7, pm_token_type_t type8, pm_token_type_t type9) {
-    return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5) || match1(parser, type6) || match1(parser, type7) || match1(parser, type8) || match1(parser, type9);
-}
-
 /**
  * If the current token is of the specified type, lex forward by one token and
  * return true. Otherwise, return false. For example:
@@ -17412,6 +17404,14 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm
             // If we found a label, we need to immediately return to the caller.
             if (pm_symbol_node_label_p(node)) return node;
 
+            // Call nodes (arithmetic operations) are not allowed in patterns
+            if (PM_NODE_TYPE(node) == PM_CALL_NODE) {
+                pm_parser_err_node(parser, node, diag_id);
+                pm_missing_node_t *missing_node = pm_missing_node_create(parser, node->location.start, node->location.end);
+                pm_node_destroy(parser, node);
+                return (pm_node_t *) missing_node;
+            }
+
             // Now that we have a primitive, we need to check if it's part of a range.
             if (accept2(parser, PM_TOKEN_DOT_DOT, PM_TOKEN_DOT_DOT_DOT)) {
                 pm_token_t operator = parser->previous;
@@ -17694,7 +17694,7 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag
         // Gather up all of the patterns into the list.
         while (accept1(parser, PM_TOKEN_COMMA)) {
             // Break early here in case we have a trailing comma.
-            if (match9(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE, PM_TOKEN_EOF,PM_TOKEN_KEYWORD_AND, PM_TOKEN_KEYWORD_OR)) {
+            if (match7(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_SEMICOLON, PM_TOKEN_KEYWORD_AND, PM_TOKEN_KEYWORD_OR)) {
                 node = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous);
                 pm_node_list_append(&nodes, node);
                 trailing_rest = true;
@@ -19775,6 +19775,20 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
             pm_arguments_t arguments = { 0 };
             pm_node_t *receiver = NULL;
 
+            // If we do not accept a command call, then we also do not accept a
+            // not without parentheses. In this case we need to reject this
+            // syntax.
+            if (!accepts_command_call && !match1(parser, PM_TOKEN_PARENTHESIS_LEFT)) {
+                if (match1(parser, PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES)) {
+                    pm_parser_err(parser, parser->previous.end, parser->previous.end + 1, PM_ERR_EXPECT_LPAREN_AFTER_NOT_LPAREN);
+                } else {
+                    accept1(parser, PM_TOKEN_NEWLINE);
+                    pm_parser_err_current(parser, PM_ERR_EXPECT_LPAREN_AFTER_NOT_OTHER);
+                }
+
+                return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end);
+            }
+
             accept1(parser, PM_TOKEN_NEWLINE);
 
             if (accept1(parser, PM_TOKEN_PARENTHESIS_LEFT)) {
diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb
index ce98dc5acd5163..9a30a57e3b0eca 100644
--- a/prism/templates/src/diagnostic.c.erb
+++ b/prism/templates/src/diagnostic.c.erb
@@ -184,6 +184,8 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
     [PM_ERR_EXPECT_FOR_DELIMITER]               = { "unexpected %s; expected a 'do', newline, or ';' after the 'for' loop collection", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_EXPECT_IDENT_REQ_PARAMETER]         = { "expected an identifier for the required parameter", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_EXPECT_IN_DELIMITER]                = { "expected a delimiter after the patterns of an `in` clause", PM_ERROR_LEVEL_SYNTAX },
+    [PM_ERR_EXPECT_LPAREN_AFTER_NOT_LPAREN]     = { "expected a `(` immediately after `not`", PM_ERROR_LEVEL_SYNTAX },
+    [PM_ERR_EXPECT_LPAREN_AFTER_NOT_OTHER]      = { "expected a `(` after `not`", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_EXPECT_LPAREN_REQ_PARAMETER]        = { "expected a `(` to start a required parameter", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_EXPECT_MESSAGE]                     = { "unexpected %s; expecting a message to send to the receiver", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_EXPECT_RBRACKET]                    = { "expected a matching `]`", PM_ERROR_LEVEL_SYNTAX },
diff --git a/template/Makefile.in b/template/Makefile.in
index 96c8d8031bcea0..1e6d55c435e95b 100644
--- a/template/Makefile.in
+++ b/template/Makefile.in
@@ -113,6 +113,7 @@ ZJIT_OBJ=@ZJIT_OBJ@
 JIT_CARGO_SUPPORT=@JIT_CARGO_SUPPORT@
 CARGO_TARGET_DIR=@abs_top_builddir@/target
 CARGO_BUILD_ARGS=@CARGO_BUILD_ARGS@
+ZJIT_TEST_FEATURES=@ZJIT_TEST_FEATURES@
 RUST_LIB=@RUST_LIB@
 RUST_LIBOBJ = $(RUST_LIB:.a=.@OBJEXT@)
 LDFLAGS = @STATIC@ $(CFLAGS) @LDFLAGS@
diff --git a/test/prism/errors/command_calls_31.txt b/test/prism/errors/command_calls_31.txt
new file mode 100644
index 00000000000000..e662b254444821
--- /dev/null
+++ b/test/prism/errors/command_calls_31.txt
@@ -0,0 +1,17 @@
+true && not true
+            ^~~~ expected a `(` after `not`
+            ^~~~ unexpected 'true', expecting end-of-input
+
+true || not true
+            ^~~~ expected a `(` after `not`
+            ^~~~ unexpected 'true', expecting end-of-input
+
+true && not (true)
+           ^ expected a `(` immediately after `not`
+            ^ unexpected '(', expecting end-of-input
+
+true && not
+true
+^~~~ expected a `(` after `not`
+^~~~ unexpected 'true', expecting end-of-input
+
diff --git a/test/prism/errors/pattern_arithmetic_expressions.txt b/test/prism/errors/pattern_arithmetic_expressions.txt
new file mode 100644
index 00000000000000..cfb36505312584
--- /dev/null
+++ b/test/prism/errors/pattern_arithmetic_expressions.txt
@@ -0,0 +1,3 @@
+case 1; in -1**2; end
+           ^~~~~ expected a pattern expression after the `in` keyword
+
diff --git a/test/prism/errors/pattern_match_implicit_rest.txt b/test/prism/errors/pattern_match_implicit_rest.txt
new file mode 100644
index 00000000000000..8602c0add06ce5
--- /dev/null
+++ b/test/prism/errors/pattern_match_implicit_rest.txt
@@ -0,0 +1,3 @@
+a=>b, *,
+        ^ expected a pattern expression after `,`
+
diff --git a/test/ruby/test_process.rb b/test/ruby/test_process.rb
index 5497b182f76b8a..f1894ab0c30f8a 100644
--- a/test/ruby/test_process.rb
+++ b/test/ruby/test_process.rb
@@ -1682,9 +1682,10 @@ def test_uid_from_name
       if u = Etc.getpwuid(Process.uid)
         assert_equal(Process.uid, Process::UID.from_name(u.name), u.name)
       end
-      assert_raise_with_message(ArgumentError, /\u{4e0d 5b58 5728}/) {
+      exc = assert_raise_kind_of(ArgumentError, SystemCallError) {
         Process::UID.from_name("\u{4e0d 5b58 5728}")
       }
+      assert_match(/\u{4e0d 5b58 5728}/, exc.message) if exc.is_a?(ArgumentError)
     end
   end
 
diff --git a/test/ruby/test_zjit.rb b/test/ruby/test_zjit.rb
index fdcee3cafd858e..299600eef95d53 100644
--- a/test/ruby/test_zjit.rb
+++ b/test/ruby/test_zjit.rb
@@ -999,6 +999,26 @@ def test_require_rubygems_with_auto_compact
     }, call_threshold: 2
   end
 
+  def test_profile_under_nested_jit_call
+    assert_compiles '[nil, nil, 3]', %q{
+      def profile
+        1 + 2
+      end
+
+      def jit_call(flag)
+        if flag
+          profile
+        end
+      end
+
+      def entry(flag)
+        jit_call(flag)
+      end
+
+      [entry(false), entry(false), entry(true)]
+    }, call_threshold: 2
+  end
+
   def test_bop_redefinition
     assert_runs '[3, :+, 100]', %q{
       def test
diff --git a/zjit.c b/zjit.c
index 0c0c85d3a62bee..61c17d32c30751 100644
--- a/zjit.c
+++ b/zjit.c
@@ -295,6 +295,18 @@ rb_zjit_profile_disable(const rb_iseq_t *iseq)
     }
 }
 
+// Update a YARV instruction to a given opcode (to disable ZJIT profiling).
+void
+rb_zjit_iseq_insn_set(const rb_iseq_t *iseq, unsigned int insn_idx, enum ruby_vminsn_type bare_insn)
+{
+#if RUBY_DEBUG
+    int insn = rb_vm_insn_addr2opcode((void *)iseq->body->iseq_encoded[insn_idx]);
+    RUBY_ASSERT(vm_zjit_insn_to_bare_insn(insn) == (int)bare_insn);
+#endif
+    const void *const *insn_table = rb_vm_get_insns_address_table();
+    iseq->body->iseq_encoded[insn_idx] = (VALUE)insn_table[bare_insn];
+}
+
 // Get profiling information for ISEQ
 void *
 rb_iseq_get_zjit_payload(const rb_iseq_t *iseq)
diff --git a/zjit/Cargo.toml b/zjit/Cargo.toml
index a86117d6e2f08e..a1da8e7cc0990a 100644
--- a/zjit/Cargo.toml
+++ b/zjit/Cargo.toml
@@ -18,3 +18,4 @@ expect-test = "1.5.1"
 [features]
 # Support --yjit-dump-disasm and RubyVM::YJIT.disasm using libcapstone.
 disasm = ["capstone"]
+runtime_checks = []
diff --git a/zjit/bindgen/src/main.rs b/zjit/bindgen/src/main.rs
index 91de6dcd8d4567..dd167e9eb07ac3 100644
--- a/zjit/bindgen/src/main.rs
+++ b/zjit/bindgen/src/main.rs
@@ -335,6 +335,7 @@ fn main() {
         .allowlist_function("rb_zjit_get_page_size")
         .allowlist_function("rb_zjit_iseq_builtin_attrs")
         .allowlist_function("rb_zjit_iseq_inspect")
+        .allowlist_function("rb_zjit_iseq_insn_set")
         .allowlist_function("rb_set_cfp_(pc|sp)")
         .allowlist_function("rb_c_method_tracing_currently_enabled")
         .allowlist_function("rb_full_cfunc_return")
diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs
index ac5c9908f55c43..149d09813b2018 100644
--- a/zjit/src/backend/arm64/mod.rs
+++ b/zjit/src/backend/arm64/mod.rs
@@ -415,26 +415,36 @@ impl Assembler
             // being used. It is okay not to use their output here.
             #[allow(unused_must_use)]
             match &mut insn {
-                Insn::Sub { left, right, out } |
                 Insn::Add { left, right, out } => {
                     match (*left, *right) {
-                        (Opnd::Reg(_) | Opnd::VReg { .. }, Opnd::Reg(_) | Opnd::VReg { .. }) => {
-                            merge_three_reg_mov(&live_ranges, &mut iterator, left, right, out);
-                            asm.push_insn(insn);
-                        },
+                        // When one operand is a register, legalize the other operand
+                        // into possibly an immdiate and swap the order if necessary.
+                        // Only the rhs of ADD can be an immediate, but addition is commutative.
                         (reg_opnd @ (Opnd::Reg(_) | Opnd::VReg { .. }), other_opnd) |
                         (other_opnd, reg_opnd @ (Opnd::Reg(_) | Opnd::VReg { .. })) => {
                             *left = reg_opnd;
                             *right = split_shifted_immediate(asm, other_opnd);
+                            // Now `right` is either a register or an immediate, both can try to
+                            // merge with a subsequent mov.
+                            merge_three_reg_mov(&live_ranges, &mut iterator, left, left, out);
                             asm.push_insn(insn);
-                        },
+                        }
                         _ => {
                             *left = split_load_operand(asm, *left);
                             *right = split_shifted_immediate(asm, *right);
+                            merge_three_reg_mov(&live_ranges, &mut iterator, left, right, out);
                             asm.push_insn(insn);
                         }
                     }
-                },
+                }
+                Insn::Sub { left, right, out } => {
+                    *left = split_load_operand(asm, *left);
+                    *right = split_shifted_immediate(asm, *right);
+                    // Now `right` is either a register or an immediate,
+                    // both can try to merge with a subsequent mov.
+                    merge_three_reg_mov(&live_ranges, &mut iterator, left, left, out);
+                    asm.push_insn(insn);
+                }
                 Insn::And { left, right, out } |
                 Insn::Or { left, right, out } |
                 Insn::Xor { left, right, out } => {
@@ -919,10 +929,28 @@ impl Assembler
                     ldp_post(cb, X29, X30, A64Opnd::new_mem(128, C_SP_REG, 16));
                 },
                 Insn::Add { left, right, out } => {
-                    adds(cb, out.into(), left.into(), right.into());
+                    // Usually, we issue ADDS, so you could branch on overflow, but ADDS with
+                    // out=31 refers to out=XZR, which discards the sum. So, instead of ADDS
+                    // (aliased to CMN in this case) we issue ADD instead which writes the sum
+                    // to the stack pointer; we assume you got x31 from NATIVE_STACK_POINTER.
+                    let out: A64Opnd = out.into();
+                    if let A64Opnd::Reg(A64Reg { reg_no: 31, .. }) = out {
+                        add(cb, out, left.into(), right.into());
+                    } else {
+                        adds(cb, out, left.into(), right.into());
+                    }
                 },
                 Insn::Sub { left, right, out } => {
-                    subs(cb, out.into(), left.into(), right.into());
+                    // Usually, we issue SUBS, so you could branch on overflow, but SUBS with
+                    // out=31 refers to out=XZR, which discards the result. So, instead of SUBS
+                    // (aliased to CMP in this case) we issue SUB instead which writes the diff
+                    // to the stack pointer; we assume you got x31 from NATIVE_STACK_POINTER.
+                    let out: A64Opnd = out.into();
+                    if let A64Opnd::Reg(A64Reg { reg_no: 31, .. }) = out {
+                        sub(cb, out, left.into(), right.into());
+                    } else {
+                        subs(cb, out, left.into(), right.into());
+                    }
                 },
                 Insn::Mul { left, right, out } => {
                     // If the next instruction is jo (jump on overflow)
@@ -1363,12 +1391,42 @@ mod tests {
         asm.mov(sp, new_sp);
         let new_sp = asm.sub(sp, 0x20.into());
         asm.mov(sp, new_sp);
-        asm.compile_with_num_regs(&mut cb, 2);
 
-        assert_disasm!(cb, "e08300b11f000091e08300f11f000091", {"
+        asm.compile_with_num_regs(&mut cb, 2);
+        assert_disasm!(cb, "ff830091ff8300d1", "
             0x0: add sp, sp, #0x20
             0x4: sub sp, sp, #0x20
-        "});
+        ");
+    }
+
+    #[test]
+    fn add_into() {
+        let (mut asm, mut cb) = setup_asm();
+
+        let sp = Opnd::Reg(XZR_REG);
+        asm.add_into(sp, 8.into());
+        asm.add_into(Opnd::Reg(X20_REG), 0x20.into());
+
+        asm.compile_with_num_regs(&mut cb, 0);
+        assert_disasm!(cb, "ff230091948200b1", "
+            0x0: add sp, sp, #8
+            0x4: adds x20, x20, #0x20
+        ");
+    }
+
+    #[test]
+    fn sub_imm_reg() {
+        let (mut asm, mut cb) = setup_asm();
+
+        let difference = asm.sub(0x8.into(), Opnd::Reg(X5_REG));
+        asm.load_into(Opnd::Reg(X1_REG), difference);
+
+        asm.compile_with_num_regs(&mut cb, 1);
+        assert_disasm!(cb, "000180d2000005ebe10300aa", "
+            0x0: mov x0, #8
+            0x4: subs x0, x0, x5
+            0x8: mov x1, x0
+        ");
     }
 
     #[test]
diff --git a/zjit/src/backend/lir.rs b/zjit/src/backend/lir.rs
index e5453e4d554b67..94c53569b4e95c 100644
--- a/zjit/src/backend/lir.rs
+++ b/zjit/src/backend/lir.rs
@@ -270,6 +270,14 @@ impl From<VALUE> for Opnd {
     }
 }
 
+/// Set of things we need to restore for side exits.
+#[derive(Clone, Debug)]
+pub struct SideExitContext {
+    pub pc: *const VALUE,
+    pub stack: Vec<Opnd>,
+    pub locals: Vec<Opnd>,
+}
+
 /// Branch target (something that we can jump to)
 /// for branch instructions
 #[derive(Clone, Debug)]
@@ -281,12 +289,14 @@ pub enum Target
     Label(Label),
     /// Side exit to the interpreter
     SideExit {
-        pc: *const VALUE,
-        stack: Vec<Opnd>,
-        locals: Vec<Opnd>,
-        c_stack_bytes: usize,
+        /// Context to restore on regular side exits. None for side exits right
+        /// after JIT-to-JIT calls because we restore them before the JIT call.
+        context: Option<SideExitContext>,
+        /// We use this to enrich asm comments.
         reason: SideExitReason,
-        // Some if the side exit should write this label. We use it for patch points.
+        /// The number of bytes we need to adjust the C stack pointer by.
+        c_stack_bytes: usize,
+        /// Some if the side exit should write this label. We use it for patch points.
         label: Option<Label>,
     },
 }
@@ -767,7 +777,7 @@ impl<'a> Iterator for InsnOpndIterator<'a> {
             Insn::Label(target) |
             Insn::LeaJumpTarget { target, .. } |
             Insn::PatchPoint(target) => {
-                if let Target::SideExit { stack, locals, .. } = target {
+                if let Target::SideExit { context: Some(SideExitContext { stack, locals, .. }), .. } = target {
                     let stack_idx = self.idx;
                     if stack_idx < stack.len() {
                         let opnd = &stack[stack_idx];
@@ -792,7 +802,7 @@ impl<'a> Iterator for InsnOpndIterator<'a> {
                     return Some(opnd);
                 }
 
-                if let Target::SideExit { stack, locals, .. } = target {
+                if let Target::SideExit { context: Some(SideExitContext { stack, locals, .. }), .. } = target {
                     let stack_idx = self.idx - 1;
                     if stack_idx < stack.len() {
                         let opnd = &stack[stack_idx];
@@ -923,7 +933,7 @@ impl<'a> InsnOpndMutIterator<'a> {
             Insn::Label(target) |
             Insn::LeaJumpTarget { target, .. } |
             Insn::PatchPoint(target) => {
-                if let Target::SideExit { stack, locals, .. } = target {
+                if let Target::SideExit { context: Some(SideExitContext { stack, locals, .. }), .. } = target {
                     let stack_idx = self.idx;
                     if stack_idx < stack.len() {
                         let opnd = &mut stack[stack_idx];
@@ -948,7 +958,7 @@ impl<'a> InsnOpndMutIterator<'a> {
                     return Some(opnd);
                 }
 
-                if let Target::SideExit { stack, locals, .. } = target {
+                if let Target::SideExit { context: Some(SideExitContext { stack, locals, .. }), .. } = target {
                     let stack_idx = self.idx - 1;
                     if stack_idx < stack.len() {
                         let opnd = &mut stack[stack_idx];
@@ -1803,7 +1813,7 @@ impl Assembler
         for (idx, target) in targets {
             // Compile a side exit. Note that this is past the split pass and alloc_regs(),
             // so you can't use a VReg or an instruction that needs to be split.
-            if let Target::SideExit { pc, stack, locals, c_stack_bytes, reason, label } = target {
+            if let Target::SideExit { context, reason, c_stack_bytes, label } = target {
                 asm_comment!(self, "Exit: {reason}");
                 let side_exit_label = if let Some(label) = label {
                     Target::Label(label)
@@ -1823,26 +1833,30 @@ impl Assembler
                     }
                 }
 
-                asm_comment!(self, "write stack slots: {stack:?}");
-                for (idx, &opnd) in stack.iter().enumerate() {
-                    let opnd = split_store_source(self, opnd);
-                    self.store(Opnd::mem(64, SP, idx as i32 * SIZEOF_VALUE_I32), opnd);
-                }
+                // Restore the PC and the stack for regular side exits. We don't do this for
+                // side exits right after JIT-to-JIT calls, which restore them before the call.
+                if let Some(SideExitContext { pc, stack, locals }) = context {
+                    asm_comment!(self, "write stack slots: {stack:?}");
+                    for (idx, &opnd) in stack.iter().enumerate() {
+                        let opnd = split_store_source(self, opnd);
+                        self.store(Opnd::mem(64, SP, idx as i32 * SIZEOF_VALUE_I32), opnd);
+                    }
 
-                asm_comment!(self, "write locals: {locals:?}");
-                for (idx, &opnd) in locals.iter().enumerate() {
-                    let opnd = split_store_source(self, opnd);
-                    self.store(Opnd::mem(64, SP, (-local_size_and_idx_to_ep_offset(locals.len(), idx) - 1) * SIZEOF_VALUE_I32), opnd);
-                }
+                    asm_comment!(self, "write locals: {locals:?}");
+                    for (idx, &opnd) in locals.iter().enumerate() {
+                        let opnd = split_store_source(self, opnd);
+                        self.store(Opnd::mem(64, SP, (-local_size_and_idx_to_ep_offset(locals.len(), idx) - 1) * SIZEOF_VALUE_I32), opnd);
+                    }
 
-                asm_comment!(self, "save cfp->pc");
-                self.load_into(Opnd::Reg(Assembler::SCRATCH_REG), Opnd::const_ptr(pc));
-                self.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_PC), Opnd::Reg(Assembler::SCRATCH_REG));
+                    asm_comment!(self, "save cfp->pc");
+                    self.load_into(Opnd::Reg(Assembler::SCRATCH_REG), Opnd::const_ptr(pc));
+                    self.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_PC), Opnd::Reg(Assembler::SCRATCH_REG));
 
-                asm_comment!(self, "save cfp->sp");
-                self.lea_into(Opnd::Reg(Assembler::SCRATCH_REG), Opnd::mem(64, SP, stack.len() as i32 * SIZEOF_VALUE_I32));
-                let cfp_sp = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP);
-                self.store(cfp_sp, Opnd::Reg(Assembler::SCRATCH_REG));
+                    asm_comment!(self, "save cfp->sp");
+                    self.lea_into(Opnd::Reg(Assembler::SCRATCH_REG), Opnd::mem(64, SP, stack.len() as i32 * SIZEOF_VALUE_I32));
+                    let cfp_sp = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP);
+                    self.store(cfp_sp, Opnd::Reg(Assembler::SCRATCH_REG));
+                }
 
                 if c_stack_bytes > 0 {
                     asm_comment!(self, "restore C stack pointer");
diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs
index ce3decd56f319b..f67757038e9b80 100644
--- a/zjit/src/codegen.rs
+++ b/zjit/src/codegen.rs
@@ -7,7 +7,7 @@ use crate::invariants::track_bop_assumption;
 use crate::gc::get_or_create_iseq_payload;
 use crate::state::ZJITState;
 use crate::{asm::CodeBlock, cruby::*, options::debug, virtualmem::CodePtr};
-use crate::backend::lir::{self, asm_comment, asm_ccall, Assembler, Opnd, Target, CFP, C_ARG_OPNDS, C_RET_OPND, EC, NATIVE_STACK_PTR, SP};
+use crate::backend::lir::{self, asm_comment, asm_ccall, Assembler, Opnd, SideExitContext, Target, CFP, C_ARG_OPNDS, C_RET_OPND, EC, NATIVE_STACK_PTR, SP};
 use crate::hir::{iseq_to_hir, Block, BlockId, BranchEdge, CallInfo, Invariant, RangeType, SideExitReason, SideExitReason::*, SpecialObjectType, SELF_PARAM_IDX};
 use crate::hir::{Const, FrameState, Function, Insn, InsnId};
 use crate::hir_type::{types::Fixnum, Type};
@@ -156,10 +156,6 @@ fn gen_entry(cb: &mut CodeBlock, iseq: IseqPtr, function: &Function, function_pt
 
     // Restore registers for CFP, EC, and SP after use
     asm_comment!(asm, "exit to the interpreter");
-    // On x86_64, maintain 16-byte stack alignment
-    if cfg!(target_arch = "x86_64") {
-        asm.cpop_into(SP);
-    }
     asm.cpop_into(SP);
     asm.cpop_into(EC);
     asm.cpop_into(CFP);
@@ -511,10 +507,6 @@ fn gen_entry_prologue(asm: &mut Assembler, iseq: IseqPtr) {
     asm.cpush(CFP);
     asm.cpush(EC);
     asm.cpush(SP);
-    // On x86_64, maintain 16-byte stack alignment
-    if cfg!(target_arch = "x86_64") {
-        asm.cpush(SP);
-    }
 
     // EC and CFP are passed as arguments
     asm.mov(EC, C_ARG_OPNDS[0]);
@@ -782,7 +774,8 @@ fn gen_send_without_block_direct(
     // TODO: Let side exit code pop all JIT frames to optimize away this cmp + je.
     asm_comment!(asm, "side-exit if callee side-exits");
     asm.cmp(ret, Qundef.into());
-    asm.je(ZJITState::get_exit_trampoline().into());
+    // Restore the C stack pointer on exit
+    asm.je(Target::SideExit { context: None, reason: CalleeSideExit, c_stack_bytes: jit.c_stack_bytes, label: None });
 
     asm_comment!(asm, "restore SP register for the caller");
     let new_sp = asm.sub(SP, sp_offset.into());
@@ -1120,11 +1113,13 @@ fn build_side_exit(jit: &mut JITState, state: &FrameState, reason: SideExitReaso
     }
 
     let target = Target::SideExit {
-        pc: state.pc,
-        stack,
-        locals,
-        c_stack_bytes: jit.c_stack_bytes,
+        context: Some(SideExitContext {
+            pc: state.pc,
+            stack,
+            locals,
+        }),
         reason,
+        c_stack_bytes: jit.c_stack_bytes,
         label,
     };
     Some(target)
@@ -1157,12 +1152,19 @@ fn max_num_params(function: &Function) -> usize {
 /// the function needs to allocate on the stack for the stack frame.
 fn aligned_stack_bytes(num_slots: usize) -> usize {
     // Both x86_64 and arm64 require the stack to be aligned to 16 bytes.
-    // Since SIZEOF_VALUE is 8 bytes, we need to round up the size to the nearest even number.
-    let num_slots = if num_slots % 2 == 0 {
-        num_slots
-    } else {
+    let num_slots = if cfg!(target_arch = "x86_64") && num_slots % 2 == 0 {
+        // On x86_64, since the call instruction bumps the stack pointer by 8 bytes on entry,
+        // we need to round up `num_slots` to an odd number.
+        num_slots + 1
+    } else if cfg!(target_arch = "aarch64") && num_slots % 2 == 1 {
+        // On arm64, the stack pointer is always aligned to 16 bytes, so we need to round up
+        // `num_slots`` to an even number.
         num_slots + 1
+    } else {
+        num_slots
     };
+
+    const { assert!(SIZEOF_VALUE == 8, "aligned_stack_bytes() assumes SIZEOF_VALUE == 8"); }
     num_slots * SIZEOF_VALUE
 }
 
diff --git a/zjit/src/cruby_bindings.inc.rs b/zjit/src/cruby_bindings.inc.rs
index 6ba13f9b2747fc..44e544e972f58f 100644
--- a/zjit/src/cruby_bindings.inc.rs
+++ b/zjit/src/cruby_bindings.inc.rs
@@ -929,6 +929,11 @@ unsafe extern "C" {
         file: *const ::std::os::raw::c_char,
         line: ::std::os::raw::c_int,
     );
+    pub fn rb_zjit_iseq_insn_set(
+        iseq: *const rb_iseq_t,
+        insn_idx: ::std::os::raw::c_uint,
+        bare_insn: ruby_vminsn_type,
+    );
     pub fn rb_iseq_get_zjit_payload(iseq: *const rb_iseq_t) -> *mut ::std::os::raw::c_void;
     pub fn rb_iseq_set_zjit_payload(iseq: *const rb_iseq_t, payload: *mut ::std::os::raw::c_void);
     pub fn rb_zjit_print_exception();
diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs
index 598774de8cff72..1c6b55a9ff3a21 100644
--- a/zjit/src/hir.rs
+++ b/zjit/src/hir.rs
@@ -414,6 +414,7 @@ pub enum SideExitReason {
     GuardType(Type),
     GuardBitEquals(VALUE),
     PatchPoint(Invariant),
+    CalleeSideExit,
 }
 
 impl std::fmt::Display for SideExitReason {
diff --git a/zjit/src/options.rs b/zjit/src/options.rs
index 476640d11feb0b..68bbca07a470fe 100644
--- a/zjit/src/options.rs
+++ b/zjit/src/options.rs
@@ -17,7 +17,7 @@ pub static mut rb_zjit_call_threshold: u64 = 2;
 #[derive(Clone, Copy, Debug)]
 pub struct Options {
     /// Number of times YARV instructions should be profiled.
-    pub num_profiles: u64,
+    pub num_profiles: u8,
 
     /// Enable debug logging
     pub debug: bool,
@@ -51,7 +51,7 @@ pub fn init_options() -> Options {
 /// Note that --help allows only 80 chars per line, including indentation.    80-char limit --> |
 pub const ZJIT_OPTIONS: &'static [(&str, &str)] = &[
     ("--zjit-call-threshold=num", "Number of calls to trigger JIT (default: 2)."),
-    ("--zjit-num-profiles=num",   "Number of profiled calls before JIT (default: 1)."),
+    ("--zjit-num-profiles=num",   "Number of profiled calls before JIT (default: 1, max: 255)."),
 ];
 
 #[derive(Clone, Copy, Debug)]
@@ -158,7 +158,7 @@ fn update_profile_threshold(options: &Options) {
             rb_zjit_profile_threshold = 0;
         } else {
             // Otherwise, profile instructions at least once.
-            rb_zjit_profile_threshold = rb_zjit_call_threshold.saturating_sub(options.num_profiles).max(1);
+            rb_zjit_profile_threshold = rb_zjit_call_threshold.saturating_sub(options.num_profiles as u64).max(1);
         }
     }
 }
diff --git a/zjit/src/profile.rs b/zjit/src/profile.rs
index de1455f788673f..2d0c7a4b9d3cb0 100644
--- a/zjit/src/profile.rs
+++ b/zjit/src/profile.rs
@@ -1,7 +1,7 @@
 // We use the YARV bytecode constants which have a CRuby-style name
 #![allow(non_upper_case_globals)]
 
-use crate::{cruby::*, gc::get_or_create_iseq_payload, hir_type::{types::{Empty, Fixnum}, Type}};
+use crate::{cruby::*, gc::get_or_create_iseq_payload, hir_type::{types::{Empty, Fixnum}, Type}, options::get_option};
 
 /// Ephemeral state for profiling runtime information
 struct Profiler {
@@ -38,43 +38,49 @@ impl Profiler {
 
 /// API called from zjit_* instruction. opcode is the bare (non-zjit_*) instruction.
 #[unsafe(no_mangle)]
-pub extern "C" fn rb_zjit_profile_insn(opcode: ruby_vminsn_type, ec: EcPtr) {
+pub extern "C" fn rb_zjit_profile_insn(bare_opcode: ruby_vminsn_type, ec: EcPtr) {
     with_vm_lock(src_loc!(), || {
         let mut profiler = Profiler::new(ec);
-        profile_insn(&mut profiler, opcode);
+        profile_insn(&mut profiler, bare_opcode);
     });
 }
 
 /// Profile a YARV instruction
-fn profile_insn(profiler: &mut Profiler, opcode: ruby_vminsn_type) {
-    match opcode {
-        YARVINSN_opt_nil_p => profile_operands(profiler, 1),
-        YARVINSN_opt_plus  => profile_operands(profiler, 2),
-        YARVINSN_opt_minus => profile_operands(profiler, 2),
-        YARVINSN_opt_mult  => profile_operands(profiler, 2),
-        YARVINSN_opt_div   => profile_operands(profiler, 2),
-        YARVINSN_opt_mod   => profile_operands(profiler, 2),
-        YARVINSN_opt_eq    => profile_operands(profiler, 2),
-        YARVINSN_opt_neq   => profile_operands(profiler, 2),
-        YARVINSN_opt_lt    => profile_operands(profiler, 2),
-        YARVINSN_opt_le    => profile_operands(profiler, 2),
-        YARVINSN_opt_gt    => profile_operands(profiler, 2),
-        YARVINSN_opt_ge    => profile_operands(profiler, 2),
-        YARVINSN_opt_and   => profile_operands(profiler, 2),
-        YARVINSN_opt_or    => profile_operands(profiler, 2),
+fn profile_insn(profiler: &mut Profiler, bare_opcode: ruby_vminsn_type) {
+    let profile = &mut get_or_create_iseq_payload(profiler.iseq).profile;
+    match bare_opcode {
+        YARVINSN_opt_nil_p => profile_operands(profiler, profile, 1),
+        YARVINSN_opt_plus  => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_minus => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_mult  => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_div   => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_mod   => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_eq    => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_neq   => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_lt    => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_le    => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_gt    => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_ge    => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_and   => profile_operands(profiler, profile, 2),
+        YARVINSN_opt_or    => profile_operands(profiler, profile, 2),
         YARVINSN_opt_send_without_block => {
             let cd: *const rb_call_data = profiler.insn_opnd(0).as_ptr();
             let argc = unsafe { vm_ci_argc((*cd).ci) };
             // Profile all the arguments and self (+1).
-            profile_operands(profiler, (argc + 1) as usize);
+            profile_operands(profiler, profile, (argc + 1) as usize);
         }
         _ => {}
     }
+
+    // Once we profile the instruction num_profiles times, we stop profiling it.
+    profile.num_profiles[profiler.insn_idx] = profile.num_profiles[profiler.insn_idx].saturating_add(1);
+    if profile.num_profiles[profiler.insn_idx] == get_option!(num_profiles) {
+        unsafe { rb_zjit_iseq_insn_set(profiler.iseq, profiler.insn_idx as u32, bare_opcode); }
+    }
 }
 
 /// Profile the Type of top-`n` stack operands
-fn profile_operands(profiler: &mut Profiler, n: usize) {
-    let profile = &mut get_or_create_iseq_payload(profiler.iseq).profile;
+fn profile_operands(profiler: &mut Profiler, profile: &mut IseqProfile, n: usize) {
     let types = &mut profile.opnd_types[profiler.insn_idx];
     if types.len() <= n {
         types.resize(n, Empty);
@@ -89,11 +95,17 @@ fn profile_operands(profiler: &mut Profiler, n: usize) {
 pub struct IseqProfile {
     /// Type information of YARV instruction operands, indexed by the instruction index
     opnd_types: Vec<Vec<Type>>,
+
+    /// Number of profiled executions for each YARV instruction, indexed by the instruction index
+    num_profiles: Vec<u8>,
 }
 
 impl IseqProfile {
     pub fn new(iseq_size: u32) -> Self {
-        Self { opnd_types: vec![vec![]; iseq_size as usize] }
+        Self {
+            opnd_types: vec![vec![]; iseq_size as usize],
+            num_profiles: vec![0; iseq_size as usize],
+        }
     }
 
     /// Get profiled operand types for a given instruction index
diff --git a/zjit/zjit.mk b/zjit/zjit.mk
index 02f1b2b50d8e5d..047f4ab3a3cf50 100644
--- a/zjit/zjit.mk
+++ b/zjit/zjit.mk
@@ -56,7 +56,7 @@ zjit-check:
 
 .PHONY: zjit-test-all
 zjit-test-all:
-	$(MAKE) test-all RUST_BACKTRACE=1 TEST_EXCLUDES='--excludes-dir=$(top_srcdir)/test/.excludes-zjit --name=!/memory_leak/' RUN_OPTS='--zjit-call-threshold=1' TESTS='$(top_srcdir)/test/ruby/'
+	$(MAKE) test-all RUST_BACKTRACE=1 TEST_EXCLUDES='--excludes-dir=$(top_srcdir)/test/.excludes-zjit --name=!/memory_leak/' RUN_OPTS='--zjit-call-threshold=1' TESTS='$(top_srcdir)/test/ruby'
 
 ZJIT_BINDGEN_DIFF_OPTS =
 
@@ -68,6 +68,12 @@ zjit-bindgen: zjit.$(OBJEXT)
 	ZJIT_SRC_ROOT_PATH='$(top_srcdir)' BINDGEN_JIT_NAME=zjit $(CARGO) run --manifest-path '$(top_srcdir)/zjit/bindgen/Cargo.toml' -- $(CFLAGS) $(XCFLAGS) $(CPPFLAGS)
 	$(Q) if [ 'x$(HAVE_GIT)' = xyes ]; then $(GIT) -C "$(top_srcdir)" diff $(ZJIT_BINDGEN_DIFF_OPTS) zjit/src/cruby_bindings.inc.rs; fi
 
+# Build env should roughly match what's used for miniruby to help with caching.
+ZJIT_NEXTEST_ENV := RUBY_BUILD_DIR='$(TOP_BUILD_DIR)' \
+    RUBY_LD_FLAGS='$(LDFLAGS) $(XLDFLAGS) $(MAINLIBS)' \
+    MACOSX_DEPLOYMENT_TARGET=11.0 \
+    CARGO_TARGET_DIR='$(CARGO_TARGET_DIR)'
+
 # We need `cargo nextest` for its one-process-per execution execution model
 # since we can only boot the VM once per process. Normal `cargo test`
 # runs tests in threads and can't handle this.
@@ -75,10 +81,10 @@ zjit-bindgen: zjit.$(OBJEXT)
 # On darwin, it's available through `brew install cargo-nextest`. See
 # https://nexte.st/docs/installation/pre-built-binaries/ otherwise.
 zjit-test: libminiruby.a
-	RUBY_BUILD_DIR='$(TOP_BUILD_DIR)' \
-	    RUBY_LD_FLAGS='$(LDFLAGS) $(XLDFLAGS) $(MAINLIBS)' \
-	    CARGO_TARGET_DIR='$(CARGO_TARGET_DIR)' \
-	    $(CARGO) nextest run --manifest-path '$(top_srcdir)/zjit/Cargo.toml' $(ZJIT_TESTS)
+	$(ZJIT_NEXTEST_ENV) $(CARGO) nextest run \
+		--manifest-path '$(top_srcdir)/zjit/Cargo.toml' \
+		'--features=$(ZJIT_TEST_FEATURES)' \
+		$(ZJIT_TESTS)
 
 # Run a ZJIT test written with Rust #[test] under LLDB
 zjit-test-lldb: libminiruby.a
@@ -88,9 +94,7 @@ zjit-test-lldb: libminiruby.a
 		echo "Many tests only work when it's the only test in the process."; \
 		exit 1; \
 	    fi; \
-	    exe_path=`RUBY_BUILD_DIR='$(TOP_BUILD_DIR)' \
-	    RUBY_LD_FLAGS='$(LDFLAGS) $(XLDFLAGS) $(MAINLIBS)' \
-	    CARGO_TARGET_DIR='$(CARGO_TARGET_DIR)' \
+	    exe_path=`$(ZJIT_NEXTEST_ENV) \
 	    $(CARGO) nextest list --manifest-path '$(top_srcdir)/zjit/Cargo.toml' --message-format json --list-type=binaries-only | \
 	    $(BASERUBY) -rjson -e 'puts JSON.load(STDIN.read).dig("rust-binaries", "zjit", "binary-path")'`; \
 	    exec lldb $$exe_path -- --test-threads=1 $(ZJIT_TESTS)