From 44db4f165426c80dd20d7f35cf3007da9bb806db Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Fri, 13 Jun 2025 16:00:42 +0100 Subject: [PATCH 1/3] [ISel] Replace expensive mov from wzr by two moves via fpr --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 18 ++++++++++++------ .../CodeGen/AArch64/arm64-vector-insertion.ll | 15 ++++++++++----- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index f01b634f44ba6..347f7d4d3c1a3 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7356,16 +7356,10 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), (i64 0)), dsub)>; -def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), - (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>; def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), (EXTRACT_SUBREG (INSvi16gpr (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexH:$imm, WZR), dsub)>; -def : Pat<(vector_insert (v4f32 V128:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)), - (INSvi32gpr V128:$Rn, VectorIndexS:$imm, WZR)>; def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)), (EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>; -def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)), - (INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>; def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))), @@ -8035,6 +8029,18 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; +def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), + (INSvi16lane V128:$Rn, VectorIndexH:$imm, + (v8f16 (MOVIv2d_ns (i32 0))), (i64 0))>; + +def : 
Pat<(vector_insert (v4f32 V128:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)), + (INSvi32lane V128:$Rn, VectorIndexS:$imm, + (v4f32 (MOVIv2d_ns (i32 0))), (i64 0))>; + +def : Pat<(vector_insert (v2f64 V128:$Rn), (f64 fpimm0), (i64 VectorIndexD:$imm)), + (INSvi64lane V128:$Rn, VectorIndexD:$imm, + (v2f64 (MOVIv2d_ns (i32 0))), (i64 0))>; + let Predicates = [HasNEON] in { def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll index ff28c7817d143..3638c1b5005ad 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll @@ -172,8 +172,9 @@ define <8 x half> @test_insert_v8f16_insert_1(half %a) { ; CHECK-LABEL: test_insert_v8f16_insert_1: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: dup.8h v0, v0[0] -; CHECK-NEXT: mov.h v0[7], wzr +; CHECK-NEXT: mov.h v0[7], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <8 x half> , half %a, i32 0 %v.1 = insertelement <8 x half> %v.0, half %a, i32 1 @@ -278,8 +279,9 @@ define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) { ; CHECK-LABEL: test_insert_3_f32_undef_zero_vector: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: dup.4s v0, v0[0] -; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: mov.s v0[3], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> , float %a, i32 0 %v.1 = insertelement <4 x float> %v.0, float %a, i32 1 @@ -362,7 +364,8 @@ define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) { define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) { ; CHECK-LABEL: test_insert_v8f16_f16_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov.h v0[6], wzr +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: mov.h v0[6], 
v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6 ret <8 x half> %v.0 @@ -382,7 +385,8 @@ define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) { define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) { ; CHECK-LABEL: test_insert_v4f32_f32_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: mov.s v0[3], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3 ret <4 x float> %v.0 @@ -391,7 +395,8 @@ define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) { define <2 x double> @test_insert_v2f64_f64_zero(<2 x double> %a) { ; CHECK-LABEL: test_insert_v2f64_f64_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov.d v0[1], xzr +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: mov.d v0[1], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1 ret <2 x double> %v.0 From 7be1083db10f04a2485a96ba68565466e3daf365 Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Mon, 14 Jul 2025 16:34:13 +0100 Subject: [PATCH 2/3] fixup! 
[ISel] Replace expensive mov from wzr by two moves via fpr Instead of introducing new patterns, guard existing ones with a tuning feature --- llvm/lib/Target/AArch64/AArch64Features.td | 7 ++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 22 +++--- llvm/lib/Target/AArch64/AArch64Processors.td | 18 +++-- .../CodeGen/AArch64/arm64-vector-insertion.ll | 68 ++++++++++++++++--- 4 files changed, 89 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 24fbe207c4969..a13525fc40928 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -834,6 +834,13 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl", "HasDisableFastIncVL", "true", "Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">; +// On most processors we want to avoid moving from WZR to vector registers +// (relying on materializing 0 to an FPR and moving from there instead), +// but on some (in-order) cores it's preferable to avoid the extra instruction. +def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move", + "UseWzrToVecMove", "true", + "Move from WZR to insert 0 into vector registers">; + //===----------------------------------------------------------------------===// // Architectures. // diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 347f7d4d3c1a3..6748d6f6f597c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -413,6 +413,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">; +def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">; + //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. 
@@ -7356,10 +7358,18 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), (i64 0)), dsub)>; +let Predicates = [UseWzrToVecMove] in { +def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), + (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>; def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), (EXTRACT_SUBREG (INSvi16gpr (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexH:$imm, WZR), dsub)>; +def : Pat<(vector_insert (v4f32 V128:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)), + (INSvi32gpr V128:$Rn, VectorIndexS:$imm, WZR)>; def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)), (EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>; +def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)), + (INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>; +} def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))), @@ -8029,18 +8039,6 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; -def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), - (INSvi16lane V128:$Rn, VectorIndexH:$imm, - (v8f16 (MOVIv2d_ns (i32 0))), (i64 0))>; - -def : Pat<(vector_insert (v4f32 V128:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)), - (INSvi32lane V128:$Rn, VectorIndexS:$imm, - (v4f32 (MOVIv2d_ns (i32 0))), (i64 0))>; - -def : Pat<(vector_insert (v2f64 V128:$Rn), (f64 fpimm0), (i64 VectorIndexD:$imm)), - (INSvi64lane V128:$Rn, VectorIndexD:$imm, - (v2f64 (MOVIv2d_ns (i32 0))), (i64 0))>; - let Predicates = [HasNEON] in { def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 
4a5682475d107..adc9fda9ae0a7 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -21,40 +21,46 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320", "Cortex-A320 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureBalanceFPOps, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55", "Cortex-A55 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureFuseAddress]>; + FeatureFuseAddress, + FeatureUseWzrToVecMove]>; def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510", "Cortex-A510 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler + FeaturePostRAScheduler, + FeatureUseWzrToVecMove ]>; def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", "Cortex-A520 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520", "Cortex-A520AE ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", [ diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll index 3638c1b5005ad..bae254bbd2104 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll @@ -172,7 +172,7 @@ define <8 x half> @test_insert_v8f16_insert_1(half %a) { ; CHECK-LABEL: 
test_insert_v8f16_insert_1: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: dup.8h v0, v0[0] ; CHECK-NEXT: mov.h v0[7], v1[0] ; CHECK-NEXT: ret @@ -279,7 +279,7 @@ define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) { ; CHECK-LABEL: test_insert_3_f32_undef_zero_vector: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: dup.4s v0, v0[0] ; CHECK-NEXT: mov.s v0[3], v1[0] ; CHECK-NEXT: ret @@ -349,12 +349,12 @@ define <8 x i16> @test_insert_v8i16_i16_zero(<8 x i16> %a) { ret <8 x i16> %v.0 } -; TODO: This should jsut be a mov.s v0[3], wzr define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) { ; CHECK-LABEL: test_insert_v4f16_f16_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov.h v0[0], wzr +; CHECK-NEXT: mov.h v0[0], v1[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0 @@ -364,7 +364,7 @@ define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) { define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) { ; CHECK-LABEL: test_insert_v8f16_f16_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: mov.h v0[6], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6 @@ -374,8 +374,9 @@ define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) { define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) { ; CHECK-LABEL: test_insert_v2f32_f32_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov.s v0[0], wzr +; CHECK-NEXT: mov.s v0[0], v1[0] ; CHECK-NEXT: 
// kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0 @@ -385,7 +386,7 @@ define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) { define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) { ; CHECK-LABEL: test_insert_v4f32_f32_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: mov.s v0[3], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3 @@ -395,9 +396,60 @@ define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) { define <2 x double> @test_insert_v2f64_f64_zero(<2 x double> %a) { ; CHECK-LABEL: test_insert_v2f64_f64_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: mov.d v0[1], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1 ret <2 x double> %v.0 } + +define <4 x half> @test_insert_v4f16_f16_zero_wzr(<4 x half> %a) #1 { +; CHECK-LABEL: test_insert_v4f16_f16_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov.h v0[0], wzr +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0 + ret <4 x half> %v.0 +} + +define <8 x half> @test_insert_v8f16_f16_zero_wzr(<8 x half> %a) #1 { +; CHECK-LABEL: test_insert_v8f16_f16_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: mov.h v0[6], wzr +; CHECK-NEXT: ret + %v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6 + ret <8 x half> %v.0 +} + +define <2 x float> @test_insert_v2f32_f32_zero_wzr(<2 x float> %a) #1 { +; CHECK-LABEL: test_insert_v2f32_f32_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov.s v0[0], wzr +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %v.0 = insertelement <2 x float> %a, float 0.000000e+00, 
i32 0 + ret <2 x float> %v.0 +} + +define <4 x float> @test_insert_v4f32_f32_zero_wzr(<4 x float> %a) #1 { +; CHECK-LABEL: test_insert_v4f32_f32_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: ret + %v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3 + ret <4 x float> %v.0 +} + +define <2 x double> @test_insert_v2f64_f64_zero_xzr(<2 x double> %a) #1 { +; CHECK-LABEL: test_insert_v2f64_f64_zero_xzr: +; CHECK: // %bb.0: +; CHECK-NEXT: mov.d v0[1], xzr +; CHECK-NEXT: ret + %v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1 + ret <2 x double> %v.0 +} + +attributes #1 = {"tune-cpu"="cortex-a55"} From 131c488fe7522f89b5d7871bea92e585585b49f4 Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Mon, 14 Jul 2025 19:18:56 +0100 Subject: [PATCH 3/3] fixup! fixup! [ISel] Replace expensive mov from wzr by two moves via fpr --- llvm/test/CodeGen/AArch64/vecreduce-fadd.ll | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll index 8a84d3ca2328c..59dfcf9850a49 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll @@ -63,8 +63,9 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) { ; ; CHECK-SD-FP16-LABEL: add_v3HalfH: ; CHECK-SD-FP16: // %bb.0: +; CHECK-SD-FP16-NEXT: movi d1, #0000000000000000 ; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-FP16-NEXT: mov v0.h[3], wzr +; CHECK-SD-FP16-NEXT: mov v0.h[3], v1.h[0] ; CHECK-SD-FP16-NEXT: faddp v0.4h, v0.4h, v0.4h ; CHECK-SD-FP16-NEXT: faddp h0, v0.2h ; CHECK-SD-FP16-NEXT: ret