From 9ef16e8c286faf93bbe4b760e0de71d4e186bbbd Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Wed, 24 Dec 2025 12:16:21 +1000 Subject: [PATCH] Curve25519 improvements Add non-constant time implemenations of mod_inv for x64 and Aarch64 assembly. Generate base point table, with better formatting, for double smul with a script. Increase Bi table size to 32 entries for 64-bit asm. Minor improvements to double smul. WOLFSSL_CURVE25519_NOT_USE_ED25519 to not use ed25519 base smul in curve25519 base smul. --- configure.ac | 6 + wolfcrypt/src/curve25519.c | 5 +- wolfcrypt/src/ed25519.c | 2 +- wolfcrypt/src/fe_x25519_asm.S | 3788 +++++++++++-------- wolfcrypt/src/ge_operations.c | 620 ++- wolfcrypt/src/port/arm/armv8-curve25519.S | 247 ++ wolfcrypt/src/port/arm/armv8-curve25519_c.c | 257 ++ wolfssl/wolfcrypt/fe_operations.h | 5 +- wolfssl/wolfcrypt/ge_operations.h | 5 + 9 files changed, 3145 insertions(+), 1790 deletions(-) diff --git a/configure.ac b/configure.ac index 099dbd8bcb..b1e7d0accc 100644 --- a/configure.ac +++ b/configure.ac @@ -10280,6 +10280,12 @@ then AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_CURVE25519_USE_ED25519" AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_CURVE25519_USE_ED25519" fi + if test "$ENABLED_CURVE25519" = "not-ed" + then + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_CURVE25519_NOT_USE_ED25519" + AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_CURVE25519_NOT_USE_ED25519" + fi + AM_CFLAGS="$AM_CFLAGS -DHAVE_CURVE25519" AM_CCASFLAGS="$AM_CCASFLAGS -DHAVE_CURVE25519" diff --git a/wolfcrypt/src/curve25519.c b/wolfcrypt/src/curve25519.c index 21b43e699c..e8974e4501 100644 --- a/wolfcrypt/src/curve25519.c +++ b/wolfcrypt/src/curve25519.c @@ -75,8 +75,9 @@ const curve25519_set_type curve25519_sets[] = { } }; -#if !defined(WOLFSSL_CURVE25519_USE_ED25519) || \ - defined(WOLFSSL_CURVE25519_BLINDING) +#if (!defined(WOLFSSL_CURVE25519_USE_ED25519) && \ + !(defined(CURVED25519_X64) || (defined(WOLFSSL_ARMASM) && \ + defined(__aarch64__)))) || defined(WOLFSSL_CURVE25519_BLINDING) static const word32 kCurve25519BasePoint[CURVE25519_KEYSIZE/sizeof(word32)] = { #ifdef BIG_ENDIAN_ORDER 0x09000000 diff --git a/wolfcrypt/src/ed25519.c b/wolfcrypt/src/ed25519.c index a03efb5603..3fd2c9ef1c 100644 --- a/wolfcrypt/src/ed25519.c +++ b/wolfcrypt/src/ed25519.c @@ -822,7 +822,7 @@ static int ed25519_verify_msg_final_with_sha(const byte* sig, word32 sigLen, if (ret != 0) return ret; - ge_tobytes(rcheck, &R); + ge_tobytes_nct(rcheck, &R); #endif /* FREESCALE_LTC_ECC */ /* comparison of R created to R in sig */ diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S index 2ac32677a4..577aba1834 100644 --- a/wolfcrypt/src/fe_x25519_asm.S +++ b/wolfcrypt/src/fe_x25519_asm.S @@ -238,7 +238,7 @@ L_fe_init_get_flags: #else movq %rax, _ge_sub_p(%rip) #endif /* __APPLE__ */ -#if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519) +#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519) #ifndef __APPLE__ movq curve25519_base_avx2@GOTPCREL(%rip), %rax #else @@ -250,7 +250,7 @@ L_fe_init_get_flags: #else movq %rax, _curve25519_base_p(%rip) #endif /* __APPLE__ */ -#endif /* !HAVE_ED25519 && !WOLFSSL_CURVE25519_USE_ED25519 */ +#endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */ #ifdef HAVE_ED25519 #ifndef __APPLE__ movq fe_sq2_avx2@GOTPCREL(%rip), %rax @@ -263,6 +263,17 @@ L_fe_init_get_flags: #else movq %rax, _fe_sq2_p(%rip) #endif /* __APPLE__ */ +#ifndef __APPLE__ + movq fe_invert_nct_avx2@GOTPCREL(%rip), %rax +#else + leaq _fe_invert_nct_avx2(%rip), %rax +#endif /* __APPLE__ */ +#ifndef __APPLE__ + movq 
fe_invert_nct_p@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) +#else + movq %rax, _fe_invert_nct_p(%rip) +#endif /* __APPLE__ */ #ifndef __APPLE__ movq sc_reduce_avx2@GOTPCREL(%rip), %rax #else @@ -443,7 +454,6 @@ fe_sub: .p2align 4 _fe_sub: #endif /* __APPLE__ */ - pushq %r12 # Sub movq (%rsi), %rax movq 8(%rsi), %rcx @@ -453,13 +463,12 @@ _fe_sub: sbbq 8(%rdx), %rcx sbbq 16(%rdx), %r8 sbbq 24(%rdx), %r9 - sbbq %r11, %r11 - shldq $0x01, %r9, %r11 - movq $0x7fffffffffffffff, %r12 - imulq $-19, %r11 - andq %r12, %r9 + sbbq %r10, %r10 + shldq $0x01, %r9, %r10 + imulq $-19, %r10 + btr $63, %r9 # Add modulus (if underflow) - subq %r11, %rax + subq %r10, %rax sbbq $0x00, %rcx sbbq $0x00, %r8 sbbq $0x00, %r9 @@ -467,7 +476,6 @@ _fe_sub: movq %rcx, 8(%rdi) movq %r8, 16(%rdi) movq %r9, 24(%rdi) - popq %r12 repz retq #ifndef __APPLE__ .size fe_sub,.-fe_sub @@ -484,7 +492,6 @@ fe_add: .p2align 4 _fe_add: #endif /* __APPLE__ */ - pushq %r12 # Add movq (%rsi), %rax movq 8(%rsi), %rcx @@ -494,14 +501,13 @@ _fe_add: movq 24(%rsi), %r9 adcq 16(%rdx), %r8 adcq 24(%rdx), %r9 - movq $0x00, %r11 - adcq $0x00, %r11 - shldq $0x01, %r9, %r11 - movq $0x7fffffffffffffff, %r12 - imulq $19, %r11 - andq %r12, %r9 + movq $0x00, %r10 + adcq $0x00, %r10 + shldq $0x01, %r9, %r10 + imulq $19, %r10 + btr $63, %r9 # Sub modulus (if overflow) - addq %r11, %rax + addq %r10, %rax adcq $0x00, %rcx adcq $0x00, %r8 adcq $0x00, %r9 @@ -509,7 +515,6 @@ _fe_add: movq %rcx, 8(%rdi) movq %r8, 16(%rdi) movq %r9, 24(%rdi) - popq %r12 repz retq #ifndef __APPLE__ .size fe_add,.-fe_add @@ -920,6 +925,30 @@ _ge_sub: #ifndef __APPLE__ .size ge_sub,.-ge_sub #endif /* __APPLE__ */ +#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519) +#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519) +#ifndef __APPLE__ +.text +.globl curve25519_base +.type curve25519_base,@function +.align 16 +curve25519_base: +#else +.section __TEXT,__text +.globl _curve25519_base +.p2align 4 +_curve25519_base: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *curve25519_base_p(%rip) +#else + jmpq *_curve25519_base_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size curve25519_base,.-curve25519_base +#endif /* __APPLE__ */ +#endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */ +#endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */ #ifdef HAVE_ED25519 #ifdef HAVE_ED25519 #ifndef __APPLE__ @@ -946,6 +975,28 @@ _fe_sq2: #ifdef HAVE_ED25519 #ifndef __APPLE__ .text +.globl fe_invert_nct +.type fe_invert_nct,@function +.align 16 +fe_invert_nct: +#else +.section __TEXT,__text +.globl _fe_invert_nct +.p2align 4 +_fe_invert_nct: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + jmpq *fe_invert_nct_p(%rip) +#else + jmpq *_fe_invert_nct_p(%rip) +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.size fe_invert_nct,.-fe_invert_nct +#endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_ED25519 +#ifndef __APPLE__ +.text .globl sc_reduce .type sc_reduce,@function .align 16 @@ -1180,7 +1231,7 @@ ge_sub_p: _ge_sub_p: .quad _ge_sub_x64 #endif /* __APPLE__ */ -#if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519) +#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519) #ifndef __APPLE__ .data .type curve25519_base_p, @object @@ -1193,7 +1244,7 @@ curve25519_base_p: _curve25519_base_p: .quad _curve25519_base_x64 #endif /* __APPLE__ */ -#endif /* !HAVE_ED25519 && !WOLFSSL_CURVE25519_USE_ED25519 */ +#endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */ #ifdef HAVE_ED25519 #ifndef __APPLE__ .data @@ -1209,6 +1260,18 @@ _fe_sq2_p: #endif /* __APPLE__ */ #ifndef __APPLE__ .data +.type fe_invert_nct_p, @object +.size 
fe_invert_nct_p,8 +fe_invert_nct_p: + .quad fe_invert_nct_x64 +#else +.section __DATA,__data +.p2align 3 +_fe_invert_nct_p: + .quad _fe_invert_nct_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.data .type sc_reduce_p, @object .size sc_reduce_p,8 sc_reduce_p: @@ -2268,7 +2331,7 @@ _fe_invert_x64: movq 128(%rsp), %rdi addq $0x90, %rsp repz retq -#if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519) +#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519) #ifndef __APPLE__ .data #else @@ -2342,43 +2405,42 @@ L_curve25519_base_x64_bits: movq 8(%rdi), %r8 movq 16(%rdi), %r9 movq 24(%rdi), %r10 + movq (%rsp), %r11 + movq 8(%rsp), %r12 + movq 16(%rsp), %r13 + movq 24(%rsp), %r14 xorq 64(%rsp), %rcx xorq 72(%rsp), %r8 xorq 80(%rsp), %r9 xorq 88(%rsp), %r10 + xorq 32(%rsp), %r11 + xorq 40(%rsp), %r12 + xorq 48(%rsp), %r13 + xorq 56(%rsp), %r14 andq %r15, %rcx andq %r15, %r8 andq %r15, %r9 andq %r15, %r10 + andq %r15, %r11 + andq %r15, %r12 + andq %r15, %r13 + andq %r15, %r14 xorq %rcx, (%rdi) xorq %r8, 8(%rdi) xorq %r9, 16(%rdi) xorq %r10, 24(%rdi) + xorq %r11, (%rsp) + xorq %r12, 8(%rsp) + xorq %r13, 16(%rsp) + xorq %r14, 24(%rsp) xorq %rcx, 64(%rsp) xorq %r8, 72(%rsp) xorq %r9, 80(%rsp) xorq %r10, 88(%rsp) - # Conditional Swap - movq (%rsp), %rcx - movq 8(%rsp), %r8 - movq 16(%rsp), %r9 - movq 24(%rsp), %r10 - xorq 32(%rsp), %rcx - xorq 40(%rsp), %r8 - xorq 48(%rsp), %r9 - xorq 56(%rsp), %r10 - andq %r15, %rcx - andq %r15, %r8 - andq %r15, %r9 - andq %r15, %r10 - xorq %rcx, (%rsp) - xorq %r8, 8(%rsp) - xorq %r9, 16(%rsp) - xorq %r10, 24(%rsp) - xorq %rcx, 32(%rsp) - xorq %r8, 40(%rsp) - xorq %r9, 48(%rsp) - xorq %r10, 56(%rsp) + xorq %r11, 32(%rsp) + xorq %r12, 40(%rsp) + xorq %r13, 48(%rsp) + xorq %r14, 56(%rsp) movq %rbx, %r15 # Add-Sub # Add @@ -2394,14 +2456,13 @@ L_curve25519_base_x64_bits: adcq 16(%rsp), %r9 movq %r10, %r14 adcq 24(%rsp), %r10 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r10, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r10 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r10, %rbx + imulq $19, %rbx + btr $63, %r10 # Sub modulus (if overflow) - addq %rax, %rcx + addq %rbx, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 @@ -2410,12 +2471,12 @@ L_curve25519_base_x64_bits: sbbq 8(%rsp), %r12 sbbq 16(%rsp), %r13 sbbq 24(%rsp), %r14 - sbbq %rax, %rax - shldq $0x01, %r14, %rax - imulq $-19, %rax - andq %rdx, %r14 + sbbq %rbx, %rbx + shldq $0x01, %r14, %rbx + imulq $-19, %rbx + btr $63, %r14 # Add modulus (if underflow) - subq %rax, %r11 + subq %rbx, %r11 sbbq $0x00, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 @@ -2441,14 +2502,13 @@ L_curve25519_base_x64_bits: adcq 48(%rsp), %r9 movq %r10, %r14 adcq 56(%rsp), %r10 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r10, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r10 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r10, %rbx + imulq $19, %rbx + btr $63, %r10 # Sub modulus (if overflow) - addq %rax, %rcx + addq %rbx, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 @@ -2457,12 +2517,12 @@ L_curve25519_base_x64_bits: sbbq 40(%rsp), %r12 sbbq 48(%rsp), %r13 sbbq 56(%rsp), %r14 - sbbq %rax, %rax - shldq $0x01, %r14, %rax - imulq $-19, %rax - andq %rdx, %r14 + sbbq %rbx, %rbx + shldq $0x01, %r14, %rbx + imulq $-19, %rbx + btr $63, %r14 # Add modulus (if underflow) - subq %rax, %r11 + subq %rbx, %r11 sbbq $0x00, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 @@ -2962,14 +3022,13 @@ L_curve25519_base_x64_bits: adcq 48(%rsp), %r9 movq %r10, %r14 adcq 
56(%rsp), %r10 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r10, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r10 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r10, %rbx + imulq $19, %rbx + btr $63, %r10 # Sub modulus (if overflow) - addq %rax, %rcx + addq %rbx, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 @@ -2978,12 +3037,12 @@ L_curve25519_base_x64_bits: sbbq 40(%rsp), %r12 sbbq 48(%rsp), %r13 sbbq 56(%rsp), %r14 - sbbq %rax, %rax - shldq $0x01, %r14, %rax - imulq $-19, %rax - andq %rdx, %r14 + sbbq %rbx, %rbx + shldq $0x01, %r14, %rbx + imulq $-19, %rbx + btr $63, %r14 # Add modulus (if underflow) - subq %rax, %r11 + subq %rbx, %r11 sbbq $0x00, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 @@ -3136,13 +3195,12 @@ L_curve25519_base_x64_bits: sbbq 104(%rsp), %r8 sbbq 112(%rsp), %r9 sbbq 120(%rsp), %r10 - sbbq %rax, %rax - shldq $0x01, %r10, %rax - movq $0x7fffffffffffffff, %rdx - imulq $-19, %rax - andq %rdx, %r10 + sbbq %rbx, %rbx + shldq $0x01, %r10, %rbx + imulq $-19, %rbx + btr $63, %r10 # Add modulus (if underflow) - subq %rax, %rcx + subq %rbx, %rcx sbbq $0x00, %r8 sbbq $0x00, %r9 sbbq $0x00, %r10 @@ -3255,39 +3313,6 @@ L_curve25519_base_x64_bits: movq %r8, 40(%rsp) movq %r9, 48(%rsp) movq %r10, 56(%rsp) - # Multiply by 121666 - movq $0x1db42, %rax - mulq 128(%rsp) - xorq %r9, %r9 - movq %rax, %rcx - movq %rdx, %r8 - movq $0x1db42, %rax - mulq 136(%rsp) - xorq %r10, %r10 - addq %rax, %r8 - adcq %rdx, %r9 - movq $0x1db42, %rax - mulq 144(%rsp) - xorq %r12, %r12 - addq %rax, %r9 - adcq %rdx, %r10 - movq $0x1db42, %rax - mulq 152(%rsp) - movq $0x7fffffffffffffff, %r11 - addq %rax, %r10 - adcq %rdx, %r12 - shldq $0x01, %r10, %r12 - andq %r11, %r10 - movq $19, %rax - mulq %r12 - addq %rax, %rcx - adcq $0x00, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - movq %rcx, (%rsp) - movq %r8, 8(%rsp) - movq %r9, 16(%rsp) - movq %r10, 24(%rsp) # Square # A[0] * A[1] movq 64(%rsp), %rax @@ -3393,22 +3418,36 @@ L_curve25519_base_x64_bits: movq %r8, 72(%rsp) movq %r9, 80(%rsp) movq %r10, 88(%rsp) - # Add - movq 96(%rsp), %rcx - movq 104(%rsp), %r8 - addq (%rsp), %rcx - movq 112(%rsp), %r9 - adcq 8(%rsp), %r8 - movq 120(%rsp), %r10 - adcq 16(%rsp), %r9 - adcq 24(%rsp), %r10 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r10, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r10 - # Sub modulus (if overflow) + # Multiply by 121666 + movq $0x1db42, %rax + mulq 128(%rsp) + xorq %r9, %r9 + movq %rax, %rcx + movq %rdx, %r8 + movq $0x1db42, %rax + mulq 136(%rsp) + xorq %r10, %r10 + addq %rax, %r8 + adcq %rdx, %r9 + movq $0x1db42, %rax + mulq 144(%rsp) + xorq %r12, %r12 + addq %rax, %r9 + adcq %rdx, %r10 + movq $0x1db42, %rax + mulq 152(%rsp) + movq $0x7fffffffffffffff, %r11 + addq %rax, %r10 + adcq %rdx, %r12 + addq 96(%rsp), %rcx + adcq 104(%rsp), %r8 + adcq 112(%rsp), %r9 + adcq 120(%rsp), %r10 + adcq $0x00, %r12 + shldq $0x01, %r10, %r12 + andq %r11, %r10 + movq $19, %rax + mulq %r12 addq %rax, %rcx adcq $0x00, %r8 adcq $0x00, %r9 @@ -3591,43 +3630,42 @@ L_curve25519_base_x64_bits: movq 8(%rdi), %r8 movq 16(%rdi), %r9 movq 24(%rdi), %r10 + movq (%rsp), %r11 + movq 8(%rsp), %r12 + movq 16(%rsp), %r13 + movq 24(%rsp), %r14 xorq 64(%rsp), %rcx xorq 72(%rsp), %r8 xorq 80(%rsp), %r9 xorq 88(%rsp), %r10 + xorq 32(%rsp), %r11 + xorq 40(%rsp), %r12 + xorq 48(%rsp), %r13 + xorq 56(%rsp), %r14 andq %r15, %rcx andq %r15, %r8 andq %r15, %r9 andq %r15, %r10 + andq %r15, %r11 + andq %r15, %r12 + andq %r15, %r13 + andq %r15, %r14 xorq %rcx, (%rdi) xorq 
%r8, 8(%rdi) xorq %r9, 16(%rdi) xorq %r10, 24(%rdi) + xorq %r11, (%rsp) + xorq %r12, 8(%rsp) + xorq %r13, 16(%rsp) + xorq %r14, 24(%rsp) xorq %rcx, 64(%rsp) xorq %r8, 72(%rsp) xorq %r9, 80(%rsp) xorq %r10, 88(%rsp) - # Conditional Swap - movq (%rsp), %rcx - movq 8(%rsp), %r8 - movq 16(%rsp), %r9 - movq 24(%rsp), %r10 - xorq 32(%rsp), %rcx - xorq 40(%rsp), %r8 - xorq 48(%rsp), %r9 - xorq 56(%rsp), %r10 - andq %r15, %rcx - andq %r15, %r8 - andq %r15, %r9 - andq %r15, %r10 - xorq %rcx, (%rsp) - xorq %r8, 8(%rsp) - xorq %r9, 16(%rsp) - xorq %r10, 24(%rsp) - xorq %rcx, 32(%rsp) - xorq %r8, 40(%rsp) - xorq %r9, 48(%rsp) - xorq %r10, 56(%rsp) + xorq %r11, 32(%rsp) + xorq %r12, 40(%rsp) + xorq %r13, 48(%rsp) + xorq %r14, 56(%rsp) L_curve25519_base_x64_3: # Add-Sub # Add @@ -3643,14 +3681,13 @@ L_curve25519_base_x64_3: adcq 16(%rsp), %r9 movq %r10, %r14 adcq 24(%rsp), %r10 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r10, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r10 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r10, %rbx + imulq $19, %rbx + btr $63, %r10 # Sub modulus (if overflow) - addq %rax, %rcx + addq %rbx, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 @@ -3659,12 +3696,12 @@ L_curve25519_base_x64_3: sbbq 8(%rsp), %r12 sbbq 16(%rsp), %r13 sbbq 24(%rsp), %r14 - sbbq %rax, %rax - shldq $0x01, %r14, %rax - imulq $-19, %rax - andq %rdx, %r14 + sbbq %rbx, %rbx + shldq $0x01, %r14, %rbx + imulq $-19, %rbx + btr $63, %r14 # Add modulus (if underflow) - subq %rax, %r11 + subq %rbx, %r11 sbbq $0x00, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 @@ -4027,13 +4064,12 @@ L_curve25519_base_x64_3: sbbq 104(%rsp), %r8 sbbq 112(%rsp), %r9 sbbq 120(%rsp), %r10 - sbbq %rax, %rax - shldq $0x01, %r10, %rax - movq $0x7fffffffffffffff, %rdx - imulq $-19, %rax - andq %rdx, %r10 + sbbq %rbx, %rbx + shldq $0x01, %r10, %rbx + imulq $-19, %rbx + btr $63, %r10 # Add modulus (if underflow) - subq %rax, %rcx + subq %rbx, %rcx sbbq $0x00, %r8 sbbq $0x00, %r9 sbbq $0x00, %r10 @@ -4062,6 +4098,11 @@ L_curve25519_base_x64_3: movq $0x7fffffffffffffff, %r11 addq %rax, %r10 adcq %rdx, %r12 + addq 96(%rsp), %rcx + adcq 104(%rsp), %r8 + adcq 112(%rsp), %r9 + adcq 120(%rsp), %r10 + adcq $0x00, %r12 shldq $0x01, %r10, %r12 andq %r11, %r10 movq $19, %rax @@ -4070,30 +4111,6 @@ L_curve25519_base_x64_3: adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 - movq %rcx, (%rsp) - movq %r8, 8(%rsp) - movq %r9, 16(%rsp) - movq %r10, 24(%rsp) - # Add - movq 96(%rsp), %rcx - movq 104(%rsp), %r8 - addq (%rsp), %rcx - movq 112(%rsp), %r9 - adcq 8(%rsp), %r8 - movq 120(%rsp), %r10 - adcq 16(%rsp), %r9 - adcq 24(%rsp), %r10 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r10, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r10 - # Sub modulus (if overflow) - addq %rax, %rcx - adcq $0x00, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 movq %rcx, 96(%rsp) movq %r8, 104(%rsp) movq %r9, 112(%rsp) @@ -4639,7 +4656,7 @@ L_curve25519_base_x64_3: #ifndef __APPLE__ .size curve25519_base_x64,.-curve25519_base_x64 #endif /* __APPLE__ */ -#endif /* !HAVE_ED25519 && !WOLFSSL_CURVE25519_USE_ED25519 */ +#endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */ #ifndef __APPLE__ .text .globl curve25519_x64 @@ -4702,43 +4719,42 @@ L_curve25519_x64_bits: movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 + movq (%rsp), %r12 + movq 8(%rsp), %r13 + movq 16(%rsp), %r14 + movq 24(%rsp), %r15 xorq 64(%rsp), %rcx xorq 72(%rsp), %r9 xorq 80(%rsp), %r10 xorq 88(%rsp), %r11 + xorq 32(%rsp), %r12 + xorq 40(%rsp), 
%r13 + xorq 48(%rsp), %r14 + xorq 56(%rsp), %r15 andq %rbx, %rcx andq %rbx, %r9 andq %rbx, %r10 andq %rbx, %r11 + andq %rbx, %r12 + andq %rbx, %r13 + andq %rbx, %r14 + andq %rbx, %r15 xorq %rcx, (%rdi) xorq %r9, 8(%rdi) xorq %r10, 16(%rdi) xorq %r11, 24(%rdi) + xorq %r12, (%rsp) + xorq %r13, 8(%rsp) + xorq %r14, 16(%rsp) + xorq %r15, 24(%rsp) xorq %rcx, 64(%rsp) xorq %r9, 72(%rsp) xorq %r10, 80(%rsp) xorq %r11, 88(%rsp) - # Conditional Swap - movq (%rsp), %rcx - movq 8(%rsp), %r9 - movq 16(%rsp), %r10 - movq 24(%rsp), %r11 - xorq 32(%rsp), %rcx - xorq 40(%rsp), %r9 - xorq 48(%rsp), %r10 - xorq 56(%rsp), %r11 - andq %rbx, %rcx - andq %rbx, %r9 - andq %rbx, %r10 - andq %rbx, %r11 - xorq %rcx, (%rsp) - xorq %r9, 8(%rsp) - xorq %r10, 16(%rsp) - xorq %r11, 24(%rsp) - xorq %rcx, 32(%rsp) - xorq %r9, 40(%rsp) - xorq %r10, 48(%rsp) - xorq %r11, 56(%rsp) + xorq %r12, 32(%rsp) + xorq %r13, 40(%rsp) + xorq %r14, 48(%rsp) + xorq %r15, 56(%rsp) movq %rbp, %rbx # Add-Sub # Add @@ -4754,14 +4770,13 @@ L_curve25519_x64_bits: adcq 16(%rsp), %r10 movq %r11, %r15 adcq 24(%rsp), %r11 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r11, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r11 + movq $0x00, %rbp + adcq $0x00, %rbp + shldq $0x01, %r11, %rbp + imulq $19, %rbp + btr $63, %r11 # Sub modulus (if overflow) - addq %rax, %rcx + addq %rbp, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 @@ -4770,12 +4785,12 @@ L_curve25519_x64_bits: sbbq 8(%rsp), %r13 sbbq 16(%rsp), %r14 sbbq 24(%rsp), %r15 - sbbq %rax, %rax - shldq $0x01, %r15, %rax - imulq $-19, %rax - andq %rdx, %r15 + sbbq %rbp, %rbp + shldq $0x01, %r15, %rbp + imulq $-19, %rbp + btr $63, %r15 # Add modulus (if underflow) - subq %rax, %r12 + subq %rbp, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 @@ -4801,14 +4816,13 @@ L_curve25519_x64_bits: adcq 48(%rsp), %r10 movq %r11, %r15 adcq 56(%rsp), %r11 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r11, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r11 + movq $0x00, %rbp + adcq $0x00, %rbp + shldq $0x01, %r11, %rbp + imulq $19, %rbp + btr $63, %r11 # Sub modulus (if overflow) - addq %rax, %rcx + addq %rbp, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 @@ -4817,12 +4831,12 @@ L_curve25519_x64_bits: sbbq 40(%rsp), %r13 sbbq 48(%rsp), %r14 sbbq 56(%rsp), %r15 - sbbq %rax, %rax - shldq $0x01, %r15, %rax - imulq $-19, %rax - andq %rdx, %r15 + sbbq %rbp, %rbp + shldq $0x01, %r15, %rbp + imulq $-19, %rbp + btr $63, %r15 # Add modulus (if underflow) - subq %rax, %r12 + subq %rbp, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 @@ -5322,14 +5336,13 @@ L_curve25519_x64_bits: adcq 48(%rsp), %r10 movq %r11, %r15 adcq 56(%rsp), %r11 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r11, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r11 + movq $0x00, %rbp + adcq $0x00, %rbp + shldq $0x01, %r11, %rbp + imulq $19, %rbp + btr $63, %r11 # Sub modulus (if overflow) - addq %rax, %rcx + addq %rbp, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 @@ -5338,12 +5351,12 @@ L_curve25519_x64_bits: sbbq 40(%rsp), %r13 sbbq 48(%rsp), %r14 sbbq 56(%rsp), %r15 - sbbq %rax, %rax - shldq $0x01, %r15, %rax - imulq $-19, %rax - andq %rdx, %r15 + sbbq %rbp, %rbp + shldq $0x01, %r15, %rbp + imulq $-19, %rbp + btr $63, %r15 # Add modulus (if underflow) - subq %rax, %r12 + subq %rbp, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 @@ -5496,13 +5509,12 @@ L_curve25519_x64_bits: sbbq 104(%rsp), %r9 sbbq 112(%rsp), 
%r10 sbbq 120(%rsp), %r11 - sbbq %rax, %rax - shldq $0x01, %r11, %rax - movq $0x7fffffffffffffff, %rdx - imulq $-19, %rax - andq %rdx, %r11 + sbbq %rbp, %rbp + shldq $0x01, %r11, %rbp + imulq $-19, %rbp + btr $63, %r11 # Add modulus (if underflow) - subq %rax, %rcx + subq %rbp, %rcx sbbq $0x00, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 @@ -5615,39 +5627,6 @@ L_curve25519_x64_bits: movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) - # Multiply by 121666 - movq $0x1db42, %rax - mulq 128(%rsp) - xorq %r10, %r10 - movq %rax, %rcx - movq %rdx, %r9 - movq $0x1db42, %rax - mulq 136(%rsp) - xorq %r11, %r11 - addq %rax, %r9 - adcq %rdx, %r10 - movq $0x1db42, %rax - mulq 144(%rsp) - xorq %r13, %r13 - addq %rax, %r10 - adcq %rdx, %r11 - movq $0x1db42, %rax - mulq 152(%rsp) - movq $0x7fffffffffffffff, %r12 - addq %rax, %r11 - adcq %rdx, %r13 - shldq $0x01, %r11, %r13 - andq %r12, %r11 - movq $19, %rax - mulq %r13 - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - movq %rcx, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) # Square # A[0] * A[1] movq 64(%rsp), %rax @@ -5753,22 +5732,36 @@ L_curve25519_x64_bits: movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) - # Add - movq 96(%rsp), %rcx - movq 104(%rsp), %r9 - addq (%rsp), %rcx - movq 112(%rsp), %r10 - adcq 8(%rsp), %r9 - movq 120(%rsp), %r11 - adcq 16(%rsp), %r10 - adcq 24(%rsp), %r11 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r11, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r11 - # Sub modulus (if overflow) + # Multiply by 121666 + movq $0x1db42, %rax + mulq 128(%rsp) + xorq %r10, %r10 + movq %rax, %rcx + movq %rdx, %r9 + movq $0x1db42, %rax + mulq 136(%rsp) + xorq %r11, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + movq $0x1db42, %rax + mulq 144(%rsp) + xorq %r13, %r13 + addq %rax, %r10 + adcq %rdx, %r11 + movq $0x1db42, %rax + mulq 152(%rsp) + movq $0x7fffffffffffffff, %r12 + addq %rax, %r11 + adcq %rdx, %r13 + addq 96(%rsp), %rcx + adcq 104(%rsp), %r9 + adcq 112(%rsp), %r10 + adcq 120(%rsp), %r11 + adcq $0x00, %r13 + shldq $0x01, %r11, %r13 + andq %r12, %r11 + movq $19, %rax + mulq %r13 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 @@ -6045,50 +6038,49 @@ L_curve25519_x64_bits: decq %r9 cmpq $3, %r9 jge L_curve25519_x64_bits - movq %r9, 160(%rsp) + movq $2, 160(%rsp) negq %rbx # Conditional Swap movq (%rdi), %rcx movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 + movq (%rsp), %r12 + movq 8(%rsp), %r13 + movq 16(%rsp), %r14 + movq 24(%rsp), %r15 xorq 64(%rsp), %rcx xorq 72(%rsp), %r9 xorq 80(%rsp), %r10 xorq 88(%rsp), %r11 + xorq 32(%rsp), %r12 + xorq 40(%rsp), %r13 + xorq 48(%rsp), %r14 + xorq 56(%rsp), %r15 andq %rbx, %rcx andq %rbx, %r9 andq %rbx, %r10 andq %rbx, %r11 + andq %rbx, %r12 + andq %rbx, %r13 + andq %rbx, %r14 + andq %rbx, %r15 xorq %rcx, (%rdi) xorq %r9, 8(%rdi) xorq %r10, 16(%rdi) xorq %r11, 24(%rdi) + xorq %r12, (%rsp) + xorq %r13, 8(%rsp) + xorq %r14, 16(%rsp) + xorq %r15, 24(%rsp) xorq %rcx, 64(%rsp) xorq %r9, 72(%rsp) xorq %r10, 80(%rsp) xorq %r11, 88(%rsp) - # Conditional Swap - movq (%rsp), %rcx - movq 8(%rsp), %r9 - movq 16(%rsp), %r10 - movq 24(%rsp), %r11 - xorq 32(%rsp), %rcx - xorq 40(%rsp), %r9 - xorq 48(%rsp), %r10 - xorq 56(%rsp), %r11 - andq %rbx, %rcx - andq %rbx, %r9 - andq %rbx, %r10 - andq %rbx, %r11 - xorq %rcx, (%rsp) - xorq %r9, 8(%rsp) - xorq %r10, 16(%rsp) - xorq %r11, 24(%rsp) - xorq %rcx, 32(%rsp) - xorq %r9, 40(%rsp) - xorq %r10, 48(%rsp) - xorq %r11, 56(%rsp) + xorq %r12, 32(%rsp) + xorq %r13, 
40(%rsp) + xorq %r14, 48(%rsp) + xorq %r15, 56(%rsp) L_curve25519_x64_3: # Add-Sub # Add @@ -6104,14 +6096,13 @@ L_curve25519_x64_3: adcq 16(%rsp), %r10 movq %r11, %r15 adcq 24(%rsp), %r11 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r11, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r11 + movq $0x00, %rbp + adcq $0x00, %rbp + shldq $0x01, %r11, %rbp + imulq $19, %rbp + btr $63, %r11 # Sub modulus (if overflow) - addq %rax, %rcx + addq %rbp, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 @@ -6120,12 +6111,12 @@ L_curve25519_x64_3: sbbq 8(%rsp), %r13 sbbq 16(%rsp), %r14 sbbq 24(%rsp), %r15 - sbbq %rax, %rax - shldq $0x01, %r15, %rax - imulq $-19, %rax - andq %rdx, %r15 + sbbq %rbp, %rbp + shldq $0x01, %r15, %rbp + imulq $-19, %rbp + btr $63, %r15 # Add modulus (if underflow) - subq %rax, %r12 + subq %rbp, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 @@ -6488,13 +6479,12 @@ L_curve25519_x64_3: sbbq 104(%rsp), %r9 sbbq 112(%rsp), %r10 sbbq 120(%rsp), %r11 - sbbq %rax, %rax - shldq $0x01, %r11, %rax - movq $0x7fffffffffffffff, %rdx - imulq $-19, %rax - andq %rdx, %r11 + sbbq %rbp, %rbp + shldq $0x01, %r11, %rbp + imulq $-19, %rbp + btr $63, %r11 # Add modulus (if underflow) - subq %rax, %rcx + subq %rbp, %rcx sbbq $0x00, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 @@ -6523,6 +6513,11 @@ L_curve25519_x64_3: movq $0x7fffffffffffffff, %r12 addq %rax, %r11 adcq %rdx, %r13 + addq 96(%rsp), %rcx + adcq 104(%rsp), %r9 + adcq 112(%rsp), %r10 + adcq 120(%rsp), %r11 + adcq $0x00, %r13 shldq $0x01, %r11, %r13 andq %r12, %r11 movq $19, %rax @@ -6531,30 +6526,6 @@ L_curve25519_x64_3: adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 - movq %rcx, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) - # Add - movq 96(%rsp), %rcx - movq 104(%rsp), %r9 - addq (%rsp), %rcx - movq 112(%rsp), %r10 - adcq 8(%rsp), %r9 - movq 120(%rsp), %r11 - adcq 16(%rsp), %r10 - adcq 24(%rsp), %r11 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r11, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r11 - # Sub modulus (if overflow) - addq %rax, %rcx - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 movq %rcx, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) @@ -6691,9 +6662,7 @@ L_curve25519_x64_3: movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) - movq 160(%rsp), %r9 - decq %r9 - movq %r9, 160(%rsp) + decq 160(%rsp) jge L_curve25519_x64_3 # Invert leaq 32(%rsp), %rdi @@ -8598,14 +8567,13 @@ _ge_p2_dbl_x64: adcq 16(%rsi), %r11 movq %r12, %rbx adcq 24(%rsi), %r12 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r12, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r12 + movq $0x00, %r8 + adcq $0x00, %r8 + shldq $0x01, %r12, %r8 + imulq $19, %r8 + btr $63, %r12 # Sub modulus (if overflow) - addq %rax, %r9 + addq %r8, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 @@ -8614,12 +8582,12 @@ _ge_p2_dbl_x64: sbbq 8(%rsi), %r14 sbbq 16(%rsi), %r15 sbbq 24(%rsi), %rbx - sbbq %rax, %rax - shldq $0x01, %rbx, %rax - imulq $-19, %rax - andq %rdx, %rbx + sbbq %r8, %r8 + shldq $0x01, %rbx, %r8 + imulq $-19, %r8 + btr $63, %rbx # Add modulus (if underflow) - subq %rax, %r13 + subq %r8, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx @@ -8644,14 +8612,13 @@ _ge_p2_dbl_x64: movq 24(%rsi), %r12 adcq 16(%rcx), %r11 adcq 24(%rcx), %r12 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r12, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r12 + movq $0x00, %r8 + adcq $0x00, %r8 + 
shldq $0x01, %r12, %r8 + imulq $19, %r8 + btr $63, %r12 # Sub modulus (if overflow) - addq %rax, %r9 + addq %r8, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 @@ -8767,13 +8734,12 @@ _ge_p2_dbl_x64: sbbq 8(%rsi), %r10 sbbq 16(%rsi), %r11 sbbq 24(%rsi), %r12 - sbbq %rax, %rax - shldq $0x01, %r12, %rax - movq $0x7fffffffffffffff, %rdx - imulq $-19, %rax - andq %rdx, %r12 + sbbq %r8, %r8 + shldq $0x01, %r12, %r8 + imulq $-19, %r8 + btr $63, %r12 # Add modulus (if underflow) - subq %rax, %r9 + subq %r8, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 sbbq $0x00, %r12 @@ -8904,13 +8870,12 @@ _ge_p2_dbl_x64: sbbq 8(%rsi), %r10 sbbq 16(%rsi), %r11 sbbq 24(%rsi), %r12 - sbbq %rax, %rax - shldq $0x01, %r12, %rax - movq $0x7fffffffffffffff, %rdx - imulq $-19, %rax - andq %rdx, %r12 + sbbq %r8, %r8 + shldq $0x01, %r12, %r8 + imulq $-19, %r8 + btr $63, %r12 # Add modulus (if underflow) - subq %rax, %r9 + subq %r8, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 sbbq $0x00, %r12 @@ -8970,14 +8935,13 @@ _ge_madd_x64: adcq 16(%r8), %r12 movq %r13, %rbp adcq 24(%r8), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -8986,12 +8950,12 @@ _ge_madd_x64: sbbq 8(%r8), %r15 sbbq 16(%r8), %rbx sbbq 24(%r8), %rbp - sbbq %rax, %rax - shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -9412,14 +9376,13 @@ _ge_madd_x64: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -9428,12 +9391,12 @@ _ge_madd_x64: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %rax, %rax - shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -9455,14 +9418,13 @@ _ge_madd_x64: movq 24(%r8), %r13 adcq %r12, %r12 adcq %r13, %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -9479,14 +9441,13 @@ _ge_madd_x64: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -9495,12 +9456,12 @@ _ge_madd_x64: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %rax, %rax - 
shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -9565,14 +9526,13 @@ _ge_msub_x64: adcq 16(%r8), %r12 movq %r13, %rbp adcq 24(%r8), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -9581,12 +9541,12 @@ _ge_msub_x64: sbbq 8(%r8), %r15 sbbq 16(%r8), %rbx sbbq 24(%r8), %rbp - sbbq %rax, %rax - shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -10007,14 +9967,13 @@ _ge_msub_x64: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -10023,12 +9982,12 @@ _ge_msub_x64: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %rax, %rax - shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -10051,14 +10010,13 @@ _ge_msub_x64: movq 24(%r8), %r13 adcq %r12, %r12 adcq %r13, %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -10074,14 +10032,13 @@ _ge_msub_x64: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -10090,12 +10047,12 @@ _ge_msub_x64: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %rax, %rax - shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -10160,14 +10117,13 @@ _ge_add_x64: adcq 16(%r8), %r12 movq %r13, %rbp adcq 24(%r8), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -10176,12 +10132,12 @@ 
_ge_add_x64: sbbq 8(%r8), %r15 sbbq 16(%r8), %rbx sbbq 24(%r8), %rbp - sbbq %rax, %rax - shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -10603,14 +10559,13 @@ _ge_add_x64: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -10619,12 +10574,12 @@ _ge_add_x64: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %rax, %rax - shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -10772,14 +10727,13 @@ _ge_add_x64: adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -10795,14 +10749,13 @@ _ge_add_x64: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -10811,12 +10764,12 @@ _ge_add_x64: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %rax, %rax - shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -10881,14 +10834,13 @@ _ge_sub_x64: adcq 16(%r8), %r12 movq %r13, %rbp adcq 24(%r8), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -10897,12 +10849,12 @@ _ge_sub_x64: sbbq 8(%r8), %r15 sbbq 16(%r8), %rbx sbbq 24(%r8), %rbp - sbbq %rax, %rax - shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -11323,14 +11275,13 @@ _ge_sub_x64: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq 
%r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -11339,12 +11290,12 @@ _ge_sub_x64: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %rax, %rax - shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -11491,14 +11442,13 @@ _ge_sub_x64: adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -11515,14 +11465,13 @@ _ge_sub_x64: adcq 16(%rdi), %r12 movq %r13, %rbp adcq 24(%rdi), %r13 - movq $0x00, %rax - adcq $0x00, %rax - shldq $0x01, %r13, %rax - movq $0x7fffffffffffffff, %rdx - imulq $19, %rax - andq %rdx, %r13 + movq $0x00, %r9 + adcq $0x00, %r9 + shldq $0x01, %r13, %r9 + imulq $19, %r9 + btr $63, %r13 # Sub modulus (if overflow) - addq %rax, %r10 + addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -11531,12 +11480,12 @@ _ge_sub_x64: sbbq 8(%rdi), %r15 sbbq 16(%rdi), %rbx sbbq 24(%rdi), %rbp - sbbq %rax, %rax - shldq $0x01, %rbp, %rax - imulq $-19, %rax - andq %rdx, %rbp + sbbq %r9, %r9 + shldq $0x01, %rbp, %r9 + imulq $-19, %r9 + btr $63, %rbp # Add modulus (if underflow) - subq %rax, %r14 + subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -12173,61 +12122,291 @@ _sc_muladd_x64: #ifndef __APPLE__ .size sc_muladd_x64,.-sc_muladd_x64 #endif /* __APPLE__ */ -#endif /* HAVE_ED25519 */ -#ifdef HAVE_INTEL_AVX2 +/* Non-constant time modular inversion. + * + * @param [out] r Resulting number. + * @param [in] a Number to invert. + * @return MP_OKAY on success. 
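+ *
+ * Implementation note (descriptive, not from the original patch text): this
+ * is a binary extended GCD. A first pass reduces (u, v) = (2^255-19, a) with
+ * shifts and subtractions, logging each step on the stack; the log is then
+ * replayed modulo 2^255-19 on coefficient values b = 2^255-19 (= 0 mod p)
+ * and d = 1, so the value paired with whichever of u or v reached 1 is the
+ * inverse. Run time depends on a, so it must not be used on secret values.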
+ */ #ifndef __APPLE__ .text -.globl fe_cmov_table_avx2 -.type fe_cmov_table_avx2,@function +.globl fe_invert_nct_x64 +.type fe_invert_nct_x64,@function .align 16 -fe_cmov_table_avx2: +fe_invert_nct_x64: #else .section __TEXT,__text -.globl _fe_cmov_table_avx2 +.globl _fe_invert_nct_x64 .p2align 4 -_fe_cmov_table_avx2: +_fe_invert_nct_x64: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 - pushq %rbx - movq %rdx, %rcx - xor %rbx, %rbx - movsbq %cl, %rax - cdq - xorb %dl, %al - subb %dl, %al - movb %al, %bl - movd %ebx, %xmm7 - movq $0x01, %rbx - movd %rbx, %xmm9 - vmovdqa %ymm9, %ymm3 - vmovdqa %ymm9, %ymm4 - vpxor %ymm8, %ymm8, %ymm8 - vpermd %ymm7, %ymm8, %ymm7 - vpermd %ymm9, %ymm8, %ymm9 - vpxor %ymm0, %ymm0, %ymm0 - vpxor %ymm1, %ymm1, %ymm1 - vpxor %ymm2, %ymm2, %ymm2 - vpcmpeqd %ymm7, %ymm8, %ymm6 - vpxor %ymm5, %ymm5, %ymm5 - vpand %ymm6, %ymm3, %ymm3 - vpand %ymm6, %ymm4, %ymm4 - vmovdqa %ymm9, %ymm8 - vpcmpeqd %ymm7, %ymm8, %ymm6 - vpaddd %ymm9, %ymm8, %ymm8 - vmovupd (%rsi), %ymm0 - vmovupd 32(%rsi), %ymm1 - vmovupd 64(%rsi), %ymm2 - vpand %ymm6, %ymm0, %ymm0 - vpand %ymm6, %ymm1, %ymm1 - vpand %ymm6, %ymm2, %ymm2 - vpor %ymm0, %ymm3, %ymm3 - vpor %ymm1, %ymm4, %ymm4 - vpor %ymm2, %ymm5, %ymm5 - vpcmpeqd %ymm7, %ymm8, %ymm6 - vpaddd %ymm9, %ymm8, %ymm8 + subq $0x201, %rsp + movq $-19, %rcx + movq $-1, %r8 + movq $-1, %r9 + movq $0x7fffffffffffffff, %r10 + movq (%rsi), %r11 + movq 8(%rsi), %r12 + movq 16(%rsi), %r13 + movq 24(%rsi), %r14 + movq $0x00, %r15 + testb $0x01, %r11b + jnz fe_invert_nct_v_even_end +fe_invert_nct_v_even_start: + shrdq $0x01, %r12, %r11 + shrdq $0x01, %r13, %r12 + shrdq $0x01, %r14, %r13 + shrq $0x01, %r14 + movb $0x01, (%rsp,%r15,1) + incq %r15 + testb $0x01, %r11b + jz fe_invert_nct_v_even_start +fe_invert_nct_v_even_end: +L_fe_invert_nct_uv_start: + cmpq %r14, %r10 + jb L_fe_invert_nct_uv_v + ja L_fe_invert_nct_uv_u + cmpq %r13, %r9 + jb L_fe_invert_nct_uv_v + ja L_fe_invert_nct_uv_u + cmpq %r12, %r8 + jb L_fe_invert_nct_uv_v + ja L_fe_invert_nct_uv_u + cmpq %r11, %rcx + jb L_fe_invert_nct_uv_v +L_fe_invert_nct_uv_u: + movb $2, (%rsp,%r15,1) + incq %r15 + subq %r11, %rcx + sbbq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + shrdq $0x01, %r8, %rcx + shrdq $0x01, %r9, %r8 + shrdq $0x01, %r10, %r9 + shrq $0x01, %r10 + testb $0x01, %cl + jnz fe_invert_nct_usubv_even_end +fe_invert_nct_usubv_even_start: + shrdq $0x01, %r8, %rcx + shrdq $0x01, %r9, %r8 + shrdq $0x01, %r10, %r9 + shrq $0x01, %r10 + movb $0x00, (%rsp,%r15,1) + incq %r15 + testb $0x01, %cl + jz fe_invert_nct_usubv_even_start +fe_invert_nct_usubv_even_end: + cmpq $0x01, %rcx + jne L_fe_invert_nct_uv_start + movq %r8, %rdx + orq %r9, %rdx + jne L_fe_invert_nct_uv_start + orq %r10, %rdx + jne L_fe_invert_nct_uv_start + movb $0x01, %al + jmp L_fe_invert_nct_uv_end +L_fe_invert_nct_uv_v: + movb $3, (%rsp,%r15,1) + incq %r15 + subq %rcx, %r11 + sbbq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + shrdq $0x01, %r12, %r11 + shrdq $0x01, %r13, %r12 + shrdq $0x01, %r14, %r13 + shrq $0x01, %r14 + testb $0x01, %r11b + jnz fe_invert_nct_vsubu_even_end +fe_invert_nct_vsubu_even_start: + shrdq $0x01, %r12, %r11 + shrdq $0x01, %r13, %r12 + shrdq $0x01, %r14, %r13 + shrq $0x01, %r14 + movb $0x01, (%rsp,%r15,1) + incq %r15 + testb $0x01, %r11b + jz fe_invert_nct_vsubu_even_start +fe_invert_nct_vsubu_even_end: + cmpq $0x01, %r11 + jne L_fe_invert_nct_uv_start + movq %r12, %rdx + orq %r13, %rdx + jne L_fe_invert_nct_uv_start + orq %r14, %rdx + jne L_fe_invert_nct_uv_start + movb $0x00, %al 
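+    # %al = 1 when u reached 1 (inverse ends up in b), 0 when v did (inverse in d).
+    # Phase 2: replay the recorded shift/subtract log forward on
+    # b = 2^255-19 (= 0 mod p) and d = 1, keeping b*a = u and d*a = v (mod p).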
+L_fe_invert_nct_uv_end: + movq $-19, %rcx + movq $-1, %r8 + movq $-1, %r9 + movq $0x7fffffffffffffff, %r10 + movq $0x01, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + movb $7, (%rsp,%r15,1) + movb (%rsp), %dl + movq $0x01, %r15 + cmpb $0x01, %dl + je L_fe_invert_nct_op_div2_d + jl L_fe_invert_nct_op_div2_b + cmpb $3, %dl + je L_fe_invert_nct_op_d_sub_b + jl L_fe_invert_nct_op_b_sub_d + jmp L_fe_invert_nct_op_end +L_fe_invert_nct_op_b_sub_d: + subq %r11, %rcx + sbbq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + jnc L_fe_invert_nct_op_div2_b + movq $-1, %rdx + addq $-19, %rcx + adcq %rdx, %r8 + adcq %rdx, %r9 + movq $0x7fffffffffffffff, %rdx + adcq %rdx, %r10 +L_fe_invert_nct_op_div2_b: + testb $0x01, %cl + jz L_fe_invert_nct_op_div2_b_mod + addq $-19, %rcx + movq $-1, %rdx + adcq %rdx, %r8 + adcq %rdx, %r9 + movq $0x7fffffffffffffff, %rdx + adcq %rdx, %r10 +L_fe_invert_nct_op_div2_b_mod: + shrdq $0x01, %r8, %rcx + shrdq $0x01, %r9, %r8 + shrdq $0x01, %r10, %r9 + shrq $0x01, %r10 + movb (%rsp,%r15,1), %dl + incq %r15 + cmpb $0x01, %dl + je L_fe_invert_nct_op_div2_d + jl L_fe_invert_nct_op_div2_b + cmpb $3, %dl + je L_fe_invert_nct_op_d_sub_b + jl L_fe_invert_nct_op_b_sub_d + jmp L_fe_invert_nct_op_end +L_fe_invert_nct_op_d_sub_b: + subq %rcx, %r11 + sbbq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + jnc L_fe_invert_nct_op_div2_d + movq $-1, %rdx + addq $-19, %r11 + adcq %rdx, %r12 + adcq %rdx, %r13 + movq $0x7fffffffffffffff, %rdx + adcq %rdx, %r14 +L_fe_invert_nct_op_div2_d: + testb $0x01, %r11b + jz L_fe_invert_nct_op_div2_d_mod + addq $-19, %r11 + movq $-1, %rdx + adcq %rdx, %r12 + adcq %rdx, %r13 + movq $0x7fffffffffffffff, %rdx + adcq %rdx, %r14 +L_fe_invert_nct_op_div2_d_mod: + shrdq $0x01, %r12, %r11 + shrdq $0x01, %r13, %r12 + shrdq $0x01, %r14, %r13 + shrq $0x01, %r14 + movb (%rsp,%r15,1), %dl + incq %r15 + cmpb $0x01, %dl + je L_fe_invert_nct_op_div2_d + jl L_fe_invert_nct_op_div2_b + cmpb $3, %dl + je L_fe_invert_nct_op_d_sub_b + jl L_fe_invert_nct_op_b_sub_d +L_fe_invert_nct_op_end: + cmpb $0x01, %al + jne L_fe_invert_nct_store_d + movq %rcx, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + jmp L_fe_invert_nct_store_end +L_fe_invert_nct_store_d: + movq %r11, (%rdi) + movq %r12, 8(%rdi) + movq %r13, 16(%rdi) + movq %r14, 24(%rdi) +L_fe_invert_nct_store_end: + addq $0x201, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size fe_invert_nct_x64,.-fe_invert_nct_x64 +#endif /* __APPLE__ */ +#endif /* HAVE_ED25519 */ +#ifdef HAVE_INTEL_AVX2 +#ifndef __APPLE__ +.text +.globl fe_cmov_table_avx2 +.type fe_cmov_table_avx2,@function +.align 16 +fe_cmov_table_avx2: +#else +.section __TEXT,__text +.globl _fe_cmov_table_avx2 +.p2align 4 +_fe_cmov_table_avx2: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %rcx + xor %rbx, %rbx + movsbq %cl, %rax + cdq + xorb %dl, %al + subb %dl, %al + movb %al, %bl + movd %ebx, %xmm7 + movq $0x01, %rbx + movd %rbx, %xmm9 + vmovdqa %ymm9, %ymm3 + vmovdqa %ymm9, %ymm4 + vpxor %ymm8, %ymm8, %ymm8 + vpermd %ymm7, %ymm8, %ymm7 + vpermd %ymm9, %ymm8, %ymm9 + vpxor %ymm0, %ymm0, %ymm0 + vpxor %ymm1, %ymm1, %ymm1 + vpxor %ymm2, %ymm2, %ymm2 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpxor %ymm5, %ymm5, %ymm5 + vpand %ymm6, %ymm3, %ymm3 + vpand %ymm6, %ymm4, %ymm4 + vmovdqa %ymm9, %ymm8 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpaddd %ymm9, %ymm8, %ymm8 + vmovupd (%rsi), %ymm0 + vmovupd 32(%rsi), %ymm1 + vmovupd 64(%rsi), %ymm2 + vpand %ymm6, %ymm0, %ymm0 + 
vpand %ymm6, %ymm1, %ymm1 + vpand %ymm6, %ymm2, %ymm2 + vpor %ymm0, %ymm3, %ymm3 + vpor %ymm1, %ymm4, %ymm4 + vpor %ymm2, %ymm5, %ymm5 + vpcmpeqd %ymm7, %ymm8, %ymm6 + vpaddd %ymm9, %ymm8, %ymm8 vmovupd 96(%rsi), %ymm0 vmovupd 128(%rsi), %ymm1 vmovupd 160(%rsi), %ymm2 @@ -12358,16 +12537,18 @@ fe_mul_avx2: .p2align 4 _fe_mul_avx2: #endif /* __APPLE__ */ + pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - pushq %rbx - movq %rdx, %rbx + pushq %rbp + movq %rdx, %rbp + movq (%rsi), %rbx # Multiply # A[0] * B[0] - movq (%rbx), %rdx - mulxq (%rsi), %r8, %r9 + movq (%rbp), %rdx + mulxq %rbx, %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] @@ -12375,66 +12556,66 @@ _fe_mul_avx2: xorq %r15, %r15 adcxq %rax, %r9 # A[3] * B[1] - movq 8(%rbx), %rdx + movq 8(%rbp), %rdx mulxq 24(%rsi), %r12, %r13 adcxq %rcx, %r10 # A[0] * B[1] - mulxq (%rsi), %rax, %rcx + mulxq %rbx, %rax, %rcx adoxq %rax, %r9 # A[2] * B[1] mulxq 16(%rsi), %rax, %r14 adoxq %rcx, %r10 adcxq %rax, %r11 # A[1] * B[2] - movq 16(%rbx), %rdx + movq 16(%rbp), %rdx mulxq 8(%rsi), %rax, %rcx adcxq %r14, %r12 adoxq %rax, %r11 adcxq %r15, %r13 adoxq %rcx, %r12 # A[0] * B[2] - mulxq (%rsi), %rax, %rcx + mulxq %rbx, %rax, %rcx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rax, %r10 # A[1] * B[1] - movq 8(%rbx), %rdx + movq 8(%rbp), %rdx mulxq 8(%rsi), %rdx, %rax adcxq %rcx, %r11 adoxq %rdx, %r10 # A[1] * B[3] - movq 24(%rbx), %rdx + movq 24(%rbp), %rdx adoxq %rax, %r11 mulxq 8(%rsi), %rax, %rcx adcxq %rax, %r12 # A[2] * B[2] - movq 16(%rbx), %rdx + movq 16(%rbp), %rdx mulxq 16(%rsi), %rdx, %rax adcxq %rcx, %r13 adoxq %rdx, %r12 # A[3] * B[3] - movq 24(%rbx), %rdx + movq 24(%rbp), %rdx adoxq %rax, %r13 mulxq 24(%rsi), %rax, %rcx adoxq %r15, %r14 adcxq %rax, %r14 # A[0] * B[3] - mulxq (%rsi), %rdx, %rax + mulxq %rbx, %rdx, %rax adcxq %rcx, %r15 xorq %rcx, %rcx adcxq %rdx, %r11 # A[3] * B[0] movq 24(%rsi), %rdx adcxq %rax, %r12 - mulxq (%rbx), %rdx, %rax + mulxq (%rbp), %rdx, %rax adoxq %rdx, %r11 adoxq %rax, %r12 # A[3] * B[2] movq 24(%rsi), %rdx - mulxq 16(%rbx), %rdx, %rax + mulxq 16(%rbp), %rdx, %rax adcxq %rdx, %r13 # A[2] * B[3] - movq 24(%rbx), %rdx + movq 24(%rbp), %rdx adcxq %rax, %r14 mulxq 16(%rsi), %rax, %rdx adcxq %rcx, %r15 @@ -12475,11 +12656,12 @@ _fe_mul_avx2: movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) - popq %rbx + popq %rbp popq %r15 popq %r14 popq %r13 popq %r12 + popq %rbx repz retq #ifndef __APPLE__ .size fe_mul_avx2,.-fe_mul_avx2 @@ -12729,14 +12911,13 @@ _fe_mul121666_avx2: mulxq (%rsi), %rax, %r13 mulxq 8(%rsi), %rcx, %r12 mulxq 16(%rsi), %r8, %r11 - mulxq 24(%rsi), %r9, %r10 addq %r13, %rcx + mulxq 24(%rsi), %r9, %r10 adcq %r12, %r8 adcq %r11, %r9 adcq $0x00, %r10 - movq $0x7fffffffffffffff, %r13 shldq $0x01, %r9, %r10 - andq %r13, %r9 + btr $63, %r9 imulq $19, %r10, %r10 addq %r10, %rax adcq $0x00, %rcx @@ -13008,7 +13189,7 @@ _fe_invert_avx2: movq 128(%rsp), %rdi addq $0x90, %rsp repz retq -#if !defined(HAVE_ED25519) && !defined(WOLFSSL_CURVE25519_USE_ED25519) +#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519) #ifndef __APPLE__ .data #else @@ -13083,43 +13264,42 @@ L_curve25519_base_avx2_bits: movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 + movq (%rsp), %r12 + movq 8(%rsp), %r13 + movq 16(%rsp), %r14 + movq 24(%rsp), %r15 xorq 64(%rsp), %r8 xorq 72(%rsp), %r9 xorq 80(%rsp), %r10 xorq 88(%rsp), %r11 + xorq 32(%rsp), %r12 + xorq 40(%rsp), %r13 + xorq 48(%rsp), %r14 + xorq 56(%rsp), %r15 andq %rax, %r8 andq %rax, %r9 andq %rax, %r10 andq %rax, %r11 + andq %rax, %r12 + andq %rax, %r13 + 
andq %rax, %r14 + andq %rax, %r15 xorq %r8, (%rdi) xorq %r9, 8(%rdi) xorq %r10, 16(%rdi) xorq %r11, 24(%rdi) + xorq %r12, (%rsp) + xorq %r13, 8(%rsp) + xorq %r14, 16(%rsp) + xorq %r15, 24(%rsp) xorq %r8, 64(%rsp) xorq %r9, 72(%rsp) xorq %r10, 80(%rsp) xorq %r11, 88(%rsp) - # Conditional Swap - movq (%rsp), %r8 - movq 8(%rsp), %r9 - movq 16(%rsp), %r10 - movq 24(%rsp), %r11 - xorq 32(%rsp), %r8 - xorq 40(%rsp), %r9 - xorq 48(%rsp), %r10 - xorq 56(%rsp), %r11 - andq %rax, %r8 - andq %rax, %r9 - andq %rax, %r10 - andq %rax, %r11 - xorq %r8, (%rsp) - xorq %r9, 8(%rsp) - xorq %r10, 16(%rsp) - xorq %r11, 24(%rsp) - xorq %r8, 32(%rsp) - xorq %r9, 40(%rsp) - xorq %r10, 48(%rsp) - xorq %r11, 56(%rsp) + xorq %r12, 32(%rsp) + xorq %r13, 40(%rsp) + xorq %r14, 48(%rsp) + xorq %r15, 56(%rsp) movq %rbx, 168(%rsp) # Add-Sub # Add @@ -13135,14 +13315,13 @@ L_curve25519_base_avx2_bits: adcq 16(%rsp), %r10 movq %r11, %r15 adcq 24(%rsp), %r11 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r11, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r11 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r11, %rbx + imulq $19, %rbx + btr $63, %r11 # Sub modulus (if overflow) - addq %rcx, %r8 + addq %rbx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 @@ -13151,12 +13330,12 @@ L_curve25519_base_avx2_bits: sbbq 8(%rsp), %r13 sbbq 16(%rsp), %r14 sbbq 24(%rsp), %r15 - sbbq %rcx, %rcx - shldq $0x01, %r15, %rcx - imulq $-19, %rcx - andq %rbx, %r15 + sbbq %rbx, %rbx + shldq $0x01, %r15, %rbx + imulq $-19, %rbx + btr $63, %r15 # Add modulus (if underflow) - subq %rcx, %r12 + subq %rbx, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 @@ -13182,14 +13361,13 @@ L_curve25519_base_avx2_bits: adcq 48(%rsp), %r10 movq %r11, %r15 adcq 56(%rsp), %r11 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r11, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r11 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r11, %rbx + imulq $19, %rbx + btr $63, %r11 # Sub modulus (if overflow) - addq %rcx, %r8 + addq %rbx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 @@ -13198,12 +13376,12 @@ L_curve25519_base_avx2_bits: sbbq 40(%rsp), %r13 sbbq 48(%rsp), %r14 sbbq 56(%rsp), %r15 - sbbq %rcx, %rcx - shldq $0x01, %r15, %rcx - imulq $-19, %rcx - andq %rbx, %r15 + sbbq %rbx, %rbx + shldq $0x01, %r15, %rbx + imulq $-19, %rbx + btr $63, %r15 # Add modulus (if underflow) - subq %rcx, %r12 + subq %rbx, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 @@ -13215,10 +13393,11 @@ L_curve25519_base_avx2_bits: movq %r13, 104(%rsp) movq %r14, 112(%rsp) movq %r15, 120(%rsp) + movq 32(%rsp), %rax # Multiply # A[0] * B[0] movq 128(%rsp), %rdx - mulxq 32(%rsp), %r8, %r9 + mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 48(%rsp), %r10, %r11 # A[1] * B[0] @@ -13230,7 +13409,7 @@ L_curve25519_base_avx2_bits: mulxq 56(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] - mulxq 32(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 48(%rsp), %rcx, %r14 @@ -13244,7 +13423,7 @@ L_curve25519_base_avx2_bits: adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] - mulxq 32(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 @@ -13270,7 +13449,7 @@ L_curve25519_base_avx2_bits: adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] - mulxq 32(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 @@ -13317,10 +13496,11 @@ L_curve25519_base_avx2_bits: movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) + 
movq 96(%rsp), %rax # Multiply # A[0] * B[0] movq (%rdi), %rdx - mulxq 96(%rsp), %r8, %r9 + mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 112(%rsp), %r10, %r11 # A[1] * B[0] @@ -13332,7 +13512,7 @@ L_curve25519_base_avx2_bits: mulxq 120(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] - mulxq 96(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 112(%rsp), %rcx, %r14 @@ -13346,7 +13526,7 @@ L_curve25519_base_avx2_bits: adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] - mulxq 96(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 @@ -13372,7 +13552,7 @@ L_curve25519_base_avx2_bits: adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] - mulxq 96(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 @@ -13593,14 +13773,13 @@ L_curve25519_base_avx2_bits: adcq 48(%rsp), %r10 movq %r11, %r15 adcq 56(%rsp), %r11 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r11, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r11 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r11, %rbx + imulq $19, %rbx + btr $63, %r11 # Sub modulus (if overflow) - addq %rcx, %r8 + addq %rbx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 @@ -13609,12 +13788,12 @@ L_curve25519_base_avx2_bits: sbbq 40(%rsp), %r13 sbbq 48(%rsp), %r14 sbbq 56(%rsp), %r15 - sbbq %rcx, %rcx - shldq $0x01, %r15, %rcx - imulq $-19, %rcx - andq %rbx, %r15 + sbbq %rbx, %rbx + shldq $0x01, %r15, %rbx + imulq $-19, %rbx + btr $63, %r15 # Add modulus (if underflow) - subq %rcx, %r12 + subq %rbx, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 @@ -13626,10 +13805,11 @@ L_curve25519_base_avx2_bits: movq %r13, 40(%rsp) movq %r14, 48(%rsp) movq %r15, 56(%rsp) + movq 128(%rsp), %rax # Multiply # A[0] * B[0] movq 96(%rsp), %rdx - mulxq 128(%rsp), %r8, %r9 + mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 144(%rsp), %r10, %r11 # A[1] * B[0] @@ -13641,7 +13821,7 @@ L_curve25519_base_avx2_bits: mulxq 152(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r14 @@ -13655,7 +13835,7 @@ L_curve25519_base_avx2_bits: adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 @@ -13681,7 +13861,7 @@ L_curve25519_base_avx2_bits: adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] - mulxq 128(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 @@ -13737,13 +13917,12 @@ L_curve25519_base_avx2_bits: sbbq 104(%rsp), %r9 sbbq 112(%rsp), %r10 sbbq 120(%rsp), %r11 - sbbq %rcx, %rcx - shldq $0x01, %r11, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $-19, %rcx - andq %rbx, %r11 + sbbq %rbx, %rbx + shldq $0x01, %r11, %rbx + imulq $-19, %rbx + btr $63, %r11 # Add modulus (if underflow) - subq %rcx, %r8 + subq %rbx, %r8 sbbq $0x00, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 @@ -13831,27 +14010,6 @@ L_curve25519_base_avx2_bits: movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) - movq $0x1db42, %rdx - mulxq 128(%rsp), %r8, %r15 - mulxq 136(%rsp), %r9, %r14 - mulxq 144(%rsp), %r10, %r13 - mulxq 152(%rsp), %r11, %r12 - addq %r15, %r9 - adcq %r14, %r10 - adcq %r13, %r11 - adcq $0x00, %r12 - movq $0x7fffffffffffffff, %r15 - shldq $0x01, %r11, %r12 - andq %r15, %r11 - imulq $19, %r12, %r12 - addq %r12, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 
16(%rsp) - movq %r11, 24(%rsp) # Square movq 64(%rsp), %rdx movq 72(%rsp), %rax @@ -13932,23 +14090,24 @@ L_curve25519_base_avx2_bits: movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) - # Add - movq 96(%rsp), %r8 - movq 104(%rsp), %r9 - addq (%rsp), %r8 - movq 112(%rsp), %r10 - adcq 8(%rsp), %r9 - movq 120(%rsp), %r11 - adcq 16(%rsp), %r10 - adcq 24(%rsp), %r11 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r11, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r11 - # Sub modulus (if overflow) - addq %rcx, %r8 + movq $0x1db42, %rdx + mulxq 128(%rsp), %r8, %r15 + mulxq 136(%rsp), %r9, %r14 + mulxq 144(%rsp), %r10, %r13 + addq %r15, %r9 + mulxq 152(%rsp), %r11, %r12 + adcq %r14, %r10 + adcq %r13, %r11 + adcq $0x00, %r12 + addq 96(%rsp), %r8 + adcq 104(%rsp), %r9 + adcq 112(%rsp), %r10 + adcq 120(%rsp), %r11 + adcq $0x00, %r12 + shldq $0x01, %r11, %r12 + btr $63, %r11 + imulq $19, %r12, %r12 + addq %r12, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 @@ -13960,14 +14119,13 @@ L_curve25519_base_avx2_bits: mulxq 32(%rsp), %r8, %r15 mulxq 40(%rsp), %r9, %r14 mulxq 48(%rsp), %r10, %r13 - mulxq 56(%rsp), %r11, %r12 addq %r15, %r9 + mulxq 56(%rsp), %r11, %r12 adcq %r14, %r10 adcq %r13, %r11 adcq $0x00, %r12 - movq $0x7fffffffffffffff, %r15 shldq $0x01, %r11, %r12 - andq %r15, %r11 + btr $63, %r11 imulq $19, %r12, %r12 addq %r12, %r8 adcq $0x00, %r9 @@ -13977,10 +14135,11 @@ L_curve25519_base_avx2_bits: movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) + movq 128(%rsp), %rax # Multiply # A[0] * B[0] movq 96(%rsp), %rdx - mulxq 128(%rsp), %r8, %r9 + mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 144(%rsp), %r10, %r11 # A[1] * B[0] @@ -13992,7 +14151,7 @@ L_curve25519_base_avx2_bits: mulxq 152(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r14 @@ -14006,7 +14165,7 @@ L_curve25519_base_avx2_bits: adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 @@ -14032,7 +14191,7 @@ L_curve25519_base_avx2_bits: adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] - mulxq 128(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 @@ -14089,43 +14248,42 @@ L_curve25519_base_avx2_bits: movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 + movq (%rsp), %r12 + movq 8(%rsp), %r13 + movq 16(%rsp), %r14 + movq 24(%rsp), %r15 xorq 64(%rsp), %r8 xorq 72(%rsp), %r9 xorq 80(%rsp), %r10 xorq 88(%rsp), %r11 + xorq 32(%rsp), %r12 + xorq 40(%rsp), %r13 + xorq 48(%rsp), %r14 + xorq 56(%rsp), %r15 andq %rax, %r8 andq %rax, %r9 andq %rax, %r10 andq %rax, %r11 + andq %rax, %r12 + andq %rax, %r13 + andq %rax, %r14 + andq %rax, %r15 xorq %r8, (%rdi) xorq %r9, 8(%rdi) xorq %r10, 16(%rdi) xorq %r11, 24(%rdi) + xorq %r12, (%rsp) + xorq %r13, 8(%rsp) + xorq %r14, 16(%rsp) + xorq %r15, 24(%rsp) xorq %r8, 64(%rsp) xorq %r9, 72(%rsp) xorq %r10, 80(%rsp) xorq %r11, 88(%rsp) - # Conditional Swap - movq (%rsp), %r8 - movq 8(%rsp), %r9 - movq 16(%rsp), %r10 - movq 24(%rsp), %r11 - xorq 32(%rsp), %r8 - xorq 40(%rsp), %r9 - xorq 48(%rsp), %r10 - xorq 56(%rsp), %r11 - andq %rax, %r8 - andq %rax, %r9 - andq %rax, %r10 - andq %rax, %r11 - xorq %r8, (%rsp) - xorq %r9, 8(%rsp) - xorq %r10, 16(%rsp) - xorq %r11, 24(%rsp) - xorq %r8, 32(%rsp) - xorq %r9, 40(%rsp) - xorq %r10, 48(%rsp) - xorq %r11, 56(%rsp) + xorq %r12, 32(%rsp) + xorq %r13, 40(%rsp) + xorq %r14, 
48(%rsp) + xorq %r15, 56(%rsp) L_curve25519_base_avx2_last_3: # Add-Sub # Add @@ -14141,14 +14299,13 @@ L_curve25519_base_avx2_last_3: adcq 16(%rsp), %r10 movq %r11, %r15 adcq 24(%rsp), %r11 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r11, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r11 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r11, %rbx + imulq $19, %rbx + btr $63, %r11 # Sub modulus (if overflow) - addq %rcx, %r8 + addq %rbx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 @@ -14157,12 +14314,12 @@ L_curve25519_base_avx2_last_3: sbbq 8(%rsp), %r13 sbbq 16(%rsp), %r14 sbbq 24(%rsp), %r15 - sbbq %rcx, %rcx - shldq $0x01, %r15, %rcx - imulq $-19, %rcx - andq %rbx, %r15 + sbbq %rbx, %rbx + shldq $0x01, %r15, %rbx + imulq $-19, %rbx + btr $63, %r15 # Add modulus (if underflow) - subq %rcx, %r12 + subq %rbx, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 @@ -14334,10 +14491,11 @@ L_curve25519_base_avx2_last_3: movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) + movq 128(%rsp), %rax # Multiply # A[0] * B[0] movq 96(%rsp), %rdx - mulxq 128(%rsp), %r8, %r9 + mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 144(%rsp), %r10, %r11 # A[1] * B[0] @@ -14349,7 +14507,7 @@ L_curve25519_base_avx2_last_3: mulxq 152(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r14 @@ -14363,7 +14521,7 @@ L_curve25519_base_avx2_last_3: adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 @@ -14389,7 +14547,7 @@ L_curve25519_base_avx2_last_3: adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] - mulxq 128(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 @@ -14445,13 +14603,12 @@ L_curve25519_base_avx2_last_3: sbbq 104(%rsp), %r9 sbbq 112(%rsp), %r10 sbbq 120(%rsp), %r11 - sbbq %rcx, %rcx - shldq $0x01, %r11, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $-19, %rcx - andq %rbx, %r11 + sbbq %rbx, %rbx + shldq $0x01, %r11, %rbx + imulq $-19, %rbx + btr $63, %r11 # Add modulus (if underflow) - subq %rcx, %r8 + subq %rbx, %r8 sbbq $0x00, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 @@ -14463,51 +14620,32 @@ L_curve25519_base_avx2_last_3: mulxq 128(%rsp), %r8, %r15 mulxq 136(%rsp), %r9, %r14 mulxq 144(%rsp), %r10, %r13 - mulxq 152(%rsp), %r11, %r12 addq %r15, %r9 + mulxq 152(%rsp), %r11, %r12 adcq %r14, %r10 adcq %r13, %r11 adcq $0x00, %r12 - movq $0x7fffffffffffffff, %r15 + addq 96(%rsp), %r8 + adcq 104(%rsp), %r9 + adcq 112(%rsp), %r10 + adcq 120(%rsp), %r11 + adcq $0x00, %r12 shldq $0x01, %r11, %r12 - andq %r15, %r11 + btr $63, %r11 imulq $19, %r12, %r12 addq %r12, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) - # Add - movq 96(%rsp), %r8 - movq 104(%rsp), %r9 - addq (%rsp), %r8 - movq 112(%rsp), %r10 - adcq 8(%rsp), %r9 - movq 120(%rsp), %r11 - adcq 16(%rsp), %r10 - adcq 24(%rsp), %r11 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r11, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r11 - # Sub modulus (if overflow) - addq %rcx, %r8 - adcq $0x00, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 movq %r8, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) + movq 128(%rsp), %rax # Multiply # A[0] * B[0] movq 96(%rsp), %rdx - mulxq 128(%rsp), %r8, %r9 + mulxq %rax, %r8, %r9 # A[2] * B[0] 
mulxq 144(%rsp), %r10, %r11 # A[1] * B[0] @@ -14519,7 +14657,7 @@ L_curve25519_base_avx2_last_3: mulxq 152(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r14 @@ -14533,7 +14671,7 @@ L_curve25519_base_avx2_last_3: adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 @@ -14559,7 +14697,7 @@ L_curve25519_base_avx2_last_3: adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] - mulxq 128(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 @@ -14846,10 +14984,11 @@ L_curve25519_base_avx2_last_3: callq _fe_mul_avx2 #endif /* __APPLE__ */ movq 160(%rsp), %rdi + movq (%rdi), %rax # Multiply # A[0] * B[0] movq (%rsp), %rdx - mulxq (%rdi), %r8, %r9 + mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 16(%rdi), %r10, %r11 # A[1] * B[0] @@ -14861,7 +15000,7 @@ L_curve25519_base_avx2_last_3: mulxq 24(%rdi), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] - mulxq (%rdi), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rdi), %rcx, %r14 @@ -14875,7 +15014,7 @@ L_curve25519_base_avx2_last_3: adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] - mulxq (%rdi), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 @@ -14901,7 +15040,7 @@ L_curve25519_base_avx2_last_3: adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] - mulxq (%rdi), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 @@ -14985,7 +15124,7 @@ L_curve25519_base_avx2_last_3: #ifndef __APPLE__ .size curve25519_base_avx2,.-curve25519_base_avx2 #endif /* __APPLE__ */ -#endif /* !HAVE_ED25519 && !WOLFSSL_CURVE25519_USE_ED25519 */ +#endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */ #ifndef __APPLE__ .text .globl curve25519_avx2 @@ -15034,59 +15173,58 @@ _curve25519_avx2: movq %r12, 88(%rsp) movq $0xfe, %rbx L_curve25519_avx2_bits: - movq 176(%rsp), %rax movq %rbx, 160(%rsp) movq %rbx, %rcx + movq 176(%rsp), %rax andq $63, %rcx shrq $6, %rbx movq (%rsi,%rbx,8), %rbx shrq %cl, %rbx andq $0x01, %rbx xorq %rbx, %rax + movq %rbx, 176(%rsp) negq %rax # Conditional Swap movq (%rdi), %r9 movq 8(%rdi), %r10 movq 16(%rdi), %r11 movq 24(%rdi), %r12 + movq (%rsp), %r13 + movq 8(%rsp), %r14 + movq 16(%rsp), %r15 + movq 24(%rsp), %rbp xorq 64(%rsp), %r9 xorq 72(%rsp), %r10 xorq 80(%rsp), %r11 xorq 88(%rsp), %r12 + xorq 32(%rsp), %r13 + xorq 40(%rsp), %r14 + xorq 48(%rsp), %r15 + xorq 56(%rsp), %rbp andq %rax, %r9 andq %rax, %r10 andq %rax, %r11 andq %rax, %r12 + andq %rax, %r13 + andq %rax, %r14 + andq %rax, %r15 + andq %rax, %rbp xorq %r9, (%rdi) xorq %r10, 8(%rdi) xorq %r11, 16(%rdi) xorq %r12, 24(%rdi) + xorq %r13, (%rsp) + xorq %r14, 8(%rsp) + xorq %r15, 16(%rsp) + xorq %rbp, 24(%rsp) xorq %r9, 64(%rsp) xorq %r10, 72(%rsp) xorq %r11, 80(%rsp) xorq %r12, 88(%rsp) - # Conditional Swap - movq (%rsp), %r9 - movq 8(%rsp), %r10 - movq 16(%rsp), %r11 - movq 24(%rsp), %r12 - xorq 32(%rsp), %r9 - xorq 40(%rsp), %r10 - xorq 48(%rsp), %r11 - xorq 56(%rsp), %r12 - andq %rax, %r9 - andq %rax, %r10 - andq %rax, %r11 - andq %rax, %r12 - xorq %r9, (%rsp) - xorq %r10, 8(%rsp) - xorq %r11, 16(%rsp) - xorq %r12, 24(%rsp) - xorq %r9, 32(%rsp) - xorq %r10, 40(%rsp) - xorq %r11, 48(%rsp) - xorq %r12, 56(%rsp) - movq %rbx, 176(%rsp) + xorq %r13, 32(%rsp) + xorq %r14, 40(%rsp) + xorq %r15, 48(%rsp) + xorq %rbp, 56(%rsp) # Add-Sub # Add movq (%rdi), %r9 @@ -15101,14 +15239,13 
@@ L_curve25519_avx2_bits: adcq 16(%rsp), %r11 movq %r12, %rbp adcq 24(%rsp), %r12 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r12, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r12 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r12, %rbx + imulq $19, %rbx + btr $63, %r12 # Sub modulus (if overflow) - addq %rcx, %r9 + addq %rbx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 @@ -15117,12 +15254,12 @@ L_curve25519_avx2_bits: sbbq 8(%rsp), %r14 sbbq 16(%rsp), %r15 sbbq 24(%rsp), %rbp - sbbq %rcx, %rcx - shldq $0x01, %rbp, %rcx - imulq $-19, %rcx - andq %rbx, %rbp + sbbq %rbx, %rbx + shldq $0x01, %rbp, %rbx + imulq $-19, %rbx + btr $63, %rbp # Add modulus (if underflow) - subq %rcx, %r13 + subq %rbx, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbp @@ -15148,14 +15285,13 @@ L_curve25519_avx2_bits: adcq 48(%rsp), %r11 movq %r12, %rbp adcq 56(%rsp), %r12 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r12, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r12 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r12, %rbx + imulq $19, %rbx + btr $63, %r12 # Sub modulus (if overflow) - addq %rcx, %r9 + addq %rbx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 @@ -15164,12 +15300,12 @@ L_curve25519_avx2_bits: sbbq 40(%rsp), %r14 sbbq 48(%rsp), %r15 sbbq 56(%rsp), %rbp - sbbq %rcx, %rcx - shldq $0x01, %rbp, %rcx - imulq $-19, %rcx - andq %rbx, %rbp + sbbq %rbx, %rbx + shldq $0x01, %rbp, %rbx + imulq $-19, %rbx + btr $63, %rbp # Add modulus (if underflow) - subq %rcx, %r13 + subq %rbx, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbp @@ -15181,10 +15317,11 @@ L_curve25519_avx2_bits: movq %r14, 104(%rsp) movq %r15, 112(%rsp) movq %rbp, 120(%rsp) + movq 32(%rsp), %rax # Multiply # A[0] * B[0] movq 128(%rsp), %rdx - mulxq 32(%rsp), %r9, %r10 + mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 48(%rsp), %r11, %r12 # A[1] * B[0] @@ -15196,7 +15333,7 @@ L_curve25519_avx2_bits: mulxq 56(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - mulxq 32(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 48(%rsp), %rcx, %r15 @@ -15210,7 +15347,7 @@ L_curve25519_avx2_bits: adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] - mulxq 32(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 @@ -15236,7 +15373,7 @@ L_curve25519_avx2_bits: adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] - mulxq 32(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 @@ -15283,10 +15420,11 @@ L_curve25519_avx2_bits: movq %r10, 40(%rsp) movq %r11, 48(%rsp) movq %r12, 56(%rsp) + movq 96(%rsp), %rax # Multiply # A[0] * B[0] movq (%rdi), %rdx - mulxq 96(%rsp), %r9, %r10 + mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 112(%rsp), %r11, %r12 # A[1] * B[0] @@ -15298,7 +15436,7 @@ L_curve25519_avx2_bits: mulxq 120(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - mulxq 96(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 112(%rsp), %rcx, %r15 @@ -15312,7 +15450,7 @@ L_curve25519_avx2_bits: adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] - mulxq 96(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 @@ -15338,7 +15476,7 @@ L_curve25519_avx2_bits: adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] - mulxq 96(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 @@ -15559,14 +15697,13 @@ L_curve25519_avx2_bits: adcq 48(%rsp), %r11 movq %r12, %rbp adcq 
56(%rsp), %r12 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r12, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r12 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r12, %rbx + imulq $19, %rbx + btr $63, %r12 # Sub modulus (if overflow) - addq %rcx, %r9 + addq %rbx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 @@ -15575,12 +15712,12 @@ L_curve25519_avx2_bits: sbbq 40(%rsp), %r14 sbbq 48(%rsp), %r15 sbbq 56(%rsp), %rbp - sbbq %rcx, %rcx - shldq $0x01, %rbp, %rcx - imulq $-19, %rcx - andq %rbx, %rbp + sbbq %rbx, %rbx + shldq $0x01, %rbp, %rbx + imulq $-19, %rbx + btr $63, %rbp # Add modulus (if underflow) - subq %rcx, %r13 + subq %rbx, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbp @@ -15592,10 +15729,11 @@ L_curve25519_avx2_bits: movq %r14, 40(%rsp) movq %r15, 48(%rsp) movq %rbp, 56(%rsp) + movq 128(%rsp), %rax # Multiply # A[0] * B[0] movq 96(%rsp), %rdx - mulxq 128(%rsp), %r9, %r10 + mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 144(%rsp), %r11, %r12 # A[1] * B[0] @@ -15607,7 +15745,7 @@ L_curve25519_avx2_bits: mulxq 152(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r15 @@ -15621,7 +15759,7 @@ L_curve25519_avx2_bits: adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 @@ -15647,7 +15785,7 @@ L_curve25519_avx2_bits: adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] - mulxq 128(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 @@ -15703,13 +15841,12 @@ L_curve25519_avx2_bits: sbbq 104(%rsp), %r10 sbbq 112(%rsp), %r11 sbbq 120(%rsp), %r12 - sbbq %rcx, %rcx - shldq $0x01, %r12, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $-19, %rcx - andq %rbx, %r12 + sbbq %rbx, %rbx + shldq $0x01, %r12, %rbx + imulq $-19, %rbx + btr $63, %r12 # Add modulus (if underflow) - subq %rcx, %r9 + subq %rbx, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 sbbq $0x00, %r12 @@ -15797,27 +15934,6 @@ L_curve25519_avx2_bits: movq %r10, 40(%rsp) movq %r11, 48(%rsp) movq %r12, 56(%rsp) - movq $0x1db42, %rdx - mulxq 128(%rsp), %r9, %rbp - mulxq 136(%rsp), %r10, %r15 - mulxq 144(%rsp), %r11, %r14 - mulxq 152(%rsp), %r12, %r13 - addq %rbp, %r10 - adcq %r15, %r11 - adcq %r14, %r12 - adcq $0x00, %r13 - movq $0x7fffffffffffffff, %rbp - shldq $0x01, %r12, %r13 - andq %rbp, %r12 - imulq $19, %r13, %r13 - addq %r13, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 - movq %r9, (%rsp) - movq %r10, 8(%rsp) - movq %r11, 16(%rsp) - movq %r12, 24(%rsp) # Square movq 64(%rsp), %rdx movq 72(%rsp), %rax @@ -15898,23 +16014,24 @@ L_curve25519_avx2_bits: movq %r10, 72(%rsp) movq %r11, 80(%rsp) movq %r12, 88(%rsp) - # Add - movq 96(%rsp), %r9 - movq 104(%rsp), %r10 - addq (%rsp), %r9 - movq 112(%rsp), %r11 - adcq 8(%rsp), %r10 - movq 120(%rsp), %r12 - adcq 16(%rsp), %r11 - adcq 24(%rsp), %r12 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r12, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r12 - # Sub modulus (if overflow) - addq %rcx, %r9 + movq $0x1db42, %rdx + mulxq 128(%rsp), %r9, %rbp + mulxq 136(%rsp), %r10, %r15 + mulxq 144(%rsp), %r11, %r14 + addq %rbp, %r10 + mulxq 152(%rsp), %r12, %r13 + adcq %r15, %r11 + adcq %r14, %r12 + adcq $0x00, %r13 + addq 96(%rsp), %r9 + adcq 104(%rsp), %r10 + adcq 112(%rsp), %r11 + adcq 120(%rsp), %r12 + adcq $0x00, %r13 + shldq $0x01, %r12, %r13 + btr $63, %r12 + imulq 
$19, %r13, %r13 + addq %r13, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 @@ -15922,10 +16039,11 @@ L_curve25519_avx2_bits: movq %r10, 104(%rsp) movq %r11, 112(%rsp) movq %r12, 120(%rsp) + movq (%r8), %rax # Multiply # A[0] * B[0] movq 32(%rsp), %rdx - mulxq (%r8), %r9, %r10 + mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 16(%r8), %r11, %r12 # A[1] * B[0] @@ -15937,7 +16055,7 @@ L_curve25519_avx2_bits: mulxq 24(%r8), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - mulxq (%r8), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 16(%r8), %rcx, %r15 @@ -15951,7 +16069,7 @@ L_curve25519_avx2_bits: adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] - mulxq (%r8), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 @@ -15977,7 +16095,7 @@ L_curve25519_avx2_bits: adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] - mulxq (%r8), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 @@ -16024,79 +16142,80 @@ L_curve25519_avx2_bits: movq %r10, 40(%rsp) movq %r11, 48(%rsp) movq %r12, 56(%rsp) + movq 96(%rsp), %rax # Multiply # A[0] * B[0] - movq 96(%rsp), %rdx - mulxq 128(%rsp), %r9, %r10 + movq 128(%rsp), %rdx + mulxq %rax, %r9, %r10 # A[2] * B[0] - mulxq 144(%rsp), %r11, %r12 + mulxq 112(%rsp), %r11, %r12 # A[1] * B[0] - mulxq 136(%rsp), %rcx, %rbx + mulxq 104(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[3] * B[1] - movq 104(%rsp), %rdx - mulxq 152(%rsp), %r13, %r14 + movq 136(%rsp), %rdx + mulxq 120(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] - mulxq 144(%rsp), %rcx, %r15 + mulxq 112(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] - movq 112(%rsp), %rdx - mulxq 136(%rsp), %rcx, %rbx + movq 144(%rsp), %rdx + mulxq 104(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] - movq 104(%rsp), %rdx - mulxq 136(%rsp), %rdx, %rcx + movq 136(%rsp), %rdx + mulxq 104(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[1] * B[3] - movq 120(%rsp), %rdx + movq 152(%rsp), %rdx adoxq %rcx, %r12 - mulxq 136(%rsp), %rcx, %rbx + mulxq 104(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] - movq 112(%rsp), %rdx - mulxq 144(%rsp), %rdx, %rcx + movq 144(%rsp), %rdx + mulxq 112(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] - movq 120(%rsp), %rdx + movq 152(%rsp), %rdx adoxq %rcx, %r14 - mulxq 152(%rsp), %rcx, %rbx + mulxq 120(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] - mulxq 128(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] - movq 152(%rsp), %rdx + movq 120(%rsp), %rdx adcxq %rcx, %r13 - mulxq 96(%rsp), %rdx, %rcx + mulxq 128(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[3] * B[2] - movq 152(%rsp), %rdx - mulxq 112(%rsp), %rdx, %rcx + movq 120(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[2] * B[3] - movq 120(%rsp), %rdx + movq 152(%rsp), %rdx adcxq %rcx, %r15 - mulxq 144(%rsp), %rcx, %rdx + mulxq 112(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 @@ -16130,7 +16249,7 @@ L_curve25519_avx2_bits: decq %rbx cmpq $3, %rbx jge L_curve25519_avx2_bits - movq %rbx, 160(%rsp) + movq $2, 160(%rsp) movq 176(%rsp), %rax negq %rax # Conditional Swap @@ -16138,43 +16257,42 @@ L_curve25519_avx2_bits: movq 8(%rdi), %r10 
movq 16(%rdi), %r11 movq 24(%rdi), %r12 + movq (%rsp), %r13 + movq 8(%rsp), %r14 + movq 16(%rsp), %r15 + movq 24(%rsp), %rbp xorq 64(%rsp), %r9 xorq 72(%rsp), %r10 xorq 80(%rsp), %r11 xorq 88(%rsp), %r12 + xorq 32(%rsp), %r13 + xorq 40(%rsp), %r14 + xorq 48(%rsp), %r15 + xorq 56(%rsp), %rbp andq %rax, %r9 andq %rax, %r10 andq %rax, %r11 andq %rax, %r12 + andq %rax, %r13 + andq %rax, %r14 + andq %rax, %r15 + andq %rax, %rbp xorq %r9, (%rdi) xorq %r10, 8(%rdi) xorq %r11, 16(%rdi) xorq %r12, 24(%rdi) + xorq %r13, (%rsp) + xorq %r14, 8(%rsp) + xorq %r15, 16(%rsp) + xorq %rbp, 24(%rsp) xorq %r9, 64(%rsp) xorq %r10, 72(%rsp) xorq %r11, 80(%rsp) xorq %r12, 88(%rsp) - # Conditional Swap - movq (%rsp), %r9 - movq 8(%rsp), %r10 - movq 16(%rsp), %r11 - movq 24(%rsp), %r12 - xorq 32(%rsp), %r9 - xorq 40(%rsp), %r10 - xorq 48(%rsp), %r11 - xorq 56(%rsp), %r12 - andq %rax, %r9 - andq %rax, %r10 - andq %rax, %r11 - andq %rax, %r12 - xorq %r9, (%rsp) - xorq %r10, 8(%rsp) - xorq %r11, 16(%rsp) - xorq %r12, 24(%rsp) - xorq %r9, 32(%rsp) - xorq %r10, 40(%rsp) - xorq %r11, 48(%rsp) - xorq %r12, 56(%rsp) + xorq %r13, 32(%rsp) + xorq %r14, 40(%rsp) + xorq %r15, 48(%rsp) + xorq %rbp, 56(%rsp) L_curve25519_avx2_last_3: # Add-Sub # Add @@ -16190,14 +16308,13 @@ L_curve25519_avx2_last_3: adcq 16(%rsp), %r11 movq %r12, %rbp adcq 24(%rsp), %r12 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r12, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r12 + movq $0x00, %rbx + adcq $0x00, %rbx + shldq $0x01, %r12, %rbx + imulq $19, %rbx + btr $63, %r12 # Sub modulus (if overflow) - addq %rcx, %r9 + addq %rbx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 @@ -16206,12 +16323,12 @@ L_curve25519_avx2_last_3: sbbq 8(%rsp), %r14 sbbq 16(%rsp), %r15 sbbq 24(%rsp), %rbp - sbbq %rcx, %rcx - shldq $0x01, %rbp, %rcx - imulq $-19, %rcx - andq %rbx, %rbp + sbbq %rbx, %rbx + shldq $0x01, %rbp, %rbx + imulq $-19, %rbx + btr $63, %rbp # Add modulus (if underflow) - subq %rcx, %r13 + subq %rbx, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbp @@ -16383,10 +16500,11 @@ L_curve25519_avx2_last_3: movq %r10, 136(%rsp) movq %r11, 144(%rsp) movq %r12, 152(%rsp) + movq 128(%rsp), %rax # Multiply # A[0] * B[0] movq 96(%rsp), %rdx - mulxq 128(%rsp), %r9, %r10 + mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 144(%rsp), %r11, %r12 # A[1] * B[0] @@ -16398,7 +16516,7 @@ L_curve25519_avx2_last_3: mulxq 152(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r15 @@ -16412,7 +16530,7 @@ L_curve25519_avx2_last_3: adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 @@ -16438,7 +16556,7 @@ L_curve25519_avx2_last_3: adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] - mulxq 128(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 @@ -16494,13 +16612,12 @@ L_curve25519_avx2_last_3: sbbq 104(%rsp), %r10 sbbq 112(%rsp), %r11 sbbq 120(%rsp), %r12 - sbbq %rcx, %rcx - shldq $0x01, %r12, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $-19, %rcx - andq %rbx, %r12 + sbbq %rbx, %rbx + shldq $0x01, %r12, %rbx + imulq $-19, %rbx + btr $63, %r12 # Add modulus (if underflow) - subq %rcx, %r9 + subq %rbx, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 sbbq $0x00, %r12 @@ -16512,120 +16629,101 @@ L_curve25519_avx2_last_3: mulxq 128(%rsp), %r9, %rbp mulxq 136(%rsp), %r10, %r15 mulxq 144(%rsp), %r11, %r14 - 
mulxq 152(%rsp), %r12, %r13 addq %rbp, %r10 + mulxq 152(%rsp), %r12, %r13 adcq %r15, %r11 adcq %r14, %r12 adcq $0x00, %r13 - movq $0x7fffffffffffffff, %rbp + addq 96(%rsp), %r9 + adcq 104(%rsp), %r10 + adcq 112(%rsp), %r11 + adcq 120(%rsp), %r12 + adcq $0x00, %r13 shldq $0x01, %r12, %r13 - andq %rbp, %r12 + btr $63, %r12 imulq $19, %r13, %r13 addq %r13, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 - movq %r9, (%rsp) - movq %r10, 8(%rsp) - movq %r11, 16(%rsp) - movq %r12, 24(%rsp) - # Add - movq 96(%rsp), %r9 - movq 104(%rsp), %r10 - addq (%rsp), %r9 - movq 112(%rsp), %r11 - adcq 8(%rsp), %r10 - movq 120(%rsp), %r12 - adcq 16(%rsp), %r11 - adcq 24(%rsp), %r12 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r12, %rcx - movq $0x7fffffffffffffff, %rbx - imulq $19, %rcx - andq %rbx, %r12 - # Sub modulus (if overflow) - addq %rcx, %r9 - adcq $0x00, %r10 - adcq $0x00, %r11 - adcq $0x00, %r12 movq %r9, 96(%rsp) movq %r10, 104(%rsp) movq %r11, 112(%rsp) movq %r12, 120(%rsp) + movq 96(%rsp), %rax # Multiply # A[0] * B[0] - movq 96(%rsp), %rdx - mulxq 128(%rsp), %r9, %r10 + movq 128(%rsp), %rdx + mulxq %rax, %r9, %r10 # A[2] * B[0] - mulxq 144(%rsp), %r11, %r12 + mulxq 112(%rsp), %r11, %r12 # A[1] * B[0] - mulxq 136(%rsp), %rcx, %rbx + mulxq 104(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[3] * B[1] - movq 104(%rsp), %rdx - mulxq 152(%rsp), %r13, %r14 + movq 136(%rsp), %rdx + mulxq 120(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] - mulxq 144(%rsp), %rcx, %r15 + mulxq 112(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] - movq 112(%rsp), %rdx - mulxq 136(%rsp), %rcx, %rbx + movq 144(%rsp), %rdx + mulxq 104(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] - mulxq 128(%rsp), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] - movq 104(%rsp), %rdx - mulxq 136(%rsp), %rdx, %rcx + movq 136(%rsp), %rdx + mulxq 104(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[1] * B[3] - movq 120(%rsp), %rdx + movq 152(%rsp), %rdx adoxq %rcx, %r12 - mulxq 136(%rsp), %rcx, %rbx + mulxq 104(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] - movq 112(%rsp), %rdx - mulxq 144(%rsp), %rdx, %rcx + movq 144(%rsp), %rdx + mulxq 112(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] - movq 120(%rsp), %rdx + movq 152(%rsp), %rdx adoxq %rcx, %r14 - mulxq 152(%rsp), %rcx, %rbx + mulxq 120(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] - mulxq 128(%rsp), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] - movq 152(%rsp), %rdx + movq 120(%rsp), %rdx adcxq %rcx, %r13 - mulxq 96(%rsp), %rdx, %rcx + mulxq 128(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[3] * B[2] - movq 152(%rsp), %rdx - mulxq 112(%rsp), %rdx, %rcx + movq 120(%rsp), %rdx + mulxq 144(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[2] * B[3] - movq 120(%rsp), %rdx + movq 152(%rsp), %rdx adcxq %rcx, %r15 - mulxq 144(%rsp), %rcx, %rdx + mulxq 112(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 @@ -16655,9 +16753,7 @@ L_curve25519_avx2_last_3: movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) - movq 160(%rsp), %rbx - decq %rbx - movq %rbx, 160(%rsp) + decq 160(%rsp) jge L_curve25519_avx2_last_3 # Invert leaq 32(%rsp), %rdi @@ -16897,10 +16993,11 @@ L_curve25519_avx2_last_3: callq _fe_mul_avx2 #endif /* __APPLE__ */ movq 
168(%rsp), %rdi + movq (%rdi), %rax # Multiply # A[0] * B[0] movq (%rsp), %rdx - mulxq (%rdi), %r9, %r10 + mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 16(%rdi), %r11, %r12 # A[1] * B[0] @@ -16912,7 +17009,7 @@ L_curve25519_avx2_last_3: mulxq 24(%rdi), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] - mulxq (%rdi), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 16(%rdi), %rcx, %r15 @@ -16926,7 +17023,7 @@ L_curve25519_avx2_last_3: adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] - mulxq (%rdi), %rcx, %rbx + mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 @@ -16952,7 +17049,7 @@ L_curve25519_avx2_last_3: adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] - mulxq (%rdi), %rdx, %rcx + mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 @@ -17308,321 +17405,326 @@ _ge_p1p1_to_p2_avx2: pushq %r14 pushq %r15 pushq %rbx + pushq %rbp subq $16, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) leaq 96(%rsi), %rax + movq (%rsi), %r9 # Multiply # A[0] * B[0] movq (%rax), %rdx - mulxq (%rsi), %r9, %r10 + mulxq %r9, %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r11, %r12 + mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] mulxq 8(%rsi), %rcx, %r8 - xorq %rbx, %rbx - adcxq %rcx, %r10 + xorq %rbp, %rbp + adcxq %rcx, %r11 # A[3] * B[1] movq 8(%rax), %rdx - mulxq 24(%rsi), %r13, %r14 - adcxq %r8, %r11 + mulxq 24(%rsi), %r14, %r15 + adcxq %r8, %r12 # A[0] * B[1] - mulxq (%rsi), %rcx, %r8 - adoxq %rcx, %r10 + mulxq %r9, %rcx, %r8 + adoxq %rcx, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r15 - adoxq %r8, %r11 - adcxq %rcx, %r12 + mulxq 16(%rsi), %rcx, %rbx + adoxq %r8, %r12 + adcxq %rcx, %r13 # A[1] * B[2] movq 16(%rax), %rdx mulxq 8(%rsi), %rcx, %r8 - adcxq %r15, %r13 - adoxq %rcx, %r12 adcxq %rbx, %r14 - adoxq %r8, %r13 + adoxq %rcx, %r13 + adcxq %rbp, %r15 + adoxq %r8, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %r8 - adoxq %rbx, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 + mulxq %r9, %rcx, %r8 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %rcx, %r12 # A[1] * B[1] movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %r8, %r12 - adoxq %rdx, %r11 + adcxq %r8, %r13 + adoxq %rdx, %r12 # A[1] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r12 + adoxq %rcx, %r13 mulxq 8(%rsi), %rcx, %r8 - adcxq %rcx, %r13 + adcxq %rcx, %r14 # A[2] * B[2] movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %r8, %r14 - adoxq %rdx, %r13 + adcxq %r8, %r15 + adoxq %rdx, %r14 # A[3] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r14 + adoxq %rcx, %r15 mulxq 24(%rsi), %rcx, %r8 - adoxq %rbx, %r15 - adcxq %rcx, %r15 + adoxq %rbp, %rbx + adcxq %rcx, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %r8, %rbx + mulxq %r9, %rdx, %rcx + adcxq %r8, %rbp xorq %r8, %r8 - adcxq %rdx, %r12 + adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rsi), %rdx - adcxq %rcx, %r13 + adcxq %rcx, %r14 mulxq (%rax), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 + adoxq %rdx, %r13 + adoxq %rcx, %r14 # A[3] * B[2] movq 24(%rsi), %rdx mulxq 16(%rax), %rdx, %rcx - adcxq %rdx, %r14 + adcxq %rdx, %r15 # A[2] * B[3] movq 24(%rax), %rdx - adcxq %rcx, %r15 + adcxq %rcx, %rbx mulxq 16(%rsi), %rcx, %rdx - adcxq %r8, %rbx - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %r8, %rbx + adcxq %r8, %rbp + adoxq %rcx, %r15 + adoxq %rdx, %rbx + adoxq %r8, %rbp movq $38, %rdx - mulxq %rbx, %rbx, %rcx - addq %rbx, %r12 + mulxq %rbp, %rbp, %rcx + addq %rbp, %r13 adcq $0x00, %rcx movq $0x7fffffffffffffff, %r8 - shldq $0x01, %r12, %rcx + shldq $0x01, %r13, %rcx imulq $19, %rcx, %rcx - andq %r8, %r12 + andq %r8, %r13 xorq %r8, %r8 - adoxq %rcx, %r9 - mulxq %r13, 
%rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 + adoxq %rcx, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - adcxq %r8, %r12 + mulxq %rbx, %rcx, %rbx + adcxq %rcx, %r12 + adoxq %rbx, %r13 + adcxq %r8, %r13 # Store - movq %r9, (%rdi) - movq %r10, 8(%rdi) - movq %r11, 16(%rdi) - movq %r12, 24(%rdi) + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) leaq 64(%rsi), %rsi leaq 64(%rdi), %rdi + movq (%rsi), %r9 # Multiply # A[0] * B[0] movq (%rax), %rdx - mulxq (%rsi), %r9, %r10 + mulxq %r9, %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r11, %r12 + mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] mulxq 8(%rsi), %rcx, %r8 - xorq %rbx, %rbx - adcxq %rcx, %r10 + xorq %rbp, %rbp + adcxq %rcx, %r11 # A[3] * B[1] movq 8(%rax), %rdx - mulxq 24(%rsi), %r13, %r14 - adcxq %r8, %r11 + mulxq 24(%rsi), %r14, %r15 + adcxq %r8, %r12 # A[0] * B[1] - mulxq (%rsi), %rcx, %r8 - adoxq %rcx, %r10 + mulxq %r9, %rcx, %r8 + adoxq %rcx, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r15 - adoxq %r8, %r11 - adcxq %rcx, %r12 + mulxq 16(%rsi), %rcx, %rbx + adoxq %r8, %r12 + adcxq %rcx, %r13 # A[1] * B[2] movq 16(%rax), %rdx mulxq 8(%rsi), %rcx, %r8 - adcxq %r15, %r13 - adoxq %rcx, %r12 adcxq %rbx, %r14 - adoxq %r8, %r13 + adoxq %rcx, %r13 + adcxq %rbp, %r15 + adoxq %r8, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %r8 - adoxq %rbx, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 + mulxq %r9, %rcx, %r8 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %rcx, %r12 # A[1] * B[1] movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %r8, %r12 - adoxq %rdx, %r11 + adcxq %r8, %r13 + adoxq %rdx, %r12 # A[1] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r12 + adoxq %rcx, %r13 mulxq 8(%rsi), %rcx, %r8 - adcxq %rcx, %r13 + adcxq %rcx, %r14 # A[2] * B[2] movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %r8, %r14 - adoxq %rdx, %r13 + adcxq %r8, %r15 + adoxq %rdx, %r14 # A[3] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r14 + adoxq %rcx, %r15 mulxq 24(%rsi), %rcx, %r8 - adoxq %rbx, %r15 - adcxq %rcx, %r15 + adoxq %rbp, %rbx + adcxq %rcx, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %r8, %rbx + mulxq %r9, %rdx, %rcx + adcxq %r8, %rbp xorq %r8, %r8 - adcxq %rdx, %r12 + adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rsi), %rdx - adcxq %rcx, %r13 + adcxq %rcx, %r14 mulxq (%rax), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 + adoxq %rdx, %r13 + adoxq %rcx, %r14 # A[3] * B[2] movq 24(%rsi), %rdx mulxq 16(%rax), %rdx, %rcx - adcxq %rdx, %r14 + adcxq %rdx, %r15 # A[2] * B[3] movq 24(%rax), %rdx - adcxq %rcx, %r15 + adcxq %rcx, %rbx mulxq 16(%rsi), %rcx, %rdx - adcxq %r8, %rbx - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %r8, %rbx + adcxq %r8, %rbp + adoxq %rcx, %r15 + adoxq %rdx, %rbx + adoxq %r8, %rbp movq $38, %rdx - mulxq %rbx, %rbx, %rcx - addq %rbx, %r12 + mulxq %rbp, %rbp, %rcx + addq %rbp, %r13 adcq $0x00, %rcx movq $0x7fffffffffffffff, %r8 - shldq $0x01, %r12, %rcx + shldq $0x01, %r13, %rcx imulq $19, %rcx, %rcx - andq %r8, %r12 + andq %r8, %r13 xorq %r8, %r8 - adoxq %rcx, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 + adoxq %rcx, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - adcxq %r8, %r12 + mulxq %rbx, %rcx, %rbx + adcxq %rcx, %r12 + adoxq %rbx, %r13 + adcxq %r8, %r13 # Store - movq %r9, (%rdi) - movq %r10, 8(%rdi) - movq %r11, 16(%rdi) - movq %r12, 24(%rdi) + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) leaq -32(%rsi), 
%rax leaq -32(%rdi), %rdi + movq (%rsi), %r9 # Multiply # A[0] * B[0] movq (%rax), %rdx - mulxq (%rsi), %r9, %r10 + mulxq %r9, %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r11, %r12 + mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] mulxq 8(%rsi), %rcx, %r8 - xorq %rbx, %rbx - adcxq %rcx, %r10 + xorq %rbp, %rbp + adcxq %rcx, %r11 # A[3] * B[1] movq 8(%rax), %rdx - mulxq 24(%rsi), %r13, %r14 - adcxq %r8, %r11 + mulxq 24(%rsi), %r14, %r15 + adcxq %r8, %r12 # A[0] * B[1] - mulxq (%rsi), %rcx, %r8 - adoxq %rcx, %r10 + mulxq %r9, %rcx, %r8 + adoxq %rcx, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r15 - adoxq %r8, %r11 - adcxq %rcx, %r12 + mulxq 16(%rsi), %rcx, %rbx + adoxq %r8, %r12 + adcxq %rcx, %r13 # A[1] * B[2] movq 16(%rax), %rdx mulxq 8(%rsi), %rcx, %r8 - adcxq %r15, %r13 - adoxq %rcx, %r12 adcxq %rbx, %r14 - adoxq %r8, %r13 - # A[0] * B[2] - mulxq (%rsi), %rcx, %r8 - adoxq %rbx, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 + adoxq %rcx, %r13 + adcxq %rbp, %r15 + adoxq %r8, %r14 + # A[0] * B[2] + mulxq %r9, %rcx, %r8 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %rcx, %r12 # A[1] * B[1] movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %r8, %r12 - adoxq %rdx, %r11 + adcxq %r8, %r13 + adoxq %rdx, %r12 # A[1] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r12 + adoxq %rcx, %r13 mulxq 8(%rsi), %rcx, %r8 - adcxq %rcx, %r13 + adcxq %rcx, %r14 # A[2] * B[2] movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %r8, %r14 - adoxq %rdx, %r13 + adcxq %r8, %r15 + adoxq %rdx, %r14 # A[3] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r14 + adoxq %rcx, %r15 mulxq 24(%rsi), %rcx, %r8 - adoxq %rbx, %r15 - adcxq %rcx, %r15 + adoxq %rbp, %rbx + adcxq %rcx, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %r8, %rbx + mulxq %r9, %rdx, %rcx + adcxq %r8, %rbp xorq %r8, %r8 - adcxq %rdx, %r12 + adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rsi), %rdx - adcxq %rcx, %r13 + adcxq %rcx, %r14 mulxq (%rax), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 + adoxq %rdx, %r13 + adoxq %rcx, %r14 # A[3] * B[2] movq 24(%rsi), %rdx mulxq 16(%rax), %rdx, %rcx - adcxq %rdx, %r14 + adcxq %rdx, %r15 # A[2] * B[3] movq 24(%rax), %rdx - adcxq %rcx, %r15 + adcxq %rcx, %rbx mulxq 16(%rsi), %rcx, %rdx - adcxq %r8, %rbx - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %r8, %rbx + adcxq %r8, %rbp + adoxq %rcx, %r15 + adoxq %rdx, %rbx + adoxq %r8, %rbp movq $38, %rdx - mulxq %rbx, %rbx, %rcx - addq %rbx, %r12 + mulxq %rbp, %rbp, %rcx + addq %rbp, %r13 adcq $0x00, %rcx movq $0x7fffffffffffffff, %r8 - shldq $0x01, %r12, %rcx + shldq $0x01, %r13, %rcx imulq $19, %rcx, %rcx - andq %r8, %r12 + andq %r8, %r13 xorq %r8, %r8 - adoxq %rcx, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 + adoxq %rcx, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - adcxq %r8, %r12 + mulxq %rbx, %rcx, %rbx + adcxq %rcx, %r12 + adoxq %rbx, %r13 + adcxq %r8, %r13 # Store - movq %r9, (%rdi) - movq %r10, 8(%rdi) - movq %r11, 16(%rdi) - movq %r12, 24(%rdi) + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) addq $16, %rsp + popq %rbp popq %rbx popq %r15 popq %r14 @@ -17649,425 +17751,431 @@ _ge_p1p1_to_p3_avx2: pushq %r14 pushq %r15 pushq %rbx + pushq %rbp subq $16, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) leaq 96(%rsi), %rax + movq (%rsi), %r9 # Multiply # A[0] * B[0] movq (%rax), %rdx - mulxq (%rsi), %r9, %r10 + mulxq %r9, %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r11, %r12 + mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] mulxq 8(%rsi), %rcx, %r8 - xorq %rbx, %rbx - 
adcxq %rcx, %r10 + xorq %rbp, %rbp + adcxq %rcx, %r11 # A[3] * B[1] movq 8(%rax), %rdx - mulxq 24(%rsi), %r13, %r14 - adcxq %r8, %r11 + mulxq 24(%rsi), %r14, %r15 + adcxq %r8, %r12 # A[0] * B[1] - mulxq (%rsi), %rcx, %r8 - adoxq %rcx, %r10 + mulxq %r9, %rcx, %r8 + adoxq %rcx, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r15 - adoxq %r8, %r11 - adcxq %rcx, %r12 + mulxq 16(%rsi), %rcx, %rbx + adoxq %r8, %r12 + adcxq %rcx, %r13 # A[1] * B[2] movq 16(%rax), %rdx mulxq 8(%rsi), %rcx, %r8 - adcxq %r15, %r13 - adoxq %rcx, %r12 adcxq %rbx, %r14 - adoxq %r8, %r13 + adoxq %rcx, %r13 + adcxq %rbp, %r15 + adoxq %r8, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %r8 - adoxq %rbx, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 + mulxq %r9, %rcx, %r8 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %rcx, %r12 # A[1] * B[1] movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %r8, %r12 - adoxq %rdx, %r11 + adcxq %r8, %r13 + adoxq %rdx, %r12 # A[1] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r12 + adoxq %rcx, %r13 mulxq 8(%rsi), %rcx, %r8 - adcxq %rcx, %r13 + adcxq %rcx, %r14 # A[2] * B[2] movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %r8, %r14 - adoxq %rdx, %r13 + adcxq %r8, %r15 + adoxq %rdx, %r14 # A[3] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r14 + adoxq %rcx, %r15 mulxq 24(%rsi), %rcx, %r8 - adoxq %rbx, %r15 - adcxq %rcx, %r15 + adoxq %rbp, %rbx + adcxq %rcx, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %r8, %rbx + mulxq %r9, %rdx, %rcx + adcxq %r8, %rbp xorq %r8, %r8 - adcxq %rdx, %r12 + adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rsi), %rdx - adcxq %rcx, %r13 + adcxq %rcx, %r14 mulxq (%rax), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 + adoxq %rdx, %r13 + adoxq %rcx, %r14 # A[3] * B[2] movq 24(%rsi), %rdx mulxq 16(%rax), %rdx, %rcx - adcxq %rdx, %r14 + adcxq %rdx, %r15 # A[2] * B[3] movq 24(%rax), %rdx - adcxq %rcx, %r15 + adcxq %rcx, %rbx mulxq 16(%rsi), %rcx, %rdx - adcxq %r8, %rbx - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %r8, %rbx + adcxq %r8, %rbp + adoxq %rcx, %r15 + adoxq %rdx, %rbx + adoxq %r8, %rbp movq $38, %rdx - mulxq %rbx, %rbx, %rcx - addq %rbx, %r12 + mulxq %rbp, %rbp, %rcx + addq %rbp, %r13 adcq $0x00, %rcx movq $0x7fffffffffffffff, %r8 - shldq $0x01, %r12, %rcx + shldq $0x01, %r13, %rcx imulq $19, %rcx, %rcx - andq %r8, %r12 + andq %r8, %r13 xorq %r8, %r8 - adoxq %rcx, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 + adoxq %rcx, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - adcxq %r8, %r12 + mulxq %rbx, %rcx, %rbx + adcxq %rcx, %r12 + adoxq %rbx, %r13 + adcxq %r8, %r13 # Store - movq %r9, (%rdi) - movq %r10, 8(%rdi) - movq %r11, 16(%rdi) - movq %r12, 24(%rdi) + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) leaq 32(%rsi), %rax leaq 96(%rdi), %rdi + movq (%rsi), %r9 # Multiply # A[0] * B[0] movq (%rax), %rdx - mulxq (%rsi), %r9, %r10 + mulxq %r9, %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r11, %r12 + mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] mulxq 8(%rsi), %rcx, %r8 - xorq %rbx, %rbx - adcxq %rcx, %r10 + xorq %rbp, %rbp + adcxq %rcx, %r11 # A[3] * B[1] movq 8(%rax), %rdx - mulxq 24(%rsi), %r13, %r14 - adcxq %r8, %r11 + mulxq 24(%rsi), %r14, %r15 + adcxq %r8, %r12 # A[0] * B[1] - mulxq (%rsi), %rcx, %r8 - adoxq %rcx, %r10 + mulxq %r9, %rcx, %r8 + adoxq %rcx, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r15 - adoxq %r8, %r11 - adcxq %rcx, %r12 + mulxq 16(%rsi), %rcx, %rbx + adoxq %r8, %r12 + adcxq %rcx, %r13 # A[1] * B[2] movq 16(%rax), %rdx mulxq 
8(%rsi), %rcx, %r8 - adcxq %r15, %r13 - adoxq %rcx, %r12 adcxq %rbx, %r14 - adoxq %r8, %r13 + adoxq %rcx, %r13 + adcxq %rbp, %r15 + adoxq %r8, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %r8 - adoxq %rbx, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 + mulxq %r9, %rcx, %r8 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %rcx, %r12 # A[1] * B[1] movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %r8, %r12 - adoxq %rdx, %r11 + adcxq %r8, %r13 + adoxq %rdx, %r12 # A[1] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r12 + adoxq %rcx, %r13 mulxq 8(%rsi), %rcx, %r8 - adcxq %rcx, %r13 + adcxq %rcx, %r14 # A[2] * B[2] movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %r8, %r14 - adoxq %rdx, %r13 + adcxq %r8, %r15 + adoxq %rdx, %r14 # A[3] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r14 + adoxq %rcx, %r15 mulxq 24(%rsi), %rcx, %r8 - adoxq %rbx, %r15 - adcxq %rcx, %r15 + adoxq %rbp, %rbx + adcxq %rcx, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %r8, %rbx + mulxq %r9, %rdx, %rcx + adcxq %r8, %rbp xorq %r8, %r8 - adcxq %rdx, %r12 + adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rsi), %rdx - adcxq %rcx, %r13 + adcxq %rcx, %r14 mulxq (%rax), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 + adoxq %rdx, %r13 + adoxq %rcx, %r14 # A[3] * B[2] movq 24(%rsi), %rdx mulxq 16(%rax), %rdx, %rcx - adcxq %rdx, %r14 + adcxq %rdx, %r15 # A[2] * B[3] movq 24(%rax), %rdx - adcxq %rcx, %r15 + adcxq %rcx, %rbx mulxq 16(%rsi), %rcx, %rdx - adcxq %r8, %rbx - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %r8, %rbx + adcxq %r8, %rbp + adoxq %rcx, %r15 + adoxq %rdx, %rbx + adoxq %r8, %rbp movq $38, %rdx - mulxq %rbx, %rbx, %rcx - addq %rbx, %r12 + mulxq %rbp, %rbp, %rcx + addq %rbp, %r13 adcq $0x00, %rcx movq $0x7fffffffffffffff, %r8 - shldq $0x01, %r12, %rcx + shldq $0x01, %r13, %rcx imulq $19, %rcx, %rcx - andq %r8, %r12 + andq %r8, %r13 xorq %r8, %r8 - adoxq %rcx, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 + adoxq %rcx, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - adcxq %r8, %r12 + mulxq %rbx, %rcx, %rbx + adcxq %rcx, %r12 + adoxq %rbx, %r13 + adcxq %r8, %r13 # Store - movq %r9, (%rdi) - movq %r10, 8(%rdi) - movq %r11, 16(%rdi) - movq %r12, 24(%rdi) + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) leaq 64(%rsi), %rsi leaq -64(%rdi), %rdi + movq (%rsi), %r9 # Multiply # A[0] * B[0] movq (%rax), %rdx - mulxq (%rsi), %r9, %r10 + mulxq %r9, %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r11, %r12 + mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] mulxq 8(%rsi), %rcx, %r8 - xorq %rbx, %rbx - adcxq %rcx, %r10 + xorq %rbp, %rbp + adcxq %rcx, %r11 # A[3] * B[1] movq 8(%rax), %rdx - mulxq 24(%rsi), %r13, %r14 - adcxq %r8, %r11 + mulxq 24(%rsi), %r14, %r15 + adcxq %r8, %r12 # A[0] * B[1] - mulxq (%rsi), %rcx, %r8 - adoxq %rcx, %r10 + mulxq %r9, %rcx, %r8 + adoxq %rcx, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r15 - adoxq %r8, %r11 - adcxq %rcx, %r12 + mulxq 16(%rsi), %rcx, %rbx + adoxq %r8, %r12 + adcxq %rcx, %r13 # A[1] * B[2] movq 16(%rax), %rdx mulxq 8(%rsi), %rcx, %r8 - adcxq %r15, %r13 - adoxq %rcx, %r12 adcxq %rbx, %r14 - adoxq %r8, %r13 + adoxq %rcx, %r13 + adcxq %rbp, %r15 + adoxq %r8, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %r8 - adoxq %rbx, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 + mulxq %r9, %rcx, %r8 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %rcx, %r12 # A[1] * B[1] movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %r8, %r12 - adoxq %rdx, %r11 + adcxq %r8, %r13 + adoxq %rdx, %r12 # A[1] * B[3] 
movq 24(%rax), %rdx - adoxq %rcx, %r12 + adoxq %rcx, %r13 mulxq 8(%rsi), %rcx, %r8 - adcxq %rcx, %r13 + adcxq %rcx, %r14 # A[2] * B[2] movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %r8, %r14 - adoxq %rdx, %r13 + adcxq %r8, %r15 + adoxq %rdx, %r14 # A[3] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r14 + adoxq %rcx, %r15 mulxq 24(%rsi), %rcx, %r8 - adoxq %rbx, %r15 - adcxq %rcx, %r15 + adoxq %rbp, %rbx + adcxq %rcx, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - adcxq %r8, %rbx + mulxq %r9, %rdx, %rcx + adcxq %r8, %rbp xorq %r8, %r8 - adcxq %rdx, %r12 + adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rsi), %rdx - adcxq %rcx, %r13 + adcxq %rcx, %r14 mulxq (%rax), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 + adoxq %rdx, %r13 + adoxq %rcx, %r14 # A[3] * B[2] movq 24(%rsi), %rdx mulxq 16(%rax), %rdx, %rcx - adcxq %rdx, %r14 + adcxq %rdx, %r15 # A[2] * B[3] movq 24(%rax), %rdx - adcxq %rcx, %r15 + adcxq %rcx, %rbx mulxq 16(%rsi), %rcx, %rdx - adcxq %r8, %rbx - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %r8, %rbx + adcxq %r8, %rbp + adoxq %rcx, %r15 + adoxq %rdx, %rbx + adoxq %r8, %rbp movq $38, %rdx - mulxq %rbx, %rbx, %rcx - addq %rbx, %r12 + mulxq %rbp, %rbp, %rcx + addq %rbp, %r13 adcq $0x00, %rcx movq $0x7fffffffffffffff, %r8 - shldq $0x01, %r12, %rcx + shldq $0x01, %r13, %rcx imulq $19, %rcx, %rcx - andq %r8, %r12 + andq %r8, %r13 xorq %r8, %r8 - adoxq %rcx, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 + adoxq %rcx, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - adcxq %r8, %r12 + mulxq %rbx, %rcx, %rbx + adcxq %rcx, %r12 + adoxq %rbx, %r13 + adcxq %r8, %r13 # Store - movq %r9, (%rdi) - movq %r10, 8(%rdi) - movq %r11, 16(%rdi) - movq %r12, 24(%rdi) + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) leaq 32(%rsi), %rax leaq 32(%rdi), %rdi + movq (%rsi), %r9 # Multiply # A[0] * B[0] movq (%rax), %rdx - mulxq (%rsi), %r9, %r10 + mulxq %r9, %r10, %r11 # A[2] * B[0] - mulxq 16(%rsi), %r11, %r12 + mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] mulxq 8(%rsi), %rcx, %r8 - xorq %rbx, %rbx - adcxq %rcx, %r10 + xorq %rbp, %rbp + adcxq %rcx, %r11 # A[3] * B[1] movq 8(%rax), %rdx - mulxq 24(%rsi), %r13, %r14 - adcxq %r8, %r11 + mulxq 24(%rsi), %r14, %r15 + adcxq %r8, %r12 # A[0] * B[1] - mulxq (%rsi), %rcx, %r8 - adoxq %rcx, %r10 + mulxq %r9, %rcx, %r8 + adoxq %rcx, %r11 # A[2] * B[1] - mulxq 16(%rsi), %rcx, %r15 - adoxq %r8, %r11 - adcxq %rcx, %r12 + mulxq 16(%rsi), %rcx, %rbx + adoxq %r8, %r12 + adcxq %rcx, %r13 # A[1] * B[2] movq 16(%rax), %rdx mulxq 8(%rsi), %rcx, %r8 - adcxq %r15, %r13 - adoxq %rcx, %r12 adcxq %rbx, %r14 - adoxq %r8, %r13 + adoxq %rcx, %r13 + adcxq %rbp, %r15 + adoxq %r8, %r14 # A[0] * B[2] - mulxq (%rsi), %rcx, %r8 - adoxq %rbx, %r14 - xorq %r15, %r15 - adcxq %rcx, %r11 + mulxq %r9, %rcx, %r8 + adoxq %rbp, %r15 + xorq %rbx, %rbx + adcxq %rcx, %r12 # A[1] * B[1] movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %rcx - adcxq %r8, %r12 - adoxq %rdx, %r11 + adcxq %r8, %r13 + adoxq %rdx, %r12 # A[1] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r12 + adoxq %rcx, %r13 mulxq 8(%rsi), %rcx, %r8 - adcxq %rcx, %r13 + adcxq %rcx, %r14 # A[2] * B[2] movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %rcx - adcxq %r8, %r14 - adoxq %rdx, %r13 + adcxq %r8, %r15 + adoxq %rdx, %r14 # A[3] * B[3] movq 24(%rax), %rdx - adoxq %rcx, %r14 + adoxq %rcx, %r15 mulxq 24(%rsi), %rcx, %r8 - adoxq %rbx, %r15 - adcxq %rcx, %r15 + adoxq %rbp, %rbx + adcxq %rcx, %rbx # A[0] * B[3] - mulxq (%rsi), %rdx, %rcx - 
adcxq %r8, %rbx + mulxq %r9, %rdx, %rcx + adcxq %r8, %rbp xorq %r8, %r8 - adcxq %rdx, %r12 + adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rsi), %rdx - adcxq %rcx, %r13 + adcxq %rcx, %r14 mulxq (%rax), %rdx, %rcx - adoxq %rdx, %r12 - adoxq %rcx, %r13 + adoxq %rdx, %r13 + adoxq %rcx, %r14 # A[3] * B[2] movq 24(%rsi), %rdx mulxq 16(%rax), %rdx, %rcx - adcxq %rdx, %r14 + adcxq %rdx, %r15 # A[2] * B[3] movq 24(%rax), %rdx - adcxq %rcx, %r15 + adcxq %rcx, %rbx mulxq 16(%rsi), %rcx, %rdx - adcxq %r8, %rbx - adoxq %rcx, %r14 - adoxq %rdx, %r15 - adoxq %r8, %rbx + adcxq %r8, %rbp + adoxq %rcx, %r15 + adoxq %rdx, %rbx + adoxq %r8, %rbp movq $38, %rdx - mulxq %rbx, %rbx, %rcx - addq %rbx, %r12 + mulxq %rbp, %rbp, %rcx + addq %rbp, %r13 adcq $0x00, %rcx movq $0x7fffffffffffffff, %r8 - shldq $0x01, %r12, %rcx + shldq $0x01, %r13, %rcx imulq $19, %rcx, %rcx - andq %r8, %r12 + andq %r8, %r13 xorq %r8, %r8 - adoxq %rcx, %r9 - mulxq %r13, %rcx, %r13 - adcxq %rcx, %r9 - adoxq %r13, %r10 + adoxq %rcx, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 - adcxq %r8, %r12 + mulxq %rbx, %rcx, %rbx + adcxq %rcx, %r12 + adoxq %rbx, %r13 + adcxq %r8, %r13 # Store - movq %r9, (%rdi) - movq %r10, 8(%rdi) - movq %r11, 16(%rdi) - movq %r12, 24(%rdi) + movq %r10, (%rdi) + movq %r11, 8(%rdi) + movq %r12, 16(%rdi) + movq %r13, 24(%rdi) addq $16, %rsp + popq %rbp popq %rbx popq %r15 popq %r14 @@ -18268,14 +18376,13 @@ _ge_p2_dbl_avx2: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r13, %rcx - movq $0x7fffffffffffffff, %r8 - imulq $19, %rcx - andq %r8, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %rcx, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -18284,12 +18391,12 @@ _ge_p2_dbl_avx2: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %rcx, %rcx - shldq $0x01, %rbp, %rcx - imulq $-19, %rcx - andq %r8, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %rcx, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -18313,14 +18420,13 @@ _ge_p2_dbl_avx2: movq 24(%rsi), %r13 adcq 16(%rax), %r12 adcq 24(%rax), %r13 - movq $0x00, %rcx - adcq $0x00, %rcx - shldq $0x01, %r13, %rcx - movq $0x7fffffffffffffff, %r8 - imulq $19, %rcx - andq %r8, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %rcx, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -18410,13 +18516,12 @@ _ge_p2_dbl_avx2: sbbq 8(%rsi), %r11 sbbq 16(%rsi), %r12 sbbq 24(%rsi), %r13 - sbbq %rcx, %rcx - shldq $0x01, %r13, %rcx - movq $0x7fffffffffffffff, %r8 - imulq $-19, %rcx - andq %r8, %r13 + sbbq %rdx, %rdx + shldq $0x01, %r13, %rdx + imulq $-19, %rdx + btr $63, %r13 # Add modulus (if underflow) - subq %rcx, %r10 + subq %rdx, %r10 sbbq $0x00, %r11 sbbq $0x00, %r12 sbbq $0x00, %r13 @@ -18521,13 +18626,12 @@ _ge_p2_dbl_avx2: sbbq 8(%rsi), %r11 sbbq 16(%rsi), %r12 sbbq 24(%rsi), %r13 - sbbq %rcx, %rcx - shldq $0x01, %r13, %rcx - movq $0x7fffffffffffffff, %r8 - imulq $-19, %rcx - andq %r8, %r13 + sbbq %rdx, %rdx + shldq $0x01, %r13, %rdx + imulq $-19, %rdx + btr $63, %r13 # Add modulus (if underflow) - subq %rcx, %r10 + subq %rdx, %r10 sbbq $0x00, %r11 sbbq $0x00, %r12 sbbq $0x00, %r13 @@ -18692,14 +18796,13 
@@ _ge_madd_avx2: adcq 16(%rcx), %r12 movq %r13, %rbp adcq 24(%rcx), %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -18708,12 +18811,12 @@ _ge_madd_avx2: sbbq 8(%rcx), %r15 sbbq 16(%rcx), %rbx sbbq 24(%rcx), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -18945,14 +19048,13 @@ _ge_madd_avx2: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -18961,12 +19063,12 @@ _ge_madd_avx2: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -18988,14 +19090,13 @@ _ge_madd_avx2: movq 24(%rcx), %r13 adcq %r12, %r12 adcq %r13, %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -19011,14 +19112,13 @@ _ge_madd_avx2: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -19027,12 +19127,12 @@ _ge_madd_avx2: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -19201,14 +19301,13 @@ _ge_msub_avx2: adcq 16(%rcx), %r12 movq %r13, %rbp adcq 24(%rcx), %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -19217,12 +19316,12 @@ _ge_msub_avx2: sbbq 8(%rcx), %r15 sbbq 16(%rcx), %rbx sbbq 24(%rcx), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq 
%r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -19455,14 +19554,13 @@ _ge_msub_avx2: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -19471,12 +19569,12 @@ _ge_msub_avx2: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -19498,14 +19596,13 @@ _ge_msub_avx2: movq 24(%rcx), %r13 adcq %r12, %r12 adcq %r13, %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -19521,14 +19618,13 @@ _ge_msub_avx2: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -19537,12 +19633,12 @@ _ge_msub_avx2: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -19711,14 +19807,13 @@ _ge_add_avx2: adcq 16(%rcx), %r12 movq %r13, %rbp adcq 24(%rcx), %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -19727,12 +19822,12 @@ _ge_add_avx2: sbbq 8(%rcx), %r15 sbbq 16(%rcx), %rbx sbbq 24(%rcx), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -20056,14 +20151,13 @@ _ge_add_avx2: adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -20086,14 +20180,13 @@ _ge_add_avx2: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq 
$19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -20102,12 +20195,12 @@ _ge_add_avx2: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -20135,14 +20228,13 @@ _ge_add_avx2: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -20151,12 +20243,12 @@ _ge_add_avx2: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -20325,14 +20417,13 @@ _ge_sub_avx2: adcq 16(%rcx), %r12 movq %r13, %rbp adcq 24(%rcx), %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -20341,12 +20432,12 @@ _ge_sub_avx2: sbbq 8(%rcx), %r15 sbbq 16(%rcx), %rbx sbbq 24(%rcx), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -20671,14 +20762,13 @@ _ge_sub_avx2: adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -20701,14 +20791,13 @@ _ge_sub_avx2: adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 - movq $0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -20717,12 +20806,12 @@ _ge_sub_avx2: sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -20750,14 +20839,13 @@ _ge_sub_avx2: adcq 16(%rdi), %r12 movq %r13, %rbp adcq 24(%rdi), %r13 - movq 
$0x00, %r8 - adcq $0x00, %r8 - shldq $0x01, %r13, %r8 - movq $0x7fffffffffffffff, %r9 - imulq $19, %r8 - andq %r9, %r13 + movq $0x00, %rdx + adcq $0x00, %rdx + shldq $0x01, %r13, %rdx + imulq $19, %rdx + btr $63, %r13 # Sub modulus (if overflow) - addq %r8, %r10 + addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 @@ -20766,12 +20854,12 @@ _ge_sub_avx2: sbbq 8(%rdi), %r15 sbbq 16(%rdi), %rbx sbbq 24(%rdi), %rbp - sbbq %r8, %r8 - shldq $0x01, %rbp, %r8 - imulq $-19, %r8 - andq %r9, %rbp + sbbq %rdx, %rdx + shldq $0x01, %rbp, %rdx + imulq $-19, %rdx + btr $63, %rbp # Add modulus (if underflow) - subq %r8, %r14 + subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp @@ -21322,6 +21410,384 @@ _sc_muladd_avx2: #ifndef __APPLE__ .size sc_muladd_avx2,.-sc_muladd_avx2 #endif /* __APPLE__ */ +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +L_sp_mod_inv_avx2__prime: +.long 0x03ffffed,0x03ffffff,0x03ffffff,0x03ffffff +.long 0x03ffffff,0x00000000,0x00000000,0x00000000 +.long 0x03ffffff,0x03ffffff,0x03ffffff,0x03ffffff +.long 0x001fffff,0x00000000,0x00000000,0x00000000 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.align 32 +#else +.p2align 5 +#endif /* __APPLE__ */ +L_sp_mod_inv_avx2__one: +.quad 0x1, 0x0 +.quad 0x0, 0x0 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +L_sp_mod_inv_avx2__all_one: +.long 0x00000001,0x00000001,0x00000001,0x00000001 +.long 0x00000001,0x00000001,0x00000001,0x00000001 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +L_sp_mod_inv_avx2__mask01111: +.long 0x00000000,0x00000001,0x00000001,0x00000001 +.long 0x00000001,0x00000000,0x00000000,0x00000000 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +L_sp_mod_inv_avx2__down_one_dword: +.long 0x00000001,0x00000002,0x00000003,0x00000004 +.long 0x00000005,0x00000006,0x00000007,0x00000007 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +L_sp_mod_inv_avx2__neg: +.long 0x00000000,0x00000000,0x00000000,0x00000000 +.long 0x80000000,0x00000000,0x00000000,0x00000000 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +L_sp_mod_inv_avx2__up_one_dword: +.long 0x00000007,0x00000000,0x00000001,0x00000002 +.long 0x00000003,0x00000007,0x00000007,0x00000007 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +L_sp_mod_inv_avx2__mask26: +.long 0x03ffffff,0x03ffffff,0x03ffffff,0x03ffffff +.long 0x03ffffff,0x00000000,0x00000000,0x00000000 +/* Non-constant time modular inversion. + * + * @param [out] r Resulting number. + * @param [in] a Number to invert. + * @param [in] m Modulus. + * @return MP_OKAY on success. 
+ */ +#ifndef __APPLE__ +.text +.globl fe_invert_nct_avx2 +.type fe_invert_nct_avx2,@function +.align 16 +fe_invert_nct_avx2: +#else +.section __TEXT,__text +.globl _fe_invert_nct_avx2 +.p2align 4 +_fe_invert_nct_avx2: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + movq $-19, %rax + movq $-1, %rcx + movq $-1, %r8 + movq $0x7fffffffffffffff, %r9 + movq (%rsi), %r10 + movq 8(%rsi), %r11 + movq 16(%rsi), %r12 + movq 24(%rsi), %r13 + leaq L_sp_mod_inv_avx2__prime(%rip), %rbx + vmovupd (%rbx), %ymm6 + vmovupd 32(%rbx), %ymm7 + leaq L_sp_mod_inv_avx2__one(%rip), %rbx + vmovupd (%rbx), %ymm8 + leaq L_sp_mod_inv_avx2__mask01111(%rip), %rbx + vmovupd (%rbx), %ymm9 + leaq L_sp_mod_inv_avx2__all_one(%rip), %rbx + vmovupd (%rbx), %ymm10 + leaq L_sp_mod_inv_avx2__down_one_dword(%rip), %rbx + vmovupd (%rbx), %ymm11 + leaq L_sp_mod_inv_avx2__neg(%rip), %rbx + vmovupd (%rbx), %ymm12 + leaq L_sp_mod_inv_avx2__up_one_dword(%rip), %rbx + vmovupd (%rbx), %ymm13 + leaq L_sp_mod_inv_avx2__mask26(%rip), %rbx + vmovupd (%rbx), %ymm14 + vpxor %xmm0, %xmm0, %xmm0 + vpxor %xmm1, %xmm1, %xmm1 + vmovdqu %ymm8, %ymm2 + vpxor %xmm3, %xmm3, %xmm3 + testb $0x01, %r10b + jnz L__mod_inv_avx2__v_even_end +L__mod_inv_avx2__v_even_start: + shrdq $0x01, %r11, %r10 + shrdq $0x01, %r12, %r11 + shrdq $0x01, %r13, %r12 + shrq $0x01, %r13 + vptest %ymm8, %ymm2 + jz L__mod_inv_avx2__v_even_shr1 + vpaddd %ymm6, %ymm2, %ymm2 + vpaddd %ymm7, %ymm3, %ymm3 +L__mod_inv_avx2__v_even_shr1: + vpand %ymm9, %ymm2, %ymm4 + vpand %ymm10, %ymm3, %ymm5 + vpermd %ymm4, %ymm11, %ymm4 + vpsrad $0x01, %ymm2, %ymm2 + vpsrad $0x01, %ymm3, %ymm3 + vpslld $25, %ymm5, %ymm5 + vpslld $25, %xmm4, %xmm4 + vpaddd %ymm5, %ymm2, %ymm2 + vpaddd %ymm4, %ymm3, %ymm3 + testb $0x01, %r10b + jz L__mod_inv_avx2__v_even_start +L__mod_inv_avx2__v_even_end: +L__mod_inv_avx2__uv_start: + cmpq %r13, %r9 + jb L__mod_inv_avx2__uv_v + ja L__mod_inv_avx2__uv_u + cmpq %r12, %r8 + jb L__mod_inv_avx2__uv_v + ja L__mod_inv_avx2__uv_u + cmpq %r11, %rcx + jb L__mod_inv_avx2__uv_v + ja L__mod_inv_avx2__uv_u + cmpq %r10, %rax + jb L__mod_inv_avx2__uv_v +L__mod_inv_avx2__uv_u: + subq %r10, %rax + sbbq %r11, %rcx + vpsubd %ymm2, %ymm0, %ymm0 + sbbq %r12, %r8 + vpsubd %ymm3, %ymm1, %ymm1 + sbbq %r13, %r9 + vptest %ymm12, %ymm1 + jz L__mod_inv_avx2__usubv_done_neg + vpaddd %ymm6, %ymm0, %ymm0 + vpaddd %ymm7, %ymm1, %ymm1 +L__mod_inv_avx2__usubv_done_neg: +L__mod_inv_avx2__usubv_shr1: + shrdq $0x01, %rcx, %rax + shrdq $0x01, %r8, %rcx + shrdq $0x01, %r9, %r8 + shrq $0x01, %r9 + vptest %ymm8, %ymm0 + jz L__mod_inv_avx2__usubv_sub_shr1 + vpaddd %ymm6, %ymm0, %ymm0 + vpaddd %ymm7, %ymm1, %ymm1 +L__mod_inv_avx2__usubv_sub_shr1: + vpand %ymm9, %ymm0, %ymm4 + vpand %ymm10, %ymm1, %ymm5 + vpermd %ymm4, %ymm11, %ymm4 + vpsrad $0x01, %ymm0, %ymm0 + vpsrad $0x01, %ymm1, %ymm1 + vpslld $25, %ymm5, %ymm5 + vpslld $25, %xmm4, %xmm4 + vpaddd %ymm5, %ymm0, %ymm0 + vpaddd %ymm4, %ymm1, %ymm1 + testb $0x01, %al + jz L__mod_inv_avx2__usubv_shr1 + cmpq $0x01, %rax + jne L__mod_inv_avx2__uv_start + movq %rcx, %rdx + orq %r8, %rdx + jne L__mod_inv_avx2__uv_start + orq %r9, %rdx + jne L__mod_inv_avx2__uv_start + vpextrd $0x00, %xmm0, %eax + vpextrd $0x01, %xmm0, %r8d + vpextrd $2, %xmm0, %r10d + vpextrd $3, %xmm0, %r12d + vpextrd $0x00, %xmm1, %ecx + vpextrd $0x01, %xmm1, %r9d + vpextrd $2, %xmm1, %r11d + vpextrd $3, %xmm1, %r13d + vextracti128 $0x01, %ymm0, %xmm0 + vextracti128 $0x01, %ymm1, %xmm1 + vpextrd $0x00, %xmm0, %r14d + vpextrd $0x00, %xmm1, %r15d + jmp 
L__mod_inv_avx2__store_done +L__mod_inv_avx2__uv_v: + subq %rax, %r10 + sbbq %rcx, %r11 + vpsubd %ymm0, %ymm2, %ymm2 + sbbq %r8, %r12 + vpsubd %ymm1, %ymm3, %ymm3 + sbbq %r9, %r13 + vptest %ymm12, %ymm3 + jz L__mod_inv_avx2__vsubu_done_neg + vpaddd %ymm6, %ymm2, %ymm2 + vpaddd %ymm7, %ymm3, %ymm3 +L__mod_inv_avx2__vsubu_done_neg: +L__mod_inv_avx2__vsubu_shr1: + shrdq $0x01, %r11, %r10 + shrdq $0x01, %r12, %r11 + shrdq $0x01, %r13, %r12 + shrq $0x01, %r13 + vptest %ymm8, %ymm2 + jz L__mod_inv_avx2__vsubu_sub_shr1 + vpaddd %ymm6, %ymm2, %ymm2 + vpaddd %ymm7, %ymm3, %ymm3 +L__mod_inv_avx2__vsubu_sub_shr1: + vpand %ymm9, %ymm2, %ymm4 + vpand %ymm10, %ymm3, %ymm5 + vpermd %ymm4, %ymm11, %ymm4 + vpsrad $0x01, %ymm2, %ymm2 + vpsrad $0x01, %ymm3, %ymm3 + vpslld $25, %ymm5, %ymm5 + vpslld $25, %xmm4, %xmm4 + vpaddd %ymm5, %ymm2, %ymm2 + vpaddd %ymm4, %ymm3, %ymm3 + testb $0x01, %r10b + jz L__mod_inv_avx2__vsubu_shr1 + cmpq $0x01, %r10 + jne L__mod_inv_avx2__uv_start + movq %r11, %rdx + orq %r12, %rdx + jne L__mod_inv_avx2__uv_start + orq %r13, %rdx + jne L__mod_inv_avx2__uv_start + vpextrd $0x00, %xmm2, %eax + vpextrd $0x01, %xmm2, %r8d + vpextrd $2, %xmm2, %r10d + vpextrd $3, %xmm2, %r12d + vpextrd $0x00, %xmm3, %ecx + vpextrd $0x01, %xmm3, %r9d + vpextrd $2, %xmm3, %r11d + vpextrd $3, %xmm3, %r13d + vextracti128 $0x01, %ymm2, %xmm2 + vextracti128 $0x01, %ymm3, %xmm3 + vpextrd $0x00, %xmm2, %r14d + vpextrd $0x00, %xmm3, %r15d +L__mod_inv_avx2__store_done: + movl %eax, %edx + andl $0x3ffffff, %eax + sarl $26, %edx + addl %edx, %ecx + movl %ecx, %edx + andl $0x3ffffff, %ecx + sarl $26, %edx + addl %edx, %r8d + movl %r8d, %edx + andl $0x3ffffff, %r8d + sarl $26, %edx + addl %edx, %r9d + movl %r9d, %edx + andl $0x3ffffff, %r9d + sarl $26, %edx + addl %edx, %r10d + movl %r10d, %edx + andl $0x3ffffff, %r10d + sarl $26, %edx + addl %edx, %r11d + movl %r11d, %edx + andl $0x3ffffff, %r11d + sarl $26, %edx + addl %edx, %r12d + movl %r12d, %edx + andl $0x3ffffff, %r12d + sarl $26, %edx + addl %edx, %r13d + movl %r13d, %edx + andl $0x3ffffff, %r13d + sarl $26, %edx + addl %edx, %r14d + movl %r14d, %edx + andl $0x3ffffff, %r14d + sarl $26, %edx + addl %edx, %r15d + movslq %ecx, %rcx + movslq %r9d, %r9 + movslq %r11d, %r11 + movslq %r13d, %r13 + movslq %r15d, %r15 + shlq $26, %rcx + shlq $26, %r9 + shlq $26, %r11 + shlq $26, %r13 + shlq $26, %r15 + movslq %eax, %rax + addq %rcx, %rax + movslq %r8d, %r8 + adcq %r9, %r8 + movslq %r10d, %r10 + adcq %r11, %r10 + movslq %r12d, %r12 + adcq %r13, %r12 + movslq %r14d, %r14 + adcq %r15, %r14 + jge L__mod_inv_avx2__3_no_add_prime + movq $0xfffffffffffed, %rcx + movq $0xfffffffffffff, %r9 + movq $0xfffffffffffff, %r11 + movq $0xfffffffffffff, %r13 + movq $0x7fffffffffff, %r15 + addq %rcx, %rax + addq %r9, %r8 + addq %r11, %r10 + addq %r13, %r12 + addq %r15, %r14 + movq $0xfffffffffffff, %rdx + movq %rax, %rcx + andq %rdx, %rax + sarq $52, %rcx + addq %rcx, %r8 + movq %r8, %r9 + andq %rdx, %r8 + sarq $52, %r9 + addq %r9, %r10 + movq %r10, %r11 + andq %rdx, %r10 + sarq $52, %r11 + addq %r11, %r12 + movq %r12, %r13 + andq %rdx, %r12 + sarq $52, %r13 + addq %r13, %r14 +L__mod_inv_avx2__3_no_add_prime: + movq %r8, %rcx + movq %r10, %r9 + movq %r12, %r11 + shlq $52, %rcx + sarq $12, %r8 + shlq $40, %r9 + sarq $24, %r10 + shlq $28, %r11 + sarq $36, %r12 + shlq $16, %r14 + addq %rcx, %rax + adcq %r9, %r8 + adcq %r11, %r10 + adcq %r14, %r12 + movq %rax, (%rdi) + movq %r8, 8(%rdi) + movq %r10, 16(%rdi) + movq %r12, 24(%rdi) + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq 
%r12 + repz retq +#ifndef __APPLE__ +.size fe_invert_nct_avx2,.-fe_invert_nct_avx2 +#endif /* __APPLE__ */ #endif /* HAVE_ED25519 */ #endif /* HAVE_INTEL_AVX2 */ diff --git a/wolfcrypt/src/ge_operations.c b/wolfcrypt/src/ge_operations.c index c3264e94f3..c6411fffe2 100644 --- a/wolfcrypt/src/ge_operations.c +++ b/wolfcrypt/src/ge_operations.c @@ -9175,24 +9175,25 @@ void ge_scalarmult_base(ge_p3 *h,const unsigned char *a) #define SLIDE_SIZE 256 /* ge double scalar mult */ -static void slide(signed char *r,const unsigned char *a) +static void slide(signed char *r,const unsigned char *a, int max) { int i; int b; int k; - for (i = 0;i < SLIDE_SIZE;++i) + for (i = 0;i < SLIDE_SIZE;++i) { r[i] = 1 & (a[i >> 3] >> (i & 7)); + } - for (i = 0;i < SLIDE_SIZE;++i) + for (i = 0;i < SLIDE_SIZE;++i) { if (r[i]) { for (b = 1;b <= 6 && i + b < SLIDE_SIZE;++b) { if (r[i + b]) { signed char rb = (signed char)((unsigned char)r[i + b] << b); - if (r[i] + rb <= 15) { + if (r[i] + rb <= max) { r[i] = (signed char)(r[i] + rb); r[i + b] = 0; - } else if (r[i] - rb >= -15) { + } else if (r[i] - rb >= -max) { r[i] = (signed char)(r[i] - rb); for (k = i + b;k < SLIDE_SIZE;++k) { if (!r[k]) { @@ -9206,180 +9207,470 @@ static void slide(signed char *r,const unsigned char *a) } } } + } } +/* Generated using command: ruby ../scripts/x25519/ed25519.rb */ #ifdef CURVED25519_ASM_64BIT -static const ge_precomp Bi[8] = { +static const ge_precomp Bi[32] = { + { + { 0x2fbc93c6f58c3b85, -0x306cd2390473f1e7, 0x270b4898643d42c2, + 0x07cf9d3a33d4ba65 }, + { -0x62efc6fa28bf6ec2, -0x02c660fa2ebf414d, -0x5a3e7bcb977075f7, + 0x44fd2f9298f81267 }, + { -0x5436edfa78855598, 0x26d9e823ccaac49e, 0x5a1b7dcbdd43598c, + 0x6f117b689f0c65a8 }, + }, + { + { -0x50da4f57b31168d0, 0x025a8430e8864b8a, -0x3ee4affd60fe98ce, + 0x7a164e1b9a80f8f4 }, + { 0x56611fe8a4fcd265, 0x3bd353fde5c1ba7d, -0x7ece0ce5deb42943, + 0x2ab91587555bda62 }, + { 0x14ae933f0dd0d889, 0x589423221c35da62, -0x2e8f1aba730d24b4, + 0x5a2826af12b9b4c6 }, + }, + { + { -0x5ded43bbf75a44cd, -0x72afb73c38a112fe, -0x22e414f3a54013bc, + 0x2945ccf146e206eb }, + { 0x7f9182c3a447d6ba, -0x2affeb2eb4d8d649, -0x1cc30ee3479b5f79, + 0x154a7e73eb1b55f3 }, + { -0x4344240e7ed57d7b, 0x270e0807d0bdd1fc, -0x4be498f4e44258d3, + 0x43aabe696b3bb69a }, + }, + { + { 0x6b1a5cd0944ea3bf, 0x7470353ab39dc0d2, 0x71b2528228542e49, + 0x461bea69283c927e }, + { -0x4590d36555cdde4f, 0x6ca021533bba23a7, -0x621589b06de6d3c6, + 0x1d6edd5d2e5317e0 }, + { -0x0e7c9237fe474c5e, -0x4cfca0b8fac15b66, 0x529c41ba5877adf3, + 0x7a9fbb1c6a0f90a7 }, + }, + { + { -0x64d1987559579cd1, -0x59af6190ae43b93b, -0x314dcc3639790a4b, + 0x34b9ed338add7f59 }, + { -0x0c91de81fc627f9c, -0x675f7e490adfbe65, -0x693439f718a14fbc, + 0x49c05a51fadc9c8f }, + { 0x06b4e8bf9045af1b, -0x1d007c1758e62dd1, -0x550903d66c2b30ea, + 0x73c172021b008b06 }, + }, + { + { 0x2fbf00848a802ade, -0x1a260130fdcfd1d9, 0x113e847117703406, + 0x4275aae2546d8faf }, + { 0x315f5b0249864348, 0x3ed6b36977088381, -0x5c5f8aaa9572146b, + 0x18ab598029d5c77f }, + { -0x27d4d33a029f7617, 0x031eb4a13282e4a4, 0x44311199b51a8622, + 0x3dc65522b53df948 }, + }, + { + { -0x408f3ddd5dff8093, -0x407b4c654a432125, 0x537a0e12fb07ba07, + 0x234fd7eec346f241 }, + { 0x506f013b327fbf93, -0x5103143664889095, -0x62ed4dcd5552a698, + 0x0267882d176024a7 }, + { 0x5360a119732ea378, 0x2437e6b1df8dd471, -0x5d10c8076e581acd, + 0x497ba6fdaa097863 }, + }, + { + { 0x24cecc0313cfeaa0, -0x79b73d72e763db93, 0x2dbdbdfac1f2d4d0, + 0x61e22917f12de72b }, + { 0x040bcd86468ccf0b, -0x2c7d645bd566ef2a, 0x7508300807b25192, + 
0x43b5cd4218d05ebf }, + { 0x5d9a762f9bd0b516, -0x14c750b1c8c02112, 0x032e5a7d93d64270, + 0x511d61210ae4d842 }, + }, + { + { -0x6d3989106af1627f, -0x5ab9df323f28fbb1, -0x5564c99b9070edb8, + 0x6d325924ddb855e3 }, + { 0x081386484420de87, -0x75e30fe94a6d124c, 0x39fa4e2729942d25, + 0x71a7fe6fe2482810 }, + { 0x6c7182b8a5c8c854, 0x33fd1479fe5f2a03, 0x72cf591883778d0c, + 0x4746c4b6559eeaa9 }, + }, + { + { -0x2c8884c3923965d5, -0x21054dd8907609e9, 0x45651cf7b53a16b5, + 0x5c9a51de34fe9fb7 }, + { 0x348546c864741147, 0x7d35aedd0efcc849, -0x006c6589f98d5cce, + 0x219663497db5e6d6 }, + { -0x0aef0e30860ef199, -0x0022255e19a7aea5, 0x09c3a71710142277, + 0x4804503c608223bb }, + }, + { + { -0x3bdb612fd35c8039, -0x5fa65f1c59ea5355, -0x775691283691f1dd, + 0x553398a51650696d }, + { 0x3b6821d23a36d175, -0x444bf558166461ce, 0x5d9e5ce420838a47, + 0x771e098858de4c5e }, + { -0x65ed0a2d87bae121, 0x3ada5d7985899ccb, 0x477f4a2d9fa59508, + 0x5a5ed1d68ff5a611 }, + }, + { + { 0x1195122afe150e83, -0x30df65da81b4ca28, 0x7387f8291e711e20, + 0x44acb897d8bf92f0 }, + { -0x451a1f3aa7ad8ca7, 0x392e5c19cadb9d7e, 0x28653c1eda1cabe9, + 0x019b60135fefdc44 }, + { 0x1e6068145e134b83, -0x3b0a19b0dbcfb3ea, 0x506e88a8fc1a3ed7, + 0x150c49fde6ad2f92 }, + }, + { + { -0x71840d6af6b8eec8, 0x5d6fef394f75a651, 0x10af79c425a708ad, + 0x6b2b5a075bb99922 }, + { -0x47b679c363235798, -0x37c0bb24478eb530, -0x01c11ca9f3c9e973, + 0x78a6d7791e05fbc1 }, + { 0x58bf704b47a0b976, -0x59fe4caa8be8b72b, -0x55d4e04e2abd0a70, + 0x725c7ffc4ad55d00 }, + }, + { + { -0x1bbd98ea2e30664e, 0x7352d51102a20d34, 0x23d1157b8b12109f, + 0x794cc9277cb1f3a3 }, + { -0x6e7fd408e32f6740, -0x01be935b12a19c9a, -0x20a7a28eb6fd66b4, + 0x4cd54625f855fae7 }, + { 0x4af6c426c2ac5053, -0x43651252cd098da8, 0x2ad032f10a311021, + 0x7008357b6fcc8e85 }, + }, + { + { 0x0b88672738773f01, -0x473337056a043305, -0x72d22a5c4652d64a, + 0x06ef7e9851ad0f6a }, + { -0x2fe460447da7b5cc, 0x47ab6463d2b4792b, -0x49ce9c63b7ac9dfe, + 0x13a92a3669d6d428 }, + { -0x356c88e33fa8821b, 0x7540e41e5035dc5c, 0x24680f01d802e071, + 0x3c296ddf8a2af86a }, + }, + { + { -0x5152ea0626eb58ed, -0x56d08406737006ee, -0x5007dce860ac28d0, + 0x7a99d393490c77ba }, + { -0x0314b2d144e0dabf, -0x476aef38bf5246e1, -0x038e5c822f5e52fb, + 0x0a892c700747717b }, + { -0x70ad12dbc9425c18, 0x77a8c84157e80794, -0x5a569a9cd9d06320, + 0x286762d28302f7d2 }, + }, + { + { 0x4e7836093ce35b25, -0x7d1ee7e24d945569, 0x0cc192d3cbc7b83f, + 0x32f1da046a9d9d3a }, + { 0x7c558e2bce2ef5bd, -0x1b67934b98b8439d, 0x154a179f3bbb89b8, + 0x7686f2a3d6f1767a }, + { -0x5572ed5992a68396, -0x70ee6cfcfb2c7ad5, 0x3f91dc73c209b022, + 0x561305f8a9ad28a6 }, + }, + { + { 0x100c978dec92aed1, -0x35bc2abcb2928c1b, -0x7cece4dd27b845b8, + 0x00aaec53e35d4d2c }, + { 0x6722cc28e7b0c0d5, 0x709de9bbdb075c53, -0x3509725828fef59f, + 0x030a1aef2c57cc6c }, + { 0x7bb1f773003ad2aa, 0x0b3f29802b216608, 0x7821dc86520ed23e, + 0x20be9c1c24065480 }, + }, + { + { -0x1eac7827db698c5a, 0x5943bc2df546e493, 0x1c7f9a81c36f63b5, + 0x750ab3361f0ac1de }, + { 0x20e0e44ae2025e60, -0x4fc4c4d0342346c8, 0x105d639cf95a0d1c, + 0x69764c545067e311 }, + { 0x1e8a3283a2f81037, 0x6f2eda23bd7fcbf1, -0x48d02ea453d1da9d, + 0x54f96b3fb7075040 }, + }, + { + { 0x0fadf20429669279, 0x3adda2047d7d724a, 0x6f3d94828c5760f1, + 0x3d7fe9c52bb7539e }, + { 0x177dafc616b11ecd, -0x7689b46305a89b87, -0x48575eef1913187b, + 0x78e6839fbe85dbf0 }, + { 0x70332df737b8856b, 0x75d05d43041a178a, 0x320ff74aa0e59e22, + 0x70f268f350088242 }, + }, + { + { 0x66864583b1805f47, -0x0aca3a2e9f2283e7, -0x1678b148e1b34ffa, + 0x7c0d345cfad889d9 }, + { 
0x2324112070dcf355, 0x380cc97ee7fce117, -0x4ce22112caad4968, + 0x404e56c039b8c4b9 }, + { 0x591f1f4b8c78338a, -0x5fc9954e981f4a1f, 0x5cbc4152b45f3d44, + 0x20d754762aaec777 }, + }, + { + { 0x5e8fc36fc73bb758, -0x531abc5ac9c34466, -0x566cb5826fc436de, + 0x2b8f1e46f3ceec62 }, + { -0x628b014eca460abd, -0x7b4c820e21736a94, -0x16cdd4f8a8ec7457, + 0x38b8ada8790b4ce1 }, + { -0x4a3fb56320ae06a3, 0x2b3952aecb1fdeac, 0x1d106d8b328b66da, + 0x049aeb32ceba1953 }, + }, + { + { -0x55af82f48a0386cf, 0x0fef924b7a6725d3, 0x1d82542b396b3930, + 0x795ee17530f674fc }, + { -0x288982c39c230182, 0x209c594897856e40, -0x4998979e1eb083ed, + 0x51c665e0c8d625fc }, + { 0x254a5b0a52ecbd81, 0x5d411f6ee034afe7, -0x195db2f23511b5cf, + 0x6cd19bf49dc54477 }, + }, + { + { 0x1ffe612165afc386, 0x082a2a88b8d51b10, 0x76f6627e20990baa, + 0x5e01b3a7429e43e7 }, + { 0x7e87619052179ca3, 0x571d0a060b2c9f85, -0x7f5d45577b668ee2, + 0x7520f3db40b2e638 }, + { 0x3db50be3d39357a1, -0x69849322a6616b5b, 0x1a309a64df311e6e, + 0x71092c9ccef3c986 }, + }, { - { 0x2fbc93c6f58c3b85, -0x306cd2390473f1e7, 0x270b4898643d42c2, 0x07cf9d3a33d4ba65, }, - { -0x62efc6fa28bf6ec2, -0x02c660fa2ebf414d, -0x5a3e7bcb977075f7, 0x44fd2f9298f81267, }, - { -0x5436edfa78855598, 0x26d9e823ccaac49e, 0x5a1b7dcbdd43598c, 0x6f117b689f0c65a8, }, + { -0x7a9427538bfae231, 0x03f6a40855b7aa1e, 0x3a4ae7cbc9743ceb, + 0x4173a5bb7137abde }, + { 0x53d8523f0364918c, -0x5d4bfb0bc05494e4, 0x080b4a9e6681e5a4, + 0x0ea15b03d0257ba7 }, + { 0x17c56e31f0f9218a, 0x5a696e2b1afc4708, -0x086ce9970b4d0e8a, + 0x5fc565614a4e3a67 }, }, { - { -0x50da4f57b31168d0, 0x025a8430e8864b8a, -0x3ee4affd60fe98ce, 0x7a164e1b9a80f8f4, }, - { 0x56611fe8a4fcd265, 0x3bd353fde5c1ba7d, -0x7ece0ce5deb42943, 0x2ab91587555bda62, }, - { 0x14ae933f0dd0d889, 0x589423221c35da62, -0x2e8f1aba730d24b4, 0x5a2826af12b9b4c6, }, + { 0x4892e1e67790988e, 0x01d5950f1c5cd722, -0x1c4f7e651a6dc113, + 0x3214c7409d46651b }, + { 0x136e570dc46d7ae5, 0x0fd0aacc54f8dc8f, 0x59549f03310dad86, + 0x62711c414c454aa1 }, + { 0x1329827406651770, 0x3ba4a0668a279436, -0x26494713e7a2ddc4, + 0x5bea94073ecb833c }, }, { - { -0x5ded43bbf75a44cd, -0x72afb73c38a112fe, -0x22e414f3a54013bc, 0x2945ccf146e206eb, }, - { 0x7f9182c3a447d6ba, -0x2affeb2eb4d8d649, -0x1cc30ee3479b5f79, 0x154a7e73eb1b55f3, }, - { -0x4344240e7ed57d7b, 0x270e0807d0bdd1fc, -0x4be498f4e44258d3, 0x43aabe696b3bb69a, }, + { -0x4b8f319c0cbc2d08, 0x0067ba8f0543e8f1, 0x35da51a1a2117b6f, + 0x4ad0785944f1bd2f }, + { 0x641dbf0912c89be4, -0x530c74ce8291a864, -0x540161fd09684f9b, + 0x3aacd5c148f61eec }, + { -0x7a71c4cb3cce7cff, -0x23663fb8f8ce97da, 0x34085b2ed39da88c, + 0x3aff0cb1d902853d }, }, { - { 0x6b1a5cd0944ea3bf, 0x7470353ab39dc0d2, 0x71b2528228542e49, 0x461bea69283c927e, }, - { -0x4590d36555cdde4f, 0x6ca021533bba23a7, -0x621589b06de6d3c6, 0x1d6edd5d2e5317e0, }, - { -0x0e7c9237fe474c5e, -0x4cfca0b8fac15b66, 0x529c41ba5877adf3, 0x7a9fbb1c6a0f90a7, }, + { -0x6dd9bcf40b3acafb, 0x68e49c13261f2283, 0x09ef33788fd327c6, + 0x2ccf9f732bd99e7f }, + { -0x783a3814c5dfbfa2, -0x711cee101252a937, 0x29252e48ad29d5f9, + 0x110e7e86f4cd251d }, + { 0x57c0d89ed603f5e4, 0x12888628f0b0200c, 0x53172709a02e3bb7, + 0x05c557e0b9693a37 }, }, { - { -0x64d1987559579cd1, -0x59af6190ae43b93b, -0x314dcc3639790a4b, 0x34b9ed338add7f59, }, - { -0x0c91de81fc627f9c, -0x675f7e490adfbe65, -0x693439f718a14fbc, 0x49c05a51fadc9c8f, }, - { 0x06b4e8bf9045af1b, -0x1d007c1758e62dd1, -0x550903d66c2b30ea, 0x73c172021b008b06, }, + { -0x0889444f763df150, 0x61f85bf6fa0fd85c, -0x4946c0b19cbbde05, + 0x289fef0841861205 }, + { -0x270631cee0368191, 
0x7a3f263011f9fdae, -0x1ea4815f7412da23, + 0x6e154c178fe9875a }, + { -0x309e9cc901296541, -0x64e91b187cca36b1, 0x13789765753a7fe7, + 0x6afbf642a95ca319 }, }, { - { 0x2fbf00848a802ade, -0x1a260130fdcfd1d9, 0x113e847117703406, 0x4275aae2546d8faf, }, - { 0x315f5b0249864348, 0x3ed6b36977088381, -0x5c5f8aaa9572146b, 0x18ab598029d5c77f, }, - { -0x27d4d33a029f7617, 0x031eb4a13282e4a4, 0x44311199b51a8622, 0x3dc65522b53df948, }, + { 0x5de55070f913a8cc, 0x7d1d167b2b0cf561, -0x25d6a9496f152b77, + 0x12c093cedb801ed9 }, + { 0x7da8de0c62f5d2c1, -0x6703c25b4ff18466, 0x7deb6ada0dad70e0, + 0x0db4b851b95038c4 }, + { -0x03eb806cf747e6f1, 0x06969da0a11ae310, -0x3118aa8d25382803, + 0x33aa8799c6635ce6 }, }, { - { -0x408f3ddd5dff8093, -0x407b4c654a432125, 0x537a0e12fb07ba07, 0x234fd7eec346f241, }, - { 0x506f013b327fbf93, -0x5103143664889095, -0x62ed4dcd5552a698, 0x0267882d176024a7, }, - { 0x5360a119732ea378, 0x2437e6b1df8dd471, -0x5d10c8076e581acd, 0x497ba6fdaa097863, }, + { -0x7cb70a7703ea934f, 0x6da2ba9b1a0a6d27, -0x1dd9d2a37835a54a, + 0x212cd0c1c8d589a6 }, + { -0x50f00ae142f7a30e, 0x78f51a8967d33f1f, 0x6ec2bfe15060033c, + 0x233c6f29e8e21a86 }, + { -0x2d0b2aef80e7387f, 0x122ecdf2527e9d28, -0x58f579d5c2c2ccbf, + 0x1db7778911914ce3 }, }, { - { 0x24cecc0313cfeaa0, -0x79b73d72e763db93, 0x2dbdbdfac1f2d4d0, 0x61e22917f12de72b, }, - { 0x040bcd86468ccf0b, -0x2c7d645bd566ef2a, 0x7508300807b25192, 0x43b5cd4218d05ebf, }, - { 0x5d9a762f9bd0b516, -0x14c750b1c8c02112, 0x032e5a7d93d64270, 0x511d61210ae4d842, }, + { -0x4cc6b896228fe54a, -0x1d47212be630725b, 0x15df4161fd2ac852, + 0x7ae2ca8a017d24be }, + { -0x220cadc683943d91, 0x7a97e2cc53d50113, 0x7c74f43abf79a330, + 0x31ad97ad26e2adfc }, + { -0x4817e812f6df469e, 0x1e8518cc3f19da9d, -0x1b6e3eb0daa9f59c, + 0x1ed1fc53a6622c83 }, }, }; #elif defined(CURVED25519_ASM_32BIT) static const ge_precomp Bi[8] = { { - { -0x0a73c47b, 0x2fbc93c6, -0x0473f1e7, -0x306cd23a, 0x643d42c2, 0x270b4898, 0x33d4ba65, 0x07cf9d3a, }, - { -0x28bf6ec2, -0x62efc6fb, -0x2ebf414d, -0x02c660fb, 0x688f8a09, -0x5a3e7bcc, -0x6707ed99, 0x44fd2f92, }, - { -0x78855598, -0x5436edfb, -0x33553b62, 0x26d9e823, -0x22bca674, 0x5a1b7dcb, -0x60f39a58, 0x6f117b68, }, + { -0x0a73c47b, 0x2fbc93c6, -0x0473f1e7, -0x306cd23a, 0x643d42c2, + 0x270b4898, 0x33d4ba65, 0x07cf9d3a }, + { -0x28bf6ec2, -0x62efc6fb, -0x2ebf414d, -0x02c660fb, 0x688f8a09, + -0x5a3e7bcc, -0x6707ed99, 0x44fd2f92 }, + { -0x78855598, -0x5436edfb, -0x33553b62, 0x26d9e823, -0x22bca674, + 0x5a1b7dcb, -0x60f39a58, 0x6f117b68 }, }, { - { 0x4cee9730, -0x50da4f58, -0x1779b476, 0x025a8430, -0x60fe98ce, -0x3ee4affe, -0x657f070c, 0x7a164e1b, }, - { -0x5b032d9b, 0x56611fe8, -0x1a3e4583, 0x3bd353fd, 0x214bd6bd, -0x7ece0ce6, 0x555bda62, 0x2ab91587, }, - { 0x0dd0d889, 0x14ae933f, 0x1c35da62, 0x58942322, -0x730d24b4, -0x2e8f1abb, 0x12b9b4c6, 0x5a2826af, }, + { 0x4cee9730, -0x50da4f58, -0x1779b476, 0x025a8430, -0x60fe98ce, + -0x3ee4affe, -0x657f070c, 0x7a164e1b }, + { -0x5b032d9b, 0x56611fe8, -0x1a3e4583, 0x3bd353fd, 0x214bd6bd, + -0x7ece0ce6, 0x555bda62, 0x2ab91587 }, + { 0x0dd0d889, 0x14ae933f, 0x1c35da62, 0x58942322, -0x730d24b4, + -0x2e8f1abb, 0x12b9b4c6, 0x5a2826af }, }, { - { 0x08a5bb33, -0x5ded43bc, -0x38a112fe, -0x72afb73d, 0x5abfec44, -0x22e414f4, 0x46e206eb, 0x2945ccf1, }, - { -0x5bb82946, 0x7f9182c3, 0x4b2729b7, -0x2affeb2f, -0x479b5f79, -0x1cc30ee4, -0x14e4aa0d, 0x154a7e73, }, - { -0x7ed57d7b, -0x4344240f, -0x2f422e04, 0x270e0807, 0x1bbda72d, -0x4be498f5, 0x6b3bb69a, 0x43aabe69, }, + { 0x08a5bb33, -0x5ded43bc, -0x38a112fe, -0x72afb73d, 0x5abfec44, + -0x22e414f4, 
0x46e206eb, 0x2945ccf1 }, + { -0x5bb82946, 0x7f9182c3, 0x4b2729b7, -0x2affeb2f, -0x479b5f79, + -0x1cc30ee4, -0x14e4aa0d, 0x154a7e73 }, + { -0x7ed57d7b, -0x4344240f, -0x2f422e04, 0x270e0807, 0x1bbda72d, + -0x4be498f5, 0x6b3bb69a, 0x43aabe69 }, }, { - { -0x6bb15c41, 0x6b1a5cd0, -0x4c623f2e, 0x7470353a, 0x28542e49, 0x71b25282, 0x283c927e, 0x461bea69, }, - { -0x55cdde4f, -0x4590d366, 0x3bba23a7, 0x6ca02153, -0x6de6d3c6, -0x621589b1, 0x2e5317e0, 0x1d6edd5d, }, - { 0x01b8b3a2, -0x0e7c9238, 0x053ea49a, -0x4cfca0b9, 0x5877adf3, 0x529c41ba, 0x6a0f90a7, 0x7a9fbb1c, }, + { -0x6bb15c41, 0x6b1a5cd0, -0x4c623f2e, 0x7470353a, 0x28542e49, + 0x71b25282, 0x283c927e, 0x461bea69 }, + { -0x55cdde4f, -0x4590d366, 0x3bba23a7, 0x6ca02153, -0x6de6d3c6, + -0x621589b1, 0x2e5317e0, 0x1d6edd5d }, + { 0x01b8b3a2, -0x0e7c9238, 0x053ea49a, -0x4cfca0b9, 0x5877adf3, + 0x529c41ba, 0x6a0f90a7, 0x7a9fbb1c }, }, { - { -0x59579cd1, -0x64d19876, 0x51bc46c5, -0x59af6191, -0x39790a4b, -0x314dcc37, -0x752280a7, 0x34b9ed33, }, - { 0x039d8064, -0x0c91de82, -0x0adfbe65, -0x675f7e4a, -0x18a14fbc, -0x693439f8, -0x05236371, 0x49c05a51, }, - { -0x6fba50e5, 0x06b4e8bf, -0x58e62dd1, -0x1d007c18, -0x6c2b30ea, -0x550903d7, 0x1b008b06, 0x73c17202, }, + { -0x59579cd1, -0x64d19876, 0x51bc46c5, -0x59af6191, -0x39790a4b, + -0x314dcc37, -0x752280a7, 0x34b9ed33 }, + { 0x039d8064, -0x0c91de82, -0x0adfbe65, -0x675f7e4a, -0x18a14fbc, + -0x693439f8, -0x05236371, 0x49c05a51 }, + { -0x6fba50e5, 0x06b4e8bf, -0x58e62dd1, -0x1d007c18, -0x6c2b30ea, + -0x550903d7, 0x1b008b06, 0x73c17202 }, }, { - { -0x757fd522, 0x2fbf0084, 0x02302e27, -0x1a260131, 0x17703406, 0x113e8471, 0x546d8faf, 0x4275aae2, }, - { 0x49864348, 0x315f5b02, 0x77088381, 0x3ed6b369, 0x6a8deb95, -0x5c5f8aab, 0x29d5c77f, 0x18ab5980, }, - { -0x029f7617, -0x27d4d33b, 0x3282e4a4, 0x031eb4a1, -0x4ae579de, 0x44311199, -0x4ac206b8, 0x3dc65522, }, + { -0x757fd522, 0x2fbf0084, 0x02302e27, -0x1a260131, 0x17703406, + 0x113e8471, 0x546d8faf, 0x4275aae2 }, + { 0x49864348, 0x315f5b02, 0x77088381, 0x3ed6b369, 0x6a8deb95, + -0x5c5f8aab, 0x29d5c77f, 0x18ab5980 }, + { -0x029f7617, -0x27d4d33b, 0x3282e4a4, 0x031eb4a1, -0x4ae579de, + 0x44311199, -0x4ac206b8, 0x3dc65522 }, }, { - { -0x5dff8093, -0x408f3dde, -0x4a432125, -0x407b4c66, -0x04f845f9, 0x537a0e12, -0x3cb90dbf, 0x234fd7ee, }, - { 0x327fbf93, 0x506f013b, -0x64889095, -0x51031437, -0x5552a698, -0x62ed4dce, 0x176024a7, 0x0267882d, }, - { 0x732ea378, 0x5360a119, -0x20722b8f, 0x2437e6b1, -0x6e581acd, -0x5d10c808, -0x55f6879d, 0x497ba6fd, }, + { -0x5dff8093, -0x408f3dde, -0x4a432125, -0x407b4c66, -0x04f845f9, + 0x537a0e12, -0x3cb90dbf, 0x234fd7ee }, + { 0x327fbf93, 0x506f013b, -0x64889095, -0x51031437, -0x5552a698, + -0x62ed4dce, 0x176024a7, 0x0267882d }, + { 0x732ea378, 0x5360a119, -0x20722b8f, 0x2437e6b1, -0x6e581acd, + -0x5d10c808, -0x55f6879d, 0x497ba6fd }, }, { - { 0x13cfeaa0, 0x24cecc03, 0x189c246d, -0x79b73d73, -0x3e0d2b30, 0x2dbdbdfa, -0x0ed218d5, 0x61e22917, }, - { 0x468ccf0b, 0x040bcd86, 0x2a9910d6, -0x2c7d645c, 0x07b25192, 0x75083008, 0x18d05ebf, 0x43b5cd42, }, - { -0x642f4aea, 0x5d9a762f, 0x373fdeee, -0x14c750b2, -0x6c29bd90, 0x032e5a7d, 0x0ae4d842, 0x511d6121, }, + { 0x13cfeaa0, 0x24cecc03, 0x189c246d, -0x79b73d73, -0x3e0d2b30, + 0x2dbdbdfa, -0x0ed218d5, 0x61e22917 }, + { 0x468ccf0b, 0x040bcd86, 0x2a9910d6, -0x2c7d645c, 0x07b25192, + 0x75083008, 0x18d05ebf, 0x43b5cd42 }, + { -0x642f4aea, 0x5d9a762f, 0x373fdeee, -0x14c750b2, -0x6c29bd90, + 0x032e5a7d, 0x0ae4d842, 0x511d6121 }, }, }; #elif defined(CURVED25519_128BIT) static const ge_precomp Bi[8] = { { 
- { 0x493c6f58c3b85, 0x0df7181c325f7, 0x0f50b0b3e4cb7, 0x5329385a44c32, 0x07cf9d3a33d4b }, - { 0x03905d740913e, 0x0ba2817d673a2, 0x23e2827f4e67c, 0x133d2e0c21a34, 0x44fd2f9298f81 }, - { 0x11205877aaa68, 0x479955893d579, 0x50d66309b67a0, 0x2d42d0dbee5ee, 0x6f117b689f0c6 }, + { 0x493c6f58c3b85, 0x0df7181c325f7, 0x0f50b0b3e4cb7, 0x5329385a44c32, + 0x07cf9d3a33d4b }, + { 0x03905d740913e, 0x0ba2817d673a2, 0x23e2827f4e67c, 0x133d2e0c21a34, + 0x44fd2f9298f81 }, + { 0x11205877aaa68, 0x479955893d579, 0x50d66309b67a0, 0x2d42d0dbee5ee, + 0x6f117b689f0c6 }, }, { - { 0x5b0a84cee9730, 0x61d10c97155e4, 0x4059cc8096a10, 0x47a608da8014f, 0x7a164e1b9a80f }, - { 0x11fe8a4fcd265, 0x7bcb8374faacc, 0x52f5af4ef4d4f, 0x5314098f98d10, 0x2ab91587555bd }, - { 0x6933f0dd0d889, 0x44386bb4c4295, 0x3cb6d3162508c, 0x26368b872a2c6, 0x5a2826af12b9b }, + { 0x5b0a84cee9730, 0x61d10c97155e4, 0x4059cc8096a10, 0x47a608da8014f, + 0x7a164e1b9a80f }, + { 0x11fe8a4fcd265, 0x7bcb8374faacc, 0x52f5af4ef4d4f, 0x5314098f98d10, + 0x2ab91587555bd }, + { 0x6933f0dd0d889, 0x44386bb4c4295, 0x3cb6d3162508c, 0x26368b872a2c6, + 0x5a2826af12b9b }, }, { - { 0x2bc4408a5bb33, 0x078ebdda05442, 0x2ffb112354123, 0x375ee8df5862d, 0x2945ccf146e20 }, - { 0x182c3a447d6ba, 0x22964e536eff2, 0x192821f540053, 0x2f9f19e788e5c, 0x154a7e73eb1b5 }, - { 0x3dbf1812a8285, 0x0fa17ba3f9797, 0x6f69cb49c3820, 0x34d5a0db3858d, 0x43aabe696b3bb }, + { 0x2bc4408a5bb33, 0x078ebdda05442, 0x2ffb112354123, 0x375ee8df5862d, + 0x2945ccf146e20 }, + { 0x182c3a447d6ba, 0x22964e536eff2, 0x192821f540053, 0x2f9f19e788e5c, + 0x154a7e73eb1b5 }, + { 0x3dbf1812a8285, 0x0fa17ba3f9797, 0x6f69cb49c3820, 0x34d5a0db3858d, + 0x43aabe696b3bb }, }, { - { 0x25cd0944ea3bf, 0x75673b81a4d63, 0x150b925d1c0d4, 0x13f38d9294114, 0x461bea69283c9 }, - { 0x72c9aaa3221b1, 0x267774474f74d, 0x064b0e9b28085, 0x3f04ef53b27c9, 0x1d6edd5d2e531 }, - { 0x36dc801b8b3a2, 0x0e0a7d4935e30, 0x1deb7cecc0d7d, 0x053a94e20dd2c, 0x7a9fbb1c6a0f9 }, + { 0x25cd0944ea3bf, 0x75673b81a4d63, 0x150b925d1c0d4, 0x13f38d9294114, + 0x461bea69283c9 }, + { 0x72c9aaa3221b1, 0x267774474f74d, 0x064b0e9b28085, 0x3f04ef53b27c9, + 0x1d6edd5d2e531 }, + { 0x36dc801b8b3a2, 0x0e0a7d4935e30, 0x1deb7cecc0d7d, 0x053a94e20dd2c, + 0x7a9fbb1c6a0f9 }, }, { - { 0x6678aa6a8632f, 0x5ea3788d8b365, 0x21bd6d6994279, 0x7ace75919e4e3, 0x34b9ed338add7 }, - { 0x6217e039d8064, 0x6dea408337e6d, 0x57ac112628206, 0x647cb65e30473, 0x49c05a51fadc9 }, - { 0x4e8bf9045af1b, 0x514e33a45e0d6, 0x7533c5b8bfe0f, 0x583557b7e14c9, 0x73c172021b008 }, + { 0x6678aa6a8632f, 0x5ea3788d8b365, 0x21bd6d6994279, 0x7ace75919e4e3, + 0x34b9ed338add7 }, + { 0x6217e039d8064, 0x6dea408337e6d, 0x57ac112628206, 0x647cb65e30473, + 0x49c05a51fadc9 }, + { 0x4e8bf9045af1b, 0x514e33a45e0d6, 0x7533c5b8bfe0f, 0x583557b7e14c9, + 0x73c172021b008 }, }, { - { 0x700848a802ade, 0x1e04605c4e5f7, 0x5c0d01b9767fb, 0x7d7889f42388b, 0x4275aae2546d8 }, - { 0x75b0249864348, 0x52ee11070262b, 0x237ae54fb5acd, 0x3bfd1d03aaab5, 0x18ab598029d5c }, - { 0x32cc5fd6089e9, 0x426505c949b05, 0x46a18880c7ad2, 0x4a4221888ccda, 0x3dc65522b53df }, + { 0x700848a802ade, 0x1e04605c4e5f7, 0x5c0d01b9767fb, 0x7d7889f42388b, + 0x4275aae2546d8 }, + { 0x75b0249864348, 0x52ee11070262b, 0x237ae54fb5acd, 0x3bfd1d03aaab5, + 0x18ab598029d5c }, + { 0x32cc5fd6089e9, 0x426505c949b05, 0x46a18880c7ad2, 0x4a4221888ccda, + 0x3dc65522b53df }, }, { - { 0x0c222a2007f6d, 0x356b79bdb77ee, 0x41ee81efe12ce, 0x120a9bd07097d, 0x234fd7eec346f }, - { 0x7013b327fbf93, 0x1336eeded6a0d, 0x2b565a2bbf3af, 0x253ce89591955, 0x0267882d17602 }, - { 0x0a119732ea378, 
0x63bf1ba8e2a6c, 0x69f94cc90df9a, 0x431d1779bfc48, 0x497ba6fdaa097 }, + { 0x0c222a2007f6d, 0x356b79bdb77ee, 0x41ee81efe12ce, 0x120a9bd07097d, + 0x234fd7eec346f }, + { 0x7013b327fbf93, 0x1336eeded6a0d, 0x2b565a2bbf3af, 0x253ce89591955, + 0x0267882d17602 }, + { 0x0a119732ea378, 0x63bf1ba8e2a6c, 0x69f94cc90df9a, 0x431d1779bfc48, + 0x497ba6fdaa097 }, }, { - { 0x6cc0313cfeaa0, 0x1a313848da499, 0x7cb534219230a, 0x39596dedefd60, 0x61e22917f12de }, - { 0x3cd86468ccf0b, 0x48553221ac081, 0x6c9464b4e0a6e, 0x75fba84180403, 0x43b5cd4218d05 }, - { 0x2762f9bd0b516, 0x1c6e7fbddcbb3, 0x75909c3ace2bd, 0x42101972d3ec9, 0x511d61210ae4d }, + { 0x6cc0313cfeaa0, 0x1a313848da499, 0x7cb534219230a, 0x39596dedefd60, + 0x61e22917f12de }, + { 0x3cd86468ccf0b, 0x48553221ac081, 0x6c9464b4e0a6e, 0x75fba84180403, + 0x43b5cd4218d05 }, + { 0x2762f9bd0b516, 0x1c6e7fbddcbb3, 0x75909c3ace2bd, 0x42101972d3ec9, + 0x511d61210ae4d }, }, }; #else static const ge_precomp Bi[8] = { - { - { 25967493,-14356035,29566456,3660896,-12694345,4014787,27544626,-11754271,-6079156,2047605 }, - { -12545711,934262,-2722910,3049990,-727428,9406986,12720692,5043384,19500929,-15469378 }, - { -8738181,4489570,9688441,-14785194,10184609,-12363380,29287919,11864899,-24514362,-4438546 }, - }, - { - { 15636291,-9688557,24204773,-7912398,616977,-16685262,27787600,-14772189,28944400,-1550024 }, - { 16568933,4717097,-11556148,-1102322,15682896,-11807043,16354577,-11775962,7689662,11199574 }, - { 30464156,-5976125,-11779434,-15670865,23220365,15915852,7512774,10017326,-17749093,-9920357 }, - }, - { - { 10861363,11473154,27284546,1981175,-30064349,12577861,32867885,14515107,-15438304,10819380 }, - { 4708026,6336745,20377586,9066809,-11272109,6594696,-25653668,12483688,-12668491,5581306 }, - { 19563160,16186464,-29386857,4097519,10237984,-4348115,28542350,13850243,-23678021,-15815942 }, - }, - { - { 5153746,9909285,1723747,-2777874,30523605,5516873,19480852,5230134,-23952439,-15175766 }, - { -30269007,-3463509,7665486,10083793,28475525,1649722,20654025,16520125,30598449,7715701 }, - { 28881845,14381568,9657904,3680757,-20181635,7843316,-31400660,1370708,29794553,-1409300 }, - }, - { - { -22518993,-6692182,14201702,-8745502,-23510406,8844726,18474211,-1361450,-13062696,13821877 }, - { -6455177,-7839871,3374702,-4740862,-27098617,-10571707,31655028,-7212327,18853322,-14220951 }, - { 4566830,-12963868,-28974889,-12240689,-7602672,-2830569,-8514358,-10431137,2207753,-3209784 }, - }, - { - { -25154831,-4185821,29681144,7868801,-6854661,-9423865,-12437364,-663000,-31111463,-16132436 }, - { 25576264,-2703214,7349804,-11814844,16472782,9300885,3844789,15725684,171356,6466918 }, - { 23103977,13316479,9739013,-16149481,817875,-15038942,8965339,-14088058,-30714912,16193877 }, - }, - { - { -33521811,3180713,-2394130,14003687,-16903474,-16270840,17238398,4729455,-18074513,9256800 }, - { -25182317,-4174131,32336398,5036987,-21236817,11360617,22616405,9761698,-19827198,630305 }, - { -13720693,2639453,-24237460,-7406481,9494427,-5774029,-6554551,-15960994,-2449256,-14291300 }, - }, - { - { -3151181,-5046075,9282714,6866145,-31907062,-863023,-18940575,15033784,25105118,-7894876 }, - { -24326370,15950226,-31801215,-14592823,-11662737,-5090925,1573892,-2625887,2198790,-15804619 }, - { -3099351,10324967,-2241613,7453183,-5446979,-2735503,-13812022,-16236442,-32461234,-12290683 }, - }, -} ; + { + { 0x18c3b85, -0x0db0e43, 0x1c325f8, 0x037dc60, -0x0c1b349, + 0x03d42c3, 0x1a44c32, -0x0b35b1f, -0x05cc2b4, 0x01f3e75 }, + { -0x0bf6eaf, 0x00e4176, -0x0298c5e, 0x02e8a06, -0x00b1984, + 
0x08f8a0a, 0x0c21a34, 0x04cf4b8, 0x1298f81, -0x0ec0b42 }, + { -0x0855585, 0x0448162, 0x093d579, -0x0e19aaa, 0x09b67a1, + -0x0bca674, 0x1bee5ef, 0x0b50b43, -0x1760f3a, -0x043ba12 }, + }, + { + { 0x0ee9743, -0x093d5ed, 0x17155e5, -0x078bbce, 0x0096a11, + -0x0fe98ce, 0x1a80150, -0x0e167dd, 0x1b9a810, -0x017a6c8 }, + { 0x0fcd265, 0x047fa29, -0x0b05534, -0x010d1f2, 0x0ef4d50, + -0x0b42943, 0x0f98d11, -0x0b3afda, 0x07555be, 0x0aae456 }, + { 0x1d0d89c, -0x05b303d, -0x0b3bd6a, -0x0ef1e51, 0x162508d, + 0x0f2db4c, 0x072a2c6, 0x098da2e, -0x10ed465, -0x0975f65 }, + }, + { + { 0x0a5bb33, 0x0af1102, 0x1a05442, 0x01e3af7, -0x1cabedd, + 0x0bfec45, 0x1f5862d, 0x0dd7ba3, -0x0eb91e0, 0x0a51734 }, + { 0x047d6ba, 0x060b0e9, 0x136eff2, 0x08a5939, -0x0abffad, + 0x064a088, -0x18771a4, 0x0be7c68, -0x0c14e4b, 0x05529fa }, + { 0x12a8298, 0x0f6fc60, -0x1c06869, 0x03e85ef, 0x09c3820, + -0x04258d3, 0x1b3858e, 0x0d35683, -0x1694c45, -0x0f15506 }, + }, + { + { 0x04ea3d2, 0x0973425, 0x01a4d63, -0x02a6312, 0x1d1c0d5, + 0x0542e49, 0x1294114, 0x04fce36, -0x16d7c37, -0x0e79056 }, + { -0x1cdde4f, -0x034d955, 0x074f74e, 0x099ddd1, 0x1b28085, + 0x0192c3a, 0x13b27c9, 0x0fc13bd, 0x1d2e531, 0x075bb75 }, + { 0x1b8b3b5, 0x0db7200, 0x0935e30, 0x03829f5, -0x133f283, + 0x077adf4, -0x1df22d4, 0x014ea54, 0x1c6a0f9, -0x0158114 }, + }, + { + { -0x1579cd1, -0x0661d56, 0x0d8b366, -0x085721e, -0x166bd86, + 0x086f5b6, 0x119e4e3, -0x014c62a, -0x0c75228, 0x0d2e7b5 }, + { -0x0627f89, -0x077a07f, 0x0337e6e, -0x04856fe, -0x19d7df9, + -0x0a14fbb, 0x1e30474, -0x06e0d27, 0x11fadca, -0x0d8fe97 }, + { 0x045af2e, -0x0c5d01c, -0x1ba1f29, -0x0bac731, -0x07401f0, + -0x02b30e9, -0x081eb36, -0x09f2aa1, 0x021b009, -0x030fa38 }, + }, + { + { -0x17fd50f, -0x03fdedd, 0x1c4e5f8, 0x0781181, -0x0689805, + -0x08fcbf9, -0x0bdc774, -0x00a1dd8, -0x1dab927, -0x0f62954 }, + { 0x1864348, -0x0293f6e, 0x070262c, -0x0b447bc, 0x0fb5ace, + 0x08deb95, 0x03aaab5, 0x0eff474, 0x0029d5c, 0x062ad66 }, + { 0x16089e9, 0x0cb317f, 0x0949b05, -0x0f66be9, 0x00c7ad3, + -0x0e579de, 0x088ccdb, -0x0d6f77a, -0x1d4ac20, 0x0f71955 }, + }, + { + { -0x1ff8093, 0x03088a9, -0x0248812, 0x0d5ade7, -0x101ed32, + -0x0f845f8, 0x107097e, 0x0482a6f, -0x113cb91, 0x08d3f60 }, + { -0x180406d, -0x03fb133, 0x1ed6a0e, 0x04cdbbb, -0x1440c51, + 0x0ad5969, 0x1591955, 0x094f3a2, -0x12e89fe, 0x0099e21 }, + { -0x0d15c75, 0x028465d, -0x171d594, -0x0710391, 0x090df9b, + -0x0581acd, -0x06403b7, -0x0f38ba2, -0x0255f68, -0x0da1164 }, + }, + { + { -0x030154d, -0x04cff3b, 0x08da49a, 0x068c4e1, -0x1e6dcf6, + -0x00d2b2f, -0x121029f, 0x0e565b8, 0x17f12de, -0x078775c }, + { -0x17330e2, 0x0f36192, -0x1e53f7f, -0x0deab37, -0x0b1f591, + -0x04dae6d, 0x0180404, -0x028115f, 0x0218d06, -0x0f128cb }, + { -0x02f4ad7, 0x09d8be7, -0x022344d, 0x071b9ff, -0x0531d43, + -0x029bd8f, -0x0d2c136, -0x0f7bf9a, -0x1ef51b2, -0x0bb8a7b }, + }, +}; #endif @@ -9405,7 +9696,7 @@ int ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, #else signed char aslide[SLIDE_SIZE]; signed char bslide[SLIDE_SIZE]; - ge_cached Ai[8]; /* A,3A,5A,7A,9A,11A,13A,15A */ + ge_cached Ai[16]; /* A,3A,5A,7A,9A,11A,13A,15A */ ge_p1p1 t[1]; ge_p3 u[1]; @@ -9416,7 +9707,7 @@ int ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC) if (((aslide = (signed char *)XMALLOC(SLIDE_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER))== NULL) || ((bslide = (signed char *)XMALLOC(SLIDE_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER))== NULL) || - ((Ai = (ge_cached *)XMALLOC(8 * sizeof(*Ai), NULL, DYNAMIC_TYPE_TMP_BUFFER))== NULL) || 
+ ((Ai = (ge_cached *)XMALLOC(16 * sizeof(*Ai), NULL, DYNAMIC_TYPE_TMP_BUFFER))== NULL) || ((t = (ge_p1p1 *)XMALLOC(sizeof(*t), NULL, DYNAMIC_TYPE_TMP_BUFFER))== NULL) || ((u = (ge_p3 *)XMALLOC(sizeof(*u), NULL, DYNAMIC_TYPE_TMP_BUFFER))== NULL) || ((A2 = (ge_p3 *)XMALLOC(sizeof(*A2), NULL, DYNAMIC_TYPE_TMP_BUFFER))== NULL)) @@ -9427,8 +9718,12 @@ int ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, ret = 0; #endif - slide(aslide,a); - slide(bslide,b); + slide(aslide,a,15); +#ifdef CURVED25519_ASM_64BIT + slide(bslide,b,63); +#else + slide(bslide,b,15); +#endif ge_p3_to_cached(&Ai[0],A); ge_p3_dbl(t,A); ge_p1p1_to_p3(A2,t); @@ -9442,8 +9737,60 @@ int ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, ge_p2_0(r); - for (i = 255;i >= 0;--i) { - if (aslide[i] || bslide[i]) break; + for (i = SLIDE_SIZE-1;i >= 0;--i) { + if (aslide[i] || bslide[i]) { +#ifdef CURVED25519_ASM_64BIT + if (aslide[i] > 0) { + fe_copy(t->Z, Ai[aslide[i]/2].YplusX); + fe_copy(t->Y, Ai[aslide[i]/2].YminusX); + fe_sub(t->X,t->Z,t->Y); + fe_add(t->Y,t->Z,t->Y); + fe_add(t->T,Ai[aslide[i]/2].Z, Ai[aslide[i]/2].Z); + fe_copy(t->Z,t->T); + } else if (aslide[i] < 0) { + fe_copy(t->Z, Ai[aslide[i]/2].YminusX); + fe_copy(t->Y, Ai[aslide[i]/2].YplusX); + fe_sub(t->X,t->Z,t->Y); + fe_add(t->Y,t->Z,t->Y); + fe_add(t->T,Ai[aslide[i]/2].Z, Ai[aslide[i]/2].Z); + fe_copy(t->Z,t->T); + } + + if (bslide[i] > 0) { + if (aslide[i] != 0) { + ge_p1p1_to_p3(u,t); + ge_madd(t,u,&Bi[bslide[i]/2]); + } else { + fe_copy(t->Z,Bi[bslide[i]/2].yplusx); + fe_copy(t->Y,Bi[bslide[i]/2].yminusx); + fe_sub(t->X,t->Z,t->Y); + fe_add(t->Y,t->Z,t->Y); + fe_0(t->T); + t->T[0] = 0x2; + fe_0(t->Z); + t->Z[0] = 0x2; + } + } else if (bslide[i] < 0) { + if (aslide[i] != 0) { + ge_p1p1_to_p3(u,t); + ge_msub(t,u,&Bi[(-bslide[i])/2]); + } else { + fe_copy(t->Z,Bi[bslide[i]/2].yminusx); + fe_copy(t->Y,Bi[bslide[i]/2].yplusx); + fe_sub(t->X,t->Z,t->Y); + fe_add(t->Y,t->Z,t->Y); + fe_0(t->T); + t->T[0] = 0x2; + fe_0(t->Z); + t->Z[0] = 0x2; + } + } + + ge_p1p1_to_p2(r,t); + --i; +#endif + break; + } } for (;i >= 0;--i) { @@ -9556,6 +9903,11 @@ int ge_frombytes_negate_vartime(ge_p3 *h,const unsigned char *s) fe_mul(h->X,h->X,u); /* x = uv^7 */ fe_pow22523(h->X,h->X); /* x = (uv^7)^((q-5)/8) */ + /* Alternative if inversion very fast. 
+ * x^2^252 * invert(x^3) + * = x^2^252 * x^-3 + * = x^(2^252 - 3) + */ fe_mul(h->X,h->X,v3); fe_mul(h->X,h->X,u); /* x = uv^3(uv^7)^((q-5)/8) */ @@ -9823,6 +10175,24 @@ void ge_tobytes(unsigned char *s,const ge_p2 *h) s[31] ^= (unsigned char)((unsigned char)fe_isnegative(x) << 7); } +#ifndef CURVED25519_ASM_64BIT + #define fe_invert_nct fe_invert +#endif + +/* ge tobytes */ +void ge_tobytes_nct(unsigned char *s,const ge_p2 *h) +{ + ge recip; + ge x; + ge y; + + fe_invert_nct(recip,h->Z); + fe_mul(x,h->X,recip); + fe_mul(y,h->Y,recip); + fe_tobytes(s,y); + s[31] ^= (unsigned char)((unsigned char)fe_isnegative(x) << 7); +} + #endif /* !ED25519_SMALL */ /* if HAVE_ED25519 but not HAVE_CURVE25519, and an asm implementation is built, diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S index ffaf2d6b54..f7492d3991 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -555,6 +555,253 @@ _fe_cmov_table: #endif /* __APPLE__ */ #ifndef __APPLE__ .text +.globl fe_invert_nct +.type fe_invert_nct,@function +.align 2 +fe_invert_nct: +#else +.section __TEXT,__text +.globl _fe_invert_nct +.p2align 2 +_fe_invert_nct: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp x17, x19, [x29, #24] + stp x20, x21, [x29, #40] + stp x22, x23, [x29, #56] + str x24, [x29, #72] + mov x19, #-19 + mov x20, #-1 + mov x21, #0x7fffffffffffffff + ldr x6, [x1] + ldr x7, [x1, #8] + ldr x8, [x1, #16] + ldr x9, [x1, #24] + mov x2, x19 + mov x3, x20 + mov x4, x20 + mov x5, x21 + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, #1 + mov x15, xzr + mov x16, xzr + mov x17, xzr + mov x22, #0xff + cmp x9, #0 + beq L_fe_invert_nct_num_bits_init_v_0 + mov x24, #0x100 + clz x23, x9 + sub x23, x24, x23 + b L_fe_invert_nct_num_bits_init_v_3 +L_fe_invert_nct_num_bits_init_v_0: + cmp x8, #0 + beq L_fe_invert_nct_num_bits_init_v_1 + mov x24, #0xc0 + clz x23, x8 + sub x23, x24, x23 + b L_fe_invert_nct_num_bits_init_v_3 +L_fe_invert_nct_num_bits_init_v_1: + cmp x7, #0 + beq L_fe_invert_nct_num_bits_init_v_2 + mov x24, #0x80 + clz x23, x7 + sub x23, x24, x23 + b L_fe_invert_nct_num_bits_init_v_3 +L_fe_invert_nct_num_bits_init_v_2: + mov x24, #0x40 + clz x23, x6 + sub x23, x24, x23 +L_fe_invert_nct_num_bits_init_v_3: + tst x6, #1 + bne L_fe_invert_nct_loop +L_fe_invert_nct_even_init_v_0: + extr x6, x7, x6, #1 + extr x7, x8, x7, #1 + extr x8, x9, x8, #1 + lsr x9, x9, #1 + sub x23, x23, #1 + ands x24, x14, #1 + beq L_fe_invert_nct_even_init_v_1 + adds x14, x14, x19 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x21 + cset x24, cs +L_fe_invert_nct_even_init_v_1: + extr x14, x15, x14, #1 + extr x15, x16, x15, #1 + extr x16, x17, x16, #1 + extr x17, x24, x17, #1 + tst x6, #1 + beq L_fe_invert_nct_even_init_v_0 +L_fe_invert_nct_loop: + cmp x22, #1 + beq L_fe_invert_nct_u_done + cmp x23, #1 + beq L_fe_invert_nct_v_done + cmp x22, x23 + bhi L_fe_invert_nct_u_larger + bcc L_fe_invert_nct_v_larger + cmp x5, x9 + bhi L_fe_invert_nct_u_larger + bcc L_fe_invert_nct_v_larger + cmp x4, x8 + bhi L_fe_invert_nct_u_larger + bcc L_fe_invert_nct_v_larger + cmp x3, x7 + bhi L_fe_invert_nct_u_larger + bcc L_fe_invert_nct_v_larger + cmp x2, x6 + bcc L_fe_invert_nct_v_larger +L_fe_invert_nct_u_larger: + subs x2, x2, x6 + sbcs x3, x3, x7 + sbcs x4, x4, x8 + sbc x5, x5, x9 + subs x10, x10, x14 + sbcs x11, x11, x15 + sbcs x12, x12, x16 + sbcs x13, x13, x17 + bcs L_fe_invert_nct_sub_uv + adds x10, x10, x19 + adcs x11, x11, 
x20 + adcs x12, x12, x20 + adc x13, x13, x21 +L_fe_invert_nct_sub_uv: + cmp x5, #0 + beq L_fe_invert_nct_nct_num_bits_u_0 + mov x24, #0x100 + clz x22, x5 + sub x22, x24, x22 + b L_fe_invert_nct_nct_num_bits_u_3 +L_fe_invert_nct_nct_num_bits_u_0: + cmp x4, #0 + beq L_fe_invert_nct_nct_num_bits_u_1 + mov x24, #0xc0 + clz x22, x4 + sub x22, x24, x22 + b L_fe_invert_nct_nct_num_bits_u_3 +L_fe_invert_nct_nct_num_bits_u_1: + cmp x3, #0 + beq L_fe_invert_nct_nct_num_bits_u_2 + mov x24, #0x80 + clz x22, x3 + sub x22, x24, x22 + b L_fe_invert_nct_nct_num_bits_u_3 +L_fe_invert_nct_nct_num_bits_u_2: + mov x24, #0x40 + clz x22, x2 + sub x22, x24, x22 +L_fe_invert_nct_nct_num_bits_u_3: +L_fe_invert_nct_even_u_0: + extr x2, x3, x2, #1 + extr x3, x4, x3, #1 + extr x4, x5, x4, #1 + lsr x5, x5, #1 + sub x22, x22, #1 + ands x24, x10, #1 + beq L_fe_invert_nct_even_u_1 + adds x10, x10, x19 + adcs x11, x11, x20 + adcs x12, x12, x20 + adcs x13, x13, x21 + cset x24, cs +L_fe_invert_nct_even_u_1: + extr x10, x11, x10, #1 + extr x11, x12, x11, #1 + extr x12, x13, x12, #1 + extr x13, x24, x13, #1 + tst x2, #1 + beq L_fe_invert_nct_even_u_0 + b L_fe_invert_nct_loop +L_fe_invert_nct_v_larger: + subs x6, x6, x2 + sbcs x7, x7, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + subs x14, x14, x10 + sbcs x15, x15, x11 + sbcs x16, x16, x12 + sbcs x17, x17, x13 + bcs L_fe_invert_nct_sub_vu + adds x14, x14, x19 + adcs x15, x15, x20 + adcs x16, x16, x20 + adc x17, x17, x21 +L_fe_invert_nct_sub_vu: + cmp x9, #0 + beq L_fe_invert_nct_nct_num_bits_v_0 + mov x24, #0x100 + clz x23, x9 + sub x23, x24, x23 + b L_fe_invert_nct_nct_num_bits_v_3 +L_fe_invert_nct_nct_num_bits_v_0: + cmp x8, #0 + beq L_fe_invert_nct_nct_num_bits_v_1 + mov x24, #0xc0 + clz x23, x8 + sub x23, x24, x23 + b L_fe_invert_nct_nct_num_bits_v_3 +L_fe_invert_nct_nct_num_bits_v_1: + cmp x7, #0 + beq L_fe_invert_nct_nct_num_bits_v_2 + mov x24, #0x80 + clz x23, x7 + sub x23, x24, x23 + b L_fe_invert_nct_nct_num_bits_v_3 +L_fe_invert_nct_nct_num_bits_v_2: + mov x24, #0x40 + clz x23, x6 + sub x23, x24, x23 +L_fe_invert_nct_nct_num_bits_v_3: +L_fe_invert_nct_even_v_0: + extr x6, x7, x6, #1 + extr x7, x8, x7, #1 + extr x8, x9, x8, #1 + lsr x9, x9, #1 + sub x23, x23, #1 + ands x24, x14, #1 + beq L_fe_invert_nct_even_v_1 + adds x14, x14, x19 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x21 + cset x24, cs +L_fe_invert_nct_even_v_1: + extr x14, x15, x14, #1 + extr x15, x16, x15, #1 + extr x16, x17, x16, #1 + extr x17, x24, x17, #1 + tst x6, #1 + beq L_fe_invert_nct_even_v_0 + b L_fe_invert_nct_loop +L_fe_invert_nct_u_done: + str x10, [x0] + str x11, [x0, #8] + str x12, [x0, #16] + str x13, [x0, #24] + b L_fe_invert_nct_done +L_fe_invert_nct_v_done: + str x14, [x0] + str x15, [x0, #8] + str x16, [x0, #16] + str x17, [x0, #24] +L_fe_invert_nct_done: + ldp x17, x19, [x29, #24] + ldp x20, x21, [x29, #40] + ldp x22, x23, [x29, #56] + ldr x24, [x29, #72] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size fe_invert_nct,.-fe_invert_nct +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text .globl fe_mul .type fe_mul,@function .align 2 diff --git a/wolfcrypt/src/port/arm/armv8-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-curve25519_c.c index c91e32e1a7..2ffeb85e45 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-curve25519_c.c @@ -471,6 +471,263 @@ void fe_cmov_table(fe* r, fe* base, signed char b) ); } +void fe_invert_nct(word64* r, const word64* a) +{ + __asm__ __volatile__ ( + "mov x19, #-19\n\t" + "mov x20, #-1\n\t" + "mov x21, 
#0x7fffffffffffffff\n\t" + "ldr x6, [%x[a]]\n\t" + "ldr x7, [%x[a], #8]\n\t" + "ldr x8, [%x[a], #16]\n\t" + "ldr x9, [%x[a], #24]\n\t" + "mov x2, x19\n\t" + "mov x3, x20\n\t" + "mov x4, x20\n\t" + "mov x5, x21\n\t" + "mov x10, xzr\n\t" + "mov x11, xzr\n\t" + "mov x12, xzr\n\t" + "mov x13, xzr\n\t" + "mov x14, #1\n\t" + "mov x15, xzr\n\t" + "mov x16, xzr\n\t" + "mov x17, xzr\n\t" + "mov x22, #0xff\n\t" + "cmp x9, #0\n\t" + "b.eq L_fe_invert_nct_num_bits_init_v_0_%=\n\t" + "mov x24, #0x100\n\t" + "clz x23, x9\n\t" + "sub x23, x24, x23\n\t" + "b L_fe_invert_nct_num_bits_init_v_3_%=\n\t" + "\n" + "L_fe_invert_nct_num_bits_init_v_0_%=: \n\t" + "cmp x8, #0\n\t" + "b.eq L_fe_invert_nct_num_bits_init_v_1_%=\n\t" + "mov x24, #0xc0\n\t" + "clz x23, x8\n\t" + "sub x23, x24, x23\n\t" + "b L_fe_invert_nct_num_bits_init_v_3_%=\n\t" + "\n" + "L_fe_invert_nct_num_bits_init_v_1_%=: \n\t" + "cmp x7, #0\n\t" + "b.eq L_fe_invert_nct_num_bits_init_v_2_%=\n\t" + "mov x24, #0x80\n\t" + "clz x23, x7\n\t" + "sub x23, x24, x23\n\t" + "b L_fe_invert_nct_num_bits_init_v_3_%=\n\t" + "\n" + "L_fe_invert_nct_num_bits_init_v_2_%=: \n\t" + "mov x24, #0x40\n\t" + "clz x23, x6\n\t" + "sub x23, x24, x23\n\t" + "\n" + "L_fe_invert_nct_num_bits_init_v_3_%=: \n\t" + "tst x6, #1\n\t" + "b.ne L_fe_invert_nct_loop_%=\n\t" + "\n" + "L_fe_invert_nct_even_init_v_0_%=: \n\t" + "extr x6, x7, x6, #1\n\t" + "extr x7, x8, x7, #1\n\t" + "extr x8, x9, x8, #1\n\t" + "lsr x9, x9, #1\n\t" + "sub x23, x23, #1\n\t" + "ands x24, x14, #1\n\t" + "b.eq L_fe_invert_nct_even_init_v_1_%=\n\t" + "adds x14, x14, x19\n\t" + "adcs x15, x15, x20\n\t" + "adcs x16, x16, x20\n\t" + "adcs x17, x17, x21\n\t" + "cset x24, cs\n\t" + "\n" + "L_fe_invert_nct_even_init_v_1_%=: \n\t" + "extr x14, x15, x14, #1\n\t" + "extr x15, x16, x15, #1\n\t" + "extr x16, x17, x16, #1\n\t" + "extr x17, x24, x17, #1\n\t" + "tst x6, #1\n\t" + "b.eq L_fe_invert_nct_even_init_v_0_%=\n\t" + "\n" + "L_fe_invert_nct_loop_%=: \n\t" + "cmp x22, #1\n\t" + "b.eq L_fe_invert_nct_u_done_%=\n\t" + "cmp x23, #1\n\t" + "b.eq L_fe_invert_nct_v_done_%=\n\t" + "cmp x22, x23\n\t" + "bhi L_fe_invert_nct_u_larger_%=\n\t" + "bcc L_fe_invert_nct_v_larger_%=\n\t" + "cmp x5, x9\n\t" + "bhi L_fe_invert_nct_u_larger_%=\n\t" + "bcc L_fe_invert_nct_v_larger_%=\n\t" + "cmp x4, x8\n\t" + "bhi L_fe_invert_nct_u_larger_%=\n\t" + "bcc L_fe_invert_nct_v_larger_%=\n\t" + "cmp x3, x7\n\t" + "bhi L_fe_invert_nct_u_larger_%=\n\t" + "bcc L_fe_invert_nct_v_larger_%=\n\t" + "cmp x2, x6\n\t" + "bcc L_fe_invert_nct_v_larger_%=\n\t" + "\n" + "L_fe_invert_nct_u_larger_%=: \n\t" + "subs x2, x2, x6\n\t" + "sbcs x3, x3, x7\n\t" + "sbcs x4, x4, x8\n\t" + "sbc x5, x5, x9\n\t" + "subs x10, x10, x14\n\t" + "sbcs x11, x11, x15\n\t" + "sbcs x12, x12, x16\n\t" + "sbcs x13, x13, x17\n\t" + "bcs L_fe_invert_nct_sub_uv_%=\n\t" + "adds x10, x10, x19\n\t" + "adcs x11, x11, x20\n\t" + "adcs x12, x12, x20\n\t" + "adc x13, x13, x21\n\t" + "\n" + "L_fe_invert_nct_sub_uv_%=: \n\t" + "cmp x5, #0\n\t" + "b.eq L_fe_invert_nct_nct_num_bits_u_0_%=\n\t" + "mov x24, #0x100\n\t" + "clz x22, x5\n\t" + "sub x22, x24, x22\n\t" + "b L_fe_invert_nct_nct_num_bits_u_3_%=\n\t" + "\n" + "L_fe_invert_nct_nct_num_bits_u_0_%=: \n\t" + "cmp x4, #0\n\t" + "b.eq L_fe_invert_nct_nct_num_bits_u_1_%=\n\t" + "mov x24, #0xc0\n\t" + "clz x22, x4\n\t" + "sub x22, x24, x22\n\t" + "b L_fe_invert_nct_nct_num_bits_u_3_%=\n\t" + "\n" + "L_fe_invert_nct_nct_num_bits_u_1_%=: \n\t" + "cmp x3, #0\n\t" + "b.eq L_fe_invert_nct_nct_num_bits_u_2_%=\n\t" + "mov x24, #0x80\n\t" + "clz x22, 
x3\n\t" + "sub x22, x24, x22\n\t" + "b L_fe_invert_nct_nct_num_bits_u_3_%=\n\t" + "\n" + "L_fe_invert_nct_nct_num_bits_u_2_%=: \n\t" + "mov x24, #0x40\n\t" + "clz x22, x2\n\t" + "sub x22, x24, x22\n\t" + "\n" + "L_fe_invert_nct_nct_num_bits_u_3_%=: \n\t" + "\n" + "L_fe_invert_nct_even_u_0_%=: \n\t" + "extr x2, x3, x2, #1\n\t" + "extr x3, x4, x3, #1\n\t" + "extr x4, x5, x4, #1\n\t" + "lsr x5, x5, #1\n\t" + "sub x22, x22, #1\n\t" + "ands x24, x10, #1\n\t" + "b.eq L_fe_invert_nct_even_u_1_%=\n\t" + "adds x10, x10, x19\n\t" + "adcs x11, x11, x20\n\t" + "adcs x12, x12, x20\n\t" + "adcs x13, x13, x21\n\t" + "cset x24, cs\n\t" + "\n" + "L_fe_invert_nct_even_u_1_%=: \n\t" + "extr x10, x11, x10, #1\n\t" + "extr x11, x12, x11, #1\n\t" + "extr x12, x13, x12, #1\n\t" + "extr x13, x24, x13, #1\n\t" + "tst x2, #1\n\t" + "b.eq L_fe_invert_nct_even_u_0_%=\n\t" + "b L_fe_invert_nct_loop_%=\n\t" + "\n" + "L_fe_invert_nct_v_larger_%=: \n\t" + "subs x6, x6, x2\n\t" + "sbcs x7, x7, x3\n\t" + "sbcs x8, x8, x4\n\t" + "sbc x9, x9, x5\n\t" + "subs x14, x14, x10\n\t" + "sbcs x15, x15, x11\n\t" + "sbcs x16, x16, x12\n\t" + "sbcs x17, x17, x13\n\t" + "bcs L_fe_invert_nct_sub_vu_%=\n\t" + "adds x14, x14, x19\n\t" + "adcs x15, x15, x20\n\t" + "adcs x16, x16, x20\n\t" + "adc x17, x17, x21\n\t" + "\n" + "L_fe_invert_nct_sub_vu_%=: \n\t" + "cmp x9, #0\n\t" + "b.eq L_fe_invert_nct_nct_num_bits_v_0_%=\n\t" + "mov x24, #0x100\n\t" + "clz x23, x9\n\t" + "sub x23, x24, x23\n\t" + "b L_fe_invert_nct_nct_num_bits_v_3_%=\n\t" + "\n" + "L_fe_invert_nct_nct_num_bits_v_0_%=: \n\t" + "cmp x8, #0\n\t" + "b.eq L_fe_invert_nct_nct_num_bits_v_1_%=\n\t" + "mov x24, #0xc0\n\t" + "clz x23, x8\n\t" + "sub x23, x24, x23\n\t" + "b L_fe_invert_nct_nct_num_bits_v_3_%=\n\t" + "\n" + "L_fe_invert_nct_nct_num_bits_v_1_%=: \n\t" + "cmp x7, #0\n\t" + "b.eq L_fe_invert_nct_nct_num_bits_v_2_%=\n\t" + "mov x24, #0x80\n\t" + "clz x23, x7\n\t" + "sub x23, x24, x23\n\t" + "b L_fe_invert_nct_nct_num_bits_v_3_%=\n\t" + "\n" + "L_fe_invert_nct_nct_num_bits_v_2_%=: \n\t" + "mov x24, #0x40\n\t" + "clz x23, x6\n\t" + "sub x23, x24, x23\n\t" + "\n" + "L_fe_invert_nct_nct_num_bits_v_3_%=: \n\t" + "\n" + "L_fe_invert_nct_even_v_0_%=: \n\t" + "extr x6, x7, x6, #1\n\t" + "extr x7, x8, x7, #1\n\t" + "extr x8, x9, x8, #1\n\t" + "lsr x9, x9, #1\n\t" + "sub x23, x23, #1\n\t" + "ands x24, x14, #1\n\t" + "b.eq L_fe_invert_nct_even_v_1_%=\n\t" + "adds x14, x14, x19\n\t" + "adcs x15, x15, x20\n\t" + "adcs x16, x16, x20\n\t" + "adcs x17, x17, x21\n\t" + "cset x24, cs\n\t" + "\n" + "L_fe_invert_nct_even_v_1_%=: \n\t" + "extr x14, x15, x14, #1\n\t" + "extr x15, x16, x15, #1\n\t" + "extr x16, x17, x16, #1\n\t" + "extr x17, x24, x17, #1\n\t" + "tst x6, #1\n\t" + "b.eq L_fe_invert_nct_even_v_0_%=\n\t" + "b L_fe_invert_nct_loop_%=\n\t" + "\n" + "L_fe_invert_nct_u_done_%=: \n\t" + "str x10, [%x[r]]\n\t" + "str x11, [%x[r], #8]\n\t" + "str x12, [%x[r], #16]\n\t" + "str x13, [%x[r], #24]\n\t" + "b L_fe_invert_nct_done_%=\n\t" + "\n" + "L_fe_invert_nct_v_done_%=: \n\t" + "str x14, [%x[r]]\n\t" + "str x15, [%x[r], #8]\n\t" + "str x16, [%x[r], #16]\n\t" + "str x17, [%x[r], #24]\n\t" + "\n" + "L_fe_invert_nct_done_%=: \n\t" + : [r] "+r" (r) + : [a] "r" (a) + : "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", + "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", + "x21", "x22", "x23", "x24" + ); +} + void fe_mul(fe r, const fe a, const fe b) { __asm__ __volatile__ ( diff --git a/wolfssl/wolfcrypt/fe_operations.h b/wolfssl/wolfcrypt/fe_operations.h index 
4959cf2b83..02490e2463 100644 --- a/wolfssl/wolfcrypt/fe_operations.h +++ b/wolfssl/wolfcrypt/fe_operations.h @@ -49,7 +49,8 @@ #endif #if (defined(CURVED25519_ASM_64BIT) || defined(HAVE_ED25519)) && \ - !defined(WOLFSSL_CURVE25519_BLINDING) + !defined(WOLFSSL_CURVE25519_BLINDING) && \ + !defined(WOLFSSL_CURVE25519_NOT_USE_ED25519) #undef WOLFSSL_CURVE25519_USE_ED25519 #define WOLFSSL_CURVE25519_USE_ED25519 #endif @@ -133,6 +134,8 @@ WOLFSSL_LOCAL void fe_pow22523(fe out,const fe z); #ifdef CURVED25519_ASM WOLFSSL_LOCAL void fe_cmov_table(fe* r, fe* base, signed char b); + +WOLFSSL_LOCAL void fe_invert_nct(fe r, const fe a); #endif /* CURVED25519_ASM */ #endif /* !CURVE25519_SMALL || !ED25519_SMALL */ diff --git a/wolfssl/wolfcrypt/ge_operations.h b/wolfssl/wolfcrypt/ge_operations.h index 6b3d244055..62074fdf26 100644 --- a/wolfssl/wolfcrypt/ge_operations.h +++ b/wolfssl/wolfcrypt/ge_operations.h @@ -85,6 +85,11 @@ WOLFSSL_LOCAL void sc_reduce(byte* s); WOLFSSL_LOCAL void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c); WOLFSSL_LOCAL void ge_tobytes(unsigned char *s,const ge_p2 *h); +#ifndef ED25519_SMALL +WOLFSSL_LOCAL void ge_tobytes_nct(unsigned char *s,const ge_p2 *h); +#else +#define ge_tobytes_nct ge_tobytes +#endif #ifndef GE_P3_TOBYTES_IMPL #define ge_p3_tobytes(s, h) ge_tobytes((s), (const ge_p2 *)(h)) #else