From c8e57c0196a0cb912da905c70ae2335e53fabba2 Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sat, 28 Jan 2023 21:04:36 -0500 Subject: [PATCH 1/8] adding some tests --- .github/workflows/sha1.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/sha1.yml b/.github/workflows/sha1.yml index ca09fd895..2228af5d5 100644 --- a/.github/workflows/sha1.yml +++ b/.github/workflows/sha1.yml @@ -187,3 +187,20 @@ jobs: override: true - run: cargo test --no-default-features - run: cargo test + + # TODO: merge with test on MSRV bump to 1.59 or higher + test-inline-asm: + runs-on: ubuntu-latest + strategy: + matrix: + rust: + - 1.59.0 # MSRV + steps: + - uses: actions/checkout@v3 + - uses: RustCrypto/actions/cargo-cache@master + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: ${{ matrix.rust }} + override: true + - run: cargo test --features inline-asm From 68a36e473e4f10bebe299cb8d5a457125e2c99ca Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sat, 28 Jan 2023 23:42:42 -0500 Subject: [PATCH 2/8] skeleton --- sha1/Cargo.toml | 2 + sha1/README.md | 2 + sha1/src/asm/aarch64.rs | 237 ++++++++++++++++++++++++++++++++++ sha1/src/asm/aarch64_apple.rs | 237 ++++++++++++++++++++++++++++++++++ sha1/src/asm/mod.rs | 7 + sha1/src/asm/x86.rs | 227 ++++++++++++++++++++++++++++++++ sha1/src/asm/x86_64.rs | 217 +++++++++++++++++++++++++++++++ sha1/src/lib.rs | 39 +++++- 8 files changed, 967 insertions(+), 1 deletion(-) create mode 100644 sha1/src/asm/aarch64.rs create mode 100644 sha1/src/asm/aarch64_apple.rs create mode 100644 sha1/src/asm/mod.rs create mode 100644 sha1/src/asm/x86.rs create mode 100644 sha1/src/asm/x86_64.rs diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml index 8ff801508..06a7bc44e 100644 --- a/sha1/Cargo.toml +++ b/sha1/Cargo.toml @@ -18,6 +18,7 @@ cfg-if = "1.0" [target.'cfg(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64"))'.dependencies] cpufeatures = "0.2" sha1-asm = { version = "0.5", optional = true } +asm_block = { version = "0.1.3", optional = true } [dev-dependencies] digest = { version = "0.10.4", features = ["dev"] } @@ -30,6 +31,7 @@ oid = ["digest/oid"] # Enable OID support. WARNING: Bumps MSRV to 1.57 asm = ["sha1-asm"] # WARNING: this feature SHOULD NOT be enabled by library crates compress = [] # Expose compress function force-soft = [] # Force software implementation +inline-asm = ["asm_block"] # TODO: i don't know why the "do not enable by library crates" warning is in the asm feature, flagging this to ask about it. # WARNING: bumps MSRV to 1.59 [package.metadata.docs.rs] all-features = true diff --git a/sha1/README.md b/sha1/README.md index bd76f0973..d388cc6f1 100644 --- a/sha1/README.md +++ b/sha1/README.md @@ -23,6 +23,8 @@ We provide this crate for legacy interoperability purposes only. Rust **1.41** or higher. +Enabling feature flag `inline-asm` requires Rust **1.59** or higher. + Minimum supported Rust version can be changed in the future, but it will be done with a minor version bump. diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs new file mode 100644 index 000000000..c069f329e --- /dev/null +++ b/sha1/src/asm/aarch64.rs @@ -0,0 +1,237 @@ +// /* +// * SHA-1 hash in AArch64 assembly +// * +// * Copyright (c) 2020 Emmanuel Gil Peyrot . 
(MIT License) +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. +// */ +// +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// .global sha1_compress +// sha1_compress: +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 x0 state argument +// * 4 x1 block argument +// * 16 q0 W0 +// * 16 q1 W1 +// * 16 q2 W2 +// * 16 q3 W3 +// * 16 q4 k +// * 16 q5 Original ABCD +// * 16 q6 ABCD (with s3 being A) +// * 4 s16 E +// * 4 s17 e0 +// * 4 s18 e1 +// * 16 q19 wk +// */ +// +// // Load state in registers +// ldr q5, [x0] +// ldr s16, [x0, 16] +// mov v6.16b, v5.16b +// +// // Load block in registers +// ldr q0, [x1] +// ldr q1, [x1, 16] +// ldr q2, [x1, 32] +// ldr q3, [x1, 48] +// +// // TODO: only do that on little endian +// rev32 v0.16b, v0.16b +// rev32 v1.16b, v1.16b +// rev32 v2.16b, v2.16b +// rev32 v3.16b, v3.16b +// +// // k for the next five rounds +// adrp x1, .K0 +// ldr q4, [x1, #:lo12:.K0] +// +// // 0 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s16, v19.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 1 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 2 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 3 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 4 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // k for the next five rounds +// adrp x1, .K1 +// ldr q4, [x1, #:lo12:.K1] +// +// // 5 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 6 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 7 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 8 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 9 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // k for the next five rounds 
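// // NOTE: adrp materializes the 4 KiB page address of .K2, and the
// // #:lo12: offset on the following ldr adds the low 12 bits of that
// // address, so q4 ends up holding the packed round constant
// // 0x8F1BBCDC used by the sha1m (majority) steps below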
+// adrp x1, .K2 +// ldr q4, [x1, #:lo12:.K2] +// +// // 10 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 11 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 12 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 13 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 14 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // k for the next five rounds +// adrp x1, .K3 +// ldr q4, [x1, #:lo12:.K3] +// +// // 15 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 16 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// +// // 17 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // 18 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// +// // 19 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // Update state +// add v6.4s, v6.4s, v5.4s +// str q6, [x0] +// add v16.2s, v16.2s, v17.2s +// str s16, [x0, 16] +// +// ret +// .align 4 +// .K0: +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .K1: +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .K2: +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .K3: +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 diff --git a/sha1/src/asm/aarch64_apple.rs b/sha1/src/asm/aarch64_apple.rs new file mode 100644 index 000000000..832c2c359 --- /dev/null +++ b/sha1/src/asm/aarch64_apple.rs @@ -0,0 +1,237 @@ +// /* +// * SHA-1 hash in AArch64 assembly +// * +// * Copyright (c) 2020 Emmanuel Gil Peyrot . (MIT License) +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. 
+// */ +// +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// .global _sha1_compress +// _sha1_compress: +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 x0 state argument +// * 4 x1 block argument +// * 16 q0 W0 +// * 16 q1 W1 +// * 16 q2 W2 +// * 16 q3 W3 +// * 16 q4 k +// * 16 q5 Original ABCD +// * 16 q6 ABCD (with s3 being A) +// * 4 s16 E +// * 4 s17 e0 +// * 4 s18 e1 +// * 16 q19 wk +// */ +// +// // Load state in registers +// ldr q5, [x0] +// ldr s16, [x0, 16] +// mov v6.16b, v5.16b +// +// // Load block in registers +// ldr q0, [x1] +// ldr q1, [x1, 16] +// ldr q2, [x1, 32] +// ldr q3, [x1, 48] +// +// // TODO: only do that on little endian +// rev32 v0.16b, v0.16b +// rev32 v1.16b, v1.16b +// rev32 v2.16b, v2.16b +// rev32 v3.16b, v3.16b +// +// // k for the next five rounds +// adrp x1, .K0@PAGE +// ldr q4, [x1, #:lo12:.K0@PAGEOFF] +// +// // 0 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s16, v19.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 1 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 2 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 3 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 4 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // k for the next five rounds +// adrp x1, .K1@PAGE +// ldr q4, [x1, #:lo12:.K1@PAGEOFF] +// +// // 5 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 6 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 7 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 8 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 9 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // k for the next five rounds +// adrp x1, .K2@PAGE +// ldr q4, [x1, #:lo12:.K2@PAGEOFF] +// +// // 10 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 11 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 12 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 13 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 14 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // k for the next five rounds +// adrp x1, .K3@PAGE +// ldr q4, [x1, #:lo12:.K3@PAGEOFF] +// +// // 15 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 16 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s 
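// // (the sha1su1 below is the last message-schedule update; steps
// // 17-19 only consume the schedule, so they carry no sha1su0/sha1su1)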
+// sha1su1 v3.4s, v2.4s +// +// // 17 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // 18 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// +// // 19 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // Update state +// add v6.4s, v6.4s, v5.4s +// str q6, [x0] +// add v16.2s, v16.2s, v17.2s +// str s16, [x0, 16] +// +// ret +// .align 4 +// .K0: +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .K1: +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .K2: +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .K3: +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 diff --git a/sha1/src/asm/mod.rs b/sha1/src/asm/mod.rs new file mode 100644 index 000000000..056e15e14 --- /dev/null +++ b/sha1/src/asm/mod.rs @@ -0,0 +1,7 @@ +// TODO (laudiacay): here, do the switch to figure out which architecture's method we'll do... +// here's how that md5 PR did it (obviously wrong for what we want here...) +// #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +// mod x86; +// +// #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +// pub use x86::compress; diff --git a/sha1/src/asm/x86.rs b/sha1/src/asm/x86.rs new file mode 100644 index 000000000..c9ff140fe --- /dev/null +++ b/sha1/src/asm/x86.rs @@ -0,0 +1,227 @@ +//! SHA-1 hash in x86 assembly. adapted from Project Nayuki's MIT licensed code... +// /* +// * SHA-1 hash in x86 assembly +// * +// * Copyright (c) 2014 Project Nayuki. (MIT License) +// * https://www.nayuki.io/page/fast-sha1-hash-implementation-in-x86-assembly +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. 
+// */ + +use core::arch::asm; + +use asm_block::asm_block; + +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// #ifdef __APPLE__ +// .globl _sha1_compress +// _sha1_compress: +// #else +// .globl sha1_compress +// sha1_compress: +// #endif +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 eax SHA-1 state variable A +// * 4 ebx SHA-1 state variable B +// * 4 ecx SHA-1 state variable C +// * 4 edx SHA-1 state variable D +// * 4 ebp SHA-1 state variable E +// * 4 esi Temporary for calculation per round +// * 4 edi (First 16 rounds) base address of block array argument (read-only); (last 64 rounds) temporary for calculation per round +// * 4 esp x86 stack pointer +// * 64 [esp+ 0] Circular buffer of most recent 16 key schedule items, 4 bytes each +// * 4 [esp+64] Caller's value of ebx +// * 4 [esp+68] Caller's value of esi +// * 4 [esp+72] Caller's value of edi +// * 4 [esp+76] Caller's value of ebp +// */ +// +// #define ROUND0a(a, b, c, d, e, i) \ +// movl (i*4)(%edi), %esi; \ +// bswapl %esi; \ +// movl %esi, (i*4)(%esp); \ +// addl %esi, %e; \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define SCHEDULE(i, e) \ +// movl (((i- 3)&0xF)*4)(%esp), %esi; \ +// xorl (((i- 8)&0xF)*4)(%esp), %esi; \ +// xorl (((i-14)&0xF)*4)(%esp), %esi; \ +// xorl (((i-16)&0xF)*4)(%esp), %esi; \ +// roll $1, %esi; \ +// addl %esi, %e; \ +// movl %esi, ((i&0xF)*4)(%esp); +// +// #define ROUND0b(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define ROUND1(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x6ED9EBA1) +// +// #define ROUND2(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// movl %c, %edi; \ +// orl %d, %esi; \ +// andl %b, %esi; \ +// andl %d, %edi; \ +// orl %edi, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x8F1BBCDC) +// +// #define ROUND3(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0xCA62C1D6) +// +// #define ROUNDTAIL(a, b, e, i, k) \ +// roll $30, %b; \ +// leal k(%e,%esi), %e; \ +// movl %a, %esi; \ +// roll $5, %esi; \ +// addl %esi, %e; +// +// /* Save registers */ +// subl $80, %esp +// movl %ebx, 64(%esp) +// movl %esi, 68(%esp) +// movl %edi, 72(%esp) +// movl %ebp, 76(%esp) +// +// /* Load arguments */ +// movl 84(%esp), %esi /* state */ +// movl 88(%esp), %edi /* block */ +// movl 0(%esi), %eax /* a */ +// movl 4(%esi), %ebx /* b */ +// movl 8(%esi), %ecx /* c */ +// movl 12(%esi), %edx /* d */ +// movl 16(%esi), %ebp /* e */ +// +// /* 80 rounds of hashing */ +// ROUND0a(eax, ebx, ecx, edx, ebp, 0) +// ROUND0a(ebp, eax, ebx, ecx, edx, 1) +// ROUND0a(edx, ebp, eax, ebx, ecx, 2) +// ROUND0a(ecx, edx, ebp, eax, ebx, 3) +// ROUND0a(ebx, ecx, edx, ebp, eax, 4) +// ROUND0a(eax, ebx, ecx, edx, ebp, 5) +// ROUND0a(ebp, eax, ebx, ecx, edx, 6) +// ROUND0a(edx, ebp, eax, ebx, ecx, 7) +// ROUND0a(ecx, edx, ebp, eax, ebx, 8) +// ROUND0a(ebx, ecx, edx, ebp, eax, 9) +// ROUND0a(eax, ebx, ecx, edx, ebp, 10) +// ROUND0a(ebp, eax, ebx, ecx, edx, 11) +// ROUND0a(edx, ebp, eax, ebx, ecx, 12) +// ROUND0a(ecx, edx, ebp, eax, ebx, 13) +// ROUND0a(ebx, ecx, edx, ebp, eax, 14) +// ROUND0a(eax, ebx, ecx, edx, ebp, 15) +// ROUND0b(ebp, eax, ebx, ecx, edx, 16) +// ROUND0b(edx, ebp, eax, 
ebx, ecx, 17) +// ROUND0b(ecx, edx, ebp, eax, ebx, 18) +// ROUND0b(ebx, ecx, edx, ebp, eax, 19) +// ROUND1(eax, ebx, ecx, edx, ebp, 20) +// ROUND1(ebp, eax, ebx, ecx, edx, 21) +// ROUND1(edx, ebp, eax, ebx, ecx, 22) +// ROUND1(ecx, edx, ebp, eax, ebx, 23) +// ROUND1(ebx, ecx, edx, ebp, eax, 24) +// ROUND1(eax, ebx, ecx, edx, ebp, 25) +// ROUND1(ebp, eax, ebx, ecx, edx, 26) +// ROUND1(edx, ebp, eax, ebx, ecx, 27) +// ROUND1(ecx, edx, ebp, eax, ebx, 28) +// ROUND1(ebx, ecx, edx, ebp, eax, 29) +// ROUND1(eax, ebx, ecx, edx, ebp, 30) +// ROUND1(ebp, eax, ebx, ecx, edx, 31) +// ROUND1(edx, ebp, eax, ebx, ecx, 32) +// ROUND1(ecx, edx, ebp, eax, ebx, 33) +// ROUND1(ebx, ecx, edx, ebp, eax, 34) +// ROUND1(eax, ebx, ecx, edx, ebp, 35) +// ROUND1(ebp, eax, ebx, ecx, edx, 36) +// ROUND1(edx, ebp, eax, ebx, ecx, 37) +// ROUND1(ecx, edx, ebp, eax, ebx, 38) +// ROUND1(ebx, ecx, edx, ebp, eax, 39) +// ROUND2(eax, ebx, ecx, edx, ebp, 40) +// ROUND2(ebp, eax, ebx, ecx, edx, 41) +// ROUND2(edx, ebp, eax, ebx, ecx, 42) +// ROUND2(ecx, edx, ebp, eax, ebx, 43) +// ROUND2(ebx, ecx, edx, ebp, eax, 44) +// ROUND2(eax, ebx, ecx, edx, ebp, 45) +// ROUND2(ebp, eax, ebx, ecx, edx, 46) +// ROUND2(edx, ebp, eax, ebx, ecx, 47) +// ROUND2(ecx, edx, ebp, eax, ebx, 48) +// ROUND2(ebx, ecx, edx, ebp, eax, 49) +// ROUND2(eax, ebx, ecx, edx, ebp, 50) +// ROUND2(ebp, eax, ebx, ecx, edx, 51) +// ROUND2(edx, ebp, eax, ebx, ecx, 52) +// ROUND2(ecx, edx, ebp, eax, ebx, 53) +// ROUND2(ebx, ecx, edx, ebp, eax, 54) +// ROUND2(eax, ebx, ecx, edx, ebp, 55) +// ROUND2(ebp, eax, ebx, ecx, edx, 56) +// ROUND2(edx, ebp, eax, ebx, ecx, 57) +// ROUND2(ecx, edx, ebp, eax, ebx, 58) +// ROUND2(ebx, ecx, edx, ebp, eax, 59) +// ROUND3(eax, ebx, ecx, edx, ebp, 60) +// ROUND3(ebp, eax, ebx, ecx, edx, 61) +// ROUND3(edx, ebp, eax, ebx, ecx, 62) +// ROUND3(ecx, edx, ebp, eax, ebx, 63) +// ROUND3(ebx, ecx, edx, ebp, eax, 64) +// ROUND3(eax, ebx, ecx, edx, ebp, 65) +// ROUND3(ebp, eax, ebx, ecx, edx, 66) +// ROUND3(edx, ebp, eax, ebx, ecx, 67) +// ROUND3(ecx, edx, ebp, eax, ebx, 68) +// ROUND3(ebx, ecx, edx, ebp, eax, 69) +// ROUND3(eax, ebx, ecx, edx, ebp, 70) +// ROUND3(ebp, eax, ebx, ecx, edx, 71) +// ROUND3(edx, ebp, eax, ebx, ecx, 72) +// ROUND3(ecx, edx, ebp, eax, ebx, 73) +// ROUND3(ebx, ecx, edx, ebp, eax, 74) +// ROUND3(eax, ebx, ecx, edx, ebp, 75) +// ROUND3(ebp, eax, ebx, ecx, edx, 76) +// ROUND3(edx, ebp, eax, ebx, ecx, 77) +// ROUND3(ecx, edx, ebp, eax, ebx, 78) +// ROUND3(ebx, ecx, edx, ebp, eax, 79) +// +// /* Save updated state */ +// movl 84(%esp), %esi +// addl %eax, 0(%esi) +// addl %ebx, 4(%esi) +// addl %ecx, 8(%esi) +// addl %edx, 12(%esi) +// addl %ebp, 16(%esi) +// +// /* Restore registers */ +// movl 64(%esp), %ebx +// movl 68(%esp), %esi +// movl 72(%esp), %edi +// movl 76(%esp), %ebp +// addl $80, %esp +// retl diff --git a/sha1/src/asm/x86_64.rs b/sha1/src/asm/x86_64.rs new file mode 100644 index 000000000..5e095883b --- /dev/null +++ b/sha1/src/asm/x86_64.rs @@ -0,0 +1,217 @@ +// /* +// * SHA-1 hash in x86-64 assembly +// * +// * Copyright (c) 2015 Project Nayuki. 
(MIT License) +// * https://www.nayuki.io/page/fast-sha1-hash-implementation-in-x86-assembly +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. +// */ +// +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// #ifdef __APPLE__ +// .globl _sha1_compress +// _sha1_compress: +// #else +// .globl sha1_compress +// sha1_compress: +// #endif +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 eax SHA-1 state variable A +// * 4 ebx SHA-1 state variable B +// * 4 ecx SHA-1 state variable C +// * 4 edx SHA-1 state variable D +// * 4 ebp SHA-1 state variable E +// * 4 esi Temporary for calculation per round +// * 4 edi (Last 64 rounds) temporary for calculation per round +// * 8 rdi (First 16 rounds) base address of block array argument (read-only) +// * 8 r8 Base address of state array argument (read-only) +// * 8 rsp x86-64 stack pointer +// * 64 [rsp+0] Circular buffer of most recent 16 key schedule items, 4 bytes each +// * 16 xmm0 Caller's value of rbx (only low 64 bits are used) +// * 16 xmm1 Caller's value of rbp (only low 64 bits are used) +// */ +// +// #define ROUND0a(a, b, c, d, e, i) \ +// movl (i*4)(%rdi), %esi; \ +// bswapl %esi; \ +// movl %esi, (i*4)(%rsp); \ +// addl %esi, %e; \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define SCHEDULE(i, e) \ +// movl (((i- 3)&0xF)*4)(%rsp), %esi; \ +// xorl (((i- 8)&0xF)*4)(%rsp), %esi; \ +// xorl (((i-14)&0xF)*4)(%rsp), %esi; \ +// xorl (((i-16)&0xF)*4)(%rsp), %esi; \ +// roll $1, %esi; \ +// addl %esi, %e; \ +// movl %esi, ((i&0xF)*4)(%rsp); +// +// #define ROUND0b(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define ROUND1(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x6ED9EBA1) +// +// #define ROUND2(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// movl %c, %edi; \ +// orl %d, %esi; \ +// andl %b, %esi; \ +// andl %d, %edi; \ +// orl %edi, %esi; \ +// ROUNDTAIL(a, b, e, i, -0x70E44324) +// +// #define ROUND3(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, -0x359D3E2A) +// +// #define ROUNDTAIL(a, b, e, i, k) \ +// roll $30, %b; \ +// leal k(%e,%esi), %e; \ +// 
movl %a, %esi; \ +// roll $5, %esi; \ +// addl %esi, %e; +// +// /* Save registers, allocate scratch space */ +// movq %rbx, %xmm0 +// movq %rbp, %xmm1 +// subq $64, %rsp +// +// /* Load arguments */ +// movq %rdi, %r8 +// movl 0(%rdi), %eax /* a */ +// movl 4(%rdi), %ebx /* b */ +// movl 8(%rdi), %ecx /* c */ +// movl 12(%rdi), %edx /* d */ +// movl 16(%rdi), %ebp /* e */ +// movq %rsi, %rdi +// +// /* 80 rounds of hashing */ +// ROUND0a(eax, ebx, ecx, edx, ebp, 0) +// ROUND0a(ebp, eax, ebx, ecx, edx, 1) +// ROUND0a(edx, ebp, eax, ebx, ecx, 2) +// ROUND0a(ecx, edx, ebp, eax, ebx, 3) +// ROUND0a(ebx, ecx, edx, ebp, eax, 4) +// ROUND0a(eax, ebx, ecx, edx, ebp, 5) +// ROUND0a(ebp, eax, ebx, ecx, edx, 6) +// ROUND0a(edx, ebp, eax, ebx, ecx, 7) +// ROUND0a(ecx, edx, ebp, eax, ebx, 8) +// ROUND0a(ebx, ecx, edx, ebp, eax, 9) +// ROUND0a(eax, ebx, ecx, edx, ebp, 10) +// ROUND0a(ebp, eax, ebx, ecx, edx, 11) +// ROUND0a(edx, ebp, eax, ebx, ecx, 12) +// ROUND0a(ecx, edx, ebp, eax, ebx, 13) +// ROUND0a(ebx, ecx, edx, ebp, eax, 14) +// ROUND0a(eax, ebx, ecx, edx, ebp, 15) +// ROUND0b(ebp, eax, ebx, ecx, edx, 16) +// ROUND0b(edx, ebp, eax, ebx, ecx, 17) +// ROUND0b(ecx, edx, ebp, eax, ebx, 18) +// ROUND0b(ebx, ecx, edx, ebp, eax, 19) +// ROUND1(eax, ebx, ecx, edx, ebp, 20) +// ROUND1(ebp, eax, ebx, ecx, edx, 21) +// ROUND1(edx, ebp, eax, ebx, ecx, 22) +// ROUND1(ecx, edx, ebp, eax, ebx, 23) +// ROUND1(ebx, ecx, edx, ebp, eax, 24) +// ROUND1(eax, ebx, ecx, edx, ebp, 25) +// ROUND1(ebp, eax, ebx, ecx, edx, 26) +// ROUND1(edx, ebp, eax, ebx, ecx, 27) +// ROUND1(ecx, edx, ebp, eax, ebx, 28) +// ROUND1(ebx, ecx, edx, ebp, eax, 29) +// ROUND1(eax, ebx, ecx, edx, ebp, 30) +// ROUND1(ebp, eax, ebx, ecx, edx, 31) +// ROUND1(edx, ebp, eax, ebx, ecx, 32) +// ROUND1(ecx, edx, ebp, eax, ebx, 33) +// ROUND1(ebx, ecx, edx, ebp, eax, 34) +// ROUND1(eax, ebx, ecx, edx, ebp, 35) +// ROUND1(ebp, eax, ebx, ecx, edx, 36) +// ROUND1(edx, ebp, eax, ebx, ecx, 37) +// ROUND1(ecx, edx, ebp, eax, ebx, 38) +// ROUND1(ebx, ecx, edx, ebp, eax, 39) +// ROUND2(eax, ebx, ecx, edx, ebp, 40) +// ROUND2(ebp, eax, ebx, ecx, edx, 41) +// ROUND2(edx, ebp, eax, ebx, ecx, 42) +// ROUND2(ecx, edx, ebp, eax, ebx, 43) +// ROUND2(ebx, ecx, edx, ebp, eax, 44) +// ROUND2(eax, ebx, ecx, edx, ebp, 45) +// ROUND2(ebp, eax, ebx, ecx, edx, 46) +// ROUND2(edx, ebp, eax, ebx, ecx, 47) +// ROUND2(ecx, edx, ebp, eax, ebx, 48) +// ROUND2(ebx, ecx, edx, ebp, eax, 49) +// ROUND2(eax, ebx, ecx, edx, ebp, 50) +// ROUND2(ebp, eax, ebx, ecx, edx, 51) +// ROUND2(edx, ebp, eax, ebx, ecx, 52) +// ROUND2(ecx, edx, ebp, eax, ebx, 53) +// ROUND2(ebx, ecx, edx, ebp, eax, 54) +// ROUND2(eax, ebx, ecx, edx, ebp, 55) +// ROUND2(ebp, eax, ebx, ecx, edx, 56) +// ROUND2(edx, ebp, eax, ebx, ecx, 57) +// ROUND2(ecx, edx, ebp, eax, ebx, 58) +// ROUND2(ebx, ecx, edx, ebp, eax, 59) +// ROUND3(eax, ebx, ecx, edx, ebp, 60) +// ROUND3(ebp, eax, ebx, ecx, edx, 61) +// ROUND3(edx, ebp, eax, ebx, ecx, 62) +// ROUND3(ecx, edx, ebp, eax, ebx, 63) +// ROUND3(ebx, ecx, edx, ebp, eax, 64) +// ROUND3(eax, ebx, ecx, edx, ebp, 65) +// ROUND3(ebp, eax, ebx, ecx, edx, 66) +// ROUND3(edx, ebp, eax, ebx, ecx, 67) +// ROUND3(ecx, edx, ebp, eax, ebx, 68) +// ROUND3(ebx, ecx, edx, ebp, eax, 69) +// ROUND3(eax, ebx, ecx, edx, ebp, 70) +// ROUND3(ebp, eax, ebx, ecx, edx, 71) +// ROUND3(edx, ebp, eax, ebx, ecx, 72) +// ROUND3(ecx, edx, ebp, eax, ebx, 73) +// ROUND3(ebx, ecx, edx, ebp, eax, 74) +// ROUND3(eax, ebx, ecx, edx, ebp, 75) +// ROUND3(ebp, eax, ebx, ecx, edx, 76) +// ROUND3(edx, ebp, eax, ebx, 
ecx, 77) +// ROUND3(ecx, edx, ebp, eax, ebx, 78) +// ROUND3(ebx, ecx, edx, ebp, eax, 79) +// +// /* Save updated state */ +// addl %eax, 0(%r8) +// addl %ebx, 4(%r8) +// addl %ecx, 8(%r8) +// addl %edx, 12(%r8) +// addl %ebp, 16(%r8) +// +// /* Restore registers */ +// movq %xmm0, %rbx +// movq %xmm1, %rbp +// addq $64, %rsp +// retq diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs index 38ddc4b51..616b58f37 100644 --- a/sha1/src/lib.rs +++ b/sha1/src/lib.rs @@ -63,11 +63,48 @@ use digest::{ HashMarker, Output, }; +#[cfg(all( +feature = "inline-asm", +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] +mod asm; + +#[cfg(not(all( +any(feature = "asm", feature = "inline-asm"), +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +)))] mod compress; #[cfg(feature = "compress")] +#[cfg(all( +feature = "inline-asm", +feature = "compress", +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] +pub use asm::compress; + +#[cfg(feature = "compress")] +#[cfg(all( +feature = "inline-asm", +not(feature = "compress"), +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] +use asm::compress; + +#[cfg(feature = "compress")] +#[cfg(all( +not(feature = "inline-asm"), +feature = "compress", +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] pub use compress::compress; -#[cfg(not(feature = "compress"))] + +#[cfg(feature = "compress")] +#[cfg(all( +not(feature = "inline-asm"), +not(feature = "compress"), +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] use compress::compress; const STATE_LEN: usize = 5; From 64f975daa3e5214368a3c02e7dce258fb964f3c5 Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sat, 28 Jan 2023 23:46:54 -0500 Subject: [PATCH 3/8] better skeleton --- sha1/src/asm/aarch64.rs | 5 +++-- sha1/src/asm/mod.rs | 18 ++++++++++++++++++ sha1/src/asm/x86.rs | 1 - sha1/src/lib.rs | 32 ++++++++++++++++---------------- 4 files changed, 37 insertions(+), 19 deletions(-) diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs index c069f329e..9e6611086 100644 --- a/sha1/src/asm/aarch64.rs +++ b/sha1/src/asm/aarch64.rs @@ -1,3 +1,5 @@ +//! SHA-1 hash in AArch64 assembly, adapted from Emmanuel Gil Peyrot's MIT-licensed implementation +// // /* // * SHA-1 hash in AArch64 assembly // * @@ -19,8 +21,7 @@ // * out of or in connection with the Software or the use or other dealings in the // * Software. 
// */ -// -// + // /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ // .global sha1_compress // sha1_compress: diff --git a/sha1/src/asm/mod.rs b/sha1/src/asm/mod.rs index 056e15e14..164c8065d 100644 --- a/sha1/src/asm/mod.rs +++ b/sha1/src/asm/mod.rs @@ -5,3 +5,21 @@ // // #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] // pub use x86::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "x86",))] +mod x86; +#[cfg(all(feature = "inline-asm", target_arch = "x86",))] +pub use x86::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "x86_64",))] +mod x86_64; +#[cfg(all(feature = "inline-asm", target_arch = "x86_64",))] +pub use x86_64::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "aarch64",))] +mod aarch64; +#[cfg(all(feature = "inline-asm", target_arch = "aarch64",))] +pub use aarch64::compress; + +// TODO(laudiacay) i don't know how to detect M1 +mod aarch64_apple; diff --git a/sha1/src/asm/x86.rs b/sha1/src/asm/x86.rs index c9ff140fe..2ab865649 100644 --- a/sha1/src/asm/x86.rs +++ b/sha1/src/asm/x86.rs @@ -21,7 +21,6 @@ // * out of or in connection with the Software or the use or other dealings in the // * Software. // */ - use core::arch::asm; use asm_block::asm_block; diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs index 616b58f37..9b7fd0a37 100644 --- a/sha1/src/lib.rs +++ b/sha1/src/lib.rs @@ -64,46 +64,46 @@ use digest::{ }; #[cfg(all( -feature = "inline-asm", -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + feature = "inline-asm", + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") ))] mod asm; #[cfg(not(all( -any(feature = "asm", feature = "inline-asm"), -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + any(feature = "asm", feature = "inline-asm"), + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") )))] mod compress; #[cfg(feature = "compress")] #[cfg(all( -feature = "inline-asm", -feature = "compress", -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + feature = "inline-asm", + feature = "compress", + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") ))] pub use asm::compress; #[cfg(feature = "compress")] #[cfg(all( -feature = "inline-asm", -not(feature = "compress"), -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + feature = "inline-asm", + not(feature = "compress"), + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") ))] use asm::compress; #[cfg(feature = "compress")] #[cfg(all( -not(feature = "inline-asm"), -feature = "compress", -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + not(feature = "inline-asm"), + feature = "compress", + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") ))] pub use compress::compress; #[cfg(feature = "compress")] #[cfg(all( -not(feature = "inline-asm"), -not(feature = "compress"), -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + not(feature = "inline-asm"), + not(feature = "compress"), + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") ))] use compress::compress; From 2391945b30270a964ffa10d8eb8495fa7967ecbc Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sun, 29 Jan 2023 01:21:26 -0500 Subject: [PATCH 4/8] need to figure out where to put macros in this next --- sha1/src/asm/aarch64.rs | 474 ++++++++++++++++++++++------------------ 1 file changed, 260 insertions(+), 214 
deletions(-) diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs index 9e6611086..5347a8e11 100644 --- a/sha1/src/asm/aarch64.rs +++ b/sha1/src/asm/aarch64.rs @@ -22,217 +22,263 @@ // * Software. // */ -// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ -// .global sha1_compress -// sha1_compress: -// /* -// * Storage usage: -// * Bytes Location Description -// * 4 x0 state argument -// * 4 x1 block argument -// * 16 q0 W0 -// * 16 q1 W1 -// * 16 q2 W2 -// * 16 q3 W3 -// * 16 q4 k -// * 16 q5 Original ABCD -// * 16 q6 ABCD (with s3 being A) -// * 4 s16 E -// * 4 s17 e0 -// * 4 s18 e1 -// * 16 q19 wk -// */ -// -// // Load state in registers -// ldr q5, [x0] -// ldr s16, [x0, 16] -// mov v6.16b, v5.16b -// -// // Load block in registers -// ldr q0, [x1] -// ldr q1, [x1, 16] -// ldr q2, [x1, 32] -// ldr q3, [x1, 48] -// -// // TODO: only do that on little endian -// rev32 v0.16b, v0.16b -// rev32 v1.16b, v1.16b -// rev32 v2.16b, v2.16b -// rev32 v3.16b, v3.16b -// -// // k for the next five rounds -// adrp x1, .K0 -// ldr q4, [x1, #:lo12:.K0] -// -// // 0 -// sha1h s18, s6 -// add v19.4s, v0.4s, v4.4s -// sha1c q6, s16, v19.4s -// sha1su0 v0.4s, v1.4s, v2.4s -// -// // 1 -// sha1h s17, s6 -// add v19.4s, v1.4s, v4.4s -// sha1c q6, s18, v19.4s -// sha1su1 v0.4s, v3.4s -// sha1su0 v1.4s, v2.4s, v3.4s -// -// // 2 -// sha1h s18, s6 -// add v19.4s, v2.4s, v4.4s -// sha1c q6, s17, v19.4s -// sha1su1 v1.4s, v0.4s -// sha1su0 v2.4s, v3.4s, v0.4s -// -// // 3 -// sha1h s17, s6 -// add v19.4s, v3.4s, v4.4s -// sha1c q6, s18, v19.4s -// sha1su1 v2.4s, v1.4s -// sha1su0 v3.4s, v0.4s, v1.4s -// -// // 4 -// sha1h s18, s6 -// add v19.4s, v0.4s, v4.4s -// sha1c q6, s17, v19.4s -// sha1su1 v3.4s, v2.4s -// sha1su0 v0.4s, v1.4s, v2.4s -// -// // k for the next five rounds -// adrp x1, .K1 -// ldr q4, [x1, #:lo12:.K1] -// -// // 5 -// sha1h s17, s6 -// add v19.4s, v1.4s, v4.4s -// sha1p q6, s18, v19.4s -// sha1su1 v0.4s, v3.4s -// sha1su0 v1.4s, v2.4s, v3.4s -// -// // 6 -// sha1h s18, s6 -// add v19.4s, v2.4s, v4.4s -// sha1p q6, s17, v19.4s -// sha1su1 v1.4s, v0.4s -// sha1su0 v2.4s, v3.4s, v0.4s -// -// // 7 -// sha1h s17, s6 -// add v19.4s, v3.4s, v4.4s -// sha1p q6, s18, v19.4s -// sha1su1 v2.4s, v1.4s -// sha1su0 v3.4s, v0.4s, v1.4s -// -// // 8 -// sha1h s18, s6 -// add v19.4s, v0.4s, v4.4s -// sha1p q6, s17, v19.4s -// sha1su1 v3.4s, v2.4s -// sha1su0 v0.4s, v1.4s, v2.4s -// -// // 9 -// sha1h s17, s6 -// add v19.4s, v1.4s, v4.4s -// sha1p q6, s18, v19.4s -// sha1su1 v0.4s, v3.4s -// sha1su0 v1.4s, v2.4s, v3.4s -// -// // k for the next five rounds -// adrp x1, .K2 -// ldr q4, [x1, #:lo12:.K2] -// -// // 10 -// sha1h s18, s6 -// add v19.4s, v2.4s, v4.4s -// sha1m q6, s17, v19.4s -// sha1su1 v1.4s, v0.4s -// sha1su0 v2.4s, v3.4s, v0.4s -// -// // 11 -// sha1h s17, s6 -// add v19.4s, v3.4s, v4.4s -// sha1m q6, s18, v19.4s -// sha1su1 v2.4s, v1.4s -// sha1su0 v3.4s, v0.4s, v1.4s -// -// // 12 -// sha1h s18, s6 -// add v19.4s, v0.4s, v4.4s -// sha1m q6, s17, v19.4s -// sha1su1 v3.4s, v2.4s -// sha1su0 v0.4s, v1.4s, v2.4s -// -// // 13 -// sha1h s17, s6 -// add v19.4s, v1.4s, v4.4s -// sha1m q6, s18, v19.4s -// sha1su1 v0.4s, v3.4s -// sha1su0 v1.4s, v2.4s, v3.4s -// -// // 14 -// sha1h s18, s6 -// add v19.4s, v2.4s, v4.4s -// sha1m q6, s17, v19.4s -// sha1su1 v1.4s, v0.4s -// sha1su0 v2.4s, v3.4s, v0.4s -// -// // k for the next five rounds -// adrp x1, .K3 -// ldr q4, [x1, #:lo12:.K3] -// -// // 15 -// sha1h s17, s6 -// add v19.4s, v3.4s, v4.4s -// sha1p q6, s18, v19.4s 
-// sha1su1 v2.4s, v1.4s -// sha1su0 v3.4s, v0.4s, v1.4s -// -// // 16 -// sha1h s18, s6 -// add v19.4s, v0.4s, v4.4s -// sha1p q6, s17, v19.4s -// sha1su1 v3.4s, v2.4s -// -// // 17 -// sha1h s17, s6 -// add v19.4s, v1.4s, v4.4s -// sha1p q6, s18, v19.4s -// -// // 18 -// sha1h s18, s6 -// add v19.4s, v2.4s, v4.4s -// sha1p q6, s17, v19.4s -// -// // 19 -// sha1h s17, s6 -// add v19.4s, v3.4s, v4.4s -// sha1p q6, s18, v19.4s -// -// // Update state -// add v6.4s, v6.4s, v5.4s -// str q6, [x0] -// add v16.2s, v16.2s, v17.2s -// str s16, [x0, 16] -// -// ret -// .align 4 -// .K0: -// .word 0x5A827999 -// .word 0x5A827999 -// .word 0x5A827999 -// .word 0x5A827999 -// .K1: -// .word 0x6ED9EBA1 -// .word 0x6ED9EBA1 -// .word 0x6ED9EBA1 -// .word 0x6ED9EBA1 -// .K2: -// .word 0x8F1BBCDC -// .word 0x8F1BBCDC -// .word 0x8F1BBCDC -// .word 0x8F1BBCDC -// .K3: -// .word 0xCA62C1D6 -// .word 0xCA62C1D6 -// .word 0xCA62C1D6 -// .word 0xCA62C1D6 +use core::arch::asm; + +// macro_rules! sha_1_through_4 { +// (F, $a: tt, $b: tt, $c: tt, $d: tt, $k: tt, $s: literal, $t: literal, $tmp1: tt, $tmp2: tt) => { + +/// SHA1 compress function. We don't have enough registers to load the whole block, +/// so we need to use memory address to refer to the inputs. Due to possible failure +/// of register allocation on `x86`, we explicitly specify registers to use. +#[cfg(all(feature = "inline-asm", target_arch = "aarch64"))] +pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { + // SAFETY: inline-assembly + unsafe { + asm!( + // define the SHA1 constants TODO (laudiacay) does it make sense for these to be up here? does it cause alignment issues? + ".K0:", + ".word 0x5A827999", + ".word 0x5A827999", + ".word 0x5A827999", + ".word 0x5A827999", + ".K1:", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".K2:", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".K3:", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + + // from original code, some docs :) + // /* + // * Storage usage: + // * Bytes Location Description + // * 4 x0 state argument + // * 4 x1 block argument + // * 16 q0 W0 + // * 16 q1 W1 + // * 16 q2 W2 + // * 16 q3 W3 + // * 16 q4 k + // * 16 q5 Original ABCD + // * 16 q6 ABCD (with s3 being A) + // * 4 s16 E + // * 4 s17 e0 + // * 4 s18 e1 + // * 16 q19 wk + // */ + + // Load state in registers + // original code: + // ldr q5, [x0] + // ldr s16, [x0, 16] + // mov v6.16b, v5.16b + in(q5) state[0..4], + in(s16) state[4], + "mov v6.16b, v5.16b", + + // Load block in registers + // original code: + // ldr q0, [x1] + // ldr q1, [x1, 16] + // ldr q2, [x1, 32] + // ldr q3, [x1, 48] + in(q0) blocks[0][0..16], + in(q1) blocks[0][16..32], + in(q2) blocks[0][32..48], + in(q3) blocks[0][48..64], + + // from original code: TODO: only do that on little endian + "rev32 v0.16b, v0.16b", + "rev32 v1.16b, v1.16b", + "rev32 v2.16b, v2.16b", + "rev32 v3.16b, v3.16b", + + // k for the next five rounds + "adrp x1, .K0" + "ldr q4, [x1, #:lo12:.K0]" + + // // 0 + // sha1h s18, s6 + // add v19.4s, v0.4s, v4.4s + // sha1c q6, s16, v19.4s + // sha1su0 v0.4s, v1.4s, v2.4s + + // // 1 + // sha1h s17, s6 + // add v19.4s, v1.4s, v4.4s + // sha1c q6, s18, v19.4s + // sha1su1 v0.4s, v3.4s + // sha1su0 v1.4s, v2.4s, v3.4s + // + // // 2 + // sha1h s18, s6 + // add v19.4s, v2.4s, v4.4s + // sha1c q6, s17, v19.4s + // sha1su1 v1.4s, v0.4s + // sha1su0 v2.4s, v3.4s, v0.4s + // + // // 3 + // sha1h s17, s6 + // 
add v19.4s, v3.4s, v4.4s + // sha1c q6, s18, v19.4s + // sha1su1 v2.4s, v1.4s + // sha1su0 v3.4s, v0.4s, v1.4s + // + // // 4 + // sha1h s18, s6 + // add v19.4s, v0.4s, v4.4s + // sha1c q6, s17, v19.4s + // sha1su1 v3.4s, v2.4s + // sha1su0 v0.4s, v1.4s, v2.4s + // + // // k for the next five rounds + // adrp x1, .K1 + // ldr q4, [x1, #:lo12:.K1] + // + // // 5 + // sha1h s17, s6 + // add v19.4s, v1.4s, v4.4s + // sha1p q6, s18, v19.4s + // sha1su1 v0.4s, v3.4s + // sha1su0 v1.4s, v2.4s, v3.4s + // + // // 6 + // sha1h s18, s6 + // add v19.4s, v2.4s, v4.4s + // sha1p q6, s17, v19.4s + // sha1su1 v1.4s, v0.4s + // sha1su0 v2.4s, v3.4s, v0.4s + // + // // 7 + // sha1h s17, s6 + // add v19.4s, v3.4s, v4.4s + // sha1p q6, s18, v19.4s + // sha1su1 v2.4s, v1.4s + // sha1su0 v3.4s, v0.4s, v1.4s + // + // // 8 + // sha1h s18, s6 + // add v19.4s, v0.4s, v4.4s + // sha1p q6, s17, v19.4s + // sha1su1 v3.4s, v2.4s + // sha1su0 v0.4s, v1.4s, v2.4s + // + // // 9 + // sha1h s17, s6 + // add v19.4s, v1.4s, v4.4s + // sha1p q6, s18, v19.4s + // sha1su1 v0.4s, v3.4s + // sha1su0 v1.4s, v2.4s, v3.4s + // + // // k for the next five rounds + // adrp x1, .K2 + // ldr q4, [x1, #:lo12:.K2] + // + // // 10 + // sha1h s18, s6 + // add v19.4s, v2.4s, v4.4s + // sha1m q6, s17, v19.4s + // sha1su1 v1.4s, v0.4s + // sha1su0 v2.4s, v3.4s, v0.4s + // + // // 11 + // sha1h s17, s6 + // add v19.4s, v3.4s, v4.4s + // sha1m q6, s18, v19.4s + // sha1su1 v2.4s, v1.4s + // sha1su0 v3.4s, v0.4s, v1.4s + // + // // 12 + // sha1h s18, s6 + // add v19.4s, v0.4s, v4.4s + // sha1m q6, s17, v19.4s + // sha1su1 v3.4s, v2.4s + // sha1su0 v0.4s, v1.4s, v2.4s + // + // // 13 + // sha1h s17, s6 + // add v19.4s, v1.4s, v4.4s + // sha1m q6, s18, v19.4s + // sha1su1 v0.4s, v3.4s + // sha1su0 v1.4s, v2.4s, v3.4s + // + // // 14 + // sha1h s18, s6 + // add v19.4s, v2.4s, v4.4s + // sha1m q6, s17, v19.4s + // sha1su1 v1.4s, v0.4s + // sha1su0 v2.4s, v3.4s, v0.4s + // + // // k for the next five rounds + // adrp x1, .K3 + // ldr q4, [x1, #:lo12:.K3] + // + // // 15 + // sha1h s17, s6 + // add v19.4s, v3.4s, v4.4s + // sha1p q6, s18, v19.4s + // sha1su1 v2.4s, v1.4s + // sha1su0 v3.4s, v0.4s, v1.4s + // + // // 16 + // sha1h s18, s6 + // add v19.4s, v0.4s, v4.4s + // sha1p q6, s17, v19.4s + // sha1su1 v3.4s, v2.4s + // + // // 17 + // sha1h s17, s6 + // add v19.4s, v1.4s, v4.4s + // sha1p q6, s18, v19.4s + // + // // 18 + // sha1h s18, s6 + // add v19.4s, v2.4s, v4.4s + // sha1p q6, s17, v19.4s + // + // // 19 + // sha1h s17, s6 + // add v19.4s, v3.4s, v4.4s + // sha1p q6, s18, v19.4s + // + // // Update state + // add v6.4s, v6.4s, v5.4s + // str q6, [x0] + // add v16.2s, v16.2s, v17.2s + // str s16, [x0, 16] + // + // ret + // .align 4 + // .K0: + // .word 0x5A827999 + // .word 0x5A827999 + // .word 0x5A827999 + // .word 0x5A827999 + // .K1: + // .word 0x6ED9EBA1 + // .word 0x6ED9EBA1 + // .word 0x6ED9EBA1 + // .word 0x6ED9EBA1 + // .K2: + // .word 0x8F1BBCDC + // .word 0x8F1BBCDC + // .word 0x8F1BBCDC + // .word 0x8F1BBCDC + // .K3: + // .word 0xCA62C1D6 + // .word 0xCA62C1D6 + // .word 0xCA62C1D6 + // .word 0xCA62C1D6 + + ); + }; +} From 28c3a33a20a125f3305d7111605ae041bf2cc750 Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sun, 29 Jan 2023 12:39:31 -0500 Subject: [PATCH 5/8] are we passing --- sha1/src/asm/aarch64.rs | 376 +++++++++++++++++++--------------------- sha1/src/compress.rs | 8 + sha1/src/lib.rs | 28 +-- 3 files changed, 187 insertions(+), 225 deletions(-) diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs index 
5347a8e11..675144b6a 100644 --- a/sha1/src/asm/aarch64.rs +++ b/sha1/src/asm/aarch64.rs @@ -21,7 +21,6 @@ // * out of or in connection with the Software or the use or other dealings in the // * Software. // */ - use core::arch::asm; // macro_rules! sha_1_through_4 { @@ -35,28 +34,6 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { // SAFETY: inline-assembly unsafe { asm!( - // define the SHA1 constants TODO (laudiacay) does it make sense for these to be up here? does it cause alignment issues? - ".K0:", - ".word 0x5A827999", - ".word 0x5A827999", - ".word 0x5A827999", - ".word 0x5A827999", - ".K1:", - ".word 0x6ED9EBA1", - ".word 0x6ED9EBA1", - ".word 0x6ED9EBA1", - ".word 0x6ED9EBA1", - ".K2:", - ".word 0x8F1BBCDC", - ".word 0x8F1BBCDC", - ".word 0x8F1BBCDC", - ".word 0x8F1BBCDC", - ".K3:", - ".word 0xCA62C1D6", - ".word 0xCA62C1D6", - ".word 0xCA62C1D6", - ".word 0xCA62C1D6", - // from original code, some docs :) // /* // * Storage usage: @@ -103,181 +80,184 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { "rev32 v3.16b, v3.16b", // k for the next five rounds - "adrp x1, .K0" - "ldr q4, [x1, #:lo12:.K0]" - - // // 0 - // sha1h s18, s6 - // add v19.4s, v0.4s, v4.4s - // sha1c q6, s16, v19.4s - // sha1su0 v0.4s, v1.4s, v2.4s - - // // 1 - // sha1h s17, s6 - // add v19.4s, v1.4s, v4.4s - // sha1c q6, s18, v19.4s - // sha1su1 v0.4s, v3.4s - // sha1su0 v1.4s, v2.4s, v3.4s - // - // // 2 - // sha1h s18, s6 - // add v19.4s, v2.4s, v4.4s - // sha1c q6, s17, v19.4s - // sha1su1 v1.4s, v0.4s - // sha1su0 v2.4s, v3.4s, v0.4s - // - // // 3 - // sha1h s17, s6 - // add v19.4s, v3.4s, v4.4s - // sha1c q6, s18, v19.4s - // sha1su1 v2.4s, v1.4s - // sha1su0 v3.4s, v0.4s, v1.4s - // - // // 4 - // sha1h s18, s6 - // add v19.4s, v0.4s, v4.4s - // sha1c q6, s17, v19.4s - // sha1su1 v3.4s, v2.4s - // sha1su0 v0.4s, v1.4s, v2.4s - // - // // k for the next five rounds - // adrp x1, .K1 - // ldr q4, [x1, #:lo12:.K1] - // - // // 5 - // sha1h s17, s6 - // add v19.4s, v1.4s, v4.4s - // sha1p q6, s18, v19.4s - // sha1su1 v0.4s, v3.4s - // sha1su0 v1.4s, v2.4s, v3.4s - // - // // 6 - // sha1h s18, s6 - // add v19.4s, v2.4s, v4.4s - // sha1p q6, s17, v19.4s - // sha1su1 v1.4s, v0.4s - // sha1su0 v2.4s, v3.4s, v0.4s - // - // // 7 - // sha1h s17, s6 - // add v19.4s, v3.4s, v4.4s - // sha1p q6, s18, v19.4s - // sha1su1 v2.4s, v1.4s - // sha1su0 v3.4s, v0.4s, v1.4s - // - // // 8 - // sha1h s18, s6 - // add v19.4s, v0.4s, v4.4s - // sha1p q6, s17, v19.4s - // sha1su1 v3.4s, v2.4s - // sha1su0 v0.4s, v1.4s, v2.4s - // - // // 9 - // sha1h s17, s6 - // add v19.4s, v1.4s, v4.4s - // sha1p q6, s18, v19.4s - // sha1su1 v0.4s, v3.4s - // sha1su0 v1.4s, v2.4s, v3.4s - // - // // k for the next five rounds - // adrp x1, .K2 - // ldr q4, [x1, #:lo12:.K2] - // - // // 10 - // sha1h s18, s6 - // add v19.4s, v2.4s, v4.4s - // sha1m q6, s17, v19.4s - // sha1su1 v1.4s, v0.4s - // sha1su0 v2.4s, v3.4s, v0.4s - // - // // 11 - // sha1h s17, s6 - // add v19.4s, v3.4s, v4.4s - // sha1m q6, s18, v19.4s - // sha1su1 v2.4s, v1.4s - // sha1su0 v3.4s, v0.4s, v1.4s - // - // // 12 - // sha1h s18, s6 - // add v19.4s, v0.4s, v4.4s - // sha1m q6, s17, v19.4s - // sha1su1 v3.4s, v2.4s - // sha1su0 v0.4s, v1.4s, v2.4s - // - // // 13 - // sha1h s17, s6 - // add v19.4s, v1.4s, v4.4s - // sha1m q6, s18, v19.4s - // sha1su1 v0.4s, v3.4s - // sha1su0 v1.4s, v2.4s, v3.4s - // - // // 14 - // sha1h s18, s6 - // add v19.4s, v2.4s, v4.4s - // sha1m q6, s17, v19.4s - // sha1su1 v1.4s, v0.4s - // sha1su0 v2.4s, v3.4s, v0.4s - 
// - // // k for the next five rounds - // adrp x1, .K3 - // ldr q4, [x1, #:lo12:.K3] - // - // // 15 - // sha1h s17, s6 - // add v19.4s, v3.4s, v4.4s - // sha1p q6, s18, v19.4s - // sha1su1 v2.4s, v1.4s - // sha1su0 v3.4s, v0.4s, v1.4s - // - // // 16 - // sha1h s18, s6 - // add v19.4s, v0.4s, v4.4s - // sha1p q6, s17, v19.4s - // sha1su1 v3.4s, v2.4s - // - // // 17 - // sha1h s17, s6 - // add v19.4s, v1.4s, v4.4s - // sha1p q6, s18, v19.4s - // - // // 18 - // sha1h s18, s6 - // add v19.4s, v2.4s, v4.4s - // sha1p q6, s17, v19.4s - // - // // 19 - // sha1h s17, s6 - // add v19.4s, v3.4s, v4.4s - // sha1p q6, s18, v19.4s - // - // // Update state - // add v6.4s, v6.4s, v5.4s - // str q6, [x0] - // add v16.2s, v16.2s, v17.2s - // str s16, [x0, 16] - // - // ret - // .align 4 - // .K0: - // .word 0x5A827999 - // .word 0x5A827999 - // .word 0x5A827999 - // .word 0x5A827999 - // .K1: - // .word 0x6ED9EBA1 - // .word 0x6ED9EBA1 - // .word 0x6ED9EBA1 - // .word 0x6ED9EBA1 - // .K2: - // .word 0x8F1BBCDC - // .word 0x8F1BBCDC - // .word 0x8F1BBCDC - // .word 0x8F1BBCDC - // .K3: - // .word 0xCA62C1D6 - // .word 0xCA62C1D6 - // .word 0xCA62C1D6 - // .word 0xCA62C1D6 + "adrp x1, .K0", + "ldr q4, [x1, #:lo12:.K0]", + + // 0 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1c q6, s16, v19.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 1 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1c q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 2 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1c q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 3 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1c q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 4 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1c q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // k for the next five rounds + "adrp x1, .K1", + "ldr q4, [x1, #:lo12:.K1]", + + // 5 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 6 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 7 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 8 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 9 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // k for the next five rounds + "adrp x1, .K2", + "ldr q4, [x1, #:lo12:.K2]", + + // 10 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 11 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1m q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 12 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 13 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1m q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 14 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // k for the next 
five rounds + "adrp x1, .K3", + "ldr q4, [x1, #:lo12:.K3]", + + // 15 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 16 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + + // 17 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + + // 18 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1p q6, s17, v19.4s", + + // 19 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + + // Update state + "add v6.4s, v6.4s, v5.4s", + // source code: str q6, [x0] + out(q6) state[0..4], + "add v16.2s, v16.2s, v17.2s", + // source code: str s16, [x0, 16] + out(s16) state[4], + + "ret", // TODO is this right + + ".align 4", // TODO ummm alignment... + ".K0:", // TODO are labels just the same in inline asm in rust? + ".word 0x5A827999" + ".word 0x5A827999", + ".word 0x5A827999", + ".word 0x5A827999", + ".K1:", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".K2:", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".K3:", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", ); }; diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs index da4a10a98..c80650620 100644 --- a/sha1/src/compress.rs +++ b/sha1/src/compress.rs @@ -5,6 +5,14 @@ cfg_if::cfg_if! { if #[cfg(feature = "force-soft")] { mod soft; use soft::compress as compress_inner; + } else if #[cfg(feature = "inline-asm")] { + mod asm; + #[cfg(all(feature = "inline-asm", target_arch = "x86"))] + use asm::x86::compress as compress_inner; + #[cfg(all(feature = "inline-asm", target_arch = "x86_64"))] + use asm::x86_64::compress as compress_inner; + #[cfg(all(feature = "inline-asm", target_arch = "aarch64"))] + use asm::aarch64::compress as compress_inner; } else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] { mod soft; mod aarch64; diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs index 9b7fd0a37..c37227b8e 100644 --- a/sha1/src/lib.rs +++ b/sha1/src/lib.rs @@ -76,35 +76,9 @@ mod asm; mod compress; #[cfg(feature = "compress")] -#[cfg(all( - feature = "inline-asm", - feature = "compress", - any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") -))] -pub use asm::compress; - -#[cfg(feature = "compress")] -#[cfg(all( - feature = "inline-asm", - not(feature = "compress"), - any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") -))] -use asm::compress; - -#[cfg(feature = "compress")] -#[cfg(all( - not(feature = "inline-asm"), - feature = "compress", - any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") -))] pub use compress::compress; -#[cfg(feature = "compress")] -#[cfg(all( - not(feature = "inline-asm"), - not(feature = "compress"), - any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") -))] +#[cfg(not(feature = "compress"))] use compress::compress; const STATE_LEN: usize = 5; From e65392db2bab274a02396bdf5f3d232a237e2f41 Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sun, 29 Jan 2023 12:43:03 -0500 Subject: [PATCH 6/8] we love to change the target --- .github/workflows/sha1.yml | 6 ++++++ Cargo.lock | 7 +++++++ sha1/src/compress.rs | 7 +------ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.github/workflows/sha1.yml b/.github/workflows/sha1.yml index 2228af5d5..2eb5c4312 100644 --- a/.github/workflows/sha1.yml +++ 
@@ -189,10 +189,16 @@ jobs:
 - run: cargo test
 
 # TODO: merge with test on MSRV bump to 1.59 or higher
+ # TODO: do i need to think about no-std platforms here?
 test-inline-asm:
 runs-on: ubuntu-latest
 strategy:
 matrix:
+ target:
+ - aarch64-unknown-linux-gnu
+ - x86_64-unknown-linux-gnu
+ - i686-unknown-linux-gnu
+ # TODO - aarch64-apple-darwin
 rust:
 - 1.59.0 # MSRV
 steps:
diff --git a/Cargo.lock b/Cargo.lock
index d851c17b4..66c3279ff 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "asm_block"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "466c0990cf15ef0f331f19fdc16fd60229606ab237476c70a66b747fce2911ab"
+
 [[package]]
 name = "blake2"
 version = "0.10.6"
@@ -204,6 +210,7 @@ dependencies = [
 name = "sha1"
 version = "0.10.5"
 dependencies = [
+ "asm_block",
 "cfg-if",
 "cpufeatures",
 "digest",
diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
index c80650620..d43e509d6 100644
--- a/sha1/src/compress.rs
+++ b/sha1/src/compress.rs
@@ -7,12 +7,7 @@ cfg_if::cfg_if! {
 use soft::compress as compress_inner;
 } else if #[cfg(feature = "inline-asm")] {
 mod asm;
- #[cfg(all(feature = "inline-asm", target_arch = "x86"))]
- use asm::x86::compress as compress_inner;
- #[cfg(all(feature = "inline-asm", target_arch = "x86_64"))]
- use asm::x86_64::compress as compress_inner;
- #[cfg(all(feature = "inline-asm", target_arch = "aarch64"))]
- use asm::aarch64::compress as compress_inner;
+ use asm::compress as compress_inner;
 } else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] {
 mod soft;
 mod aarch64;

From 8b8e56c9aa15f0f4a5d724d9a2ec8caea125d11e Mon Sep 17 00:00:00 2001
From: Claudia Richoux
Date: Sun, 29 Jan 2023 14:00:06 -0500
Subject: [PATCH 7/8] unsure how to get the state out of the inline asm

---
 sha1/src/asm/aarch64.rs | 35 +++++++++++++++++++++++++----------
 sha1/src/compress.rs    |  3 +--
 sha1/src/lib.rs         |  4 ----
 3 files changed, 26 insertions(+), 16 deletions(-)
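(A note on the operand problem this patch is wrestling with: `asm!` cannot
bind a slice like `state[0..4]` to a register, and `q4`/`s16` are not valid
operand names anyway; on aarch64 the explicit vector registers are spelled
`v0`-`v31`, and pointers have to travel in general-purpose registers. A
minimal sketch of one shape that does work, with all names hypothetical and
the 80 rounds elided:

    use core::arch::asm;

    fn compress_block(state: &mut [u32; 5], block: &[u8; 64]) {
        unsafe {
            asm!(
                "ldr q5, [{st}]",      // a, b, c, d
                "ldr s16, [{st}, 16]", // e
                "ldr q0, [{blk}]",     // first 16 bytes of the block
                // ... the rounds go here ...
                "str q5, [{st}]",      // write the updated state back
                "str s16, [{st}, 16]",
                st = in(reg) state.as_mut_ptr(),
                blk = in(reg) block.as_ptr(),
                out("v0") _, out("v5") _, out("v16") _, // clobbered vectors
                options(nostack),
            );
        }
    }

The trailing `ret` should not survive the port: `asm!` expands inside the
function body, so the return is the compiler's job. Named labels like `.K0:`
are also risky in `asm!`, since they collide if the block is duplicated by
inlining; loading each constant with `mov`/`movk` avoids labels entirely.)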
diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs
index 675144b6a..c5fb97351 100644
--- a/sha1/src/asm/aarch64.rs
+++ b/sha1/src/asm/aarch64.rs
@@ -31,6 +31,7 @@ use core::arch::asm;
 /// of register allocation on `x86`, we explicitly specify registers to use.
 #[cfg(all(feature = "inline-asm", target_arch = "aarch64"))]
 pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    let mut out_state = [0u32; 5];
     // SAFETY: inline-assembly
     unsafe {
         asm!(
@@ -57,9 +58,10 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
 // original code:
 // ldr q5, [x0]
 // ldr s16, [x0, 16]
- // mov v6.16b, v5.16b
- in(q5) state[0..4],
- in(s16) state[4],
+ // this now happens at the bottom...
+ // TODO what is this doing?
+ // i believe it's copying state[0..4] into v6 (which is also q6)
+ // confirmed this is the mutable copy of the first 4 words of the state
 "mov v6.16b, v5.16b",
 
 // Load block in registers
@@ -68,12 +70,10 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
 // ldr q1, [x1, 16]
 // ldr q2, [x1, 32]
 // ldr q3, [x1, 48]
- in(q0) blocks[0][0..16],
- in(q1) blocks[0][16..32],
- in(q2) blocks[0][32..48],
- in(q3) blocks[0][48..64],
+ // this is at the bottom now
 
 // from original code: TODO: only do that on little endian
+ // byte-swaps each 32-bit word: the message words are big-endian, the CPU is little-endian
 "rev32 v0.16b, v0.16b",
 "rev32 v1.16b, v1.16b",
 "rev32 v2.16b, v2.16b",
 "rev32 v3.16b, v3.16b",
@@ -230,16 +230,16 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
 // Update state
 "add v6.4s, v6.4s, v5.4s",
 // source code: str q6, [x0]
- out(q6) state[0..4],
+ // this now happens at the bottom
 "add v16.2s, v16.2s, v17.2s",
 // source code: str s16, [x0, 16]
- out(s16) state[4],
+ // this now happens at the bottom
 
 "ret", // TODO is this right
 
 ".align 4", // TODO ummm alignment...
 ".K0:", // TODO are labels just the same in inline asm in rust?
- ".word 0x5A827999"
+ ".word 0x5A827999",
 ".word 0x5A827999",
 ".word 0x5A827999",
 ".word 0x5A827999",
@@ -259,6 +259,21 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
 ".word 0xCA62C1D6",
 ".word 0xCA62C1D6",
 
+ // state ins and outs
+ in("q4") state.as_mut_ptr(),
+ inout("s16") state[4],
+ lateout("q6") state as *mut u32,
+ // blocks in
+ in("q0") blocks[0][0..16].as_ptr(),
+ in("q1") blocks[0][16..32].as_ptr(),
+ in("q2") blocks[0][32..48].as_ptr(),
+ in("q3") blocks[0][48..64].as_ptr(),
+ // some clobbers
+ out("q5") _,
+ out("s17") _,
+ out("s18") _,
+ out("q19") _,
+ // TODO make sure there aren't any other clobbers
 );
 };
 }
diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
index d43e509d6..2e80ee090 100644
--- a/sha1/src/compress.rs
+++ b/sha1/src/compress.rs
@@ -6,8 +6,7 @@ cfg_if::cfg_if! {
 mod soft;
 use soft::compress as compress_inner;
 } else if #[cfg(feature = "inline-asm")] {
- mod asm;
- use asm::compress as compress_inner;
+ use crate::asm::compress as compress_inner;
 } else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] {
 mod soft;
 mod aarch64;
diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs
index c37227b8e..8a003d2d9 100644
--- a/sha1/src/lib.rs
+++ b/sha1/src/lib.rs
@@ -69,10 +69,6 @@ use digest::{
 ))]
 mod asm;
 
-#[cfg(not(all(
- any(feature = "asm", feature = "inline-asm"),
- any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")
-)))]
 mod compress;
 
 #[cfg(feature = "compress")]

From c14975b8e3d4ca63cc7ce6239bb940439eb9b6de Mon Sep 17 00:00:00 2001
From: Claudia Richoux
Date: Sun, 29 Jan 2023 14:41:52 -0500
Subject: [PATCH 8/8] starting x86

---
 sha1/src/asm/x86.rs    | 169 ++++++++++++++++++++++++++++++++++-------
 sha1/src/asm/x86_64.rs |  34 ++++-----
 2 files changed, 160 insertions(+), 43 deletions(-)
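(Two syntax hazards to sort out while transcribing these .S macros: Rust's
`asm!` assembles x86 as Intel syntax by default, so AT&T-flavored bodies
like the ones below will presumably need `options(att_syntax)` on the final
`asm!` call. Also, `$` is reserved for metavariables inside `macro_rules!`
bodies, so AT&T immediates such as `roll $1, %esi` cannot appear literally
in a transcriber; they need whatever escaping asm_block provides, or a flip
to Intel operand order, where the sigils disappear altogether, e.g. the
hypothetical equivalent:

    rol esi, 1
)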
diff --git a/sha1/src/asm/x86.rs b/sha1/src/asm/x86.rs
index 2ab865649..f7b24e839 100644
--- a/sha1/src/asm/x86.rs
+++ b/sha1/src/asm/x86.rs
@@ -51,8 +51,9 @@ use asm_block::asm_block;
 // * 4 [esp+72] Caller's value of edi
 // * 4 [esp+76] Caller's value of ebp
 // */
-//
-// #define ROUND0a(a, b, c, d, e, i) \
+
+
+// #define round0a(a, b, c, d, e, i) \
 // movl (i*4)(%edi), %esi; \
 // bswapl %esi; \
 // movl %esi, (i*4)(%esp); \
 // addl %esi, %e; \
 // movl %c, %esi; \
 // xorl %d, %esi; \
 // andl %b, %esi; \
 // xorl %d, %esi; \
 // ROUNDTAIL(a, b, e, i, 0x5A827999)

macro_rules! round0a {
    ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => {
        concat! {
            asm_block! {
                movl ($i*4)(%edi), %esi;
                bswapl %esi;
                movl %esi, ($i*4)(%esp);
                addl %esi, $e;
                movl $c, %esi;
                xorl $d, %esi;
                andl $b, %esi;
                xorl $d, %esi;
                roundtail!($a, $b, $e, $i, 0x5A827999);
            }
        }
    };
}

// #define SCHEDULE(i, e) \
// movl (((i- 3)&0xF)*4)(%esp), %esi; \
// xorl (((i- 8)&0xF)*4)(%esp), %esi; \
// xorl (((i-14)&0xF)*4)(%esp), %esi; \
// xorl (((i-16)&0xF)*4)(%esp), %esi; \
// roll $1, %esi; \
// addl %esi, %e; \
// movl %esi, ((i&0xF)*4)(%esp);

macro_rules! schedule {
    ($i:tt, $e:tt) => {
        concat! {
            asm_block! {
                movl ((($i- 3)&0xF)*4)(%esp), %esi;
                xorl ((($i- 8)&0xF)*4)(%esp), %esi;
                xorl ((($i-14)&0xF)*4)(%esp), %esi;
                xorl ((($i-16)&0xF)*4)(%esp), %esi;
                roll $1, %esi;
                addl %esi, $e;
                movl %esi, (($i&0xF)*4)(%esp);
            }
        }
    };
}

// #define ROUND0b(a, b, c, d, e, i) \
// SCHEDULE(i, e) \
// movl %c, %esi; \
// xorl %d, %esi; \
// andl %b, %esi; \
// xorl %d, %esi; \
// ROUNDTAIL(a, b, e, i, 0x5A827999)

macro_rules! round0b {
    ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => {
        concat! {
            asm_block! {
                schedule!($i, $e);
                movl $c, %esi;
                xorl $d, %esi;
                andl $b, %esi;
                xorl $d, %esi;
                roundtail!($a, $b, $e, $i, 0x5A827999);
            }
        }
    };
}

// #define ROUND1(a, b, c, d, e, i) \
// SCHEDULE(i, e) \
// movl %b, %esi; \
// xorl %c, %esi; \
// xorl %d, %esi; \
// ROUNDTAIL(a, b, e, i, 0x6ED9EBA1)

macro_rules! round1 {
    ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => {
        concat! {
            asm_block! {
                schedule!($i, $e);
                movl $b, %esi;
                xorl $c, %esi;
                xorl $d, %esi;
                roundtail!($a, $b, $e, $i, 0x6ED9EBA1);
            }
        }
    };
}

// #define ROUND2(a, b, c, d, e, i) \
// SCHEDULE(i, e) \
// movl %c, %esi; \
// movl %c, %edi; \
// orl %d, %esi; \
// andl %b, %esi; \
// andl %d, %edi; \
// orl %edi, %esi; \
// ROUNDTAIL(a, b, e, i, 0x8F1BBCDC)

macro_rules! round2 {
    ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => {
        concat! {
            asm_block! {
                schedule!($i, $e);
                movl $c, %esi;
                movl $c, %edi;
                orl $d, %esi;
                andl $b, %esi;
                andl $d, %edi;
                orl %edi, %esi;
                roundtail!($a, $b, $e, $i, 0x8F1BBCDC);
            }
        }
    };
}

// #define ROUND3(a, b, c, d, e, i) \
// SCHEDULE(i, e) \
// movl %b, %esi; \
// xorl %c, %esi; \
// xorl %d, %esi; \
// ROUNDTAIL(a, b, e, i, 0xCA62C1D6)

macro_rules! round3 {
    ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => {
        concat! {
            asm_block! {
                schedule!($i, $e);
                movl $b, %esi;
                xorl $c, %esi;
                xorl $d, %esi;
                roundtail!($a, $b, $e, $i, 0xCA62C1D6);
            }
        }
    };
}

// #define ROUNDTAIL(a, b, e, i, k) \
// roll $30, %b; \
// leal k(%e,%esi), %e; \
// movl %a, %esi; \
// roll $5, %esi; \
// addl %esi, %e;

macro_rules! roundtail {
    ($a:tt, $b:tt, $e:tt, $i:tt, $k:tt) => {
        concat! {
            asm_block! {
                roll $30, $b;
                leal $k($e,%esi), $e;
                movl $a, %esi;
                roll $5, %esi;
                addl %esi, $e;
            }
        }
    };
}

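// How these pieces are meant to snap together eventually (a hypothetical
// sketch, not the final macro: the operand list, clobbers, and att_syntax
// option still have to be worked out in asm_sha1 below):
//
//     asm!(
//         concat!(
//             round0a!(eax, ebx, ecx, edx, ebp, 0),
//             round0a!(ebp, eax, ebx, ecx, edx, 1),
//             // ... 78 more rounds ...
//         ),
//         // ins, outs, clobbers
//     );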
macro_rules! asm_sha1 {
    // states
    // /* Save registers */
// subl $80, %esp
// movl %ebx, 64(%esp)
// movl %esi, 68(%esp)
// movl %edi, 72(%esp)
// movl %ebp, 76(%esp)
//
// /* Load arguments */
// movl 84(%esp), %esi /* state */
// movl 88(%esp), %edi /* block */
// movl 0(%esi), %eax /* a */
// movl 4(%esi), %ebx /* b */
// movl 8(%esi), %ecx /* c */
// movl 12(%esi), %edx /* d */
// movl 16(%esi), %ebp /* e */
//
// /* 80 rounds of hashing */
// round0a(eax, ebx, ecx, edx, ebp, 0)
// round0a(ebp, eax, ebx, ecx, edx, 1)
// round0a(edx, ebp, eax, ebx, ecx, 2)
// round0a(ecx, edx, ebp, eax, ebx, 3)
// round0a(ebx, ecx, edx, ebp, eax, 4)
// round0a(eax, ebx, ecx, edx, ebp, 5)
// round0a(ebp, eax, ebx, ecx, edx, 6)
// round0a(edx, ebp, eax, ebx, ecx, 7)
// round0a(ecx, edx, ebp, eax, ebx, 8)
// round0a(ebx, ecx, edx, ebp, eax, 9)
// round0a(eax, ebx, ecx, edx, ebp, 10)
// round0a(ebp, eax, ebx, ecx, edx, 11)
// round0a(edx, ebp, eax, ebx, ecx, 12)
// round0a(ecx, edx, ebp, eax, ebx, 13)
// round0a(ebx, ecx, edx, ebp, eax, 14)
// round0a(eax, ebx, ecx, edx, ebp, 15)
// ROUND0b(ebp, eax, ebx, ecx, edx, 16)
// ROUND0b(edx, ebp, eax, ebx, ecx, 17)
// ROUND0b(ecx, edx, ebp, eax, ebx, 18)
@@ -224,3 +335,9 @@
 // movl 76(%esp), %ebp
 // addl $80, %esp
 // retl
+}
+
+#[cfg(all(feature = "inline-asm", target_arch = "x86"))]
+pub fn compress(_state: &mut [u32; 5], _blocks: &[[u8; 64]]) {
+    unimplemented!("compress() is not implemented for x86");
+}
diff --git a/sha1/src/asm/x86_64.rs b/sha1/src/asm/x86_64.rs
index 5e095883b..37c26dd15 100644
--- a/sha1/src/asm/x86_64.rs
+++ b/sha1/src/asm/x86_64.rs
@@ -48,7 +48,7 @@
 // * 16 xmm1 Caller's value of rbp (only low 64 bits are used)
 // */
 //
-// #define ROUND0a(a, b, c, d, e, i) \
+// #define round0a(a, b, c, d, e, i) \
 // movl (i*4)(%rdi), %esi; \
 // bswapl %esi; \
 // movl %esi, (i*4)(%rsp); \
@@ -122,22 +122,22 @@
 // movq %rsi, %rdi
 //
 // /* 80 rounds of hashing */
-// ROUND0a(eax, ebx, ecx, edx, ebp, 0)
-// ROUND0a(ebp, eax, ebx, ecx, edx, 1)
-// ROUND0a(edx, ebp, eax, ebx, ecx, 2)
-// ROUND0a(ecx, edx, ebp, eax, ebx, 3)
-// ROUND0a(ebx, ecx, edx, ebp, eax, 4)
-// ROUND0a(eax, ebx, ecx, edx, ebp, 5)
-// ROUND0a(ebp, eax, ebx, ecx, edx, 6)
-// ROUND0a(edx, ebp, eax, ebx, ecx, 7)
-// ROUND0a(ecx, edx, ebp, eax, ebx, 8)
-// ROUND0a(ebx, ecx, edx, ebp, eax, 9)
-// ROUND0a(eax, ebx, ecx, edx, ebp, 10)
-// ROUND0a(ebp, eax, ebx, ecx, edx, 11)
-// ROUND0a(edx, ebp, eax, ebx, ecx, 12)
-// ROUND0a(ecx, edx, ebp, eax, ebx, 13)
-// ROUND0a(ebx, ecx, edx, ebp, eax, 14)
-// ROUND0a(eax, ebx, ecx, edx, ebp, 15)
+// round0a(eax, ebx, ecx, edx, ebp, 0)
+// round0a(ebp, eax, ebx, ecx, edx, 1)
+// round0a(edx, ebp, eax, ebx, ecx, 2)
+// round0a(ecx, edx, ebp, eax, ebx, 3)
+// round0a(ebx, ecx, edx, ebp, eax, 4)
+// round0a(eax, ebx, ecx, edx, ebp, 5)
+// round0a(ebp, eax, ebx, ecx, edx, 6)
+// round0a(edx, ebp, eax, ebx, ecx, 7)
+// round0a(ecx, edx, ebp, eax, ebx, 8)
+// round0a(ebx, ecx, edx, ebp, eax, 9)
+// round0a(eax, ebx, ecx, edx, ebp, 10)
+// round0a(ebp, eax, ebx, ecx, edx, 11)
+// round0a(edx, ebp, eax, ebx, ecx, 12)
+// round0a(ecx, edx, ebp, eax, ebx, 13)
+// round0a(ebx, ecx, edx, ebp, eax, 14)
+// round0a(eax, ebx, ecx, edx, ebp, 15)
 // ROUND0b(ebp, eax, ebx, ecx, edx, 16)
 // ROUND0b(edx, ebp, eax, ebx, ecx, 17)
 // ROUND0b(ecx, edx, ebp, eax, ebx, 18)