From c8e57c0196a0cb912da905c70ae2335e53fabba2 Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sat, 28 Jan 2023 21:04:36 -0500 Subject: [PATCH 1/8] adding some tests --- .github/workflows/sha1.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/sha1.yml b/.github/workflows/sha1.yml index ca09fd895..2228af5d5 100644 --- a/.github/workflows/sha1.yml +++ b/.github/workflows/sha1.yml @@ -187,3 +187,20 @@ jobs: override: true - run: cargo test --no-default-features - run: cargo test + + # TODO: merge with test on MSRV bump to 1.59 or higher + test-inline-asm: + runs-on: ubuntu-latest + strategy: + matrix: + rust: + - 1.59.0 # MSRV + steps: + - uses: actions/checkout@v3 + - uses: RustCrypto/actions/cargo-cache@master + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: ${{ matrix.rust }} + override: true + - run: cargo test --features inline-asm From 68a36e473e4f10bebe299cb8d5a457125e2c99ca Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sat, 28 Jan 2023 23:42:42 -0500 Subject: [PATCH 2/8] skeleton --- sha1/Cargo.toml | 2 + sha1/README.md | 2 + sha1/src/asm/aarch64.rs | 237 ++++++++++++++++++++++++++++++++++ sha1/src/asm/aarch64_apple.rs | 237 ++++++++++++++++++++++++++++++++++ sha1/src/asm/mod.rs | 7 + sha1/src/asm/x86.rs | 227 ++++++++++++++++++++++++++++++++ sha1/src/asm/x86_64.rs | 217 +++++++++++++++++++++++++++++++ sha1/src/lib.rs | 39 +++++- 8 files changed, 967 insertions(+), 1 deletion(-) create mode 100644 sha1/src/asm/aarch64.rs create mode 100644 sha1/src/asm/aarch64_apple.rs create mode 100644 sha1/src/asm/mod.rs create mode 100644 sha1/src/asm/x86.rs create mode 100644 sha1/src/asm/x86_64.rs diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml index 8ff801508..06a7bc44e 100644 --- a/sha1/Cargo.toml +++ b/sha1/Cargo.toml @@ -18,6 +18,7 @@ cfg-if = "1.0" [target.'cfg(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64"))'.dependencies] cpufeatures = "0.2" sha1-asm = { version = "0.5", optional = true } +asm_block = { version = "0.1.3", optional = true } [dev-dependencies] digest = { version = "0.10.4", features = ["dev"] } @@ -30,6 +31,7 @@ oid = ["digest/oid"] # Enable OID support. WARNING: Bumps MSRV to 1.57 asm = ["sha1-asm"] # WARNING: this feature SHOULD NOT be enabled by library crates compress = [] # Expose compress function force-soft = [] # Force software implementation +inline-asm = ["asm_block"] # TODO: i don't know why the "do not enable by library crates" warning is in the asm feature, flagging this to ask about it. # WARNING: bumps MSRV to 1.59 [package.metadata.docs.rs] all-features = true diff --git a/sha1/README.md b/sha1/README.md index bd76f0973..d388cc6f1 100644 --- a/sha1/README.md +++ b/sha1/README.md @@ -23,6 +23,8 @@ We provide this crate for legacy interoperability purposes only. Rust **1.41** or higher. +Enabling feature flag `inline-asm` requires Rust **1.59** or higher. + Minimum supported Rust version can be changed in the future, but it will be done with a minor version bump. diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs new file mode 100644 index 000000000..c069f329e --- /dev/null +++ b/sha1/src/asm/aarch64.rs @@ -0,0 +1,237 @@ +// /* +// * SHA-1 hash in AArch64 assembly +// * +// * Copyright (c) 2020 Emmanuel Gil Peyrot . 
(MIT License) +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. +// */ +// +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// .global sha1_compress +// sha1_compress: +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 x0 state argument +// * 4 x1 block argument +// * 16 q0 W0 +// * 16 q1 W1 +// * 16 q2 W2 +// * 16 q3 W3 +// * 16 q4 k +// * 16 q5 Original ABCD +// * 16 q6 ABCD (with s3 being A) +// * 4 s16 E +// * 4 s17 e0 +// * 4 s18 e1 +// * 16 q19 wk +// */ +// +// // Load state in registers +// ldr q5, [x0] +// ldr s16, [x0, 16] +// mov v6.16b, v5.16b +// +// // Load block in registers +// ldr q0, [x1] +// ldr q1, [x1, 16] +// ldr q2, [x1, 32] +// ldr q3, [x1, 48] +// +// // TODO: only do that on little endian +// rev32 v0.16b, v0.16b +// rev32 v1.16b, v1.16b +// rev32 v2.16b, v2.16b +// rev32 v3.16b, v3.16b +// +// // k for the next five rounds +// adrp x1, .K0 +// ldr q4, [x1, #:lo12:.K0] +// +// // 0 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s16, v19.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 1 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 2 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 3 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 4 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // k for the next five rounds +// adrp x1, .K1 +// ldr q4, [x1, #:lo12:.K1] +// +// // 5 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 6 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 7 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 8 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 9 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // k for the next five rounds 
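// // NOTE: adrp materializes the 4 KiB page address of .K2, and the
// // #:lo12: offset on the following ldr adds the low 12 bits of that
// // address, so q4 ends up holding the packed round constant
// // 0x8F1BBCDC used by the sha1m (majority) steps below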
+// adrp x1, .K2 +// ldr q4, [x1, #:lo12:.K2] +// +// // 10 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 11 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 12 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 13 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 14 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // k for the next five rounds +// adrp x1, .K3 +// ldr q4, [x1, #:lo12:.K3] +// +// // 15 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 16 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// +// // 17 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // 18 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// +// // 19 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // Update state +// add v6.4s, v6.4s, v5.4s +// str q6, [x0] +// add v16.2s, v16.2s, v17.2s +// str s16, [x0, 16] +// +// ret +// .align 4 +// .K0: +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .K1: +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .K2: +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .K3: +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 diff --git a/sha1/src/asm/aarch64_apple.rs b/sha1/src/asm/aarch64_apple.rs new file mode 100644 index 000000000..832c2c359 --- /dev/null +++ b/sha1/src/asm/aarch64_apple.rs @@ -0,0 +1,237 @@ +// /* +// * SHA-1 hash in AArch64 assembly +// * +// * Copyright (c) 2020 Emmanuel Gil Peyrot . (MIT License) +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. 
+// */ +// +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// .global _sha1_compress +// _sha1_compress: +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 x0 state argument +// * 4 x1 block argument +// * 16 q0 W0 +// * 16 q1 W1 +// * 16 q2 W2 +// * 16 q3 W3 +// * 16 q4 k +// * 16 q5 Original ABCD +// * 16 q6 ABCD (with s3 being A) +// * 4 s16 E +// * 4 s17 e0 +// * 4 s18 e1 +// * 16 q19 wk +// */ +// +// // Load state in registers +// ldr q5, [x0] +// ldr s16, [x0, 16] +// mov v6.16b, v5.16b +// +// // Load block in registers +// ldr q0, [x1] +// ldr q1, [x1, 16] +// ldr q2, [x1, 32] +// ldr q3, [x1, 48] +// +// // TODO: only do that on little endian +// rev32 v0.16b, v0.16b +// rev32 v1.16b, v1.16b +// rev32 v2.16b, v2.16b +// rev32 v3.16b, v3.16b +// +// // k for the next five rounds +// adrp x1, .K0@PAGE +// ldr q4, [x1, #:lo12:.K0@PAGEOFF] +// +// // 0 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s16, v19.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 1 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 2 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 3 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1c q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 4 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1c q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // k for the next five rounds +// adrp x1, .K1@PAGE +// ldr q4, [x1, #:lo12:.K1@PAGEOFF] +// +// // 5 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 6 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 7 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 8 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 9 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // k for the next five rounds +// adrp x1, .K2@PAGE +// ldr q4, [x1, #:lo12:.K2@PAGEOFF] +// +// // 10 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // 11 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 12 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v3.4s, v2.4s +// sha1su0 v0.4s, v1.4s, v2.4s +// +// // 13 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1m q6, s18, v19.4s +// sha1su1 v0.4s, v3.4s +// sha1su0 v1.4s, v2.4s, v3.4s +// +// // 14 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1m q6, s17, v19.4s +// sha1su1 v1.4s, v0.4s +// sha1su0 v2.4s, v3.4s, v0.4s +// +// // k for the next five rounds +// adrp x1, .K3@PAGE +// ldr q4, [x1, #:lo12:.K3@PAGEOFF] +// +// // 15 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// sha1su1 v2.4s, v1.4s +// sha1su0 v3.4s, v0.4s, v1.4s +// +// // 16 +// sha1h s18, s6 +// add v19.4s, v0.4s, v4.4s +// sha1p q6, s17, v19.4s 
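// // (the sha1su1 below is the last message-schedule update; steps
// // 17-19 only consume the schedule, so they carry no sha1su0/sha1su1)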
+// sha1su1 v3.4s, v2.4s +// +// // 17 +// sha1h s17, s6 +// add v19.4s, v1.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // 18 +// sha1h s18, s6 +// add v19.4s, v2.4s, v4.4s +// sha1p q6, s17, v19.4s +// +// // 19 +// sha1h s17, s6 +// add v19.4s, v3.4s, v4.4s +// sha1p q6, s18, v19.4s +// +// // Update state +// add v6.4s, v6.4s, v5.4s +// str q6, [x0] +// add v16.2s, v16.2s, v17.2s +// str s16, [x0, 16] +// +// ret +// .align 4 +// .K0: +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .word 0x5A827999 +// .K1: +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .word 0x6ED9EBA1 +// .K2: +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .word 0x8F1BBCDC +// .K3: +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 +// .word 0xCA62C1D6 diff --git a/sha1/src/asm/mod.rs b/sha1/src/asm/mod.rs new file mode 100644 index 000000000..056e15e14 --- /dev/null +++ b/sha1/src/asm/mod.rs @@ -0,0 +1,7 @@ +// TODO (laudiacay): here, do the switch to figure out which architecture's method we'll do... +// here's how that md5 PR did it (obviously wrong for what we want here...) +// #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +// mod x86; +// +// #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +// pub use x86::compress; diff --git a/sha1/src/asm/x86.rs b/sha1/src/asm/x86.rs new file mode 100644 index 000000000..c9ff140fe --- /dev/null +++ b/sha1/src/asm/x86.rs @@ -0,0 +1,227 @@ +//! SHA-1 hash in x86 assembly. adapted from Project Nayuki's MIT licensed code... +// /* +// * SHA-1 hash in x86 assembly +// * +// * Copyright (c) 2014 Project Nayuki. (MIT License) +// * https://www.nayuki.io/page/fast-sha1-hash-implementation-in-x86-assembly +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. 
+// */ + +use core::arch::asm; + +use asm_block::asm_block; + +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// #ifdef __APPLE__ +// .globl _sha1_compress +// _sha1_compress: +// #else +// .globl sha1_compress +// sha1_compress: +// #endif +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 eax SHA-1 state variable A +// * 4 ebx SHA-1 state variable B +// * 4 ecx SHA-1 state variable C +// * 4 edx SHA-1 state variable D +// * 4 ebp SHA-1 state variable E +// * 4 esi Temporary for calculation per round +// * 4 edi (First 16 rounds) base address of block array argument (read-only); (last 64 rounds) temporary for calculation per round +// * 4 esp x86 stack pointer +// * 64 [esp+ 0] Circular buffer of most recent 16 key schedule items, 4 bytes each +// * 4 [esp+64] Caller's value of ebx +// * 4 [esp+68] Caller's value of esi +// * 4 [esp+72] Caller's value of edi +// * 4 [esp+76] Caller's value of ebp +// */ +// +// #define ROUND0a(a, b, c, d, e, i) \ +// movl (i*4)(%edi), %esi; \ +// bswapl %esi; \ +// movl %esi, (i*4)(%esp); \ +// addl %esi, %e; \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define SCHEDULE(i, e) \ +// movl (((i- 3)&0xF)*4)(%esp), %esi; \ +// xorl (((i- 8)&0xF)*4)(%esp), %esi; \ +// xorl (((i-14)&0xF)*4)(%esp), %esi; \ +// xorl (((i-16)&0xF)*4)(%esp), %esi; \ +// roll $1, %esi; \ +// addl %esi, %e; \ +// movl %esi, ((i&0xF)*4)(%esp); +// +// #define ROUND0b(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define ROUND1(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x6ED9EBA1) +// +// #define ROUND2(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// movl %c, %edi; \ +// orl %d, %esi; \ +// andl %b, %esi; \ +// andl %d, %edi; \ +// orl %edi, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x8F1BBCDC) +// +// #define ROUND3(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0xCA62C1D6) +// +// #define ROUNDTAIL(a, b, e, i, k) \ +// roll $30, %b; \ +// leal k(%e,%esi), %e; \ +// movl %a, %esi; \ +// roll $5, %esi; \ +// addl %esi, %e; +// +// /* Save registers */ +// subl $80, %esp +// movl %ebx, 64(%esp) +// movl %esi, 68(%esp) +// movl %edi, 72(%esp) +// movl %ebp, 76(%esp) +// +// /* Load arguments */ +// movl 84(%esp), %esi /* state */ +// movl 88(%esp), %edi /* block */ +// movl 0(%esi), %eax /* a */ +// movl 4(%esi), %ebx /* b */ +// movl 8(%esi), %ecx /* c */ +// movl 12(%esi), %edx /* d */ +// movl 16(%esi), %ebp /* e */ +// +// /* 80 rounds of hashing */ +// ROUND0a(eax, ebx, ecx, edx, ebp, 0) +// ROUND0a(ebp, eax, ebx, ecx, edx, 1) +// ROUND0a(edx, ebp, eax, ebx, ecx, 2) +// ROUND0a(ecx, edx, ebp, eax, ebx, 3) +// ROUND0a(ebx, ecx, edx, ebp, eax, 4) +// ROUND0a(eax, ebx, ecx, edx, ebp, 5) +// ROUND0a(ebp, eax, ebx, ecx, edx, 6) +// ROUND0a(edx, ebp, eax, ebx, ecx, 7) +// ROUND0a(ecx, edx, ebp, eax, ebx, 8) +// ROUND0a(ebx, ecx, edx, ebp, eax, 9) +// ROUND0a(eax, ebx, ecx, edx, ebp, 10) +// ROUND0a(ebp, eax, ebx, ecx, edx, 11) +// ROUND0a(edx, ebp, eax, ebx, ecx, 12) +// ROUND0a(ecx, edx, ebp, eax, ebx, 13) +// ROUND0a(ebx, ecx, edx, ebp, eax, 14) +// ROUND0a(eax, ebx, ecx, edx, ebp, 15) +// ROUND0b(ebp, eax, ebx, ecx, edx, 16) +// ROUND0b(edx, ebp, eax, 
ebx, ecx, 17) +// ROUND0b(ecx, edx, ebp, eax, ebx, 18) +// ROUND0b(ebx, ecx, edx, ebp, eax, 19) +// ROUND1(eax, ebx, ecx, edx, ebp, 20) +// ROUND1(ebp, eax, ebx, ecx, edx, 21) +// ROUND1(edx, ebp, eax, ebx, ecx, 22) +// ROUND1(ecx, edx, ebp, eax, ebx, 23) +// ROUND1(ebx, ecx, edx, ebp, eax, 24) +// ROUND1(eax, ebx, ecx, edx, ebp, 25) +// ROUND1(ebp, eax, ebx, ecx, edx, 26) +// ROUND1(edx, ebp, eax, ebx, ecx, 27) +// ROUND1(ecx, edx, ebp, eax, ebx, 28) +// ROUND1(ebx, ecx, edx, ebp, eax, 29) +// ROUND1(eax, ebx, ecx, edx, ebp, 30) +// ROUND1(ebp, eax, ebx, ecx, edx, 31) +// ROUND1(edx, ebp, eax, ebx, ecx, 32) +// ROUND1(ecx, edx, ebp, eax, ebx, 33) +// ROUND1(ebx, ecx, edx, ebp, eax, 34) +// ROUND1(eax, ebx, ecx, edx, ebp, 35) +// ROUND1(ebp, eax, ebx, ecx, edx, 36) +// ROUND1(edx, ebp, eax, ebx, ecx, 37) +// ROUND1(ecx, edx, ebp, eax, ebx, 38) +// ROUND1(ebx, ecx, edx, ebp, eax, 39) +// ROUND2(eax, ebx, ecx, edx, ebp, 40) +// ROUND2(ebp, eax, ebx, ecx, edx, 41) +// ROUND2(edx, ebp, eax, ebx, ecx, 42) +// ROUND2(ecx, edx, ebp, eax, ebx, 43) +// ROUND2(ebx, ecx, edx, ebp, eax, 44) +// ROUND2(eax, ebx, ecx, edx, ebp, 45) +// ROUND2(ebp, eax, ebx, ecx, edx, 46) +// ROUND2(edx, ebp, eax, ebx, ecx, 47) +// ROUND2(ecx, edx, ebp, eax, ebx, 48) +// ROUND2(ebx, ecx, edx, ebp, eax, 49) +// ROUND2(eax, ebx, ecx, edx, ebp, 50) +// ROUND2(ebp, eax, ebx, ecx, edx, 51) +// ROUND2(edx, ebp, eax, ebx, ecx, 52) +// ROUND2(ecx, edx, ebp, eax, ebx, 53) +// ROUND2(ebx, ecx, edx, ebp, eax, 54) +// ROUND2(eax, ebx, ecx, edx, ebp, 55) +// ROUND2(ebp, eax, ebx, ecx, edx, 56) +// ROUND2(edx, ebp, eax, ebx, ecx, 57) +// ROUND2(ecx, edx, ebp, eax, ebx, 58) +// ROUND2(ebx, ecx, edx, ebp, eax, 59) +// ROUND3(eax, ebx, ecx, edx, ebp, 60) +// ROUND3(ebp, eax, ebx, ecx, edx, 61) +// ROUND3(edx, ebp, eax, ebx, ecx, 62) +// ROUND3(ecx, edx, ebp, eax, ebx, 63) +// ROUND3(ebx, ecx, edx, ebp, eax, 64) +// ROUND3(eax, ebx, ecx, edx, ebp, 65) +// ROUND3(ebp, eax, ebx, ecx, edx, 66) +// ROUND3(edx, ebp, eax, ebx, ecx, 67) +// ROUND3(ecx, edx, ebp, eax, ebx, 68) +// ROUND3(ebx, ecx, edx, ebp, eax, 69) +// ROUND3(eax, ebx, ecx, edx, ebp, 70) +// ROUND3(ebp, eax, ebx, ecx, edx, 71) +// ROUND3(edx, ebp, eax, ebx, ecx, 72) +// ROUND3(ecx, edx, ebp, eax, ebx, 73) +// ROUND3(ebx, ecx, edx, ebp, eax, 74) +// ROUND3(eax, ebx, ecx, edx, ebp, 75) +// ROUND3(ebp, eax, ebx, ecx, edx, 76) +// ROUND3(edx, ebp, eax, ebx, ecx, 77) +// ROUND3(ecx, edx, ebp, eax, ebx, 78) +// ROUND3(ebx, ecx, edx, ebp, eax, 79) +// +// /* Save updated state */ +// movl 84(%esp), %esi +// addl %eax, 0(%esi) +// addl %ebx, 4(%esi) +// addl %ecx, 8(%esi) +// addl %edx, 12(%esi) +// addl %ebp, 16(%esi) +// +// /* Restore registers */ +// movl 64(%esp), %ebx +// movl 68(%esp), %esi +// movl 72(%esp), %edi +// movl 76(%esp), %ebp +// addl $80, %esp +// retl diff --git a/sha1/src/asm/x86_64.rs b/sha1/src/asm/x86_64.rs new file mode 100644 index 000000000..5e095883b --- /dev/null +++ b/sha1/src/asm/x86_64.rs @@ -0,0 +1,217 @@ +// /* +// * SHA-1 hash in x86-64 assembly +// * +// * Copyright (c) 2015 Project Nayuki. 
(MIT License) +// * https://www.nayuki.io/page/fast-sha1-hash-implementation-in-x86-assembly +// * +// * Permission is hereby granted, free of charge, to any person obtaining a copy of +// * this software and associated documentation files (the "Software"), to deal in +// * the Software without restriction, including without limitation the rights to +// * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// * the Software, and to permit persons to whom the Software is furnished to do so, +// * subject to the following conditions: +// * - The above copyright notice and this permission notice shall be included in +// * all copies or substantial portions of the Software. +// * - The Software is provided "as is", without warranty of any kind, express or +// * implied, including but not limited to the warranties of merchantability, +// * fitness for a particular purpose and noninfringement. In no event shall the +// * authors or copyright holders be liable for any claim, damages or other +// * liability, whether in an action of contract, tort or otherwise, arising from, +// * out of or in connection with the Software or the use or other dealings in the +// * Software. +// */ +// +// +// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ +// #ifdef __APPLE__ +// .globl _sha1_compress +// _sha1_compress: +// #else +// .globl sha1_compress +// sha1_compress: +// #endif +// /* +// * Storage usage: +// * Bytes Location Description +// * 4 eax SHA-1 state variable A +// * 4 ebx SHA-1 state variable B +// * 4 ecx SHA-1 state variable C +// * 4 edx SHA-1 state variable D +// * 4 ebp SHA-1 state variable E +// * 4 esi Temporary for calculation per round +// * 4 edi (Last 64 rounds) temporary for calculation per round +// * 8 rdi (First 16 rounds) base address of block array argument (read-only) +// * 8 r8 Base address of state array argument (read-only) +// * 8 rsp x86-64 stack pointer +// * 64 [rsp+0] Circular buffer of most recent 16 key schedule items, 4 bytes each +// * 16 xmm0 Caller's value of rbx (only low 64 bits are used) +// * 16 xmm1 Caller's value of rbp (only low 64 bits are used) +// */ +// +// #define ROUND0a(a, b, c, d, e, i) \ +// movl (i*4)(%rdi), %esi; \ +// bswapl %esi; \ +// movl %esi, (i*4)(%rsp); \ +// addl %esi, %e; \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define SCHEDULE(i, e) \ +// movl (((i- 3)&0xF)*4)(%rsp), %esi; \ +// xorl (((i- 8)&0xF)*4)(%rsp), %esi; \ +// xorl (((i-14)&0xF)*4)(%rsp), %esi; \ +// xorl (((i-16)&0xF)*4)(%rsp), %esi; \ +// roll $1, %esi; \ +// addl %esi, %e; \ +// movl %esi, ((i&0xF)*4)(%rsp); +// +// #define ROUND0b(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// xorl %d, %esi; \ +// andl %b, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x5A827999) +// +// #define ROUND1(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, 0x6ED9EBA1) +// +// #define ROUND2(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %c, %esi; \ +// movl %c, %edi; \ +// orl %d, %esi; \ +// andl %b, %esi; \ +// andl %d, %edi; \ +// orl %edi, %esi; \ +// ROUNDTAIL(a, b, e, i, -0x70E44324) +// +// #define ROUND3(a, b, c, d, e, i) \ +// SCHEDULE(i, e) \ +// movl %b, %esi; \ +// xorl %c, %esi; \ +// xorl %d, %esi; \ +// ROUNDTAIL(a, b, e, i, -0x359D3E2A) +// +// #define ROUNDTAIL(a, b, e, i, k) \ +// roll $30, %b; \ +// leal k(%e,%esi), %e; \ +// 
movl %a, %esi; \ +// roll $5, %esi; \ +// addl %esi, %e; +// +// /* Save registers, allocate scratch space */ +// movq %rbx, %xmm0 +// movq %rbp, %xmm1 +// subq $64, %rsp +// +// /* Load arguments */ +// movq %rdi, %r8 +// movl 0(%rdi), %eax /* a */ +// movl 4(%rdi), %ebx /* b */ +// movl 8(%rdi), %ecx /* c */ +// movl 12(%rdi), %edx /* d */ +// movl 16(%rdi), %ebp /* e */ +// movq %rsi, %rdi +// +// /* 80 rounds of hashing */ +// ROUND0a(eax, ebx, ecx, edx, ebp, 0) +// ROUND0a(ebp, eax, ebx, ecx, edx, 1) +// ROUND0a(edx, ebp, eax, ebx, ecx, 2) +// ROUND0a(ecx, edx, ebp, eax, ebx, 3) +// ROUND0a(ebx, ecx, edx, ebp, eax, 4) +// ROUND0a(eax, ebx, ecx, edx, ebp, 5) +// ROUND0a(ebp, eax, ebx, ecx, edx, 6) +// ROUND0a(edx, ebp, eax, ebx, ecx, 7) +// ROUND0a(ecx, edx, ebp, eax, ebx, 8) +// ROUND0a(ebx, ecx, edx, ebp, eax, 9) +// ROUND0a(eax, ebx, ecx, edx, ebp, 10) +// ROUND0a(ebp, eax, ebx, ecx, edx, 11) +// ROUND0a(edx, ebp, eax, ebx, ecx, 12) +// ROUND0a(ecx, edx, ebp, eax, ebx, 13) +// ROUND0a(ebx, ecx, edx, ebp, eax, 14) +// ROUND0a(eax, ebx, ecx, edx, ebp, 15) +// ROUND0b(ebp, eax, ebx, ecx, edx, 16) +// ROUND0b(edx, ebp, eax, ebx, ecx, 17) +// ROUND0b(ecx, edx, ebp, eax, ebx, 18) +// ROUND0b(ebx, ecx, edx, ebp, eax, 19) +// ROUND1(eax, ebx, ecx, edx, ebp, 20) +// ROUND1(ebp, eax, ebx, ecx, edx, 21) +// ROUND1(edx, ebp, eax, ebx, ecx, 22) +// ROUND1(ecx, edx, ebp, eax, ebx, 23) +// ROUND1(ebx, ecx, edx, ebp, eax, 24) +// ROUND1(eax, ebx, ecx, edx, ebp, 25) +// ROUND1(ebp, eax, ebx, ecx, edx, 26) +// ROUND1(edx, ebp, eax, ebx, ecx, 27) +// ROUND1(ecx, edx, ebp, eax, ebx, 28) +// ROUND1(ebx, ecx, edx, ebp, eax, 29) +// ROUND1(eax, ebx, ecx, edx, ebp, 30) +// ROUND1(ebp, eax, ebx, ecx, edx, 31) +// ROUND1(edx, ebp, eax, ebx, ecx, 32) +// ROUND1(ecx, edx, ebp, eax, ebx, 33) +// ROUND1(ebx, ecx, edx, ebp, eax, 34) +// ROUND1(eax, ebx, ecx, edx, ebp, 35) +// ROUND1(ebp, eax, ebx, ecx, edx, 36) +// ROUND1(edx, ebp, eax, ebx, ecx, 37) +// ROUND1(ecx, edx, ebp, eax, ebx, 38) +// ROUND1(ebx, ecx, edx, ebp, eax, 39) +// ROUND2(eax, ebx, ecx, edx, ebp, 40) +// ROUND2(ebp, eax, ebx, ecx, edx, 41) +// ROUND2(edx, ebp, eax, ebx, ecx, 42) +// ROUND2(ecx, edx, ebp, eax, ebx, 43) +// ROUND2(ebx, ecx, edx, ebp, eax, 44) +// ROUND2(eax, ebx, ecx, edx, ebp, 45) +// ROUND2(ebp, eax, ebx, ecx, edx, 46) +// ROUND2(edx, ebp, eax, ebx, ecx, 47) +// ROUND2(ecx, edx, ebp, eax, ebx, 48) +// ROUND2(ebx, ecx, edx, ebp, eax, 49) +// ROUND2(eax, ebx, ecx, edx, ebp, 50) +// ROUND2(ebp, eax, ebx, ecx, edx, 51) +// ROUND2(edx, ebp, eax, ebx, ecx, 52) +// ROUND2(ecx, edx, ebp, eax, ebx, 53) +// ROUND2(ebx, ecx, edx, ebp, eax, 54) +// ROUND2(eax, ebx, ecx, edx, ebp, 55) +// ROUND2(ebp, eax, ebx, ecx, edx, 56) +// ROUND2(edx, ebp, eax, ebx, ecx, 57) +// ROUND2(ecx, edx, ebp, eax, ebx, 58) +// ROUND2(ebx, ecx, edx, ebp, eax, 59) +// ROUND3(eax, ebx, ecx, edx, ebp, 60) +// ROUND3(ebp, eax, ebx, ecx, edx, 61) +// ROUND3(edx, ebp, eax, ebx, ecx, 62) +// ROUND3(ecx, edx, ebp, eax, ebx, 63) +// ROUND3(ebx, ecx, edx, ebp, eax, 64) +// ROUND3(eax, ebx, ecx, edx, ebp, 65) +// ROUND3(ebp, eax, ebx, ecx, edx, 66) +// ROUND3(edx, ebp, eax, ebx, ecx, 67) +// ROUND3(ecx, edx, ebp, eax, ebx, 68) +// ROUND3(ebx, ecx, edx, ebp, eax, 69) +// ROUND3(eax, ebx, ecx, edx, ebp, 70) +// ROUND3(ebp, eax, ebx, ecx, edx, 71) +// ROUND3(edx, ebp, eax, ebx, ecx, 72) +// ROUND3(ecx, edx, ebp, eax, ebx, 73) +// ROUND3(ebx, ecx, edx, ebp, eax, 74) +// ROUND3(eax, ebx, ecx, edx, ebp, 75) +// ROUND3(ebp, eax, ebx, ecx, edx, 76) +// ROUND3(edx, ebp, eax, ebx, 
ecx, 77) +// ROUND3(ecx, edx, ebp, eax, ebx, 78) +// ROUND3(ebx, ecx, edx, ebp, eax, 79) +// +// /* Save updated state */ +// addl %eax, 0(%r8) +// addl %ebx, 4(%r8) +// addl %ecx, 8(%r8) +// addl %edx, 12(%r8) +// addl %ebp, 16(%r8) +// +// /* Restore registers */ +// movq %xmm0, %rbx +// movq %xmm1, %rbp +// addq $64, %rsp +// retq diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs index 38ddc4b51..616b58f37 100644 --- a/sha1/src/lib.rs +++ b/sha1/src/lib.rs @@ -63,11 +63,48 @@ use digest::{ HashMarker, Output, }; +#[cfg(all( +feature = "inline-asm", +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] +mod asm; + +#[cfg(not(all( +any(feature = "asm", feature = "inline-asm"), +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +)))] mod compress; #[cfg(feature = "compress")] +#[cfg(all( +feature = "inline-asm", +feature = "compress", +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] +pub use asm::compress; + +#[cfg(feature = "compress")] +#[cfg(all( +feature = "inline-asm", +not(feature = "compress"), +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] +use asm::compress; + +#[cfg(feature = "compress")] +#[cfg(all( +not(feature = "inline-asm"), +feature = "compress", +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] pub use compress::compress; -#[cfg(not(feature = "compress"))] + +#[cfg(feature = "compress")] +#[cfg(all( +not(feature = "inline-asm"), +not(feature = "compress"), +any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] use compress::compress; const STATE_LEN: usize = 5; From 64f975daa3e5214368a3c02e7dce258fb964f3c5 Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sat, 28 Jan 2023 23:46:54 -0500 Subject: [PATCH 3/8] better skeleton --- sha1/src/asm/aarch64.rs | 5 +++-- sha1/src/asm/mod.rs | 18 ++++++++++++++++++ sha1/src/asm/x86.rs | 1 - sha1/src/lib.rs | 32 ++++++++++++++++---------------- 4 files changed, 37 insertions(+), 19 deletions(-) diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs index c069f329e..9e6611086 100644 --- a/sha1/src/asm/aarch64.rs +++ b/sha1/src/asm/aarch64.rs @@ -1,3 +1,5 @@ +//! SHA-1 hash in AArch64 assembly, adapted from Emmanuel Gil Peyrot's MIT-licensed implementation +// // /* // * SHA-1 hash in AArch64 assembly // * @@ -19,8 +21,7 @@ // * out of or in connection with the Software or the use or other dealings in the // * Software. 
// */ -// -// + // /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ // .global sha1_compress // sha1_compress: diff --git a/sha1/src/asm/mod.rs b/sha1/src/asm/mod.rs index 056e15e14..164c8065d 100644 --- a/sha1/src/asm/mod.rs +++ b/sha1/src/asm/mod.rs @@ -5,3 +5,21 @@ // // #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] // pub use x86::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "x86",))] +mod x86; +#[cfg(all(feature = "inline-asm", target_arch = "x86",))] +pub use x86::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "x86_64",))] +mod x86_64; +#[cfg(all(feature = "inline-asm", target_arch = "x86_64",))] +pub use x86_64::compress; + +#[cfg(all(feature = "inline-asm", target_arch = "aarch64",))] +mod aarch64; +#[cfg(all(feature = "inline-asm", target_arch = "aarch64",))] +pub use aarch64::compress; + +// TODO(laudiacay) i don't know how to detect M1 +mod aarch64_apple; diff --git a/sha1/src/asm/x86.rs b/sha1/src/asm/x86.rs index c9ff140fe..2ab865649 100644 --- a/sha1/src/asm/x86.rs +++ b/sha1/src/asm/x86.rs @@ -21,7 +21,6 @@ // * out of or in connection with the Software or the use or other dealings in the // * Software. // */ - use core::arch::asm; use asm_block::asm_block; diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs index 616b58f37..9b7fd0a37 100644 --- a/sha1/src/lib.rs +++ b/sha1/src/lib.rs @@ -64,46 +64,46 @@ use digest::{ }; #[cfg(all( -feature = "inline-asm", -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + feature = "inline-asm", + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") ))] mod asm; #[cfg(not(all( -any(feature = "asm", feature = "inline-asm"), -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + any(feature = "asm", feature = "inline-asm"), + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") )))] mod compress; #[cfg(feature = "compress")] #[cfg(all( -feature = "inline-asm", -feature = "compress", -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + feature = "inline-asm", + feature = "compress", + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") ))] pub use asm::compress; #[cfg(feature = "compress")] #[cfg(all( -feature = "inline-asm", -not(feature = "compress"), -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + feature = "inline-asm", + not(feature = "compress"), + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") ))] use asm::compress; #[cfg(feature = "compress")] #[cfg(all( -not(feature = "inline-asm"), -feature = "compress", -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + not(feature = "inline-asm"), + feature = "compress", + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") ))] pub use compress::compress; #[cfg(feature = "compress")] #[cfg(all( -not(feature = "inline-asm"), -not(feature = "compress"), -any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") + not(feature = "inline-asm"), + not(feature = "compress"), + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") ))] use compress::compress; From 2391945b30270a964ffa10d8eb8495fa7967ecbc Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sun, 29 Jan 2023 01:21:26 -0500 Subject: [PATCH 4/8] need to figure out where to put macros in this next --- sha1/src/asm/aarch64.rs | 474 ++++++++++++++++++++++------------------ 1 file changed, 260 insertions(+), 214 
deletions(-) diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs index 9e6611086..5347a8e11 100644 --- a/sha1/src/asm/aarch64.rs +++ b/sha1/src/asm/aarch64.rs @@ -22,217 +22,263 @@ // * Software. // */ -// /* void sha1_compress(uint32_t state[5], const uint8_t block[64]) */ -// .global sha1_compress -// sha1_compress: -// /* -// * Storage usage: -// * Bytes Location Description -// * 4 x0 state argument -// * 4 x1 block argument -// * 16 q0 W0 -// * 16 q1 W1 -// * 16 q2 W2 -// * 16 q3 W3 -// * 16 q4 k -// * 16 q5 Original ABCD -// * 16 q6 ABCD (with s3 being A) -// * 4 s16 E -// * 4 s17 e0 -// * 4 s18 e1 -// * 16 q19 wk -// */ -// -// // Load state in registers -// ldr q5, [x0] -// ldr s16, [x0, 16] -// mov v6.16b, v5.16b -// -// // Load block in registers -// ldr q0, [x1] -// ldr q1, [x1, 16] -// ldr q2, [x1, 32] -// ldr q3, [x1, 48] -// -// // TODO: only do that on little endian -// rev32 v0.16b, v0.16b -// rev32 v1.16b, v1.16b -// rev32 v2.16b, v2.16b -// rev32 v3.16b, v3.16b -// -// // k for the next five rounds -// adrp x1, .K0 -// ldr q4, [x1, #:lo12:.K0] -// -// // 0 -// sha1h s18, s6 -// add v19.4s, v0.4s, v4.4s -// sha1c q6, s16, v19.4s -// sha1su0 v0.4s, v1.4s, v2.4s -// -// // 1 -// sha1h s17, s6 -// add v19.4s, v1.4s, v4.4s -// sha1c q6, s18, v19.4s -// sha1su1 v0.4s, v3.4s -// sha1su0 v1.4s, v2.4s, v3.4s -// -// // 2 -// sha1h s18, s6 -// add v19.4s, v2.4s, v4.4s -// sha1c q6, s17, v19.4s -// sha1su1 v1.4s, v0.4s -// sha1su0 v2.4s, v3.4s, v0.4s -// -// // 3 -// sha1h s17, s6 -// add v19.4s, v3.4s, v4.4s -// sha1c q6, s18, v19.4s -// sha1su1 v2.4s, v1.4s -// sha1su0 v3.4s, v0.4s, v1.4s -// -// // 4 -// sha1h s18, s6 -// add v19.4s, v0.4s, v4.4s -// sha1c q6, s17, v19.4s -// sha1su1 v3.4s, v2.4s -// sha1su0 v0.4s, v1.4s, v2.4s -// -// // k for the next five rounds -// adrp x1, .K1 -// ldr q4, [x1, #:lo12:.K1] -// -// // 5 -// sha1h s17, s6 -// add v19.4s, v1.4s, v4.4s -// sha1p q6, s18, v19.4s -// sha1su1 v0.4s, v3.4s -// sha1su0 v1.4s, v2.4s, v3.4s -// -// // 6 -// sha1h s18, s6 -// add v19.4s, v2.4s, v4.4s -// sha1p q6, s17, v19.4s -// sha1su1 v1.4s, v0.4s -// sha1su0 v2.4s, v3.4s, v0.4s -// -// // 7 -// sha1h s17, s6 -// add v19.4s, v3.4s, v4.4s -// sha1p q6, s18, v19.4s -// sha1su1 v2.4s, v1.4s -// sha1su0 v3.4s, v0.4s, v1.4s -// -// // 8 -// sha1h s18, s6 -// add v19.4s, v0.4s, v4.4s -// sha1p q6, s17, v19.4s -// sha1su1 v3.4s, v2.4s -// sha1su0 v0.4s, v1.4s, v2.4s -// -// // 9 -// sha1h s17, s6 -// add v19.4s, v1.4s, v4.4s -// sha1p q6, s18, v19.4s -// sha1su1 v0.4s, v3.4s -// sha1su0 v1.4s, v2.4s, v3.4s -// -// // k for the next five rounds -// adrp x1, .K2 -// ldr q4, [x1, #:lo12:.K2] -// -// // 10 -// sha1h s18, s6 -// add v19.4s, v2.4s, v4.4s -// sha1m q6, s17, v19.4s -// sha1su1 v1.4s, v0.4s -// sha1su0 v2.4s, v3.4s, v0.4s -// -// // 11 -// sha1h s17, s6 -// add v19.4s, v3.4s, v4.4s -// sha1m q6, s18, v19.4s -// sha1su1 v2.4s, v1.4s -// sha1su0 v3.4s, v0.4s, v1.4s -// -// // 12 -// sha1h s18, s6 -// add v19.4s, v0.4s, v4.4s -// sha1m q6, s17, v19.4s -// sha1su1 v3.4s, v2.4s -// sha1su0 v0.4s, v1.4s, v2.4s -// -// // 13 -// sha1h s17, s6 -// add v19.4s, v1.4s, v4.4s -// sha1m q6, s18, v19.4s -// sha1su1 v0.4s, v3.4s -// sha1su0 v1.4s, v2.4s, v3.4s -// -// // 14 -// sha1h s18, s6 -// add v19.4s, v2.4s, v4.4s -// sha1m q6, s17, v19.4s -// sha1su1 v1.4s, v0.4s -// sha1su0 v2.4s, v3.4s, v0.4s -// -// // k for the next five rounds -// adrp x1, .K3 -// ldr q4, [x1, #:lo12:.K3] -// -// // 15 -// sha1h s17, s6 -// add v19.4s, v3.4s, v4.4s -// sha1p q6, s18, v19.4s 
-// sha1su1 v2.4s, v1.4s -// sha1su0 v3.4s, v0.4s, v1.4s -// -// // 16 -// sha1h s18, s6 -// add v19.4s, v0.4s, v4.4s -// sha1p q6, s17, v19.4s -// sha1su1 v3.4s, v2.4s -// -// // 17 -// sha1h s17, s6 -// add v19.4s, v1.4s, v4.4s -// sha1p q6, s18, v19.4s -// -// // 18 -// sha1h s18, s6 -// add v19.4s, v2.4s, v4.4s -// sha1p q6, s17, v19.4s -// -// // 19 -// sha1h s17, s6 -// add v19.4s, v3.4s, v4.4s -// sha1p q6, s18, v19.4s -// -// // Update state -// add v6.4s, v6.4s, v5.4s -// str q6, [x0] -// add v16.2s, v16.2s, v17.2s -// str s16, [x0, 16] -// -// ret -// .align 4 -// .K0: -// .word 0x5A827999 -// .word 0x5A827999 -// .word 0x5A827999 -// .word 0x5A827999 -// .K1: -// .word 0x6ED9EBA1 -// .word 0x6ED9EBA1 -// .word 0x6ED9EBA1 -// .word 0x6ED9EBA1 -// .K2: -// .word 0x8F1BBCDC -// .word 0x8F1BBCDC -// .word 0x8F1BBCDC -// .word 0x8F1BBCDC -// .K3: -// .word 0xCA62C1D6 -// .word 0xCA62C1D6 -// .word 0xCA62C1D6 -// .word 0xCA62C1D6 +use core::arch::asm; + +// macro_rules! sha_1_through_4 { +// (F, $a: tt, $b: tt, $c: tt, $d: tt, $k: tt, $s: literal, $t: literal, $tmp1: tt, $tmp2: tt) => { + +/// SHA1 compress function. We don't have enough registers to load the whole block, +/// so we need to use memory address to refer to the inputs. Due to possible failure +/// of register allocation on `x86`, we explicitly specify registers to use. +#[cfg(all(feature = "inline-asm", target_arch = "aarch64"))] +pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { + // SAFETY: inline-assembly + unsafe { + asm!( + // define the SHA1 constants TODO (laudiacay) does it make sense for these to be up here? does it cause alignment issues? + ".K0:", + ".word 0x5A827999", + ".word 0x5A827999", + ".word 0x5A827999", + ".word 0x5A827999", + ".K1:", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".K2:", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".K3:", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + + // from original code, some docs :) + // /* + // * Storage usage: + // * Bytes Location Description + // * 4 x0 state argument + // * 4 x1 block argument + // * 16 q0 W0 + // * 16 q1 W1 + // * 16 q2 W2 + // * 16 q3 W3 + // * 16 q4 k + // * 16 q5 Original ABCD + // * 16 q6 ABCD (with s3 being A) + // * 4 s16 E + // * 4 s17 e0 + // * 4 s18 e1 + // * 16 q19 wk + // */ + + // Load state in registers + // original code: + // ldr q5, [x0] + // ldr s16, [x0, 16] + // mov v6.16b, v5.16b + in(q5) state[0..4], + in(s16) state[4], + "mov v6.16b, v5.16b", + + // Load block in registers + // original code: + // ldr q0, [x1] + // ldr q1, [x1, 16] + // ldr q2, [x1, 32] + // ldr q3, [x1, 48] + in(q0) blocks[0][0..16], + in(q1) blocks[0][16..32], + in(q2) blocks[0][32..48], + in(q3) blocks[0][48..64], + + // from original code: TODO: only do that on little endian + "rev32 v0.16b, v0.16b", + "rev32 v1.16b, v1.16b", + "rev32 v2.16b, v2.16b", + "rev32 v3.16b, v3.16b", + + // k for the next five rounds + "adrp x1, .K0" + "ldr q4, [x1, #:lo12:.K0]" + + // // 0 + // sha1h s18, s6 + // add v19.4s, v0.4s, v4.4s + // sha1c q6, s16, v19.4s + // sha1su0 v0.4s, v1.4s, v2.4s + + // // 1 + // sha1h s17, s6 + // add v19.4s, v1.4s, v4.4s + // sha1c q6, s18, v19.4s + // sha1su1 v0.4s, v3.4s + // sha1su0 v1.4s, v2.4s, v3.4s + // + // // 2 + // sha1h s18, s6 + // add v19.4s, v2.4s, v4.4s + // sha1c q6, s17, v19.4s + // sha1su1 v1.4s, v0.4s + // sha1su0 v2.4s, v3.4s, v0.4s + // + // // 3 + // sha1h s17, s6 + // 
add v19.4s, v3.4s, v4.4s + // sha1c q6, s18, v19.4s + // sha1su1 v2.4s, v1.4s + // sha1su0 v3.4s, v0.4s, v1.4s + // + // // 4 + // sha1h s18, s6 + // add v19.4s, v0.4s, v4.4s + // sha1c q6, s17, v19.4s + // sha1su1 v3.4s, v2.4s + // sha1su0 v0.4s, v1.4s, v2.4s + // + // // k for the next five rounds + // adrp x1, .K1 + // ldr q4, [x1, #:lo12:.K1] + // + // // 5 + // sha1h s17, s6 + // add v19.4s, v1.4s, v4.4s + // sha1p q6, s18, v19.4s + // sha1su1 v0.4s, v3.4s + // sha1su0 v1.4s, v2.4s, v3.4s + // + // // 6 + // sha1h s18, s6 + // add v19.4s, v2.4s, v4.4s + // sha1p q6, s17, v19.4s + // sha1su1 v1.4s, v0.4s + // sha1su0 v2.4s, v3.4s, v0.4s + // + // // 7 + // sha1h s17, s6 + // add v19.4s, v3.4s, v4.4s + // sha1p q6, s18, v19.4s + // sha1su1 v2.4s, v1.4s + // sha1su0 v3.4s, v0.4s, v1.4s + // + // // 8 + // sha1h s18, s6 + // add v19.4s, v0.4s, v4.4s + // sha1p q6, s17, v19.4s + // sha1su1 v3.4s, v2.4s + // sha1su0 v0.4s, v1.4s, v2.4s + // + // // 9 + // sha1h s17, s6 + // add v19.4s, v1.4s, v4.4s + // sha1p q6, s18, v19.4s + // sha1su1 v0.4s, v3.4s + // sha1su0 v1.4s, v2.4s, v3.4s + // + // // k for the next five rounds + // adrp x1, .K2 + // ldr q4, [x1, #:lo12:.K2] + // + // // 10 + // sha1h s18, s6 + // add v19.4s, v2.4s, v4.4s + // sha1m q6, s17, v19.4s + // sha1su1 v1.4s, v0.4s + // sha1su0 v2.4s, v3.4s, v0.4s + // + // // 11 + // sha1h s17, s6 + // add v19.4s, v3.4s, v4.4s + // sha1m q6, s18, v19.4s + // sha1su1 v2.4s, v1.4s + // sha1su0 v3.4s, v0.4s, v1.4s + // + // // 12 + // sha1h s18, s6 + // add v19.4s, v0.4s, v4.4s + // sha1m q6, s17, v19.4s + // sha1su1 v3.4s, v2.4s + // sha1su0 v0.4s, v1.4s, v2.4s + // + // // 13 + // sha1h s17, s6 + // add v19.4s, v1.4s, v4.4s + // sha1m q6, s18, v19.4s + // sha1su1 v0.4s, v3.4s + // sha1su0 v1.4s, v2.4s, v3.4s + // + // // 14 + // sha1h s18, s6 + // add v19.4s, v2.4s, v4.4s + // sha1m q6, s17, v19.4s + // sha1su1 v1.4s, v0.4s + // sha1su0 v2.4s, v3.4s, v0.4s + // + // // k for the next five rounds + // adrp x1, .K3 + // ldr q4, [x1, #:lo12:.K3] + // + // // 15 + // sha1h s17, s6 + // add v19.4s, v3.4s, v4.4s + // sha1p q6, s18, v19.4s + // sha1su1 v2.4s, v1.4s + // sha1su0 v3.4s, v0.4s, v1.4s + // + // // 16 + // sha1h s18, s6 + // add v19.4s, v0.4s, v4.4s + // sha1p q6, s17, v19.4s + // sha1su1 v3.4s, v2.4s + // + // // 17 + // sha1h s17, s6 + // add v19.4s, v1.4s, v4.4s + // sha1p q6, s18, v19.4s + // + // // 18 + // sha1h s18, s6 + // add v19.4s, v2.4s, v4.4s + // sha1p q6, s17, v19.4s + // + // // 19 + // sha1h s17, s6 + // add v19.4s, v3.4s, v4.4s + // sha1p q6, s18, v19.4s + // + // // Update state + // add v6.4s, v6.4s, v5.4s + // str q6, [x0] + // add v16.2s, v16.2s, v17.2s + // str s16, [x0, 16] + // + // ret + // .align 4 + // .K0: + // .word 0x5A827999 + // .word 0x5A827999 + // .word 0x5A827999 + // .word 0x5A827999 + // .K1: + // .word 0x6ED9EBA1 + // .word 0x6ED9EBA1 + // .word 0x6ED9EBA1 + // .word 0x6ED9EBA1 + // .K2: + // .word 0x8F1BBCDC + // .word 0x8F1BBCDC + // .word 0x8F1BBCDC + // .word 0x8F1BBCDC + // .K3: + // .word 0xCA62C1D6 + // .word 0xCA62C1D6 + // .word 0xCA62C1D6 + // .word 0xCA62C1D6 + + ); + }; +} From 28c3a33a20a125f3305d7111605ae041bf2cc750 Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sun, 29 Jan 2023 12:39:31 -0500 Subject: [PATCH 5/8] are we passing --- sha1/src/asm/aarch64.rs | 376 +++++++++++++++++++--------------------- sha1/src/compress.rs | 8 + sha1/src/lib.rs | 28 +-- 3 files changed, 187 insertions(+), 225 deletions(-) diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs index 
5347a8e11..675144b6a 100644 --- a/sha1/src/asm/aarch64.rs +++ b/sha1/src/asm/aarch64.rs @@ -21,7 +21,6 @@ // * out of or in connection with the Software or the use or other dealings in the // * Software. // */ - use core::arch::asm; // macro_rules! sha_1_through_4 { @@ -35,28 +34,6 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { // SAFETY: inline-assembly unsafe { asm!( - // define the SHA1 constants TODO (laudiacay) does it make sense for these to be up here? does it cause alignment issues? - ".K0:", - ".word 0x5A827999", - ".word 0x5A827999", - ".word 0x5A827999", - ".word 0x5A827999", - ".K1:", - ".word 0x6ED9EBA1", - ".word 0x6ED9EBA1", - ".word 0x6ED9EBA1", - ".word 0x6ED9EBA1", - ".K2:", - ".word 0x8F1BBCDC", - ".word 0x8F1BBCDC", - ".word 0x8F1BBCDC", - ".word 0x8F1BBCDC", - ".K3:", - ".word 0xCA62C1D6", - ".word 0xCA62C1D6", - ".word 0xCA62C1D6", - ".word 0xCA62C1D6", - // from original code, some docs :) // /* // * Storage usage: @@ -103,181 +80,184 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { "rev32 v3.16b, v3.16b", // k for the next five rounds - "adrp x1, .K0" - "ldr q4, [x1, #:lo12:.K0]" - - // // 0 - // sha1h s18, s6 - // add v19.4s, v0.4s, v4.4s - // sha1c q6, s16, v19.4s - // sha1su0 v0.4s, v1.4s, v2.4s - - // // 1 - // sha1h s17, s6 - // add v19.4s, v1.4s, v4.4s - // sha1c q6, s18, v19.4s - // sha1su1 v0.4s, v3.4s - // sha1su0 v1.4s, v2.4s, v3.4s - // - // // 2 - // sha1h s18, s6 - // add v19.4s, v2.4s, v4.4s - // sha1c q6, s17, v19.4s - // sha1su1 v1.4s, v0.4s - // sha1su0 v2.4s, v3.4s, v0.4s - // - // // 3 - // sha1h s17, s6 - // add v19.4s, v3.4s, v4.4s - // sha1c q6, s18, v19.4s - // sha1su1 v2.4s, v1.4s - // sha1su0 v3.4s, v0.4s, v1.4s - // - // // 4 - // sha1h s18, s6 - // add v19.4s, v0.4s, v4.4s - // sha1c q6, s17, v19.4s - // sha1su1 v3.4s, v2.4s - // sha1su0 v0.4s, v1.4s, v2.4s - // - // // k for the next five rounds - // adrp x1, .K1 - // ldr q4, [x1, #:lo12:.K1] - // - // // 5 - // sha1h s17, s6 - // add v19.4s, v1.4s, v4.4s - // sha1p q6, s18, v19.4s - // sha1su1 v0.4s, v3.4s - // sha1su0 v1.4s, v2.4s, v3.4s - // - // // 6 - // sha1h s18, s6 - // add v19.4s, v2.4s, v4.4s - // sha1p q6, s17, v19.4s - // sha1su1 v1.4s, v0.4s - // sha1su0 v2.4s, v3.4s, v0.4s - // - // // 7 - // sha1h s17, s6 - // add v19.4s, v3.4s, v4.4s - // sha1p q6, s18, v19.4s - // sha1su1 v2.4s, v1.4s - // sha1su0 v3.4s, v0.4s, v1.4s - // - // // 8 - // sha1h s18, s6 - // add v19.4s, v0.4s, v4.4s - // sha1p q6, s17, v19.4s - // sha1su1 v3.4s, v2.4s - // sha1su0 v0.4s, v1.4s, v2.4s - // - // // 9 - // sha1h s17, s6 - // add v19.4s, v1.4s, v4.4s - // sha1p q6, s18, v19.4s - // sha1su1 v0.4s, v3.4s - // sha1su0 v1.4s, v2.4s, v3.4s - // - // // k for the next five rounds - // adrp x1, .K2 - // ldr q4, [x1, #:lo12:.K2] - // - // // 10 - // sha1h s18, s6 - // add v19.4s, v2.4s, v4.4s - // sha1m q6, s17, v19.4s - // sha1su1 v1.4s, v0.4s - // sha1su0 v2.4s, v3.4s, v0.4s - // - // // 11 - // sha1h s17, s6 - // add v19.4s, v3.4s, v4.4s - // sha1m q6, s18, v19.4s - // sha1su1 v2.4s, v1.4s - // sha1su0 v3.4s, v0.4s, v1.4s - // - // // 12 - // sha1h s18, s6 - // add v19.4s, v0.4s, v4.4s - // sha1m q6, s17, v19.4s - // sha1su1 v3.4s, v2.4s - // sha1su0 v0.4s, v1.4s, v2.4s - // - // // 13 - // sha1h s17, s6 - // add v19.4s, v1.4s, v4.4s - // sha1m q6, s18, v19.4s - // sha1su1 v0.4s, v3.4s - // sha1su0 v1.4s, v2.4s, v3.4s - // - // // 14 - // sha1h s18, s6 - // add v19.4s, v2.4s, v4.4s - // sha1m q6, s17, v19.4s - // sha1su1 v1.4s, v0.4s - // sha1su0 v2.4s, v3.4s, v0.4s - 
// - // // k for the next five rounds - // adrp x1, .K3 - // ldr q4, [x1, #:lo12:.K3] - // - // // 15 - // sha1h s17, s6 - // add v19.4s, v3.4s, v4.4s - // sha1p q6, s18, v19.4s - // sha1su1 v2.4s, v1.4s - // sha1su0 v3.4s, v0.4s, v1.4s - // - // // 16 - // sha1h s18, s6 - // add v19.4s, v0.4s, v4.4s - // sha1p q6, s17, v19.4s - // sha1su1 v3.4s, v2.4s - // - // // 17 - // sha1h s17, s6 - // add v19.4s, v1.4s, v4.4s - // sha1p q6, s18, v19.4s - // - // // 18 - // sha1h s18, s6 - // add v19.4s, v2.4s, v4.4s - // sha1p q6, s17, v19.4s - // - // // 19 - // sha1h s17, s6 - // add v19.4s, v3.4s, v4.4s - // sha1p q6, s18, v19.4s - // - // // Update state - // add v6.4s, v6.4s, v5.4s - // str q6, [x0] - // add v16.2s, v16.2s, v17.2s - // str s16, [x0, 16] - // - // ret - // .align 4 - // .K0: - // .word 0x5A827999 - // .word 0x5A827999 - // .word 0x5A827999 - // .word 0x5A827999 - // .K1: - // .word 0x6ED9EBA1 - // .word 0x6ED9EBA1 - // .word 0x6ED9EBA1 - // .word 0x6ED9EBA1 - // .K2: - // .word 0x8F1BBCDC - // .word 0x8F1BBCDC - // .word 0x8F1BBCDC - // .word 0x8F1BBCDC - // .K3: - // .word 0xCA62C1D6 - // .word 0xCA62C1D6 - // .word 0xCA62C1D6 - // .word 0xCA62C1D6 + "adrp x1, .K0", + "ldr q4, [x1, #:lo12:.K0]", + + // 0 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1c q6, s16, v19.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 1 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1c q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 2 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1c q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 3 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1c q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 4 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1c q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // k for the next five rounds + "adrp x1, .K1", + "ldr q4, [x1, #:lo12:.K1]", + + // 5 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 6 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 7 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 8 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 9 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // k for the next five rounds + "adrp x1, .K2", + "ldr q4, [x1, #:lo12:.K2]", + + // 10 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // 11 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1m q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 12 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + "sha1su0 v0.4s, v1.4s, v2.4s", + + // 13 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1m q6, s18, v19.4s", + "sha1su1 v0.4s, v3.4s", + "sha1su0 v1.4s, v2.4s, v3.4s", + + // 14 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1m q6, s17, v19.4s", + "sha1su1 v1.4s, v0.4s", + "sha1su0 v2.4s, v3.4s, v0.4s", + + // k for the next 
five rounds + "adrp x1, .K3", + "ldr q4, [x1, #:lo12:.K3]", + + // 15 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + "sha1su1 v2.4s, v1.4s", + "sha1su0 v3.4s, v0.4s, v1.4s", + + // 16 + "sha1h s18, s6", + "add v19.4s, v0.4s, v4.4s", + "sha1p q6, s17, v19.4s", + "sha1su1 v3.4s, v2.4s", + + // 17 + "sha1h s17, s6", + "add v19.4s, v1.4s, v4.4s", + "sha1p q6, s18, v19.4s", + + // 18 + "sha1h s18, s6", + "add v19.4s, v2.4s, v4.4s", + "sha1p q6, s17, v19.4s", + + // 19 + "sha1h s17, s6", + "add v19.4s, v3.4s, v4.4s", + "sha1p q6, s18, v19.4s", + + // Update state + "add v6.4s, v6.4s, v5.4s", + // source code: str q6, [x0] + out(q6) state[0..4], + "add v16.2s, v16.2s, v17.2s", + // source code: str s16, [x0, 16] + out(s16) state[4], + + "ret", // TODO is this right + + ".align 4", // TODO ummm alignment... + ".K0:", // TODO are labels just the same in inline asm in rust? + ".word 0x5A827999" + ".word 0x5A827999", + ".word 0x5A827999", + ".word 0x5A827999", + ".K1:", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".word 0x6ED9EBA1", + ".K2:", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".word 0x8F1BBCDC", + ".K3:", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", + ".word 0xCA62C1D6", ); }; diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs index da4a10a98..c80650620 100644 --- a/sha1/src/compress.rs +++ b/sha1/src/compress.rs @@ -5,6 +5,14 @@ cfg_if::cfg_if! { if #[cfg(feature = "force-soft")] { mod soft; use soft::compress as compress_inner; + } else if #[cfg(feature = "inline-asm")] { + mod asm; + #[cfg(all(feature = "inline-asm", target_arch = "x86"))] + use asm::x86::compress as compress_inner; + #[cfg(all(feature = "inline-asm", target_arch = "x86_64"))] + use asm::x86_64::compress as compress_inner; + #[cfg(all(feature = "inline-asm", target_arch = "aarch64"))] + use asm::aarch64::compress as compress_inner; } else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] { mod soft; mod aarch64; diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs index 9b7fd0a37..c37227b8e 100644 --- a/sha1/src/lib.rs +++ b/sha1/src/lib.rs @@ -76,35 +76,9 @@ mod asm; mod compress; #[cfg(feature = "compress")] -#[cfg(all( - feature = "inline-asm", - feature = "compress", - any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") -))] -pub use asm::compress; - -#[cfg(feature = "compress")] -#[cfg(all( - feature = "inline-asm", - not(feature = "compress"), - any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") -))] -use asm::compress; - -#[cfg(feature = "compress")] -#[cfg(all( - not(feature = "inline-asm"), - feature = "compress", - any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") -))] pub use compress::compress; -#[cfg(feature = "compress")] -#[cfg(all( - not(feature = "inline-asm"), - not(feature = "compress"), - any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") -))] +#[cfg(not(feature = "compress"))] use compress::compress; const STATE_LEN: usize = 5; From e65392db2bab274a02396bdf5f3d232a237e2f41 Mon Sep 17 00:00:00 2001 From: Claudia Richoux Date: Sun, 29 Jan 2023 12:43:03 -0500 Subject: [PATCH 6/8] we love to change the target --- .github/workflows/sha1.yml | 6 ++++++ Cargo.lock | 7 +++++++ sha1/src/compress.rs | 7 +------ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.github/workflows/sha1.yml b/.github/workflows/sha1.yml index 2228af5d5..2eb5c4312 100644 --- a/.github/workflows/sha1.yml +++ 
@@ -189,10 +189,16 @@ jobs:
 - run: cargo test
 
 # TODO: merge with test on MSRV bump to 1.59 or higher
+ # TODO: do i need to think about no-std platforms here?
 test-inline-asm:
 runs-on: ubuntu-latest
 strategy:
 matrix:
+ target:
+ - aarch64-unknown-linux-gnu
+ - x86_64-unknown-linux-gnu
+ - i686-unknown-linux-gnu
+ # TODO - aarch64-apple-darwin
 rust:
 - 1.59.0 # MSRV
 steps:
diff --git a/Cargo.lock b/Cargo.lock
index d851c17b4..66c3279ff 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "asm_block"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "466c0990cf15ef0f331f19fdc16fd60229606ab237476c70a66b747fce2911ab"
+
 [[package]]
 name = "blake2"
 version = "0.10.6"
@@ -204,6 +210,7 @@ dependencies = [
 name = "sha1"
 version = "0.10.5"
 dependencies = [
+ "asm_block",
 "cfg-if",
 "cpufeatures",
 "digest",
diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
index c80650620..d43e509d6 100644
--- a/sha1/src/compress.rs
+++ b/sha1/src/compress.rs
@@ -7,12 +7,7 @@ cfg_if::cfg_if! {
 use soft::compress as compress_inner;
 } else if #[cfg(feature = "inline-asm")] {
 mod asm;
- #[cfg(all(feature = "inline-asm", target_arch = "x86"))]
- use asm::x86::compress as compress_inner;
- #[cfg(all(feature = "inline-asm", target_arch = "x86_64"))]
- use asm::x86_64::compress as compress_inner;
- #[cfg(all(feature = "inline-asm", target_arch = "aarch64"))]
- use asm::aarch64::compress as compress_inner;
+ use asm::compress as compress_inner;
 } else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] {
 mod soft;
 mod aarch64;

From 8b8e56c9aa15f0f4a5d724d9a2ec8caea125d11e Mon Sep 17 00:00:00 2001
From: Claudia Richoux
Date: Sun, 29 Jan 2023 14:00:06 -0500
Subject: [PATCH 7/8] unsure how to get the state out of the inline asm

---
 sha1/src/asm/aarch64.rs | 35 +++++++++++++++++++++++++----------
 sha1/src/compress.rs    |  3 +--
 sha1/src/lib.rs         |  4 ----
 3 files changed, 26 insertions(+), 16 deletions(-)
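(A note on the operand problem this patch is wrestling with: `asm!` cannot
bind a slice like `state[0..4]` to a register, and `q4`/`s16` are not valid
operand names anyway; on aarch64 the explicit vector registers are spelled
`v0`-`v31`, and pointers have to travel in general-purpose registers. A
minimal sketch of one shape that does work, with all names hypothetical and
the 80 rounds elided:

    use core::arch::asm;

    fn compress_block(state: &mut [u32; 5], block: &[u8; 64]) {
        unsafe {
            asm!(
                "ldr q5, [{st}]",      // a, b, c, d
                "ldr s16, [{st}, 16]", // e
                "ldr q0, [{blk}]",     // first 16 bytes of the block
                // ... the rounds go here ...
                "str q5, [{st}]",      // write the updated state back
                "str s16, [{st}, 16]",
                st = in(reg) state.as_mut_ptr(),
                blk = in(reg) block.as_ptr(),
                out("v0") _, out("v5") _, out("v16") _, // clobbered vectors
                options(nostack),
            );
        }
    }

The trailing `ret` should not survive the port: `asm!` expands inside the
function body, so the return is the compiler's job. Named labels like `.K0:`
are also risky in `asm!`, since they collide if the block is duplicated by
inlining; loading each constant with `mov`/`movk` avoids labels entirely.)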
diff --git a/sha1/src/asm/aarch64.rs b/sha1/src/asm/aarch64.rs
index 675144b6a..c5fb97351 100644
--- a/sha1/src/asm/aarch64.rs
+++ b/sha1/src/asm/aarch64.rs
@@ -31,6 +31,7 @@ use core::arch::asm;
 /// of register allocation on `x86`, we explicitly specify registers to use.
 #[cfg(all(feature = "inline-asm", target_arch = "aarch64"))]
 pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    let mut out_state = [0u32; 5];
     // SAFETY: inline-assembly
     unsafe {
         asm!(
@@ -57,9 +58,10 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
 // original code:
 // ldr q5, [x0]
 // ldr s16, [x0, 16]
- // mov v6.16b, v5.16b
- in(q5) state[0..4],
- in(s16) state[4],
+ // this now happens at the bottom...
+ // TODO what is this doing?
+ // i believe it's copying state[0..4] into v6 (which is also q6)
+ // confirmed this is the mutable copy of the first 4 words of the state
 "mov v6.16b, v5.16b",
 
 // Load block in registers
@@ -68,12 +70,10 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
 // ldr q1, [x1, 16]
 // ldr q2, [x1, 32]
 // ldr q3, [x1, 48]
- in(q0) blocks[0][0..16],
- in(q1) blocks[0][16..32],
- in(q2) blocks[0][32..48],
- in(q3) blocks[0][48..64],
+ // this is at the bottom now
 
 // from original code: TODO: only do that on little endian
+ // byte-swaps each 32-bit word: the message words are big-endian, the CPU is little-endian
 "rev32 v0.16b, v0.16b",
 "rev32 v1.16b, v1.16b",
 "rev32 v2.16b, v2.16b",
 "rev32 v3.16b, v3.16b",
@@ -230,16 +230,16 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
 // Update state
 "add v6.4s, v6.4s, v5.4s",
 // source code: str q6, [x0]
- out(q6) state[0..4],
+ // this now happens at the bottom
 "add v16.2s, v16.2s, v17.2s",
 // source code: str s16, [x0, 16]
- out(s16) state[4],
+ // this now happens at the bottom
 
 "ret", // TODO is this right
 
 ".align 4", // TODO ummm alignment...
 ".K0:", // TODO are labels just the same in inline asm in rust?
- ".word 0x5A827999"
+ ".word 0x5A827999",
 ".word 0x5A827999",
 ".word 0x5A827999",
 ".word 0x5A827999",
@@ -259,6 +259,21 @@ pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
 ".word 0xCA62C1D6",
 ".word 0xCA62C1D6",
 
+ // state ins and outs
+ in("q4") state.as_mut_ptr(),
+ inout("s16") state[4],
+ lateout("q6") state as *mut u32,
+ // blocks in
+ in("q0") blocks[0][0..16].as_ptr(),
+ in("q1") blocks[0][16..32].as_ptr(),
+ in("q2") blocks[0][32..48].as_ptr(),
+ in("q3") blocks[0][48..64].as_ptr(),
+ // some clobbers
+ out("q5") _,
+ out("s17") _,
+ out("s18") _,
+ out("q19") _,
+ // TODO make sure there aren't any other clobbers
 );
 };
 }
diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
index d43e509d6..2e80ee090 100644
--- a/sha1/src/compress.rs
+++ b/sha1/src/compress.rs
@@ -6,8 +6,7 @@ cfg_if::cfg_if! {
 mod soft;
 use soft::compress as compress_inner;
 } else if #[cfg(feature = "inline-asm")] {
- mod asm;
- use asm::compress as compress_inner;
+ use crate::asm::compress as compress_inner;
 } else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] {
 mod soft;
 mod aarch64;
diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs
index c37227b8e..8a003d2d9 100644
--- a/sha1/src/lib.rs
+++ b/sha1/src/lib.rs
@@ -69,10 +69,6 @@ use digest::{
 ))]
 mod asm;
 
-#[cfg(not(all(
- any(feature = "asm", feature = "inline-asm"),
- any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")
-)))]
 mod compress;
 
 #[cfg(feature = "compress")]

From c14975b8e3d4ca63cc7ce6239bb940439eb9b6de Mon Sep 17 00:00:00 2001
From: Claudia Richoux
Date: Sun, 29 Jan 2023 14:41:52 -0500
Subject: [PATCH 8/8] starting x86

---
 sha1/src/asm/x86.rs    | 169 ++++++++++++++++++++++++++++++++++-------
 sha1/src/asm/x86_64.rs |  34 ++++-----
 2 files changed, 160 insertions(+), 43 deletions(-)
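(Two syntax hazards to sort out while transcribing these .S macros: Rust's
`asm!` assembles x86 as Intel syntax by default, so AT&T-flavored bodies
like the ones below will presumably need `options(att_syntax)` on the final
`asm!` call. Also, `$` is reserved for metavariables inside `macro_rules!`
bodies, so AT&T immediates such as `roll $1, %esi` cannot appear literally
in a transcriber; they need whatever escaping asm_block provides, or a flip
to Intel operand order, where the sigils disappear altogether, e.g. the
hypothetical equivalent:

    rol esi, 1
)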
diff --git a/sha1/src/asm/x86.rs b/sha1/src/asm/x86.rs
index 2ab865649..f7b24e839 100644
--- a/sha1/src/asm/x86.rs
+++ b/sha1/src/asm/x86.rs
@@ -51,8 +51,9 @@ use asm_block::asm_block;
 // * 4 [esp+72] Caller's value of edi
 // * 4 [esp+76] Caller's value of ebp
 // */
-//
-// #define ROUND0a(a, b, c, d, e, i) \
+
+
+// #define round0a(a, b, c, d, e, i) \
 // movl (i*4)(%edi), %esi; \
 // bswapl %esi; \
 // movl %esi, (i*4)(%esp); \
 // addl %esi, %e; \
 // movl %c, %esi; \
 // xorl %d, %esi; \
 // andl %b, %esi; \
 // xorl %d, %esi; \
 // ROUNDTAIL(a, b, e, i, 0x5A827999)

macro_rules! round0a {
    ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => {
        concat! {
            asm_block! {
                movl ($i*4)(%edi), %esi;
                bswapl %esi;
                movl %esi, ($i*4)(%esp);
                addl %esi, $e;
                movl $c, %esi;
                xorl $d, %esi;
                andl $b, %esi;
                xorl $d, %esi;
                roundtail!($a, $b, $e, $i, 0x5A827999);
            }
        }
    };
}

// #define SCHEDULE(i, e) \
// movl (((i- 3)&0xF)*4)(%esp), %esi; \
// xorl (((i- 8)&0xF)*4)(%esp), %esi; \
// xorl (((i-14)&0xF)*4)(%esp), %esi; \
// xorl (((i-16)&0xF)*4)(%esp), %esi; \
// roll $1, %esi; \
// addl %esi, %e; \
// movl %esi, ((i&0xF)*4)(%esp);

macro_rules! schedule {
    ($i:tt, $e:tt) => {
        concat! {
            asm_block! {
                movl ((($i- 3)&0xF)*4)(%esp), %esi;
                xorl ((($i- 8)&0xF)*4)(%esp), %esi;
                xorl ((($i-14)&0xF)*4)(%esp), %esi;
                xorl ((($i-16)&0xF)*4)(%esp), %esi;
                roll $1, %esi;
                addl %esi, $e;
                movl %esi, (($i&0xF)*4)(%esp);
            }
        }
    };
}

// #define ROUND0b(a, b, c, d, e, i) \
// SCHEDULE(i, e) \
// movl %c, %esi; \
// xorl %d, %esi; \
// andl %b, %esi; \
// xorl %d, %esi; \
// ROUNDTAIL(a, b, e, i, 0x5A827999)

macro_rules! round0b {
    ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => {
        concat! {
            asm_block! {
                schedule!($i, $e);
                movl $c, %esi;
                xorl $d, %esi;
                andl $b, %esi;
                xorl $d, %esi;
                roundtail!($a, $b, $e, $i, 0x5A827999);
            }
        }
    };
}

// #define ROUND1(a, b, c, d, e, i) \
// SCHEDULE(i, e) \
// movl %b, %esi; \
// xorl %c, %esi; \
// xorl %d, %esi; \
// ROUNDTAIL(a, b, e, i, 0x6ED9EBA1)

macro_rules! round1 {
    ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => {
        concat! {
            asm_block! {
                schedule!($i, $e);
                movl $b, %esi;
                xorl $c, %esi;
                xorl $d, %esi;
                roundtail!($a, $b, $e, $i, 0x6ED9EBA1);
            }
        }
    };
}

// #define ROUND2(a, b, c, d, e, i) \
// SCHEDULE(i, e) \
// movl %c, %esi; \
// movl %c, %edi; \
// orl %d, %esi; \
// andl %b, %esi; \
// andl %d, %edi; \
// orl %edi, %esi; \
// ROUNDTAIL(a, b, e, i, 0x8F1BBCDC)

macro_rules! round2 {
    ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => {
        concat! {
            asm_block! {
                schedule!($i, $e);
                movl $c, %esi;
                movl $c, %edi;
                orl $d, %esi;
                andl $b, %esi;
                andl $d, %edi;
                orl %edi, %esi;
                roundtail!($a, $b, $e, $i, 0x8F1BBCDC);
            }
        }
    };
}

// #define ROUND3(a, b, c, d, e, i) \
// SCHEDULE(i, e) \
// movl %b, %esi; \
// xorl %c, %esi; \
// xorl %d, %esi; \
// ROUNDTAIL(a, b, e, i, 0xCA62C1D6)

macro_rules! round3 {
    ($a:tt, $b:tt, $c:tt, $d:tt, $e:tt, $i:tt) => {
        concat! {
            asm_block! {
                schedule!($i, $e);
                movl $b, %esi;
                xorl $c, %esi;
                xorl $d, %esi;
                roundtail!($a, $b, $e, $i, 0xCA62C1D6);
            }
        }
    };
}

// #define ROUNDTAIL(a, b, e, i, k) \
// roll $30, %b; \
// leal k(%e,%esi), %e; \
// movl %a, %esi; \
// roll $5, %esi; \
// addl %esi, %e;

macro_rules! roundtail {
    ($a:tt, $b:tt, $e:tt, $i:tt, $k:tt) => {
        concat! {
            asm_block! {
                roll $30, $b;
                leal $k($e,%esi), $e;
                movl $a, %esi;
                roll $5, %esi;
                addl %esi, $e;
            }
        }
    };
}

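// How these pieces are meant to snap together eventually (a hypothetical
// sketch, not the final macro: the operand list, clobbers, and att_syntax
// option still have to be worked out in asm_sha1 below):
//
//     asm!(
//         concat!(
//             round0a!(eax, ebx, ecx, edx, ebp, 0),
//             round0a!(ebp, eax, ebx, ecx, edx, 1),
//             // ... 78 more rounds ...
//         ),
//         // ins, outs, clobbers
//     );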
macro_rules! asm_sha1 {
    // states
    // /* Save registers */
// subl $80, %esp
// movl %ebx, 64(%esp)
// movl %esi, 68(%esp)
// movl %edi, 72(%esp)
// movl %ebp, 76(%esp)
//
// /* Load arguments */
// movl 84(%esp), %esi /* state */
// movl 88(%esp), %edi /* block */
// movl 0(%esi), %eax /* a */
// movl 4(%esi), %ebx /* b */
// movl 8(%esi), %ecx /* c */
// movl 12(%esi), %edx /* d */
// movl 16(%esi), %ebp /* e */
//
// /* 80 rounds of hashing */
// round0a(eax, ebx, ecx, edx, ebp, 0)
// round0a(ebp, eax, ebx, ecx, edx, 1)
// round0a(edx, ebp, eax, ebx, ecx, 2)
// round0a(ecx, edx, ebp, eax, ebx, 3)
// round0a(ebx, ecx, edx, ebp, eax, 4)
// round0a(eax, ebx, ecx, edx, ebp, 5)
// round0a(ebp, eax, ebx, ecx, edx, 6)
// round0a(edx, ebp, eax, ebx, ecx, 7)
// round0a(ecx, edx, ebp, eax, ebx, 8)
// round0a(ebx, ecx, edx, ebp, eax, 9)
// round0a(eax, ebx, ecx, edx, ebp, 10)
// round0a(ebp, eax, ebx, ecx, edx, 11)
// round0a(edx, ebp, eax, ebx, ecx, 12)
// round0a(ecx, edx, ebp, eax, ebx, 13)
// round0a(ebx, ecx, edx, ebp, eax, 14)
// round0a(eax, ebx, ecx, edx, ebp, 15)
// ROUND0b(ebp, eax, ebx, ecx, edx, 16)
// ROUND0b(edx, ebp, eax, ebx, ecx, 17)
// ROUND0b(ecx, edx, ebp, eax, ebx, 18)
@@ -224,3 +335,9 @@
 // movl 76(%esp), %ebp
 // addl $80, %esp
 // retl
+}
+
+#[cfg(all(feature = "inline-asm", target_arch = "x86"))]
+pub fn compress(_state: &mut [u32; 5], _blocks: &[[u8; 64]]) {
+    unimplemented!("compress() is not implemented for x86");
+}
diff --git a/sha1/src/asm/x86_64.rs b/sha1/src/asm/x86_64.rs
index 5e095883b..37c26dd15 100644
--- a/sha1/src/asm/x86_64.rs
+++ b/sha1/src/asm/x86_64.rs
@@ -48,7 +48,7 @@
 // * 16 xmm1 Caller's value of rbp (only low 64 bits are used)
 // */
 //
-// #define ROUND0a(a, b, c, d, e, i) \
+// #define round0a(a, b, c, d, e, i) \
 // movl (i*4)(%rdi), %esi; \
 // bswapl %esi; \
 // movl %esi, (i*4)(%rsp); \
@@ -122,22 +122,22 @@
 // movq %rsi, %rdi
 //
 // /* 80 rounds of hashing */
-// ROUND0a(eax, ebx, ecx, edx, ebp, 0)
-// ROUND0a(ebp, eax, ebx, ecx, edx, 1)
-// ROUND0a(edx, ebp, eax, ebx, ecx, 2)
-// ROUND0a(ecx, edx, ebp, eax, ebx, 3)
-// ROUND0a(ebx, ecx, edx, ebp, eax, 4)
-// ROUND0a(eax, ebx, ecx, edx, ebp, 5)
-// ROUND0a(ebp, eax, ebx, ecx, edx, 6)
-// ROUND0a(edx, ebp, eax, ebx, ecx, 7)
-// ROUND0a(ecx, edx, ebp, eax, ebx, 8)
-// ROUND0a(ebx, ecx, edx, ebp, eax, 9)
-// ROUND0a(eax, ebx, ecx, edx, ebp, 10)
-// ROUND0a(ebp, eax, ebx, ecx, edx, 11)
-// ROUND0a(edx, ebp, eax, ebx, ecx, 12)
-// ROUND0a(ecx, edx, ebp, eax, ebx, 13)
-// ROUND0a(ebx, ecx, edx, ebp, eax, 14)
-// ROUND0a(eax, ebx, ecx, edx, ebp, 15)
+// round0a(eax, ebx, ecx, edx, ebp, 0)
+// round0a(ebp, eax, ebx, ecx, edx, 1)
+// round0a(edx, ebp, eax, ebx, ecx, 2)
+// round0a(ecx, edx, ebp, eax, ebx, 3)
+// round0a(ebx, ecx, edx, ebp, eax, 4)
+// round0a(eax, ebx, ecx, edx, ebp, 5)
+// round0a(ebp, eax, ebx, ecx, edx, 6)
+// round0a(edx, ebp, eax, ebx, ecx, 7)
+// round0a(ecx, edx, ebp, eax, ebx, 8)
+// round0a(ebx, ecx, edx, ebp, eax, 9)
+// round0a(eax, ebx, ecx, edx, ebp, 10)
+// round0a(ebp, eax, ebx, ecx, edx, 11)
+// round0a(edx, ebp, eax, ebx, ecx, 12)
+// round0a(ecx, edx, ebp, eax, ebx, 13)
+// round0a(ebx, ecx, edx, ebp, eax, 14)
+// round0a(eax, ebx, ecx, edx, ebp, 15)
 // ROUND0b(ebp, eax, ebx, ecx, edx, 16)
 // ROUND0b(edx, ebp, eax, ebx, ecx, 17)
 // ROUND0b(ecx, edx, ebp, eax, ebx, 18)