AVX2: Avoid overread in polyz_unpack_17_avx2/polyz_unpack_19_avx2

mkannwischer · mkannwischer · commit d81951cf7f61 · 2025-12-10T14:26:37.000+08:00
polyz_unpack_17_avx2/polyz_unpack_19_avx2 unpack polynomials whose coefficients are packed into 18/20 bits each. To do so, each loop iteration currently loads 32 bytes into a 256-bit register, of which only the first 18/20 bytes are used, and discards the last 14/12 bytes. This is problematic in the last iteration, where the load overreads the input buffer, presenting a potential memory-safety problem. This is a violation of the API contract, which only requires the input buffer to be 18*32/20*32 bytes long. Hence, this commit eliminates the overread by adding special handling for the last iteration. In practice this problem is not flagged, as the z component in the signature is followed by the hint component and, hence, the load never overreads the actual signature buffer. It did, however, show up in the unit tests that are being implemented in #777. Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
diff --git a/dev/x86_64/src/polyz_unpack_17_avx2.c b/dev/x86_64/src/polyz_unpack_17_avx2.c
@@ -39,7 +39,17 @@ void mld_polyz_unpack_17_avx2(__m256i *r, const uint8_t *a)
 
   for (i = 0; i < MLDSA_N / 8; i++)
   {
-    f = _mm256_loadu_si256((__m256i *)&a[18 * i]);
+    /* Last iteration: avoid overread by copying to padded buffer */
+    if (i == MLDSA_N / 8 - 1)
+    {
+      MLD_ALIGN uint8_t tmp[32] = {0};
+      memcpy(tmp, &a[18 * i], 18);
+      f = _mm256_load_si256((__m256i *)tmp);
+    }
+    else
+    {
+      f = _mm256_loadu_si256((__m256i *)&a[18 * i]);
+    }
 
     /* Permute 64-bit lanes
      * 0x94 = 10010100b rearranges 64-bit lanes as: [3,2,1,0] -> [2,1,1,0]
diff --git a/dev/x86_64/src/polyz_unpack_19_avx2.c b/dev/x86_64/src/polyz_unpack_19_avx2.c
@@ -40,7 +40,17 @@ void mld_polyz_unpack_19_avx2(__m256i *r, const uint8_t *a)
 
   for (i = 0; i < MLDSA_N / 8; i++)
   {
-    f = _mm256_loadu_si256((__m256i *)&a[20 * i]);
+    /* Last iteration: avoid overread by copying to padded buffer */
+    if (i == MLDSA_N / 8 - 1)
+    {
+      MLD_ALIGN uint8_t tmp[32] = {0};
+      memcpy(tmp, &a[20 * i], 20);
+      f = _mm256_load_si256((__m256i *)tmp);
+    }
+    else
+    {
+      f = _mm256_loadu_si256((__m256i *)&a[20 * i]);
+    }
 
     /* Permute 64-bit lanes
      * 0x94 = 10010100b rearranges 64-bit lanes as: [3,2,1,0] -> [2,1,1,0]
diff --git a/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c b/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c
@@ -39,7 +39,17 @@ void mld_polyz_unpack_17_avx2(__m256i *r, const uint8_t *a)
 
   for (i = 0; i < MLDSA_N / 8; i++)
   {
-    f = _mm256_loadu_si256((__m256i *)&a[18 * i]);
+    /* Last iteration: avoid overread by copying to padded buffer */
+    if (i == MLDSA_N / 8 - 1)
+    {
+      MLD_ALIGN uint8_t tmp[32] = {0};
+      memcpy(tmp, &a[18 * i], 18);
+      f = _mm256_load_si256((__m256i *)tmp);
+    }
+    else
+    {
+      f = _mm256_loadu_si256((__m256i *)&a[18 * i]);
+    }
 
     /* Permute 64-bit lanes
      * 0x94 = 10010100b rearranges 64-bit lanes as: [3,2,1,0] -> [2,1,1,0]
diff --git a/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c b/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c
@@ -40,7 +40,17 @@ void mld_polyz_unpack_19_avx2(__m256i *r, const uint8_t *a)
 
   for (i = 0; i < MLDSA_N / 8; i++)
   {
-    f = _mm256_loadu_si256((__m256i *)&a[20 * i]);
+    /* Last iteration: avoid overread by copying to padded buffer */
+    if (i == MLDSA_N / 8 - 1)
+    {
+      MLD_ALIGN uint8_t tmp[32] = {0};
+      memcpy(tmp, &a[20 * i], 20);
+      f = _mm256_load_si256((__m256i *)tmp);
+    }
+    else
+    {
+      f = _mm256_loadu_si256((__m256i *)&a[20 * i]);
+    }
 
     /* Permute 64-bit lanes
      * 0x94 = 10010100b rearranges 64-bit lanes as: [3,2,1,0] -> [2,1,1,0]