diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9431b32f4..5d2a2708b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -49,7 +49,7 @@ jobs:
   build-cuda:
     strategy:
       matrix:
-        cuda_version: ["11.8.0", "12.8.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
         include:
           - os: ubuntu-22.04
@@ -100,7 +100,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
-        torch_version: ["2.7.0"]
+        torch_version: ["2.6.0", "2.7.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
@@ -138,9 +138,35 @@ jobs:
       - name: Show installed packages
         run: pip list

+      - name: Show environment information
+        run: python -m torch.utils.collect_env
+
       - name: Run tests
         run: pytest --durations=100

+  # cuda-aarch64-tests:
+  #   if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+  #   needs: build-cuda
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       os: [ubuntu-22.04-arm]
+  #       arch: [aarch64]
+  #       torch_version: ["2.7.0"]
+  #       cuda_version: ["11.8.0", "12.8.1"]
+
+  #   runs-on: bandb-aws-g5g-4xlarge-plus-use1-public-80
+  #   env:
+  #     BNB_TEST_DEVICE: cuda
+  #   steps:
+  #     - name: Show GPU Information
+  #       run: nvidia-smi
+
+  #     - name: Show pip packages
+  #       run: pip list
+
+
+
   cuda-tests:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cuda
@@ -149,25 +175,28 @@ jobs:
       matrix:
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
-        gpu: [T4, L4]
-        cuda_version: ["11.8.0", "12.8.1"]
+        gpu: [T4, L40S]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
         include:
           - cuda_version: "11.8.0"
             torch_version: "2.4.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
+          - cuda_version: "12.6.3"
+            torch_version: "2.6.0"
+            pypi_index: "https://download.pytorch.org/whl/cu126"
           - cuda_version: "12.8.1"
             torch_version: "2.7.0"
             pypi_index: "https://download.pytorch.org/whl/cu128"

-          # L4 runners
+          # L40S runners
           - os: ubuntu-22.04
-            gpu: L4
-            runner: bandb-aws-g6-4xlarge-plus-use1-public-80
+            gpu: L40S
+            runner: bandb-aws-g6e-4xlarge-plus-use1-public-80

           # T4 runners
           - os: ubuntu-22.04
             gpu: T4
-            runner: CUDA-Linux-x64
+            runner: bandb-aws-g4dn-4xlarge-plus-use1-public-80
           - os: windows-2025
             gpu: T4
             runner: CUDA-Windows-x64
@@ -176,10 +205,12 @@
           # and cannot support CUDA 12+. Skip for now.
           - os: windows-2025
             cuda_version: "12.8.1"
+          - os: windows-2025
+            cuda_version: "12.6.3"

-          # No Windows L4 runners.
+          # No Windows L40S runners.
           - os: windows-2025
-            gpu: L4
+            gpu: L40S
     runs-on: ${{ matrix.runner }}
     env:
       BNB_TEST_DEVICE: cuda
@@ -210,5 +241,8 @@
       - name: Show installed packages
         run: pip list

+      - name: Show environment information
+        run: python -m torch.utils.collect_env
+
       - name: Run tests
         run: pytest --durations=100
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 96e77e4f4..423a92193 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -929,39 +929,6 @@ def test_spmm_coo_very_sparse(self, dim1, dim2, dtype, out_func):
         # torch.cuda.synchronize()
         # print(time.time() - t0)

-    @pytest.mark.parametrize("dim1", [256, 1024], ids=id_formatter("dim1"))
-    @pytest.mark.parametrize("dim2", [256, 1024], ids=id_formatter("dim2"))
-    @pytest.mark.skip("No longer supported")
-    def test_integrated_sparse_decomp(self, dim1, dim2):
-        threshold = 3.0
-        for _ in range(k):
-            A = torch.randn(dim1, dim2).cuda().half()
-            w1 = torch.randn(dim1, dim2).cuda().half()
-            out1 = torch.matmul(A, w1.t())
-
-            Cw1, statsw1, _ = F.int8_vectorwise_quant(w1)
-            CA, statsA, _ = F.int8_vectorwise_quant(A)
-
-            out1_32 = F.int8_linear_matmul(CA, Cw1)
-            out2 = F.int8_mm_dequant(out1_32, statsA, statsw1)
-
-            # CA, statsA, outlier_cols = F.int8_vectorwise_quant(A, threshold=threshold)
-            CA, _, statsA, _, coo_tensor = F.double_quant(A, threshold=threshold)
-
-            out1_32 = F.int8_linear_matmul(CA, Cw1)
-            out3 = F.int8_mm_dequant(out1_32, statsA, statsw1)
-
-            assert coo_tensor is not None
-
-            out4 = F.spmm_coo(coo_tensor, w1.t())
-            # idx = torch.unique(coo_tensor._indices()[1]).long()
-            # out4 = torch.matmul(A, w1.t())
-            out5 = out3 + out4
-
-            err1 = torch.abs(out1 - out2).mean().item()
-            err2 = torch.abs(out1 - out5).mean().item()
-            assert err2 < err1
-
     @pytest.mark.parametrize("dim1", [1 * 2048])
     @pytest.mark.parametrize("dim2", [2048])
     @pytest.mark.parametrize("dtype", [torch.int8])
diff --git a/tests/test_modules.py b/tests/test_modules.py
index dc1d60e6c..c8ec6311a 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -130,7 +130,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert l1.weight.dtype == torch.int8

     l1.eval()
-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = l1(b1)
         assert o1.dtype == torch.float16
@@ -139,7 +139,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert mlp.fc1.weight.dtype == torch.int8
     assert mlp.fc2.weight.dtype == torch.int8

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -152,7 +152,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert mlp.fc1.weight.dtype == torch.int8
     assert mlp.fc2.weight.dtype == torch.int8

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -163,7 +163,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):

     mlp = MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False).half().to(device)

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -185,7 +185,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
         .to(device)
     )

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -207,7 +207,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     w1, w2 = mlp.fc1.weight.clone().to(device), mlp.fc2.weight.clone().to(device)  # grab weights before quantization,
     mlp = mlp.to(device).half()  # and this line triggers quantization

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16