From d3d68406f6acd998f3c1efad456dbb35013faf84 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 12:10:14 -0400
Subject: [PATCH 01/22] Setup XPU CI

---
 .github/workflows/tests.yml | 44 +++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index c6423b1f8..d5c7c2382 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -222,6 +222,50 @@ jobs:
       # - name: Show pip packages
       #   run: pip list
 
+  test-xpu:
+    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+    needs: build-cpu
+    runs-on:
+      group: bandb-itac-bmsprpvc1550-8-1gpu
+    env:
+      BNB_TEST_DEVICE: xpu
+    steps:
+      - name: Show system information
+        run: |
+          echo "OS: $(uname -a)"
+          echo "CPU: $(lscpu | grep 'Model name')"
+          echo "Memory: $(free -h)"
+
+      - name: Show XPU Information
+        run: xpu-smi
+
+      - uses: actions/checkout@v4
+
+      - name: Download build artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: lib_cpu_ubuntu-22.04_x86_64
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+
+      - name: Install dependencies
+        run: |
+          pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/xpu
+
+          pip install -e ".[test]"
+          pip install pytest-cov
+
+      - name: Show installed packages
+        run: pip list
+
+      - name: Show environment information
+        run: |
+          python -m torch.utils.collect_env
+          python -m bitsandbytes
+
   test-cuda:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cuda

From 6e0622c2127d8f03aad69aea292f8040a31479fe Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 12:38:10 -0400
Subject: [PATCH 02/22] CI: expand XPU matrix

---
 .github/workflows/tests.yml | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index d5c7c2382..d7a62eaae 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -225,6 +225,18 @@ jobs:
   test-xpu:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cpu
+    strategy:
+      fail-fast: false
+      matrix:
+        torch_version: ["2.6.0", "2.7.1"]
+        ipex: [true, false]
+        include:
+          - torch_version: "2.6.0"
+            ipex: true
+            ipex_version: "2.6.10+xpu"
+          - torch_version: "2.7.1"
+            ipex: true
+            ipex_version: "2.7.10+xpu"
     runs-on:
       group: bandb-itac-bmsprpvc1550-8-1gpu
     env:
@@ -237,7 +249,7 @@ jobs:
           echo "Memory: $(free -h)"
 
       - name: Show XPU Information
-        run: xpu-smi
+        run: xpu-smi discovery
 
       - uses: actions/checkout@v4
 
@@ -245,16 +257,23 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           name: lib_cpu_ubuntu-22.04_x86_64
+          path: bitsandbytes/
+          merge-multiple: true
 
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
           python-version: 3.9
 
+      - name: Install PyTorch
+        run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu
+
+      - name: Install IPEX
+        if: matrix.ipex == true
+        run: pip install intel_extension_for_pytorch==${{ matrix.ipex_version }} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+
       - name: Install dependencies
         run: |
-          pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/xpu
-
           pip install -e ".[test]"
           pip install pytest-cov
 
@@ -265,6 +284,8 @@ jobs:
         run: |
           python -m torch.utils.collect_env
           python -m bitsandbytes
+      - name: Run tests
+        run: pytest --durations=100
 
   test-cuda:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cuda

From 0623333073ef01b104d0290a2d9483587b904289 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 12:52:40 -0400
Subject: [PATCH 03/22] test

---
 .github/workflows/tests.yml | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index d7a62eaae..8549a6ed9 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -228,15 +228,16 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        torch_version: ["2.6.0", "2.7.1"]
-        ipex: [true, false]
-        include:
-          - torch_version: "2.6.0"
-            ipex: true
-            ipex_version: "2.6.10+xpu"
-          - torch_version: "2.7.1"
-            ipex: true
-            ipex_version: "2.7.10+xpu"
+        torch_version: ["2.7.1"] #["2.6.0", "2.7.1"]
+        ipex: [false]
+        # ipex: [true, false]
+        # include:
+        #   - torch_version: "2.6.0"
+        #     ipex: true
+        #     ipex_version: "2.6.10+xpu"
+        #   - torch_version: "2.7.1"
+        #     ipex: true
+        #     ipex_version: "2.7.10+xpu"
     runs-on:
       group: bandb-itac-bmsprpvc1550-8-1gpu
     env:
@@ -249,7 +250,9 @@ jobs:
           echo "Memory: $(free -h)"
 
       - name: Show XPU Information
-        run: xpu-smi discovery
+        run: |
+          xpu-smi discovery
+          hwinfo --display
 
       - uses: actions/checkout@v4
 
@@ -284,8 +287,9 @@ jobs:
         run: |
          python -m torch.utils.collect_env
          python -m bitsandbytes
-      - name: Run tests
-        run: pytest --durations=100
+
+      # - name: Run tests
+      #   run: pytest --durations=100
 
   test-cuda:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'

From 43b7d5185aae594a5acdf60a5a0cd385c98e82df Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 13:38:45 -0400
Subject: [PATCH 04/22] test

---
 .github/workflows/tests.yml | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 8549a6ed9..8af930945 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -222,6 +222,28 @@ jobs:
       # - name: Show pip packages
       #   run: pip list
 
+  test-hpu:
+    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+    needs: build-cpu
+    strategy:
+      fail-fast: false
+      matrix:
+        torch_version: ["2.6.0"]
+    runs-on:
+      group: bandb-itac-bmemr-gaudi3-1gaudi
+    env:
+      BNB_TEST_DEVICE: hpu
+    steps:
+      - name: Show system information
+        run: |
+          echo "OS: $(uname -a)"
+          echo "CPU: $(lscpu | grep 'Model name')"
+          echo "Memory: $(free -h)"
+
+      - name: Show HPU Information
+        run: |
+          hpu-smi
+
   test-xpu:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cpu
@@ -252,6 +274,8 @@ jobs:
       - name: Show XPU Information
         run: |
           xpu-smi discovery
+          lspci
+          apt-get install -y hwinfo
           hwinfo --display
 
       - uses: actions/checkout@v4

From 16f5a88b493aeacd3e2cb1d874e6cd3314c5013f Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 13:42:33 -0400
Subject: [PATCH 05/22] test

---
 .github/workflows/tests.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 8af930945..f36df8c18 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -231,8 +231,12 @@ jobs:
         torch_version: ["2.6.0"]
     runs-on:
       group: bandb-itac-bmemr-gaudi3-1gaudi
-    env:
-      BNB_TEST_DEVICE: hpu
+    container:
+      image: vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      options: --runtime=habana --shm-size=64G --env HABANA_VISIBLE_DEVICES --env HABANA_VISIBLE_MODULES
+      env:
+        OMPI_MCA_btl_vader_single_copy_mechanism: none
+        BNB_TEST_DEVICE: hpu
     steps:
       - name: Show system information
         run: |

From fc3746d08106ceb9b46a3a80edf006ae7c0e32ce Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 13:43:15 -0400
Subject: [PATCH 06/22] test

---
 .github/workflows/tests.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f36df8c18..f2be9a345 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -278,7 +278,6 @@ jobs:
       - name: Show XPU Information
         run: |
           xpu-smi discovery
-          lspci
           apt-get install -y hwinfo
           hwinfo --display
 

From 4c7b755fcfd83e15957d9d0132f6cf3b8aaf6d6d Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 13:48:48 -0400
Subject: [PATCH 07/22] test

---
 .github/workflows/tests.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f2be9a345..539d330e4 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -231,12 +231,12 @@ jobs:
         torch_version: ["2.6.0"]
     runs-on:
       group: bandb-itac-bmemr-gaudi3-1gaudi
-    container:
-      image: vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-      options: --runtime=habana --shm-size=64G --env HABANA_VISIBLE_DEVICES --env HABANA_VISIBLE_MODULES
-      env:
-        OMPI_MCA_btl_vader_single_copy_mechanism: none
-        BNB_TEST_DEVICE: hpu
+    # container:
+    #   image: vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+    #   options: --runtime=habana --shm-size=64G --env HABANA_VISIBLE_DEVICES --env HABANA_VISIBLE_MODULES
+    #   env:
+    #     OMPI_MCA_btl_vader_single_copy_mechanism: none
+    #     BNB_TEST_DEVICE: hpu
     steps:
       - name: Show system information
         run: |
@@ -246,7 +246,7 @@ jobs:
 
       - name: Show HPU Information
         run: |
-          hpu-smi
+          hl-smi
 
   test-xpu:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'

From 69fbb636990a378956837b36173d0d3ee8bb5298 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 13:49:31 -0400
Subject: [PATCH 08/22] test

---
 .github/workflows/tests.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 539d330e4..5c3bc2742 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -278,7 +278,8 @@ jobs:
       - name: Show XPU Information
         run: |
           xpu-smi discovery
+          sudo xpu-smi discovery
-          apt-get install -y hwinfo
+          sudo apt-get install -y hwinfo
           hwinfo --display
 

From e82c4da1b8305211cada97c83f9591b4e3e86b8a Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 14:02:59 -0400
Subject: [PATCH 09/22] test

---
 .github/workflows/tests.yml | 42 +++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 5c3bc2742..f85994e31 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -231,12 +231,14 @@ jobs:
         torch_version: ["2.6.0"]
     runs-on:
       group: bandb-itac-bmemr-gaudi3-1gaudi
-    # container:
-    #   image: vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-    #   options: --runtime=habana --shm-size=64G --env HABANA_VISIBLE_DEVICES --env HABANA_VISIBLE_MODULES
-    #   env:
-    #     OMPI_MCA_btl_vader_single_copy_mechanism: none
-    #     BNB_TEST_DEVICE: hpu
+    env:
+      BNB_TEST_DEVICE: hpu
+    container:
+      image: vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      options: --runtime=habana --shm-size=64G --env HABANA_VISIBLE_DEVICES --env HABANA_VISIBLE_MODULES
+      env:
+        OMPI_MCA_btl_vader_single_copy_mechanism: none
+        BNB_TEST_DEVICE: hpu
     steps:
       - name: Show system information
         run: |
@@ -248,6 +250,34 @@ jobs:
         run: |
           hl-smi
 
+      - uses: actions/checkout@v4
+
+      - name: Download build artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: lib_cpu_ubuntu-22.04_x86_64
+          path: bitsandbytes/
+          merge-multiple: true
+
+      - name: Show installed packages
+        run: pip list
+
+      - name: Install dependencies
+        run: |
+          pip install -e ".[test]"
+          pip install pytest-cov
+
+      - name: Show installed packages
+        run: pip list
+
+      - name: Show environment information
+        run: |
+          python -m torch.utils.collect_env
+          python -m bitsandbytes
+
+      - name: Run tests
+        run: pytest --durations=100
+
   test-xpu:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cpu

From be47d49a23ee695bf54ad427476cad79c4bad149 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 14:38:01 -0400
Subject: [PATCH 10/22] test

---
 bitsandbytes/__init__.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 8fcf57a65..06b4bf2c7 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
+import importlib
 import sys
 
 import torch
@@ -37,8 +38,13 @@
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .backends.xpu import ops as xpu_ops
 
-if hasattr(torch, "hpu") and torch.hpu.is_available():
-    from .backends.hpu import ops as hpu_ops
+
+if importlib.util.find_spec("habana_frameworks.torch"):
+    # In case not automatically imported
+    import habana_frameworks.torch  # type: ignore # noqa: I001
+
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
+        from .backends.hpu import ops as hpu_ops
 
 
 def _import_backends():

From 4faa8e2cd83353f3ec64732f0649fef9b2cb3569 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 14:48:22 -0400
Subject: [PATCH 11/22] test

---
 bitsandbytes/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 06b4bf2c7..bd9b41e28 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -39,9 +39,9 @@
     from .backends.xpu import ops as xpu_ops
 
 
-if importlib.util.find_spec("habana_frameworks.torch"):
+if importlib.util.find_spec("habana_frameworks") and importlib.util.find_spec("habana_frameworks.torch"):
     # In case not automatically imported
-    import habana_frameworks.torch  # type: ignore # noqa: I001
+    import habana_frameworks.torch  # noqa: I001
 
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         from .backends.hpu import ops as hpu_ops

From 5dae4a835cab3e0b9697d76217999061206d6e24 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 15:14:04 -0400
Subject: [PATCH 12/22] test

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f85994e31..0d3884593 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -234,7 +234,7 @@ jobs:
     env:
       BNB_TEST_DEVICE: hpu
     container:
-      image: vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
       options: --runtime=habana --shm-size=64G --env HABANA_VISIBLE_DEVICES --env HABANA_VISIBLE_MODULES
       env:
         OMPI_MCA_btl_vader_single_copy_mechanism: none

From 5c736a7c04acd7c65db628442e56654b314f58cd Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 16:23:19 -0400
Subject: [PATCH 13/22] skip some fp4 tests on hpu

---
 tests/test_autograd.py   | 3 +++
 tests/test_linear4bit.py | 3 +++
 tests/test_ops.py        | 6 ++++++
 3 files changed, 12 insertions(+)

diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 5fbe1065f..26cbab413 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -189,6 +189,9 @@ def test_matmul_4bit(
     if device == "cpu" and dtype != torch.float32 and any(req_grad) and torch.__version__ < (2, 6):
         pytest.xfail("mse_loss fp16 on CPU is not supported in torch < 2.6")
 
+    if device == "hpu" and quant_type != "nf4":
+        pytest.skip("HPU only supports nf4")
+
     for i in range(3):
         # normal multiply
         if funcs[0] in [torch.mm, torch.matmul]:
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index f28bfa29e..f433f5d87 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -276,6 +276,9 @@ def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_st
     if device == "cuda" and platform.system() == "Windows":
         pytest.skip("Triton is not officially supported on Windows")
 
+    if device == "hpu" and quant_type != "nf4":
+        pytest.skip("fp4 dequantization is not supported on HPU")
+
     # Has a strange regression on Linux aarch64 CPU in torch==2.6.0 when fullgraph=False.
     if (
         not fullgraph
diff --git a/tests/test_ops.py b/tests/test_ops.py
index 60c47a250..c58d0d1ac 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -179,6 +179,9 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
+        if device == "hpu" and quant_type != "nf4":
+            pytest.skip("fp4 dequantization is not supported on HPU")
+
         shape = (128, 128)
 
         n = prod(shape)
@@ -210,6 +213,9 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
+        if device == "hpu" and quant_type != "nf4":
+            pytest.skip("fp4 dequantization is not supported on HPU")
+
         out_features = 1024
         in_features = 256

From bdd28f2b838c59f0feb48024c1c3790d1461b232 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 16:59:28 -0400
Subject: [PATCH 14/22] skip some fp4 tests on hpu

---
 tests/test_functional.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 2e2e898cc..7c7c0fec8 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1101,6 +1101,9 @@ class TestQuantize4BitFunctional:
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
     def test_4bit_quant(self, device, dtype, quant_type, blocksize):
+        if device == "hpu" and quant_type != "nf4":
+            pytest.skip("fp4 dequantization is not supported on HPU")
+
         A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
         qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
         A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
@@ -1133,6 +1136,9 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize):
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128], ids=id_formatter("blocksize"))
     def test_4bit_compressed_stats(self, device, quant_type, blocksize):
+        if device == "hpu" and quant_type != "nf4":
+            pytest.skip("fp4 dequantization is not supported on HPU")
+
         errs1 = []
         errs2 = []
         for i in range(10):
@@ -1205,6 +1211,9 @@ def test_bench_4bit_dequant(self, quant_type):
     )
     @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
     def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
+        if device == "hpu" and storage_type != "nf4":
+            pytest.skip("fp4 dequantization is not supported on HPU")
+
         errs1 = []
         errs2 = []
         errs3 = []
@@ -1354,6 +1363,9 @@ def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
         if device == "cpu" and dtype == torch.bfloat16 and torch.__version__ < (2, 3):
             pytest.skip("eye doe not support bfloat16 on CPU in torch < 2.3")
 
+        if device == "hpu" and storage_type != "nf4":
+            pytest.skip("fp4 dequantization is not supported on HPU")
+
         dims = 10
         torch.random.manual_seed(np.random.randint(0, 412424242))
         dims = get_test_dims(0, 8192, n=dims)

From 55da7f397e91a1436352b07c6f8cec79047c99e2 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 17:36:30 -0400
Subject: [PATCH 15/22] skip gemv tests on hpu

---
 tests/test_functional.py | 4 ++--
 tests/test_ops.py        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 7c7c0fec8..8ec43c244 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1211,8 +1211,8 @@ def test_bench_4bit_dequant(self, quant_type):
     )
     @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
     def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
-        if device == "hpu" and storage_type != "nf4":
-            pytest.skip("fp4 dequantization is not supported on HPU")
+        if device == "hpu":
+            pytest.skip("gemv not supported on HPU")
 
         errs1 = []
         errs2 = []
diff --git a/tests/test_ops.py b/tests/test_ops.py
index c58d0d1ac..fb5a399d5 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -213,8 +213,8 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
-        if device == "hpu" and quant_type != "nf4":
-            pytest.skip("fp4 dequantization is not supported on HPU")
+        if device == "hpu":
+            pytest.skip("gemv not supported on HPU")
 
         out_features = 1024
         in_features = 256

From 3f97860545950bf7148c3eb8e98b4f0105ad7cba Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 12 Jun 2025 18:22:01 -0400
Subject: [PATCH 16/22] test

---
 tests/test_autograd.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 26cbab413..0ea430220 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -233,6 +233,9 @@ def test_matmul_4bit(
             out_bnb.data.copy_(out_torch)
             if device == "cuda":
                 torch.cuda.synchronize()
+            elif device == "hpu":
+                torch.hpu.synchronize()
+
             loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean()
             loss_bnb.backward()
             gradA1 = A.grad

From de6057be7672901c2c68ecfcce4ab854809ea6b3 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 16 Jun 2025 14:18:54 -0400
Subject: [PATCH 17/22] Additional test patches for HPU

---
 tests/test_linear4bit.py   |  3 ---
 tests/test_linear8bitlt.py |  3 ++-
 tests/test_modules.py      | 20 +++++++++++++++++++-
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index 28b04ded8..9fcde695d 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -294,9 +294,6 @@ def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_st
     if device == "cuda" and platform.system() == "Windows":
         pytest.skip("Triton is not officially supported on Windows")
 
-    if device == "hpu" and quant_type != "nf4":
-        pytest.skip("fp4 dequantization is not supported on HPU")
-
     # Has a strange regression on Linux aarch64 CPU in torch==2.6.0 when fullgraph=False.
     if (
         not fullgraph
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index 271920b11..86726bd44 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -257,7 +257,8 @@ def test_linear8bitlt_torch_compile(device, threshold, bias, fullgraph, mode):
         ref_output = net(x)
 
     # Compile the model
-    compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode)
+    compile_backend = "hpu_backend" if device == "hpu" else "inductor"
+    compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode, backend=compile_backend)
 
     # Get output from compiled model
     with torch.no_grad():
diff --git a/tests/test_modules.py b/tests/test_modules.py
index 9eeb79f76..f996c45a1 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -5,7 +5,7 @@
 from torch import nn
 
 import bitsandbytes as bnb
-from tests.helpers import get_available_devices, id_formatter
+from tests.helpers import get_available_devices, id_formatter, is_supported_on_hpu
 
 
 class MockArgs:
@@ -295,7 +295,13 @@ def test_kbit_backprop(device, module):
     torch.nn.init.kaiming_normal_(ref[0].weight)
     torch.nn.init.kaiming_normal_(ref[1].weight)
     ref[1].weight.requires_grad_(False)
+
     kbit = nn.Sequential(*[torch.nn.Linear(dim1, dim2), module(dim2, 128)])
+
+    if device == "hpu":
+        if isinstance(module, bnb.nn.LinearFP4):
+            pytest.skip("FP4 is not supported on HPU")
+
     kbit[0].weight.detach().copy_(ref[0].weight)
     kbit[1].weight.detach().copy_(ref[1].weight)
     kbit[0].bias.detach().copy_(ref[0].bias)
@@ -358,6 +364,12 @@ def test_kbit_backprop(device, module):
     ids=lambda x: x.__name__ if inspect.isclass(x) else str(x),
 )
 def test_embedding_lossless(device, embedding_class, input_shape, embedding_dim, quant_storage):
+    if device == "hpu":
+        if embedding_class is bnb.nn.EmbeddingFP4:
+            pytest.skip("FP4 is not supported on HPU")
+        elif embedding_class is bnb.nn.EmbeddingNF4 and not is_supported_on_hpu("nf4", torch.float32, quant_storage):
+            pytest.skip("This configuration is not supported on HPU")
+
     num_embeddings = 128
 
     src_weight = (torch.randn((num_embeddings, embedding_dim), dtype=torch.float32) > 0).to(
@@ -403,6 +415,12 @@ def test_embedding_lossless(device, embedding_class, input_shape, embedding_dim,
     ids=lambda x: x.__name__ if inspect.isclass(x) else str(x),
 )
 def test_embedding_error(device, embedding_class, input_shape, embedding_dim, quant_storage):
+    if device == "hpu":
+        if embedding_class is bnb.nn.EmbeddingFP4:
+            pytest.skip("FP4 is not supported on HPU")
+        elif embedding_class is bnb.nn.EmbeddingNF4 and not is_supported_on_hpu("nf4", torch.float32, quant_storage):
+            pytest.skip("This configuration is not supported on HPU")
+
     is_8bit = embedding_class is bnb.nn.Embedding8bit
 
     num_embeddings = 128

From 214c3f3abaf77bd17a397d71a02dc2fdcb083f9f Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 16 Jun 2025 14:44:55 -0400
Subject: [PATCH 18/22] HPU test update

---
 tests/test_modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_modules.py b/tests/test_modules.py
index f996c45a1..52d187a18 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -299,7 +299,7 @@ def test_kbit_backprop(device, module):
     kbit = nn.Sequential(*[torch.nn.Linear(dim1, dim2), module(dim2, 128)])
 
     if device == "hpu":
-        if isinstance(module, bnb.nn.LinearFP4):
+        if isinstance(kbit, bnb.nn.LinearFP4):
             pytest.skip("FP4 is not supported on HPU")
 
     kbit[0].weight.detach().copy_(ref[0].weight)

From a1b333167d2093d20d0ca0c760b1c9533aa00e40 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 16 Jun 2025 15:20:34 -0400
Subject: [PATCH 19/22] HPU test update

---
 tests/test_modules.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/test_modules.py b/tests/test_modules.py
index 52d187a18..b89dafc6d 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -298,9 +298,8 @@ def test_kbit_backprop(device, module):
 
     kbit = nn.Sequential(*[torch.nn.Linear(dim1, dim2), module(dim2, 128)])
 
-    if device == "hpu":
-        if isinstance(kbit, bnb.nn.LinearFP4):
-            pytest.skip("FP4 is not supported on HPU")
+    if device == "hpu" and isinstance(kbit[1], bnb.nn.LinearFP4):
+        pytest.skip("FP4 is not supported on HPU")
 
     kbit[0].weight.detach().copy_(ref[0].weight)
     kbit[1].weight.detach().copy_(ref[1].weight)

From 0a7f959ce1d04272ff40e3e22d8e51b48e8a5fd1 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 16 Jun 2025 15:39:48 -0400
Subject: [PATCH 20/22] HPU test update

---
 tests/test_modules.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_modules.py b/tests/test_modules.py
index b89dafc6d..bdfa830f4 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -276,9 +276,9 @@ def test_linear_kbit_fp32_bias(device, module):
     "NF4": bnb.nn.LinearNF4,
     "FP4+C": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True),
     "NF4+C": lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compress_statistics=True),
-    "NF4+fp32": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float32),
-    "NF4+fp16": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float16),
-    "NF4+bf16": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.bfloat16),
+    "NF4+fp32": lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compute_dtype=torch.float32),
+    "NF4+fp16": lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compute_dtype=torch.float16),
+    "NF4+bf16": lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compute_dtype=torch.bfloat16),
 }
 
 

From 2ba4b8feb48b947ed7849f9cec2831434ef85b3b Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 16 Jun 2025 17:37:55 -0400
Subject: [PATCH 21/22] HPU test update

---
 tests/test_modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_modules.py b/tests/test_modules.py
index bdfa830f4..e35afb214 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -298,7 +298,7 @@ def test_kbit_backprop(device, module):
 
     kbit = nn.Sequential(*[torch.nn.Linear(dim1, dim2), module(dim2, 128)])
 
-    if device == "hpu" and isinstance(kbit[1], bnb.nn.LinearFP4):
+    if device == "hpu" and isinstance(kbit[1], bnb.nn.Linear4bit) and kbit[1].weight.quant_type == "fp4":
         pytest.skip("FP4 is not supported on HPU")
 
     kbit[0].weight.detach().copy_(ref[0].weight)

From 0c529a710117c6b047cc126faf18aeff88e01e03 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 17 Jun 2025 12:18:28 -0400
Subject: [PATCH 22/22] Format

---
 bitsandbytes/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index bd9b41e28..516afa51f 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -41,7 +41,7 @@
 
 if importlib.util.find_spec("habana_frameworks") and importlib.util.find_spec("habana_frameworks.torch"):
     # In case not automatically imported
-    import habana_frameworks.torch  # noqa: I001
+    import habana_frameworks.torch
 
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         from .backends.hpu import ops as hpu_ops