diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index c6423b1f8..0d3884593 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -222,6 +222,133 @@ jobs:
       # - name: Show pip packages
       #   run: pip list
 
+  test-hpu:
+    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+    needs: build-cpu
+    strategy:
+      fail-fast: false
+      matrix:
+        torch_version: ["2.6.0"]
+    runs-on:
+      group: bandb-itac-bmemr-gaudi3-1gaudi
+    env:
+      BNB_TEST_DEVICE: hpu
+    container:
+      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      options: --runtime=habana --shm-size=64G --env HABANA_VISIBLE_DEVICES --env HABANA_VISIBLE_MODULES
+      env:
+        OMPI_MCA_btl_vader_single_copy_mechanism: none
+        BNB_TEST_DEVICE: hpu
+    steps:
+      - name: Show system information
+        run: |
+          echo "OS: $(uname -a)"
+          echo "CPU: $(lscpu | grep 'Model name')"
+          echo "Memory: $(free -h)"
+
+      - name: Show HPU Information
+        run: |
+          hl-smi
+
+      - uses: actions/checkout@v4
+
+      - name: Download build artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: lib_cpu_ubuntu-22.04_x86_64
+          path: bitsandbytes/
+          merge-multiple: true
+
+      - name: Show installed packages
+        run: pip list
+
+      - name: Install dependencies
+        run: |
+          pip install -e ".[test]"
+          pip install pytest-cov
+
+      - name: Show installed packages
+        run: pip list
+
+      - name: Show environment information
+        run: |
+          python -m torch.utils.collect_env
+          python -m bitsandbytes
+
+      - name: Run tests
+        run: pytest --durations=100
+
+  test-xpu:
+    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+    needs: build-cpu
+    strategy:
+      fail-fast: false
+      matrix:
+        torch_version: ["2.7.1"] #["2.6.0", "2.7.1"]
+        ipex: [false]
+        # ipex: [true, false]
+        # include:
+        #   - torch_version: "2.6.0"
+        #     ipex: true
+        #     ipex_version: "2.6.10+xpu"
+        #   - torch_version: "2.7.1"
+        #     ipex: true
+        #     ipex_version: "2.7.10+xpu"
+    runs-on:
+      group: bandb-itac-bmsprpvc1550-8-1gpu
+    env:
+      BNB_TEST_DEVICE: xpu
+    steps:
+      - name: Show system information
+        run: |
+          echo "OS: $(uname -a)"
+          echo "CPU: $(lscpu | grep 'Model name')"
+          echo "Memory: $(free -h)"
+
+      - name: Show XPU Information
+        run: |
+          xpu-smi discovery
+          sudo xpu-smi discovery
+          sudo apt-get install -y hwinfo
+          hwinfo --display
+
+      - uses: actions/checkout@v4
+
+      - name: Download build artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: lib_cpu_ubuntu-22.04_x86_64
+          path: bitsandbytes/
+          merge-multiple: true
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+
+      - name: Install PyTorch
+        run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu
+
+      - name: Install IPEX
+        if: matrix.ipex == true
+        run: pip install intel_extension_for_pytorch==${{ matrix.ipex_version }} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+
+      - name: Install dependencies
+        run: |
+          pip install -e ".[test]"
+          pip install pytest-cov
+
+      - name: Show installed packages
+        run: pip list
+
+      - name: Show environment information
+        run: |
+          python -m torch.utils.collect_env
+          python -m bitsandbytes
+
+      # - name: Run tests
+      #   run: pytest --durations=100
+
   test-cuda:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cuda
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 8fcf57a65..516afa51f 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
+import importlib
 import sys
 
 import torch
@@ -37,8 +38,13 @@
 if hasattr(torch, "xpu") and torch.xpu.is_available():
     from .backends.xpu import ops as xpu_ops
 
-if hasattr(torch, "hpu") and torch.hpu.is_available():
-    from .backends.hpu import ops as hpu_ops
+
+if importlib.util.find_spec("habana_frameworks") and importlib.util.find_spec("habana_frameworks.torch"):
+    # In case not automatically imported
+    import habana_frameworks.torch
+
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
+        from .backends.hpu import ops as hpu_ops
 
 
 def _import_backends():
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 9737d15d7..7134925c1 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -234,6 +234,9 @@ def test_matmul_4bit(
         out_bnb.data.copy_(out_torch)
         if device == "cuda":
             torch.cuda.synchronize()
+        elif device == "hpu":
+            torch.hpu.synchronize()
+
         loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean()
         loss_bnb.backward()
         gradA1 = A.grad
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index 271920b11..86726bd44 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -257,7 +257,8 @@ def test_linear8bitlt_torch_compile(device, threshold, bias, fullgraph, mode):
         ref_output = net(x)
 
     # Compile the model
-    compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode)
+    compile_backend = "hpu_backend" if device == "hpu" else "inductor"
+    compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode, backend=compile_backend)
 
     # Get output from compiled model
     with torch.no_grad():
diff --git a/tests/test_modules.py b/tests/test_modules.py
index 9eeb79f76..e35afb214 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -5,7 +5,7 @@
 from torch import nn
 
 import bitsandbytes as bnb
-from tests.helpers import get_available_devices, id_formatter
+from tests.helpers import get_available_devices, id_formatter, is_supported_on_hpu
 
 
 class MockArgs:
@@ -276,9 +276,9 @@ def test_linear_kbit_fp32_bias(device, module):
     "NF4": bnb.nn.LinearNF4,
     "FP4+C": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True),
     "NF4+C": lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compress_statistics=True),
-    "NF4+fp32": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float32),
-    "NF4+fp16": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float16),
-    "NF4+bf16": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.bfloat16),
+    "NF4+fp32": lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compute_dtype=torch.float32),
+    "NF4+fp16": lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compute_dtype=torch.float16),
+    "NF4+bf16": lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compute_dtype=torch.bfloat16),
 }
 
 
@@ -295,7 +295,12 @@ def test_kbit_backprop(device, module):
     torch.nn.init.kaiming_normal_(ref[0].weight)
     torch.nn.init.kaiming_normal_(ref[1].weight)
     ref[1].weight.requires_grad_(False)
+
     kbit = nn.Sequential(*[torch.nn.Linear(dim1, dim2), module(dim2, 128)])
+
+    if device == "hpu" and isinstance(kbit[1], bnb.nn.Linear4bit) and kbit[1].weight.quant_type == "fp4":
+        pytest.skip("FP4 is not supported on HPU")
+
     kbit[0].weight.detach().copy_(ref[0].weight)
     kbit[1].weight.detach().copy_(ref[1].weight)
     kbit[0].bias.detach().copy_(ref[0].bias)
@@ -358,6 +363,12 @@
     ids=lambda x: x.__name__ if inspect.isclass(x) else str(x),
 )
 def test_embedding_lossless(device, embedding_class, input_shape, embedding_dim, quant_storage):
+    if device == "hpu":
+        if embedding_class is bnb.nn.EmbeddingFP4:
+            pytest.skip("FP4 is not supported on HPU")
+        elif embedding_class is bnb.nn.EmbeddingNF4 and not is_supported_on_hpu("nf4", torch.float32, quant_storage):
+            pytest.skip("This configuration is not supported on HPU")
+
     num_embeddings = 128
 
     src_weight = (torch.randn((num_embeddings, embedding_dim), dtype=torch.float32) > 0).to(
@@ -403,6 +414,12 @@ def test_embedding_lossless(device, embedding_class, input_shape, embedding_dim,
     ids=lambda x: x.__name__ if inspect.isclass(x) else str(x),
 )
 def test_embedding_error(device, embedding_class, input_shape, embedding_dim, quant_storage):
+    if device == "hpu":
+        if embedding_class is bnb.nn.EmbeddingFP4:
+            pytest.skip("FP4 is not supported on HPU")
+        elif embedding_class is bnb.nn.EmbeddingNF4 and not is_supported_on_hpu("nf4", torch.float32, quant_storage):
+            pytest.skip("This configuration is not supported on HPU")
+
     is_8bit = embedding_class is bnb.nn.Embedding8bit
 
     num_embeddings = 128