From fbd425eb2ad17ea2de6b80fb2ba6f42cd3e9cf1e Mon Sep 17 00:00:00 2001
From: ckvermaAI
Date: Fri, 13 Jun 2025 11:49:53 +0300
Subject: [PATCH] HPU support for unit tests

---
 bitsandbytes/backends/hpu/ops.py |  5 -----
 tests/helpers.py                 | 11 +++++++++++
 tests/test_autograd.py           |  4 ++++
 tests/test_functional.py         | 18 ++++++++++++++++--
 tests/test_linear4bit.py         | 25 ++++++++++++++++++++++---
 tests/test_ops.py                | 11 ++++++++++-
 6 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/bitsandbytes/backends/hpu/ops.py b/bitsandbytes/backends/hpu/ops.py
index 1eeb7f014..4c43a3cb7 100644
--- a/bitsandbytes/backends/hpu/ops.py
+++ b/bitsandbytes/backends/hpu/ops.py
@@ -29,8 +29,6 @@ def _(
     if A.dtype != torch.uint8:
         A = A.view(torch.uint8)
 
-    transpose = False if len(A.shape) == 2 and A.shape[0] == 1 else True
-
     A = A.reshape(-1)
 
     if GAUDI_SW_VER and (GAUDI_SW_VER.major < 1 or GAUDI_SW_VER.minor < 22):
@@ -47,7 +45,4 @@ def _(
 
     output = out_dq.reshape(shape)
 
-    if transpose:
-        output = output.t()
-
     return output
diff --git a/tests/helpers.py b/tests/helpers.py
index fbc4af071..02613bb75 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -98,3 +98,14 @@ def id_formatter(label: str):
 
 def describe_dtype(dtype: torch.dtype) -> str:
     return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2]
+
+
+def is_supported_on_hpu(
+    quant_type: str = "nf4", dtype: torch.dtype = torch.bfloat16, quant_storage: torch.dtype = torch.uint8
+) -> bool:
+    """
+    Check if the given quant_type, dtype and quant_storage are supported on HPU.
+    """
+    if quant_type == "fp4" or dtype == torch.float16 or quant_storage not in (torch.uint8, torch.bfloat16):
+        return False
+    return True
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 5fbe1065f..9737d15d7 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -8,6 +8,7 @@
     describe_dtype,
     get_available_devices,
     id_formatter,
+    is_supported_on_hpu,
 )
 
 TRANSPOSE_VALS = [(False, True), (False, False)]
@@ -189,6 +190,9 @@ def test_matmul_4bit(
     if device == "cpu" and dtype != torch.float32 and any(req_grad) and torch.__version__ < (2, 6):
         pytest.xfail("mse_loss fp16 on CPU is not supported in torch < 2.6")
 
+    if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
+        pytest.skip("This configuration is not supported on HPU.")
+
     for i in range(3):
         # normal multiply
         if funcs[0] in [torch.mm, torch.matmul]:
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 2e2e898cc..4fb0a0d2f 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -16,6 +16,7 @@
     get_available_devices,
     get_test_dims,
     id_formatter,
+    is_supported_on_hpu,
 )
 
 torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
@@ -1101,6 +1102,9 @@ class TestQuantize4BitFunctional:
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
     def test_4bit_quant(self, device, dtype, quant_type, blocksize):
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
+            pytest.skip("This configuration is not supported on HPU.")
+
         A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
         qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
         A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
@@ -1132,11 +1136,15 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize):
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128], ids=id_formatter("blocksize")) - def test_4bit_compressed_stats(self, device, quant_type, blocksize): + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype) + def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype): + if device == "hpu" and not is_supported_on_hpu(quant_type, dtype): + pytest.skip("FP4 quantization is not supported on HPU.") + errs1 = [] errs2 = [] for i in range(10): - A1 = torch.randn(1024, 1024, device=device).half() + A1 = torch.randn(1024, 1024, device=device, dtype=dtype) q2, SA2 = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type) q3, SA3 = F.quantize_4bit(A1, blocksize=blocksize, compress_statistics=True, quant_type=quant_type) A2 = F.dequantize_4bit(q2, SA2, quant_type=quant_type) @@ -1205,6 +1213,9 @@ def test_bench_4bit_dequant(self, quant_type): ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): + if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): + pytest.skip("This configuration is not supported on HPU.") + errs1 = [] errs2 = [] errs3 = [] @@ -1354,6 +1365,9 @@ def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant): if device == "cpu" and dtype == torch.bfloat16 and torch.__version__ < (2, 3): pytest.skip("eye doe not support bfloat16 on CPU in torch < 2.3") + if device == "hpu" and not is_supported_on_hpu(storage_type, dtype): + pytest.skip("This configuration is not supported on HPU.") + dims = 10 torch.random.manual_seed(np.random.randint(0, 412424242)) dims = get_test_dims(0, 8192, n=dims) diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index f28bfa29e..9fcde695d 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -13,6 +13,7 @@ describe_dtype, get_available_devices, id_formatter, + is_supported_on_hpu, torch_load_from_buffer, torch_save_to_buffer, ) @@ -27,12 +28,17 @@ @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"]) +@pytest.mark.parametrize("original_dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias")) @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) @pytest.mark.parametrize("save_before_forward", TRUE_FALSE, ids=id_formatter("save_before_forward")) -def test_linear_serialization(device, quant_type, compress_statistics, bias, quant_storage, save_before_forward): - original_dtype = torch.float16 +def test_linear_serialization( + device, quant_type, original_dtype, compress_statistics, bias, quant_storage, save_before_forward +): + if device == "hpu" and not is_supported_on_hpu(quant_type, original_dtype, storage[quant_storage]): + pytest.skip("This configuration is not supported on HPU.") + compute_dtype = None layer_shape = (300, 400) @@ -188,6 +194,9 @@ def test_linear_serialization(device, quant_type, compress_statistics, bias, qua @pytest.mark.parametrize("blocksize", [64, 128]) @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) def test_copy_param(device, quant_type, blocksize, compress_statistics): + if device == "hpu" and not is_supported_on_hpu(quant_type): + pytest.skip("This configuration is not supported on HPU.") 
+
     tensor = torch.randn(300, 400)
     param = bnb.nn.Params4bit(
         data=tensor,
@@ -207,6 +216,9 @@ def test_copy_param(device, quant_type, blocksize, compress_statistics):
 @pytest.mark.parametrize("blocksize", [64, 128])
 @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
+    if device == "hpu" and not is_supported_on_hpu(quant_type):
+        pytest.skip("This configuration is not supported on HPU.")
+
     tensor = torch.randn(300, 400)
     param = bnb.nn.Params4bit(
         data=tensor,
@@ -233,6 +245,9 @@ def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
 @pytest.mark.parametrize("blocksize", [64, 128])
 @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
+    if device == "hpu" and not is_supported_on_hpu(quant_type):
+        pytest.skip("This configuration is not supported on HPU.")
+
     original_tensor = torch.randn(300, 400)
     original_param = bnb.nn.Params4bit(
         data=original_tensor,
@@ -270,6 +285,9 @@ def test_params4bit_real_serialization(device, quant_type, blocksize, compress_s
 @pytest.mark.parametrize("mode", ["default", "reduce-overhead"], ids=id_formatter("mode"))
 @pytest.mark.skipif(torch.__version__ < (2, 4), reason="Not supported in torch < 2.4")
 def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_statistics, bias, fullgraph, mode):
+    if device == "hpu" and not is_supported_on_hpu(quant_type):
+        pytest.skip("This configuration is not supported on HPU.")
+
     if fullgraph and torch.__version__ < (2, 8, 0, "dev"):
         pytest.skip("fullgraph mode requires torch 2.8 or higher")
 
@@ -314,7 +332,8 @@ def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_st
         ref_output = net(x)
 
     # Compile the model
-    compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode)
+    compile_backend = "hpu_backend" if device == "hpu" else "inductor"
+    compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode, backend=compile_backend)
 
     # Get output from compiled model
     with torch.no_grad():
diff --git a/tests/test_ops.py b/tests/test_ops.py
index 60c47a250..52f26fb05 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -5,7 +5,7 @@
 
 import bitsandbytes
 from bitsandbytes.functional import ipex_xpu
-from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
+from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, is_supported_on_hpu
 
 # torch.library.opcheck is only available in torch 2.4 and later.
 # When testing with older versions, we will skip it as a no-op.
@@ -158,6 +158,9 @@ class Test4bitBlockwiseQuantOps:
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
+            pytest.skip("This configuration is not supported on HPU.")
+
         A = torch.randn(1024, 1024, dtype=dtype, device=device)
 
         out, absmax = torch.ops.bitsandbytes.quantize_4bit.default(A, blocksize, quant_type, storage_dtype)
@@ -179,6 +182,9 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
+            pytest.skip("This configuration is not supported on HPU.")
+
         shape = (128, 128)
         n = prod(shape)
 
@@ -210,6 +216,9 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
+            pytest.skip("This configuration is not supported on HPU.")
+
         out_features = 1024
         in_features = 256