From a6e31601317b0b86237247590a25e7e3cf32294d Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 16 May 2025 15:26:14 -0400
Subject: [PATCH 1/4] Test g5g runner

---
 .github/workflows/tests.yml | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9431b32f4..d49548be9 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -141,6 +141,28 @@ jobs:
       - name: Run tests
         run: pytest --durations=100
 
+  cuda-aarch64-tests:
+    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+    needs: build-cuda
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-22.04-arm]
+        arch: [aarch64]
+        torch_version: ["2.7.0"]
+        cuda_version: ["11.8.0", "12.8.1"]
+
+    runs-on: bandb-aws-g5g-4xlarge-plus-use1-public-80
+    env:
+      BNB_TEST_DEVICE: cuda
+    steps:
+      - name: Show GPU Information
+        run: nvidia-smi
+
+      - name: Show pip packages
+        run: pip list
+
+
   cuda-tests:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cuda

From c25e8f159ad7b0131fdbc34e6f11fe3851718bdd Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 16 May 2025 15:55:42 -0400
Subject: [PATCH 2/4] Switch L4 to L40S runner; swap GitHub Linux T4 runner for AWS g4dn

---
 .github/workflows/tests.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index d49548be9..ee0cf4b8f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -171,7 +171,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
-        gpu: [T4, L4]
+        gpu: [T4, L40S]
         cuda_version: ["11.8.0", "12.8.1"]
         include:
           - cuda_version: "11.8.0"
@@ -181,15 +181,15 @@ jobs:
             torch_version: "2.7.0"
             pypi_index: "https://download.pytorch.org/whl/cu128"
 
-          # L4 runners
+          # L40S runners
           - os: ubuntu-22.04
-            gpu: L4
-            runner: bandb-aws-g6-4xlarge-plus-use1-public-80
+            gpu: L40S
+            runner: bandb-aws-g6e-4xlarge-plus-use1-public-80
 
           # T4 runners
           - os: ubuntu-22.04
             gpu: T4
-            runner: CUDA-Linux-x64
+            runner: bandb-aws-g4dn-4xlarge-plus-use1-public-80
           - os: windows-2025
             gpu: T4
             runner: CUDA-Windows-x64
@@ -199,9 +199,9 @@ jobs:
           - os: windows-2025
             cuda_version: "12.8.1"
 
-          # No Windows L4 runners.
+          # No Windows L40S runners.
           - os: windows-2025
-            gpu: L4
+            gpu: L40S
 
     runs-on: ${{ matrix.runner }}
     env:
      BNB_TEST_DEVICE: cuda
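The following is an illustrative sketch, not part of the patch series: it models how GitHub Actions expands the cuda-tests matrix as of PATCH 2/4 (cross-product the matrix keys, drop combinations partially matched by an exclude entry, then merge extra keys from matching include entries). All values are taken from the diff above; the resolution logic is a simplified assumption based on the documented Actions matrix behavior.

# Illustrative sketch (not part of the patches): resolve the cuda-tests
# matrix from PATCH 2/4 the way GitHub Actions would, under the simplifying
# assumption that include entries only merge keys into existing combinations.
from itertools import product

matrix = {
    "os": ["ubuntu-22.04", "windows-2025"],
    "gpu": ["T4", "L40S"],
    "cuda_version": ["11.8.0", "12.8.1"],
}

# An exclude entry removes every expanded combination it partially matches.
excludes = [
    {"os": "windows-2025", "cuda_version": "12.8.1"},  # T4 Windows driver too old for CUDA 12+
    {"os": "windows-2025", "gpu": "L40S"},             # no Windows L40S runners
]

# An include entry merges its extra keys (torch_version, runner) into every
# combination whose original matrix keys all match.
includes = [
    {"cuda_version": "11.8.0", "torch_version": "2.4.1"},
    {"cuda_version": "12.8.1", "torch_version": "2.7.0"},
    {"os": "ubuntu-22.04", "gpu": "L40S", "runner": "bandb-aws-g6e-4xlarge-plus-use1-public-80"},
    {"os": "ubuntu-22.04", "gpu": "T4", "runner": "bandb-aws-g4dn-4xlarge-plus-use1-public-80"},
    {"os": "windows-2025", "gpu": "T4", "runner": "CUDA-Windows-x64"},
]

jobs = []
for values in product(*matrix.values()):
    combo = dict(zip(matrix.keys(), values))
    if any(all(combo.get(k) == v for k, v in ex.items()) for ex in excludes):
        continue
    for inc in includes:
        if all(combo[k] == v for k, v in inc.items() if k in matrix):
            combo.update({k: v for k, v in inc.items() if k not in matrix})
    jobs.append(combo)

for job in jobs:
    print(job)

Running this prints the five effective jobs (Linux T4/L40S on both CUDA versions, plus Windows T4 on CUDA 11.8), each with its torch_version and runner label, which is a quick way to sanity-check runner swaps like the ones in this patch.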
From cdb85d6f2db6f53875c3af93ca2fba5aeeb516e8 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 19 May 2025 13:01:47 -0400
Subject: [PATCH 3/4] Run tests on last 2 pytorch stable releases

---
 .github/workflows/tests.yml | 52 +++++++++++++++++++++++--------------
 tests/test_functional.py    | 33 ---------------------------------
 tests/test_modules.py       | 12 ++++++------
 3 files changed, 38 insertions(+), 59 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ee0cf4b8f..748a5eaf2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -100,7 +100,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
-        torch_version: ["2.7.0"]
+        torch_version: ["2.6.0", "2.7.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
@@ -138,29 +138,33 @@ jobs:
       - name: Show installed packages
         run: pip list
 
+      - name: Show environment information
+        run: python -m torch.utils.collect_env
+
       - name: Run tests
         run: pytest --durations=100
 
-  cuda-aarch64-tests:
-    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
-    needs: build-cuda
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ubuntu-22.04-arm]
-        arch: [aarch64]
-        torch_version: ["2.7.0"]
-        cuda_version: ["11.8.0", "12.8.1"]
+  # cuda-aarch64-tests:
+  #   if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+  #   needs: build-cuda
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       os: [ubuntu-22.04-arm]
+  #       arch: [aarch64]
+  #       torch_version: ["2.7.0"]
+  #       cuda_version: ["11.8.0", "12.8.1"]
 
-    runs-on: bandb-aws-g5g-4xlarge-plus-use1-public-80
-    env:
-      BNB_TEST_DEVICE: cuda
-    steps:
-      - name: Show GPU Information
-        run: nvidia-smi
+  #   runs-on: bandb-aws-g5g-4xlarge-plus-use1-public-80
+  #   env:
+  #     BNB_TEST_DEVICE: cuda
+  #   steps:
+  #     - name: Show GPU Information
+  #       run: nvidia-smi
 
-      - name: Show pip packages
-        run: pip list
+  #     - name: Show pip packages
+  #       run: pip list
 
 
   cuda-tests:
@@ -172,11 +176,14 @@ jobs:
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
         gpu: [T4, L40S]
-        cuda_version: ["11.8.0", "12.8.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
         include:
           - cuda_version: "11.8.0"
            torch_version: "2.4.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
+          - cuda_version: "12.6.3"
+            torch_version: "2.6.0"
+            pypi_index: "https://download.pytorch.org/whl/cu126"
           - cuda_version: "12.8.1"
             torch_version: "2.7.0"
             pypi_index: "https://download.pytorch.org/whl/cu128"
@@ -198,6 +205,8 @@ jobs:
           # and cannot support CUDA 12+. Skip for now.
           - os: windows-2025
             cuda_version: "12.8.1"
+          - os: windows-2025
+            cuda_version: "12.6.3"
 
           # No Windows L40S runners.
           - os: windows-2025
             gpu: L40S
 
     runs-on: ${{ matrix.runner }}
     env:
       BNB_TEST_DEVICE: cuda
@@ -232,5 +241,8 @@ jobs:
       - name: Show installed packages
         run: pip list
 
+      - name: Show environment information
+        run: python -m torch.utils.collect_env
+
       - name: Run tests
         run: pytest --durations=100
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 96e77e4f4..423a92193 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -929,39 +929,6 @@ def test_spmm_coo_very_sparse(self, dim1, dim2, dtype, out_func):
         # torch.cuda.synchronize()
         # print(time.time() - t0)
 
-    @pytest.mark.parametrize("dim1", [256, 1024], ids=id_formatter("dim1"))
-    @pytest.mark.parametrize("dim2", [256, 1024], ids=id_formatter("dim2"))
-    @pytest.mark.skip("No longer supported")
-    def test_integrated_sparse_decomp(self, dim1, dim2):
-        threshold = 3.0
-        for _ in range(k):
-            A = torch.randn(dim1, dim2).cuda().half()
-            w1 = torch.randn(dim1, dim2).cuda().half()
-            out1 = torch.matmul(A, w1.t())
-
-            Cw1, statsw1, _ = F.int8_vectorwise_quant(w1)
-            CA, statsA, _ = F.int8_vectorwise_quant(A)
-
-            out1_32 = F.int8_linear_matmul(CA, Cw1)
-            out2 = F.int8_mm_dequant(out1_32, statsA, statsw1)
-
-            # CA, statsA, outlier_cols = F.int8_vectorwise_quant(A, threshold=threshold)
-            CA, _, statsA, _, coo_tensor = F.double_quant(A, threshold=threshold)
-
-            out1_32 = F.int8_linear_matmul(CA, Cw1)
-            out3 = F.int8_mm_dequant(out1_32, statsA, statsw1)
-
-            assert coo_tensor is not None
-
-            out4 = F.spmm_coo(coo_tensor, w1.t())
-            # idx = torch.unique(coo_tensor._indices()[1]).long()
-            # out4 = torch.matmul(A, w1.t())
-            out5 = out3 + out4
-
-            err1 = torch.abs(out1 - out2).mean().item()
-            err2 = torch.abs(out1 - out5).mean().item()
-            assert err2 < err1
-
     @pytest.mark.parametrize("dim1", [1 * 2048])
     @pytest.mark.parametrize("dim2", [2048])
     @pytest.mark.parametrize("dtype", [torch.int8])
diff --git a/tests/test_modules.py b/tests/test_modules.py
index dc1d60e6c..c8ec6311a 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -130,7 +130,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert l1.weight.dtype == torch.int8
 
     l1.eval()
-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = l1(b1)
         assert o1.dtype == torch.float16
@@ -139,7 +139,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert mlp.fc1.weight.dtype == torch.int8
     assert mlp.fc2.weight.dtype == torch.int8
 
-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -152,7 +152,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert mlp.fc1.weight.dtype == torch.int8
     assert mlp.fc2.weight.dtype == torch.int8
 
-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -163,7 +163,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
 
     mlp = MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False).half().to(device)
 
-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -185,7 +185,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
         .to(device)
     )
 
-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -207,7 +207,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     w1, w2 = mlp.fc1.weight.clone().to(device), mlp.fc2.weight.clone().to(device)  # grab weights before quantization,
     mlp = mlp.to(device).half()  # and this line triggers quantization
 
-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
From d3ad2942c275251391e5fd6a2871e73aadc4ac2a Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 19 May 2025 13:02:40 -0400
Subject: [PATCH 4/4] Run tests on last 2 pytorch stable releases

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 748a5eaf2..5d2a2708b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -49,7 +49,7 @@ jobs:
   build-cuda:
     strategy:
      matrix:
-        cuda_version: ["11.8.0", "12.8.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
         include:
           - os: ubuntu-22.04
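As a closing illustration, the sketch below is a hypothetical helper (not part of the patches) that mirrors what one cuda-tests matrix entry does after this series: install the torch build paired with a given CUDA version from the pinned index, print the environment (the new collect_env step), and run the suite with BNB_TEST_DEVICE=cuda. The torch/CUDA pairings and index URLs are taken from the workflow diffs above; the helper's name and structure are assumptions for local reproduction only.

# Hypothetical local-reproduction helper (not part of the patches).
# The pairings mirror the include entries added in PATCH 3/4; the steps mirror
# the cuda-tests job: install torch, show the environment, then run pytest.
import os
import subprocess
import sys

TORCH_FOR_CUDA = {
    "11.8.0": ("2.4.1", "https://download.pytorch.org/whl/cu118"),
    "12.6.3": ("2.6.0", "https://download.pytorch.org/whl/cu126"),
    "12.8.1": ("2.7.0", "https://download.pytorch.org/whl/cu128"),
}

def run_like_ci(cuda_version: str) -> None:
    torch_version, index_url = TORCH_FOR_CUDA[cuda_version]
    # Install the torch build matching this CUDA version from the pinned index.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", f"torch=={torch_version}", "--index-url", index_url]
    )
    # Mirrors the new "Show environment information" step.
    subprocess.check_call([sys.executable, "-m", "torch.utils.collect_env"])
    # Mirrors the "Run tests" step, with the device selected via BNB_TEST_DEVICE.
    env = {**os.environ, "BNB_TEST_DEVICE": "cuda"}
    subprocess.check_call([sys.executable, "-m", "pytest", "--durations=100"], env=env)

if __name__ == "__main__":
    run_like_ci("12.8.1")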