From ed47966d56714d221fbd70327f1c50917ed92b1d Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 14 Feb 2026 01:28:56 -0500 Subject: [PATCH 1/3] Update coordinator guide: run only relevant tests, not full suite Worker agents were running the full test suite (10+ min) which is wasteful when only a small area of code changed. Updated the completion workflow to instruct agents to run only relevant test files/functions. The full suite will be run separately later. Co-Authored-By: Claude Opus 4.6 --- agents/coordinator_guide.md | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/agents/coordinator_guide.md b/agents/coordinator_guide.md index dfc5a8b29..2608fe97d 100644 --- a/agents/coordinator_guide.md +++ b/agents/coordinator_guide.md @@ -121,7 +121,7 @@ project before making changes so you can verify your setup works. - Information about existing PRs — what they change, whether they look correct - Anything else the worker agent should know -**5. Your recommended approach.** What you think the fix should look like. Be specific — name files, functions, line numbers. Frame it as guidance, not commands — the worker agent may find things you didn't and should use its own judgment. +**5. Your recommended approach.** What you think the fix should look like. Be specific — name files, functions, line numbers. Frame it as guidance, not commands — the worker agent may find things you didn't and should use its own judgment. Include which specific test file(s) or test function(s) the agent should run to verify its fix — not the full suite. **6. Completion workflow.** Every prompt file must include this section verbatim, with the issue number filled in: @@ -130,20 +130,29 @@ project before making changes so you can verify your setup works. After implementing and verifying the fix: -1. **Commit** your changes with a message referencing the issue: +1. **Run only the tests relevant to your change.** Do NOT run the full + test suite — it takes 10+ minutes and will be run separately later. + Instead, run the specific test file(s) that cover the code you changed: + + pytest tests/test_autograd.py -v --tb=short -k "relevant_test_name" + + If you wrote a new test, run that plus the existing tests in the same + file to check for regressions in that area. + +2. **Commit** your changes with a message referencing the issue: git add git commit -m "Fix (#)" -2. **Push** the branch: +3. **Push** the branch: git push -u origin fix/issue- -3. **Create a pull request** with `gh pr create`. The PR body must +4. **Create a pull request** with `gh pr create`. The PR body must include "Fixes #" so GitHub auto-links and auto-closes the issue on merge. Describe what the fix does and how you verified it. -4. **Post to the bitsandbytes Slack channel** to notify the team. +5. **Post to the bitsandbytes Slack channel** to notify the team. Write a temporary Python script to `/tmp/slack_notify.py` and run it: import json, urllib.request, sys @@ -245,7 +254,8 @@ whether it is correct and complete before implementing from scratch. ## When You Are Done -[the standard completion workflow section with issue number 1810 filled in] +[the standard completion workflow section with issue number 1810 filled in. +Remember: tell the agent to run only the relevant tests, not the full suite.] 
## What NOT to Do From 8bd5e49caabc1ab3ea8f9b1df0a9fad9eae458de Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 14 Feb 2026 08:17:04 -0500 Subject: [PATCH 2/3] fix: Replace hard-coded precision thresholds with std-based bounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Precision tests were flaky because thresholds were set too close to the empirical mean error, leaving insufficient margin for GPU architecture differences. For example, test_4bit_quant for fp4/blocksize=256 used a threshold of 0.2898 + 0.001 = 0.2908, but Blackwell GPUs observed values around 0.2909 — only ~5 sigma from the mean, causing sporadic failures. Collected (mean, std) statistics from 200 samples per configuration on RTX 4090. Thresholds are now set at mean + 7*std, giving ~7 sigma of headroom for the measured GPU and enough margin to accommodate cross-architecture mean shifts (e.g., T4, Blackwell, XPU). Changes in test_functional.py: - test_4bit_quant: error_dict now stores (mean, std) tuples instead of bare means. Removed ad-hoc errtol/reltol special-casing for CPU fp32. - test_gemv_4bit: Replaced complex if/elif threshold tree (with GPU-specific carve-outs like T4 compute cap checks and XPU conditionals) with a clean per-dtype/dim-range (mean, std) table. Individual-sample std is used (not divided by sqrt(iters)) so thresholds naturally accommodate architecture-specific kernel behavior. Changes in test_parametrize.py: - test_replace_parameter_4bit: Same (mean, std) approach as test_4bit_quant. - test_moe_parameter_shape: Replaced flat 0.085/0.25 bounds with measured MoE-tensor-specific (mean, std). - test_different_blocksizes: Same (mean, std) approach as test_4bit_quant. - test_parametrization_forward_method: Replaced flat 0.08/0.25 bounds with small-tensor-specific (mean, std); small 64x64 tensors have ~16x higher relative std than 1024x1024 due to fewer quantization blocks. Co-Authored-By: Claude Opus 4.6 --- tests/test_functional.py | 181 ++++++++++++++++++-------------------- tests/test_parametrize.py | 60 ++++++++++++++----- 2 files changed, 122 insertions(+), 119 deletions(-) diff --git a/tests/test_functional.py b/tests/test_functional.py index d2e3f0847..9f10c016e 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1117,58 +1117,61 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize): relerr = (err / (A1.abs().float() + 1e-8)).mean() err = err.mean() - # The following values were taken from averaging 1k samples per test configuration. - error_dict = dict() - error_dict["fp4"] = dict() - error_dict["nf4"] = dict() - error_dict["fp4"]["err"] = { - 32: 0.088918, - 64: 0.096545, - 128: 0.102947, - 256: 0.108685, - 512: 0.114087, - 1024: 0.119312, - 2048: 0.124460, - 4096: 0.129573, + # Expected (mean, std) per configuration, from 200 samples on RTX 4090. + # Thresholds are set at mean + N_SIGMA * std to avoid flaky failures + # while still catching real regressions. Worst-case std across dtypes is used.
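+        # How the table below was produced (a sketch, not executed by this test; it assumes
+        # bitsandbytes.functional is available as F and uses an illustrative tensor shape):
+        #     errs = []
+        #     for _ in range(200):
+        #         A = torch.randn(1024, 1024, device=device, dtype=dtype)
+        #         q, state = F.quantize_4bit(A, blocksize=blocksize, quant_type=quant_type)
+        #         errs.append((A - F.dequantize_4bit(q, state)).abs().float().mean().item())
+        #     mean, std = torch.tensor(errs).mean().item(), torch.tensor(errs).std().item()
+        #     (the rel_err table uses the same loop with the relative-error formula above)
+        # Worked example: for fp4/blocksize=256 the rel_err limit is 0.289831 + 7 * 0.000497
+        # = 0.293310, comfortably above the ~0.2909 seen on Blackwell, whereas the old limit
+        # of 0.289842 + 0.001 = 0.290842 was not.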
+ N_SIGMA = 7 + error_stats = { + "fp4": { + "err": { + 32: (0.088925, 0.000091), + 64: (0.096543, 0.000111), + 128: (0.102969, 0.000134), + 256: (0.108684, 0.000182), + 512: (0.114115, 0.000234), + 1024: (0.119333, 0.000320), + 2048: (0.124556, 0.000455), + 4096: (0.129536, 0.000612), + }, + "rel_err": { + 32: (0.242443, 0.000330), + 64: (0.260125, 0.000379), + 128: (0.275817, 0.000433), + 256: (0.289831, 0.000497), + 512: (0.302881, 0.000583), + 1024: (0.315000, 0.000757), + 2048: (0.326607, 0.000955), + 4096: (0.337169, 0.001239), + }, + }, + "nf4": { + "err": { + 32: (0.067746, 0.000069), + 64: (0.072798, 0.000074), + 128: (0.076831, 0.000091), + 256: (0.080337, 0.000102), + 512: (0.083547, 0.000143), + 1024: (0.086610, 0.000187), + 2048: (0.089592, 0.000251), + 4096: (0.092547, 0.000360), + }, + "rel_err": { + 32: (0.189726, 0.000304), + 64: (0.203339, 0.000340), + 128: (0.215237, 0.000391), + 256: (0.226105, 0.000398), + 512: (0.236079, 0.000544), + 1024: (0.245370, 0.000600), + 2048: (0.254163, 0.000747), + 4096: (0.262473, 0.000999), + }, + }, } - error_dict["fp4"]["rel_err"] = { - 32: 0.242380, - 64: 0.260130, - 128: 0.275734, - 256: 0.289842, - 512: 0.302852, - 1024: 0.314982, - 2048: 0.326402, - 4096: 0.337228, - } - - error_dict["nf4"]["err"] = { - 32: 0.067745, - 64: 0.072792, - 128: 0.076835, - 256: 0.080326, - 512: 0.083535, - 1024: 0.086603, - 2048: 0.089592, - 4096: 0.092537, - } - error_dict["nf4"]["rel_err"] = { - 32: 0.189700, - 64: 0.203299, - 128: 0.215252, - 256: 0.226044, - 512: 0.236021, - 1024: 0.245365, - 2048: 0.254146, - 4096: 0.262457, - } - - # Allow higher tolerance for fp32 on CPU with larger block sizes - reltol = 2.8e-3 if dtype == torch.float32 and blocksize >= 128 and device == "cpu" else 1e-3 - errtol = 1.2e-3 if dtype == torch.float32 and blocksize >= 1024 and device == "cpu" else 1e-3 - assert err < error_dict[quant_type]["err"][blocksize] + errtol - assert relerr < error_dict[quant_type]["rel_err"][blocksize] + reltol + err_mean, err_std = error_stats[quant_type]["err"][blocksize] + relerr_mean, relerr_std = error_stats[quant_type]["rel_err"][blocksize] + assert err < err_mean + N_SIGMA * err_std, f"abs error {err:.6f} exceeds {err_mean:.6f} + {N_SIGMA}*{err_std:.6f}" + assert relerr < relerr_mean + N_SIGMA * relerr_std, f"rel error {relerr:.6f} exceeds {relerr_mean:.6f} + {N_SIGMA}*{relerr_std:.6f}" @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) @@ -1374,61 +1377,47 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, double_quant, kind): relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 - # for debugging if the tests fails - # - # print('='*80) - # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') - # print(C1.flatten()[-20:]) - # print(C2.flatten()[-20:]) - # print(f'inference vs training abs: {err1}') - # print(f'inference vs training rel: {relerr1}') - # print(f'inference vs training max: {maxerr1}') - # print(f'inference vs training vs torch err ratio abs: {absratio}') - # print(f'inference vs training vs torch err ratio rel: {relratio}') - # print(f'inference vs training vs torch err ratio max: {maxratio}') + # Expected (mean, std) for err1, relerr1, maxerr1 per dtype/dim group. + # Measured from 100 iterations × all storage_type/kind/DQ combos on RTX 4090. 
+ # std is for individual iterations (not the average), so thresholds are generous + # enough to accommodate GPU architecture differences (e.g., T4, XPU, Blackwell). + N_SIGMA = 7 + gemv_thresholds = { + torch.float16: { + "le512": {"err1": (0.000052, 0.0000063), "relerr1": (0.00024, 0.000357), "maxerr1": (0.00042, 0.0000687)}, + "gt512": {"err1": (0.000018, 0.0000028), "relerr1": (0.00010, 0.000197), "maxerr1": (0.00017, 0.0000179)}, + }, + torch.float32: { + "le512": {"err1": (2e-8, 2e-9), "relerr1": (8e-7, 1.2e-6), "maxerr1": (6e-8, 2e-8)}, + "gt512": {"err1": (1e-8, 2e-9), "relerr1": (5e-7, 1.6e-7), "maxerr1": (4e-8, 1e-8)}, + }, + torch.bfloat16: { + "le512": {"err1": (0.00042, 0.000059), "relerr1": (0.0041, 0.01153), "maxerr1": (0.0037, 0.000556)}, + "gt512": {"err1": (0.00014, 0.0000095), "relerr1": (0.0012, 0.000679), "maxerr1": (0.0010, 0.000137)}, + }, + } + + dim_key = "le512" if dim <= 512 else "gt512" + thresholds = gemv_thresholds[dtype][dim_key] + for metric_name, metric_val in [("err1", err1), ("relerr1", relerr1), ("maxerr1", maxerr1)]: + mean_val, std_val = thresholds[metric_name] + limit = mean_val + N_SIGMA * std_val + assert metric_val < limit, ( + f"{metric_name}={metric_val:.8f} exceeds {mean_val:.8f} + {N_SIGMA}*{std_val:.8f} = {limit:.8f} " + f"for {dtype}, dim={dim}, {storage_type}, DQ={double_quant}, {kind}" + ) + + # Ratios check that gemv_4bit and matmul_4bit produce consistent results. + # These are tight bounds on internal consistency, not absolute accuracy. if dtype == torch.float16: - if dim <= 512: - assert err1 < 7e-5 - - # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 - if ( - device == "cuda" - and double_quant - and storage_type == "fp4" - and kind == "fc2" - and torch.cuda.get_device_capability() == (7, 5) - ): - assert relerr1 < 0.00093 - else: - assert relerr1 < 0.0008 - else: - assert err1 < 6e-5 - assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: - if dim <= 512: - assert err1 < 5e-8 - assert relerr1 < 1e-6 - assert maxerr1 < 1.05e-7 - else: - assert err1 < 5e-8 - assert relerr1 < 8e-6 - assert maxerr1 < 1e-7 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.995 assert maxratio < 1.005 and maxratio > 0.995 elif dtype == torch.bfloat16: - if dim <= 512: - relerr_thres = 0.013 if hasattr(torch, "xpu") and torch.xpu.is_available() else 0.007 - assert err1 < 6e-4 - assert relerr1 < relerr_thres - assert maxerr1 < 0.015 - else: - assert err1 < 2e-4 - assert relerr1 < 0.002 - assert maxerr1 < 0.0012 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.05 and relratio > 0.96 assert maxratio < 1.05 and maxratio > 0.97 diff --git a/tests/test_parametrize.py b/tests/test_parametrize.py index be4a6b52c..c23cdca45 100644 --- a/tests/test_parametrize.py +++ b/tests/test_parametrize.py @@ -70,22 +70,26 @@ def test_replace_parameter_4bit(device, dtype, quant_type, compress_statistics, relerr = (err / (original_param.abs().float() + 1e-8)).mean() err_mean = err.mean() - # Expected error bounds from test_functional.py + # Expected (mean, std) from 200 samples on RTX 4090. Worst-case std across dtypes. + # Threshold = mean + N_SIGMA * std avoids flaky failures across GPU architectures. 
+ N_SIGMA = 7 expected_errors = { "nf4": { - 64: {"abs": 0.072792, "rel": 0.203299}, - 128: {"abs": 0.076835, "rel": 0.215252}, - 256: {"abs": 0.080326, "rel": 0.226044}, + 64: {"abs": (0.072796, 0.000072), "rel": (0.203353, 0.000326)}, + 128: {"abs": (0.076839, 0.000093), "rel": (0.215258, 0.000367)}, + 256: {"abs": (0.080322, 0.000100), "rel": (0.226056, 0.000392)}, }, "fp4": { - 64: {"abs": 0.096545, "rel": 0.260130}, - 128: {"abs": 0.102947, "rel": 0.275734}, - 256: {"abs": 0.108685, "rel": 0.289842}, + 64: {"abs": (0.096547, 0.000112), "rel": (0.260144, 0.000379)}, + 128: {"abs": (0.102949, 0.000138), "rel": (0.275763, 0.000391)}, + 256: {"abs": (0.108681, 0.000177), "rel": (0.289835, 0.000507)}, }, } - assert err_mean < expected_errors[quant_type][blocksize]["abs"] + 1e-3, f"Mean abs error {err_mean:.6f} too high" - assert relerr < expected_errors[quant_type][blocksize]["rel"] + 1e-3, f"Mean rel error {relerr:.6f} too high" + abs_mean, abs_std = expected_errors[quant_type][blocksize]["abs"] + rel_mean, rel_std = expected_errors[quant_type][blocksize]["rel"] + assert err_mean < abs_mean + N_SIGMA * abs_std, f"Mean abs error {err_mean:.6f} exceeds {abs_mean:.6f} + {N_SIGMA}*{abs_std:.6f}" + assert relerr < rel_mean + N_SIGMA * rel_std, f"Mean rel error {relerr:.6f} exceeds {rel_mean:.6f} + {N_SIGMA}*{rel_std:.6f}" @pytest.mark.parametrize("device", get_available_devices()) @@ -120,12 +124,13 @@ def __init__(self, device, dtype): relerr = (err / (original_param.abs().float() + 1e-8)).mean() err_mean = err.mean() - # Use slightly looser bounds for higher dimensional tensors - abs_bound = 0.085 # NF4 baseline + margin - rel_bound = 0.25 # NF4 baseline + margin + # Expected (mean, std) for NF4 on MoE-shaped tensors (8x512x256), from 200 samples on RTX 4090. + N_SIGMA = 7 + abs_mean, abs_std = 0.072802, 0.000072 + rel_mean, rel_std = 0.203327, 0.000312 - assert err_mean < abs_bound, f"Mean abs error {err_mean:.6f} too high for shape {param_shape}" - assert relerr < rel_bound, f"Mean rel error {relerr:.6f} too high for shape {param_shape}" + assert err_mean < abs_mean + N_SIGMA * abs_std, f"Mean abs error {err_mean:.6f} exceeds {abs_mean:.6f} + {N_SIGMA}*{abs_std:.6f}" + assert relerr < rel_mean + N_SIGMA * rel_std, f"Mean rel error {relerr:.6f} exceeds {rel_mean:.6f} + {N_SIGMA}*{rel_std:.6f}" @pytest.mark.parametrize("device", get_available_devices()) @@ -349,14 +354,19 @@ def test_different_blocksizes(device, dtype, blocksize): relerr = (err / (original_param.abs().float() + 1e-8)).mean() err_mean = err.mean() - # Expected error bounds from functional tests (using NF4 bounds since that's what we're testing) - expected_abs = {64: 0.072792, 128: 0.076835, 256: 0.080326} - expected_rel = {64: 0.203299, 128: 0.215252, 256: 0.226044} + # Expected (mean, std) for NF4, from 200 samples on RTX 4090. Worst-case std across dtypes. 
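+    # Same NF4 (mean, std) measurements as the expected_errors table in
+    # test_replace_parameter_4bit above.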
+ N_SIGMA = 7 + expected_abs = {64: (0.072796, 0.000072), 128: (0.076839, 0.000093), 256: (0.080322, 0.000100)} + expected_rel = {64: (0.203353, 0.000326), 128: (0.215258, 0.000367), 256: (0.226056, 0.000392)} - assert err_mean < expected_abs[blocksize] + 0.01, ( - f"Mean abs error {err_mean:.6f} too high for blocksize {blocksize}" + abs_mean, abs_std = expected_abs[blocksize] + rel_mean, rel_std = expected_rel[blocksize] + assert err_mean < abs_mean + N_SIGMA * abs_std, ( + f"Mean abs error {err_mean:.6f} exceeds {abs_mean:.6f} + {N_SIGMA}*{abs_std:.6f} for blocksize {blocksize}" + ) + assert relerr < rel_mean + N_SIGMA * rel_std, ( + f"Mean rel error {relerr:.6f} exceeds {rel_mean:.6f} + {N_SIGMA}*{rel_std:.6f} for blocksize {blocksize}" ) - assert relerr < expected_rel[blocksize] + 0.02, f"Mean rel error {relerr:.6f} too high for blocksize {blocksize}" def test_parametrization_forward_method(): @@ -383,9 +393,13 @@ def test_parametrization_forward_method(): relerr = (err / (original_tensor.abs().float() + 1e-8)).mean() err_mean = err.mean() - # Use NF4 bounds from functional tests with small margin - assert err_mean < 0.08, f"Mean abs error {err_mean:.6f} too high" - assert relerr < 0.25, f"Mean rel error {relerr:.6f} too high" + # Expected (mean, std) for NF4 on small 64x64 tensor, from 200 samples on RTX 4090. + # Small tensors have higher variance due to fewer blocks in the quantization. + N_SIGMA = 7 + abs_mean, abs_std = 0.072842, 0.001180 + rel_mean, rel_std = 0.202648, 0.004729 + assert err_mean < abs_mean + N_SIGMA * abs_std, f"Mean abs error {err_mean:.6f} exceeds {abs_mean:.6f} + {N_SIGMA}*{abs_std:.6f}" + assert relerr < rel_mean + N_SIGMA * rel_std, f"Mean rel error {relerr:.6f} exceeds {rel_mean:.6f} + {N_SIGMA}*{rel_std:.6f}" @pytest.mark.parametrize("device", get_available_devices()) From 6517a702e74ad8831d9bf3b769f8ba7dec3a1930 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Sat, 14 Feb 2026 08:48:18 -0500 Subject: [PATCH 3/3] style: Fix ruff lint and format violations - Replace ambiguous unicode multiplication sign with ASCII x - Apply ruff format to long assert lines - Fix test_linear4bit.py pre-existing format violation Co-Authored-By: Claude Opus 4.6 --- tests/test_functional.py | 22 +++++++++++++++++----- tests/test_linear4bit.py | 4 +--- tests/test_parametrize.py | 24 ++++++++++++++++++------ 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/tests/test_functional.py b/tests/test_functional.py index 9f10c016e..5ade05c03 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1170,8 +1170,12 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize): err_mean, err_std = error_stats[quant_type]["err"][blocksize] relerr_mean, relerr_std = error_stats[quant_type]["rel_err"][blocksize] - assert err < err_mean + N_SIGMA * err_std, f"abs error {err:.6f} exceeds {err_mean:.6f} + {N_SIGMA}*{err_std:.6f}" - assert relerr < relerr_mean + N_SIGMA * relerr_std, f"rel error {relerr:.6f} exceeds {relerr_mean:.6f} + {N_SIGMA}*{relerr_std:.6f}" + assert err < err_mean + N_SIGMA * err_std, ( + f"abs error {err:.6f} exceeds {err_mean:.6f} + {N_SIGMA}*{err_std:.6f}" + ) + assert relerr < relerr_mean + N_SIGMA * relerr_std, ( + f"rel error {relerr:.6f} exceeds {relerr_mean:.6f} + {N_SIGMA}*{relerr_std:.6f}" + ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) @@ -1378,14 +1382,22 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, double_quant, kind): maxratio = relerr2 / 
relerr3 # Expected (mean, std) for err1, relerr1, maxerr1 per dtype/dim group. - # Measured from 100 iterations × all storage_type/kind/DQ combos on RTX 4090. + # Measured from 100 iterations x all storage_type/kind/DQ combos on RTX 4090. # std is for individual iterations (not the average), so thresholds are generous # enough to accommodate GPU architecture differences (e.g., T4, XPU, Blackwell). N_SIGMA = 7 gemv_thresholds = { torch.float16: { - "le512": {"err1": (0.000052, 0.0000063), "relerr1": (0.00024, 0.000357), "maxerr1": (0.00042, 0.0000687)}, - "gt512": {"err1": (0.000018, 0.0000028), "relerr1": (0.00010, 0.000197), "maxerr1": (0.00017, 0.0000179)}, + "le512": { + "err1": (0.000052, 0.0000063), + "relerr1": (0.00024, 0.000357), + "maxerr1": (0.00042, 0.0000687), + }, + "gt512": { + "err1": (0.000018, 0.0000028), + "relerr1": (0.00010, 0.000197), + "maxerr1": (0.00017, 0.0000179), + }, }, torch.float32: { "le512": {"err1": (2e-8, 2e-9), "relerr1": (8e-7, 1.2e-6), "maxerr1": (6e-8, 2e-8)}, diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index ee8bafe80..de40d158c 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -276,9 +276,7 @@ def test_quant_storage_shard_roundtrip(device, quant_type, quant_storage): reassembled = torch.cat(shards).reshape(qB.shape) assert reassembled.dtype == qB.dtype - assert torch.equal( - reassembled.view(torch.uint8), qB.view(torch.uint8) - ), "Bytes changed after shard roundtrip" + assert torch.equal(reassembled.view(torch.uint8), qB.view(torch.uint8)), "Bytes changed after shard roundtrip" out = bnb.functional.gemv_4bit(A, reassembled.t(), state=state) torch.testing.assert_close(out, ref) diff --git a/tests/test_parametrize.py b/tests/test_parametrize.py index c23cdca45..001061222 100644 --- a/tests/test_parametrize.py +++ b/tests/test_parametrize.py @@ -88,8 +88,12 @@ def test_replace_parameter_4bit(device, dtype, quant_type, compress_statistics, abs_mean, abs_std = expected_errors[quant_type][blocksize]["abs"] rel_mean, rel_std = expected_errors[quant_type][blocksize]["rel"] - assert err_mean < abs_mean + N_SIGMA * abs_std, f"Mean abs error {err_mean:.6f} exceeds {abs_mean:.6f} + {N_SIGMA}*{abs_std:.6f}" - assert relerr < rel_mean + N_SIGMA * rel_std, f"Mean rel error {relerr:.6f} exceeds {rel_mean:.6f} + {N_SIGMA}*{rel_std:.6f}" + assert err_mean < abs_mean + N_SIGMA * abs_std, ( + f"Mean abs error {err_mean:.6f} exceeds {abs_mean:.6f} + {N_SIGMA}*{abs_std:.6f}" + ) + assert relerr < rel_mean + N_SIGMA * rel_std, ( + f"Mean rel error {relerr:.6f} exceeds {rel_mean:.6f} + {N_SIGMA}*{rel_std:.6f}" + ) @pytest.mark.parametrize("device", get_available_devices()) @@ -129,8 +133,12 @@ def __init__(self, device, dtype): abs_mean, abs_std = 0.072802, 0.000072 rel_mean, rel_std = 0.203327, 0.000312 - assert err_mean < abs_mean + N_SIGMA * abs_std, f"Mean abs error {err_mean:.6f} exceeds {abs_mean:.6f} + {N_SIGMA}*{abs_std:.6f}" - assert relerr < rel_mean + N_SIGMA * rel_std, f"Mean rel error {relerr:.6f} exceeds {rel_mean:.6f} + {N_SIGMA}*{rel_std:.6f}" + assert err_mean < abs_mean + N_SIGMA * abs_std, ( + f"Mean abs error {err_mean:.6f} exceeds {abs_mean:.6f} + {N_SIGMA}*{abs_std:.6f}" + ) + assert relerr < rel_mean + N_SIGMA * rel_std, ( + f"Mean rel error {relerr:.6f} exceeds {rel_mean:.6f} + {N_SIGMA}*{rel_std:.6f}" + ) @pytest.mark.parametrize("device", get_available_devices()) @@ -398,8 +406,12 @@ def test_parametrization_forward_method(): N_SIGMA = 7 abs_mean, abs_std = 0.072842, 0.001180 rel_mean, rel_std = 
0.202648, 0.004729 - assert err_mean < abs_mean + N_SIGMA * abs_std, f"Mean abs error {err_mean:.6f} exceeds {abs_mean:.6f} + {N_SIGMA}*{abs_std:.6f}" - assert relerr < rel_mean + N_SIGMA * rel_std, f"Mean rel error {relerr:.6f} exceeds {rel_mean:.6f} + {N_SIGMA}*{rel_std:.6f}" + assert err_mean < abs_mean + N_SIGMA * abs_std, ( + f"Mean abs error {err_mean:.6f} exceeds {abs_mean:.6f} + {N_SIGMA}*{abs_std:.6f}" + ) + assert relerr < rel_mean + N_SIGMA * rel_std, ( + f"Mean rel error {relerr:.6f} exceeds {rel_mean:.6f} + {N_SIGMA}*{rel_std:.6f}" + ) @pytest.mark.parametrize("device", get_available_devices())