From 62d696348a6fba7a1ffb380307032570c56b5ee7 Mon Sep 17 00:00:00 2001
From: Tim Dettmers
Date: Mon, 16 Feb 2026 08:21:22 -0500
Subject: [PATCH 1/2] Remove deprecated APIs: research module, non-blockwise optimizers, and legacy quantization functions

Remove all remaining deprecated code that has been emitting FutureWarning since v0.45.0 (December 2024). Two prior cleanup rounds (v0.47.0, v0.49.0) already removed the easier items; this finishes the job.

- Delete quantize(), dequantize(), quantize_no_absmax(), dequantize_no_absmax(), optimizer_update_8bit(), percentile_clipping(), and the str2optimizer8bit dispatch table from functional.py
- Remove the non-blockwise 8-bit optimizer path from Optimizer2State and Optimizer1State; LAMB/LARS now use blockwise quantization
- Remove percentile_clipping and block_wise parameters from all ~33 optimizer class constructors
- Delete bitsandbytes/research/ (FP8 matmul, SwitchBack)
- Delete bitsandbytes/nn/triton_based_modules.py, SwitchBackLinearBnb, and the orphaned bitsandbytes/triton/ kernel directory
- Remove dead MatmulLtState fields (CxB, CxBt, formatB, _tile_indices)
- Delete test_deprecated.py, test_triton.py; clean test_autograd.py, test_optim.py, test_functional.py
- Remove benchmarking/switchback/ and update docs

Co-Authored-By: Claude Opus 4.6
---
 benchmarking/switchback/README.md            |   4 -
 benchmarking/switchback/info_a100_py2.jsonl  |  60 ---
 .../switchback/make_plot_with_jsonl.py       | 151 -------
 benchmarking/switchback/plot_with_info.pdf   | Bin 34876 -> 0 bytes
 benchmarking/switchback/speed_benchmark.py   | 160 -------
 bitsandbytes/__init__.py                     |   2 +-
 bitsandbytes/autograd/_functions.py          |  26 +-
 bitsandbytes/backends/utils.py               |   3 +-
 bitsandbytes/functional.py                   | 310 --------------
 bitsandbytes/nn/__init__.py                  |   7 -
 bitsandbytes/nn/modules.py                   |  39 --
 bitsandbytes/nn/triton_based_modules.py      | 264 ------------
 bitsandbytes/optim/adagrad.py                |  25 --
 bitsandbytes/optim/adam.py                   |  48 ---
 bitsandbytes/optim/adamw.py                  |  48 ---
 bitsandbytes/optim/ademamix.py               |   6 -
 bitsandbytes/optim/lamb.py                   |  24 --
 bitsandbytes/optim/lars.py                   |  15 -
 bitsandbytes/optim/lion.py                   |  52 ---
 bitsandbytes/optim/optimizer.py              | 138 +-----
 bitsandbytes/optim/rmsprop.py                |  26 --
 bitsandbytes/optim/sgd.py                    |  24 --
 bitsandbytes/research/__init__.py            |   6 -
 bitsandbytes/research/autograd/__init__.py   |   0
 bitsandbytes/research/autograd/_functions.py | 396 ------------------
 bitsandbytes/research/nn/__init__.py         |   1 -
 bitsandbytes/research/nn/modules.py          |  76 ----
 bitsandbytes/triton/__init__.py              |   0
 bitsandbytes/triton/dequantize_rowwise.py    |  64 ---
 .../triton/int8_matmul_mixed_dequantize.py   | 206 ---------
 .../triton/int8_matmul_rowwise_dequantize.py | 207 ---------
 bitsandbytes/triton/matmul_perf_model.py     | 211 ----------
 .../quantize_columnwise_and_transpose.py     |  75 ----
 bitsandbytes/triton/quantize_global.py       | 124 ------
 bitsandbytes/triton/quantize_rowwise.py      |  67 ---
 bitsandbytes/triton/triton_utils.py          |  11 -
 docs/source/optimizers.mdx                   |   4 +-
 pyproject.toml                               |   3 +-
 tests/test_autograd.py                       |   8 +-
 tests/test_deprecated.py                     | 175 --------
 tests/test_functional.py                     |   1 -
 tests/test_optim.py                          | 106 +----
 tests/test_triton.py                         |  64 ---
 43 files changed, 46 insertions(+), 3191 deletions(-)
 delete mode 100644 benchmarking/switchback/README.md
 delete mode 100644 benchmarking/switchback/info_a100_py2.jsonl
 delete mode 100644 benchmarking/switchback/make_plot_with_jsonl.py
 delete mode 100644 benchmarking/switchback/plot_with_info.pdf
 delete mode 100644 benchmarking/switchback/speed_benchmark.py
 delete mode 100644
bitsandbytes/nn/triton_based_modules.py delete mode 100644 bitsandbytes/research/__init__.py delete mode 100644 bitsandbytes/research/autograd/__init__.py delete mode 100644 bitsandbytes/research/autograd/_functions.py delete mode 100644 bitsandbytes/research/nn/__init__.py delete mode 100644 bitsandbytes/research/nn/modules.py delete mode 100644 bitsandbytes/triton/__init__.py delete mode 100644 bitsandbytes/triton/dequantize_rowwise.py delete mode 100644 bitsandbytes/triton/int8_matmul_mixed_dequantize.py delete mode 100644 bitsandbytes/triton/int8_matmul_rowwise_dequantize.py delete mode 100644 bitsandbytes/triton/matmul_perf_model.py delete mode 100644 bitsandbytes/triton/quantize_columnwise_and_transpose.py delete mode 100644 bitsandbytes/triton/quantize_global.py delete mode 100644 bitsandbytes/triton/quantize_rowwise.py delete mode 100644 bitsandbytes/triton/triton_utils.py delete mode 100644 tests/test_deprecated.py delete mode 100644 tests/test_triton.py diff --git a/benchmarking/switchback/README.md b/benchmarking/switchback/README.md deleted file mode 100644 index b73569030..000000000 --- a/benchmarking/switchback/README.md +++ /dev/null @@ -1,4 +0,0 @@ -Steps: - -1. Run `python speed_benchmark/speed_benchmark.py` which times operations and writes their time to `speed_benchmark/info_a100_py2.jsonl` (change the name of the jsonl to a different name for your profiling). -2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed. diff --git a/benchmarking/switchback/info_a100_py2.jsonl b/benchmarking/switchback/info_a100_py2.jsonl deleted file mode 100644 index 53cda62cf..000000000 --- a/benchmarking/switchback/info_a100_py2.jsonl +++ /dev/null @@ -1,60 +0,0 @@ -{"repeat": 64, "batch_size": 8192, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.28139352798461914, "standard_gw": 0.2811811864376068, "standard_gx": 0.30258670449256897, "rowwise_fwd": 0.1994594931602478, "rowwise_bwd": 0.16159191727638245, "global_fwd": 0.19502267241477966, "global_bwd": 0.16080215573310852, "x_quantize_rowwise": 0.03306940197944641, "g_quantize_rowwise": 0.08210167288780212, "w_quantize_rowwise": 0.03385916352272034, "w_quantize_colwise_transpose": 0.08635595440864563, "w_quantize_global": 0.09237229824066162, "w_quantize_global_transpose": 0.10007619857788086, "time_standard": 0.8651614189147949, "time_rowwise": 0.8776187896728516, "time_global": 0.944625586271286} -{"repeat": 64, "batch_size": 8192, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.262625515460968, "standard_gw": 0.2806223928928375, "standard_gx": 0.31118839979171753, "rowwise_fwd": 0.1828707754611969, "rowwise_bwd": 0.21236762404441833, "global_fwd": 0.16665831208229065, "global_bwd": 0.19929558038711548, "x_quantize_rowwise": 0.08227676153182983, "g_quantize_rowwise": 0.03310292959213257, "w_quantize_rowwise": 0.032648444175720215, "w_quantize_colwise_transpose": 0.09015202522277832, "w_quantize_global": 0.0988692045211792, "w_quantize_global_transpose": 0.10057538747787476, "time_standard": 0.8544363081455231, "time_rowwise": 0.9140409529209137, "time_global": 0.96140056848526} -{"repeat": 64, "batch_size": 16384, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.5731917917728424, "standard_gw": 0.5709454417228699, "standard_gx": 0.5963630974292755, "rowwise_fwd": 0.37662312388420105, "rowwise_bwd": 0.281747430562973, "global_fwd": 
0.36768242716789246, "global_bwd": 0.28043612837791443, "x_quantize_rowwise": 0.046547502279281616, "g_quantize_rowwise": 0.15532970428466797, "w_quantize_rowwise": 0.032436102628707886, "w_quantize_colwise_transpose": 0.08635222911834717, "w_quantize_global": 0.0947415828704834, "w_quantize_global_transpose": 0.10129809379577637, "time_standard": 1.7405003309249878, "time_rowwise": 1.5499815344810486, "time_global": 1.616980880498886} -{"repeat": 64, "batch_size": 16384, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.5341619253158569, "standard_gw": 0.5690865218639374, "standard_gx": 0.599835067987442, "rowwise_fwd": 0.3233291208744049, "rowwise_bwd": 0.41359663009643555, "global_fwd": 0.2831108868122101, "global_bwd": 0.37280842661857605, "x_quantize_rowwise": 0.15563145279884338, "g_quantize_rowwise": 0.046741217374801636, "w_quantize_rowwise": 0.03306940197944641, "w_quantize_colwise_transpose": 0.09020790457725525, "w_quantize_global": 0.0925213098526001, "w_quantize_global_transpose": 0.09945780038833618, "time_standard": 1.7030835151672363, "time_rowwise": 1.6316622495651245, "time_global": 1.6193576157093048} -{"repeat": 64, "batch_size": 32768, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 1.2199915945529938, "standard_gw": 1.1069811880588531, "standard_gx": 1.09761580824852, "rowwise_fwd": 0.738043338060379, "rowwise_bwd": 0.5549229681491852, "global_fwd": 0.7219798862934113, "global_bwd": 0.5512163043022156, "x_quantize_rowwise": 0.08748471736907959, "g_quantize_rowwise": 0.3023110330104828, "w_quantize_rowwise": 0.03182142972946167, "w_quantize_colwise_transpose": 0.08632615208625793, "w_quantize_global": 0.09445473551750183, "w_quantize_global_transpose": 0.10032951831817627, "time_standard": 3.424588590860367, "time_rowwise": 2.9078908264636993, "time_global": 2.9647573828697205} -{"repeat": 64, "batch_size": 32768, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 1.1040829122066498, "standard_gw": 1.1221766471862793, "standard_gx": 1.1548101902008057, "rowwise_fwd": 0.581938773393631, "rowwise_bwd": 0.7480122148990631, "global_fwd": 0.5537159740924835, "global_bwd": 0.7232688367366791, "x_quantize_rowwise": 0.30193477869033813, "g_quantize_rowwise": 0.08745118975639343, "w_quantize_rowwise": 0.03374740481376648, "w_quantize_colwise_transpose": 0.09068101644515991, "w_quantize_global": 0.09645149111747742, "w_quantize_global_transpose": 0.10189786553382874, "time_standard": 3.3810697495937347, "time_rowwise": 2.9659420251846313, "time_global": 2.9868967831134796} -{"repeat": 64, "batch_size": 65536, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 2.4533793330192566, "standard_gw": 2.1938569843769073, "standard_gx": 2.179361879825592, "rowwise_fwd": 1.4615543186664581, "rowwise_bwd": 1.0522231459617615, "global_fwd": 1.4288239181041718, "global_bwd": 1.0450035333633423, "x_quantize_rowwise": 0.1691766083240509, "g_quantize_rowwise": 0.5951300263404846, "w_quantize_rowwise": 0.03337860107421875, "w_quantize_colwise_transpose": 0.08653849363327026, "w_quantize_global": 0.0940859317779541, "w_quantize_global_transpose": 0.09976327419281006, "time_standard": 6.826598197221756, "time_rowwise": 5.5918581783771515, "time_global": 5.625840276479721} -{"repeat": 64, "batch_size": 65536, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 2.1698065102100372, "standard_gw": 2.1875128149986267, "standard_gx": 2.2887587547302246, "rowwise_fwd": 
1.0762326419353485, "rowwise_bwd": 1.4638006687164307, "global_fwd": 1.0450668632984161, "global_bwd": 1.4308765530586243, "x_quantize_rowwise": 0.5953535437583923, "g_quantize_rowwise": 0.16899779438972473, "w_quantize_rowwise": 0.03240257501602173, "w_quantize_colwise_transpose": 0.09106099605560303, "w_quantize_global": 0.09546056389808655, "w_quantize_global_transpose": 0.09852275252342224, "time_standard": 6.6460780799388885, "time_rowwise": 5.615361034870148, "time_global": 5.621790885925293} -{"repeat": 64, "batch_size": 131072, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 4.858218133449554, "standard_gw": 4.3631307780742645, "standard_gx": 4.404045641422272, "rowwise_fwd": 2.9063820838928223, "rowwise_bwd": 2.094462513923645, "global_fwd": 2.8426870703697205, "global_bwd": 2.0792782306671143, "x_quantize_rowwise": 0.33241137862205505, "g_quantize_rowwise": 1.1817105114459991, "w_quantize_rowwise": 0.03374367952346802, "w_quantize_colwise_transpose": 0.08633732795715332, "w_quantize_global": 0.09231641888618469, "w_quantize_global_transpose": 0.100012868642807, "time_standard": 13.62539455294609, "time_rowwise": 10.998178273439407, "time_global": 10.991547256708145} -{"repeat": 64, "batch_size": 131072, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 4.246581345796585, "standard_gw": 4.42587211728096, "standard_gx": 4.581417888402939, "rowwise_fwd": 2.1114833652973175, "rowwise_bwd": 2.9050447046756744, "global_fwd": 2.0806826651096344, "global_bwd": 2.85966694355011, "x_quantize_rowwise": 1.1816024780273438, "g_quantize_rowwise": 0.33330172300338745, "w_quantize_rowwise": 0.033445656299591064, "w_quantize_colwise_transpose": 0.09065866470336914, "w_quantize_global": 0.09239837527275085, "w_quantize_global_transpose": 0.09984523057937622, "time_standard": 13.253871351480484, "time_rowwise": 11.081408709287643, "time_global": 11.073369532823563} -{"repeat": 64, "batch_size": 8192, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 0.4859529435634613, "standard_gw": 0.46338513493537903, "standard_gx": 0.42321905493736267, "rowwise_fwd": 0.2761557698249817, "rowwise_bwd": 0.20775198936462402, "global_fwd": 0.2713911235332489, "global_bwd": 0.20639970898628235, "x_quantize_rowwise": 0.033095479011535645, "g_quantize_rowwise": 0.11894106864929199, "w_quantize_rowwise": 0.03125518560409546, "w_quantize_colwise_transpose": 0.1424551010131836, "w_quantize_global": 0.07288157939910889, "w_quantize_global_transpose": 0.08071959018707275, "time_standard": 1.372557133436203, "time_rowwise": 1.2730397284030914, "time_global": 1.2468136847019196} -{"repeat": 64, "batch_size": 8192, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.3920421004295349, "standard_gw": 0.44424086809158325, "standard_gx": 0.4759356379508972, "rowwise_fwd": 0.23231282830238342, "rowwise_bwd": 0.28430670499801636, "global_fwd": 0.20883232355117798, "global_bwd": 0.2741999924182892, "x_quantize_rowwise": 0.12018159031867981, "g_quantize_rowwise": 0.03195926547050476, "w_quantize_rowwise": 0.026017427444458008, "w_quantize_colwise_transpose": 0.14733895659446716, "w_quantize_global": 0.07734447717666626, "w_quantize_global_transpose": 0.0788569450378418, "time_standard": 1.3122186064720154, "time_rowwise": 1.2863576412200928, "time_global": 1.235615462064743} -{"repeat": 64, "batch_size": 16384, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 1.0111741721630096, "standard_gw": 
0.9267590939998627, "standard_gx": 0.8254274725914001, "rowwise_fwd": 0.5434826016426086, "rowwise_bwd": 0.4077926278114319, "global_fwd": 0.5318708717823029, "global_bwd": 0.40537863969802856, "x_quantize_rowwise": 0.059738755226135254, "g_quantize_rowwise": 0.2299174666404724, "w_quantize_rowwise": 0.02545863389968872, "w_quantize_colwise_transpose": 0.14269724488258362, "w_quantize_global": 0.07300823926925659, "w_quantize_global_transpose": 0.07878988981246948, "time_standard": 2.7633607387542725, "time_rowwise": 2.335846424102783, "time_global": 2.305462956428528} -{"repeat": 64, "batch_size": 16384, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.8095316588878632, "standard_gw": 0.8607134222984314, "standard_gx": 0.9204968810081482, "rowwise_fwd": 0.4275888204574585, "rowwise_bwd": 0.5485899746417999, "global_fwd": 0.41000545024871826, "global_bwd": 0.5317628383636475, "x_quantize_rowwise": 0.2301819622516632, "g_quantize_rowwise": 0.059254467487335205, "w_quantize_rowwise": 0.02466142177581787, "w_quantize_colwise_transpose": 0.14865398406982422, "w_quantize_global": 0.07582828402519226, "w_quantize_global_transpose": 0.08231401443481445, "time_standard": 2.5907419621944427, "time_rowwise": 2.2996440529823303, "time_global": 2.2500604391098022} -{"repeat": 64, "batch_size": 32768, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 2.0658522844314575, "standard_gw": 1.718364655971527, "standard_gx": 1.6660578548908234, "rowwise_fwd": 1.066897064447403, "rowwise_bwd": 0.8070804178714752, "global_fwd": 1.0473169386386871, "global_bwd": 0.8021742105484009, "x_quantize_rowwise": 0.11274218559265137, "g_quantize_rowwise": 0.4518181085586548, "w_quantize_rowwise": 0.026501715183258057, "w_quantize_colwise_transpose": 0.14259666204452515, "w_quantize_global": 0.07484853267669678, "w_quantize_global_transpose": 0.07976219058036804, "time_standard": 5.450274795293808, "time_rowwise": 4.326000809669495, "time_global": 4.287026822566986} -{"repeat": 64, "batch_size": 32768, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 2.7549192309379578, "standard_gw": 1.6954988241195679, "standard_gx": 1.8179528415203094, "rowwise_fwd": 0.8649080991744995, "rowwise_bwd": 1.0746456682682037, "global_fwd": 0.8023083209991455, "global_bwd": 1.0471977293491364, "x_quantize_rowwise": 0.45225024223327637, "g_quantize_rowwise": 0.11286512017250061, "w_quantize_rowwise": 0.0252649188041687, "w_quantize_colwise_transpose": 0.14732033014297485, "w_quantize_global": 0.07537379860877991, "w_quantize_global_transpose": 0.0807642936706543, "time_standard": 6.268370896577835, "time_rowwise": 4.372753202915192, "time_global": 4.266258329153061} -{"repeat": 64, "batch_size": 65536, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 4.098430275917053, "standard_gw": 3.3501461148262024, "standard_gx": 5.560480058193207, "rowwise_fwd": 2.112947404384613, "rowwise_bwd": 1.605246216058731, "global_fwd": 2.0697638392448425, "global_bwd": 1.5953518450260162, "x_quantize_rowwise": 0.21921470761299133, "g_quantize_rowwise": 0.8956789970397949, "w_quantize_rowwise": 0.02710893750190735, "w_quantize_colwise_transpose": 0.14268234372138977, "w_quantize_global": 0.07259473204612732, "w_quantize_global_transpose": 0.07899105548858643, "time_standard": 13.009056448936462, "time_rowwise": 8.35302472114563, "time_global": 8.281741291284561} -{"repeat": 64, "batch_size": 65536, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, 
"standard_fwd": 5.586959421634674, "standard_gw": 3.358360379934311, "standard_gx": 3.6434978246688843, "rowwise_fwd": 1.6269534826278687, "rowwise_bwd": 2.128206193447113, "global_fwd": 1.5950687229633331, "global_bwd": 2.0831897854804993, "x_quantize_rowwise": 0.8954145014286041, "g_quantize_rowwise": 0.21914392709732056, "w_quantize_rowwise": 0.026203691959381104, "w_quantize_colwise_transpose": 0.14658644795417786, "w_quantize_global": 0.07478520274162292, "w_quantize_global_transpose": 0.07964670658111572, "time_standard": 12.58881762623787, "time_rowwise": 8.400868624448776, "time_global": 8.305609226226807} -{"repeat": 64, "batch_size": 131072, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 8.229725062847137, "standard_gw": 6.791356950998306, "standard_gx": 6.806455552577972, "rowwise_fwd": 4.252471029758453, "rowwise_bwd": 3.2062679529190063, "global_fwd": 4.175614565610886, "global_bwd": 3.1837262213230133, "x_quantize_rowwise": 0.4321373999118805, "g_quantize_rowwise": 1.787092536687851, "w_quantize_rowwise": 0.0270158052444458, "w_quantize_colwise_transpose": 0.1424252986907959, "w_quantize_global": 0.07348507642745972, "w_quantize_global_transpose": 0.07829815149307251, "time_standard": 21.827537566423416, "time_rowwise": 16.63876697421074, "time_global": 16.52171090245247} -{"repeat": 64, "batch_size": 131072, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 11.279478669166565, "standard_gw": 6.7345499992370605, "standard_gx": 7.206875830888748, "rowwise_fwd": 3.209315240383148, "rowwise_bwd": 4.256397485733032, "global_fwd": 3.180190920829773, "global_bwd": 4.177983850240707, "x_quantize_rowwise": 1.7836056649684906, "g_quantize_rowwise": 0.4321075975894928, "w_quantize_rowwise": 0.03205239772796631, "w_quantize_colwise_transpose": 0.14675036072731018, "w_quantize_global": 0.09316205978393555, "w_quantize_global_transpose": 0.10086596012115479, "time_standard": 25.220904499292374, "time_rowwise": 16.5947787463665, "time_global": 16.502466052770615} -{"repeat": 64, "batch_size": 8192, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 0.5776733160018921, "standard_gw": 0.5300231277942657, "standard_gx": 0.6005913019180298, "rowwise_fwd": 0.33330172300338745, "rowwise_bwd": 0.2957060933113098, "global_fwd": 0.32876431941986084, "global_bwd": 0.29108673334121704, "x_quantize_rowwise": 0.03466755151748657, "g_quantize_rowwise": 0.12264400720596313, "w_quantize_rowwise": 0.033874064683914185, "w_quantize_colwise_transpose": 0.1775398850440979, "w_quantize_global": 0.09503215551376343, "w_quantize_global_transpose": 0.10617449879646301, "time_standard": 1.7082877457141876, "time_rowwise": 1.5277564525604248, "time_global": 1.5083923935890198} -{"repeat": 64, "batch_size": 8192, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 0.5164109170436859, "standard_gw": 0.5367249250411987, "standard_gx": 0.5876161158084869, "rowwise_fwd": 0.3132447600364685, "rowwise_bwd": 0.3396235406398773, "global_fwd": 0.2943649888038635, "global_bwd": 0.33209100365638733, "x_quantize_rowwise": 0.12357160449028015, "g_quantize_rowwise": 0.035997480154037476, "w_quantize_rowwise": 0.03213062882423401, "w_quantize_colwise_transpose": 0.17676874995231628, "w_quantize_global": 0.09861215949058533, "w_quantize_global_transpose": 0.0998862087726593, "time_standard": 1.6407519578933716, "time_rowwise": 1.5580616891384125, "time_global": 1.5212483704090118} -{"repeat": 64, "batch_size": 16384, "dim_out": 5632, 
"dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 1.2096501886844635, "standard_gw": 1.0663382709026337, "standard_gx": 1.0961703956127167, "rowwise_fwd": 0.6396733224391937, "rowwise_bwd": 0.5173943936824799, "global_fwd": 0.6296299397945404, "global_bwd": 0.5130060017108917, "x_quantize_rowwise": 0.06211921572685242, "g_quantize_rowwise": 0.2361498773097992, "w_quantize_rowwise": 0.03260001540184021, "w_quantize_colwise_transpose": 0.17679482698440552, "w_quantize_global": 0.09361281991004944, "w_quantize_global_transpose": 0.09913742542266846, "time_standard": 3.372158855199814, "time_rowwise": 2.7310699224472046, "time_global": 2.6999935507774353} -{"repeat": 64, "batch_size": 16384, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 1.1065565049648285, "standard_gw": 1.0664314031600952, "standard_gx": 1.1266544461250305, "rowwise_fwd": 0.5352050065994263, "rowwise_bwd": 0.6464086472988129, "global_fwd": 0.513765960931778, "global_bwd": 0.6284862756729126, "x_quantize_rowwise": 0.23620948195457458, "g_quantize_rowwise": 0.062271952629089355, "w_quantize_rowwise": 0.031460076570510864, "w_quantize_colwise_transpose": 0.17675384879112244, "w_quantize_global": 0.09486451745033264, "w_quantize_global_transpose": 0.09898096323013306, "time_standard": 3.2996423542499542, "time_rowwise": 2.7547404170036316, "time_global": 2.7010105550289154} -{"repeat": 64, "batch_size": 32768, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 2.4367496371269226, "standard_gw": 2.0806193351745605, "standard_gx": 2.19624862074852, "rowwise_fwd": 1.2554042041301727, "rowwise_bwd": 1.0227933526039124, "global_fwd": 1.2322552502155304, "global_bwd": 1.0152235627174377, "x_quantize_rowwise": 0.11792033910751343, "g_quantize_rowwise": 0.4639364778995514, "w_quantize_rowwise": 0.03241002559661865, "w_quantize_colwise_transpose": 0.17657503485679626, "w_quantize_global": 0.09655207395553589, "w_quantize_global_transpose": 0.09958073496818542, "time_standard": 6.713617593050003, "time_rowwise": 5.149658769369125, "time_global": 5.106087774038315} -{"repeat": 64, "batch_size": 32768, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 2.1935217082500458, "standard_gw": 2.0055584609508514, "standard_gx": 2.1882541477680206, "rowwise_fwd": 1.0396353900432587, "rowwise_bwd": 1.2542344629764557, "global_fwd": 1.0161921381950378, "global_bwd": 1.233428716659546, "x_quantize_rowwise": 0.4642195999622345, "g_quantize_rowwise": 0.11782720685005188, "w_quantize_rowwise": 0.033117830753326416, "w_quantize_colwise_transpose": 0.17696991562843323, "w_quantize_global": 0.09416043758392334, "w_quantize_global_transpose": 0.10101497173309326, "time_standard": 6.387334316968918, "time_rowwise": 5.091562867164612, "time_global": 5.032401531934738} -{"repeat": 64, "batch_size": 65536, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 4.804681986570358, "standard_gw": 4.763372242450714, "standard_gx": 4.064023494720459, "rowwise_fwd": 2.484843134880066, "rowwise_bwd": 1.9691288471221924, "global_fwd": 2.441786229610443, "global_bwd": 1.9574686884880066, "x_quantize_rowwise": 0.2294592559337616, "g_quantize_rowwise": 0.9196549654006958, "w_quantize_rowwise": 0.0313781201839447, "w_quantize_colwise_transpose": 0.1768544316291809, "w_quantize_global": 0.09644776582717896, "w_quantize_global_transpose": 0.09847059845924377, "time_standard": 13.632077723741531, "time_rowwise": 10.574690997600555, "time_global": 10.506659746170044} 
-{"repeat": 64, "batch_size": 65536, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 4.0907710790634155, "standard_gw": 3.9793066680431366, "standard_gx": 4.302978515625, "rowwise_fwd": 1.992940902709961, "rowwise_bwd": 2.4996213614940643, "global_fwd": 1.9551962614059448, "global_bwd": 2.457551658153534, "x_quantize_rowwise": 0.9200014173984528, "g_quantize_rowwise": 0.2293996512889862, "w_quantize_rowwise": 0.0313781201839447, "w_quantize_colwise_transpose": 0.17882883548736572, "w_quantize_global": 0.09540095925331116, "w_quantize_global_transpose": 0.09880587458610535, "time_standard": 12.373056262731552, "time_rowwise": 9.831476956605911, "time_global": 9.73566249012947} -{"repeat": 64, "batch_size": 131072, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 9.655728936195374, "standard_gw": 8.261296898126602, "standard_gx": 8.064884692430496, "rowwise_fwd": 5.007706582546234, "rowwise_bwd": 3.8615092635154724, "global_fwd": 4.920527338981628, "global_bwd": 3.8330331444740295, "x_quantize_rowwise": 0.45276060700416565, "g_quantize_rowwise": 1.8306002020835876, "w_quantize_rowwise": 0.031366944313049316, "w_quantize_colwise_transpose": 0.1766495406627655, "w_quantize_global": 0.09412690997123718, "w_quantize_global_transpose": 0.09780004620552063, "time_standard": 25.981910526752472, "time_rowwise": 19.621890038251877, "time_global": 19.49014514684677} -{"repeat": 64, "batch_size": 131072, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 8.033104240894318, "standard_gw": 8.2889124751091, "standard_gx": 8.622754365205765, "rowwise_fwd": 3.8747042417526245, "rowwise_bwd": 5.003921687602997, "global_fwd": 3.8315393030643463, "global_bwd": 4.9162134528160095, "x_quantize_rowwise": 1.8304847180843353, "g_quantize_rowwise": 0.4522763192653656, "w_quantize_rowwise": 0.03413110971450806, "w_quantize_colwise_transpose": 0.1771189272403717, "w_quantize_global": 0.09519979357719421, "w_quantize_global_transpose": 0.09930506348609924, "time_standard": 24.944771081209183, "time_rowwise": 19.661549478769302, "time_global": 19.51393112540245} -{"repeat": 64, "batch_size": 8192, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 0.7954612374305725, "standard_gw": 0.7456131279468536, "standard_gx": 0.8799619972705841, "rowwise_fwd": 0.43267011642456055, "rowwise_bwd": 0.34622475504875183, "global_fwd": 0.42615458369255066, "global_bwd": 0.344250351190567, "x_quantize_rowwise": 0.03748014569282532, "g_quantize_rowwise": 0.13304129242897034, "w_quantize_rowwise": 0.03294646739959717, "w_quantize_colwise_transpose": 0.2407953143119812, "w_quantize_global": 0.094633549451828, "w_quantize_global_transpose": 0.10305643081665039, "time_standard": 2.4210363626480103, "time_rowwise": 1.96877121925354, "time_global": 1.8842294812202454} -{"repeat": 64, "batch_size": 8192, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 0.7120333611965179, "standard_gw": 0.7622130215167999, "standard_gx": 0.8262209594249725, "rowwise_fwd": 0.3702230751514435, "rowwise_bwd": 0.4419572651386261, "global_fwd": 0.3479123115539551, "global_bwd": 0.4306286573410034, "x_quantize_rowwise": 0.13308599591255188, "g_quantize_rowwise": 0.037495046854019165, "w_quantize_rowwise": 0.03398209810256958, "w_quantize_colwise_transpose": 0.23782625794410706, "w_quantize_global": 0.09853765368461609, "w_quantize_global_transpose": 0.10247156023979187, "time_standard": 2.3004673421382904, "time_rowwise": 2.016782760620117, 
"time_global": 1.9123442471027374} -{"repeat": 64, "batch_size": 16384, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 1.6292817890644073, "standard_gw": 1.5109702944755554, "standard_gx": 1.482747495174408, "rowwise_fwd": 0.8386112749576569, "rowwise_bwd": 0.6844550371170044, "global_fwd": 0.8220970630645752, "global_bwd": 0.6802082061767578, "x_quantize_rowwise": 0.06883963942527771, "g_quantize_rowwise": 0.25641173124313354, "w_quantize_rowwise": 0.033054500818252563, "w_quantize_colwise_transpose": 0.24027004837989807, "w_quantize_global": 0.0967271625995636, "w_quantize_global_transpose": 0.102948397397995, "time_standard": 4.622999578714371, "time_rowwise": 3.6326125264167786, "time_global": 3.5382024943828583} -{"repeat": 64, "batch_size": 16384, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 1.4877021312713623, "standard_gw": 1.5015341341495514, "standard_gx": 1.529306173324585, "rowwise_fwd": 0.715944916009903, "rowwise_bwd": 0.8529908955097198, "global_fwd": 0.680088996887207, "global_bwd": 0.8224695920944214, "x_quantize_rowwise": 0.2568177878856659, "g_quantize_rowwise": 0.06864592432975769, "w_quantize_rowwise": 0.03343448042869568, "w_quantize_colwise_transpose": 0.23645907640457153, "w_quantize_global": 0.09399279952049255, "w_quantize_global_transpose": 0.10286271572113037, "time_standard": 4.518542438745499, "time_rowwise": 3.665827214717865, "time_global": 3.5264119505882263} -{"repeat": 64, "batch_size": 32768, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 3.261040896177292, "standard_gw": 2.8816498816013336, "standard_gx": 2.8357282280921936, "rowwise_fwd": 1.6594752669334412, "rowwise_bwd": 1.359265297651291, "global_fwd": 1.6287527978420258, "global_bwd": 1.3503879308700562, "x_quantize_rowwise": 0.13146549463272095, "g_quantize_rowwise": 0.5035959184169769, "w_quantize_rowwise": 0.03438442945480347, "w_quantize_colwise_transpose": 0.24086236953735352, "w_quantize_global": 0.0945068895816803, "w_quantize_global_transpose": 0.10332837700843811, "time_standard": 8.978419005870819, "time_rowwise": 6.8106986582279205, "time_global": 6.693687289953232} -{"repeat": 64, "batch_size": 32768, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 2.848360687494278, "standard_gw": 2.8955675661563873, "standard_gx": 3.0499882996082306, "rowwise_fwd": 1.3900883495807648, "rowwise_bwd": 1.6595833003520966, "global_fwd": 1.3514049351215363, "global_bwd": 1.629263162612915, "x_quantize_rowwise": 0.5036592483520508, "g_quantize_rowwise": 0.13118237257003784, "w_quantize_rowwise": 0.03438442945480347, "w_quantize_colwise_transpose": 0.23709610104560852, "w_quantize_global": 0.0951625406742096, "w_quantize_global_transpose": 0.10216236114501953, "time_standard": 8.793916553258896, "time_rowwise": 6.851561367511749, "time_global": 6.708402186632156} -{"repeat": 64, "batch_size": 65536, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 6.4978525042533875, "standard_gw": 6.462603807449341, "standard_gx": 5.5987648665905, "rowwise_fwd": 3.2996535301208496, "rowwise_bwd": 2.6320070028305054, "global_fwd": 3.2426007091999054, "global_bwd": 2.612769603729248, "x_quantize_rowwise": 0.2561397850513458, "g_quantize_rowwise": 0.9984448552131653, "w_quantize_rowwise": 0.033076852560043335, "w_quantize_colwise_transpose": 0.24232640862464905, "w_quantize_global": 0.09618699550628662, "w_quantize_global_transpose": 0.10257214307785034, "time_standard": 18.559221178293228, 
"time_rowwise": 13.9242522418499, "time_global": 13.771317899227142} -{"repeat": 64, "batch_size": 65536, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 5.5702440440654755, "standard_gw": 5.717620253562927, "standard_gx": 6.08203187584877, "rowwise_fwd": 2.649586647748947, "rowwise_bwd": 3.315173089504242, "global_fwd": 2.6132799685001373, "global_bwd": 3.257807344198227, "x_quantize_rowwise": 0.9980201721191406, "g_quantize_rowwise": 0.256560742855072, "w_quantize_rowwise": 0.03356859087944031, "w_quantize_colwise_transpose": 0.23729726672172546, "w_quantize_global": 0.09495764970779419, "w_quantize_global_transpose": 0.103779137134552, "time_standard": 17.369896173477173, "time_rowwise": 13.207826763391495, "time_global": 13.04202526807785} -{"repeat": 64, "batch_size": 131072, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 13.058379292488098, "standard_gw": 11.480242013931274, "standard_gx": 11.092845350503922, "rowwise_fwd": 6.637874990701675, "rowwise_bwd": 5.24790957570076, "global_fwd": 6.521012634038925, "global_bwd": 5.214303731918335, "x_quantize_rowwise": 0.5057565867900848, "g_quantize_rowwise": 1.989319920539856, "w_quantize_rowwise": 0.03439188003540039, "w_quantize_colwise_transpose": 0.24280324578285217, "w_quantize_global": 0.09520724415779114, "w_quantize_global_transpose": 0.10240450501441956, "time_standard": 35.631466656923294, "time_rowwise": 26.138298213481903, "time_global": 25.908246636390686} -{"repeat": 64, "batch_size": 131072, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 11.13397628068924, "standard_gw": 11.371888220310211, "standard_gx": 12.12756335735321, "rowwise_fwd": 5.2495077252388, "rowwise_bwd": 6.638709455728531, "global_fwd": 5.215313285589218, "global_bwd": 6.5222084522247314, "x_quantize_rowwise": 1.9870512187480927, "g_quantize_rowwise": 0.5058236420154572, "w_quantize_rowwise": 0.034634023904800415, "w_quantize_colwise_transpose": 0.23674964904785156, "w_quantize_global": 0.09457767009735107, "w_quantize_global_transpose": 0.10183081030845642, "time_standard": 34.63342785835266, "time_rowwise": 26.024363934993744, "time_global": 25.798693299293518} -{"repeat": 64, "batch_size": 8192, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 1.2125298380851746, "standard_gw": 1.1111274361610413, "standard_gx": 1.0840706527233124, "rowwise_fwd": 0.6057210266590118, "rowwise_bwd": 0.51865354180336, "global_fwd": 0.5952082574367523, "global_bwd": 0.5167685449123383, "x_quantize_rowwise": 0.045686960220336914, "g_quantize_rowwise": 0.15827640891075134, "w_quantize_rowwise": 0.04361197352409363, "w_quantize_colwise_transpose": 0.34067779779434204, "w_quantize_global": 0.13644620776176453, "w_quantize_global_transpose": 0.14925003051757812, "time_standard": 3.407727926969528, "time_rowwise": 2.823755145072937, "time_global": 2.7127638459205627} -{"repeat": 64, "batch_size": 8192, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 1.0731369256973267, "standard_gw": 1.1365897953510284, "standard_gx": 1.1498592793941498, "rowwise_fwd": 0.5573518574237823, "rowwise_bwd": 0.615488737821579, "global_fwd": 0.5220361053943634, "global_bwd": 0.5939789116382599, "x_quantize_rowwise": 0.15765801072120667, "g_quantize_rowwise": 0.04369020462036133, "w_quantize_rowwise": 0.047359615564346313, "w_quantize_colwise_transpose": 0.5526281893253326, "w_quantize_global": 0.13606995344161987, "w_quantize_global_transpose": 0.15017390251159668, 
"time_standard": 3.359586000442505, "time_rowwise": 3.1107664108276367, "time_global": 2.7401968836784363} -{"repeat": 64, "batch_size": 16384, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 2.4274885654449463, "standard_gw": 2.1799951791763306, "standard_gx": 2.1426528692245483, "rowwise_fwd": 1.195710152387619, "rowwise_bwd": 1.027170568704605, "global_fwd": 1.1747106909751892, "global_bwd": 1.0251589119434357, "x_quantize_rowwise": 0.08098781108856201, "g_quantize_rowwise": 0.3052949905395508, "w_quantize_rowwise": 0.043764710426330566, "w_quantize_colwise_transpose": 0.33987686038017273, "w_quantize_global": 0.13646483421325684, "w_quantize_global_transpose": 0.14739856123924255, "time_standard": 6.750136613845825, "time_rowwise": 5.172800272703171, "time_global": 5.050010979175568} -{"repeat": 64, "batch_size": 16384, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 2.1661892533302307, "standard_gw": 2.0948275923728943, "standard_gx": 2.306375652551651, "rowwise_fwd": 1.0587647557258606, "rowwise_bwd": 1.1999905109405518, "global_fwd": 1.0296404361724854, "global_bwd": 1.1749230325222015, "x_quantize_rowwise": 0.3054030239582062, "g_quantize_rowwise": 0.08077546954154968, "w_quantize_rowwise": 0.047225505113601685, "w_quantize_colwise_transpose": 0.600133091211319, "w_quantize_global": 0.13613328337669373, "w_quantize_global_transpose": 0.1484006643295288, "time_standard": 6.567392498254776, "time_rowwise": 5.387119948863983, "time_global": 4.97010350227356} -{"repeat": 64, "batch_size": 32768, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 4.807606339454651, "standard_gw": 4.170913249254227, "standard_gx": 4.117622971534729, "rowwise_fwd": 2.370934933423996, "rowwise_bwd": 1.9481778144836426, "global_fwd": 2.3383721709251404, "global_bwd": 1.9443817436695099, "x_quantize_rowwise": 0.1547597348690033, "g_quantize_rowwise": 0.6000511348247528, "w_quantize_rowwise": 0.04361942410469055, "w_quantize_colwise_transpose": 0.3403201699256897, "w_quantize_global": 0.13600289821624756, "w_quantize_global_transpose": 0.1474134624004364, "time_standard": 13.096142560243607, "time_rowwise": 9.628776460886002, "time_global": 9.491894394159317} -{"repeat": 64, "batch_size": 32768, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 4.1619837284088135, "standard_gw": 4.181284457445145, "standard_gx": 4.635505378246307, "rowwise_fwd": 1.9684135913848877, "rowwise_bwd": 2.3750364780426025, "global_fwd": 1.9445866346359253, "global_bwd": 2.3551955819129944, "x_quantize_rowwise": 0.6004162132740021, "g_quantize_rowwise": 0.15468522906303406, "w_quantize_rowwise": 0.04730746150016785, "w_quantize_colwise_transpose": 0.5999617278575897, "w_quantize_global": 0.1364201307296753, "w_quantize_global_transpose": 0.14847144484519958, "time_standard": 12.978773564100266, "time_rowwise": 9.927105158567429, "time_global": 9.521059691905975} -{"repeat": 64, "batch_size": 65536, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 9.52371209859848, "standard_gw": 8.354485034942627, "standard_gx": 8.69860127568245, "rowwise_fwd": 4.717472940683365, "rowwise_bwd": 3.8843750953674316, "global_fwd": 4.645414650440216, "global_bwd": 3.8761012256145477, "x_quantize_rowwise": 0.3024861216545105, "g_quantize_rowwise": 1.1897757649421692, "w_quantize_rowwise": 0.04366785287857056, "w_quantize_colwise_transpose": 0.33988431096076965, "w_quantize_global": 0.1359507441520691, 
"w_quantize_global_transpose": 0.14724582433700562, "time_standard": 26.576798409223557, "time_rowwise": 18.832147121429443, "time_global": 18.651459366083145} -{"repeat": 64, "batch_size": 65536, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 8.307881653308868, "standard_gw": 8.214320987462997, "standard_gx": 9.21182706952095, "rowwise_fwd": 3.8919784128665924, "rowwise_bwd": 4.72346693277359, "global_fwd": 3.8761794567108154, "global_bwd": 4.673641175031662, "x_quantize_rowwise": 1.1893920600414276, "g_quantize_rowwise": 0.3024972975254059, "w_quantize_rowwise": 0.04708021879196167, "w_quantize_colwise_transpose": 0.6039328873157501, "w_quantize_global": 0.13624504208564758, "w_quantize_global_transpose": 0.14867261052131653, "time_standard": 25.734029710292816, "time_rowwise": 18.972668796777725, "time_global": 18.540948629379272} -{"repeat": 64, "batch_size": 131072, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 19.30372044444084, "standard_gw": 16.480475664138794, "standard_gx": 17.61433482170105, "rowwise_fwd": 9.49602946639061, "rowwise_bwd": 7.768530398607254, "global_fwd": 9.3533955514431, "global_bwd": 7.749464362859726, "x_quantize_rowwise": 0.5977451801300049, "g_quantize_rowwise": 2.3684948682785034, "w_quantize_rowwise": 0.04375725984573364, "w_quantize_colwise_transpose": 0.34042075276374817, "w_quantize_global": 0.13628974556922913, "w_quantize_global_transpose": 0.14671683311462402, "time_standard": 53.398530930280685, "time_rowwise": 37.09545359015465, "time_global": 36.83258220553398} -{"repeat": 64, "batch_size": 131072, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 18.041003495454788, "standard_gw": 17.770148813724518, "standard_gx": 17.70009845495224, "rowwise_fwd": 7.756810635328293, "rowwise_bwd": 9.502101689577103, "global_fwd": 7.7384114265441895, "global_bwd": 9.36170294880867, "x_quantize_rowwise": 2.3686252534389496, "g_quantize_rowwise": 0.5980581045150757, "w_quantize_rowwise": 0.04723668098449707, "w_quantize_colwise_transpose": 0.6035342812538147, "w_quantize_global": 0.13603642582893372, "w_quantize_global_transpose": 0.1485198736190796, "time_standard": 53.511250764131546, "time_rowwise": 38.64651545882225, "time_global": 38.121502846479416} -{"repeat": 64, "batch_size": 8192, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 4.598241299390793, "standard_gw": 4.294309765100479, "standard_gx": 4.261095076799393, "rowwise_fwd": 2.0976848900318146, "rowwise_bwd": 1.9718967378139496, "global_fwd": 2.0763762295246124, "global_bwd": 1.9703581929206848, "x_quantize_rowwise": 0.08216872811317444, "g_quantize_rowwise": 0.4405900835990906, "w_quantize_rowwise": 0.1553371548652649, "w_quantize_colwise_transpose": 1.6110725700855255, "w_quantize_global": 0.481240451335907, "w_quantize_global_transpose": 0.5061514675617218, "time_standard": 13.153646141290665, "time_rowwise": 10.653059929609299, "time_global": 9.85119491815567} -{"repeat": 64, "batch_size": 8192, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 4.35885414481163, "standard_gw": 4.29583340883255, "standard_gx": 4.5370906591415405, "rowwise_fwd": 2.0015686750411987, "rowwise_bwd": 2.097565680742264, "global_fwd": 1.969795674085617, "global_bwd": 2.075403928756714, "x_quantize_rowwise": 0.43984130024909973, "g_quantize_rowwise": 0.08216127753257751, "w_quantize_rowwise": 0.22544339299201965, "w_quantize_colwise_transpose": 2.4342015385627747, "w_quantize_global": 
0.48087164759635925, "w_quantize_global_transpose": 0.5099289119243622, "time_standard": 13.19177821278572, "time_rowwise": 11.576615273952484, "time_global": 9.85383614897728} -{"repeat": 64, "batch_size": 16384, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 9.09888744354248, "standard_gw": 8.230950683355331, "standard_gx": 8.465446531772614, "rowwise_fwd": 4.182614386081696, "rowwise_bwd": 3.747660666704178, "global_fwd": 4.138719290494919, "global_bwd": 3.74777615070343, "x_quantize_rowwise": 0.15515834093093872, "g_quantize_rowwise": 0.8699297904968262, "w_quantize_rowwise": 0.15544891357421875, "w_quantize_colwise_transpose": 1.6132444143295288, "w_quantize_global": 0.48100948333740234, "w_quantize_global_transpose": 0.5051903426647186, "time_standard": 25.795284658670425, "time_rowwise": 18.955007195472717, "time_global": 18.128734081983566} -{"repeat": 64, "batch_size": 16384, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 8.378107100725174, "standard_gw": 8.923027664422989, "standard_gx": 9.049762040376663, "rowwise_fwd": 3.765825182199478, "rowwise_bwd": 4.183519631624222, "global_fwd": 3.744799643754959, "global_bwd": 4.1590481996536255, "x_quantize_rowwise": 0.8693933486938477, "g_quantize_rowwise": 0.1553073525428772, "w_quantize_rowwise": 0.2258792519569397, "w_quantize_colwise_transpose": 2.4386271834373474, "w_quantize_global": 0.4811100661754608, "w_quantize_global_transpose": 0.5102269351482391, "time_standard": 26.350896805524826, "time_rowwise": 20.5615796148777, "time_global": 18.842913210392} -{"repeat": 64, "batch_size": 32768, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 18.266115337610245, "standard_gw": 17.671160399913788, "standard_gx": 17.10302010178566, "rowwise_fwd": 8.347474038600922, "rowwise_bwd": 7.514089345932007, "global_fwd": 8.263226598501205, "global_bwd": 7.487393915653229, "x_quantize_rowwise": 0.3021806478500366, "g_quantize_rowwise": 1.7319358885288239, "w_quantize_rowwise": 0.15519559383392334, "w_quantize_colwise_transpose": 1.6133114695549011, "w_quantize_global": 0.48247724771499634, "w_quantize_global_transpose": 0.506427139043808, "time_standard": 53.04029583930969, "time_rowwise": 37.3353473842144, "time_global": 36.44480183720589} -{"repeat": 64, "batch_size": 32768, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 17.73649826645851, "standard_gw": 16.359902918338776, "standard_gx": 18.0993489921093, "rowwise_fwd": 7.493957877159119, "rowwise_bwd": 8.352488279342651, "global_fwd": 7.486194372177124, "global_bwd": 8.28903540968895, "x_quantize_rowwise": 1.7313472926616669, "g_quantize_rowwise": 0.30205026268959045, "w_quantize_rowwise": 0.2255477011203766, "w_quantize_colwise_transpose": 2.4363920092582703, "w_quantize_global": 0.4815347492694855, "w_quantize_global_transpose": 0.5103759467601776, "time_standard": 52.195750176906586, "time_rowwise": 36.90168634057045, "time_global": 35.16044095158577} -{"repeat": 64, "batch_size": 65536, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 36.309611052274704, "standard_gw": 32.85098075866699, "standard_gx": 34.34552624821663, "rowwise_fwd": 16.74525812268257, "rowwise_bwd": 15.026237815618515, "global_fwd": 16.574162989854813, "global_bwd": 14.977734535932541, "x_quantize_rowwise": 0.5954466760158539, "g_quantize_rowwise": 3.4569576382637024, "w_quantize_rowwise": 0.15521422028541565, "w_quantize_colwise_transpose": 1.6133897006511688, "w_quantize_global": 
0.4822872579097748, "w_quantize_global_transpose": 0.5065612494945526, "time_standard": 103.50611805915833, "time_rowwise": 70.44348493218422, "time_global": 69.44413110613823} -{"repeat": 64, "batch_size": 65536, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 35.40017828345299, "standard_gw": 33.037226647138596, "standard_gx": 36.30436211824417, "rowwise_fwd": 15.043705701828003, "rowwise_bwd": 16.756191849708557, "global_fwd": 15.011314302682877, "global_bwd": 16.580048948526382, "x_quantize_rowwise": 3.4548528492450714, "g_quantize_rowwise": 0.5951337516307831, "w_quantize_rowwise": 0.22584572434425354, "w_quantize_colwise_transpose": 2.4329908192157745, "w_quantize_global": 0.4813261330127716, "w_quantize_global_transpose": 0.5101598799228668, "time_standard": 104.74176704883575, "time_rowwise": 71.54594734311104, "time_global": 69.67006251215935} -{"repeat": 64, "batch_size": 131072, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 73.40333238244057, "standard_gw": 73.76311346888542, "standard_gx": 70.41774317622185, "rowwise_fwd": 33.37597846984863, "rowwise_bwd": 30.345775187015533, "global_fwd": 33.00366923213005, "global_bwd": 30.218638479709625, "x_quantize_rowwise": 1.1825822293758392, "g_quantize_rowwise": 6.902601569890976, "w_quantize_rowwise": 0.15529245138168335, "w_quantize_colwise_transpose": 1.6109198331832886, "w_quantize_global": 0.48149004578590393, "w_quantize_global_transpose": 0.5066059529781342, "time_standard": 217.58418902754784, "time_rowwise": 147.33626320958138, "time_global": 146.05870097875595} -{"repeat": 64, "batch_size": 131072, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 71.5160183608532, "standard_gw": 73.76786693930626, "standard_gx": 72.98104092478752, "rowwise_fwd": 30.291248112916946, "rowwise_bwd": 33.36654230952263, "global_fwd": 30.181586742401123, "global_bwd": 33.082425594329834, "x_quantize_rowwise": 6.902430206537247, "g_quantize_rowwise": 1.1815279722213745, "w_quantize_rowwise": 0.2262219786643982, "w_quantize_colwise_transpose": 2.4421699345111847, "w_quantize_global": 0.4816502332687378, "w_quantize_global_transpose": 0.5105249583721161, "time_standard": 218.26492622494698, "time_rowwise": 148.17800745368004, "time_global": 146.1080126464367} diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py deleted file mode 100644 index fd0dd7d58..000000000 --- a/benchmarking/switchback/make_plot_with_jsonl.py +++ /dev/null @@ -1,151 +0,0 @@ -import matplotlib.gridspec as gridspec -import matplotlib.pyplot as plt -import pandas as pd - -cmap = plt.get_cmap("cool") - -if __name__ == "__main__": - fig = plt.figure(tight_layout=True, figsize=(12, 3.5)) - gs = gridspec.GridSpec(1, 2) - - dims_to_consider = [1024, 1280, 1408, 1664, 2048, 4096] - batch_size_for_plot1 = 32768 - batch_sizes_for_plot2 = [2**14, 2**15, 2**16, 2**17] - dims_to_xtick = [1024, 2048, 4096] - logscale_plot1 = True - - ax = fig.add_subplot(gs[0, 0]) - - # TODO: change this to what you want. 
- rdf = pd.read_json("speed_benchmark/info_a100_py2.jsonl", lines=True) - df = rdf[rdf.batch_size == batch_size_for_plot1] - - # first plot the time occupied by different operations - for k, marker, ls, color, name in [ - ("standard_gx+standard_gw+standard_fwd", "s", "-", "C2", "Standard fp16 (sum of parts)"), - ( - "x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd", - "o", - "-", - "C4", - "SwitchBack int8 (sum of parts)", - ), - ("standard_fwd", "^", "--", "C2", "Matmul XW (standard)"), - ("standard_gw", "^", "-.", "C2", "Matmul GW (standard)"), - ("standard_gx", "^", ":", "gray", "Matmul GX (both)"), - ("global_fwd", "^", "--", "C4", "Int8 Matmul XW (switchback)"), - ("global_bwd", "^", "-.", "C4", "Int8 Matmul GW (switchback)"), - ("x_quantize_rowwise", "P", "--", "C4", "Quantize rowwise X (switchback)"), - ("g_quantize_rowwise", "P", "-.", "C4", "Quantize rowwise G (switchback)"), - ("w_quantize_global", ".", "--", "C4", "Quantize global W (switchback)"), - ("w_quantize_global_transpose", ".", "-.", "C4", "Quantize global and\ntranspose W (switchback)"), - ]: - xs = [] - ys = [] - for embed_dim in dims_to_consider: - # average over dim -> 4*dim and 4*dim -> dim - df_ = df[df.dim_in == embed_dim] - df_ = df_[df_.dim_out == embed_dim * 4] - xs.append(embed_dim) - y_ = 0 - for k_ in k.split("+"): - y_ += df_[k_].values[0] - df_ = df[df.dim_in == embed_dim * 4] - df_ = df_[df_.dim_out == embed_dim] - for k_ in k.split("+"): - y_ += df_[k_].values[0] - ys.append(y_ * 0.5) - - ax.plot( - xs, - ys, - color=color, - label=name, - marker=marker, - markersize=5 if marker == "s" else 5, - linestyle=ls, - linewidth=2 if "+" in k else 1.0, - ) - - ax.set_xlabel("dim", fontsize=13) - ax.set_ylabel("time (ms)", fontsize=13) - - ax.grid() - - ax.set_xscale("log") - if logscale_plot1: - ax.set_yscale("log") - - ax.tick_params(axis="x", labelsize=11) - ax.tick_params(axis="y", labelsize=11) - - ax.set_xticks(dims_to_xtick) - ax.set_xticklabels(dims_to_xtick) - ax.set_xticks([], minor=True) - - leg = ax.legend(loc="upper center", bbox_to_anchor=(-0.64, 1.0), ncol=1, fontsize=10) - leg.get_texts()[0].set_fontweight("bold") - leg.get_texts()[1].set_fontweight("bold") - plt.subplots_adjust(left=0.1) - ax.set_title(" Linear layer, batch * sequence length = 32k", fontsize=10, loc="left", y=1.05, pad=-20) - - ax = fig.add_subplot(gs[0, 1]) - - # now plot the % speedup for different batch sizes - for j, batch_size in enumerate(batch_sizes_for_plot2): - all_xs, all_ys = [], [] - for k, marker, ls, color, name in [ - ("standard_gx+standard_gw+standard_fwd", "s", "-", "C2", "Standard fp16 (total time)"), - ( - "x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd", - "o", - "-", - "C4", - "SwitchBack int8 (total time)", - ), - ]: - xs, ys = [], [] - df = rdf[rdf.batch_size == batch_size] - for embed_dim in dims_to_consider: - df_ = df[df.dim_in == embed_dim] - df_ = df_[df_.dim_out == embed_dim * 4] - xs.append(embed_dim) - y_ = 0 - for k_ in k.split("+"): - y_ += df_[k_].values[0] - df_ = df[df.dim_in == embed_dim * 4] - df_ = df_[df_.dim_out == embed_dim] - for k_ in k.split("+"): - y_ += df_[k_].values[0] - ys.append(y_ * 0.5) - all_xs.append(xs) - all_ys.append(ys) - - color = cmap(j * 0.25) - real_ys = [-((all_ys[1][i] - all_ys[0][i]) / all_ys[0][i]) * 100 for i in range(len(all_ys[0]))] - markers = ["^", "v", "P", "o"] - ax.plot( - all_xs[0], - real_ys, - color=color, - 
label=f"batch * sequence length = {batch_size}", - marker=markers[j], - markersize=5 if marker == "s" else 5, - ) - - ax.legend() - ax.set_xlabel("dim", fontsize=13) - ax.set_xscale("log") - ax.grid() - ax.set_ylabel(r"% speedup", fontsize=13) - - ax.tick_params(axis="x", labelsize=11) - ax.tick_params(axis="y", labelsize=11) - - ax.set_xticks(dims_to_xtick) - ax.set_xticklabels(dims_to_xtick) - ax.set_xticks([], minor=True) - - ax.set_title(" Linear layer summary, varying dimensions", fontsize=10, loc="left", y=1.05, pad=-20) - - plt.savefig("speed_benchmark/plot_with_info.pdf", bbox_inches="tight") diff --git a/benchmarking/switchback/plot_with_info.pdf b/benchmarking/switchback/plot_with_info.pdf deleted file mode 100644 index d186e91b7d96c6e605fd2802ee37881e6294cdd7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 34876 zcmb@u1yq(x*8mFA-CZx;`O>9ycSv_iN~hA@4T4A`A*Cptl2U>+NOy;Tpxo!hbN=uA z-{HTy?z+o0%)HM$Gkee8`!-4rKTbbU6RL00`a?>|HDY?0%uou5Rh-4!0@JvH1b*Qu=IH1I{NjX+ z`6pE%-G7r&(bCb{%?8Bz7iei)dtl)pc4>Qn8j_agP8OCB{9WB#EKMDdy|TX;%DIyD zCbqU2kEKVxqWff}HTf(fm_N%j#@z4KM?&V#6KGgUWG*N+sG;iTKm2qT40E&=s=oTt zAgZG#1}aY^4L;7)Eq{6Ad>lV`8vXF0@9dAw_$K<(sjGcu&$i{mU1fLRZ?CfF$kD~Q z^75_^!MlB!&?PF-sx^AuIhh2_tL>?UgY<>uL$xdX(A<5GHFKw&FKa^OzUAi5HlJ;G zV&9EzXZp~-wh%uQ!~R0Aez0Q5cPmZev}x6sB}#|ao;X%B-hUzZ#AV?+`ha~w>0-^- zY@_tryw{Am`bUjMiwd%oRs??S)NEdYk9F#aS`?@h$GVBabEL^r0|OcPOT}m&0yoOQ za?hwX@p+9ea=G7pOIKT&Z^bCFH(%N~I`MIfOaH6AtMh~H=!N9l$(+fBXye*tqvqKk z?|cjKW)2(M+}VGAxUM2-+RW?-*T|78<71rBkz+0`DLpVTe^CbRh+mJPc1<~uow+=B z+e|s=&l}Q=&Cc|RIzZ>Ge=S7PrOGBq65o}h)vvIil@`&$OXri!n)tmC)lQz zvd^blE*&(~@ zz&H}@aS@p#(dn|@^{`n!#vo|R`SCTJV#Lvr!q|Ax@5BBxuW^yvEZ#36Cj)fGb8d9G z_8K$!)wW+&J{Pm>vouY%zMm8z>_1^DAaxSH(bXM2~EyP-hPOq}^G*_TI4jl0k?ZJ;gp-kC4eEz8v%M%3`d^zoQ6~U1D z8%c*~p`hpVoA!p!=-8R~(J_(b+t_#3i|l*&Zg(A@`q;xyI=| z&ggf5i{Z(pIw|~Sq*6;WHv2KI@O<<`afLziuj~kMfGIark( zr)wWYdzRd5wOz@Hmp&j39N89emyZx{YKi+DqMAj*>{KE@E8^G?hJF&92*Xu7O;Uu! z_T^0~Pci+6zyxj=@<-M0djt!&vi68c7y>NXv7?<%Bw5cH8 zUAkJh!dnP-%)LTi!NQ|9%W*hDlnv~%AQpCcfbaeC^Aaa!h8WzFBSNJ9B-a@S1>qe; zG`okF8%PNyaralHROUq-Nn@EPL~0LA z%A&R?yxc_UDB_`GVhDPk^XltXj_zu{7*+9=ECFs9gBud*kRRd-UL5I-dF2c6Jh zTutF!JseZO-%iwuB@f|krb*aNMPRe7<=PM5Fjv+L)==GA`)-4iJ-NPOnwl7z>jqyd z_+gMo8vhwrCsKGov#yKF$d+Vn;@InlI0a2`(t*AN;%+?BrlhO25)K?u5&M`chQVsl zxMbhyzM4$X5_X1U=p|4!J7-=f{mHmBNKJj5BWE|4l@KTVfn<{DW=fM;3g2J$fWW z-CY7G#cU%rlt&?E(kZq%G>YUcBa788Nyy1p&E(Ti98}oE;0h`ZD7ZG(fwd~$ABOm_ zYyz>WwJ%h~Z)HzXFaWp=-qo>`QSC}rMb0m9nQ*%a;mjtdZSq23VF$FD6YD#BKhH@? zbadquGulk>k%9&{7kx~M!HNXhZ%rE{$X^C0&U)4u zq`E0#Pt6JYjgD?kuoKfqD&oo42jX>SVUNq|&Tn|d(VjUE(7jjp2~;)fDo1+tjO0}W zI{s-P4;!&xai79MpTs^f4vJqEAuipnrS$r9f?*ux7~xuLLul}cG_RueYSMcfJa*5C z>X^u@YKiHWYMCtTU$&cxXvO-=pJm9G9l47MAfa#KORt5%WN7wKJ7wOeOvlSTl6bK- z_ACORud2FJ&_=+4ZE8GnYKGAhcD0x)%)6l4Wqb(CjvS7CZAzvQxZ18D?2s+_&#mXAyONSh4U2WML-Vq+HkUQ^xU-M^gB}bG|TkS z?g|kviKnr#aNOQj8{Rfm&NU%)(z4=%>YYxD#)2d2g*sJn=3$-V9aaw6gT!3ClL@dh zG6e<|l-nXf7K*QRBPNjRNI87CXbv&rq^v}StUo{F77ZgzAa70b%uXwsn~4wox*B@R zXsLL;d`YN0O$SA1N>$$@`@%#p9n}R<36N6gd2w)*eYLHgxJn!K;k`z7ZC?{Ekr)CNx_!?KzuD znd)MGhVtt@5g6)`5VrW%&Llf}>#8Z`;ru;DYp2~63F_e|*l5@lb! 
z3>XbQp2yCgB|ZU)%A$ir1pqqu0ZWL%*@0KFH-On@7lI{-VWhBMfN*AQgdv9w;ALkU z3|_3ZI3yVXzK$ic3B2r_L&3|=7L)Y|*p9*s2^JK6kahGCVw#nPJ`6`zsfL20He}43kpsNFc2yQN*BZPQ9$(6(i&E9R6}-YhKh*l z%0om|9xS&hni=6Yw4*6ZMeG(c1voLg#Z18ph23JNV2i?L6;r_ZvKi47U3*Ra&t<|uPI zI-d0A`(eC;o}rXavVcZgr<(2pe7t_x%(s_n|yVPnw1sW-~yoUbSFf{0DTZdki?bO51$<@_4)uRT)=JPRa!o9D_TRn7#R^mxrbUWj*oGU~IU{@KxKekD(3^!=T$r&_CO zw$%D5pYWGbIQiW0^n>;0MT<`L&wG0)#v)c~rkixghjAa0_mE8L+!oF;DT>Pztl8z% zyFkRRVp%Egq>0J-|8EY|FsK!WNzRMb>RN_fwpAiuzc z1cJ}cS1RiR)+pi#a0%P$4Q}v#_TjPoR(s%GUEJ0|HtZA>A$yd(*LvS2=%e=YmMkO_ z$rJ$Qkw|n3>`xsO(dB3Ca)kuAd*G0^iGhPlvDkm0T%dx&!UO-4hL)G;5AVySQNVm3 zs>?wm(;4Ur9c~)hM03&@Bv94x(2z=mI~-CQaM0if(tL2y&_zgGG`QIb`im3Lj1yoygO_ zL^>c-c*0T94OU!nky@LZM&}L3Ao1qQK-a=?#RY9BPnki>&E3DCQ$orFj^SRT5>-czSRHQuQ%vTMVmzzdMYILq}bX9Z@8)sa)8ZW=l`LrK{ zcix2&%`<1im7~0E%Ah0VFlW9D22!hX(8xpzPrs3gRGu~<6V>3zg)1&FG(Oco)4dw`dLYF*^cPDf9vI{j5QME_C}cR*!O6?( Ind#&H0byt<4*&oF diff --git a/benchmarking/switchback/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py deleted file mode 100644 index eaba0e9cd..000000000 --- a/benchmarking/switchback/speed_benchmark.py +++ /dev/null @@ -1,160 +0,0 @@ -import json -import time - -import torch - -from bitsandbytes.triton.int8_matmul_mixed_dequantize import ( - int8_matmul_mixed_dequantize, -) -from bitsandbytes.triton.int8_matmul_rowwise_dequantize import ( - int8_matmul_rowwise_dequantize, -) -from bitsandbytes.triton.quantize_columnwise_and_transpose import ( - quantize_columnwise_and_transpose, -) -from bitsandbytes.triton.quantize_global import ( - quantize_global, - quantize_global_transpose, -) -from bitsandbytes.triton.quantize_rowwise import quantize_rowwise - -# KNOW ISSUE: need to optimize "w_quantize_colwise_transpose" when embeddim is too large. - - -def get_time(k, fn, info_dict): - for _ in range(repeat // 2): - fn() - - torch.cuda.synchronize() - start = time.time() - for _ in range(repeat): - fn() - - torch.cuda.synchronize() - end = time.time() - ms = (end - start) / repeat * 1000 - print(f"time {k}: {ms:.3f} ms") - info_dict[k] = ms - - -if __name__ == "__main__": - torch.manual_seed(0) - wm = 4 - for dim in [1024, 1280, 1408, 1664, 2048, 4096]: - # note "batch_size" is actually "batch_size * embed_dim", which is why it's large - for batch_size in [256 * 32, 256 * 64, 256 * 128, 256 * 256, 256 * 512]: - # switch switches dim_in and dim_out - for switch in [False, True]: - # hparams - repeat = 64 - batch_size = batch_size - dim_out = dim * wm - dim_in = dim - if switch: - dim_out = dim - dim_in = wm * dim - - dim_in = round(dim_in) - dim_out = round(dim_out) - - # simulate forward pass - x = torch.randn(batch_size, dim_in, dtype=torch.float16).cuda() - g = torch.randn(batch_size, dim_out, dtype=torch.float16).cuda() - w = torch.randn(dim_out, dim_in, dtype=torch.float16).cuda() - - x_int8 = x.clone().to(torch.int8) - g_int8 = g.clone().to(torch.int8) - w_int8 = w.clone().to(torch.int8) - wt_int8 = w.t().contiguous().clone().to(torch.int8) - state_x_rowwise = x.max(dim=1)[0] - state_g_rowwise = g.max(dim=1)[0] - state_w_columnwise = w.max(dim=0)[0] - state_w_rowwise = w.max(dim=1)[0] - state_w_global = w.max() - - info = { - "repeat": repeat, - "batch_size": batch_size, - "dim_out": dim_out, - "dim_in": dim_in, - "wm": wm, - "switch": switch, - } - - get_time("standard_fwd", lambda: x.matmul(w.t()), info) - get_time("standard_gw", lambda: g.t().matmul(x), info) - get_time("standard_gx", lambda: g.matmul(w), info) - get_time( - "rowwise_fwd", - lambda: int8_matmul_rowwise_dequantize( - x_int8, - w_int8.t(), 
- state_x_rowwise, - state_w_columnwise, - None, - ), - info, - ) - get_time( - "rowwise_bwd", - lambda: int8_matmul_rowwise_dequantize( - g_int8, - wt_int8.t(), - state_x_rowwise, - state_w_rowwise, - None, - ), - info, - ) - get_time( - "global_fwd", - lambda: int8_matmul_mixed_dequantize(x_int8, w_int8.t(), state_x_rowwise, state_w_global, None), - info, - ) - get_time( - "global_bwd", - lambda: int8_matmul_mixed_dequantize(g_int8, wt_int8.t(), state_x_rowwise, state_w_global, None), - info, - ) - get_time("x_quantize_rowwise", lambda: quantize_rowwise(x), info) - get_time("g_quantize_rowwise", lambda: quantize_rowwise(g), info) - get_time("w_quantize_rowwise", lambda: quantize_rowwise(w), info) - get_time("w_quantize_colwise_transpose", lambda: quantize_columnwise_and_transpose(w), info) - get_time("w_quantize_global", lambda: quantize_global(w), info) - get_time("w_quantize_global_transpose", lambda: quantize_global_transpose(w), info) - - time_standard = info["standard_fwd"] + info["standard_gx"] + info["standard_gw"] - time_rowwise = ( - info["x_quantize_rowwise"] - + info["g_quantize_rowwise"] - + info["w_quantize_colwise_transpose"] - + info["w_quantize_rowwise"] - + info["standard_gw"] - + info["rowwise_fwd"] - + info["rowwise_bwd"] - ) - time_global = ( - info["x_quantize_rowwise"] - + info["g_quantize_rowwise"] - + info["w_quantize_global"] - + info["w_quantize_global_transpose"] - + info["standard_gw"] - + info["global_fwd"] - + info["global_bwd"] - ) - - print("TOTAL STANDARD", time_standard) - print("TOTAL ROWWISE", time_rowwise) - print("TOTAL GLOBAL", time_global) - - print("speedup", -100 * (time_global - time_standard) / time_standard) - - info["time_standard"] = time_standard - info["time_rowwise"] = time_rowwise - info["time_global"] = time_global - - info_json = json.dumps(info) - - # TODO: change this to what you want. - with open("speed_benchmark/info.jsonl", "a") as file: - file.write(info_json + "\n") diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index bdc150e5e..729342070 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -9,7 +9,7 @@ import torch -from . import _ops, research, utils +from . import _ops, utils from .autograd._functions import ( MatmulLtState, matmul, diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index da168e17b..614c19051 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -53,16 +53,12 @@ def get_current_outlier_idx(self): @dataclass class MatmulLtState: - _tile_indices: Optional[torch.Tensor] = None # TODO: remove - force_no_igemmlt: bool = False CB: Optional[torch.Tensor] = None - CxB: Optional[torch.Tensor] = None # TODO: Deprecate/remove SB: Optional[torch.Tensor] = None SCB: Optional[torch.Tensor] = None - CxBt: Optional[torch.Tensor] = None # TODO: Deprecate/remove SBt: Optional[torch.Tensor] = None CBt: Optional[torch.Tensor] = None @@ -75,22 +71,29 @@ class MatmulLtState: is_training = True has_fp16_weights = True use_pool = False - formatB = "row" # TODO: Deprecate/remove + + # Deprecated attributes kept for downstream compatibility (TGI, vLLM). + # These are always None and will be fully removed in the next release. 
+ _deprecated_fields = frozenset({"CxB", "CxBt", "formatB", "_tile_indices"}) + + def __getattr__(self, name): + if name in MatmulLtState._deprecated_fields: + warnings.warn( + f"MatmulLtState.{name} is deprecated and will be removed in the next bitsandbytes release.", + FutureWarning, + stacklevel=2, + ) + return None + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") def reset_grads(self): self.CB = None - self.CxB = None self.SB = None self.SCB = None - self.CxBt = None self.SBt = None self.CBt = None - @property - def tile_indices(self): - raise ValueError("tile_indices is no longer supported.") - class MatMul8bitLt(torch.autograd.Function): @staticmethod @@ -293,7 +296,6 @@ def backward(ctx, grad_output): class MatMul4Bit(torch.autograd.Function): # forward is the same, but we added the fallback for pre-turing GPUs - # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") @staticmethod def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState] = None): diff --git a/bitsandbytes/backends/utils.py b/bitsandbytes/backends/utils.py index ec96a440c..34e3d5faa 100644 --- a/bitsandbytes/backends/utils.py +++ b/bitsandbytes/backends/utils.py @@ -4,9 +4,8 @@ import torch try: - import triton.language as tl # noqa: F401 - import triton # noqa: F401 + import triton.language as tl # noqa: F401 triton_available = True except ImportError: diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index fe687e1e8..784eeafe5 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -11,7 +11,6 @@ import numpy as np import torch from torch import Tensor -from typing_extensions import deprecated from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict @@ -20,32 +19,6 @@ name2qmap = {} """C FUNCTIONS FOR OPTIMIZERS""" -str2optimizer8bit = { - "adam": ( - lib.cadam_static_8bit_grad_32, - lib.cadam_static_8bit_grad_16, - ), - "momentum": ( - lib.cmomentum_static_8bit_grad_32, - lib.cmomentum_static_8bit_grad_16, - ), - "rmsprop": ( - lib.crmsprop_static_8bit_grad_32, - lib.crmsprop_static_8bit_grad_16, - ), - "lion": ( - lib.clion_static_8bit_grad_32, - lib.clion_static_8bit_grad_16, - ), - "lamb": ( - lib.cadam_static_8bit_grad_32, - lib.cadam_static_8bit_grad_16, - ), - "lars": ( - lib.cmomentum_static_8bit_grad_32, - lib.cmomentum_static_8bit_grad_16, - ), -} class GlobalPageManager: @@ -1069,110 +1042,6 @@ def dequantize_4bit( return out -@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning) -def quantize( - A: Tensor, - code: Optional[torch.Tensor] = None, - out: Optional[torch.Tensor] = None, -) -> tuple[Tensor, tuple[Tensor, Tensor]]: - if code is None: - if "dynamic" not in name2qmap: - name2qmap["dynamic"] = create_dynamic_map().to(A.device) - code = name2qmap["dynamic"] - code = code.to(A.device) - - absmax = torch.abs(A).max() - if absmax.dtype != torch.float32: - absmax = absmax.float() - inp = A / absmax - out = quantize_no_absmax(inp, code, out) - return out, (absmax, code) - - -@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning) -def dequantize( - A: Tensor, - state: Optional[tuple[Tensor, Tensor]] = None, - absmax: Optional[torch.Tensor] = None, - code: Optional[torch.Tensor] = None, - out: Optional[torch.Tensor] = None, -) -> Tensor: - assert state is not None or absmax is not None - if code is None and state is None: - if "dynamic" not in 
name2qmap: - name2qmap["dynamic"] = create_dynamic_map().to(A.device) - code = name2qmap["dynamic"] - code = code.to(A.device) - - if state is None: - state = (absmax, code) - out = dequantize_no_absmax(A, state[1], out) - return out * state[0] - - -@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning) -def quantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] = None) -> Tensor: - """ - Quantizes input tensor to 8-bit. - - Quantizes the 32-bit input tensor `A` to the 8-bit output tensor - `out` using the quantization map `code`. - - Parameters - ---------- - A : torch.Tensor - The input tensor. - code : torch.Tensor - The quantization map. - out : torch.Tensor, optional - The output tensor. Needs to be of type byte. - - Returns - ------- - torch.Tensor: - Quantized 8-bit tensor. - """ - with _cuda_device_of(A): - if out is None: - out = torch.zeros_like(A, dtype=torch.uint8) - is_on_gpu([A, out]) - lib.cquantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel())) - - return out - - -@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning) -def dequantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] = None) -> Tensor: - """ - Dequantizes the 8-bit tensor to 32-bit. - - Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via - the quantization map `code`. - - Parameters - ---------- - A : torch.Tensor - The 8-bit input tensor. - code : torch.Tensor - The quantization map. - out : torch.Tensor - The 32-bit output tensor. - - Returns - ------- - torch.Tensor: - 32-bit output tensor. - """ - with _cuda_device_of(A): - if out is None: - out = torch.zeros_like(A, dtype=torch.float32) - is_on_gpu([code, A, out]) - stream = _get_tensor_stream(A) - lib.cdequantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel()), stream) - - return out - - def optimizer_update_32bit( optimizer_name: str, g: Tensor, @@ -1262,143 +1131,6 @@ def optimizer_update_32bit( ) -@deprecated( - "This function is deprecated and will be removed in a future release. " - "Please use optimizer_update_8bit_blockwise instead. ", - category=FutureWarning, -) -def optimizer_update_8bit( - optimizer_name: str, - g: Tensor, - p: Tensor, - state1: Tensor, - state2: Optional[torch.Tensor], - beta1: float, - beta2: float, - eps: float, - step: int, - lr: float, - qmap1: Tensor, - qmap2: Optional[torch.Tensor], - max1: Tensor, - max2: Optional[torch.Tensor], - new_max1: Tensor, - new_max2: Optional[torch.Tensor], - weight_decay: float = 0.0, - gnorm_scale: float = 1.0, - unorm_vec: Optional[torch.Tensor] = None, - max_unorm: float = 0.0, -) -> None: - """ - Performs an inplace Adam update. - - Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights. - Uses AdamW formulation if weight decay > 0.0. - - Parameters - ---------- - optimizer_name : str - The name of the optimizer. Choices {adam, momentum} - g : torch.Tensor - Gradient tensor. - p : torch.Tensor - Parameter tensor. - state1 : torch.Tensor - Adam state 1. - state2 : torch.Tensor - Adam state 2. - beta1 : float - Adam beta1. - beta2 : float - Adam beta2. - eps : float - Adam epsilon. - weight_decay : float - Weight decay. - step : int - Current optimizer step. - lr : float - The learning rate. - qmap1 : torch.Tensor - Quantization map for first Adam state. - qmap2 : torch.Tensor - Quantization map for second Adam state. - max1 : torch.Tensor - Max value for first Adam state update. 
- max2 : torch.Tensor - Max value for second Adam state update. - new_max1 : torch.Tensor - Max value for the next Adam update of the first state. - new_max2 : torch.Tensor - Max value for the next Adam update of the second state. - gnorm_scale : float - The factor to rescale the gradient to the max clip value. - unorm_vec : torch.Tensor - The tensor for the update norm. - max_unorm : float - The maximum update norm relative to the weight norm. - """ - - param_norm = 0.0 - if max_unorm > 0.0: - param_norm = torch.norm(p.data.float()) - - with _cuda_device_of(g): - is_on_gpu([g, p, state1, state2, unorm_vec, qmap1, qmap2, max1, max2, new_max1, new_max2]) - if g.dtype == torch.float32 and state1.dtype == torch.uint8: - str2optimizer8bit[optimizer_name][0]( - get_ptr(p), - get_ptr(g), - get_ptr(state1), - get_ptr(state2), - get_ptr(unorm_vec), - ct.c_float(max_unorm), - ct.c_float(param_norm), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_int32(step), - ct.c_float(lr), - get_ptr(qmap1), - get_ptr(qmap2), - get_ptr(max1), - get_ptr(max2), - get_ptr(new_max1), - get_ptr(new_max2), - ct.c_float(weight_decay), - ct.c_float(gnorm_scale), - ct.c_int32(g.numel()), - ) - elif g.dtype == torch.float16 and state1.dtype == torch.uint8: - str2optimizer8bit[optimizer_name][1]( - get_ptr(p), - get_ptr(g), - get_ptr(state1), - get_ptr(state2), - get_ptr(unorm_vec), - ct.c_float(max_unorm), - ct.c_float(param_norm), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_int32(step), - ct.c_float(lr), - get_ptr(qmap1), - get_ptr(qmap2), - get_ptr(max1), - get_ptr(max2), - get_ptr(new_max1), - get_ptr(new_max2), - ct.c_float(weight_decay), - ct.c_float(gnorm_scale), - ct.c_int32(g.numel()), - ) - else: - raise ValueError( - f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}", - ) - - def optimizer_update_8bit_blockwise( optimizer_name: str, g: Tensor, @@ -1445,48 +1177,6 @@ def optimizer_update_8bit_blockwise( ) -@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning) -def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5): - """Applies percentile clipping - - grad: torch.Tensor - The gradient tensor. - gnorm_vec: torch.Tensor - Vector of gradient norms. 100 elements expected. - step: int - The current optimization steps (number of past gradient norms). 
- - """ - with _cuda_device_of(grad): - is_on_gpu([grad, gnorm_vec]) - if grad.dtype == torch.float32: - lib.cpercentile_clipping_g32( - get_ptr(grad), - get_ptr(gnorm_vec), - ct.c_int32(step), - ct.c_int32(grad.numel()), - ) - elif grad.dtype == torch.float16: - lib.cpercentile_clipping_g16( - get_ptr(grad), - get_ptr(gnorm_vec), - ct.c_int32(step), - ct.c_int32(grad.numel()), - ) - else: - raise ValueError(f"Gradient type {grad.dtype} not supported!") - - current_gnorm = torch.sqrt(gnorm_vec[step % 100]) - vals, _ = torch.sort(gnorm_vec) - clip_value = torch.sqrt(vals[percentile]) - gnorm_scale = 1.0 - - if current_gnorm > clip_value: - gnorm_scale = clip_value / current_gnorm - - return current_gnorm, clip_value, gnorm_scale - - def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8): if not torch.cuda.is_initialized(): torch.cuda.init() diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 20aff67a3..54c2614bd 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -16,11 +16,4 @@ OutlierAwareLinear, Params4bit, StableEmbedding, - SwitchBackLinearBnb, -) -from .triton_based_modules import ( - StandardLinear, - SwitchBackLinear, - SwitchBackLinearGlobal, - SwitchBackLinearVectorwise, ) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 67847f40c..9f05ac6fb 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -1137,42 +1137,3 @@ def forward(self, x): w = self.quantize_weight(self.weight, self.outlier_dim) self.weight.data.copy_(w) self.is_quantized = True - - -class SwitchBackLinearBnb(nn.Linear): - def __init__( - self, - input_features, - output_features, - bias=True, - has_fp16_weights=True, - memory_efficient_backward=False, - threshold=0.0, - index=None, - device=None, - ): - super().__init__(input_features, output_features, bias, device) - self.state = bnb.MatmulLtState() - self.index = index - - self.state.threshold = threshold - self.state.has_fp16_weights = has_fp16_weights - self.state.memory_efficient_backward = memory_efficient_backward - if threshold > 0.0 and not has_fp16_weights: - self.state.use_pool = True - - self.weight = Int8Params(self.weight.data, has_fp16_weights=has_fp16_weights, requires_grad=has_fp16_weights) - - def init_8bit_state(self): - self.state.CB = self.weight.CB - self.state.SCB = self.weight.SCB - self.weight.CB = None - self.weight.SCB = None - - def forward(self, x): - self.state.is_training = self.training - - if self.weight.CB is not None: - self.init_8bit_state() - - return bnb.matmul_mixed(x.half(), self.weight.half(), bias=None, state=self.state) + self.bias diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py deleted file mode 100644 index aa8494942..000000000 --- a/bitsandbytes/nn/triton_based_modules.py +++ /dev/null @@ -1,264 +0,0 @@ -from functools import partial - -import torch -import torch.nn as nn - -from bitsandbytes.triton.dequantize_rowwise import dequantize_rowwise -from bitsandbytes.triton.int8_matmul_mixed_dequantize import ( - int8_matmul_mixed_dequantize, -) -from bitsandbytes.triton.int8_matmul_rowwise_dequantize import ( - int8_matmul_rowwise_dequantize, -) -from bitsandbytes.triton.quantize_columnwise_and_transpose import ( - quantize_columnwise_and_transpose, -) -from bitsandbytes.triton.quantize_global import ( - quantize_global, - quantize_global_transpose, -) -from bitsandbytes.triton.quantize_rowwise import quantize_rowwise -from bitsandbytes.triton.triton_utils 
import is_triton_available - - -class _switchback_global(torch.autograd.Function): - @staticmethod - def forward(ctx, X_3D, W, bias): - # reshape input to [N * L, D] - X = X_3D.view(-1, X_3D.size(-1)) - - # rowwise quantize for X, global quantize for W - X_int8, state_X = quantize_rowwise(X) - W_int8, state_W = quantize_global(W) - - # save for backward. - ctx.save_for_backward = X, W - - # matmult, fused dequant and add bias - # call "mixed" because we are mixing rowwise quantized and global quantized - return int8_matmul_mixed_dequantize(X_int8, W_int8.t(), state_X, state_W, bias).view(*X_3D.size()[:-1], -1) - - @staticmethod - def backward(ctx, G_3D): - # reshape input to [N_out * L, D] - G = G_3D.reshape(-1, G_3D.size(-1)) - - grad_X = grad_W = grad_bias = None - - X, W = ctx.save_for_backward - if ctx.needs_input_grad[0]: - # rowwise quantize for G, global quantize for W - # for W, we also fuse the transpose operation because only A @ B^T is supported - # so we transpose once then call .t() in the matmul - G_int8, state_G = quantize_rowwise(G) - W_int8, state_W = quantize_global_transpose(W) - grad_X = int8_matmul_mixed_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view( - *G_3D.size()[:-1], - -1, - ) - if ctx.needs_input_grad[1]: - # backward pass uses standard weight grad - grad_W = torch.matmul(G.t(), X.to(G.dtype)) - if ctx.needs_input_grad[2]: - grad_bias = G.sum(dim=0) - - return grad_X, grad_W, grad_bias - - -class _switchback_vectorrize(torch.autograd.Function): - @staticmethod - def forward(ctx, X_3D, W, bias): - # reshape input to [N * L, D] - X = X_3D.view(-1, X_3D.size(-1)) - - ctx.save_for_backward = X, W - # rowwise quantize for X - # columnwise quantize for W (first rowwise, transpose later) - X_int8, state_X = quantize_rowwise(X) - W_int8, state_W = quantize_rowwise(W) - - # matmult, fused dequant and add bias - # call kernel which expects rowwise quantized X and W - return int8_matmul_rowwise_dequantize(X_int8, W_int8.t(), state_X, state_W, bias).view(*X_3D.size()[:-1], -1) - - @staticmethod - def backward(ctx, G_3D): - X, W = ctx.save_for_backward - - G = G_3D.reshape(-1, G_3D.size(-1)) - - grad_X = grad_W = grad_bias = None - - if ctx.needs_input_grad[0]: - # rowwise quantize for G, columnwise quantize for W and fused transpose - # we call .t() for weight later because only A @ B^T is supported - G_int8, state_G = quantize_rowwise(G) - W_int8, state_W = quantize_columnwise_and_transpose(W) - grad_X = int8_matmul_rowwise_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view( - *G_3D.size()[:-1], - -1, - ) - if ctx.needs_input_grad[1]: - # backward pass uses standard weight grad - grad_W = torch.matmul(G.t(), X.to(G.dtype)) - if ctx.needs_input_grad[2]: - grad_bias = G.sum(dim=0) - - return grad_X, grad_W, grad_bias - - -class _switchback_global_mem_efficient(torch.autograd.Function): - @staticmethod - def forward(ctx, X_3D, W, bias): - # reshape input to [N * L, D] - X = X_3D.view(-1, X_3D.size(-1)) - X_3D_sz = X_3D.size() - - # rowwise quantize for X, global quantize for W - X_int8, state_X = quantize_rowwise(X) - del X - W_int8, state_W = quantize_global(W) - - # save for backward. 
- ctx.save_for_backward = X_int8, state_X, W_int8, state_W - - # matmult, fused dequant and add bias - # call "mixed" because we are mixing rowwise quantized and global quantized - return int8_matmul_mixed_dequantize(X_int8, W_int8.t(), state_X, state_W, bias).view(*X_3D_sz[:-1], -1) - - @staticmethod - def backward(ctx, G_3D): - # reshape input to [N_out * L, D] - G = G_3D.reshape(-1, G_3D.size(-1)) - G_3D_sz = G_3D.size() - - grad_X = grad_W = grad_bias = None - - X_int8, state_X, W_int8, state_W = ctx.save_for_backward - if ctx.needs_input_grad[1]: - real_X = dequantize_rowwise(X_int8, state_X) - del X_int8 - grad_W = torch.matmul(G.t(), real_X.to(G.dtype)) - del real_X - if ctx.needs_input_grad[2]: - grad_bias = G.sum(dim=0) - if ctx.needs_input_grad[0]: - G_int8, state_G = quantize_rowwise(G) - del G - W_int8 = W_int8.t().contiguous() - grad_X = int8_matmul_mixed_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view(*G_3D_sz[:-1], -1) - - return grad_X, grad_W, grad_bias - - -class SwitchBackLinear(nn.Linear): - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - device=None, - dtype=None, - vector_wise_quantization: bool = False, - mem_efficient: bool = False, - ): - super().__init__(in_features, out_features, bias, device, dtype) - - if not is_triton_available(): - raise ImportError("""Could not import triton. Please install triton to use SwitchBackLinear. - Alternatively, you can use bnb.nn.SwitchBackLinearBnb, but it will be slower""") - - # By default, we use the global quantization. - self.vector_wise_quantization = vector_wise_quantization - if self.vector_wise_quantization: - self._fn = _switchback_vectorrize - if mem_efficient: - print("mem efficient is not supported for vector-wise quantization.") - exit(1) - else: - if mem_efficient: - self._fn = _switchback_global_mem_efficient - else: - self._fn = _switchback_global - - def prepare_for_eval(self): - # If we just want to do eval, we can pre-quantize the weights instead of doing it on the forward pass. - # Note this is experimental and not tested thoroughly. - # Note this needs to be explicitly called with something like - # def cond_prepare(m): - # if hasattr(m, "prepare_for_eval"): - # m.prepare_for_eval() - # model.apply(cond_prepare) - print("=> preparing for eval.") - if self.vector_wise_quantization: - W_int8, state_W = quantize_rowwise(self.weight) - else: - W_int8, state_W = quantize_global(self.weight) - - self.register_buffer("W_int8", W_int8) - self.register_buffer("state_W", state_W) - - del self.weight - - def forward(self, x): - if self.training: - return self._fn.apply(x, self.weight, self.bias) - else: - # If it hasn't been "prepared for eval", run the standard forward pass. - if not hasattr(self, "W_int8"): - return self._fn.apply(x, self.weight, self.bias) - - # Otherwise, use pre-computed weights. 
- X = x.view(-1, x.size(-1)) - X_int8, state_X = quantize_rowwise(X) - - if self.vector_wise_quantization: - return int8_matmul_rowwise_dequantize(X_int8, self.W_int8.t(), state_X, self.state_W, self.bias).view( - *x.size()[:-1], - -1, - ) - else: - return int8_matmul_mixed_dequantize(X_int8, self.W_int8.t(), state_X, self.state_W, self.bias).view( - *x.size()[:-1], - -1, - ) - - -SwitchBackLinearGlobal = partial(SwitchBackLinear, vector_wise_quantization=False) -SwitchBackLinearGlobalMemEfficient = partial(SwitchBackLinear, vector_wise_quantization=False, mem_efficient=True) -SwitchBackLinearVectorwise = partial(SwitchBackLinear, vector_wise_quantization=True) - - -# This is just the standard linear function. -class StandardLinearFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, input, weight, bias=None): - X = input.view(-1, input.size(-1)) - - ctx.save_for_backward(X, weight, bias) - output = input.matmul(weight.t()) - if bias is not None: - output += bias.unsqueeze(0).expand_as(output) - return output.view(*input.size()[:-1], -1) - - @staticmethod - def backward(ctx, grad_output_3D): - input, weight, bias = ctx.saved_tensors - - grad_output = grad_output_3D.reshape(-1, grad_output_3D.size(-1)) - - grad_input = grad_weight = grad_bias = None - - if ctx.needs_input_grad[0]: - grad_input = grad_output.matmul(weight.to(grad_output.dtype)).view(*grad_output_3D.size()[:-1], -1) - if ctx.needs_input_grad[1]: - grad_weight = grad_output.t().matmul(input.to(grad_output.dtype)) - if bias is not None and ctx.needs_input_grad[2]: - grad_bias = grad_output.sum(0) - - return grad_input, grad_weight, grad_bias - - -class StandardLinear(nn.Linear): - def forward(self, x): - return StandardLinearFunction.apply(x, self.weight, self.bias) diff --git a/bitsandbytes/optim/adagrad.py b/bitsandbytes/optim/adagrad.py index 7459dece1..b871f2bf4 100644 --- a/bitsandbytes/optim/adagrad.py +++ b/bitsandbytes/optim/adagrad.py @@ -17,8 +17,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Base Adagrad optimizer. @@ -42,10 +40,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -67,8 +61,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -84,8 +76,6 @@ def __init__( optim_bits=8, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 8-bit Adagrad optimizer. @@ -109,10 +99,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. 
""" if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -124,7 +110,6 @@ def __init__( raise ValueError("Initial accumulator value != 0.0 not supported!") if lr_decay != 0.0: raise ValueError("Lr Decay != 0.0 not supported!") - assert block_wise super().__init__( "adagrad", params, @@ -135,8 +120,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -152,8 +135,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 32-bit Adagrad optimizer. @@ -177,10 +158,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -202,6 +179,4 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, ) diff --git a/bitsandbytes/optim/adam.py b/bitsandbytes/optim/adam.py index 22a217c3b..63210bdc3 100644 --- a/bitsandbytes/optim/adam.py +++ b/bitsandbytes/optim/adam.py @@ -18,8 +18,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -44,10 +42,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -61,8 +55,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -79,8 +71,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -107,10 +97,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -133,8 +119,6 @@ def __init__( 8, # Hardcoded to 8 bits args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -151,8 +135,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -177,10 +159,6 @@ def __init__( An object with additional arguments. 
min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -194,8 +172,6 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -212,8 +188,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -238,10 +212,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -255,8 +225,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -273,8 +241,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -301,10 +267,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -327,8 +289,6 @@ def __init__( 8, # Hardcoded to 8 bits args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -345,8 +305,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -371,10 +329,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. 
""" @@ -388,7 +342,5 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index 5f225c9ad..36e151dfc 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -18,8 +18,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -44,10 +42,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -61,8 +55,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -79,8 +71,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -107,10 +97,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -133,8 +119,6 @@ def __init__( 8, # Hardcoded to 8 bits args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -151,8 +135,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -177,10 +159,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -194,8 +172,6 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -212,8 +188,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged AdamW optimizer. @@ -237,10 +211,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. 
- percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ super().__init__( "adam", @@ -252,8 +222,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -270,8 +238,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged 8-bit AdamW optimizer. @@ -297,10 +263,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ # Validate unsupported parameters if amsgrad: @@ -321,8 +283,6 @@ def __init__( 8, # Hardcoded to 8 bits args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -339,8 +299,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged 32-bit AdamW optimizer. @@ -364,10 +322,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ super().__init__( "adam", @@ -379,7 +333,5 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) diff --git a/bitsandbytes/optim/ademamix.py b/bitsandbytes/optim/ademamix.py index 928289adb..48a62198e 100644 --- a/bitsandbytes/optim/ademamix.py +++ b/bitsandbytes/optim/ademamix.py @@ -129,8 +129,6 @@ def __init__( optim_bits=optim_bits, args=None, min_8bit_size=min_8bit_size, - percentile_clipping=100, - block_wise=True, is_paged=is_paged, alpha=alpha, t_alpha=t_alpha, @@ -142,8 +140,6 @@ def init_state(self, group, p, gindex, pindex): # In our AdEMAMix implementation, we use `state` to hold # both the fast and slow EMAs. Here we override the base # `Optimizer2State` to allocate a buffer twice as large. - # Additional consideration: we do not support block_wise=False, - # percentile clipping, or max_unorm. 
config = self.get_config(gindex, pindex, group) @@ -380,8 +376,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=min_8bit_size, - percentile_clipping=100, - block_wise=True, is_paged=is_paged, alpha=alpha, t_alpha=t_alpha, diff --git a/bitsandbytes/optim/lamb.py b/bitsandbytes/optim/lamb.py index 8d29cbbfe..6dcfd383f 100644 --- a/bitsandbytes/optim/lamb.py +++ b/bitsandbytes/optim/lamb.py @@ -19,8 +19,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=False, max_unorm=1.0, ): """ @@ -49,10 +47,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. max_unorm (`float`, defaults to 1.0): The maximum gradient norm. """ @@ -66,8 +60,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, max_unorm=1.0, ) @@ -85,8 +77,6 @@ def __init__( adam_w_mode=True, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=False, max_unorm=1.0, ): """ @@ -113,10 +103,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. max_unorm (`float`, defaults to 1.0): The maximum gradient norm. """ @@ -130,8 +116,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, max_unorm=1.0, ) @@ -149,8 +133,6 @@ def __init__( adam_w_mode=True, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=False, max_unorm=1.0, ): """ @@ -177,10 +159,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. max_unorm (`float`, defaults to 1.0): The maximum gradient norm. """ @@ -194,7 +172,5 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, max_unorm=1.0, ) diff --git a/bitsandbytes/optim/lars.py b/bitsandbytes/optim/lars.py index fa2af57bc..c2f5aa784 100644 --- a/bitsandbytes/optim/lars.py +++ b/bitsandbytes/optim/lars.py @@ -20,7 +20,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, max_unorm=0.02, ): """ @@ -45,8 +44,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. 
- percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. max_unorm (`float`, defaults to 0.02): The maximum gradient norm. """ @@ -62,9 +59,7 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, max_unorm=max_unorm, - block_wise=False, ) @@ -79,7 +74,6 @@ def __init__( nesterov=False, args=None, min_8bit_size=4096, - percentile_clipping=100, max_unorm=0.02, ): """ @@ -102,8 +96,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. max_unorm (`float`, defaults to 0.02): The maximum gradient norm. """ @@ -119,9 +111,7 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, max_unorm=max_unorm, - block_wise=False, ) @@ -136,7 +126,6 @@ def __init__( nesterov=False, args=None, min_8bit_size=4096, - percentile_clipping=100, max_unorm=0.02, ): """ @@ -159,8 +148,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. max_unorm (`float`, defaults to 0.02): The maximum gradient norm. """ @@ -176,9 +163,7 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, max_unorm=max_unorm, - block_wise=False, ) diff --git a/bitsandbytes/optim/lion.py b/bitsandbytes/optim/lion.py index 2e4163694..6100491f6 100644 --- a/bitsandbytes/optim/lion.py +++ b/bitsandbytes/optim/lion.py @@ -15,8 +15,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -37,10 +35,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -54,8 +48,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -69,8 +61,6 @@ def __init__( weight_decay=0, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -89,10 +79,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. 
- block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -106,8 +92,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -121,8 +105,6 @@ def __init__( weight_decay=0, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -141,10 +123,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -158,8 +136,6 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -174,8 +150,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged Lion optimizer. @@ -195,10 +169,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ super().__init__( "lion", @@ -210,8 +180,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -225,8 +193,6 @@ def __init__( weight_decay=0, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged 8-bit Lion optimizer. @@ -240,16 +206,10 @@ def __init__( The beta values are the decay rates of the first and second-order moment of the optimizer. weight_decay (`float`, defaults to 0): The weight decay value for the optimizer. - optim_bits (`int`, defaults to 32): - The number of bits of the optimizer state. args (`object`, defaults to `None`): An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ super().__init__( "lion", @@ -261,8 +221,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -276,8 +234,6 @@ def __init__( weight_decay=0, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged 32-bit Lion optimizer. @@ -291,16 +247,10 @@ def __init__( The beta values are the decay rates of the first and second-order moment of the optimizer. 
weight_decay (`float`, defaults to 0): The weight decay value for the optimizer. - optim_bits (`int`, defaults to 32): - The number of bits of the optimizer state. args (`object`, defaults to `None`): An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ super().__init__( "lion", @@ -312,7 +262,5 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index db7a35231..5c00a42bf 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -59,7 +59,7 @@ def override_config(self, parameters, key=None, value=None, key_value_dict=None) The key-values of the optimizer config for the input parameters are overridden This can be both, optimizer parameters like `betas` or `lr`, or it can be - 8-bit specific parameters like `optim_bits` or `percentile_clipping`. + 8-bit specific parameters like `optim_bits`. Arguments: parameters (`torch.Tensor` or `list(torch.Tensors)`): @@ -345,8 +345,6 @@ def get_config(self, gindex, pindex, group): config["t_beta3"] = group.get("t_beta3", 0) config["optim_bits"] = self.args.optim_bits config["min_8bit_size"] = self.args.min_8bit_size - config["percentile_clipping"] = self.args.percentile_clipping - config["block_wise"] = self.args.block_wise config["max_unorm"] = self.args.max_unorm config["skip_zeros"] = self.args.skip_zeros @@ -393,8 +391,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, max_unorm=0.0, skip_zeros=False, is_paged=False, @@ -424,10 +420,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. max_unorm (`float`, defaults to 0.0): The maximum value to normalize each block with. 
skip_zeros (`bool`, defaults to `False`): @@ -466,8 +458,6 @@ def __init__( args = {} args["optim_bits"] = optim_bits args["min_8bit_size"] = min_8bit_size - args["percentile_clipping"] = percentile_clipping - args["block_wise"] = block_wise args["max_unorm"] = max_unorm args["skip_zeros"] = skip_zeros @@ -510,21 +500,12 @@ def init_state(self, group, p, gindex, pindex): state["state2"] = self.get_state_buffer(p, dtype=torch.uint8) state["qmap2"] = self.name2qmap["udynamic"] - if config["block_wise"]: - blocksize = 256 - n = p.numel() - blocks = (n // blocksize) + bool(n % blocksize) + blocksize = 256 + n = p.numel() + blocks = (n // blocksize) + bool(n % blocksize) - state["absmax1"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) - state["absmax2"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) - else: - state["max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) - state["new_max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) - state["max2"] = torch.zeros((1,), dtype=torch.float32, device=p.device) - state["new_max2"] = torch.zeros((1,), dtype=torch.float32, device=p.device) - - if config["percentile_clipping"] < 100: - state["gnorm_vec"] = torch.zeros((100,), device=p.device) + state["absmax1"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) + state["absmax2"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) if config["max_unorm"] > 0.0: state["unorm_vec"] = torch.zeros((1,), device=p.device) @@ -543,16 +524,6 @@ def update_step(self, group, p, gindex, pindex): state["step"] += 1 step = state["step"] - if config["percentile_clipping"] < 100: - _current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping( - grad, - state["gnorm_vec"], - step, - config["percentile_clipping"], - ) - else: - gnorm_scale = 1.0 - if state["state1"].dtype == torch.float: F.optimizer_update_32bit( self.optimizer_name, @@ -568,40 +539,13 @@ def update_step(self, group, p, gindex, pindex): config["betas"][2] if len(config["betas"]) >= 3 else 0.0, config.get("alpha", 0.0), config["weight_decay"], - gnorm_scale, + 1.0, state["unorm_vec"] if config["max_unorm"] > 0.0 else None, max_unorm=config["max_unorm"], skip_zeros=config["skip_zeros"], ) - elif state["state1"].dtype == torch.uint8 and not config["block_wise"]: - F.optimizer_update_8bit( - self.optimizer_name, - grad, - p, - state["state1"], - state["state2"], - config["betas"][0], - config["betas"][1], - config["eps"], - step, - config["lr"], - state["qmap1"], - state["qmap2"], - state["max1"], - state["max2"], - state["new_max1"], - state["new_max2"], - config["weight_decay"], - gnorm_scale=gnorm_scale, - unorm_vec=state["unorm_vec"] if config["max_unorm"] > 0.0 else None, - max_unorm=config["max_unorm"], - ) - - # swap maxes - state["max1"], state["new_max1"] = state["new_max1"], state["max1"] - state["max2"], state["new_max2"] = state["new_max2"], state["max2"] - elif state["state1"].dtype == torch.uint8 and config["block_wise"]: + elif state["state1"].dtype == torch.uint8: F.optimizer_update_8bit_blockwise( self.optimizer_name, grad, @@ -620,7 +564,7 @@ def update_step(self, group, p, gindex, pindex): state["absmax1"], state["absmax2"], config["weight_decay"], - gnorm_scale=gnorm_scale, + gnorm_scale=1.0, skip_zeros=config["skip_zeros"], ) @@ -637,8 +581,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, max_unorm=0.0, skip_zeros=False, is_paged=False, @@ -665,10 +607,6 @@ def __init__( An object with 
additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. max_unorm (`float`, defaults to 0.0): The maximum value to normalize each block with. skip_zeros (`bool`, defaults to `False`): @@ -692,8 +630,6 @@ def __init__( args = {} args["optim_bits"] = optim_bits args["min_8bit_size"] = min_8bit_size - args["percentile_clipping"] = percentile_clipping - args["block_wise"] = block_wise args["max_unorm"] = max_unorm args["skip_zeros"] = skip_zeros @@ -731,18 +667,11 @@ def init_state(self, group, p, gindex, pindex): state["state1"] = self.get_state_buffer(p, dtype=torch.uint8) state["qmap1"] = self.name2qmap["dynamic"] - if config["block_wise"]: - blocksize = 256 - n = p.numel() - blocks = (n // blocksize) + bool(n % blocksize) - - state["absmax1"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) - else: - state["max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) - state["new_max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) + blocksize = 256 + n = p.numel() + blocks = (n // blocksize) + bool(n % blocksize) - if config["percentile_clipping"] < 100: - state["gnorm_vec"] = torch.zeros((100,), device=p.device) + state["absmax1"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) if config["max_unorm"] > 0.0: state["unorm_vec"] = torch.zeros((1,), device=p.device) @@ -761,16 +690,6 @@ def update_step(self, group, p, gindex, pindex): state["step"] += 1 step = state["step"] - if config["percentile_clipping"] < 100: - _current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping( - grad, - state["gnorm_vec"], - step, - config["percentile_clipping"], - ) - else: - gnorm_scale = 1.0 - if state["state1"].dtype == torch.float: F.optimizer_update_32bit( self.optimizer_name, @@ -786,38 +705,13 @@ def update_step(self, group, p, gindex, pindex): 0.0, 0.0, config["weight_decay"], - gnorm_scale, + 1.0, state["unorm_vec"] if config["max_unorm"] > 0.0 else None, max_unorm=config["max_unorm"], skip_zeros=config["skip_zeros"], ) - elif state["state1"].dtype == torch.uint8 and not config["block_wise"]: - F.optimizer_update_8bit( - self.optimizer_name, - grad, - p, - state["state1"], - None, - config["betas"][0], - config["betas"][1], - config["eps"], - step, - config["lr"], - state["qmap1"], - None, - state["max1"], - None, - state["new_max1"], - None, - config["weight_decay"], - gnorm_scale, - state["unorm_vec"] if config["max_unorm"] > 0.0 else None, - max_unorm=config["max_unorm"], - ) - - state["max1"], state["new_max1"] = state["new_max1"], state["max1"] - elif state["state1"].dtype == torch.uint8 and config["block_wise"]: + elif state["state1"].dtype == torch.uint8: F.optimizer_update_8bit_blockwise( self.optimizer_name, grad, @@ -836,6 +730,6 @@ def update_step(self, group, p, gindex, pindex): state["absmax1"], None, config["weight_decay"], - gnorm_scale=gnorm_scale, + gnorm_scale=1.0, skip_zeros=config["skip_zeros"], ) diff --git a/bitsandbytes/optim/rmsprop.py b/bitsandbytes/optim/rmsprop.py index 25611309b..54c1fbda0 100644 --- a/bitsandbytes/optim/rmsprop.py +++ b/bitsandbytes/optim/rmsprop.py @@ -18,8 +18,6 @@ def 
__init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Base RMSprop optimizer. @@ -45,10 +43,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if alpha == 0: raise NotImplementedError("RMSprop with alpha==0.0 is not supported!") @@ -64,8 +58,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -81,8 +73,6 @@ def __init__( centered=False, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 8-bit RMSprop optimizer. @@ -102,16 +92,10 @@ def __init__( The momentum value speeds up the optimizer by taking bigger steps. centered (`bool`, defaults to `False`): Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. - optim_bits (`int`, defaults to 32): - The number of bits of the optimizer state. args (`object`, defaults to `None`): An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if alpha == 0: raise NotImplementedError("RMSprop with alpha==0.0 is not supported!") @@ -127,8 +111,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -144,8 +126,6 @@ def __init__( centered=False, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 32-bit RMSprop optimizer. @@ -171,10 +151,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if alpha == 0: @@ -191,6 +167,4 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, ) diff --git a/bitsandbytes/optim/sgd.py b/bitsandbytes/optim/sgd.py index ec18f036c..75fc71474 100644 --- a/bitsandbytes/optim/sgd.py +++ b/bitsandbytes/optim/sgd.py @@ -17,8 +17,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Base SGD optimizer. @@ -42,10 +40,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. 
- percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") @@ -59,8 +53,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -75,8 +67,6 @@ def __init__( nesterov=False, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 8-bit SGD optimizer. @@ -98,10 +88,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") @@ -115,8 +101,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -131,8 +115,6 @@ def __init__( nesterov=False, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 32-bit SGD optimizer. @@ -154,10 +136,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") @@ -171,6 +149,4 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, ) diff --git a/bitsandbytes/research/__init__.py b/bitsandbytes/research/__init__.py deleted file mode 100644 index 31db4f282..000000000 --- a/bitsandbytes/research/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from . 
import nn -from .autograd._functions import ( - matmul_fp8_global, - matmul_fp8_mixed, - switchback_bnb, -) diff --git a/bitsandbytes/research/autograd/__init__.py b/bitsandbytes/research/autograd/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py deleted file mode 100644 index 1ea147a90..000000000 --- a/bitsandbytes/research/autograd/_functions.py +++ /dev/null @@ -1,396 +0,0 @@ -from functools import reduce # Required in Python 3 -import operator -from typing import Optional -import warnings - -import torch - -from bitsandbytes.autograd._functions import GlobalOutlierPooler, MatmulLtState -import bitsandbytes.functional as F - - -# math.prod not compatible with python < 3.8 -def prod(iterable): - return reduce(operator.mul, iterable, 1) - - -class MatMulFP8Mixed(torch.autograd.Function): - # forward is the same, but we added the fallback for pre-turing GPUs - # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") - - @staticmethod - def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): - # default of pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - - B_shape = B.shape - if A.shape[-1] == B_shape[0]: - return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) - - # 1. Dequantize - # 2. MatmulnN - cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz) - fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype) - - cB, state = F.quantize(B.float(), code=fw_code) - fp8B = F.dequantize(cB, state).to(B.dtype) - - output = torch.matmul(fp8A, fp8B) - - # output is half - - # 3. Save state - ctx.fw_code = fw_code - ctx.bw_code = bw_code - ctx.bsz = bsz - ctx.bsz2 = bsz2 - ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype - - if any(ctx.needs_input_grad[:2]): - # NOTE: we send back A, and re-quant. - ctx.tensors = (A, fp8B) - else: - ctx.tensors = (None, None) - - return output - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None, None - - req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad - A, B = ctx.tensors - - grad_A, grad_B = None, None - - # TODO: Fix blocksize to be output_dim - cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2) - fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype) - - # cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) - # fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) - - # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') - # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose - # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) - - # not supported by PyTorch. 
TODO: create work-around - if req_gradA: - grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) - - if req_gradB: - if len(A.shape) == 3: - At = A.transpose(2, 1).contiguous() - else: - At = A.transpose(1, 0).contiguous() - # cA, state = F.quantize(At.float(), code=ctx.fw_code) - # fp8At = F.dequantize(cA, state).to(A.dtype) - grad_B = torch.matmul(At.to(grad_output.dtype), grad_output).to(B.dtype) - - return grad_A, grad_B, None, None, None, None, None - - -class MatMulFP8Global(torch.autograd.Function): - # forward is the same, but we added the fallback for pre-turing GPUs - # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") - - @staticmethod - def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): - # default of pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - - B_shape = B.shape - if A.shape[-1] == B_shape[0]: - return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) - - # 1. Dequantize - # 2. MatmulnN - cA, state = F.quantize(A.float(), code=fw_code) - fp8A = F.dequantize(cA, state).to(A.dtype) - - cB, state = F.quantize(B.float(), code=fw_code) - fp8B = F.dequantize(cB, state).to(B.dtype) - - output = torch.matmul(fp8A, fp8B) - - # output is half - - # 3. Save state - ctx.fw_code = fw_code - ctx.bw_code = bw_code - ctx.bsz = bsz - ctx.bsz2 = bsz2 - ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype - - if any(ctx.needs_input_grad[:2]): - # NOTE: we send back A, and re-quant. - ctx.tensors = (A, fp8B) - else: - ctx.tensors = (None, None) - - return output - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None, None - - req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad - A, B = ctx.tensors - - grad_A, grad_B = None, None - - # TODO: Fix blocksize to be output_dim - cgrad_out, state = F.quantize(grad_output.float(), code=ctx.bw_code) - fp8out = F.dequantize(cgrad_out, state).to(grad_output.dtype) - - # cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) - # fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) - - # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') - # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose - # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) - - # not supported by PyTorch. 
TODO: create work-around - if req_gradA: - grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) - - if req_gradB: - if len(A.shape) == 3: - At = A.transpose(2, 1).contiguous() - else: - At = A.transpose(1, 0).contiguous() - cA, state = F.quantize(At.float(), code=ctx.fw_code) - fp8At = F.dequantize(cA, state).to(A.dtype) - grad_B = torch.matmul(fp8At.to(fp8out.dtype), fp8out).to(B.dtype) - - return grad_A, grad_B, None, None, None, None, None - - -class SwitchBackBnb(torch.autograd.Function): - @staticmethod - def forward(ctx, A, B, out=None, bias=None, state: Optional[MatmulLtState] = None): - state = state or MatmulLtState() - - # default to pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - ctx.bias = bias - if A.shape[-1] == B.shape[0]: - return torch.empty(A.shape[:-1] + B.shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1] + B.shape[:1], dtype=A.dtype, device=A.device) - - # 1. Quantize A - # 2. Quantize B - # 3. Matmul - # 4. Mixed-precision decomposition matmul - # 5. Save state - input_shape = A.shape - if state.outlier_pool is None: - state.outlier_pool = GlobalOutlierPooler.get_instance() - - # Cast A to fp16 - if A.dtype != torch.float16: - warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization") - - # 1. Quantize A - if len(A.shape) == 3: - A = A.view(-1, A.shape[-1]).contiguous() - CA, CAt, SCA, SCAt, outlier_cols = F.int8_double_quant(A.to(torch.float16), threshold=state.threshold) - - if state.threshold > 0.0 and outlier_cols is not None: - if state.has_fp16_weights: - idx = outlier_cols - CA[:, idx] = 0 - subA = A[:, idx] - state.subB = B[:, idx].t().contiguous() - state.idx = idx - else: - if state.SB is None: - state.SB = (state.CB.shape, "row") - else: - if not state.has_fp16_weights and state.SB is None: - state.SB = (state.CB.shape, "row") - subA = None - - # 2. Quantize B - if state.has_fp16_weights: - # print('B shape', B.shape) - has_grad = getattr(B, "grad", None) is not None - is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1) - if is_transposed: - B = B.contiguous() - - if (state.is_training and not has_grad) or state.SB is None: - state.reset_grads() - ( - state.CB, - state.CBt, - state.SCB, - state.SCBt, - _, - ) = F.int8_double_quant(B.to(torch.float16)) - state.SB = (state.CB.shape, "row") - else: - has_grad = False - - if outlier_cols is not None and not state.has_fp16_weights: - # extract outliers - state.idx = outlier_cols - outliers = state.CB[:, state.idx.long()].clone() - state.subB = (outliers * state.SCB.view(-1, 1) / 127.0).t().contiguous().to(A.dtype) - CA[:, state.idx.long()] = 0 - - subA = A[:, state.idx.long()] - - shapeB = state.SB[0] - - if len(input_shape) == 3: - output_shape = (input_shape[0], input_shape[1], shapeB[0]) - else: - output_shape = (input_shape[0], shapeB[0]) - - # 3. Matmul - out32 = F.int8_linear_matmul(CA, state.CB) - # we apply the fused bias here - - if bias is None or bias.dtype == torch.float16: - output = F.int8_mm_dequant(out32, SCA, state.SCB, bias=bias).to(A.dtype) - else: # apply bias separately - output = F.int8_mm_dequant(out32, SCA, state.SCB, bias=None).to(A.dtype) - output.add_(bias) - - # 4. Mixed-precision decomposition matmul - if outlier_cols is not None and subA is not None: - output += torch.matmul(subA, state.subB) - - # 5. 
Save state - ctx.state = state - - ctx.grad_shape = input_shape - ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype - - if any(ctx.needs_input_grad[:2]): - ctx.tensors = (CAt, subA, A) - ctx.tensor_states = (SCAt, state.idx) - else: - ctx.tensors = [None, None, None] - ctx.tensor_states = (None, None) - ctx.save_for_backward(None, None) - - clone_func = torch.clone if len(output_shape) == 3 else lambda x: x - return clone_func(output.view(output_shape)) - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias) - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None - - req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad - _CAt, _subA, A = ctx.tensors - _SCAt, _idx = ctx.tensor_states - state = ctx.state - grad_A = grad_B = grad_bias = None - - if req_gradBias: - # compute grad_bias first before changing grad_output dtype - grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias) - - # Cast grad_output to fp16 - if len(grad_output.shape) == 3: - grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - - _Cgrad, _Cgradt, _SCgrad, _SCgradt, _outlier_cols = F.int8_double_quant(grad_output.to(torch.float16)) - - if req_gradB: - # print('back A shape', A.shape) - # print('grad output t shape', grad_output.t().shape) - grad_B = torch.matmul(grad_output.t(), A) - - if req_gradA: - if state.CB is not None: - CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0)) - grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A) - else: - raise Exception("State must contain either CBt or CB matrix for backward") - - return grad_A, grad_B, None, grad_bias, None - - -def get_block_sizes(input_matrix, weight_matrix): - input_features = input_matrix.shape[-1] - output_features = weight_matrix.shape[0] if weight_matrix.shape[1] == input_features else weight_matrix.shape[1] - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - bsz, bsz2 = 1024, 1024 - for i, k in enumerate(array): - if input_features > array[i + 1]: - bsz = k - break - for i, k in enumerate(array): - if output_features > array[i + 1]: - bsz2 = k - break - - return bsz, bsz2 - - -def matmul_fp8_global( - A: torch.Tensor, - B: torch.Tensor, - fw_code: torch.Tensor, - bw_code: torch.Tensor, - out: Optional[torch.Tensor] = None, - bsz: int = -1, - bsz2: int = -1, -): - if bsz == -1 or bsz2 == -1: - bsz, bsz2 = get_block_sizes(A, B) - return MatMulFP8Global.apply(A, B, out, fw_code, bw_code, bsz, bsz2) - - -def matmul_fp8_mixed( - A: torch.Tensor, - B: torch.Tensor, - fw_code: torch.Tensor, - bw_code: torch.Tensor, - out: Optional[torch.Tensor] = None, - bsz: int = -1, - bsz2: int = -1, -): - if bsz == -1 or bsz2 == -1: - bsz, bsz2 = get_block_sizes(A, B) - return MatMulFP8Mixed.apply(A, B, out, fw_code, bw_code, bsz, bsz2) - - -def switchback_bnb( - A: torch.Tensor, - B: torch.Tensor, - out: Optional[torch.Tensor] = None, - state: Optional[MatmulLtState] = None, - threshold=0.0, - bias=None, -): - state = state or MatmulLtState() - if threshold > 0.0: - state.threshold = threshold - return SwitchBackBnb.apply(A, B, out, bias, state) diff --git a/bitsandbytes/research/nn/__init__.py b/bitsandbytes/research/nn/__init__.py deleted file mode 100644 index 417011218..000000000 --- a/bitsandbytes/research/nn/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .modules import LinearFP8Global, LinearFP8Mixed diff --git 
a/bitsandbytes/research/nn/modules.py b/bitsandbytes/research/nn/modules.py deleted file mode 100644 index 57c0f3358..000000000 --- a/bitsandbytes/research/nn/modules.py +++ /dev/null @@ -1,76 +0,0 @@ -from typing import TypeVar - -import torch -from torch import nn - -import bitsandbytes as bnb - -T = TypeVar("T", bound="torch.nn.Module") - - -class LinearFP8Mixed(nn.Linear): - def __init__(self, input_features, output_features, bias=True): - super().__init__(input_features, output_features, bias) - self.bw_code = None - self.fw_code = None - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - for i, k in enumerate(array): - if input_features > array[i + 1]: - self.bsz = k - break - for i, k in enumerate(array): - if output_features > array[i + 1]: - self.bsz2 = k - break - - def forward(self, x: torch.Tensor): - if self.fw_code is None: - self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) - self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - - out = bnb.research.matmul_fp8_mixed( - x, - self.weight.t(), - fw_code=self.fw_code, - bw_code=self.bw_code, - bsz=self.bsz, - bsz2=self.bsz2, - ) - if self.bias is not None: - out += self.bias - - return out - - -class LinearFP8Global(nn.Linear): - def __init__(self, input_features, output_features, bias=True): - super().__init__(input_features, output_features, bias) - self.bw_code = None - self.fw_code = None - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - for i, k in enumerate(array): - if input_features > array[i + 1]: - self.bsz = k - break - for i, k in enumerate(array): - if output_features > array[i + 1]: - self.bsz2 = k - break - - def forward(self, x: torch.Tensor): - if self.fw_code is None: - self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) - self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - - out = bnb.matmul_fp8_global( - x, - self.weight.t(), - fw_code=self.fw_code, - bw_code=self.bw_code, - bsz=self.bsz, - bsz2=self.bsz2, - ) - if self.bias is not None: - out += self.bias - - return out diff --git a/bitsandbytes/triton/__init__.py b/bitsandbytes/triton/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/bitsandbytes/triton/dequantize_rowwise.py b/bitsandbytes/triton/dequantize_rowwise.py deleted file mode 100644 index 26eab84f2..000000000 --- a/bitsandbytes/triton/dequantize_rowwise.py +++ /dev/null @@ -1,64 +0,0 @@ -import math - -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): - return None -else: - import triton - import triton.language as tl - - # rowwise quantize - - # TODO: autotune this better. 
- @triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=["n_elements"], - ) - @triton.jit - def _dequantize_rowwise( - x_ptr, - state_x, - output_ptr, - inv_127, - n_elements, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, - ): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - arange = tl.arange(0, P2) - offsets = block_start + arange - row_mask = arange < BLOCK_SIZE - x = tl.load(x_ptr + offsets, mask=row_mask) - max_val = tl.load(state_x + pid) - output = max_val * x * inv_127 - tl.store(output_ptr + offsets, output, mask=row_mask) - - def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): - output = torch.empty(*x.shape, device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (x.shape[0],) - _dequantize_rowwise[grid](x, state_x, output, 1.0 / 127, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) - return output diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py deleted file mode 100644 index 5fcb927d4..000000000 --- a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py +++ /dev/null @@ -1,206 +0,0 @@ -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias): - return None -else: - import triton - import triton.language as tl - - from .matmul_perf_model import early_config_prune, estimate_matmul_time - - # This is a matmul kernel based on triton.ops.matmul - # It is modified to support rowwise quantized input and global quantized weight - # It's purpose is fused matmul then dequantize - # It does support bias. 
- - def init_to_zero(name): - return lambda nargs: nargs[name].zero_() - - def get_configs_io_bound(): - configs = [] - for num_stages in [2, 3, 4, 5, 6]: - for block_m in [16, 32]: - for block_k in [32, 64]: - for block_n in [32, 64, 128, 256]: - num_warps = 2 if block_n <= 64 else 4 - configs.append( - triton.Config( - {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": 1}, - num_stages=num_stages, - num_warps=num_warps, - ), - ) - # split_k - for split_k in [2, 4, 8, 16]: - configs.append( - triton.Config( - {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": split_k}, - num_stages=num_stages, - num_warps=num_warps, - pre_hook=init_to_zero("C"), - ), - ) - return configs - - @triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2), - *get_configs_io_bound(), - ], - key=["M", "N", "K"], - prune_configs_by={"early_config_prune": early_config_prune, "perf_model": estimate_matmul_time, "top_k": 10}, - ) - @triton.heuristics( - { - "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0, - }, - ) - @triton.jit - def _int8_matmul_mixed_dequantize( - A, - B, - C, - bias, - state_x_ptr, - state_w_ptr, - M, - N, - K, - divfactor: tl.constexpr, - has_bias: tl.constexpr, - stride_am, - stride_ak, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, - BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, - SPLIT_K: tl.constexpr, - EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr, - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, 
BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better L2 performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - - w_factor = tl.load(state_w_ptr) - x_factor = tl.load(state_x_ptr + ram)[:, None] - - # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.0) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.0) - acc += tl.dot(a, b) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - - acc = w_factor * (x_factor * (acc * divfactor)) - acc = acc.to(C.dtype.element_ty) - - # conditionally add bias - if has_bias: - bias = tl.load(bias + rn).to(C.dtype.element_ty) - acc = acc + bias[None, :] - - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) - - def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias): - device = a.device - divfactor = 1.0 / (127.0 * 127.0) - has_bias = 0 if bias is None else 1 - # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=torch.float16) - # accumulator types - ACC_TYPE = tl.float32 # if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch int8_matmul_mixed_dequantize kernel - grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), META["SPLIT_K"]) - _int8_matmul_mixed_dequantize[grid]( - a, - b, - c, - bias, - state_x, - state_w, - M, - N, - K, - divfactor, - has_bias, - a.stride(0), - a.stride(1), - b.stride(0), - b.stride(1), - c.stride(0), - c.stride(1), - GROUP_M=8, - ACC_TYPE=ACC_TYPE, - ) - return c diff --git a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py deleted file mode 100644 index 05e30a4c9..000000000 --- a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py +++ /dev/null @@ -1,207 +0,0 @@ -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): - return None -else: - import triton - import triton.language as tl - - from .matmul_perf_model import early_config_prune, 
estimate_matmul_time - - # This is a matmul kernel based on triton.ops.matmul - # It is modified to support rowwise quantized input and columnwise quantized weight - # It's purpose is fused matmul then dequantize - # It does support bias. - - def init_to_zero(name): - return lambda nargs: nargs[name].zero_() - - def get_configs_io_bound(): - configs = [] - for num_stages in [2, 3, 4, 5, 6]: - for block_m in [16, 32]: - for block_k in [32, 64]: - for block_n in [32, 64, 128, 256]: - num_warps = 2 if block_n <= 64 else 4 - configs.append( - triton.Config( - {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": 1}, - num_stages=num_stages, - num_warps=num_warps, - ), - ) - # split_k - for split_k in [2, 4, 8, 16]: - configs.append( - triton.Config( - {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": split_k}, - num_stages=num_stages, - num_warps=num_warps, - pre_hook=init_to_zero("C"), - ), - ) - return configs - - @triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2), - *get_configs_io_bound(), - ], - key=["M", "N", "K"], - prune_configs_by={"early_config_prune": early_config_prune, "perf_model": estimate_matmul_time, "top_k": 10}, - ) - @triton.heuristics( - { - "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0, - }, - ) - @triton.jit - def _int8_matmul_rowwise_dequantize( - A, - B, - C, - bias, - state_x_ptr, - state_w_ptr, - M, - N, - K, - divfactor, - has_bias: tl.constexpr, - stride_am, - stride_ak, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, - 
BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, - SPLIT_K: tl.constexpr, - EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr, - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better L2 performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - - w_factor = tl.load(state_w_ptr + rbn)[None, :] - x_factor = tl.load(state_x_ptr + ram)[:, None] - - # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.0) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.0) - acc += tl.dot(a, b) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - - acc = w_factor * (x_factor * (acc * divfactor)) - acc = acc.to(C.dtype.element_ty) - - if has_bias: - bias = tl.load(bias + rn).to(C.dtype.element_ty) - acc = acc + bias[None, :] - - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) - - def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): - divfactor = 1.0 / (127.0 * 127.0) - - has_bias = 0 if bias is None else 1 - - device = a.device - # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=torch.float16) - # accumulator types - ACC_TYPE = tl.float32 # if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch int8_matmul_rowwise_dequantize kernel - grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), META["SPLIT_K"]) - _int8_matmul_rowwise_dequantize[grid]( - a, - b, - c, - bias, - state_x, - state_w, - M, - N, - K, - divfactor, - has_bias, - a.stride(0), - a.stride(1), - b.stride(0), - b.stride(1), - c.stride(0), - c.stride(1), - GROUP_M=8, - ACC_TYPE=ACC_TYPE, - ) - return c diff --git a/bitsandbytes/triton/matmul_perf_model.py b/bitsandbytes/triton/matmul_perf_model.py deleted file mode 100644 index e843a3a39..000000000 --- a/bitsandbytes/triton/matmul_perf_model.py +++ /dev/null @@ -1,211 +0,0 @@ -# Adapted from 
https://github.com/triton-lang/kernels/blob/eeeebdd8be7d13629de22d600621e6234057eed3/kernels/matmul_perf_model.py -# https://github.com/triton-lang/kernels is licensed under the MIT License. - -import functools -import heapq - -import torch - -from triton import cdiv -from triton.runtime import driver -from triton.testing import ( - get_dram_gbps, - get_max_simd_tflops, - get_max_tensorcore_tflops, - nvsmi, -) - - -@functools.lru_cache -def get_clock_rate_in_khz(): - try: - return nvsmi(["clocks.max.sm"])[0] * 1e3 - except FileNotFoundError: - import pynvml - - pynvml.nvmlInit() - handle = pynvml.nvmlDeviceGetHandleByIndex(0) - return pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_SM) * 1e3 - - -def get_tensorcore_tflops(device, num_ctas, num_warps, dtype): - """return compute throughput in TOPS""" - total_warps = num_ctas * min(num_warps, 4) - num_subcores = driver.active.utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs - tflops = ( - min(num_subcores, total_warps) - / num_subcores - * get_max_tensorcore_tflops(dtype, get_clock_rate_in_khz(), device) - ) - return tflops - - -def get_simd_tflops(device, num_ctas, num_warps, dtype): - """return compute throughput in TOPS""" - total_warps = num_ctas * min(num_warps, 4) - num_subcores = driver.active.utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs - tflops = ( - min(num_subcores, total_warps) / num_subcores * get_max_simd_tflops(dtype, get_clock_rate_in_khz(), device) - ) - return tflops - - -def get_tflops(device, num_ctas, num_warps, dtype): - capability = torch.cuda.get_device_capability(device) - if capability[0] < 8 and dtype == torch.float32: - return get_simd_tflops(device, num_ctas, num_warps, dtype) - return get_tensorcore_tflops(device, num_ctas, num_warps, dtype) - - -def estimate_matmul_time( - # backend, device, - num_warps, - num_stages, # - A, - B, - C, # - M, - N, - K, # - BLOCK_M, - BLOCK_N, - BLOCK_K, - SPLIT_K, # - debug=False, - **kwargs, # -): - """return estimated running time in ms - = max(compute, loading) + store""" - device = torch.cuda.current_device() - dtype = A.dtype - dtsize = A.element_size() - - num_cta_m = cdiv(M, BLOCK_M) - num_cta_n = cdiv(N, BLOCK_N) - num_cta_k = SPLIT_K - num_ctas = num_cta_m * num_cta_n * num_cta_k - - # If the input is smaller than the block size - M, N = max(M, BLOCK_M), max(N, BLOCK_N) - - # time to compute - total_ops = 2 * M * N * K / (1024 * 1024 * 1024) # GOPS - tput = get_tflops(device, num_ctas, num_warps, dtype) - compute_ms = total_ops / tput - - # time to load data - num_sm = driver.active.utils.get_device_properties(device)["multiprocessor_count"] - active_cta_ratio = min(1, num_ctas / num_sm) - active_cta_ratio_bw1 = min(1, num_ctas / 32) # 32 active ctas are enough to saturate - active_cta_ratio_bw2 = max(min(1, (num_ctas - 32) / (108 - 32)), 0) # 32-108, remaining 5% - dram_bw = get_dram_gbps(device) * (active_cta_ratio_bw1 * 0.95 + active_cta_ratio_bw2 * 0.05) # in GB/s - l2_bw = dram_bw * 4 # rough estimation (should be 4.7 for A100?) 
- # assume 80% of (following) loads are in L2 cache - load_a_dram = M * K * dtsize * (1 + 0.2 * (num_cta_n - 1)) - load_a_l2 = M * K * dtsize * 0.8 * (num_cta_n - 1) - load_b_dram = N * K * dtsize * (1 + 0.2 * (num_cta_m - 1)) - load_b_l2 = N * K * dtsize * 0.8 * (num_cta_m - 1) - # total - total_dram = (load_a_dram + load_b_dram) / (1024 * 1024) # MB - total_l2 = (load_a_l2 + load_b_l2) / (1024 * 1024) - # loading time in ms - load_ms = total_dram / dram_bw + total_l2 / l2_bw - - # estimate storing time - store_bw = dram_bw * 0.6 # :o - store_c_dram = M * N * dtsize * SPLIT_K / (1024 * 1024) # MB - if SPLIT_K == 1: - store_ms = store_c_dram / store_bw - else: - reduce_bw = store_bw - store_ms = store_c_dram / reduce_bw - # c.zero_() - zero_ms = M * N * 2 / (1024 * 1024) / store_bw - store_ms += zero_ms - - total_time_ms = max(compute_ms, load_ms) + store_ms - if debug: - print( - f"Total time: {total_time_ms}ms, compute time: {compute_ms}ms, " - f"loading time: {load_ms}ms, store time: {store_ms}ms, " - f"Activate CTAs: {active_cta_ratio * 100}%" - ) - return total_time_ms - - -def early_config_prune(configs, named_args, **kwargs): - device = torch.cuda.current_device() - capability = torch.cuda.get_device_capability() - # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages - dtsize = named_args["A"].element_size() - dtype = named_args["A"].dtype - - # 1. make sure we have enough smem - pruned_configs = [] - for config in configs: - kw = config.kwargs - BLOCK_M, BLOCK_N, BLOCK_K, num_stages = ( - kw["BLOCK_M"], - kw["BLOCK_N"], - kw["BLOCK_K"], - config.num_stages, - ) - - max_shared_memory = driver.active.utils.get_device_properties(device)["max_shared_mem"] - required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize - if required_shared_memory <= max_shared_memory: - pruned_configs.append(config) - configs = pruned_configs - - # Some dtypes do not allow atomic_add - if dtype not in [torch.float16, torch.float32]: - configs = [config for config in configs if config.kwargs["SPLIT_K"] == 1] - - # group configs by (BLOCK_M,_N,_K, SPLIT_K, num_warps) - configs_map = {} - for config in configs: - kw = config.kwargs - BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages = ( - kw["BLOCK_M"], - kw["BLOCK_N"], - kw["BLOCK_K"], - kw["SPLIT_K"], - config.num_warps, - config.num_stages, - ) - - key = (BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps) - if key in configs_map: - configs_map[key].append((config, num_stages)) - else: - configs_map[key] = [(config, num_stages)] - - pruned_configs = [] - for k, v in configs_map.items(): - BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps = k - if capability[0] >= 8: - # compute cycles (only works for ampere GPUs) - mmas = BLOCK_M * BLOCK_N * BLOCK_K / (16 * 8 * 16) - mma_cycles = mmas / min(4, num_warps) * 8 - - ldgsts_latency = 300 # Does this matter? 
- optimal_num_stages = ldgsts_latency / mma_cycles - - # nearest stages, prefer large #stages - nearest = heapq.nsmallest( - 2, - v, - key=lambda x: ( - 10 + abs(x[1] - optimal_num_stages) - if (x[1] - optimal_num_stages) < 0 - else x[1] - optimal_num_stages - ), - ) - - for n in nearest: - pruned_configs.append(n[0]) - else: # Volta & Turing only supports num_stages <= 2 - random_config = v[0][0] - random_config.num_stages = 2 - pruned_configs.append(random_config) - return pruned_configs diff --git a/bitsandbytes/triton/quantize_columnwise_and_transpose.py b/bitsandbytes/triton/quantize_columnwise_and_transpose.py deleted file mode 100644 index b8eeffd0c..000000000 --- a/bitsandbytes/triton/quantize_columnwise_and_transpose.py +++ /dev/null @@ -1,75 +0,0 @@ -import math - -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def quantize_columnwise_and_transpose(x: torch.Tensor): - return None -else: - import triton - import triton.language as tl - - # This kernel does fused columnwise quantization and transpose. - - # TODO: autotune this better. - @triton.autotune( - configs=[ - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_stages=16), - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=16, num_warps=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=["n_elements"], - ) - @triton.jit - def _quantize_columnwise_and_transpose( - x_ptr, - output_ptr, - output_maxs, - n_elements, - M: tl.constexpr, - N: tl.constexpr, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, - ): - pid = tl.program_id(axis=0) - block_start = pid - p2_arange = tl.arange(0, P2) - p2_arange_mask = p2_arange < M - arange = p2_arange * N - offsets = block_start + arange - x = tl.load(x_ptr + offsets, mask=p2_arange_mask) - abs_x = tl.abs(x) - max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127.0 * (x / max_val)) - - new_start = pid * M - new_offsets = new_start + p2_arange - tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask) - tl.store(output_maxs + pid, max_val) - - def quantize_columnwise_and_transpose(x: torch.Tensor): - M, N = x.shape - output = torch.empty(N, M, device=x.device, dtype=torch.int8) - output_maxs = torch.empty(x.shape[1], device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(M)))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) - _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) - return output, output_maxs diff --git a/bitsandbytes/triton/quantize_global.py b/bitsandbytes/triton/quantize_global.py deleted file mode 100644 index f35bdd304..000000000 --- a/bitsandbytes/triton/quantize_global.py +++ /dev/null @@ -1,124 +0,0 @@ -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def quantize_global_transpose(input): - return None - - def quantize_global(x: torch.Tensor): - return None -else: - import triton - import triton.language as tl - - # global quantize - 
@triton.autotune( - configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4), - triton.Config({"BLOCK_SIZE": 2048}, num_stages=1), - ], - key=["n_elements"], - ) - @triton.jit - def _quantize_global( - x_ptr, - absmax_inv_ptr, - output_ptr, - n_elements, - BLOCK_SIZE: tl.constexpr, - ): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - absmax_inv = tl.load(absmax_inv_ptr) - output = tl.libdevice.llrint(127.0 * (x * absmax_inv)) - tl.store(output_ptr + offsets, output, mask=mask) - - def quantize_global(x: torch.Tensor): - absmax = x.abs().max().unsqueeze(0) - absmax_inv = 1.0 / absmax - output = torch.empty(*x.shape, device="cuda", dtype=torch.int8) - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) - _quantize_global[grid](x, absmax_inv, output, n_elements) - return output, absmax - - # global quantize and transpose - @triton.autotune( - configs=[ - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "GROUP_M": 8}, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "GROUP_M": 8}, num_warps=4), - # ... - ], - key=["M", "N"], - ) - @triton.jit - def _quantize_global_transpose( - A, - absmax_inv_ptr, - B, - stride_am, - stride_an, - stride_bn, - stride_bm, - M, - N, - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, - GROUP_M: tl.constexpr, - ): - pid = tl.program_id(0) - grid_m = (M + BLOCK_M - 1) // BLOCK_M - grid_n = (N + BLOCK_N - 1) // BLOCK_N - - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // group_size - - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an) - mask = (rm < M)[:, None] & (rn < N)[None, :] - a = tl.load(A, mask=mask) - absmax_inv = tl.load(absmax_inv_ptr) - - # rematerialize to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - B = B + (rm[:, None] * stride_bm + rn[None, :] * stride_bn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - - output = tl.libdevice.llrint(127.0 * (a * absmax_inv)) - - tl.store(B, output, mask=mask) - - def quantize_global_transpose(input): - absmax = input.abs().max().unsqueeze(0) - absmax_inv = 1.0 / absmax - M, N = input.shape - out = torch.empty(N, M, device="cuda", dtype=torch.int8) - - assert out.size(0) == N and out.size(1) == M - assert input.stride(0) == 1 or input.stride(1) == 1 - assert out.stride(0) == 1 or out.stride(1) == 1 - - grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),) - _quantize_global_transpose[grid]( - input, - absmax_inv, - out, - input.stride(0), - input.stride(1), - out.stride(0), - out.stride(1), - M, - N, - ) - return out, absmax diff --git a/bitsandbytes/triton/quantize_rowwise.py b/bitsandbytes/triton/quantize_rowwise.py deleted file mode 100644 index f92ace02c..000000000 --- a/bitsandbytes/triton/quantize_rowwise.py +++ /dev/null @@ -1,67 +0,0 @@ -import math - -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def quantize_rowwise(x: torch.Tensor): - return None -else: - import triton - import triton.language as tl - - # rowwise quantize - - # TODO: autotune this better. 
- @triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=["n_elements"], - ) - @triton.jit - def _quantize_rowwise( - x_ptr, - output_ptr, - output_maxs, - n_elements, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, - ): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - arange = tl.arange(0, P2) - offsets = block_start + arange - row_mask = arange < BLOCK_SIZE - x = tl.load(x_ptr + offsets, mask=row_mask) - - abs_x = tl.abs(x) - max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127.0 * (x / max_val)) - tl.store(output_ptr + offsets, output, mask=row_mask) - tl.store(output_maxs + pid, max_val) - - def quantize_rowwise(x: torch.Tensor): - output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) - output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (x.shape[0],) - _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) - return output, output_maxs diff --git a/bitsandbytes/triton/triton_utils.py b/bitsandbytes/triton/triton_utils.py deleted file mode 100644 index f6bedd8cd..000000000 --- a/bitsandbytes/triton/triton_utils.py +++ /dev/null @@ -1,11 +0,0 @@ -import functools - - -@functools.lru_cache(None) -def is_triton_available(): - try: - from torch.utils._triton import has_triton, has_triton_package - - return has_triton_package() and has_triton() - except Exception: - return False diff --git a/docs/source/optimizers.mdx b/docs/source/optimizers.mdx index 7d04f82b1..3e5f6a2aa 100644 --- a/docs/source/optimizers.mdx +++ b/docs/source/optimizers.mdx @@ -30,12 +30,12 @@ import bitsandbytes as bnb adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384) ``` -Other parameters you can configure include the learning rate (`lr`), the decay rates (`betas`), the number of bits of the optimizer state (`optim_bits`), and percentile clipping (`percentile_clipping`) which can increase stability. For example, to initialize a 32-bit [`~bitsandbytes.optim.Adam`] optimizer with 5th percentile clipping: +Other parameters you can configure include the learning rate (`lr`), the decay rates (`betas`), and the number of bits of the optimizer state (`optim_bits`). 
For example, to initialize a 32-bit [`~bitsandbytes.optim.Adam`] optimizer: ```py import bitsandbytes as bnb -adam = bnb.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995), optim_bits=32, percentile_clipping=5) +adam = bnb.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995), optim_bits=32) ``` ## Optimize unstable parameters diff --git a/pyproject.toml b/pyproject.toml index f448a079e..21e11261b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,7 @@ exclude_also = [ ] [tool.pytest.ini_options] -addopts = "-rP -m 'not slow and not benchmark and not deprecated'" +addopts = "-rP -m 'not slow and not benchmark'" # ; --cov=bitsandbytes # ; # contexts: record which test ran which line; can be seen in html coverage report # ; --cov-context=test @@ -98,7 +98,6 @@ log_cli_level = "INFO" log_file = "logs/pytest.log" markers = [ "benchmark: mark test as a benchmark", - "deprecated: mark test as covering a deprecated feature", "slow: mark test as slow", ] diff --git a/tests/test_autograd.py b/tests/test_autograd.py index 7134925c1..b942e6ab7 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -22,8 +22,8 @@ @pytest.mark.parametrize("decomp", [0.0, 6.0], ids=id_formatter("decomp")) @pytest.mark.parametrize( "funcs", - [(torch.matmul, bnb.matmul), (torch.matmul, bnb.research.switchback_bnb)], - ids=["func=matmul", "func=switchback_bnb"], + [(torch.matmul, bnb.matmul)], + ids=["func=matmul"], ) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) @@ -34,10 +34,6 @@ def test_matmullt( device, dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, decomp, has_fp16_weights, has_bias ): if device != "cuda": - if funcs[1] == bnb.research.switchback_bnb: - # TODO: Deprecate/remove? - pytest.skip("switchback_bnb only works on CUDA.") - if req_grad[1]: # This will be deprecated for CUDA in the future. We don't expect # this to work on any other device. 
diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py deleted file mode 100644 index feb17c966..000000000 --- a/tests/test_deprecated.py +++ /dev/null @@ -1,175 +0,0 @@ -import pytest -import torch - -import bitsandbytes as bnb -from bitsandbytes import functional as F -from tests.helpers import BOOLEAN_TRIPLES, describe_dtype, get_test_dims, id_formatter -from tests.test_autograd import TRANSPOSE_VALS - - -@pytest.mark.deprecated -def test_dynamic_quantization(): - diffs = [] - reldiffs = [] - for i in range(100): - A1 = torch.randn(1024, 1024, device="cuda") - C, S = F.quantize(A1) - A2 = F.dequantize(C, S) - diff = torch.abs(A1 - A2) - reldiff = diff / torch.abs(A1 + 1e-8) - diffs.append(diff.mean().item()) - reldiffs.append(reldiff.mean().item()) - assert diff.mean().item() < 0.0135 - print(sum(diffs) / len(diffs)) - print(sum(reldiffs) / len(reldiffs)) - - for i in range(100): - A1 = torch.rand(1024, 1024, device="cuda") - C, S = F.quantize(A1) - A2 = F.dequantize(C, S) - diff = torch.abs(A1 - A2).mean().item() - torch.testing.assert_close(A1, A2, atol=1e-2, rtol=0) - assert diff < 0.004 - - -@pytest.mark.parametrize("gtype", [torch.float32, torch.float16], ids=["float", "half"]) -@pytest.mark.deprecated -def test_percentile_clipping(gtype): - gnorm_vec1 = torch.zeros(100, device="cuda") - gnorm_vec2 = torch.zeros(100, device="cuda") - n = 4 - step = 0 - percentile = 5 - for i in range(20): - step += 1 - g = torch.randn(n, n, dtype=gtype, device="cuda") - gnorm1, clip2, gnorm_scale = F.percentile_clipping(g, gnorm_vec2, step, percentile=percentile) - assert gnorm_scale == 1.0 if gnorm1 < clip2 else clip2 / gnorm1 - - gnorm2 = torch.norm(g.float()) - if step == 1: - gnorm_vec1[:] = gnorm2 - else: - gnorm_vec1[step % 100] = gnorm2 - - vals, _ = torch.sort(gnorm_vec1) - clip1 = vals[percentile] - - torch.testing.assert_close(gnorm_vec1, torch.sqrt(gnorm_vec2)) - torch.testing.assert_close(clip1, clip2) - torch.testing.assert_close(gnorm1, gnorm2) - - -@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) -@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) -@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) -@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) -@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) -@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) -@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype) -@pytest.mark.parametrize( - "funcs", - [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)], - ids=["matmul_fp8_mixed", "matmul_fp8_global"], -) -@pytest.mark.deprecated -@pytest.mark.skip("Deprecated functionality, to be removed.") -def test_matmul_fp8(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): - dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) - dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) - req_grad = list(req_grad) - req_grad[2] = False - - for i in range(3): - # normal multiply - if funcs[0] in [torch.mm, torch.matmul]: - A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype) - B = torch.randn(size=dimB, device="cuda", requires_grad=req_grad[1], dtype=dtype) - target = torch.randn(size=(dim2, dim4), device="cuda", requires_grad=req_grad[1], dtype=dtype) - - torch.nn.init.xavier_uniform_(B) - - fw_code = 
bnb.functional.create_fp8_map(True, 4, 3, 8).to(A.device) - bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(A.device) - - if not transpose[0] and transpose[1]: - out_torch = funcs[0](A, B.t()) - out_bnb = funcs[1](A, B.t(), fw_code, bw_code) - elif not transpose[0] and not transpose[1]: - out_torch = funcs[0](A, B) - out_bnb = funcs[1](A, B, fw_code, bw_code) - - assert out_bnb.dtype == A.dtype, f"bnb matmullt received {A.dtype} but returned {out_bnb.dtype}" - - n = out_bnb.numel() - err = torch.abs(out_bnb - out_torch).float().mean().item() - if n > 0: - assert err < 0.115 - # assert err < 0.20 - if any(req_grad): - out_bnb.data.copy_(out_torch) - torch.cuda.synchronize() - loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean() - loss_bnb.backward() - gradA1 = A.grad - gradB1 = B.grad - A.grad = None - B.grad = None - - loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean() - loss_torch.backward() - gradA2 = A.grad - gradB2 = B.grad - A.grad = None - B.grad = None - - if req_grad[0]: - torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1) - - if req_grad[1]: - n = gradB1.numel() - if dim2 > 0: - assert torch.abs(gradB1).sum() > 0.0 - assert torch.abs(gradB2).sum() > 0.0 - else: - assert torch.abs(gradB1).sum() == 0.0 - assert torch.abs(gradB2).sum() == 0.0 - idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3) - - assert (idx == 0).sum().item() <= n * 0.1 - idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3) - assert (idx == 0).sum().item() <= n * 0.02 - grad_err = (gradB1 - gradB2).abs().mean() - assert grad_err.item() < 0.003 - torch.testing.assert_close(gradB1, gradB2, atol=0.18, rtol=0.3) - - -@pytest.mark.deprecated -def test_fp8linear(): - b = 10 - h = 1024 - inp = torch.randn(b, h).cuda() - fp32 = torch.nn.Linear(h, h * 2).cuda() - fp8 = bnb.research.nn.LinearFP8Mixed(h, h * 2).cuda() - fp32b = torch.nn.Linear(h * 2, h).cuda() - fp8b = bnb.research.nn.LinearFP8Mixed(h * 2, h).cuda() - - fp8.weight.data.copy_(fp32.weight.data) - fp8.bias.data.copy_(fp32.bias.data) - fp8b.weight.data.copy_(fp32b.weight.data) - fp8b.bias.data.copy_(fp32b.bias.data) - - a = fp32b(torch.nn.functional.gelu(fp32(inp))) - b = fp8b(torch.nn.functional.gelu(fp8(inp))) - - err = (a - b).abs().mean() - - a.mean().backward() - b.mean().backward() - - graderr = (fp8.weight.grad - fp32.weight.grad).abs().mean() - bgraderr = (fp8.bias.grad - fp32.bias.grad).abs().mean() - - assert err < 0.05 - assert graderr < 0.00002 - assert bgraderr < 0.00002 diff --git a/tests/test_functional.py b/tests/test_functional.py index d2e3f0847..4670847ff 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -706,7 +706,6 @@ def test_dequant_mm(self, device, dim1, dim4, dims, has_bias): @pytest.mark.parametrize("dim1", [2048, 4096], ids=id_formatter("dim1")) @pytest.mark.parametrize("dim2", [512, 1024], ids=id_formatter("dim2")) - @pytest.mark.deprecated def test_int8_double_quant(self, dim1, dim2): for i in range(k): A = torch.randn(dim1, dim2, device="cuda").half() diff --git a/tests/test_optim.py b/tests/test_optim.py index 190d9a206..6216639b0 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -49,16 +49,16 @@ def rm_path(path): ) str2optimizers["adam"] = (torch.optim.Adam, bnb.optim.Adam) -str2optimizers["adam8bit_blockwise"] = (torch.optim.Adam, lambda pxx: bnb.optim.Adam8bit(pxx, block_wise=True)) +str2optimizers["adam8bit_blockwise"] = (torch.optim.Adam, lambda pxx: bnb.optim.Adam8bit(pxx)) str2optimizers["paged_adam"] = (torch.optim.Adam, 
bnb.optim.PagedAdam) str2optimizers["paged_adamw"] = (torch.optim.AdamW, bnb.optim.PagedAdamW) str2optimizers["paged_adam8bit_blockwise"] = ( torch.optim.Adam, - lambda pxx: bnb.optim.PagedAdam8bit(pxx, block_wise=True), + lambda pxx: bnb.optim.PagedAdam8bit(pxx), ) str2optimizers["paged_adamw8bit_blockwise"] = ( torch.optim.AdamW, - lambda pxx: bnb.optim.PagedAdamW8bit(pxx, block_wise=True), + lambda pxx: bnb.optim.PagedAdamW8bit(pxx), ) str2optimizers["ademamix"] = (bnb.optim.ademamix._ReferenceAdEMAMix, bnb.optim.AdEMAMix) @@ -90,25 +90,25 @@ def rm_path(path): str2optimizers["lion"] = (Lion, bnb.optim.Lion) str2optimizers["paged_lion"] = (Lion, bnb.optim.PagedLion) -str2optimizers["lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.Lion8bit(pxx, block_wise=True)) -str2optimizers["paged_lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.PagedLion8bit(pxx, block_wise=True)) +str2optimizers["lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.Lion8bit(pxx)) +str2optimizers["paged_lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.PagedLion8bit(pxx)) str2optimizers["momentum"] = ( lambda pxx: torch.optim.SGD(pxx, 0.01, 0.9), - lambda pxx: bnb.optim.SGD(pxx, 0.01, 0.9, block_wise=False), + lambda pxx: bnb.optim.SGD(pxx, 0.01, 0.9), ) str2optimizers["momentum8bit_blockwise"] = ( lambda pxx: torch.optim.SGD(pxx, 0.01, 0.9), - lambda pxx: bnb.optim.SGD8bit(pxx, 0.01, 0.9, block_wise=True), + lambda pxx: bnb.optim.SGD8bit(pxx, 0.01, 0.9), ) str2optimizers["rmsprop"] = ( lambda pxx: torch.optim.RMSprop(pxx, 0.01, 0.9), - lambda pxx: bnb.optim.RMSprop(pxx, 0.01, 0.9, block_wise=False), + lambda pxx: bnb.optim.RMSprop(pxx, 0.01, 0.9), ) str2optimizers["rmsprop8bit_blockwise"] = ( lambda pxx: torch.optim.RMSprop(pxx, 0.01, 0.9), - lambda pxx: bnb.optim.RMSprop8bit(pxx, 0.01, 0.9, block_wise=True), + lambda pxx: bnb.optim.RMSprop8bit(pxx, 0.01, 0.9), ) str2statenames = {} @@ -462,94 +462,6 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name, device): torch_optimizer.state[p1][name1].copy_(s.data) -@pytest.mark.parametrize("optim_bits", [32, 8], ids=id_formatter("optim_bits")) -@pytest.mark.parametrize("gtype", [torch.float32], ids=describe_dtype) -@pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2")) -@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1")) -@pytest.mark.deprecated -def test_adam_percentile_clipping(requires_cuda, dim1, dim2, gtype, optim_bits): - if dim1 == 1 and dim2 == 1: - return - p1 = torch.randn(dim1, dim2, device="cpu", dtype=gtype) * 0.1 - beta1 = 0.9 - beta2 = 0.999 - lr = 0.001 - eps = 1e-8 - p1 = p1.cuda() - p2 = p1.clone() - adam1 = bnb.optim.Adam([p1], lr, (beta1, beta2), eps, optim_bits=optim_bits) - adam2 = bnb.optim.Adam( - [p2], - lr, - (beta1, beta2), - eps, - optim_bits=optim_bits, - percentile_clipping=5, - ) - - gnorm_vec = torch.zeros(100).cuda() - step = 0 - - for i in range(50): - step += 1 - g1 = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.1 + (0.01 * i) - g2 = g1.clone() - p2.grad = g2 - - _current_gnorm, _clip_val, gnorm_scale = F.percentile_clipping(g1, gnorm_vec, step, 5) - g1 = (g1.float() * gnorm_scale).to(gtype) - p1.grad = g1 - - adam1.step() - adam2.step() - - # gnorm_scale is not deterministic (warp reductions), as such there can be slight differences in state - if optim_bits == 32: - torch.testing.assert_close(p1, p2) - torch.testing.assert_close( - adam1.state[p1]["state1"], - adam2.state[p2]["state1"], - atol=5e-5, - rtol=1e-4, - ) - torch.testing.assert_close( - adam1.state[p1]["state2"], - 
adam2.state[p2]["state2"], - atol=5e-5, - rtol=1e-4, - ) - elif optim_bits == 8: - torch.testing.assert_close(p1, p2, atol=1e-4, rtol=1e-3) - torch.testing.assert_close( - adam1.state[p1]["state1"], - adam2.state[p2]["state1"], - atol=2, - rtol=1e-3, - ) - torch.testing.assert_close( - adam1.state[p1]["state2"], - adam2.state[p2]["state2"], - atol=2, - rtol=1e-3, - ) - adam1.state[p1]["state1"].copy_(adam2.state[p2]["state1"]) - adam1.state[p1]["state2"].copy_(adam2.state[p2]["state2"]) - if i % 10 == 0 and i > 0: - path = get_temp_dir() - torch.save(adam2.state_dict(), join(path, "opt.pt")) - del adam2 - adam2 = None - adam2 = bnb.optim.Adam( - [p2], - lr, - (beta1, beta2), - eps, - optim_bits=optim_bits, - percentile_clipping=5, - ) - adam2.load_state_dict(torch.load(join(path, "opt.pt"))) - - optimizer_names_benchmark = [ "adam8bit_blockwise", "paged_adam8bit_blockwise", diff --git a/tests/test_triton.py b/tests/test_triton.py deleted file mode 100644 index b245e534a..000000000 --- a/tests/test_triton.py +++ /dev/null @@ -1,64 +0,0 @@ -import pytest -import torch - -from bitsandbytes.nn import Linear8bitLt -from bitsandbytes.nn.triton_based_modules import SwitchBackLinear -from bitsandbytes.triton.triton_utils import is_triton_available -from tests.helpers import TRUE_FALSE - - -@pytest.mark.skipif( - not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8, - reason="This test requires triton and a GPU with compute capability 8.0 or higher.", -) -@pytest.mark.deprecated -@pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE) -def test_switchback(vector_wise_quantization): - for dim in [83]: - for batch in [13]: - standard = torch.nn.Linear(dim, 4 * dim).cuda().half() - switchback = ( - SwitchBackLinear(dim, 4 * dim, vector_wise_quantization=vector_wise_quantization).cuda().half() - ) - baseline = Linear8bitLt(dim, 4 * dim).cuda().half() - switchback.weight.data.copy_(standard.weight) - switchback.bias.data.copy_(standard.bias) - baseline.weight.data.copy_(standard.weight) - baseline.bias.data.copy_(standard.bias) - - x1 = torch.randn(batch, dim).cuda().half().requires_grad_(True) - x2 = x1.clone().detach().requires_grad_(True) - x3 = x1.clone().detach().requires_grad_(True) - - out_standard = standard(x1) - (2**10 * out_standard.abs().mean()).backward() - - print(x2.dtype) - out_sb = switchback(x2) - (2**10 * out_sb.abs().mean()).backward() - - out_baseline = baseline(x3) - (2**10 * out_baseline.abs().mean()).backward() - - err_sb = (out_standard - out_sb).abs().mean() - err_baseline = (out_standard - out_baseline).abs().mean() - print("OUT", err_sb, err_baseline) - assert err_sb < 2 * err_baseline - - err_sb = (standard.bias.grad - switchback.bias.grad).abs().mean() - err_baseline = (standard.bias.grad - baseline.bias.grad).abs().mean() - - print("GW2", err_sb, err_baseline) - assert err_sb < 2 * err_baseline - - err_sb = (standard.weight.grad - switchback.weight.grad).abs().mean() - err_baseline = (standard.weight.grad - baseline.weight.grad).abs().mean() - - print("GW1", err_sb, err_baseline) - assert err_sb < 2 * err_baseline - - err_sb = (x1.grad - x2.grad).abs().mean() - err_baseline = (x1.grad - x3.grad).abs().mean() - - print("GX1", err_sb, err_baseline) - assert err_sb < 2 * err_baseline From bf50ef0dabe819b8c4fc16afac23b62d6bccff4a Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 20 Feb 2026 13:09:29 -0500 Subject: [PATCH 2/2] Apply suggestions from code 
review Keep `deprecated` marker for tests --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 21e11261b..f448a079e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,7 @@ exclude_also = [ ] [tool.pytest.ini_options] -addopts = "-rP -m 'not slow and not benchmark'" +addopts = "-rP -m 'not slow and not benchmark and not deprecated'" # ; --cov=bitsandbytes # ; # contexts: record which test ran which line; can be seen in html coverage report # ; --cov-context=test @@ -98,6 +98,7 @@ log_cli_level = "INFO" log_file = "logs/pytest.log" markers = [ "benchmark: mark test as a benchmark", + "deprecated: mark test as covering a deprecated feature", "slow: mark test as slow", ]