From 62d696348a6fba7a1ffb380307032570c56b5ee7 Mon Sep 17 00:00:00 2001
From: Tim Dettmers
Date: Mon, 16 Feb 2026 08:21:22 -0500
Subject: [PATCH 1/2] Remove deprecated APIs: research module, non-blockwise optimizers, and legacy quantization functions

Remove all remaining deprecated code that has been emitting FutureWarning since v0.45.0 (December 2024). Two prior cleanup rounds (v0.47.0, v0.49.0) already removed the easier items; this finishes the job.

- Delete quantize(), dequantize(), quantize_no_absmax(), dequantize_no_absmax(), optimizer_update_8bit(), percentile_clipping(), and the str2optimizer8bit dispatch table from functional.py
- Remove the non-blockwise 8-bit optimizer path from Optimizer2State and Optimizer1State; LAMB/LARS now use blockwise quantization
- Remove percentile_clipping and block_wise parameters from all ~33 optimizer class constructors
- Delete bitsandbytes/research/ (FP8 matmul, SwitchBack)
- Delete bitsandbytes/nn/triton_based_modules.py, SwitchBackLinearBnb, and the orphaned bitsandbytes/triton/ kernel directory
- Remove dead MatmulLtState fields (CxB, CxBt, formatB, _tile_indices)
- Delete test_deprecated.py, test_triton.py; clean test_autograd.py, test_optim.py, test_functional.py
- Remove benchmarking/switchback/ and update docs

Co-Authored-By: Claude Opus 4.6
---
 benchmarking/switchback/README.md            |   4 -
 benchmarking/switchback/info_a100_py2.jsonl  |  60 ---
 .../switchback/make_plot_with_jsonl.py       | 151 -------
 benchmarking/switchback/plot_with_info.pdf   | Bin 34876 -> 0 bytes
 benchmarking/switchback/speed_benchmark.py   | 160 -------
 bitsandbytes/__init__.py                     |   2 +-
 bitsandbytes/autograd/_functions.py          |  26 +-
 bitsandbytes/backends/utils.py               |   3 +-
 bitsandbytes/functional.py                   | 310 --------------
 bitsandbytes/nn/__init__.py                  |   7 -
 bitsandbytes/nn/modules.py                   |  39 --
 bitsandbytes/nn/triton_based_modules.py      | 264 ------------
 bitsandbytes/optim/adagrad.py                |  25 --
 bitsandbytes/optim/adam.py                   |  48 ---
 bitsandbytes/optim/adamw.py                  |  48 ---
 bitsandbytes/optim/ademamix.py               |   6 -
 bitsandbytes/optim/lamb.py                   |  24 --
 bitsandbytes/optim/lars.py                   |  15 -
 bitsandbytes/optim/lion.py                   |  52 ---
 bitsandbytes/optim/optimizer.py              | 138 +-----
 bitsandbytes/optim/rmsprop.py                |  26 --
 bitsandbytes/optim/sgd.py                    |  24 --
 bitsandbytes/research/__init__.py            |   6 -
 bitsandbytes/research/autograd/__init__.py   |   0
 bitsandbytes/research/autograd/_functions.py | 396 ------------------
 bitsandbytes/research/nn/__init__.py         |   1 -
 bitsandbytes/research/nn/modules.py          |  76 ----
 bitsandbytes/triton/__init__.py              |   0
 bitsandbytes/triton/dequantize_rowwise.py    |  64 ---
 .../triton/int8_matmul_mixed_dequantize.py   | 206 ---------
 .../triton/int8_matmul_rowwise_dequantize.py | 207 ---------
 bitsandbytes/triton/matmul_perf_model.py     | 211 ----------
 .../quantize_columnwise_and_transpose.py     |  75 ----
 bitsandbytes/triton/quantize_global.py       | 124 ------
 bitsandbytes/triton/quantize_rowwise.py      |  67 ---
 bitsandbytes/triton/triton_utils.py          |  11 -
 docs/source/optimizers.mdx                   |   4 +-
 pyproject.toml                               |   3 +-
 tests/test_autograd.py                       |   8 +-
 tests/test_deprecated.py                     | 175 --------
 tests/test_functional.py                     |   1 -
 tests/test_optim.py                          | 106 +----
 tests/test_triton.py                         |  64 ---
 43 files changed, 46 insertions(+), 3191 deletions(-)
 delete mode 100644 benchmarking/switchback/README.md
 delete mode 100644 benchmarking/switchback/info_a100_py2.jsonl
 delete mode 100644 benchmarking/switchback/make_plot_with_jsonl.py
 delete mode 100644 benchmarking/switchback/plot_with_info.pdf
 delete mode 100644 benchmarking/switchback/speed_benchmark.py
 delete mode 100644
bitsandbytes/nn/triton_based_modules.py delete mode 100644 bitsandbytes/research/__init__.py delete mode 100644 bitsandbytes/research/autograd/__init__.py delete mode 100644 bitsandbytes/research/autograd/_functions.py delete mode 100644 bitsandbytes/research/nn/__init__.py delete mode 100644 bitsandbytes/research/nn/modules.py delete mode 100644 bitsandbytes/triton/__init__.py delete mode 100644 bitsandbytes/triton/dequantize_rowwise.py delete mode 100644 bitsandbytes/triton/int8_matmul_mixed_dequantize.py delete mode 100644 bitsandbytes/triton/int8_matmul_rowwise_dequantize.py delete mode 100644 bitsandbytes/triton/matmul_perf_model.py delete mode 100644 bitsandbytes/triton/quantize_columnwise_and_transpose.py delete mode 100644 bitsandbytes/triton/quantize_global.py delete mode 100644 bitsandbytes/triton/quantize_rowwise.py delete mode 100644 bitsandbytes/triton/triton_utils.py delete mode 100644 tests/test_deprecated.py delete mode 100644 tests/test_triton.py diff --git a/benchmarking/switchback/README.md b/benchmarking/switchback/README.md deleted file mode 100644 index b73569030..000000000 --- a/benchmarking/switchback/README.md +++ /dev/null @@ -1,4 +0,0 @@ -Steps: - -1. Run `python speed_benchmark/speed_benchmark.py` which times operations and writes their time to `speed_benchmark/info_a100_py2.jsonl` (change the name of the jsonl to a different name for your profiling). -2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed. diff --git a/benchmarking/switchback/info_a100_py2.jsonl b/benchmarking/switchback/info_a100_py2.jsonl deleted file mode 100644 index 53cda62cf..000000000 --- a/benchmarking/switchback/info_a100_py2.jsonl +++ /dev/null @@ -1,60 +0,0 @@ -{"repeat": 64, "batch_size": 8192, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.28139352798461914, "standard_gw": 0.2811811864376068, "standard_gx": 0.30258670449256897, "rowwise_fwd": 0.1994594931602478, "rowwise_bwd": 0.16159191727638245, "global_fwd": 0.19502267241477966, "global_bwd": 0.16080215573310852, "x_quantize_rowwise": 0.03306940197944641, "g_quantize_rowwise": 0.08210167288780212, "w_quantize_rowwise": 0.03385916352272034, "w_quantize_colwise_transpose": 0.08635595440864563, "w_quantize_global": 0.09237229824066162, "w_quantize_global_transpose": 0.10007619857788086, "time_standard": 0.8651614189147949, "time_rowwise": 0.8776187896728516, "time_global": 0.944625586271286} -{"repeat": 64, "batch_size": 8192, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.262625515460968, "standard_gw": 0.2806223928928375, "standard_gx": 0.31118839979171753, "rowwise_fwd": 0.1828707754611969, "rowwise_bwd": 0.21236762404441833, "global_fwd": 0.16665831208229065, "global_bwd": 0.19929558038711548, "x_quantize_rowwise": 0.08227676153182983, "g_quantize_rowwise": 0.03310292959213257, "w_quantize_rowwise": 0.032648444175720215, "w_quantize_colwise_transpose": 0.09015202522277832, "w_quantize_global": 0.0988692045211792, "w_quantize_global_transpose": 0.10057538747787476, "time_standard": 0.8544363081455231, "time_rowwise": 0.9140409529209137, "time_global": 0.96140056848526} -{"repeat": 64, "batch_size": 16384, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 0.5731917917728424, "standard_gw": 0.5709454417228699, "standard_gx": 0.5963630974292755, "rowwise_fwd": 0.37662312388420105, "rowwise_bwd": 0.281747430562973, "global_fwd": 
0.36768242716789246, "global_bwd": 0.28043612837791443, "x_quantize_rowwise": 0.046547502279281616, "g_quantize_rowwise": 0.15532970428466797, "w_quantize_rowwise": 0.032436102628707886, "w_quantize_colwise_transpose": 0.08635222911834717, "w_quantize_global": 0.0947415828704834, "w_quantize_global_transpose": 0.10129809379577637, "time_standard": 1.7405003309249878, "time_rowwise": 1.5499815344810486, "time_global": 1.616980880498886} -{"repeat": 64, "batch_size": 16384, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 0.5341619253158569, "standard_gw": 0.5690865218639374, "standard_gx": 0.599835067987442, "rowwise_fwd": 0.3233291208744049, "rowwise_bwd": 0.41359663009643555, "global_fwd": 0.2831108868122101, "global_bwd": 0.37280842661857605, "x_quantize_rowwise": 0.15563145279884338, "g_quantize_rowwise": 0.046741217374801636, "w_quantize_rowwise": 0.03306940197944641, "w_quantize_colwise_transpose": 0.09020790457725525, "w_quantize_global": 0.0925213098526001, "w_quantize_global_transpose": 0.09945780038833618, "time_standard": 1.7030835151672363, "time_rowwise": 1.6316622495651245, "time_global": 1.6193576157093048} -{"repeat": 64, "batch_size": 32768, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 1.2199915945529938, "standard_gw": 1.1069811880588531, "standard_gx": 1.09761580824852, "rowwise_fwd": 0.738043338060379, "rowwise_bwd": 0.5549229681491852, "global_fwd": 0.7219798862934113, "global_bwd": 0.5512163043022156, "x_quantize_rowwise": 0.08748471736907959, "g_quantize_rowwise": 0.3023110330104828, "w_quantize_rowwise": 0.03182142972946167, "w_quantize_colwise_transpose": 0.08632615208625793, "w_quantize_global": 0.09445473551750183, "w_quantize_global_transpose": 0.10032951831817627, "time_standard": 3.424588590860367, "time_rowwise": 2.9078908264636993, "time_global": 2.9647573828697205} -{"repeat": 64, "batch_size": 32768, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 1.1040829122066498, "standard_gw": 1.1221766471862793, "standard_gx": 1.1548101902008057, "rowwise_fwd": 0.581938773393631, "rowwise_bwd": 0.7480122148990631, "global_fwd": 0.5537159740924835, "global_bwd": 0.7232688367366791, "x_quantize_rowwise": 0.30193477869033813, "g_quantize_rowwise": 0.08745118975639343, "w_quantize_rowwise": 0.03374740481376648, "w_quantize_colwise_transpose": 0.09068101644515991, "w_quantize_global": 0.09645149111747742, "w_quantize_global_transpose": 0.10189786553382874, "time_standard": 3.3810697495937347, "time_rowwise": 2.9659420251846313, "time_global": 2.9868967831134796} -{"repeat": 64, "batch_size": 65536, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 2.4533793330192566, "standard_gw": 2.1938569843769073, "standard_gx": 2.179361879825592, "rowwise_fwd": 1.4615543186664581, "rowwise_bwd": 1.0522231459617615, "global_fwd": 1.4288239181041718, "global_bwd": 1.0450035333633423, "x_quantize_rowwise": 0.1691766083240509, "g_quantize_rowwise": 0.5951300263404846, "w_quantize_rowwise": 0.03337860107421875, "w_quantize_colwise_transpose": 0.08653849363327026, "w_quantize_global": 0.0940859317779541, "w_quantize_global_transpose": 0.09976327419281006, "time_standard": 6.826598197221756, "time_rowwise": 5.5918581783771515, "time_global": 5.625840276479721} -{"repeat": 64, "batch_size": 65536, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 2.1698065102100372, "standard_gw": 2.1875128149986267, "standard_gx": 2.2887587547302246, "rowwise_fwd": 
1.0762326419353485, "rowwise_bwd": 1.4638006687164307, "global_fwd": 1.0450668632984161, "global_bwd": 1.4308765530586243, "x_quantize_rowwise": 0.5953535437583923, "g_quantize_rowwise": 0.16899779438972473, "w_quantize_rowwise": 0.03240257501602173, "w_quantize_colwise_transpose": 0.09106099605560303, "w_quantize_global": 0.09546056389808655, "w_quantize_global_transpose": 0.09852275252342224, "time_standard": 6.6460780799388885, "time_rowwise": 5.615361034870148, "time_global": 5.621790885925293} -{"repeat": 64, "batch_size": 131072, "dim_out": 4096, "dim_in": 1024, "wm": 4, "switch": false, "standard_fwd": 4.858218133449554, "standard_gw": 4.3631307780742645, "standard_gx": 4.404045641422272, "rowwise_fwd": 2.9063820838928223, "rowwise_bwd": 2.094462513923645, "global_fwd": 2.8426870703697205, "global_bwd": 2.0792782306671143, "x_quantize_rowwise": 0.33241137862205505, "g_quantize_rowwise": 1.1817105114459991, "w_quantize_rowwise": 0.03374367952346802, "w_quantize_colwise_transpose": 0.08633732795715332, "w_quantize_global": 0.09231641888618469, "w_quantize_global_transpose": 0.100012868642807, "time_standard": 13.62539455294609, "time_rowwise": 10.998178273439407, "time_global": 10.991547256708145} -{"repeat": 64, "batch_size": 131072, "dim_out": 1024, "dim_in": 4096, "wm": 4, "switch": true, "standard_fwd": 4.246581345796585, "standard_gw": 4.42587211728096, "standard_gx": 4.581417888402939, "rowwise_fwd": 2.1114833652973175, "rowwise_bwd": 2.9050447046756744, "global_fwd": 2.0806826651096344, "global_bwd": 2.85966694355011, "x_quantize_rowwise": 1.1816024780273438, "g_quantize_rowwise": 0.33330172300338745, "w_quantize_rowwise": 0.033445656299591064, "w_quantize_colwise_transpose": 0.09065866470336914, "w_quantize_global": 0.09239837527275085, "w_quantize_global_transpose": 0.09984523057937622, "time_standard": 13.253871351480484, "time_rowwise": 11.081408709287643, "time_global": 11.073369532823563} -{"repeat": 64, "batch_size": 8192, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 0.4859529435634613, "standard_gw": 0.46338513493537903, "standard_gx": 0.42321905493736267, "rowwise_fwd": 0.2761557698249817, "rowwise_bwd": 0.20775198936462402, "global_fwd": 0.2713911235332489, "global_bwd": 0.20639970898628235, "x_quantize_rowwise": 0.033095479011535645, "g_quantize_rowwise": 0.11894106864929199, "w_quantize_rowwise": 0.03125518560409546, "w_quantize_colwise_transpose": 0.1424551010131836, "w_quantize_global": 0.07288157939910889, "w_quantize_global_transpose": 0.08071959018707275, "time_standard": 1.372557133436203, "time_rowwise": 1.2730397284030914, "time_global": 1.2468136847019196} -{"repeat": 64, "batch_size": 8192, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.3920421004295349, "standard_gw": 0.44424086809158325, "standard_gx": 0.4759356379508972, "rowwise_fwd": 0.23231282830238342, "rowwise_bwd": 0.28430670499801636, "global_fwd": 0.20883232355117798, "global_bwd": 0.2741999924182892, "x_quantize_rowwise": 0.12018159031867981, "g_quantize_rowwise": 0.03195926547050476, "w_quantize_rowwise": 0.026017427444458008, "w_quantize_colwise_transpose": 0.14733895659446716, "w_quantize_global": 0.07734447717666626, "w_quantize_global_transpose": 0.0788569450378418, "time_standard": 1.3122186064720154, "time_rowwise": 1.2863576412200928, "time_global": 1.235615462064743} -{"repeat": 64, "batch_size": 16384, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 1.0111741721630096, "standard_gw": 
0.9267590939998627, "standard_gx": 0.8254274725914001, "rowwise_fwd": 0.5434826016426086, "rowwise_bwd": 0.4077926278114319, "global_fwd": 0.5318708717823029, "global_bwd": 0.40537863969802856, "x_quantize_rowwise": 0.059738755226135254, "g_quantize_rowwise": 0.2299174666404724, "w_quantize_rowwise": 0.02545863389968872, "w_quantize_colwise_transpose": 0.14269724488258362, "w_quantize_global": 0.07300823926925659, "w_quantize_global_transpose": 0.07878988981246948, "time_standard": 2.7633607387542725, "time_rowwise": 2.335846424102783, "time_global": 2.305462956428528} -{"repeat": 64, "batch_size": 16384, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 0.8095316588878632, "standard_gw": 0.8607134222984314, "standard_gx": 0.9204968810081482, "rowwise_fwd": 0.4275888204574585, "rowwise_bwd": 0.5485899746417999, "global_fwd": 0.41000545024871826, "global_bwd": 0.5317628383636475, "x_quantize_rowwise": 0.2301819622516632, "g_quantize_rowwise": 0.059254467487335205, "w_quantize_rowwise": 0.02466142177581787, "w_quantize_colwise_transpose": 0.14865398406982422, "w_quantize_global": 0.07582828402519226, "w_quantize_global_transpose": 0.08231401443481445, "time_standard": 2.5907419621944427, "time_rowwise": 2.2996440529823303, "time_global": 2.2500604391098022} -{"repeat": 64, "batch_size": 32768, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 2.0658522844314575, "standard_gw": 1.718364655971527, "standard_gx": 1.6660578548908234, "rowwise_fwd": 1.066897064447403, "rowwise_bwd": 0.8070804178714752, "global_fwd": 1.0473169386386871, "global_bwd": 0.8021742105484009, "x_quantize_rowwise": 0.11274218559265137, "g_quantize_rowwise": 0.4518181085586548, "w_quantize_rowwise": 0.026501715183258057, "w_quantize_colwise_transpose": 0.14259666204452515, "w_quantize_global": 0.07484853267669678, "w_quantize_global_transpose": 0.07976219058036804, "time_standard": 5.450274795293808, "time_rowwise": 4.326000809669495, "time_global": 4.287026822566986} -{"repeat": 64, "batch_size": 32768, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 2.7549192309379578, "standard_gw": 1.6954988241195679, "standard_gx": 1.8179528415203094, "rowwise_fwd": 0.8649080991744995, "rowwise_bwd": 1.0746456682682037, "global_fwd": 0.8023083209991455, "global_bwd": 1.0471977293491364, "x_quantize_rowwise": 0.45225024223327637, "g_quantize_rowwise": 0.11286512017250061, "w_quantize_rowwise": 0.0252649188041687, "w_quantize_colwise_transpose": 0.14732033014297485, "w_quantize_global": 0.07537379860877991, "w_quantize_global_transpose": 0.0807642936706543, "time_standard": 6.268370896577835, "time_rowwise": 4.372753202915192, "time_global": 4.266258329153061} -{"repeat": 64, "batch_size": 65536, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 4.098430275917053, "standard_gw": 3.3501461148262024, "standard_gx": 5.560480058193207, "rowwise_fwd": 2.112947404384613, "rowwise_bwd": 1.605246216058731, "global_fwd": 2.0697638392448425, "global_bwd": 1.5953518450260162, "x_quantize_rowwise": 0.21921470761299133, "g_quantize_rowwise": 0.8956789970397949, "w_quantize_rowwise": 0.02710893750190735, "w_quantize_colwise_transpose": 0.14268234372138977, "w_quantize_global": 0.07259473204612732, "w_quantize_global_transpose": 0.07899105548858643, "time_standard": 13.009056448936462, "time_rowwise": 8.35302472114563, "time_global": 8.281741291284561} -{"repeat": 64, "batch_size": 65536, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, 
"standard_fwd": 5.586959421634674, "standard_gw": 3.358360379934311, "standard_gx": 3.6434978246688843, "rowwise_fwd": 1.6269534826278687, "rowwise_bwd": 2.128206193447113, "global_fwd": 1.5950687229633331, "global_bwd": 2.0831897854804993, "x_quantize_rowwise": 0.8954145014286041, "g_quantize_rowwise": 0.21914392709732056, "w_quantize_rowwise": 0.026203691959381104, "w_quantize_colwise_transpose": 0.14658644795417786, "w_quantize_global": 0.07478520274162292, "w_quantize_global_transpose": 0.07964670658111572, "time_standard": 12.58881762623787, "time_rowwise": 8.400868624448776, "time_global": 8.305609226226807} -{"repeat": 64, "batch_size": 131072, "dim_out": 5120, "dim_in": 1280, "wm": 4, "switch": false, "standard_fwd": 8.229725062847137, "standard_gw": 6.791356950998306, "standard_gx": 6.806455552577972, "rowwise_fwd": 4.252471029758453, "rowwise_bwd": 3.2062679529190063, "global_fwd": 4.175614565610886, "global_bwd": 3.1837262213230133, "x_quantize_rowwise": 0.4321373999118805, "g_quantize_rowwise": 1.787092536687851, "w_quantize_rowwise": 0.0270158052444458, "w_quantize_colwise_transpose": 0.1424252986907959, "w_quantize_global": 0.07348507642745972, "w_quantize_global_transpose": 0.07829815149307251, "time_standard": 21.827537566423416, "time_rowwise": 16.63876697421074, "time_global": 16.52171090245247} -{"repeat": 64, "batch_size": 131072, "dim_out": 1280, "dim_in": 5120, "wm": 4, "switch": true, "standard_fwd": 11.279478669166565, "standard_gw": 6.7345499992370605, "standard_gx": 7.206875830888748, "rowwise_fwd": 3.209315240383148, "rowwise_bwd": 4.256397485733032, "global_fwd": 3.180190920829773, "global_bwd": 4.177983850240707, "x_quantize_rowwise": 1.7836056649684906, "g_quantize_rowwise": 0.4321075975894928, "w_quantize_rowwise": 0.03205239772796631, "w_quantize_colwise_transpose": 0.14675036072731018, "w_quantize_global": 0.09316205978393555, "w_quantize_global_transpose": 0.10086596012115479, "time_standard": 25.220904499292374, "time_rowwise": 16.5947787463665, "time_global": 16.502466052770615} -{"repeat": 64, "batch_size": 8192, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 0.5776733160018921, "standard_gw": 0.5300231277942657, "standard_gx": 0.6005913019180298, "rowwise_fwd": 0.33330172300338745, "rowwise_bwd": 0.2957060933113098, "global_fwd": 0.32876431941986084, "global_bwd": 0.29108673334121704, "x_quantize_rowwise": 0.03466755151748657, "g_quantize_rowwise": 0.12264400720596313, "w_quantize_rowwise": 0.033874064683914185, "w_quantize_colwise_transpose": 0.1775398850440979, "w_quantize_global": 0.09503215551376343, "w_quantize_global_transpose": 0.10617449879646301, "time_standard": 1.7082877457141876, "time_rowwise": 1.5277564525604248, "time_global": 1.5083923935890198} -{"repeat": 64, "batch_size": 8192, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 0.5164109170436859, "standard_gw": 0.5367249250411987, "standard_gx": 0.5876161158084869, "rowwise_fwd": 0.3132447600364685, "rowwise_bwd": 0.3396235406398773, "global_fwd": 0.2943649888038635, "global_bwd": 0.33209100365638733, "x_quantize_rowwise": 0.12357160449028015, "g_quantize_rowwise": 0.035997480154037476, "w_quantize_rowwise": 0.03213062882423401, "w_quantize_colwise_transpose": 0.17676874995231628, "w_quantize_global": 0.09861215949058533, "w_quantize_global_transpose": 0.0998862087726593, "time_standard": 1.6407519578933716, "time_rowwise": 1.5580616891384125, "time_global": 1.5212483704090118} -{"repeat": 64, "batch_size": 16384, "dim_out": 5632, 
"dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 1.2096501886844635, "standard_gw": 1.0663382709026337, "standard_gx": 1.0961703956127167, "rowwise_fwd": 0.6396733224391937, "rowwise_bwd": 0.5173943936824799, "global_fwd": 0.6296299397945404, "global_bwd": 0.5130060017108917, "x_quantize_rowwise": 0.06211921572685242, "g_quantize_rowwise": 0.2361498773097992, "w_quantize_rowwise": 0.03260001540184021, "w_quantize_colwise_transpose": 0.17679482698440552, "w_quantize_global": 0.09361281991004944, "w_quantize_global_transpose": 0.09913742542266846, "time_standard": 3.372158855199814, "time_rowwise": 2.7310699224472046, "time_global": 2.6999935507774353} -{"repeat": 64, "batch_size": 16384, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 1.1065565049648285, "standard_gw": 1.0664314031600952, "standard_gx": 1.1266544461250305, "rowwise_fwd": 0.5352050065994263, "rowwise_bwd": 0.6464086472988129, "global_fwd": 0.513765960931778, "global_bwd": 0.6284862756729126, "x_quantize_rowwise": 0.23620948195457458, "g_quantize_rowwise": 0.062271952629089355, "w_quantize_rowwise": 0.031460076570510864, "w_quantize_colwise_transpose": 0.17675384879112244, "w_quantize_global": 0.09486451745033264, "w_quantize_global_transpose": 0.09898096323013306, "time_standard": 3.2996423542499542, "time_rowwise": 2.7547404170036316, "time_global": 2.7010105550289154} -{"repeat": 64, "batch_size": 32768, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 2.4367496371269226, "standard_gw": 2.0806193351745605, "standard_gx": 2.19624862074852, "rowwise_fwd": 1.2554042041301727, "rowwise_bwd": 1.0227933526039124, "global_fwd": 1.2322552502155304, "global_bwd": 1.0152235627174377, "x_quantize_rowwise": 0.11792033910751343, "g_quantize_rowwise": 0.4639364778995514, "w_quantize_rowwise": 0.03241002559661865, "w_quantize_colwise_transpose": 0.17657503485679626, "w_quantize_global": 0.09655207395553589, "w_quantize_global_transpose": 0.09958073496818542, "time_standard": 6.713617593050003, "time_rowwise": 5.149658769369125, "time_global": 5.106087774038315} -{"repeat": 64, "batch_size": 32768, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 2.1935217082500458, "standard_gw": 2.0055584609508514, "standard_gx": 2.1882541477680206, "rowwise_fwd": 1.0396353900432587, "rowwise_bwd": 1.2542344629764557, "global_fwd": 1.0161921381950378, "global_bwd": 1.233428716659546, "x_quantize_rowwise": 0.4642195999622345, "g_quantize_rowwise": 0.11782720685005188, "w_quantize_rowwise": 0.033117830753326416, "w_quantize_colwise_transpose": 0.17696991562843323, "w_quantize_global": 0.09416043758392334, "w_quantize_global_transpose": 0.10101497173309326, "time_standard": 6.387334316968918, "time_rowwise": 5.091562867164612, "time_global": 5.032401531934738} -{"repeat": 64, "batch_size": 65536, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 4.804681986570358, "standard_gw": 4.763372242450714, "standard_gx": 4.064023494720459, "rowwise_fwd": 2.484843134880066, "rowwise_bwd": 1.9691288471221924, "global_fwd": 2.441786229610443, "global_bwd": 1.9574686884880066, "x_quantize_rowwise": 0.2294592559337616, "g_quantize_rowwise": 0.9196549654006958, "w_quantize_rowwise": 0.0313781201839447, "w_quantize_colwise_transpose": 0.1768544316291809, "w_quantize_global": 0.09644776582717896, "w_quantize_global_transpose": 0.09847059845924377, "time_standard": 13.632077723741531, "time_rowwise": 10.574690997600555, "time_global": 10.506659746170044} 
-{"repeat": 64, "batch_size": 65536, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 4.0907710790634155, "standard_gw": 3.9793066680431366, "standard_gx": 4.302978515625, "rowwise_fwd": 1.992940902709961, "rowwise_bwd": 2.4996213614940643, "global_fwd": 1.9551962614059448, "global_bwd": 2.457551658153534, "x_quantize_rowwise": 0.9200014173984528, "g_quantize_rowwise": 0.2293996512889862, "w_quantize_rowwise": 0.0313781201839447, "w_quantize_colwise_transpose": 0.17882883548736572, "w_quantize_global": 0.09540095925331116, "w_quantize_global_transpose": 0.09880587458610535, "time_standard": 12.373056262731552, "time_rowwise": 9.831476956605911, "time_global": 9.73566249012947} -{"repeat": 64, "batch_size": 131072, "dim_out": 5632, "dim_in": 1408, "wm": 4, "switch": false, "standard_fwd": 9.655728936195374, "standard_gw": 8.261296898126602, "standard_gx": 8.064884692430496, "rowwise_fwd": 5.007706582546234, "rowwise_bwd": 3.8615092635154724, "global_fwd": 4.920527338981628, "global_bwd": 3.8330331444740295, "x_quantize_rowwise": 0.45276060700416565, "g_quantize_rowwise": 1.8306002020835876, "w_quantize_rowwise": 0.031366944313049316, "w_quantize_colwise_transpose": 0.1766495406627655, "w_quantize_global": 0.09412690997123718, "w_quantize_global_transpose": 0.09780004620552063, "time_standard": 25.981910526752472, "time_rowwise": 19.621890038251877, "time_global": 19.49014514684677} -{"repeat": 64, "batch_size": 131072, "dim_out": 1408, "dim_in": 5632, "wm": 4, "switch": true, "standard_fwd": 8.033104240894318, "standard_gw": 8.2889124751091, "standard_gx": 8.622754365205765, "rowwise_fwd": 3.8747042417526245, "rowwise_bwd": 5.003921687602997, "global_fwd": 3.8315393030643463, "global_bwd": 4.9162134528160095, "x_quantize_rowwise": 1.8304847180843353, "g_quantize_rowwise": 0.4522763192653656, "w_quantize_rowwise": 0.03413110971450806, "w_quantize_colwise_transpose": 0.1771189272403717, "w_quantize_global": 0.09519979357719421, "w_quantize_global_transpose": 0.09930506348609924, "time_standard": 24.944771081209183, "time_rowwise": 19.661549478769302, "time_global": 19.51393112540245} -{"repeat": 64, "batch_size": 8192, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 0.7954612374305725, "standard_gw": 0.7456131279468536, "standard_gx": 0.8799619972705841, "rowwise_fwd": 0.43267011642456055, "rowwise_bwd": 0.34622475504875183, "global_fwd": 0.42615458369255066, "global_bwd": 0.344250351190567, "x_quantize_rowwise": 0.03748014569282532, "g_quantize_rowwise": 0.13304129242897034, "w_quantize_rowwise": 0.03294646739959717, "w_quantize_colwise_transpose": 0.2407953143119812, "w_quantize_global": 0.094633549451828, "w_quantize_global_transpose": 0.10305643081665039, "time_standard": 2.4210363626480103, "time_rowwise": 1.96877121925354, "time_global": 1.8842294812202454} -{"repeat": 64, "batch_size": 8192, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 0.7120333611965179, "standard_gw": 0.7622130215167999, "standard_gx": 0.8262209594249725, "rowwise_fwd": 0.3702230751514435, "rowwise_bwd": 0.4419572651386261, "global_fwd": 0.3479123115539551, "global_bwd": 0.4306286573410034, "x_quantize_rowwise": 0.13308599591255188, "g_quantize_rowwise": 0.037495046854019165, "w_quantize_rowwise": 0.03398209810256958, "w_quantize_colwise_transpose": 0.23782625794410706, "w_quantize_global": 0.09853765368461609, "w_quantize_global_transpose": 0.10247156023979187, "time_standard": 2.3004673421382904, "time_rowwise": 2.016782760620117, 
"time_global": 1.9123442471027374} -{"repeat": 64, "batch_size": 16384, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 1.6292817890644073, "standard_gw": 1.5109702944755554, "standard_gx": 1.482747495174408, "rowwise_fwd": 0.8386112749576569, "rowwise_bwd": 0.6844550371170044, "global_fwd": 0.8220970630645752, "global_bwd": 0.6802082061767578, "x_quantize_rowwise": 0.06883963942527771, "g_quantize_rowwise": 0.25641173124313354, "w_quantize_rowwise": 0.033054500818252563, "w_quantize_colwise_transpose": 0.24027004837989807, "w_quantize_global": 0.0967271625995636, "w_quantize_global_transpose": 0.102948397397995, "time_standard": 4.622999578714371, "time_rowwise": 3.6326125264167786, "time_global": 3.5382024943828583} -{"repeat": 64, "batch_size": 16384, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 1.4877021312713623, "standard_gw": 1.5015341341495514, "standard_gx": 1.529306173324585, "rowwise_fwd": 0.715944916009903, "rowwise_bwd": 0.8529908955097198, "global_fwd": 0.680088996887207, "global_bwd": 0.8224695920944214, "x_quantize_rowwise": 0.2568177878856659, "g_quantize_rowwise": 0.06864592432975769, "w_quantize_rowwise": 0.03343448042869568, "w_quantize_colwise_transpose": 0.23645907640457153, "w_quantize_global": 0.09399279952049255, "w_quantize_global_transpose": 0.10286271572113037, "time_standard": 4.518542438745499, "time_rowwise": 3.665827214717865, "time_global": 3.5264119505882263} -{"repeat": 64, "batch_size": 32768, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 3.261040896177292, "standard_gw": 2.8816498816013336, "standard_gx": 2.8357282280921936, "rowwise_fwd": 1.6594752669334412, "rowwise_bwd": 1.359265297651291, "global_fwd": 1.6287527978420258, "global_bwd": 1.3503879308700562, "x_quantize_rowwise": 0.13146549463272095, "g_quantize_rowwise": 0.5035959184169769, "w_quantize_rowwise": 0.03438442945480347, "w_quantize_colwise_transpose": 0.24086236953735352, "w_quantize_global": 0.0945068895816803, "w_quantize_global_transpose": 0.10332837700843811, "time_standard": 8.978419005870819, "time_rowwise": 6.8106986582279205, "time_global": 6.693687289953232} -{"repeat": 64, "batch_size": 32768, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 2.848360687494278, "standard_gw": 2.8955675661563873, "standard_gx": 3.0499882996082306, "rowwise_fwd": 1.3900883495807648, "rowwise_bwd": 1.6595833003520966, "global_fwd": 1.3514049351215363, "global_bwd": 1.629263162612915, "x_quantize_rowwise": 0.5036592483520508, "g_quantize_rowwise": 0.13118237257003784, "w_quantize_rowwise": 0.03438442945480347, "w_quantize_colwise_transpose": 0.23709610104560852, "w_quantize_global": 0.0951625406742096, "w_quantize_global_transpose": 0.10216236114501953, "time_standard": 8.793916553258896, "time_rowwise": 6.851561367511749, "time_global": 6.708402186632156} -{"repeat": 64, "batch_size": 65536, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 6.4978525042533875, "standard_gw": 6.462603807449341, "standard_gx": 5.5987648665905, "rowwise_fwd": 3.2996535301208496, "rowwise_bwd": 2.6320070028305054, "global_fwd": 3.2426007091999054, "global_bwd": 2.612769603729248, "x_quantize_rowwise": 0.2561397850513458, "g_quantize_rowwise": 0.9984448552131653, "w_quantize_rowwise": 0.033076852560043335, "w_quantize_colwise_transpose": 0.24232640862464905, "w_quantize_global": 0.09618699550628662, "w_quantize_global_transpose": 0.10257214307785034, "time_standard": 18.559221178293228, 
"time_rowwise": 13.9242522418499, "time_global": 13.771317899227142} -{"repeat": 64, "batch_size": 65536, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 5.5702440440654755, "standard_gw": 5.717620253562927, "standard_gx": 6.08203187584877, "rowwise_fwd": 2.649586647748947, "rowwise_bwd": 3.315173089504242, "global_fwd": 2.6132799685001373, "global_bwd": 3.257807344198227, "x_quantize_rowwise": 0.9980201721191406, "g_quantize_rowwise": 0.256560742855072, "w_quantize_rowwise": 0.03356859087944031, "w_quantize_colwise_transpose": 0.23729726672172546, "w_quantize_global": 0.09495764970779419, "w_quantize_global_transpose": 0.103779137134552, "time_standard": 17.369896173477173, "time_rowwise": 13.207826763391495, "time_global": 13.04202526807785} -{"repeat": 64, "batch_size": 131072, "dim_out": 6656, "dim_in": 1664, "wm": 4, "switch": false, "standard_fwd": 13.058379292488098, "standard_gw": 11.480242013931274, "standard_gx": 11.092845350503922, "rowwise_fwd": 6.637874990701675, "rowwise_bwd": 5.24790957570076, "global_fwd": 6.521012634038925, "global_bwd": 5.214303731918335, "x_quantize_rowwise": 0.5057565867900848, "g_quantize_rowwise": 1.989319920539856, "w_quantize_rowwise": 0.03439188003540039, "w_quantize_colwise_transpose": 0.24280324578285217, "w_quantize_global": 0.09520724415779114, "w_quantize_global_transpose": 0.10240450501441956, "time_standard": 35.631466656923294, "time_rowwise": 26.138298213481903, "time_global": 25.908246636390686} -{"repeat": 64, "batch_size": 131072, "dim_out": 1664, "dim_in": 6656, "wm": 4, "switch": true, "standard_fwd": 11.13397628068924, "standard_gw": 11.371888220310211, "standard_gx": 12.12756335735321, "rowwise_fwd": 5.2495077252388, "rowwise_bwd": 6.638709455728531, "global_fwd": 5.215313285589218, "global_bwd": 6.5222084522247314, "x_quantize_rowwise": 1.9870512187480927, "g_quantize_rowwise": 0.5058236420154572, "w_quantize_rowwise": 0.034634023904800415, "w_quantize_colwise_transpose": 0.23674964904785156, "w_quantize_global": 0.09457767009735107, "w_quantize_global_transpose": 0.10183081030845642, "time_standard": 34.63342785835266, "time_rowwise": 26.024363934993744, "time_global": 25.798693299293518} -{"repeat": 64, "batch_size": 8192, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 1.2125298380851746, "standard_gw": 1.1111274361610413, "standard_gx": 1.0840706527233124, "rowwise_fwd": 0.6057210266590118, "rowwise_bwd": 0.51865354180336, "global_fwd": 0.5952082574367523, "global_bwd": 0.5167685449123383, "x_quantize_rowwise": 0.045686960220336914, "g_quantize_rowwise": 0.15827640891075134, "w_quantize_rowwise": 0.04361197352409363, "w_quantize_colwise_transpose": 0.34067779779434204, "w_quantize_global": 0.13644620776176453, "w_quantize_global_transpose": 0.14925003051757812, "time_standard": 3.407727926969528, "time_rowwise": 2.823755145072937, "time_global": 2.7127638459205627} -{"repeat": 64, "batch_size": 8192, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 1.0731369256973267, "standard_gw": 1.1365897953510284, "standard_gx": 1.1498592793941498, "rowwise_fwd": 0.5573518574237823, "rowwise_bwd": 0.615488737821579, "global_fwd": 0.5220361053943634, "global_bwd": 0.5939789116382599, "x_quantize_rowwise": 0.15765801072120667, "g_quantize_rowwise": 0.04369020462036133, "w_quantize_rowwise": 0.047359615564346313, "w_quantize_colwise_transpose": 0.5526281893253326, "w_quantize_global": 0.13606995344161987, "w_quantize_global_transpose": 0.15017390251159668, 
"time_standard": 3.359586000442505, "time_rowwise": 3.1107664108276367, "time_global": 2.7401968836784363} -{"repeat": 64, "batch_size": 16384, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 2.4274885654449463, "standard_gw": 2.1799951791763306, "standard_gx": 2.1426528692245483, "rowwise_fwd": 1.195710152387619, "rowwise_bwd": 1.027170568704605, "global_fwd": 1.1747106909751892, "global_bwd": 1.0251589119434357, "x_quantize_rowwise": 0.08098781108856201, "g_quantize_rowwise": 0.3052949905395508, "w_quantize_rowwise": 0.043764710426330566, "w_quantize_colwise_transpose": 0.33987686038017273, "w_quantize_global": 0.13646483421325684, "w_quantize_global_transpose": 0.14739856123924255, "time_standard": 6.750136613845825, "time_rowwise": 5.172800272703171, "time_global": 5.050010979175568} -{"repeat": 64, "batch_size": 16384, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 2.1661892533302307, "standard_gw": 2.0948275923728943, "standard_gx": 2.306375652551651, "rowwise_fwd": 1.0587647557258606, "rowwise_bwd": 1.1999905109405518, "global_fwd": 1.0296404361724854, "global_bwd": 1.1749230325222015, "x_quantize_rowwise": 0.3054030239582062, "g_quantize_rowwise": 0.08077546954154968, "w_quantize_rowwise": 0.047225505113601685, "w_quantize_colwise_transpose": 0.600133091211319, "w_quantize_global": 0.13613328337669373, "w_quantize_global_transpose": 0.1484006643295288, "time_standard": 6.567392498254776, "time_rowwise": 5.387119948863983, "time_global": 4.97010350227356} -{"repeat": 64, "batch_size": 32768, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 4.807606339454651, "standard_gw": 4.170913249254227, "standard_gx": 4.117622971534729, "rowwise_fwd": 2.370934933423996, "rowwise_bwd": 1.9481778144836426, "global_fwd": 2.3383721709251404, "global_bwd": 1.9443817436695099, "x_quantize_rowwise": 0.1547597348690033, "g_quantize_rowwise": 0.6000511348247528, "w_quantize_rowwise": 0.04361942410469055, "w_quantize_colwise_transpose": 0.3403201699256897, "w_quantize_global": 0.13600289821624756, "w_quantize_global_transpose": 0.1474134624004364, "time_standard": 13.096142560243607, "time_rowwise": 9.628776460886002, "time_global": 9.491894394159317} -{"repeat": 64, "batch_size": 32768, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 4.1619837284088135, "standard_gw": 4.181284457445145, "standard_gx": 4.635505378246307, "rowwise_fwd": 1.9684135913848877, "rowwise_bwd": 2.3750364780426025, "global_fwd": 1.9445866346359253, "global_bwd": 2.3551955819129944, "x_quantize_rowwise": 0.6004162132740021, "g_quantize_rowwise": 0.15468522906303406, "w_quantize_rowwise": 0.04730746150016785, "w_quantize_colwise_transpose": 0.5999617278575897, "w_quantize_global": 0.1364201307296753, "w_quantize_global_transpose": 0.14847144484519958, "time_standard": 12.978773564100266, "time_rowwise": 9.927105158567429, "time_global": 9.521059691905975} -{"repeat": 64, "batch_size": 65536, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 9.52371209859848, "standard_gw": 8.354485034942627, "standard_gx": 8.69860127568245, "rowwise_fwd": 4.717472940683365, "rowwise_bwd": 3.8843750953674316, "global_fwd": 4.645414650440216, "global_bwd": 3.8761012256145477, "x_quantize_rowwise": 0.3024861216545105, "g_quantize_rowwise": 1.1897757649421692, "w_quantize_rowwise": 0.04366785287857056, "w_quantize_colwise_transpose": 0.33988431096076965, "w_quantize_global": 0.1359507441520691, 
"w_quantize_global_transpose": 0.14724582433700562, "time_standard": 26.576798409223557, "time_rowwise": 18.832147121429443, "time_global": 18.651459366083145} -{"repeat": 64, "batch_size": 65536, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 8.307881653308868, "standard_gw": 8.214320987462997, "standard_gx": 9.21182706952095, "rowwise_fwd": 3.8919784128665924, "rowwise_bwd": 4.72346693277359, "global_fwd": 3.8761794567108154, "global_bwd": 4.673641175031662, "x_quantize_rowwise": 1.1893920600414276, "g_quantize_rowwise": 0.3024972975254059, "w_quantize_rowwise": 0.04708021879196167, "w_quantize_colwise_transpose": 0.6039328873157501, "w_quantize_global": 0.13624504208564758, "w_quantize_global_transpose": 0.14867261052131653, "time_standard": 25.734029710292816, "time_rowwise": 18.972668796777725, "time_global": 18.540948629379272} -{"repeat": 64, "batch_size": 131072, "dim_out": 8192, "dim_in": 2048, "wm": 4, "switch": false, "standard_fwd": 19.30372044444084, "standard_gw": 16.480475664138794, "standard_gx": 17.61433482170105, "rowwise_fwd": 9.49602946639061, "rowwise_bwd": 7.768530398607254, "global_fwd": 9.3533955514431, "global_bwd": 7.749464362859726, "x_quantize_rowwise": 0.5977451801300049, "g_quantize_rowwise": 2.3684948682785034, "w_quantize_rowwise": 0.04375725984573364, "w_quantize_colwise_transpose": 0.34042075276374817, "w_quantize_global": 0.13628974556922913, "w_quantize_global_transpose": 0.14671683311462402, "time_standard": 53.398530930280685, "time_rowwise": 37.09545359015465, "time_global": 36.83258220553398} -{"repeat": 64, "batch_size": 131072, "dim_out": 2048, "dim_in": 8192, "wm": 4, "switch": true, "standard_fwd": 18.041003495454788, "standard_gw": 17.770148813724518, "standard_gx": 17.70009845495224, "rowwise_fwd": 7.756810635328293, "rowwise_bwd": 9.502101689577103, "global_fwd": 7.7384114265441895, "global_bwd": 9.36170294880867, "x_quantize_rowwise": 2.3686252534389496, "g_quantize_rowwise": 0.5980581045150757, "w_quantize_rowwise": 0.04723668098449707, "w_quantize_colwise_transpose": 0.6035342812538147, "w_quantize_global": 0.13603642582893372, "w_quantize_global_transpose": 0.1485198736190796, "time_standard": 53.511250764131546, "time_rowwise": 38.64651545882225, "time_global": 38.121502846479416} -{"repeat": 64, "batch_size": 8192, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 4.598241299390793, "standard_gw": 4.294309765100479, "standard_gx": 4.261095076799393, "rowwise_fwd": 2.0976848900318146, "rowwise_bwd": 1.9718967378139496, "global_fwd": 2.0763762295246124, "global_bwd": 1.9703581929206848, "x_quantize_rowwise": 0.08216872811317444, "g_quantize_rowwise": 0.4405900835990906, "w_quantize_rowwise": 0.1553371548652649, "w_quantize_colwise_transpose": 1.6110725700855255, "w_quantize_global": 0.481240451335907, "w_quantize_global_transpose": 0.5061514675617218, "time_standard": 13.153646141290665, "time_rowwise": 10.653059929609299, "time_global": 9.85119491815567} -{"repeat": 64, "batch_size": 8192, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 4.35885414481163, "standard_gw": 4.29583340883255, "standard_gx": 4.5370906591415405, "rowwise_fwd": 2.0015686750411987, "rowwise_bwd": 2.097565680742264, "global_fwd": 1.969795674085617, "global_bwd": 2.075403928756714, "x_quantize_rowwise": 0.43984130024909973, "g_quantize_rowwise": 0.08216127753257751, "w_quantize_rowwise": 0.22544339299201965, "w_quantize_colwise_transpose": 2.4342015385627747, "w_quantize_global": 
0.48087164759635925, "w_quantize_global_transpose": 0.5099289119243622, "time_standard": 13.19177821278572, "time_rowwise": 11.576615273952484, "time_global": 9.85383614897728} -{"repeat": 64, "batch_size": 16384, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 9.09888744354248, "standard_gw": 8.230950683355331, "standard_gx": 8.465446531772614, "rowwise_fwd": 4.182614386081696, "rowwise_bwd": 3.747660666704178, "global_fwd": 4.138719290494919, "global_bwd": 3.74777615070343, "x_quantize_rowwise": 0.15515834093093872, "g_quantize_rowwise": 0.8699297904968262, "w_quantize_rowwise": 0.15544891357421875, "w_quantize_colwise_transpose": 1.6132444143295288, "w_quantize_global": 0.48100948333740234, "w_quantize_global_transpose": 0.5051903426647186, "time_standard": 25.795284658670425, "time_rowwise": 18.955007195472717, "time_global": 18.128734081983566} -{"repeat": 64, "batch_size": 16384, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 8.378107100725174, "standard_gw": 8.923027664422989, "standard_gx": 9.049762040376663, "rowwise_fwd": 3.765825182199478, "rowwise_bwd": 4.183519631624222, "global_fwd": 3.744799643754959, "global_bwd": 4.1590481996536255, "x_quantize_rowwise": 0.8693933486938477, "g_quantize_rowwise": 0.1553073525428772, "w_quantize_rowwise": 0.2258792519569397, "w_quantize_colwise_transpose": 2.4386271834373474, "w_quantize_global": 0.4811100661754608, "w_quantize_global_transpose": 0.5102269351482391, "time_standard": 26.350896805524826, "time_rowwise": 20.5615796148777, "time_global": 18.842913210392} -{"repeat": 64, "batch_size": 32768, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 18.266115337610245, "standard_gw": 17.671160399913788, "standard_gx": 17.10302010178566, "rowwise_fwd": 8.347474038600922, "rowwise_bwd": 7.514089345932007, "global_fwd": 8.263226598501205, "global_bwd": 7.487393915653229, "x_quantize_rowwise": 0.3021806478500366, "g_quantize_rowwise": 1.7319358885288239, "w_quantize_rowwise": 0.15519559383392334, "w_quantize_colwise_transpose": 1.6133114695549011, "w_quantize_global": 0.48247724771499634, "w_quantize_global_transpose": 0.506427139043808, "time_standard": 53.04029583930969, "time_rowwise": 37.3353473842144, "time_global": 36.44480183720589} -{"repeat": 64, "batch_size": 32768, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 17.73649826645851, "standard_gw": 16.359902918338776, "standard_gx": 18.0993489921093, "rowwise_fwd": 7.493957877159119, "rowwise_bwd": 8.352488279342651, "global_fwd": 7.486194372177124, "global_bwd": 8.28903540968895, "x_quantize_rowwise": 1.7313472926616669, "g_quantize_rowwise": 0.30205026268959045, "w_quantize_rowwise": 0.2255477011203766, "w_quantize_colwise_transpose": 2.4363920092582703, "w_quantize_global": 0.4815347492694855, "w_quantize_global_transpose": 0.5103759467601776, "time_standard": 52.195750176906586, "time_rowwise": 36.90168634057045, "time_global": 35.16044095158577} -{"repeat": 64, "batch_size": 65536, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 36.309611052274704, "standard_gw": 32.85098075866699, "standard_gx": 34.34552624821663, "rowwise_fwd": 16.74525812268257, "rowwise_bwd": 15.026237815618515, "global_fwd": 16.574162989854813, "global_bwd": 14.977734535932541, "x_quantize_rowwise": 0.5954466760158539, "g_quantize_rowwise": 3.4569576382637024, "w_quantize_rowwise": 0.15521422028541565, "w_quantize_colwise_transpose": 1.6133897006511688, "w_quantize_global": 
0.4822872579097748, "w_quantize_global_transpose": 0.5065612494945526, "time_standard": 103.50611805915833, "time_rowwise": 70.44348493218422, "time_global": 69.44413110613823} -{"repeat": 64, "batch_size": 65536, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 35.40017828345299, "standard_gw": 33.037226647138596, "standard_gx": 36.30436211824417, "rowwise_fwd": 15.043705701828003, "rowwise_bwd": 16.756191849708557, "global_fwd": 15.011314302682877, "global_bwd": 16.580048948526382, "x_quantize_rowwise": 3.4548528492450714, "g_quantize_rowwise": 0.5951337516307831, "w_quantize_rowwise": 0.22584572434425354, "w_quantize_colwise_transpose": 2.4329908192157745, "w_quantize_global": 0.4813261330127716, "w_quantize_global_transpose": 0.5101598799228668, "time_standard": 104.74176704883575, "time_rowwise": 71.54594734311104, "time_global": 69.67006251215935} -{"repeat": 64, "batch_size": 131072, "dim_out": 16384, "dim_in": 4096, "wm": 4, "switch": false, "standard_fwd": 73.40333238244057, "standard_gw": 73.76311346888542, "standard_gx": 70.41774317622185, "rowwise_fwd": 33.37597846984863, "rowwise_bwd": 30.345775187015533, "global_fwd": 33.00366923213005, "global_bwd": 30.218638479709625, "x_quantize_rowwise": 1.1825822293758392, "g_quantize_rowwise": 6.902601569890976, "w_quantize_rowwise": 0.15529245138168335, "w_quantize_colwise_transpose": 1.6109198331832886, "w_quantize_global": 0.48149004578590393, "w_quantize_global_transpose": 0.5066059529781342, "time_standard": 217.58418902754784, "time_rowwise": 147.33626320958138, "time_global": 146.05870097875595} -{"repeat": 64, "batch_size": 131072, "dim_out": 4096, "dim_in": 16384, "wm": 4, "switch": true, "standard_fwd": 71.5160183608532, "standard_gw": 73.76786693930626, "standard_gx": 72.98104092478752, "rowwise_fwd": 30.291248112916946, "rowwise_bwd": 33.36654230952263, "global_fwd": 30.181586742401123, "global_bwd": 33.082425594329834, "x_quantize_rowwise": 6.902430206537247, "g_quantize_rowwise": 1.1815279722213745, "w_quantize_rowwise": 0.2262219786643982, "w_quantize_colwise_transpose": 2.4421699345111847, "w_quantize_global": 0.4816502332687378, "w_quantize_global_transpose": 0.5105249583721161, "time_standard": 218.26492622494698, "time_rowwise": 148.17800745368004, "time_global": 146.1080126464367} diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py deleted file mode 100644 index fd0dd7d58..000000000 --- a/benchmarking/switchback/make_plot_with_jsonl.py +++ /dev/null @@ -1,151 +0,0 @@ -import matplotlib.gridspec as gridspec -import matplotlib.pyplot as plt -import pandas as pd - -cmap = plt.get_cmap("cool") - -if __name__ == "__main__": - fig = plt.figure(tight_layout=True, figsize=(12, 3.5)) - gs = gridspec.GridSpec(1, 2) - - dims_to_consider = [1024, 1280, 1408, 1664, 2048, 4096] - batch_size_for_plot1 = 32768 - batch_sizes_for_plot2 = [2**14, 2**15, 2**16, 2**17] - dims_to_xtick = [1024, 2048, 4096] - logscale_plot1 = True - - ax = fig.add_subplot(gs[0, 0]) - - # TODO: change this to what you want. 
- rdf = pd.read_json("speed_benchmark/info_a100_py2.jsonl", lines=True) - df = rdf[rdf.batch_size == batch_size_for_plot1] - - # first plot the time occupied by different operations - for k, marker, ls, color, name in [ - ("standard_gx+standard_gw+standard_fwd", "s", "-", "C2", "Standard fp16 (sum of parts)"), - ( - "x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd", - "o", - "-", - "C4", - "SwitchBack int8 (sum of parts)", - ), - ("standard_fwd", "^", "--", "C2", "Matmul XW (standard)"), - ("standard_gw", "^", "-.", "C2", "Matmul GW (standard)"), - ("standard_gx", "^", ":", "gray", "Matmul GX (both)"), - ("global_fwd", "^", "--", "C4", "Int8 Matmul XW (switchback)"), - ("global_bwd", "^", "-.", "C4", "Int8 Matmul GW (switchback)"), - ("x_quantize_rowwise", "P", "--", "C4", "Quantize rowwise X (switchback)"), - ("g_quantize_rowwise", "P", "-.", "C4", "Quantize rowwise G (switchback)"), - ("w_quantize_global", ".", "--", "C4", "Quantize global W (switchback)"), - ("w_quantize_global_transpose", ".", "-.", "C4", "Quantize global and\ntranspose W (switchback)"), - ]: - xs = [] - ys = [] - for embed_dim in dims_to_consider: - # average over dim -> 4*dim and 4*dim -> dim - df_ = df[df.dim_in == embed_dim] - df_ = df_[df_.dim_out == embed_dim * 4] - xs.append(embed_dim) - y_ = 0 - for k_ in k.split("+"): - y_ += df_[k_].values[0] - df_ = df[df.dim_in == embed_dim * 4] - df_ = df_[df_.dim_out == embed_dim] - for k_ in k.split("+"): - y_ += df_[k_].values[0] - ys.append(y_ * 0.5) - - ax.plot( - xs, - ys, - color=color, - label=name, - marker=marker, - markersize=5 if marker == "s" else 5, - linestyle=ls, - linewidth=2 if "+" in k else 1.0, - ) - - ax.set_xlabel("dim", fontsize=13) - ax.set_ylabel("time (ms)", fontsize=13) - - ax.grid() - - ax.set_xscale("log") - if logscale_plot1: - ax.set_yscale("log") - - ax.tick_params(axis="x", labelsize=11) - ax.tick_params(axis="y", labelsize=11) - - ax.set_xticks(dims_to_xtick) - ax.set_xticklabels(dims_to_xtick) - ax.set_xticks([], minor=True) - - leg = ax.legend(loc="upper center", bbox_to_anchor=(-0.64, 1.0), ncol=1, fontsize=10) - leg.get_texts()[0].set_fontweight("bold") - leg.get_texts()[1].set_fontweight("bold") - plt.subplots_adjust(left=0.1) - ax.set_title(" Linear layer, batch * sequence length = 32k", fontsize=10, loc="left", y=1.05, pad=-20) - - ax = fig.add_subplot(gs[0, 1]) - - # now plot the % speedup for different batch sizes - for j, batch_size in enumerate(batch_sizes_for_plot2): - all_xs, all_ys = [], [] - for k, marker, ls, color, name in [ - ("standard_gx+standard_gw+standard_fwd", "s", "-", "C2", "Standard fp16 (total time)"), - ( - "x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd", - "o", - "-", - "C4", - "SwitchBack int8 (total time)", - ), - ]: - xs, ys = [], [] - df = rdf[rdf.batch_size == batch_size] - for embed_dim in dims_to_consider: - df_ = df[df.dim_in == embed_dim] - df_ = df_[df_.dim_out == embed_dim * 4] - xs.append(embed_dim) - y_ = 0 - for k_ in k.split("+"): - y_ += df_[k_].values[0] - df_ = df[df.dim_in == embed_dim * 4] - df_ = df_[df_.dim_out == embed_dim] - for k_ in k.split("+"): - y_ += df_[k_].values[0] - ys.append(y_ * 0.5) - all_xs.append(xs) - all_ys.append(ys) - - color = cmap(j * 0.25) - real_ys = [-((all_ys[1][i] - all_ys[0][i]) / all_ys[0][i]) * 100 for i in range(len(all_ys[0]))] - markers = ["^", "v", "P", "o"] - ax.plot( - all_xs[0], - real_ys, - color=color, - 
label=f"batch * sequence length = {batch_size}", - marker=markers[j], - markersize=5 if marker == "s" else 5, - ) - - ax.legend() - ax.set_xlabel("dim", fontsize=13) - ax.set_xscale("log") - ax.grid() - ax.set_ylabel(r"% speedup", fontsize=13) - - ax.tick_params(axis="x", labelsize=11) - ax.tick_params(axis="y", labelsize=11) - - ax.set_xticks(dims_to_xtick) - ax.set_xticklabels(dims_to_xtick) - ax.set_xticks([], minor=True) - - ax.set_title(" Linear layer summary, varying dimensions", fontsize=10, loc="left", y=1.05, pad=-20) - - plt.savefig("speed_benchmark/plot_with_info.pdf", bbox_inches="tight") diff --git a/benchmarking/switchback/plot_with_info.pdf b/benchmarking/switchback/plot_with_info.pdf deleted file mode 100644 index d186e91b7d96c6e605fd2802ee37881e6294cdd7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 34876 zcmb@u1yq(x*8mFA-CZx;`O>9ycSv_iN~hA@4T4A`A*Cptl2U>+NOy;Tpxo!hbN=uA z-{HTy?z+o0%)HM$Gkee8`!-4rKTbbU6RL00`a?>|HDY?0%uou5Rh-4!0@JvH1b*Qu=IH1I{NjX+ z`6pE%-G7r&(bCb{%?8Bz7iei)dtl)pc4>Qn8j_agP8OCB{9WB#EKMDdy|TX;%DIyD zCbqU2kEKVxqWff}HTf(fm_N%j#@z4KM?&V#6KGgUWG*N+sG;iTKm2qT40E&=s=oTt zAgZG#1}aY^4L;7)Eq{6Ad>lV`8vXF0@9dAw_$K<(sjGcu&$i{mU1fLRZ?CfF$kD~Q z^75_^!MlB!&?PF-sx^AuIhh2_tL>?UgY<>uL$xdX(A<5GHFKw&FKa^OzUAi5HlJ;G zV&9EzXZp~-wh%uQ!~R0Aez0Q5cPmZev}x6sB}#|ao;X%B-hUzZ#AV?+`ha~w>0-^- zY@_tryw{Am`bUjMiwd%oRs??S)NEdYk9F#aS`?@h$GVBabEL^r0|OcPOT}m&0yoOQ za?hwX@p+9ea=G7pOIKT&Z^bCFH(%N~I`MIfOaH6AtMh~H=!N9l$(+fBXye*tqvqKk z?|cjKW)2(M+}VGAxUM2-+RW?-*T|78<71rBkz+0`DLpVTe^CbRh+mJPc1<~uow+=B z+e|s=&l}Q=&Cc|RIzZ>Ge=S7PrOGBq65o}h)vvIil@`&$OXri!n)tmC)lQz zvd^blE*&(~@ zz&H}@aS@p#(dn|@^{`n!#vo|R`SCTJV#Lvr!q|Ax@5BBxuW^yvEZ#36Cj)fGb8d9G z_8K$!)wW+&J{Pm>vouY%zMm8z>_1^DAaxSH(bXM2~EyP-hPOq}^G*_TI4jl0k?ZJ;gp-kC4eEz8v%M%3`d^zoQ6~U1D z8%c*~p`hpVoA!p!=-8R~(J_(b+t_#3i|l*&Zg(A@`q;xyI=| z&ggf5i{Z(pIw|~Sq*6;WHv2KI@O<<`afLziuj~kMfGIark( zr)wWYdzRd5wOz@Hmp&j39N89emyZx{YKi+DqMAj*>{KE@E8^G?hJF&92*Xu7O;Uu! z_T^0~Pci+6zyxj=@<-M0djt!&vi68c7y>NXv7?<%Bw5cH8 zUAkJh!dnP-%)LTi!NQ|9%W*hDlnv~%AQpCcfbaeC^Aaa!h8WzFBSNJ9B-a@S1>qe; zG`okF8%PNyaralHROUq-Nn@EPL~0LA z%A&R?yxc_UDB_`GVhDPk^XltXj_zu{7*+9=ECFs9gBud*kRRd-UL5I-dF2c6Jh zTutF!JseZO-%iwuB@f|krb*aNMPRe7<=PM5Fjv+L)==GA`)-4iJ-NPOnwl7z>jqyd z_+gMo8vhwrCsKGov#yKF$d+Vn;@InlI0a2`(t*AN;%+?BrlhO25)K?u5&M`chQVsl zxMbhyzM4$X5_X1U=p|4!J7-=f{mHmBNKJj5BWE|4l@KTVfn<{DW=fM;3g2J$fWW z-CY7G#cU%rlt&?E(kZq%G>YUcBa788Nyy1p&E(Ti98}oE;0h`ZD7ZG(fwd~$ABOm_ zYyz>WwJ%h~Z)HzXFaWp=-qo>`QSC}rMb0m9nQ*%a;mjtdZSq23VF$FD6YD#BKhH@? zbadquGulk>k%9&{7kx~M!HNXhZ%rE{$X^C0&U)4u zq`E0#Pt6JYjgD?kuoKfqD&oo42jX>SVUNq|&Tn|d(VjUE(7jjp2~;)fDo1+tjO0}W zI{s-P4;!&xai79MpTs^f4vJqEAuipnrS$r9f?*ux7~xuLLul}cG_RueYSMcfJa*5C z>X^u@YKiHWYMCtTU$&cxXvO-=pJm9G9l47MAfa#KORt5%WN7wKJ7wOeOvlSTl6bK- z_ACORud2FJ&_=+4ZE8GnYKGAhcD0x)%)6l4Wqb(CjvS7CZAzvQxZ18D?2s+_&#mXAyONSh4U2WML-Vq+HkUQ^xU-M^gB}bG|TkS z?g|kviKnr#aNOQj8{Rfm&NU%)(z4=%>YYxD#)2d2g*sJn=3$-V9aaw6gT!3ClL@dh zG6e<|l-nXf7K*QRBPNjRNI87CXbv&rq^v}StUo{F77ZgzAa70b%uXwsn~4wox*B@R zXsLL;d`YN0O$SA1N>$$@`@%#p9n}R<36N6gd2w)*eYLHgxJn!K;k`z7ZC?{Ekr)CNx_!?KzuD znd)MGhVtt@5g6)`5VrW%&Llf}>#8Z`;ru;DYp2~63F_e|*l5@lb! 
z3>XbQp2yCgB|ZU)%A$ir1pqqu0ZWL%*@0KFH-On@7lI{-VWhBMfN*AQgdv9w;ALkU z3|_3ZI3yVXzK$ic3B2r_L&3|=7L)Y|*p9*s2^JK6kahGCVw#nPJ`6`zsfL20He}43kpsNFc2yQN*BZPQ9$(6(i&E9R6}-YhKh*l z%0om|9xS&hni=6Yw4*6ZMeG(c1voLg#Z18ph23JNV2i?L6;r_ZvKi47U3*Ra&t<|uPI zI-d0A`(eC;o}rXavVcZgr<(2pe7t_x%(s_n|yVPnw1sW-~yoUbSFf{0DTZdki?bO51$<@_4)uRT)=JPRa!o9D_TRn7#R^mxrbUWj*oGU~IU{@KxKekD(3^!=T$r&_CO zw$%D5pYWGbIQiW0^n>;0MT<`L&wG0)#v)c~rkixghjAa0_mE8L+!oF;DT>Pztl8z% zyFkRRVp%Egq>0J-|8EY|FsK!WNzRMb>RN_fwpAiuzc z1cJ}cS1RiR)+pi#a0%P$4Q}v#_TjPoR(s%GUEJ0|HtZA>A$yd(*LvS2=%e=YmMkO_ z$rJ$Qkw|n3>`xsO(dB3Ca)kuAd*G0^iGhPlvDkm0T%dx&!UO-4hL)G;5AVySQNVm3 zs>?wm(;4Ur9c~)hM03&@Bv94x(2z=mI~-CQaM0if(tL2y&_zgGG`QIb`im3Lj1yoygO_ zL^>c-c*0T94OU!nky@LZM&}L3Ao1qQK-a=?#RY9BPnki>&E3DCQ$orFj^SRT5>-czSRHQuQ%vTMVmzzdMYILq}bX9Z@8)sa)8ZW=l`LrK{ zcix2&%`<1im7~0E%Ah0VFlW9D22!hX(8xpzPrs3gRGu~<6V>3zg)1&FG(Oco)4dw`dLYF*^cPDf9vI{j5QME_C}cR*!O6?( Ind#&H0byt<4*&oF diff --git a/benchmarking/switchback/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py deleted file mode 100644 index eaba0e9cd..000000000 --- a/benchmarking/switchback/speed_benchmark.py +++ /dev/null @@ -1,160 +0,0 @@ -import json -import time - -import torch - -from bitsandbytes.triton.int8_matmul_mixed_dequantize import ( - int8_matmul_mixed_dequantize, -) -from bitsandbytes.triton.int8_matmul_rowwise_dequantize import ( - int8_matmul_rowwise_dequantize, -) -from bitsandbytes.triton.quantize_columnwise_and_transpose import ( - quantize_columnwise_and_transpose, -) -from bitsandbytes.triton.quantize_global import ( - quantize_global, - quantize_global_transpose, -) -from bitsandbytes.triton.quantize_rowwise import quantize_rowwise - -# KNOW ISSUE: need to optimize "w_quantize_colwise_transpose" when embeddim is too large. - - -def get_time(k, fn, info_dict): - for _ in range(repeat // 2): - fn() - - torch.cuda.synchronize() - start = time.time() - for _ in range(repeat): - fn() - - torch.cuda.synchronize() - end = time.time() - ms = (end - start) / repeat * 1000 - print(f"time {k}: {ms:.3f} ms") - info_dict[k] = ms - - -if __name__ == "__main__": - torch.manual_seed(0) - wm = 4 - for dim in [1024, 1280, 1408, 1664, 2048, 4096]: - # note "batch_size" is actually "batch_size * embed_dim", which is why it's large - for batch_size in [256 * 32, 256 * 64, 256 * 128, 256 * 256, 256 * 512]: - # switch switches dim_in and dim_out - for switch in [False, True]: - # hparams - repeat = 64 - batch_size = batch_size - dim_out = dim * wm - dim_in = dim - if switch: - dim_out = dim - dim_in = wm * dim - - dim_in = round(dim_in) - dim_out = round(dim_out) - - # simulate forward pass - x = torch.randn(batch_size, dim_in, dtype=torch.float16).cuda() - g = torch.randn(batch_size, dim_out, dtype=torch.float16).cuda() - w = torch.randn(dim_out, dim_in, dtype=torch.float16).cuda() - - x_int8 = x.clone().to(torch.int8) - g_int8 = g.clone().to(torch.int8) - w_int8 = w.clone().to(torch.int8) - wt_int8 = w.t().contiguous().clone().to(torch.int8) - state_x_rowwise = x.max(dim=1)[0] - state_g_rowwise = g.max(dim=1)[0] - state_w_columnwise = w.max(dim=0)[0] - state_w_rowwise = w.max(dim=1)[0] - state_w_global = w.max() - - info = { - "repeat": repeat, - "batch_size": batch_size, - "dim_out": dim_out, - "dim_in": dim_in, - "wm": wm, - "switch": switch, - } - - get_time("standard_fwd", lambda: x.matmul(w.t()), info) - get_time("standard_gw", lambda: g.t().matmul(x), info) - get_time("standard_gx", lambda: g.matmul(w), info) - get_time( - "rowwise_fwd", - lambda: int8_matmul_rowwise_dequantize( - x_int8, - w_int8.t(), 
- state_x_rowwise, - state_w_columnwise, - None, - ), - info, - ) - get_time( - "rowwise_bwd", - lambda: int8_matmul_rowwise_dequantize( - g_int8, - wt_int8.t(), - state_x_rowwise, - state_w_rowwise, - None, - ), - info, - ) - get_time( - "global_fwd", - lambda: int8_matmul_mixed_dequantize(x_int8, w_int8.t(), state_x_rowwise, state_w_global, None), - info, - ) - get_time( - "global_bwd", - lambda: int8_matmul_mixed_dequantize(g_int8, wt_int8.t(), state_x_rowwise, state_w_global, None), - info, - ) - get_time("x_quantize_rowwise", lambda: quantize_rowwise(x), info) - get_time("g_quantize_rowwise", lambda: quantize_rowwise(g), info) - get_time("w_quantize_rowwise", lambda: quantize_rowwise(w), info) - get_time("w_quantize_colwise_transpose", lambda: quantize_columnwise_and_transpose(w), info) - get_time("w_quantize_global", lambda: quantize_global(w), info) - get_time("w_quantize_global_transpose", lambda: quantize_global_transpose(w), info) - - time_standard = info["standard_fwd"] + info["standard_gx"] + info["standard_gw"] - time_rowwise = ( - info["x_quantize_rowwise"] - + info["g_quantize_rowwise"] - + info["w_quantize_colwise_transpose"] - + info["w_quantize_rowwise"] - + info["standard_gw"] - + info["rowwise_fwd"] - + info["rowwise_bwd"] - ) - time_global = ( - info["x_quantize_rowwise"] - + info["g_quantize_rowwise"] - + info["w_quantize_global"] - + info["w_quantize_global_transpose"] - + info["standard_gw"] - + info["global_fwd"] - + info["global_bwd"] - ) - - print("TOTAL STANDARD", time_standard) - print("TOTAL ROWWISE", time_rowwise) - print("TOTAL GLOBAL", time_global) - - print("speedup", -100 * (time_global - time_standard) / time_standard) - - info["time_standard"] = time_standard - info["time_rowwise"] = time_rowwise - info["time_global"] = time_global - - info_json = json.dumps(info) - - # TODO: change this to what you want. - with open("speed_benchmark/info.jsonl", "a") as file: - file.write(info_json + "\n") diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index bdc150e5e..729342070 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -9,7 +9,7 @@ import torch -from . import _ops, research, utils +from . import _ops, utils from .autograd._functions import ( MatmulLtState, matmul, diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index da168e17b..614c19051 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -53,16 +53,12 @@ def get_current_outlier_idx(self): @dataclass class MatmulLtState: - _tile_indices: Optional[torch.Tensor] = None # TODO: remove - force_no_igemmlt: bool = False CB: Optional[torch.Tensor] = None - CxB: Optional[torch.Tensor] = None # TODO: Deprecate/remove SB: Optional[torch.Tensor] = None SCB: Optional[torch.Tensor] = None - CxBt: Optional[torch.Tensor] = None # TODO: Deprecate/remove SBt: Optional[torch.Tensor] = None CBt: Optional[torch.Tensor] = None @@ -75,22 +71,29 @@ class MatmulLtState: is_training = True has_fp16_weights = True use_pool = False - formatB = "row" # TODO: Deprecate/remove + + # Deprecated attributes kept for downstream compatibility (TGI, vLLM). + # These are always None and will be fully removed in the next release. 
+ _deprecated_fields = frozenset({"CxB", "CxBt", "formatB", "_tile_indices"}) + + def __getattr__(self, name): + if name in MatmulLtState._deprecated_fields: + warnings.warn( + f"MatmulLtState.{name} is deprecated and will be removed in the next bitsandbytes release.", + FutureWarning, + stacklevel=2, + ) + return None + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") def reset_grads(self): self.CB = None - self.CxB = None self.SB = None self.SCB = None - self.CxBt = None self.SBt = None self.CBt = None - @property - def tile_indices(self): - raise ValueError("tile_indices is no longer supported.") - class MatMul8bitLt(torch.autograd.Function): @staticmethod @@ -293,7 +296,6 @@ def backward(ctx, grad_output): class MatMul4Bit(torch.autograd.Function): # forward is the same, but we added the fallback for pre-turing GPUs - # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") @staticmethod def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState] = None): diff --git a/bitsandbytes/backends/utils.py b/bitsandbytes/backends/utils.py index ec96a440c..34e3d5faa 100644 --- a/bitsandbytes/backends/utils.py +++ b/bitsandbytes/backends/utils.py @@ -4,9 +4,8 @@ import torch try: - import triton.language as tl # noqa: F401 - import triton # noqa: F401 + import triton.language as tl # noqa: F401 triton_available = True except ImportError: diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index fe687e1e8..784eeafe5 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -11,7 +11,6 @@ import numpy as np import torch from torch import Tensor -from typing_extensions import deprecated from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict @@ -20,32 +19,6 @@ name2qmap = {} """C FUNCTIONS FOR OPTIMIZERS""" -str2optimizer8bit = { - "adam": ( - lib.cadam_static_8bit_grad_32, - lib.cadam_static_8bit_grad_16, - ), - "momentum": ( - lib.cmomentum_static_8bit_grad_32, - lib.cmomentum_static_8bit_grad_16, - ), - "rmsprop": ( - lib.crmsprop_static_8bit_grad_32, - lib.crmsprop_static_8bit_grad_16, - ), - "lion": ( - lib.clion_static_8bit_grad_32, - lib.clion_static_8bit_grad_16, - ), - "lamb": ( - lib.cadam_static_8bit_grad_32, - lib.cadam_static_8bit_grad_16, - ), - "lars": ( - lib.cmomentum_static_8bit_grad_32, - lib.cmomentum_static_8bit_grad_16, - ), -} class GlobalPageManager: @@ -1069,110 +1042,6 @@ def dequantize_4bit( return out -@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning) -def quantize( - A: Tensor, - code: Optional[torch.Tensor] = None, - out: Optional[torch.Tensor] = None, -) -> tuple[Tensor, tuple[Tensor, Tensor]]: - if code is None: - if "dynamic" not in name2qmap: - name2qmap["dynamic"] = create_dynamic_map().to(A.device) - code = name2qmap["dynamic"] - code = code.to(A.device) - - absmax = torch.abs(A).max() - if absmax.dtype != torch.float32: - absmax = absmax.float() - inp = A / absmax - out = quantize_no_absmax(inp, code, out) - return out, (absmax, code) - - -@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning) -def dequantize( - A: Tensor, - state: Optional[tuple[Tensor, Tensor]] = None, - absmax: Optional[torch.Tensor] = None, - code: Optional[torch.Tensor] = None, - out: Optional[torch.Tensor] = None, -) -> Tensor: - assert state is not None or absmax is not None - if code is None and state is None: - if "dynamic" not in 
name2qmap: - name2qmap["dynamic"] = create_dynamic_map().to(A.device) - code = name2qmap["dynamic"] - code = code.to(A.device) - - if state is None: - state = (absmax, code) - out = dequantize_no_absmax(A, state[1], out) - return out * state[0] - - -@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning) -def quantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] = None) -> Tensor: - """ - Quantizes input tensor to 8-bit. - - Quantizes the 32-bit input tensor `A` to the 8-bit output tensor - `out` using the quantization map `code`. - - Parameters - ---------- - A : torch.Tensor - The input tensor. - code : torch.Tensor - The quantization map. - out : torch.Tensor, optional - The output tensor. Needs to be of type byte. - - Returns - ------- - torch.Tensor: - Quantized 8-bit tensor. - """ - with _cuda_device_of(A): - if out is None: - out = torch.zeros_like(A, dtype=torch.uint8) - is_on_gpu([A, out]) - lib.cquantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel())) - - return out - - -@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning) -def dequantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] = None) -> Tensor: - """ - Dequantizes the 8-bit tensor to 32-bit. - - Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via - the quantization map `code`. - - Parameters - ---------- - A : torch.Tensor - The 8-bit input tensor. - code : torch.Tensor - The quantization map. - out : torch.Tensor - The 32-bit output tensor. - - Returns - ------- - torch.Tensor: - 32-bit output tensor. - """ - with _cuda_device_of(A): - if out is None: - out = torch.zeros_like(A, dtype=torch.float32) - is_on_gpu([code, A, out]) - stream = _get_tensor_stream(A) - lib.cdequantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel()), stream) - - return out - - def optimizer_update_32bit( optimizer_name: str, g: Tensor, @@ -1262,143 +1131,6 @@ def optimizer_update_32bit( ) -@deprecated( - "This function is deprecated and will be removed in a future release. " - "Please use optimizer_update_8bit_blockwise instead. ", - category=FutureWarning, -) -def optimizer_update_8bit( - optimizer_name: str, - g: Tensor, - p: Tensor, - state1: Tensor, - state2: Optional[torch.Tensor], - beta1: float, - beta2: float, - eps: float, - step: int, - lr: float, - qmap1: Tensor, - qmap2: Optional[torch.Tensor], - max1: Tensor, - max2: Optional[torch.Tensor], - new_max1: Tensor, - new_max2: Optional[torch.Tensor], - weight_decay: float = 0.0, - gnorm_scale: float = 1.0, - unorm_vec: Optional[torch.Tensor] = None, - max_unorm: float = 0.0, -) -> None: - """ - Performs an inplace Adam update. - - Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights. - Uses AdamW formulation if weight decay > 0.0. - - Parameters - ---------- - optimizer_name : str - The name of the optimizer. Choices {adam, momentum} - g : torch.Tensor - Gradient tensor. - p : torch.Tensor - Parameter tensor. - state1 : torch.Tensor - Adam state 1. - state2 : torch.Tensor - Adam state 2. - beta1 : float - Adam beta1. - beta2 : float - Adam beta2. - eps : float - Adam epsilon. - weight_decay : float - Weight decay. - step : int - Current optimizer step. - lr : float - The learning rate. - qmap1 : torch.Tensor - Quantization map for first Adam state. - qmap2 : torch.Tensor - Quantization map for second Adam state. - max1 : torch.Tensor - Max value for first Adam state update. 
- max2 : torch.Tensor - Max value for second Adam state update. - new_max1 : torch.Tensor - Max value for the next Adam update of the first state. - new_max2 : torch.Tensor - Max value for the next Adam update of the second state. - gnorm_scale : float - The factor to rescale the gradient to the max clip value. - unorm_vec : torch.Tensor - The tensor for the update norm. - max_unorm : float - The maximum update norm relative to the weight norm. - """ - - param_norm = 0.0 - if max_unorm > 0.0: - param_norm = torch.norm(p.data.float()) - - with _cuda_device_of(g): - is_on_gpu([g, p, state1, state2, unorm_vec, qmap1, qmap2, max1, max2, new_max1, new_max2]) - if g.dtype == torch.float32 and state1.dtype == torch.uint8: - str2optimizer8bit[optimizer_name][0]( - get_ptr(p), - get_ptr(g), - get_ptr(state1), - get_ptr(state2), - get_ptr(unorm_vec), - ct.c_float(max_unorm), - ct.c_float(param_norm), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_int32(step), - ct.c_float(lr), - get_ptr(qmap1), - get_ptr(qmap2), - get_ptr(max1), - get_ptr(max2), - get_ptr(new_max1), - get_ptr(new_max2), - ct.c_float(weight_decay), - ct.c_float(gnorm_scale), - ct.c_int32(g.numel()), - ) - elif g.dtype == torch.float16 and state1.dtype == torch.uint8: - str2optimizer8bit[optimizer_name][1]( - get_ptr(p), - get_ptr(g), - get_ptr(state1), - get_ptr(state2), - get_ptr(unorm_vec), - ct.c_float(max_unorm), - ct.c_float(param_norm), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_int32(step), - ct.c_float(lr), - get_ptr(qmap1), - get_ptr(qmap2), - get_ptr(max1), - get_ptr(max2), - get_ptr(new_max1), - get_ptr(new_max2), - ct.c_float(weight_decay), - ct.c_float(gnorm_scale), - ct.c_int32(g.numel()), - ) - else: - raise ValueError( - f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}", - ) - - def optimizer_update_8bit_blockwise( optimizer_name: str, g: Tensor, @@ -1445,48 +1177,6 @@ def optimizer_update_8bit_blockwise( ) -@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning) -def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5): - """Applies percentile clipping - - grad: torch.Tensor - The gradient tensor. - gnorm_vec: torch.Tensor - Vector of gradient norms. 100 elements expected. - step: int - The current optimization steps (number of past gradient norms). 
- - """ - with _cuda_device_of(grad): - is_on_gpu([grad, gnorm_vec]) - if grad.dtype == torch.float32: - lib.cpercentile_clipping_g32( - get_ptr(grad), - get_ptr(gnorm_vec), - ct.c_int32(step), - ct.c_int32(grad.numel()), - ) - elif grad.dtype == torch.float16: - lib.cpercentile_clipping_g16( - get_ptr(grad), - get_ptr(gnorm_vec), - ct.c_int32(step), - ct.c_int32(grad.numel()), - ) - else: - raise ValueError(f"Gradient type {grad.dtype} not supported!") - - current_gnorm = torch.sqrt(gnorm_vec[step % 100]) - vals, _ = torch.sort(gnorm_vec) - clip_value = torch.sqrt(vals[percentile]) - gnorm_scale = 1.0 - - if current_gnorm > clip_value: - gnorm_scale = clip_value / current_gnorm - - return current_gnorm, clip_value, gnorm_scale - - def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8): if not torch.cuda.is_initialized(): torch.cuda.init() diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 20aff67a3..54c2614bd 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -16,11 +16,4 @@ OutlierAwareLinear, Params4bit, StableEmbedding, - SwitchBackLinearBnb, -) -from .triton_based_modules import ( - StandardLinear, - SwitchBackLinear, - SwitchBackLinearGlobal, - SwitchBackLinearVectorwise, ) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 67847f40c..9f05ac6fb 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -1137,42 +1137,3 @@ def forward(self, x): w = self.quantize_weight(self.weight, self.outlier_dim) self.weight.data.copy_(w) self.is_quantized = True - - -class SwitchBackLinearBnb(nn.Linear): - def __init__( - self, - input_features, - output_features, - bias=True, - has_fp16_weights=True, - memory_efficient_backward=False, - threshold=0.0, - index=None, - device=None, - ): - super().__init__(input_features, output_features, bias, device) - self.state = bnb.MatmulLtState() - self.index = index - - self.state.threshold = threshold - self.state.has_fp16_weights = has_fp16_weights - self.state.memory_efficient_backward = memory_efficient_backward - if threshold > 0.0 and not has_fp16_weights: - self.state.use_pool = True - - self.weight = Int8Params(self.weight.data, has_fp16_weights=has_fp16_weights, requires_grad=has_fp16_weights) - - def init_8bit_state(self): - self.state.CB = self.weight.CB - self.state.SCB = self.weight.SCB - self.weight.CB = None - self.weight.SCB = None - - def forward(self, x): - self.state.is_training = self.training - - if self.weight.CB is not None: - self.init_8bit_state() - - return bnb.matmul_mixed(x.half(), self.weight.half(), bias=None, state=self.state) + self.bias diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py deleted file mode 100644 index aa8494942..000000000 --- a/bitsandbytes/nn/triton_based_modules.py +++ /dev/null @@ -1,264 +0,0 @@ -from functools import partial - -import torch -import torch.nn as nn - -from bitsandbytes.triton.dequantize_rowwise import dequantize_rowwise -from bitsandbytes.triton.int8_matmul_mixed_dequantize import ( - int8_matmul_mixed_dequantize, -) -from bitsandbytes.triton.int8_matmul_rowwise_dequantize import ( - int8_matmul_rowwise_dequantize, -) -from bitsandbytes.triton.quantize_columnwise_and_transpose import ( - quantize_columnwise_and_transpose, -) -from bitsandbytes.triton.quantize_global import ( - quantize_global, - quantize_global_transpose, -) -from bitsandbytes.triton.quantize_rowwise import quantize_rowwise -from bitsandbytes.triton.triton_utils 
import is_triton_available - - -class _switchback_global(torch.autograd.Function): - @staticmethod - def forward(ctx, X_3D, W, bias): - # reshape input to [N * L, D] - X = X_3D.view(-1, X_3D.size(-1)) - - # rowwise quantize for X, global quantize for W - X_int8, state_X = quantize_rowwise(X) - W_int8, state_W = quantize_global(W) - - # save for backward. - ctx.save_for_backward = X, W - - # matmult, fused dequant and add bias - # call "mixed" because we are mixing rowwise quantized and global quantized - return int8_matmul_mixed_dequantize(X_int8, W_int8.t(), state_X, state_W, bias).view(*X_3D.size()[:-1], -1) - - @staticmethod - def backward(ctx, G_3D): - # reshape input to [N_out * L, D] - G = G_3D.reshape(-1, G_3D.size(-1)) - - grad_X = grad_W = grad_bias = None - - X, W = ctx.save_for_backward - if ctx.needs_input_grad[0]: - # rowwise quantize for G, global quantize for W - # for W, we also fuse the transpose operation because only A @ B^T is supported - # so we transpose once then call .t() in the matmul - G_int8, state_G = quantize_rowwise(G) - W_int8, state_W = quantize_global_transpose(W) - grad_X = int8_matmul_mixed_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view( - *G_3D.size()[:-1], - -1, - ) - if ctx.needs_input_grad[1]: - # backward pass uses standard weight grad - grad_W = torch.matmul(G.t(), X.to(G.dtype)) - if ctx.needs_input_grad[2]: - grad_bias = G.sum(dim=0) - - return grad_X, grad_W, grad_bias - - -class _switchback_vectorrize(torch.autograd.Function): - @staticmethod - def forward(ctx, X_3D, W, bias): - # reshape input to [N * L, D] - X = X_3D.view(-1, X_3D.size(-1)) - - ctx.save_for_backward = X, W - # rowwise quantize for X - # columnwise quantize for W (first rowwise, transpose later) - X_int8, state_X = quantize_rowwise(X) - W_int8, state_W = quantize_rowwise(W) - - # matmult, fused dequant and add bias - # call kernel which expects rowwise quantized X and W - return int8_matmul_rowwise_dequantize(X_int8, W_int8.t(), state_X, state_W, bias).view(*X_3D.size()[:-1], -1) - - @staticmethod - def backward(ctx, G_3D): - X, W = ctx.save_for_backward - - G = G_3D.reshape(-1, G_3D.size(-1)) - - grad_X = grad_W = grad_bias = None - - if ctx.needs_input_grad[0]: - # rowwise quantize for G, columnwise quantize for W and fused transpose - # we call .t() for weight later because only A @ B^T is supported - G_int8, state_G = quantize_rowwise(G) - W_int8, state_W = quantize_columnwise_and_transpose(W) - grad_X = int8_matmul_rowwise_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view( - *G_3D.size()[:-1], - -1, - ) - if ctx.needs_input_grad[1]: - # backward pass uses standard weight grad - grad_W = torch.matmul(G.t(), X.to(G.dtype)) - if ctx.needs_input_grad[2]: - grad_bias = G.sum(dim=0) - - return grad_X, grad_W, grad_bias - - -class _switchback_global_mem_efficient(torch.autograd.Function): - @staticmethod - def forward(ctx, X_3D, W, bias): - # reshape input to [N * L, D] - X = X_3D.view(-1, X_3D.size(-1)) - X_3D_sz = X_3D.size() - - # rowwise quantize for X, global quantize for W - X_int8, state_X = quantize_rowwise(X) - del X - W_int8, state_W = quantize_global(W) - - # save for backward. 
- ctx.save_for_backward = X_int8, state_X, W_int8, state_W - - # matmult, fused dequant and add bias - # call "mixed" because we are mixing rowwise quantized and global quantized - return int8_matmul_mixed_dequantize(X_int8, W_int8.t(), state_X, state_W, bias).view(*X_3D_sz[:-1], -1) - - @staticmethod - def backward(ctx, G_3D): - # reshape input to [N_out * L, D] - G = G_3D.reshape(-1, G_3D.size(-1)) - G_3D_sz = G_3D.size() - - grad_X = grad_W = grad_bias = None - - X_int8, state_X, W_int8, state_W = ctx.save_for_backward - if ctx.needs_input_grad[1]: - real_X = dequantize_rowwise(X_int8, state_X) - del X_int8 - grad_W = torch.matmul(G.t(), real_X.to(G.dtype)) - del real_X - if ctx.needs_input_grad[2]: - grad_bias = G.sum(dim=0) - if ctx.needs_input_grad[0]: - G_int8, state_G = quantize_rowwise(G) - del G - W_int8 = W_int8.t().contiguous() - grad_X = int8_matmul_mixed_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view(*G_3D_sz[:-1], -1) - - return grad_X, grad_W, grad_bias - - -class SwitchBackLinear(nn.Linear): - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - device=None, - dtype=None, - vector_wise_quantization: bool = False, - mem_efficient: bool = False, - ): - super().__init__(in_features, out_features, bias, device, dtype) - - if not is_triton_available(): - raise ImportError("""Could not import triton. Please install triton to use SwitchBackLinear. - Alternatively, you can use bnb.nn.SwitchBackLinearBnb, but it will be slower""") - - # By default, we use the global quantization. - self.vector_wise_quantization = vector_wise_quantization - if self.vector_wise_quantization: - self._fn = _switchback_vectorrize - if mem_efficient: - print("mem efficient is not supported for vector-wise quantization.") - exit(1) - else: - if mem_efficient: - self._fn = _switchback_global_mem_efficient - else: - self._fn = _switchback_global - - def prepare_for_eval(self): - # If we just want to do eval, we can pre-quantize the weights instead of doing it on the forward pass. - # Note this is experimental and not tested thoroughly. - # Note this needs to be explicitly called with something like - # def cond_prepare(m): - # if hasattr(m, "prepare_for_eval"): - # m.prepare_for_eval() - # model.apply(cond_prepare) - print("=> preparing for eval.") - if self.vector_wise_quantization: - W_int8, state_W = quantize_rowwise(self.weight) - else: - W_int8, state_W = quantize_global(self.weight) - - self.register_buffer("W_int8", W_int8) - self.register_buffer("state_W", state_W) - - del self.weight - - def forward(self, x): - if self.training: - return self._fn.apply(x, self.weight, self.bias) - else: - # If it hasn't been "prepared for eval", run the standard forward pass. - if not hasattr(self, "W_int8"): - return self._fn.apply(x, self.weight, self.bias) - - # Otherwise, use pre-computed weights. 
- X = x.view(-1, x.size(-1)) - X_int8, state_X = quantize_rowwise(X) - - if self.vector_wise_quantization: - return int8_matmul_rowwise_dequantize(X_int8, self.W_int8.t(), state_X, self.state_W, self.bias).view( - *x.size()[:-1], - -1, - ) - else: - return int8_matmul_mixed_dequantize(X_int8, self.W_int8.t(), state_X, self.state_W, self.bias).view( - *x.size()[:-1], - -1, - ) - - -SwitchBackLinearGlobal = partial(SwitchBackLinear, vector_wise_quantization=False) -SwitchBackLinearGlobalMemEfficient = partial(SwitchBackLinear, vector_wise_quantization=False, mem_efficient=True) -SwitchBackLinearVectorwise = partial(SwitchBackLinear, vector_wise_quantization=True) - - -# This is just the standard linear function. -class StandardLinearFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, input, weight, bias=None): - X = input.view(-1, input.size(-1)) - - ctx.save_for_backward(X, weight, bias) - output = input.matmul(weight.t()) - if bias is not None: - output += bias.unsqueeze(0).expand_as(output) - return output.view(*input.size()[:-1], -1) - - @staticmethod - def backward(ctx, grad_output_3D): - input, weight, bias = ctx.saved_tensors - - grad_output = grad_output_3D.reshape(-1, grad_output_3D.size(-1)) - - grad_input = grad_weight = grad_bias = None - - if ctx.needs_input_grad[0]: - grad_input = grad_output.matmul(weight.to(grad_output.dtype)).view(*grad_output_3D.size()[:-1], -1) - if ctx.needs_input_grad[1]: - grad_weight = grad_output.t().matmul(input.to(grad_output.dtype)) - if bias is not None and ctx.needs_input_grad[2]: - grad_bias = grad_output.sum(0) - - return grad_input, grad_weight, grad_bias - - -class StandardLinear(nn.Linear): - def forward(self, x): - return StandardLinearFunction.apply(x, self.weight, self.bias) diff --git a/bitsandbytes/optim/adagrad.py b/bitsandbytes/optim/adagrad.py index 7459dece1..b871f2bf4 100644 --- a/bitsandbytes/optim/adagrad.py +++ b/bitsandbytes/optim/adagrad.py @@ -17,8 +17,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Base Adagrad optimizer. @@ -42,10 +40,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -67,8 +61,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -84,8 +76,6 @@ def __init__( optim_bits=8, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 8-bit Adagrad optimizer. @@ -109,10 +99,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. 
""" if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -124,7 +110,6 @@ def __init__( raise ValueError("Initial accumulator value != 0.0 not supported!") if lr_decay != 0.0: raise ValueError("Lr Decay != 0.0 not supported!") - assert block_wise super().__init__( "adagrad", params, @@ -135,8 +120,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -152,8 +135,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 32-bit Adagrad optimizer. @@ -177,10 +158,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -202,6 +179,4 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, ) diff --git a/bitsandbytes/optim/adam.py b/bitsandbytes/optim/adam.py index 22a217c3b..63210bdc3 100644 --- a/bitsandbytes/optim/adam.py +++ b/bitsandbytes/optim/adam.py @@ -18,8 +18,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -44,10 +42,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -61,8 +55,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -79,8 +71,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -107,10 +97,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -133,8 +119,6 @@ def __init__( 8, # Hardcoded to 8 bits args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -151,8 +135,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -177,10 +159,6 @@ def __init__( An object with additional arguments. 
min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -194,8 +172,6 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -212,8 +188,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -238,10 +212,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -255,8 +225,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -273,8 +241,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -301,10 +267,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -327,8 +289,6 @@ def __init__( 8, # Hardcoded to 8 bits args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -345,8 +305,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -371,10 +329,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. 
""" @@ -388,7 +342,5 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index 5f225c9ad..36e151dfc 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -18,8 +18,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -44,10 +42,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -61,8 +55,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -79,8 +71,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -107,10 +97,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -133,8 +119,6 @@ def __init__( 8, # Hardcoded to 8 bits args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -151,8 +135,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -177,10 +159,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -194,8 +172,6 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -212,8 +188,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged AdamW optimizer. @@ -237,10 +211,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. 
- percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ super().__init__( "adam", @@ -252,8 +222,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -270,8 +238,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged 8-bit AdamW optimizer. @@ -297,10 +263,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ # Validate unsupported parameters if amsgrad: @@ -321,8 +283,6 @@ def __init__( 8, # Hardcoded to 8 bits args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -339,8 +299,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged 32-bit AdamW optimizer. @@ -364,10 +322,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ super().__init__( "adam", @@ -379,7 +333,5 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) diff --git a/bitsandbytes/optim/ademamix.py b/bitsandbytes/optim/ademamix.py index 928289adb..48a62198e 100644 --- a/bitsandbytes/optim/ademamix.py +++ b/bitsandbytes/optim/ademamix.py @@ -129,8 +129,6 @@ def __init__( optim_bits=optim_bits, args=None, min_8bit_size=min_8bit_size, - percentile_clipping=100, - block_wise=True, is_paged=is_paged, alpha=alpha, t_alpha=t_alpha, @@ -142,8 +140,6 @@ def init_state(self, group, p, gindex, pindex): # In our AdEMAMix implementation, we use `state` to hold # both the fast and slow EMAs. Here we override the base # `Optimizer2State` to allocate a buffer twice as large. - # Additional consideration: we do not support block_wise=False, - # percentile clipping, or max_unorm. 
config = self.get_config(gindex, pindex, group) @@ -380,8 +376,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=min_8bit_size, - percentile_clipping=100, - block_wise=True, is_paged=is_paged, alpha=alpha, t_alpha=t_alpha, diff --git a/bitsandbytes/optim/lamb.py b/bitsandbytes/optim/lamb.py index 8d29cbbfe..6dcfd383f 100644 --- a/bitsandbytes/optim/lamb.py +++ b/bitsandbytes/optim/lamb.py @@ -19,8 +19,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=False, max_unorm=1.0, ): """ @@ -49,10 +47,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. max_unorm (`float`, defaults to 1.0): The maximum gradient norm. """ @@ -66,8 +60,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, max_unorm=1.0, ) @@ -85,8 +77,6 @@ def __init__( adam_w_mode=True, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=False, max_unorm=1.0, ): """ @@ -113,10 +103,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. max_unorm (`float`, defaults to 1.0): The maximum gradient norm. """ @@ -130,8 +116,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, max_unorm=1.0, ) @@ -149,8 +133,6 @@ def __init__( adam_w_mode=True, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=False, max_unorm=1.0, ): """ @@ -177,10 +159,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. max_unorm (`float`, defaults to 1.0): The maximum gradient norm. """ @@ -194,7 +172,5 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, max_unorm=1.0, ) diff --git a/bitsandbytes/optim/lars.py b/bitsandbytes/optim/lars.py index fa2af57bc..c2f5aa784 100644 --- a/bitsandbytes/optim/lars.py +++ b/bitsandbytes/optim/lars.py @@ -20,7 +20,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, max_unorm=0.02, ): """ @@ -45,8 +44,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. 
- percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. max_unorm (`float`, defaults to 0.02): The maximum gradient norm. """ @@ -62,9 +59,7 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, max_unorm=max_unorm, - block_wise=False, ) @@ -79,7 +74,6 @@ def __init__( nesterov=False, args=None, min_8bit_size=4096, - percentile_clipping=100, max_unorm=0.02, ): """ @@ -102,8 +96,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. max_unorm (`float`, defaults to 0.02): The maximum gradient norm. """ @@ -119,9 +111,7 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, max_unorm=max_unorm, - block_wise=False, ) @@ -136,7 +126,6 @@ def __init__( nesterov=False, args=None, min_8bit_size=4096, - percentile_clipping=100, max_unorm=0.02, ): """ @@ -159,8 +148,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. max_unorm (`float`, defaults to 0.02): The maximum gradient norm. """ @@ -176,9 +163,7 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, max_unorm=max_unorm, - block_wise=False, ) diff --git a/bitsandbytes/optim/lion.py b/bitsandbytes/optim/lion.py index 2e4163694..6100491f6 100644 --- a/bitsandbytes/optim/lion.py +++ b/bitsandbytes/optim/lion.py @@ -15,8 +15,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -37,10 +35,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -54,8 +48,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -69,8 +61,6 @@ def __init__( weight_decay=0, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -89,10 +79,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. 
- block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -106,8 +92,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -121,8 +105,6 @@ def __init__( weight_decay=0, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, is_paged=False, ): """ @@ -141,10 +123,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ @@ -158,8 +136,6 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=is_paged, ) @@ -174,8 +150,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged Lion optimizer. @@ -195,10 +169,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ super().__init__( "lion", @@ -210,8 +180,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -225,8 +193,6 @@ def __init__( weight_decay=0, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged 8-bit Lion optimizer. @@ -240,16 +206,10 @@ def __init__( The beta values are the decay rates of the first and second-order moment of the optimizer. weight_decay (`float`, defaults to 0): The weight decay value for the optimizer. - optim_bits (`int`, defaults to 32): - The number of bits of the optimizer state. args (`object`, defaults to `None`): An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ super().__init__( "lion", @@ -261,8 +221,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) @@ -276,8 +234,6 @@ def __init__( weight_decay=0, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Paged 32-bit Lion optimizer. @@ -291,16 +247,10 @@ def __init__( The beta values are the decay rates of the first and second-order moment of the optimizer. 
weight_decay (`float`, defaults to 0): The weight decay value for the optimizer. - optim_bits (`int`, defaults to 32): - The number of bits of the optimizer state. args (`object`, defaults to `None`): An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ super().__init__( "lion", @@ -312,7 +262,5 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, is_paged=True, ) diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index db7a35231..5c00a42bf 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -59,7 +59,7 @@ def override_config(self, parameters, key=None, value=None, key_value_dict=None) The key-values of the optimizer config for the input parameters are overridden This can be both, optimizer parameters like `betas` or `lr`, or it can be - 8-bit specific parameters like `optim_bits` or `percentile_clipping`. + 8-bit specific parameters like `optim_bits`. Arguments: parameters (`torch.Tensor` or `list(torch.Tensors)`): @@ -345,8 +345,6 @@ def get_config(self, gindex, pindex, group): config["t_beta3"] = group.get("t_beta3", 0) config["optim_bits"] = self.args.optim_bits config["min_8bit_size"] = self.args.min_8bit_size - config["percentile_clipping"] = self.args.percentile_clipping - config["block_wise"] = self.args.block_wise config["max_unorm"] = self.args.max_unorm config["skip_zeros"] = self.args.skip_zeros @@ -393,8 +391,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, max_unorm=0.0, skip_zeros=False, is_paged=False, @@ -424,10 +420,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. max_unorm (`float`, defaults to 0.0): The maximum value to normalize each block with. 
skip_zeros (`bool`, defaults to `False`): @@ -466,8 +458,6 @@ def __init__( args = {} args["optim_bits"] = optim_bits args["min_8bit_size"] = min_8bit_size - args["percentile_clipping"] = percentile_clipping - args["block_wise"] = block_wise args["max_unorm"] = max_unorm args["skip_zeros"] = skip_zeros @@ -510,21 +500,12 @@ def init_state(self, group, p, gindex, pindex): state["state2"] = self.get_state_buffer(p, dtype=torch.uint8) state["qmap2"] = self.name2qmap["udynamic"] - if config["block_wise"]: - blocksize = 256 - n = p.numel() - blocks = (n // blocksize) + bool(n % blocksize) + blocksize = 256 + n = p.numel() + blocks = (n // blocksize) + bool(n % blocksize) - state["absmax1"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) - state["absmax2"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) - else: - state["max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) - state["new_max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) - state["max2"] = torch.zeros((1,), dtype=torch.float32, device=p.device) - state["new_max2"] = torch.zeros((1,), dtype=torch.float32, device=p.device) - - if config["percentile_clipping"] < 100: - state["gnorm_vec"] = torch.zeros((100,), device=p.device) + state["absmax1"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) + state["absmax2"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) if config["max_unorm"] > 0.0: state["unorm_vec"] = torch.zeros((1,), device=p.device) @@ -543,16 +524,6 @@ def update_step(self, group, p, gindex, pindex): state["step"] += 1 step = state["step"] - if config["percentile_clipping"] < 100: - _current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping( - grad, - state["gnorm_vec"], - step, - config["percentile_clipping"], - ) - else: - gnorm_scale = 1.0 - if state["state1"].dtype == torch.float: F.optimizer_update_32bit( self.optimizer_name, @@ -568,40 +539,13 @@ def update_step(self, group, p, gindex, pindex): config["betas"][2] if len(config["betas"]) >= 3 else 0.0, config.get("alpha", 0.0), config["weight_decay"], - gnorm_scale, + 1.0, state["unorm_vec"] if config["max_unorm"] > 0.0 else None, max_unorm=config["max_unorm"], skip_zeros=config["skip_zeros"], ) - elif state["state1"].dtype == torch.uint8 and not config["block_wise"]: - F.optimizer_update_8bit( - self.optimizer_name, - grad, - p, - state["state1"], - state["state2"], - config["betas"][0], - config["betas"][1], - config["eps"], - step, - config["lr"], - state["qmap1"], - state["qmap2"], - state["max1"], - state["max2"], - state["new_max1"], - state["new_max2"], - config["weight_decay"], - gnorm_scale=gnorm_scale, - unorm_vec=state["unorm_vec"] if config["max_unorm"] > 0.0 else None, - max_unorm=config["max_unorm"], - ) - - # swap maxes - state["max1"], state["new_max1"] = state["new_max1"], state["max1"] - state["max2"], state["new_max2"] = state["new_max2"], state["max2"] - elif state["state1"].dtype == torch.uint8 and config["block_wise"]: + elif state["state1"].dtype == torch.uint8: F.optimizer_update_8bit_blockwise( self.optimizer_name, grad, @@ -620,7 +564,7 @@ def update_step(self, group, p, gindex, pindex): state["absmax1"], state["absmax2"], config["weight_decay"], - gnorm_scale=gnorm_scale, + gnorm_scale=1.0, skip_zeros=config["skip_zeros"], ) @@ -637,8 +581,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, max_unorm=0.0, skip_zeros=False, is_paged=False, @@ -665,10 +607,6 @@ def __init__( An object with 
additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. max_unorm (`float`, defaults to 0.0): The maximum value to normalize each block with. skip_zeros (`bool`, defaults to `False`): @@ -692,8 +630,6 @@ def __init__( args = {} args["optim_bits"] = optim_bits args["min_8bit_size"] = min_8bit_size - args["percentile_clipping"] = percentile_clipping - args["block_wise"] = block_wise args["max_unorm"] = max_unorm args["skip_zeros"] = skip_zeros @@ -731,18 +667,11 @@ def init_state(self, group, p, gindex, pindex): state["state1"] = self.get_state_buffer(p, dtype=torch.uint8) state["qmap1"] = self.name2qmap["dynamic"] - if config["block_wise"]: - blocksize = 256 - n = p.numel() - blocks = (n // blocksize) + bool(n % blocksize) - - state["absmax1"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) - else: - state["max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) - state["new_max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) + blocksize = 256 + n = p.numel() + blocks = (n // blocksize) + bool(n % blocksize) - if config["percentile_clipping"] < 100: - state["gnorm_vec"] = torch.zeros((100,), device=p.device) + state["absmax1"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) if config["max_unorm"] > 0.0: state["unorm_vec"] = torch.zeros((1,), device=p.device) @@ -761,16 +690,6 @@ def update_step(self, group, p, gindex, pindex): state["step"] += 1 step = state["step"] - if config["percentile_clipping"] < 100: - _current_gnorm, _clip_value, gnorm_scale = F.percentile_clipping( - grad, - state["gnorm_vec"], - step, - config["percentile_clipping"], - ) - else: - gnorm_scale = 1.0 - if state["state1"].dtype == torch.float: F.optimizer_update_32bit( self.optimizer_name, @@ -786,38 +705,13 @@ def update_step(self, group, p, gindex, pindex): 0.0, 0.0, config["weight_decay"], - gnorm_scale, + 1.0, state["unorm_vec"] if config["max_unorm"] > 0.0 else None, max_unorm=config["max_unorm"], skip_zeros=config["skip_zeros"], ) - elif state["state1"].dtype == torch.uint8 and not config["block_wise"]: - F.optimizer_update_8bit( - self.optimizer_name, - grad, - p, - state["state1"], - None, - config["betas"][0], - config["betas"][1], - config["eps"], - step, - config["lr"], - state["qmap1"], - None, - state["max1"], - None, - state["new_max1"], - None, - config["weight_decay"], - gnorm_scale, - state["unorm_vec"] if config["max_unorm"] > 0.0 else None, - max_unorm=config["max_unorm"], - ) - - state["max1"], state["new_max1"] = state["new_max1"], state["max1"] - elif state["state1"].dtype == torch.uint8 and config["block_wise"]: + elif state["state1"].dtype == torch.uint8: F.optimizer_update_8bit_blockwise( self.optimizer_name, grad, @@ -836,6 +730,6 @@ def update_step(self, group, p, gindex, pindex): state["absmax1"], None, config["weight_decay"], - gnorm_scale=gnorm_scale, + gnorm_scale=1.0, skip_zeros=config["skip_zeros"], ) diff --git a/bitsandbytes/optim/rmsprop.py b/bitsandbytes/optim/rmsprop.py index 25611309b..54c1fbda0 100644 --- a/bitsandbytes/optim/rmsprop.py +++ b/bitsandbytes/optim/rmsprop.py @@ -18,8 +18,6 @@ def 
__init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Base RMSprop optimizer. @@ -45,10 +43,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if alpha == 0: raise NotImplementedError("RMSprop with alpha==0.0 is not supported!") @@ -64,8 +58,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -81,8 +73,6 @@ def __init__( centered=False, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 8-bit RMSprop optimizer. @@ -102,16 +92,10 @@ def __init__( The momentum value speeds up the optimizer by taking bigger steps. centered (`bool`, defaults to `False`): Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. - optim_bits (`int`, defaults to 32): - The number of bits of the optimizer state. args (`object`, defaults to `None`): An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if alpha == 0: raise NotImplementedError("RMSprop with alpha==0.0 is not supported!") @@ -127,8 +111,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -144,8 +126,6 @@ def __init__( centered=False, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 32-bit RMSprop optimizer. @@ -171,10 +151,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if alpha == 0: @@ -191,6 +167,4 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, ) diff --git a/bitsandbytes/optim/sgd.py b/bitsandbytes/optim/sgd.py index ec18f036c..75fc71474 100644 --- a/bitsandbytes/optim/sgd.py +++ b/bitsandbytes/optim/sgd.py @@ -17,8 +17,6 @@ def __init__( optim_bits=32, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ Base SGD optimizer. @@ -42,10 +40,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. 
- percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") @@ -59,8 +53,6 @@ def __init__( optim_bits, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -75,8 +67,6 @@ def __init__( nesterov=False, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 8-bit SGD optimizer. @@ -98,10 +88,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") @@ -115,8 +101,6 @@ def __init__( 8, args, min_8bit_size, - percentile_clipping, - block_wise, ) @@ -131,8 +115,6 @@ def __init__( nesterov=False, args=None, min_8bit_size=4096, - percentile_clipping=100, - block_wise=True, ): """ 32-bit SGD optimizer. @@ -154,10 +136,6 @@ def __init__( An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. - percentile_clipping (`int`, defaults to 100): - Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. - block_wise (`bool`, defaults to `True`): - Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") @@ -171,6 +149,4 @@ def __init__( 32, args, min_8bit_size, - percentile_clipping, - block_wise, ) diff --git a/bitsandbytes/research/__init__.py b/bitsandbytes/research/__init__.py deleted file mode 100644 index 31db4f282..000000000 --- a/bitsandbytes/research/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from . 
import nn -from .autograd._functions import ( - matmul_fp8_global, - matmul_fp8_mixed, - switchback_bnb, -) diff --git a/bitsandbytes/research/autograd/__init__.py b/bitsandbytes/research/autograd/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py deleted file mode 100644 index 1ea147a90..000000000 --- a/bitsandbytes/research/autograd/_functions.py +++ /dev/null @@ -1,396 +0,0 @@ -from functools import reduce # Required in Python 3 -import operator -from typing import Optional -import warnings - -import torch - -from bitsandbytes.autograd._functions import GlobalOutlierPooler, MatmulLtState -import bitsandbytes.functional as F - - -# math.prod not compatible with python < 3.8 -def prod(iterable): - return reduce(operator.mul, iterable, 1) - - -class MatMulFP8Mixed(torch.autograd.Function): - # forward is the same, but we added the fallback for pre-turing GPUs - # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") - - @staticmethod - def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): - # default of pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - - B_shape = B.shape - if A.shape[-1] == B_shape[0]: - return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) - - # 1. Dequantize - # 2. MatmulnN - cA, state = F.quantize_blockwise(A, code=fw_code, blocksize=bsz) - fp8A = F.dequantize_blockwise(cA, state, blocksize=bsz).to(A.dtype) - - cB, state = F.quantize(B.float(), code=fw_code) - fp8B = F.dequantize(cB, state).to(B.dtype) - - output = torch.matmul(fp8A, fp8B) - - # output is half - - # 3. Save state - ctx.fw_code = fw_code - ctx.bw_code = bw_code - ctx.bsz = bsz - ctx.bsz2 = bsz2 - ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype - - if any(ctx.needs_input_grad[:2]): - # NOTE: we send back A, and re-quant. - ctx.tensors = (A, fp8B) - else: - ctx.tensors = (None, None) - - return output - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None, None - - req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad - A, B = ctx.tensors - - grad_A, grad_B = None, None - - # TODO: Fix blocksize to be output_dim - cgrad_out, state = F.quantize_blockwise(grad_output, code=ctx.bw_code, blocksize=ctx.bsz2) - fp8out = F.dequantize_blockwise(cgrad_out, state, blocksize=ctx.bsz2).to(grad_output.dtype) - - # cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) - # fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) - - # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') - # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose - # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) - - # not supported by PyTorch. 
TODO: create work-around - if req_gradA: - grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) - - if req_gradB: - if len(A.shape) == 3: - At = A.transpose(2, 1).contiguous() - else: - At = A.transpose(1, 0).contiguous() - # cA, state = F.quantize(At.float(), code=ctx.fw_code) - # fp8At = F.dequantize(cA, state).to(A.dtype) - grad_B = torch.matmul(At.to(grad_output.dtype), grad_output).to(B.dtype) - - return grad_A, grad_B, None, None, None, None, None - - -class MatMulFP8Global(torch.autograd.Function): - # forward is the same, but we added the fallback for pre-turing GPUs - # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") - - @staticmethod - def forward(ctx, A, B, out=None, fw_code=None, bw_code=None, bsz=1024, bsz2=1024): - # default of pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - - B_shape = B.shape - if A.shape[-1] == B_shape[0]: - return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) - - # 1. Dequantize - # 2. MatmulnN - cA, state = F.quantize(A.float(), code=fw_code) - fp8A = F.dequantize(cA, state).to(A.dtype) - - cB, state = F.quantize(B.float(), code=fw_code) - fp8B = F.dequantize(cB, state).to(B.dtype) - - output = torch.matmul(fp8A, fp8B) - - # output is half - - # 3. Save state - ctx.fw_code = fw_code - ctx.bw_code = bw_code - ctx.bsz = bsz - ctx.bsz2 = bsz2 - ctx.dtype_A, ctx.dtype_B = A.dtype, B.dtype - - if any(ctx.needs_input_grad[:2]): - # NOTE: we send back A, and re-quant. - ctx.tensors = (A, fp8B) - else: - ctx.tensors = (None, None) - - return output - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None, None, None, None - - req_gradA, req_gradB, _, _, _, _, _ = ctx.needs_input_grad - A, B = ctx.tensors - - grad_A, grad_B = None, None - - # TODO: Fix blocksize to be output_dim - cgrad_out, state = F.quantize(grad_output.float(), code=ctx.bw_code) - fp8out = F.dequantize(cgrad_out, state).to(grad_output.dtype) - - # cgrad_output_2, state_2 = F.quantize(grad_output.float(), code=ctx.bw_code) - # fp8out_2 = F.dequantize(cgrad_output_2, state_2).to(grad_output.dtype) - - # grad_output_reshape = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - # fp8grad_transpose, stategrad_transpose = F.vectorwise_quant(grad_output_reshape, dim=0, quant_type='vector') - # fp8out_transpose = (fp8grad_transpose / 7) * stategrad_transpose - # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) - - # not supported by PyTorch. 
TODO: create work-around - if req_gradA: - grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) - - if req_gradB: - if len(A.shape) == 3: - At = A.transpose(2, 1).contiguous() - else: - At = A.transpose(1, 0).contiguous() - cA, state = F.quantize(At.float(), code=ctx.fw_code) - fp8At = F.dequantize(cA, state).to(A.dtype) - grad_B = torch.matmul(fp8At.to(fp8out.dtype), fp8out).to(B.dtype) - - return grad_A, grad_B, None, None, None, None, None - - -class SwitchBackBnb(torch.autograd.Function): - @staticmethod - def forward(ctx, A, B, out=None, bias=None, state: Optional[MatmulLtState] = None): - state = state or MatmulLtState() - - # default to pytorch behavior if inputs are empty - ctx.is_empty = False - if prod(A.shape) == 0: - ctx.is_empty = True - ctx.A = A - ctx.B = B - ctx.bias = bias - if A.shape[-1] == B.shape[0]: - return torch.empty(A.shape[:-1] + B.shape[1:], dtype=A.dtype, device=A.device) - else: - return torch.empty(A.shape[:-1] + B.shape[:1], dtype=A.dtype, device=A.device) - - # 1. Quantize A - # 2. Quantize B - # 3. Matmul - # 4. Mixed-precision decomposition matmul - # 5. Save state - input_shape = A.shape - if state.outlier_pool is None: - state.outlier_pool = GlobalOutlierPooler.get_instance() - - # Cast A to fp16 - if A.dtype != torch.float16: - warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization") - - # 1. Quantize A - if len(A.shape) == 3: - A = A.view(-1, A.shape[-1]).contiguous() - CA, CAt, SCA, SCAt, outlier_cols = F.int8_double_quant(A.to(torch.float16), threshold=state.threshold) - - if state.threshold > 0.0 and outlier_cols is not None: - if state.has_fp16_weights: - idx = outlier_cols - CA[:, idx] = 0 - subA = A[:, idx] - state.subB = B[:, idx].t().contiguous() - state.idx = idx - else: - if state.SB is None: - state.SB = (state.CB.shape, "row") - else: - if not state.has_fp16_weights and state.SB is None: - state.SB = (state.CB.shape, "row") - subA = None - - # 2. Quantize B - if state.has_fp16_weights: - # print('B shape', B.shape) - has_grad = getattr(B, "grad", None) is not None - is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1) - if is_transposed: - B = B.contiguous() - - if (state.is_training and not has_grad) or state.SB is None: - state.reset_grads() - ( - state.CB, - state.CBt, - state.SCB, - state.SCBt, - _, - ) = F.int8_double_quant(B.to(torch.float16)) - state.SB = (state.CB.shape, "row") - else: - has_grad = False - - if outlier_cols is not None and not state.has_fp16_weights: - # extract outliers - state.idx = outlier_cols - outliers = state.CB[:, state.idx.long()].clone() - state.subB = (outliers * state.SCB.view(-1, 1) / 127.0).t().contiguous().to(A.dtype) - CA[:, state.idx.long()] = 0 - - subA = A[:, state.idx.long()] - - shapeB = state.SB[0] - - if len(input_shape) == 3: - output_shape = (input_shape[0], input_shape[1], shapeB[0]) - else: - output_shape = (input_shape[0], shapeB[0]) - - # 3. Matmul - out32 = F.int8_linear_matmul(CA, state.CB) - # we apply the fused bias here - - if bias is None or bias.dtype == torch.float16: - output = F.int8_mm_dequant(out32, SCA, state.SCB, bias=bias).to(A.dtype) - else: # apply bias separately - output = F.int8_mm_dequant(out32, SCA, state.SCB, bias=None).to(A.dtype) - output.add_(bias) - - # 4. Mixed-precision decomposition matmul - if outlier_cols is not None and subA is not None: - output += torch.matmul(subA, state.subB) - - # 5. 
Save state - ctx.state = state - - ctx.grad_shape = input_shape - ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype - - if any(ctx.needs_input_grad[:2]): - ctx.tensors = (CAt, subA, A) - ctx.tensor_states = (SCAt, state.idx) - else: - ctx.tensors = [None, None, None] - ctx.tensor_states = (None, None) - ctx.save_for_backward(None, None) - - clone_func = torch.clone if len(output_shape) == 3 else lambda x: x - return clone_func(output.view(output_shape)) - - @staticmethod - def backward(ctx, grad_output): - if ctx.is_empty: - bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias) - return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None - - req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad - _CAt, _subA, A = ctx.tensors - _SCAt, _idx = ctx.tensor_states - state = ctx.state - grad_A = grad_B = grad_bias = None - - if req_gradBias: - # compute grad_bias first before changing grad_output dtype - grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias) - - # Cast grad_output to fp16 - if len(grad_output.shape) == 3: - grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() - - _Cgrad, _Cgradt, _SCgrad, _SCgradt, _outlier_cols = F.int8_double_quant(grad_output.to(torch.float16)) - - if req_gradB: - # print('back A shape', A.shape) - # print('grad output t shape', grad_output.t().shape) - grad_B = torch.matmul(grad_output.t(), A) - - if req_gradA: - if state.CB is not None: - CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0)) - grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A) - else: - raise Exception("State must contain either CBt or CB matrix for backward") - - return grad_A, grad_B, None, grad_bias, None - - -def get_block_sizes(input_matrix, weight_matrix): - input_features = input_matrix.shape[-1] - output_features = weight_matrix.shape[0] if weight_matrix.shape[1] == input_features else weight_matrix.shape[1] - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - bsz, bsz2 = 1024, 1024 - for i, k in enumerate(array): - if input_features > array[i + 1]: - bsz = k - break - for i, k in enumerate(array): - if output_features > array[i + 1]: - bsz2 = k - break - - return bsz, bsz2 - - -def matmul_fp8_global( - A: torch.Tensor, - B: torch.Tensor, - fw_code: torch.Tensor, - bw_code: torch.Tensor, - out: Optional[torch.Tensor] = None, - bsz: int = -1, - bsz2: int = -1, -): - if bsz == -1 or bsz2 == -1: - bsz, bsz2 = get_block_sizes(A, B) - return MatMulFP8Global.apply(A, B, out, fw_code, bw_code, bsz, bsz2) - - -def matmul_fp8_mixed( - A: torch.Tensor, - B: torch.Tensor, - fw_code: torch.Tensor, - bw_code: torch.Tensor, - out: Optional[torch.Tensor] = None, - bsz: int = -1, - bsz2: int = -1, -): - if bsz == -1 or bsz2 == -1: - bsz, bsz2 = get_block_sizes(A, B) - return MatMulFP8Mixed.apply(A, B, out, fw_code, bw_code, bsz, bsz2) - - -def switchback_bnb( - A: torch.Tensor, - B: torch.Tensor, - out: Optional[torch.Tensor] = None, - state: Optional[MatmulLtState] = None, - threshold=0.0, - bias=None, -): - state = state or MatmulLtState() - if threshold > 0.0: - state.threshold = threshold - return SwitchBackBnb.apply(A, B, out, bias, state) diff --git a/bitsandbytes/research/nn/__init__.py b/bitsandbytes/research/nn/__init__.py deleted file mode 100644 index 417011218..000000000 --- a/bitsandbytes/research/nn/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .modules import LinearFP8Global, LinearFP8Mixed diff --git 
a/bitsandbytes/research/nn/modules.py b/bitsandbytes/research/nn/modules.py deleted file mode 100644 index 57c0f3358..000000000 --- a/bitsandbytes/research/nn/modules.py +++ /dev/null @@ -1,76 +0,0 @@ -from typing import TypeVar - -import torch -from torch import nn - -import bitsandbytes as bnb - -T = TypeVar("T", bound="torch.nn.Module") - - -class LinearFP8Mixed(nn.Linear): - def __init__(self, input_features, output_features, bias=True): - super().__init__(input_features, output_features, bias) - self.bw_code = None - self.fw_code = None - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - for i, k in enumerate(array): - if input_features > array[i + 1]: - self.bsz = k - break - for i, k in enumerate(array): - if output_features > array[i + 1]: - self.bsz2 = k - break - - def forward(self, x: torch.Tensor): - if self.fw_code is None: - self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) - self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - - out = bnb.research.matmul_fp8_mixed( - x, - self.weight.t(), - fw_code=self.fw_code, - bw_code=self.bw_code, - bsz=self.bsz, - bsz2=self.bsz2, - ) - if self.bias is not None: - out += self.bias - - return out - - -class LinearFP8Global(nn.Linear): - def __init__(self, input_features, output_features, bias=True): - super().__init__(input_features, output_features, bias) - self.bw_code = None - self.fw_code = None - array = [4096, 2048, 1024, 512, 256, 128, 64, 0] - for i, k in enumerate(array): - if input_features > array[i + 1]: - self.bsz = k - break - for i, k in enumerate(array): - if output_features > array[i + 1]: - self.bsz2 = k - break - - def forward(self, x: torch.Tensor): - if self.fw_code is None: - self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) - self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - - out = bnb.matmul_fp8_global( - x, - self.weight.t(), - fw_code=self.fw_code, - bw_code=self.bw_code, - bsz=self.bsz, - bsz2=self.bsz2, - ) - if self.bias is not None: - out += self.bias - - return out diff --git a/bitsandbytes/triton/__init__.py b/bitsandbytes/triton/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/bitsandbytes/triton/dequantize_rowwise.py b/bitsandbytes/triton/dequantize_rowwise.py deleted file mode 100644 index 26eab84f2..000000000 --- a/bitsandbytes/triton/dequantize_rowwise.py +++ /dev/null @@ -1,64 +0,0 @@ -import math - -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): - return None -else: - import triton - import triton.language as tl - - # rowwise quantize - - # TODO: autotune this better. 
- @triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=["n_elements"], - ) - @triton.jit - def _dequantize_rowwise( - x_ptr, - state_x, - output_ptr, - inv_127, - n_elements, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, - ): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - arange = tl.arange(0, P2) - offsets = block_start + arange - row_mask = arange < BLOCK_SIZE - x = tl.load(x_ptr + offsets, mask=row_mask) - max_val = tl.load(state_x + pid) - output = max_val * x * inv_127 - tl.store(output_ptr + offsets, output, mask=row_mask) - - def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): - output = torch.empty(*x.shape, device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (x.shape[0],) - _dequantize_rowwise[grid](x, state_x, output, 1.0 / 127, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) - return output diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py deleted file mode 100644 index 5fcb927d4..000000000 --- a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py +++ /dev/null @@ -1,206 +0,0 @@ -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias): - return None -else: - import triton - import triton.language as tl - - from .matmul_perf_model import early_config_prune, estimate_matmul_time - - # This is a matmul kernel based on triton.ops.matmul - # It is modified to support rowwise quantized input and global quantized weight - # It's purpose is fused matmul then dequantize - # It does support bias. 
- - def init_to_zero(name): - return lambda nargs: nargs[name].zero_() - - def get_configs_io_bound(): - configs = [] - for num_stages in [2, 3, 4, 5, 6]: - for block_m in [16, 32]: - for block_k in [32, 64]: - for block_n in [32, 64, 128, 256]: - num_warps = 2 if block_n <= 64 else 4 - configs.append( - triton.Config( - {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": 1}, - num_stages=num_stages, - num_warps=num_warps, - ), - ) - # split_k - for split_k in [2, 4, 8, 16]: - configs.append( - triton.Config( - {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": split_k}, - num_stages=num_stages, - num_warps=num_warps, - pre_hook=init_to_zero("C"), - ), - ) - return configs - - @triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2), - *get_configs_io_bound(), - ], - key=["M", "N", "K"], - prune_configs_by={"early_config_prune": early_config_prune, "perf_model": estimate_matmul_time, "top_k": 10}, - ) - @triton.heuristics( - { - "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0, - }, - ) - @triton.jit - def _int8_matmul_mixed_dequantize( - A, - B, - C, - bias, - state_x_ptr, - state_w_ptr, - M, - N, - K, - divfactor: tl.constexpr, - has_bias: tl.constexpr, - stride_am, - stride_ak, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, - BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, - SPLIT_K: tl.constexpr, - EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr, - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, 
BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better L2 performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - - w_factor = tl.load(state_w_ptr) - x_factor = tl.load(state_x_ptr + ram)[:, None] - - # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.0) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.0) - acc += tl.dot(a, b) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - - acc = w_factor * (x_factor * (acc * divfactor)) - acc = acc.to(C.dtype.element_ty) - - # conditionally add bias - if has_bias: - bias = tl.load(bias + rn).to(C.dtype.element_ty) - acc = acc + bias[None, :] - - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) - - def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias): - device = a.device - divfactor = 1.0 / (127.0 * 127.0) - has_bias = 0 if bias is None else 1 - # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=torch.float16) - # accumulator types - ACC_TYPE = tl.float32 # if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch int8_matmul_mixed_dequantize kernel - grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), META["SPLIT_K"]) - _int8_matmul_mixed_dequantize[grid]( - a, - b, - c, - bias, - state_x, - state_w, - M, - N, - K, - divfactor, - has_bias, - a.stride(0), - a.stride(1), - b.stride(0), - b.stride(1), - c.stride(0), - c.stride(1), - GROUP_M=8, - ACC_TYPE=ACC_TYPE, - ) - return c diff --git a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py deleted file mode 100644 index 05e30a4c9..000000000 --- a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py +++ /dev/null @@ -1,207 +0,0 @@ -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): - return None -else: - import triton - import triton.language as tl - - from .matmul_perf_model import early_config_prune, 
estimate_matmul_time - - # This is a matmul kernel based on triton.ops.matmul - # It is modified to support rowwise quantized input and columnwise quantized weight - # It's purpose is fused matmul then dequantize - # It does support bias. - - def init_to_zero(name): - return lambda nargs: nargs[name].zero_() - - def get_configs_io_bound(): - configs = [] - for num_stages in [2, 3, 4, 5, 6]: - for block_m in [16, 32]: - for block_k in [32, 64]: - for block_n in [32, 64, 128, 256]: - num_warps = 2 if block_n <= 64 else 4 - configs.append( - triton.Config( - {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": 1}, - num_stages=num_stages, - num_warps=num_warps, - ), - ) - # split_k - for split_k in [2, 4, 8, 16]: - configs.append( - triton.Config( - {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": split_k}, - num_stages=num_stages, - num_warps=num_warps, - pre_hook=init_to_zero("C"), - ), - ) - return configs - - @triton.autotune( - configs=[ - # basic configs for compute-bound matmuls - triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2), - # good for int8 - triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), - triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), - triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2), - *get_configs_io_bound(), - ], - key=["M", "N", "K"], - prune_configs_by={"early_config_prune": early_config_prune, "perf_model": estimate_matmul_time, "top_k": 10}, - ) - @triton.heuristics( - { - "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0, - }, - ) - @triton.jit - def _int8_matmul_rowwise_dequantize( - A, - B, - C, - bias, - state_x_ptr, - state_w_ptr, - M, - N, - K, - divfactor, - has_bias: tl.constexpr, - stride_am, - stride_ak, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, - 
BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, - SPLIT_K: tl.constexpr, - EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr, - ): - # matrix multiplication - pid = tl.program_id(0) - pid_z = tl.program_id(1) - grid_m = tl.cdiv(M, BLOCK_M) - grid_n = tl.cdiv(N, BLOCK_N) - # re-order program ID for better L2 performance - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // (group_size) - # do matrix multiplication - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) - rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) - rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) - # pointers - A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) - B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) - - # rematerialize rm and rn to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - - w_factor = tl.load(state_w_ptr + rbn)[None, :] - x_factor = tl.load(state_x_ptr + ram)[:, None] - - # acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) - acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.int32) - for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): - if EVEN_K: - a = tl.load(A) - b = tl.load(B) - else: - k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.0) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.0) - acc += tl.dot(a, b) - A += BLOCK_K * SPLIT_K * stride_ak - B += BLOCK_K * SPLIT_K * stride_bk - - acc = w_factor * (x_factor * (acc * divfactor)) - acc = acc.to(C.dtype.element_ty) - - if has_bias: - bias = tl.load(bias + rn).to(C.dtype.element_ty) - acc = acc + bias[None, :] - - C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - # handles write-back with reduction-splitting - if SPLIT_K == 1: - tl.store(C, acc, mask=mask) - else: - tl.atomic_add(C, acc, mask=mask) - - def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): - divfactor = 1.0 / (127.0 * 127.0) - - has_bias = 0 if bias is None else 1 - - device = a.device - # handle non-contiguous inputs if necessary - if a.stride(0) > 1 and a.stride(1) > 1: - a = a.contiguous() - if b.stride(0) > 1 and b.stride(1) > 1: - b = b.contiguous() - # checks constraints - assert a.shape[1] == b.shape[0], "incompatible dimensions" - M, K = a.shape - _, N = b.shape - # allocates output - c = torch.empty((M, N), device=device, dtype=torch.float16) - # accumulator types - ACC_TYPE = tl.float32 # if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 - # launch int8_matmul_rowwise_dequantize kernel - grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), META["SPLIT_K"]) - _int8_matmul_rowwise_dequantize[grid]( - a, - b, - c, - bias, - state_x, - state_w, - M, - N, - K, - divfactor, - has_bias, - a.stride(0), - a.stride(1), - b.stride(0), - b.stride(1), - c.stride(0), - c.stride(1), - GROUP_M=8, - ACC_TYPE=ACC_TYPE, - ) - return c diff --git a/bitsandbytes/triton/matmul_perf_model.py b/bitsandbytes/triton/matmul_perf_model.py deleted file mode 100644 index e843a3a39..000000000 --- a/bitsandbytes/triton/matmul_perf_model.py +++ /dev/null @@ -1,211 +0,0 @@ -# Adapted from 
https://github.com/triton-lang/kernels/blob/eeeebdd8be7d13629de22d600621e6234057eed3/kernels/matmul_perf_model.py -# https://github.com/triton-lang/kernels is licensed under the MIT License. - -import functools -import heapq - -import torch - -from triton import cdiv -from triton.runtime import driver -from triton.testing import ( - get_dram_gbps, - get_max_simd_tflops, - get_max_tensorcore_tflops, - nvsmi, -) - - -@functools.lru_cache -def get_clock_rate_in_khz(): - try: - return nvsmi(["clocks.max.sm"])[0] * 1e3 - except FileNotFoundError: - import pynvml - - pynvml.nvmlInit() - handle = pynvml.nvmlDeviceGetHandleByIndex(0) - return pynvml.nvmlDeviceGetMaxClockInfo(handle, pynvml.NVML_CLOCK_SM) * 1e3 - - -def get_tensorcore_tflops(device, num_ctas, num_warps, dtype): - """return compute throughput in TOPS""" - total_warps = num_ctas * min(num_warps, 4) - num_subcores = driver.active.utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs - tflops = ( - min(num_subcores, total_warps) - / num_subcores - * get_max_tensorcore_tflops(dtype, get_clock_rate_in_khz(), device) - ) - return tflops - - -def get_simd_tflops(device, num_ctas, num_warps, dtype): - """return compute throughput in TOPS""" - total_warps = num_ctas * min(num_warps, 4) - num_subcores = driver.active.utils.get_device_properties(device)["multiprocessor_count"] * 4 # on recent GPUs - tflops = ( - min(num_subcores, total_warps) / num_subcores * get_max_simd_tflops(dtype, get_clock_rate_in_khz(), device) - ) - return tflops - - -def get_tflops(device, num_ctas, num_warps, dtype): - capability = torch.cuda.get_device_capability(device) - if capability[0] < 8 and dtype == torch.float32: - return get_simd_tflops(device, num_ctas, num_warps, dtype) - return get_tensorcore_tflops(device, num_ctas, num_warps, dtype) - - -def estimate_matmul_time( - # backend, device, - num_warps, - num_stages, # - A, - B, - C, # - M, - N, - K, # - BLOCK_M, - BLOCK_N, - BLOCK_K, - SPLIT_K, # - debug=False, - **kwargs, # -): - """return estimated running time in ms - = max(compute, loading) + store""" - device = torch.cuda.current_device() - dtype = A.dtype - dtsize = A.element_size() - - num_cta_m = cdiv(M, BLOCK_M) - num_cta_n = cdiv(N, BLOCK_N) - num_cta_k = SPLIT_K - num_ctas = num_cta_m * num_cta_n * num_cta_k - - # If the input is smaller than the block size - M, N = max(M, BLOCK_M), max(N, BLOCK_N) - - # time to compute - total_ops = 2 * M * N * K / (1024 * 1024 * 1024) # GOPS - tput = get_tflops(device, num_ctas, num_warps, dtype) - compute_ms = total_ops / tput - - # time to load data - num_sm = driver.active.utils.get_device_properties(device)["multiprocessor_count"] - active_cta_ratio = min(1, num_ctas / num_sm) - active_cta_ratio_bw1 = min(1, num_ctas / 32) # 32 active ctas are enough to saturate - active_cta_ratio_bw2 = max(min(1, (num_ctas - 32) / (108 - 32)), 0) # 32-108, remaining 5% - dram_bw = get_dram_gbps(device) * (active_cta_ratio_bw1 * 0.95 + active_cta_ratio_bw2 * 0.05) # in GB/s - l2_bw = dram_bw * 4 # rough estimation (should be 4.7 for A100?) 
- # assume 80% of (following) loads are in L2 cache - load_a_dram = M * K * dtsize * (1 + 0.2 * (num_cta_n - 1)) - load_a_l2 = M * K * dtsize * 0.8 * (num_cta_n - 1) - load_b_dram = N * K * dtsize * (1 + 0.2 * (num_cta_m - 1)) - load_b_l2 = N * K * dtsize * 0.8 * (num_cta_m - 1) - # total - total_dram = (load_a_dram + load_b_dram) / (1024 * 1024) # MB - total_l2 = (load_a_l2 + load_b_l2) / (1024 * 1024) - # loading time in ms - load_ms = total_dram / dram_bw + total_l2 / l2_bw - - # estimate storing time - store_bw = dram_bw * 0.6 # :o - store_c_dram = M * N * dtsize * SPLIT_K / (1024 * 1024) # MB - if SPLIT_K == 1: - store_ms = store_c_dram / store_bw - else: - reduce_bw = store_bw - store_ms = store_c_dram / reduce_bw - # c.zero_() - zero_ms = M * N * 2 / (1024 * 1024) / store_bw - store_ms += zero_ms - - total_time_ms = max(compute_ms, load_ms) + store_ms - if debug: - print( - f"Total time: {total_time_ms}ms, compute time: {compute_ms}ms, " - f"loading time: {load_ms}ms, store time: {store_ms}ms, " - f"Activate CTAs: {active_cta_ratio * 100}%" - ) - return total_time_ms - - -def early_config_prune(configs, named_args, **kwargs): - device = torch.cuda.current_device() - capability = torch.cuda.get_device_capability() - # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages - dtsize = named_args["A"].element_size() - dtype = named_args["A"].dtype - - # 1. make sure we have enough smem - pruned_configs = [] - for config in configs: - kw = config.kwargs - BLOCK_M, BLOCK_N, BLOCK_K, num_stages = ( - kw["BLOCK_M"], - kw["BLOCK_N"], - kw["BLOCK_K"], - config.num_stages, - ) - - max_shared_memory = driver.active.utils.get_device_properties(device)["max_shared_mem"] - required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize - if required_shared_memory <= max_shared_memory: - pruned_configs.append(config) - configs = pruned_configs - - # Some dtypes do not allow atomic_add - if dtype not in [torch.float16, torch.float32]: - configs = [config for config in configs if config.kwargs["SPLIT_K"] == 1] - - # group configs by (BLOCK_M,_N,_K, SPLIT_K, num_warps) - configs_map = {} - for config in configs: - kw = config.kwargs - BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages = ( - kw["BLOCK_M"], - kw["BLOCK_N"], - kw["BLOCK_K"], - kw["SPLIT_K"], - config.num_warps, - config.num_stages, - ) - - key = (BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps) - if key in configs_map: - configs_map[key].append((config, num_stages)) - else: - configs_map[key] = [(config, num_stages)] - - pruned_configs = [] - for k, v in configs_map.items(): - BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps = k - if capability[0] >= 8: - # compute cycles (only works for ampere GPUs) - mmas = BLOCK_M * BLOCK_N * BLOCK_K / (16 * 8 * 16) - mma_cycles = mmas / min(4, num_warps) * 8 - - ldgsts_latency = 300 # Does this matter? 
- optimal_num_stages = ldgsts_latency / mma_cycles - - # nearest stages, prefer large #stages - nearest = heapq.nsmallest( - 2, - v, - key=lambda x: ( - 10 + abs(x[1] - optimal_num_stages) - if (x[1] - optimal_num_stages) < 0 - else x[1] - optimal_num_stages - ), - ) - - for n in nearest: - pruned_configs.append(n[0]) - else: # Volta & Turing only supports num_stages <= 2 - random_config = v[0][0] - random_config.num_stages = 2 - pruned_configs.append(random_config) - return pruned_configs diff --git a/bitsandbytes/triton/quantize_columnwise_and_transpose.py b/bitsandbytes/triton/quantize_columnwise_and_transpose.py deleted file mode 100644 index b8eeffd0c..000000000 --- a/bitsandbytes/triton/quantize_columnwise_and_transpose.py +++ /dev/null @@ -1,75 +0,0 @@ -import math - -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def quantize_columnwise_and_transpose(x: torch.Tensor): - return None -else: - import triton - import triton.language as tl - - # This kernel does fused columnwise quantization and transpose. - - # TODO: autotune this better. - @triton.autotune( - configs=[ - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_stages=16), - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=16, num_warps=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=["n_elements"], - ) - @triton.jit - def _quantize_columnwise_and_transpose( - x_ptr, - output_ptr, - output_maxs, - n_elements, - M: tl.constexpr, - N: tl.constexpr, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, - ): - pid = tl.program_id(axis=0) - block_start = pid - p2_arange = tl.arange(0, P2) - p2_arange_mask = p2_arange < M - arange = p2_arange * N - offsets = block_start + arange - x = tl.load(x_ptr + offsets, mask=p2_arange_mask) - abs_x = tl.abs(x) - max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127.0 * (x / max_val)) - - new_start = pid * M - new_offsets = new_start + p2_arange - tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask) - tl.store(output_maxs + pid, max_val) - - def quantize_columnwise_and_transpose(x: torch.Tensor): - M, N = x.shape - output = torch.empty(N, M, device=x.device, dtype=torch.int8) - output_maxs = torch.empty(x.shape[1], device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(M)))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) - _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) - return output, output_maxs diff --git a/bitsandbytes/triton/quantize_global.py b/bitsandbytes/triton/quantize_global.py deleted file mode 100644 index f35bdd304..000000000 --- a/bitsandbytes/triton/quantize_global.py +++ /dev/null @@ -1,124 +0,0 @@ -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def quantize_global_transpose(input): - return None - - def quantize_global(x: torch.Tensor): - return None -else: - import triton - import triton.language as tl - - # global quantize - 
@triton.autotune( - configs=[ - triton.Config({"BLOCK_SIZE": 1024}, num_warps=4), - triton.Config({"BLOCK_SIZE": 2048}, num_stages=1), - ], - key=["n_elements"], - ) - @triton.jit - def _quantize_global( - x_ptr, - absmax_inv_ptr, - output_ptr, - n_elements, - BLOCK_SIZE: tl.constexpr, - ): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - absmax_inv = tl.load(absmax_inv_ptr) - output = tl.libdevice.llrint(127.0 * (x * absmax_inv)) - tl.store(output_ptr + offsets, output, mask=mask) - - def quantize_global(x: torch.Tensor): - absmax = x.abs().max().unsqueeze(0) - absmax_inv = 1.0 / absmax - output = torch.empty(*x.shape, device="cuda", dtype=torch.int8) - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) - _quantize_global[grid](x, absmax_inv, output, n_elements) - return output, absmax - - # global quantize and transpose - @triton.autotune( - configs=[ - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "GROUP_M": 8}, num_warps=4), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "GROUP_M": 8}, num_warps=4), - # ... - ], - key=["M", "N"], - ) - @triton.jit - def _quantize_global_transpose( - A, - absmax_inv_ptr, - B, - stride_am, - stride_an, - stride_bn, - stride_bm, - M, - N, - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, - GROUP_M: tl.constexpr, - ): - pid = tl.program_id(0) - grid_m = (M + BLOCK_M - 1) // BLOCK_M - grid_n = (N + BLOCK_N - 1) // BLOCK_N - - width = GROUP_M * grid_n - group_id = pid // width - group_size = min(grid_m - group_id * GROUP_M, GROUP_M) - pid_m = group_id * GROUP_M + (pid % group_size) - pid_n = (pid % width) // group_size - - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an) - mask = (rm < M)[:, None] & (rn < N)[None, :] - a = tl.load(A, mask=mask) - absmax_inv = tl.load(absmax_inv_ptr) - - # rematerialize to save registers - rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) - rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) - B = B + (rm[:, None] * stride_bm + rn[None, :] * stride_bn) - mask = (rm < M)[:, None] & (rn < N)[None, :] - - output = tl.libdevice.llrint(127.0 * (a * absmax_inv)) - - tl.store(B, output, mask=mask) - - def quantize_global_transpose(input): - absmax = input.abs().max().unsqueeze(0) - absmax_inv = 1.0 / absmax - M, N = input.shape - out = torch.empty(N, M, device="cuda", dtype=torch.int8) - - assert out.size(0) == N and out.size(1) == M - assert input.stride(0) == 1 or input.stride(1) == 1 - assert out.stride(0) == 1 or out.stride(1) == 1 - - grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),) - _quantize_global_transpose[grid]( - input, - absmax_inv, - out, - input.stride(0), - input.stride(1), - out.stride(0), - out.stride(1), - M, - N, - ) - return out, absmax diff --git a/bitsandbytes/triton/quantize_rowwise.py b/bitsandbytes/triton/quantize_rowwise.py deleted file mode 100644 index f92ace02c..000000000 --- a/bitsandbytes/triton/quantize_rowwise.py +++ /dev/null @@ -1,67 +0,0 @@ -import math - -import torch - -from bitsandbytes.triton.triton_utils import is_triton_available - -if not is_triton_available(): - - def quantize_rowwise(x: torch.Tensor): - return None -else: - import triton - import triton.language as tl - - # rowwise quantize - - # TODO: autotune this better. 
- @triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=["n_elements"], - ) - @triton.jit - def _quantize_rowwise( - x_ptr, - output_ptr, - output_maxs, - n_elements, - BLOCK_SIZE: tl.constexpr, - P2: tl.constexpr, - ): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - arange = tl.arange(0, P2) - offsets = block_start + arange - row_mask = arange < BLOCK_SIZE - x = tl.load(x_ptr + offsets, mask=row_mask) - - abs_x = tl.abs(x) - max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127.0 * (x / max_val)) - tl.store(output_ptr + offsets, output, mask=row_mask) - tl.store(output_maxs + pid, max_val) - - def quantize_rowwise(x: torch.Tensor): - output = torch.empty(*x.shape, device=x.device, dtype=torch.int8) - output_maxs = torch.empty(x.shape[0], device=x.device, dtype=torch.float16) - - P2 = int(2 ** (math.ceil(math.log2(x.shape[1])))) - - assert x.is_cuda and output.is_cuda - n_elements = output.numel() - grid = lambda meta: (x.shape[0],) - _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) - return output, output_maxs diff --git a/bitsandbytes/triton/triton_utils.py b/bitsandbytes/triton/triton_utils.py deleted file mode 100644 index f6bedd8cd..000000000 --- a/bitsandbytes/triton/triton_utils.py +++ /dev/null @@ -1,11 +0,0 @@ -import functools - - -@functools.lru_cache(None) -def is_triton_available(): - try: - from torch.utils._triton import has_triton, has_triton_package - - return has_triton_package() and has_triton() - except Exception: - return False diff --git a/docs/source/optimizers.mdx b/docs/source/optimizers.mdx index 7d04f82b1..3e5f6a2aa 100644 --- a/docs/source/optimizers.mdx +++ b/docs/source/optimizers.mdx @@ -30,12 +30,12 @@ import bitsandbytes as bnb adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384) ``` -Other parameters you can configure include the learning rate (`lr`), the decay rates (`betas`), the number of bits of the optimizer state (`optim_bits`), and percentile clipping (`percentile_clipping`) which can increase stability. For example, to initialize a 32-bit [`~bitsandbytes.optim.Adam`] optimizer with 5th percentile clipping: +Other parameters you can configure include the learning rate (`lr`), the decay rates (`betas`), and the number of bits of the optimizer state (`optim_bits`). 
For example, to initialize a 32-bit [`~bitsandbytes.optim.Adam`] optimizer: ```py import bitsandbytes as bnb -adam = bnb.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995), optim_bits=32, percentile_clipping=5) +adam = bnb.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995), optim_bits=32) ``` ## Optimize unstable parameters diff --git a/pyproject.toml b/pyproject.toml index f448a079e..21e11261b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,7 @@ exclude_also = [ ] [tool.pytest.ini_options] -addopts = "-rP -m 'not slow and not benchmark and not deprecated'" +addopts = "-rP -m 'not slow and not benchmark'" # ; --cov=bitsandbytes # ; # contexts: record which test ran which line; can be seen in html coverage report # ; --cov-context=test @@ -98,7 +98,6 @@ log_cli_level = "INFO" log_file = "logs/pytest.log" markers = [ "benchmark: mark test as a benchmark", - "deprecated: mark test as covering a deprecated feature", "slow: mark test as slow", ] diff --git a/tests/test_autograd.py b/tests/test_autograd.py index 7134925c1..b942e6ab7 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -22,8 +22,8 @@ @pytest.mark.parametrize("decomp", [0.0, 6.0], ids=id_formatter("decomp")) @pytest.mark.parametrize( "funcs", - [(torch.matmul, bnb.matmul), (torch.matmul, bnb.research.switchback_bnb)], - ids=["func=matmul", "func=switchback_bnb"], + [(torch.matmul, bnb.matmul)], + ids=["func=matmul"], ) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) @@ -34,10 +34,6 @@ def test_matmullt( device, dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, decomp, has_fp16_weights, has_bias ): if device != "cuda": - if funcs[1] == bnb.research.switchback_bnb: - # TODO: Deprecate/remove? - pytest.skip("switchback_bnb only works on CUDA.") - if req_grad[1]: # This will be deprecated for CUDA in the future. We don't expect # this to work on any other device. 
diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py deleted file mode 100644 index feb17c966..000000000 --- a/tests/test_deprecated.py +++ /dev/null @@ -1,175 +0,0 @@ -import pytest -import torch - -import bitsandbytes as bnb -from bitsandbytes import functional as F -from tests.helpers import BOOLEAN_TRIPLES, describe_dtype, get_test_dims, id_formatter -from tests.test_autograd import TRANSPOSE_VALS - - -@pytest.mark.deprecated -def test_dynamic_quantization(): - diffs = [] - reldiffs = [] - for i in range(100): - A1 = torch.randn(1024, 1024, device="cuda") - C, S = F.quantize(A1) - A2 = F.dequantize(C, S) - diff = torch.abs(A1 - A2) - reldiff = diff / torch.abs(A1 + 1e-8) - diffs.append(diff.mean().item()) - reldiffs.append(reldiff.mean().item()) - assert diff.mean().item() < 0.0135 - print(sum(diffs) / len(diffs)) - print(sum(reldiffs) / len(reldiffs)) - - for i in range(100): - A1 = torch.rand(1024, 1024, device="cuda") - C, S = F.quantize(A1) - A2 = F.dequantize(C, S) - diff = torch.abs(A1 - A2).mean().item() - torch.testing.assert_close(A1, A2, atol=1e-2, rtol=0) - assert diff < 0.004 - - -@pytest.mark.parametrize("gtype", [torch.float32, torch.float16], ids=["float", "half"]) -@pytest.mark.deprecated -def test_percentile_clipping(gtype): - gnorm_vec1 = torch.zeros(100, device="cuda") - gnorm_vec2 = torch.zeros(100, device="cuda") - n = 4 - step = 0 - percentile = 5 - for i in range(20): - step += 1 - g = torch.randn(n, n, dtype=gtype, device="cuda") - gnorm1, clip2, gnorm_scale = F.percentile_clipping(g, gnorm_vec2, step, percentile=percentile) - assert gnorm_scale == 1.0 if gnorm1 < clip2 else clip2 / gnorm1 - - gnorm2 = torch.norm(g.float()) - if step == 1: - gnorm_vec1[:] = gnorm2 - else: - gnorm_vec1[step % 100] = gnorm2 - - vals, _ = torch.sort(gnorm_vec1) - clip1 = vals[percentile] - - torch.testing.assert_close(gnorm_vec1, torch.sqrt(gnorm_vec2)) - torch.testing.assert_close(clip1, clip2) - torch.testing.assert_close(gnorm1, gnorm2) - - -@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) -@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) -@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) -@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) -@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) -@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) -@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype) -@pytest.mark.parametrize( - "funcs", - [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)], - ids=["matmul_fp8_mixed", "matmul_fp8_global"], -) -@pytest.mark.deprecated -@pytest.mark.skip("Deprecated functionality, to be removed.") -def test_matmul_fp8(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): - dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) - dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) - req_grad = list(req_grad) - req_grad[2] = False - - for i in range(3): - # normal multiply - if funcs[0] in [torch.mm, torch.matmul]: - A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype) - B = torch.randn(size=dimB, device="cuda", requires_grad=req_grad[1], dtype=dtype) - target = torch.randn(size=(dim2, dim4), device="cuda", requires_grad=req_grad[1], dtype=dtype) - - torch.nn.init.xavier_uniform_(B) - - fw_code = 
bnb.functional.create_fp8_map(True, 4, 3, 8).to(A.device) - bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(A.device) - - if not transpose[0] and transpose[1]: - out_torch = funcs[0](A, B.t()) - out_bnb = funcs[1](A, B.t(), fw_code, bw_code) - elif not transpose[0] and not transpose[1]: - out_torch = funcs[0](A, B) - out_bnb = funcs[1](A, B, fw_code, bw_code) - - assert out_bnb.dtype == A.dtype, f"bnb matmullt received {A.dtype} but returned {out_bnb.dtype}" - - n = out_bnb.numel() - err = torch.abs(out_bnb - out_torch).float().mean().item() - if n > 0: - assert err < 0.115 - # assert err < 0.20 - if any(req_grad): - out_bnb.data.copy_(out_torch) - torch.cuda.synchronize() - loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean() - loss_bnb.backward() - gradA1 = A.grad - gradB1 = B.grad - A.grad = None - B.grad = None - - loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean() - loss_torch.backward() - gradA2 = A.grad - gradB2 = B.grad - A.grad = None - B.grad = None - - if req_grad[0]: - torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1) - - if req_grad[1]: - n = gradB1.numel() - if dim2 > 0: - assert torch.abs(gradB1).sum() > 0.0 - assert torch.abs(gradB2).sum() > 0.0 - else: - assert torch.abs(gradB1).sum() == 0.0 - assert torch.abs(gradB2).sum() == 0.0 - idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3) - - assert (idx == 0).sum().item() <= n * 0.1 - idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3) - assert (idx == 0).sum().item() <= n * 0.02 - grad_err = (gradB1 - gradB2).abs().mean() - assert grad_err.item() < 0.003 - torch.testing.assert_close(gradB1, gradB2, atol=0.18, rtol=0.3) - - -@pytest.mark.deprecated -def test_fp8linear(): - b = 10 - h = 1024 - inp = torch.randn(b, h).cuda() - fp32 = torch.nn.Linear(h, h * 2).cuda() - fp8 = bnb.research.nn.LinearFP8Mixed(h, h * 2).cuda() - fp32b = torch.nn.Linear(h * 2, h).cuda() - fp8b = bnb.research.nn.LinearFP8Mixed(h * 2, h).cuda() - - fp8.weight.data.copy_(fp32.weight.data) - fp8.bias.data.copy_(fp32.bias.data) - fp8b.weight.data.copy_(fp32b.weight.data) - fp8b.bias.data.copy_(fp32b.bias.data) - - a = fp32b(torch.nn.functional.gelu(fp32(inp))) - b = fp8b(torch.nn.functional.gelu(fp8(inp))) - - err = (a - b).abs().mean() - - a.mean().backward() - b.mean().backward() - - graderr = (fp8.weight.grad - fp32.weight.grad).abs().mean() - bgraderr = (fp8.bias.grad - fp32.bias.grad).abs().mean() - - assert err < 0.05 - assert graderr < 0.00002 - assert bgraderr < 0.00002 diff --git a/tests/test_functional.py b/tests/test_functional.py index d2e3f0847..4670847ff 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -706,7 +706,6 @@ def test_dequant_mm(self, device, dim1, dim4, dims, has_bias): @pytest.mark.parametrize("dim1", [2048, 4096], ids=id_formatter("dim1")) @pytest.mark.parametrize("dim2", [512, 1024], ids=id_formatter("dim2")) - @pytest.mark.deprecated def test_int8_double_quant(self, dim1, dim2): for i in range(k): A = torch.randn(dim1, dim2, device="cuda").half() diff --git a/tests/test_optim.py b/tests/test_optim.py index 190d9a206..6216639b0 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -49,16 +49,16 @@ def rm_path(path): ) str2optimizers["adam"] = (torch.optim.Adam, bnb.optim.Adam) -str2optimizers["adam8bit_blockwise"] = (torch.optim.Adam, lambda pxx: bnb.optim.Adam8bit(pxx, block_wise=True)) +str2optimizers["adam8bit_blockwise"] = (torch.optim.Adam, lambda pxx: bnb.optim.Adam8bit(pxx)) str2optimizers["paged_adam"] = (torch.optim.Adam, 
bnb.optim.PagedAdam) str2optimizers["paged_adamw"] = (torch.optim.AdamW, bnb.optim.PagedAdamW) str2optimizers["paged_adam8bit_blockwise"] = ( torch.optim.Adam, - lambda pxx: bnb.optim.PagedAdam8bit(pxx, block_wise=True), + lambda pxx: bnb.optim.PagedAdam8bit(pxx), ) str2optimizers["paged_adamw8bit_blockwise"] = ( torch.optim.AdamW, - lambda pxx: bnb.optim.PagedAdamW8bit(pxx, block_wise=True), + lambda pxx: bnb.optim.PagedAdamW8bit(pxx), ) str2optimizers["ademamix"] = (bnb.optim.ademamix._ReferenceAdEMAMix, bnb.optim.AdEMAMix) @@ -90,25 +90,25 @@ def rm_path(path): str2optimizers["lion"] = (Lion, bnb.optim.Lion) str2optimizers["paged_lion"] = (Lion, bnb.optim.PagedLion) -str2optimizers["lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.Lion8bit(pxx, block_wise=True)) -str2optimizers["paged_lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.PagedLion8bit(pxx, block_wise=True)) +str2optimizers["lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.Lion8bit(pxx)) +str2optimizers["paged_lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.PagedLion8bit(pxx)) str2optimizers["momentum"] = ( lambda pxx: torch.optim.SGD(pxx, 0.01, 0.9), - lambda pxx: bnb.optim.SGD(pxx, 0.01, 0.9, block_wise=False), + lambda pxx: bnb.optim.SGD(pxx, 0.01, 0.9), ) str2optimizers["momentum8bit_blockwise"] = ( lambda pxx: torch.optim.SGD(pxx, 0.01, 0.9), - lambda pxx: bnb.optim.SGD8bit(pxx, 0.01, 0.9, block_wise=True), + lambda pxx: bnb.optim.SGD8bit(pxx, 0.01, 0.9), ) str2optimizers["rmsprop"] = ( lambda pxx: torch.optim.RMSprop(pxx, 0.01, 0.9), - lambda pxx: bnb.optim.RMSprop(pxx, 0.01, 0.9, block_wise=False), + lambda pxx: bnb.optim.RMSprop(pxx, 0.01, 0.9), ) str2optimizers["rmsprop8bit_blockwise"] = ( lambda pxx: torch.optim.RMSprop(pxx, 0.01, 0.9), - lambda pxx: bnb.optim.RMSprop8bit(pxx, 0.01, 0.9, block_wise=True), + lambda pxx: bnb.optim.RMSprop8bit(pxx, 0.01, 0.9), ) str2statenames = {} @@ -462,94 +462,6 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name, device): torch_optimizer.state[p1][name1].copy_(s.data) -@pytest.mark.parametrize("optim_bits", [32, 8], ids=id_formatter("optim_bits")) -@pytest.mark.parametrize("gtype", [torch.float32], ids=describe_dtype) -@pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2")) -@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1")) -@pytest.mark.deprecated -def test_adam_percentile_clipping(requires_cuda, dim1, dim2, gtype, optim_bits): - if dim1 == 1 and dim2 == 1: - return - p1 = torch.randn(dim1, dim2, device="cpu", dtype=gtype) * 0.1 - beta1 = 0.9 - beta2 = 0.999 - lr = 0.001 - eps = 1e-8 - p1 = p1.cuda() - p2 = p1.clone() - adam1 = bnb.optim.Adam([p1], lr, (beta1, beta2), eps, optim_bits=optim_bits) - adam2 = bnb.optim.Adam( - [p2], - lr, - (beta1, beta2), - eps, - optim_bits=optim_bits, - percentile_clipping=5, - ) - - gnorm_vec = torch.zeros(100).cuda() - step = 0 - - for i in range(50): - step += 1 - g1 = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.1 + (0.01 * i) - g2 = g1.clone() - p2.grad = g2 - - _current_gnorm, _clip_val, gnorm_scale = F.percentile_clipping(g1, gnorm_vec, step, 5) - g1 = (g1.float() * gnorm_scale).to(gtype) - p1.grad = g1 - - adam1.step() - adam2.step() - - # gnorm_scale is not deterministic (warp reductions), as such there can be slight differences in state - if optim_bits == 32: - torch.testing.assert_close(p1, p2) - torch.testing.assert_close( - adam1.state[p1]["state1"], - adam2.state[p2]["state1"], - atol=5e-5, - rtol=1e-4, - ) - torch.testing.assert_close( - adam1.state[p1]["state2"], - 
adam2.state[p2]["state2"], - atol=5e-5, - rtol=1e-4, - ) - elif optim_bits == 8: - torch.testing.assert_close(p1, p2, atol=1e-4, rtol=1e-3) - torch.testing.assert_close( - adam1.state[p1]["state1"], - adam2.state[p2]["state1"], - atol=2, - rtol=1e-3, - ) - torch.testing.assert_close( - adam1.state[p1]["state2"], - adam2.state[p2]["state2"], - atol=2, - rtol=1e-3, - ) - adam1.state[p1]["state1"].copy_(adam2.state[p2]["state1"]) - adam1.state[p1]["state2"].copy_(adam2.state[p2]["state2"]) - if i % 10 == 0 and i > 0: - path = get_temp_dir() - torch.save(adam2.state_dict(), join(path, "opt.pt")) - del adam2 - adam2 = None - adam2 = bnb.optim.Adam( - [p2], - lr, - (beta1, beta2), - eps, - optim_bits=optim_bits, - percentile_clipping=5, - ) - adam2.load_state_dict(torch.load(join(path, "opt.pt"))) - - optimizer_names_benchmark = [ "adam8bit_blockwise", "paged_adam8bit_blockwise", diff --git a/tests/test_triton.py b/tests/test_triton.py deleted file mode 100644 index b245e534a..000000000 --- a/tests/test_triton.py +++ /dev/null @@ -1,64 +0,0 @@ -import pytest -import torch - -from bitsandbytes.nn import Linear8bitLt -from bitsandbytes.nn.triton_based_modules import SwitchBackLinear -from bitsandbytes.triton.triton_utils import is_triton_available -from tests.helpers import TRUE_FALSE - - -@pytest.mark.skipif( - not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8, - reason="This test requires triton and a GPU with compute capability 8.0 or higher.", -) -@pytest.mark.deprecated -@pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE) -def test_switchback(vector_wise_quantization): - for dim in [83]: - for batch in [13]: - standard = torch.nn.Linear(dim, 4 * dim).cuda().half() - switchback = ( - SwitchBackLinear(dim, 4 * dim, vector_wise_quantization=vector_wise_quantization).cuda().half() - ) - baseline = Linear8bitLt(dim, 4 * dim).cuda().half() - switchback.weight.data.copy_(standard.weight) - switchback.bias.data.copy_(standard.bias) - baseline.weight.data.copy_(standard.weight) - baseline.bias.data.copy_(standard.bias) - - x1 = torch.randn(batch, dim).cuda().half().requires_grad_(True) - x2 = x1.clone().detach().requires_grad_(True) - x3 = x1.clone().detach().requires_grad_(True) - - out_standard = standard(x1) - (2**10 * out_standard.abs().mean()).backward() - - print(x2.dtype) - out_sb = switchback(x2) - (2**10 * out_sb.abs().mean()).backward() - - out_baseline = baseline(x3) - (2**10 * out_baseline.abs().mean()).backward() - - err_sb = (out_standard - out_sb).abs().mean() - err_baseline = (out_standard - out_baseline).abs().mean() - print("OUT", err_sb, err_baseline) - assert err_sb < 2 * err_baseline - - err_sb = (standard.bias.grad - switchback.bias.grad).abs().mean() - err_baseline = (standard.bias.grad - baseline.bias.grad).abs().mean() - - print("GW2", err_sb, err_baseline) - assert err_sb < 2 * err_baseline - - err_sb = (standard.weight.grad - switchback.weight.grad).abs().mean() - err_baseline = (standard.weight.grad - baseline.weight.grad).abs().mean() - - print("GW1", err_sb, err_baseline) - assert err_sb < 2 * err_baseline - - err_sb = (x1.grad - x2.grad).abs().mean() - err_baseline = (x1.grad - x3.grad).abs().mean() - - print("GX1", err_sb, err_baseline) - assert err_sb < 2 * err_baseline From bf50ef0dabe819b8c4fc16afac23b62d6bccff4a Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 20 Feb 2026 13:09:29 -0500 Subject: [PATCH 2/2] Apply suggestions from code 
review Keep `deprecated` marker for tests --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 21e11261b..f448a079e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,7 @@ exclude_also = [ ] [tool.pytest.ini_options] -addopts = "-rP -m 'not slow and not benchmark'" +addopts = "-rP -m 'not slow and not benchmark and not deprecated'" # ; --cov=bitsandbytes # ; # contexts: record which test ran which line; can be seen in html coverage report # ; --cov-context=test @@ -98,6 +98,7 @@ log_cli_level = "INFO" log_file = "logs/pytest.log" markers = [ "benchmark: mark test as a benchmark", + "deprecated: mark test as covering a deprecated feature", "slow: mark test as slow", ]