From 0066653c29a82f93d8ad423c6e9376b0e61b94ec Mon Sep 17 00:00:00 2001 From: badaoui Date: Mon, 16 Feb 2026 14:58:22 +0100 Subject: [PATCH 1/6] documentation --- docs/source/quickstart.mdx | 136 +++++++++++++++++++++++++++++++++++-- 1 file changed, 130 insertions(+), 6 deletions(-) diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index ed92c896b..e6c85197f 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -1,15 +1,139 @@ # Quickstart -## How does it work? +Welcome to bitsandbytes! This library enables accessible large language models via k-bit quantization for PyTorch, dramatically reducing memory consumption for inference and training. -... work in progress ... +## Installation -(Community contributions would we very welcome!) +```bash +pip install bitsandbytes +``` + +**Requirements:** Python 3.10+, PyTorch 2.3+ + +For detailed installation instructions, see the [Installation Guide](./installation). + +## What is bitsandbytes? + +bitsandbytes provides three main features: + +- **LLM.int8()**: 8-bit quantization for inference (50% memory reduction) +- **QLoRA**: 4-bit quantization for training (75% memory reduction) +- **8-bit Optimizers**: Memory-efficient optimizers for training + +## Quick Examples + +### 8-bit Inference + +Load and run a model using 8-bit quantization: + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", + device_map="auto", + load_in_8bit=True, +) + +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") +inputs = tokenizer("Hello, my name is", return_tensors="pt").to("cuda") +outputs = model.generate(**inputs, max_new_tokens=20) +print(tokenizer.decode(outputs[0])) +``` + +### 4-bit Quantization + +For even greater memory savings: + +```py +import torch +from transformers import AutoModelForCausalLM, BitsAndBytesConfig + +bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, +) + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", + quantization_config=bnb_config, + device_map="auto", +) +``` + +### QLoRA Fine-tuning -## Minimal examples +Combine 4-bit quantization with LoRA for efficient training: -The following code illustrates the steps above. 
+```py +from transformers import AutoModelForCausalLM, BitsAndBytesConfig +from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training + +# Load 4-bit model +bnb_config = BitsAndBytesConfig(load_in_4bit=True) +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", + quantization_config=bnb_config, +) + +# Prepare for training +model = prepare_model_for_kbit_training(model) + +# Add LoRA adapters +lora_config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + task_type="CAUSAL_LM", +) +model = get_peft_model(model, lora_config) + +# Now train with your preferred trainer +``` + +### 8-bit Optimizers + +Use 8-bit optimizers to reduce training memory by 75%: + +```py +import bitsandbytes as bnb + +model = YourModel() + +# Replace standard optimizer with 8-bit version +optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-3) + +# Use in training loop as normal +for batch in dataloader: + loss = model(batch) + loss.backward() + optimizer.step() + optimizer.zero_grad() +``` + +### Custom Quantized Layers + +Use quantized linear layers directly in your models: ```py -code examples will soon follow +import bitsandbytes as bnb + +# 8-bit linear layer +linear_8bit = bnb.nn.Linear8bitLt(1024, 1024, has_fp16_weights=False) + +# 4-bit linear layer +linear_4bit = bnb.nn.Linear4bit(1024, 1024, compute_dtype=torch.bfloat16) ``` + +## Next Steps + +- [8-bit Optimizers Guide](./optimizers) - Detailed optimizer usage +- [FSDP-QLoRA](./fsdp_qlora) - Train 70B+ models on consumer GPUs +- [Integrations](./integrations) - Use with Transformers, PEFT, Accelerate +- [FAQs](./faqs) - Common questions and troubleshooting + +## Getting Help + +- Check the [FAQs](./faqs) and [Common Errors](./errors) +- Visit [official documentation](https://huggingface.co/docs/bitsandbytes) +- Open an issue on [GitHub](https://github.com/bitsandbytes-foundation/bitsandbytes/issues) From f41338ba7fbdd061483e505ca357bcd57a83fc63 Mon Sep 17 00:00:00 2001 From: badaoui Date: Mon, 16 Feb 2026 15:07:08 +0100 Subject: [PATCH 2/6] doc --- docs/source/quickstart.mdx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index e6c85197f..d95620349 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -41,6 +41,8 @@ outputs = model.generate(**inputs, max_new_tokens=20) print(tokenizer.decode(outputs[0])) ``` +> **Learn more:** See the [Integrations guide](./integrations) for more details on using bitsandbytes with Transformers. + ### 4-bit Quantization For even greater memory savings: @@ -91,6 +93,8 @@ model = get_peft_model(model, lora_config) # Now train with your preferred trainer ``` +> **Learn more:** See the [FSDP-QLoRA guide](./fsdp_qlora) for advanced training techniques and the [Integrations guide](./integrations) for using with PEFT. + ### 8-bit Optimizers Use 8-bit optimizers to reduce training memory by 75%: @@ -111,6 +115,8 @@ for batch in dataloader: optimizer.zero_grad() ``` +> **Learn more:** See the [8-bit Optimizers guide](./optimizers) for detailed usage and configuration options. 
+
 ### Custom Quantized Layers
 
 Use quantized linear layers directly in your models:

From c72f9e80859316a35cedbae0d2561daf969730c9 Mon Sep 17 00:00:00 2001
From: badaoui
Date: Mon, 16 Feb 2026 18:12:53 +0100
Subject: [PATCH 3/6] fix

---
 docs/source/quickstart.mdx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx
index d95620349..f14918a7e 100644
--- a/docs/source/quickstart.mdx
+++ b/docs/source/quickstart.mdx
@@ -122,6 +122,7 @@ for batch in dataloader:
 Use quantized linear layers directly in your models:
 
 ```py
+import torch
 import bitsandbytes as bnb
 
 # 8-bit linear layer

From 930c9baadf657d240345e9276041c3ab6324e8fd Mon Sep 17 00:00:00 2001
From: badaoui
Date: Fri, 20 Feb 2026 16:38:18 +0100
Subject: [PATCH 4/6] fix

---
 docs/source/quickstart.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx
index f14918a7e..398efb21d 100644
--- a/docs/source/quickstart.mdx
+++ b/docs/source/quickstart.mdx
@@ -27,12 +27,12 @@ bitsandbytes provides three main features:
 Load and run a model using 8-bit quantization:
 
 ```py
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 model = AutoModelForCausalLM.from_pretrained(
     "meta-llama/Llama-2-7b-hf",
     device_map="auto",
-    load_in_8bit=True,
+    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
 )
 
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

From 0df2fd39d0834f2c4aa16d5ff3ec554805654886 Mon Sep 17 00:00:00 2001
From: badaoui
Date: Fri, 20 Feb 2026 16:42:16 +0100
Subject: [PATCH 5/6] typo

---
 agents/kbit_gemm_context.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/agents/kbit_gemm_context.md b/agents/kbit_gemm_context.md
index e22921fac..45d68c9a0 100644
--- a/agents/kbit_gemm_context.md
+++ b/agents/kbit_gemm_context.md
@@ -1089,7 +1089,7 @@ void kbit_gemm(
   cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
   int max_shmem;
   cudaDeviceGetAttribute(&max_shmem,
-        cudaDevAttrMaxSharedMemoryPerBlockOption, dev);
+        cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
 
   // Choose M-blocking
   int m_blocks;

From df1084e90edf8fef55e90aa24e9c9cd8d02344f0 Mon Sep 17 00:00:00 2001
From: badaoui
Date: Fri, 20 Feb 2026 16:44:39 +0100
Subject: [PATCH 6/6] nf4 example

---
 docs/source/quickstart.mdx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx
index 398efb21d..7ce93e282 100644
--- a/docs/source/quickstart.mdx
+++ b/docs/source/quickstart.mdx
@@ -54,6 +54,7 @@ from transformers import AutoModelForCausalLM, BitsAndBytesConfig
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4",
 )
 
 model = AutoModelForCausalLM.from_pretrained(
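
With the full series applied, the quickstart's 4-bit example ends up using the NF4 configuration introduced in PATCH 6/6. The sketch below is an illustrative end-to-end check of that final state, not part of the patched docs themselves: it assumes the same gated `meta-llama/Llama-2-7b-hf` checkpoint used throughout the quickstart, a CUDA-capable GPU, and recent `transformers`/`bitsandbytes` releases, and it uses the standard Transformers helper `get_memory_footprint()` to confirm the memory saving.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit NF4 configuration, matching the quickstart example after PATCH 6/6
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

model_id = "meta-llama/Llama-2-7b-hf"  # same checkpoint as the quickstart examples
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# A 7B model quantized to NF4 should report a footprint on the order of ~4 GB
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")

# Short generation to confirm the quantized model runs end to end
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```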