From ee75911502ee93dc0ab8dfab4b60496eb50bfae3 Mon Sep 17 00:00:00 2001
From: Filippo Boni <88785623+FilippoBoni1921@users.noreply.github.com>
Date: Mon, 24 Nov 2025 14:46:33 +0100
Subject: [PATCH 1/8] add https protocol (#2)

---
 verifiers/rl/inference/client.py | 5 +++--
 verifiers/rl/trainer/config.py   | 7 +++++++
 verifiers/rl/trainer/trainer.py  | 7 ++++++-
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/verifiers/rl/inference/client.py b/verifiers/rl/inference/client.py
index 04876582e..d5e19622a 100644
--- a/verifiers/rl/inference/client.py
+++ b/verifiers/rl/inference/client.py
@@ -25,10 +25,11 @@ def __init__(
         self,
         host: str = "0.0.0.0",
         port: int = 8000,
+        protocol: str = "http",
         group_port: int = 51216,
         connection_timeout: float = 0.0,
     ):
-        super().__init__(base_url=f"http://{host}:{port}/v1", api_key="local")
+        super().__init__(base_url=f"{protocol}://{host}:{port}/v1", api_key="local")
         self.session = requests.Session()
         # configure connection pooling to handle rapid requests better
         adapter = HTTPAdapter(
@@ -39,7 +40,7 @@ def __init__(
 
         self.host = host
         self.server_port = port
-        self.server_url = f"http://{self.host}:{self.server_port}"
+        self.server_url = f"{protocol}://{self.host}:{self.server_port}"
         self.group_port = group_port
         self.check_server(connection_timeout)
 
diff --git a/verifiers/rl/trainer/config.py b/verifiers/rl/trainer/config.py
index 49c1824f6..c1803840d 100644
--- a/verifiers/rl/trainer/config.py
+++ b/verifiers/rl/trainer/config.py
@@ -188,10 +188,17 @@ class RLConfig(TrainingArguments):
         default="0.0.0.0",
         metadata={"help": "Host of the vLLM server to connect to."},
     )
+
     vllm_server_port: int = field(
         default=8000,
         metadata={"help": "Port of the vLLM server to connect to."},
     )
+
+    vllm_server_protocol: str = field(
+        default="http",
+        metadata={"help": "VLLM server protocol type (https or http)."},
+    )
+
     vllm_server_timeout: float = field(
         default=300.0,
         metadata={
diff --git a/verifiers/rl/trainer/trainer.py b/verifiers/rl/trainer/trainer.py
index b3451e8ba..0cb250bbc 100644
--- a/verifiers/rl/trainer/trainer.py
+++ b/verifiers/rl/trainer/trainer.py
@@ -87,11 +87,16 @@ def __init__(
         if self.accelerator.is_main_process:
             host = args.vllm_server_host
             port = args.vllm_server_port
+            protocol = args.vllm_server_protocol
+
+            if protocol not in ["http", "https"]:
+                raise ValueError(f"Invalid protocol '{protocol}'. Supported protocols are 'http' and 'https'.")
+
             self.client = VLLMClient(
                 host=host, port=port, connection_timeout=args.vllm_server_timeout
             )
             self.client.init_communicator()
-            vllm_base_url = f"http://{host}:{port}/v1"
+            vllm_base_url = f"{protocol}://{host}:{port}/v1"
             self.orchestrator = Orchestrator(
                 env=env,
                 client_base_url=vllm_base_url,

From 58935cd3bd318621dba653716213959c83db96aa Mon Sep 17 00:00:00 2001
From: Filippo Boni <88785623+FilippoBoni1921@users.noreply.github.com>
Date: Mon, 24 Nov 2025 17:22:39 +0100
Subject: [PATCH 2/8] fix bug (#4)

---
 verifiers/rl/trainer/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/verifiers/rl/trainer/trainer.py b/verifiers/rl/trainer/trainer.py
index 0cb250bbc..983882bbb 100644
--- a/verifiers/rl/trainer/trainer.py
+++ b/verifiers/rl/trainer/trainer.py
@@ -93,7 +93,7 @@ def __init__(
                 raise ValueError(f"Invalid protocol '{protocol}'. Supported protocols are 'http' and 'https'.")
 
             self.client = VLLMClient(
-                host=host, port=port, connection_timeout=args.vllm_server_timeout
+                host=host, port=port, protocol=protocol, connection_timeout=args.vllm_server_timeout
             )
             self.client.init_communicator()
             vllm_base_url = f"{protocol}://{host}:{port}/v1"

From 78a231cceda425cb023116e79315289d730fce2d Mon Sep 17 00:00:00 2001
From: FilippoBoni1921
Date: Fri, 26 Dec 2025 10:46:39 +0100
Subject: [PATCH 3/8] start unsloth config

---
 verifiers/rl/trainer/unsloth_config.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 verifiers/rl/trainer/unsloth_config.py

diff --git a/verifiers/rl/trainer/unsloth_config.py b/verifiers/rl/trainer/unsloth_config.py
new file mode 100644
index 000000000..e69de29bb

From 1f7286dc3a52d169454b090645e79c7babe80eb1 Mon Sep 17 00:00:00 2001
From: FilippoBoni1921
Date: Fri, 26 Dec 2025 11:54:51 +0100
Subject: [PATCH 4/8] add unsloth config

---
 pyproject.toml                      |   5 +-
 verifiers/configs/unsloth_config.py | 102 ++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 2 deletions(-)
 create mode 100644 verifiers/configs/unsloth_config.py

diff --git a/pyproject.toml b/pyproject.toml
index b46e51697..dbd25f33f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,7 @@ dependencies = [
     "textual",
     "tomli; python_version < '3.11'",
     "typing_extensions; python_version < '3.12'",
+    "unsloth>=2025.12.9",
     "wget>=3.2",
 ]
 
@@ -70,9 +71,9 @@ rl = [
     "peft",
     "wandb",
     "vllm>=0.10.0,<0.11.0",
-    "liger-kernel>=0.5.10",
+    #"liger-kernel>=0.5.10",
     "deepspeed>=0.17.6",
-    "flash-attn>=2.8.3",
+    #"flash-attn>=2.8.3",
 ]
 envs = [
     "math-verify>=0.8.0",
diff --git a/verifiers/configs/unsloth_config.py b/verifiers/configs/unsloth_config.py
new file mode 100644
index 000000000..463826d3e
--- /dev/null
+++ b/verifiers/configs/unsloth_config.py
@@ -0,0 +1,102 @@
+from dataclasses import dataclass, field
+from typing import List, Optional, Union
+
+from peft import LoraConfig
+from transformers.trainer_utils import SchedulerType
+
+
+@dataclass
+class UnslothConfig:
+    """
+    Configuration class for Unsloth Trainer.
+ """ + + # Model Load parameters + load_in_4bit: bool = field( + default=False, + metadata={"help": "Whether to use 4-bit precision for model weights."}, + ) + + load_in_8bit: bool = field( + default=False, + metadata={"help": "Whether to use 8-bit precision for model weights."}, + ) + + load_in_16bit: bool = field( + default=True, + metadata={"help": "Whether to use 16-bit precision for model weights."}, + ) + + full_finetuning: bool = field( + default=False, + metadata={"help": "Whether to fine-tune the entire model."}, + ) + + use_exact_model_name: bool = field( + default=False, + metadata={"help": "Whether to use the exact model name without mapping."}, + ) + + gpu_memory_utilization: float = field( + default=0.8, + metadata={"help": "Target GPU memory utilization for model loading."}, + ) + + random_state: int = field( + default=3407, + metadata={"help": "Random state for reproducibility."}, + ) + + max_lora_rank: int = field( + default=64, + metadata={"help": "Maximum allowable rank for LoRA adapters."}, + ) + + token: Optional[str] = field( + default=None, + metadata={"help": "Huggingface token for private model access."}, + ) + + # Model Lora parameters + r: int = field( + default=16, + metadata={"help": "LoRA rank."}, + ) + + target_modules: List[str] = field( + default= [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + ], + metadata={"help": "Target modules for LoRA."}, + ) + + lora_alpha: int = field( + default=16, + metadata={"help": "LoRA alpha parameter."}, + ) + + lora_dropout: float = field( + default=0.0, + metadata={"help": "LoRA dropout rate."}, + ) + + use_gradient_checkpointing: str = field( + default="unsloth", + metadata={"help": "Gradient checkpointing strategy."}, + ) + + use_rslora: bool = field( + default=False, + metadata={"help": "Whether to use RS-LoRA."}, + ) + + loftq_config: Optional[dict] = field( + default=None, + metadata={"help": "Configuration for LoFT-Q."}, + ) From 58fdef0c900125779500a39918ba7362db584124 Mon Sep 17 00:00:00 2001 From: FilippoBoni1921 Date: Fri, 26 Dec 2025 19:28:18 +0100 Subject: [PATCH 5/8] add unsloth usage --- verifiers/__init__.py | 3 ++ verifiers/configs/unsloth_config.py | 41 ++------------------ verifiers/rl/trainer/config.py | 58 +++++++++++++++++++++++++++++ verifiers/rl/trainer/trainer.py | 18 +++++++-- verifiers/rl/trainer/utils.py | 16 ++++++++ 5 files changed, 95 insertions(+), 41 deletions(-) diff --git a/verifiers/__init__.py b/verifiers/__init__.py index d7e2fdb61..ac74ff91b 100644 --- a/verifiers/__init__.py +++ b/verifiers/__init__.py @@ -110,6 +110,7 @@ def setup_logging( "get_model_and_tokenizer", "RLTrainer", "RLConfig", + "UnslothConfig", "GRPOTrainer", "GRPOConfig", "grpo_defaults", @@ -123,6 +124,7 @@ def setup_logging( "get_model": "verifiers.rl.trainer.utils:get_model", "get_model_and_tokenizer": "verifiers.rl.trainer.utils:get_model_and_tokenizer", "RLConfig": "verifiers.rl.trainer:RLConfig", + "UnslothConfig": "verifiers.configs.unsloth_config:UnslothConfig", "RLTrainer": "verifiers.rl.trainer:RLTrainer", "GRPOTrainer": "verifiers.rl.trainer:GRPOTrainer", "GRPOConfig": "verifiers.rl.trainer:GRPOConfig", @@ -170,6 +172,7 @@ def __getattr__(name: str): grpo_defaults, lora_defaults, ) + from .configs.unsloth_config import UnslothConfig # noqa: F401 from .rl.trainer.utils import ( # noqa: F401 get_model, get_model_and_tokenizer, diff --git a/verifiers/configs/unsloth_config.py b/verifiers/configs/unsloth_config.py index 463826d3e..0b09e96fe 100644 --- 
+++ b/verifiers/configs/unsloth_config.py
@@ -1,9 +1,5 @@
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
-
-from peft import LoraConfig
-from transformers.trainer_utils import SchedulerType
-
+from typing import List, Optional
 
 @dataclass
 class UnslothConfig:
@@ -57,46 +53,15 @@ class UnslothConfig:
         metadata={"help": "Huggingface token for private model access."},
     )
 
-    # Model Lora parameters
-    r: int = field(
-        default=16,
-        metadata={"help": "LoRA rank."},
-    )
-
-    target_modules: List[str] = field(
-        default_factory=lambda: [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-            "o_proj",
-            "gate_proj",
-            "up_proj",
-            "down_proj",
-        ],
-        metadata={"help": "Target modules for LoRA."},
-    )
-
-    lora_alpha: int = field(
-        default=16,
-        metadata={"help": "LoRA alpha parameter."},
-    )
-
-    lora_dropout: float = field(
-        default=0.0,
-        metadata={"help": "LoRA dropout rate."},
-    )
+    # Additional Model Lora parameters
 
     use_gradient_checkpointing: str = field(
         default="unsloth",
         metadata={"help": "Gradient checkpointing strategy."},
     )
 
-    use_rslora: bool = field(
-        default=False,
-        metadata={"help": "Whether to use RS-LoRA."},
-    )
-
     loftq_config: Optional[dict] = field(
         default=None,
         metadata={"help": "Configuration for LoFT-Q."},
     )
+
diff --git a/verifiers/rl/trainer/config.py b/verifiers/rl/trainer/config.py
index c1803840d..e7c4ce232 100644
--- a/verifiers/rl/trainer/config.py
+++ b/verifiers/rl/trainer/config.py
@@ -5,6 +5,8 @@
 from transformers import TrainingArguments
 from transformers.trainer_utils import SchedulerType
 
+from verifiers.configs.unsloth_config import UnslothConfig
+
 
 @dataclass
 class RLConfig(TrainingArguments):
@@ -280,6 +282,28 @@ class RLConfig(TrainingArguments):
         metadata={"help": "Whether to shuffle the training dataset."},
     )
 
+    use_unsloth: bool = field(
+        default=False,
+        metadata={"help": "Whether to use UnslothConfig for additional model training parameters."},
+    )
+
+    unsloth_config: Optional[UnslothConfig] = field(
+        default=None,
+        metadata={"help": "UnslothConfig instance for additional model training parameters."},
+    )
+
+    unsloth_base_model_args: dict = field(
+        init=False,
+        default_factory=dict,
+        metadata={"help": "Arguments for loading the base model with Unsloth."},
+    )
+
+    unsloth_lora_args: dict = field(
+        init=False,
+        default_factory=dict,
+        metadata={"help": "Additional arguments for LoRA configuration with Unsloth."},
+    )
+
     def __post_init__(self):
         # configure output dir
         if self.output_dir is None:
@@ -332,6 +356,37 @@ def __post_init__(self):
             },
         }
         self.gradient_accumulation_steps = 1
+
+        if self.use_unsloth:
+            if self.unsloth_config is None:
+                self.unsloth_config = UnslothConfig()
+
+            self.unsloth_base_model_args = {
+                "load_in_4bit": self.unsloth_config.load_in_4bit,
+                "load_in_8bit": self.unsloth_config.load_in_8bit,
+                "load_in_16bit": self.unsloth_config.load_in_16bit,
+                "full_finetuning": self.unsloth_config.full_finetuning,
+                "use_exact_model_name": self.unsloth_config.use_exact_model_name,
+                "gpu_memory_utilization": self.unsloth_config.gpu_memory_utilization,
+                "token": self.unsloth_config.token,
+            }
+
+            self.unsloth_lora_args = {
+                "r": self.lora_rank,
+                "lora_alpha": self.lora_alpha,
+                "target_modules": self.lora_target_modules,
+                "lora_dropout": self.lora_dropout,
+                "use_rslora": self.lora_use_rslora,
+                "loftq_config": self.unsloth_config.loftq_config,
+                "random_state": self.unsloth_config.random_state,
+                "use_gradient_checkpointing": self.unsloth_config.use_gradient_checkpointing,
+            }
+
+            # keep the shared LoRA settings on unsloth_config in sync with RLConfig
+            self.unsloth_config.r = self.lora_rank
+            self.unsloth_config.lora_alpha = self.lora_alpha
+            self.unsloth_config.target_modules = self.lora_target_modules
+            self.unsloth_config.lora_dropout = self.lora_dropout
 
         super().__post_init__()
         num_processes = self.world_size
diff --git a/verifiers/rl/trainer/trainer.py b/verifiers/rl/trainer/trainer.py
index 9a8fc4f2f..a0651dfb0 100644
--- a/verifiers/rl/trainer/trainer.py
+++ b/verifiers/rl/trainer/trainer.py
@@ -53,12 +53,24 @@ def __init__(
         # model + tokenizer
         if isinstance(model, str):
             model_name = model
-            model, processing_class = vf.get_model_and_tokenizer(model)
+            if args.use_unsloth and args.unsloth_config is not None:
+                model, processing_class = vf.unsloth_get_model_and_tokenizer(
+                    model_name,
+                    unsloth_config=args.unsloth_base_model_args,
+                )
+            else:
+                model, processing_class = vf.get_model_and_tokenizer(model_name)
         else:
             model_name = model.config._name_or_path
         assert isinstance(model, PreTrainedModel)
-        if args.use_lora and isinstance(args.lora_config, PeftConfig):
-            model = prepare_peft_model(model, args.lora_config, args)
+        if args.use_lora:
+            if args.use_unsloth and args.unsloth_lora_args is not None:
+                model = vf.unsloth_prepare_peft_model(
+                    model,
+                    unsloth_config=args.unsloth_lora_args,
+                )
+            elif isinstance(args.lora_config, PeftConfig):
+                model = prepare_peft_model(model, args.lora_config, args)
         model.warnings_issued["estimate_tokens"] = True  # suppress warning
 
         super().__init__(
diff --git a/verifiers/rl/trainer/utils.py b/verifiers/rl/trainer/utils.py
index 4b7f975e0..9ac851df2 100644
--- a/verifiers/rl/trainer/utils.py
+++ b/verifiers/rl/trainer/utils.py
@@ -43,6 +43,13 @@ def get_model_and_tokenizer(
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     return model, tokenizer
 
+def unsloth_get_model_and_tokenizer(
+    model_name: str,
+    unsloth_config: dict[str, Any],
+) -> tuple[Any, Any]:
+    from unsloth import FastLanguageModel
+    model, tokenizer = FastLanguageModel.from_pretrained(model_name, **unsloth_config)
+    return model, tokenizer
 
 def pad(
     tensors: list[torch.Tensor],
@@ -168,6 +175,15 @@ def prepare_peft_model(
     return model
 
+
+def unsloth_prepare_peft_model(
+    model: PreTrainedModel, unsloth_config: dict[str, Any],
+) -> PreTrainedModel:
+    """Prepares a model for PEFT training using Unsloth."""
+    from unsloth import FastLanguageModel
+    # TODO: check additional args and kwargs
+    model = cast(PreTrainedModel, FastLanguageModel.get_peft_model(model, **unsloth_config))
+    return model
 
 
 def init_stat_tracker(device: torch.device) -> dict[str, torch.Tensor]:
     zero = torch.zeros((), device=device, dtype=torch.float32)

From c43d6ace3dda49ed8c6cb38de73727f8c434a20b Mon Sep 17 00:00:00 2001
From: FilippoBoni1921
Date: Wed, 7 Jan 2026 22:14:22 +0100
Subject: [PATCH 6/8] move unsloth config

---
 verifiers/{ => rl}/configs/unsloth_config.py | 0
 verifiers/rl/trainer/unsloth_config.py       | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename verifiers/{ => rl}/configs/unsloth_config.py (100%)
 delete mode 100644 verifiers/rl/trainer/unsloth_config.py

diff --git a/verifiers/configs/unsloth_config.py b/verifiers/rl/configs/unsloth_config.py
similarity index 100%
rename from verifiers/configs/unsloth_config.py
rename to verifiers/rl/configs/unsloth_config.py
diff --git a/verifiers/rl/trainer/unsloth_config.py b/verifiers/rl/trainer/unsloth_config.py
deleted file mode 100644
index e69de29bb..000000000

From 8d62f312e069286ac2826637ade9da098266506d Mon Sep 17 00:00:00 2001
From: FilippoBoni1921
Date: Wed, 7 Jan 2026 22:15:37 +0100
Subject: [PATCH 7/8] remove http experiment
---
 verifiers/rl/inference/client.py |  5 ++---
 verifiers/rl/trainer/config.py   |  7 +------
 verifiers/rl/trainer/trainer.py  | 10 +++-------
 3 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/verifiers/rl/inference/client.py b/verifiers/rl/inference/client.py
index d5e19622a..04876582e 100644
--- a/verifiers/rl/inference/client.py
+++ b/verifiers/rl/inference/client.py
@@ -25,11 +25,10 @@ def __init__(
         self,
         host: str = "0.0.0.0",
         port: int = 8000,
-        protocol: str = "http",
         group_port: int = 51216,
         connection_timeout: float = 0.0,
     ):
-        super().__init__(base_url=f"{protocol}://{host}:{port}/v1", api_key="local")
+        super().__init__(base_url=f"http://{host}:{port}/v1", api_key="local")
         self.session = requests.Session()
         # configure connection pooling to handle rapid requests better
         adapter = HTTPAdapter(
@@ -40,7 +39,7 @@ def __init__(
 
         self.host = host
         self.server_port = port
-        self.server_url = f"{protocol}://{self.host}:{self.server_port}"
+        self.server_url = f"http://{self.host}:{self.server_port}"
         self.group_port = group_port
         self.check_server(connection_timeout)
 
diff --git a/verifiers/rl/trainer/config.py b/verifiers/rl/trainer/config.py
index e7c4ce232..f505f8ab6 100644
--- a/verifiers/rl/trainer/config.py
+++ b/verifiers/rl/trainer/config.py
@@ -5,7 +5,7 @@
 from transformers import TrainingArguments
 from transformers.trainer_utils import SchedulerType
 
-from verifiers.configs.unsloth_config import UnslothConfig
+from verifiers.rl.configs.unsloth_config import UnslothConfig
 
 
 @dataclass
@@ -196,11 +196,6 @@ class RLConfig(TrainingArguments):
         metadata={"help": "Port of the vLLM server to connect to."},
     )
 
-    vllm_server_protocol: str = field(
-        default="http",
-        metadata={"help": "VLLM server protocol type (https or http)."},
-    )
-
     vllm_server_timeout: float = field(
         default=300.0,
         metadata={
diff --git a/verifiers/rl/trainer/trainer.py b/verifiers/rl/trainer/trainer.py
index a0651dfb0..e1784b8dc 100644
--- a/verifiers/rl/trainer/trainer.py
+++ b/verifiers/rl/trainer/trainer.py
@@ -100,16 +100,12 @@ def __init__(
         if self.accelerator.is_main_process:
             host = args.vllm_server_host
             port = args.vllm_server_port
-            protocol = args.vllm_server_protocol
-
-            if protocol not in ["http", "https"]:
-                raise ValueError(f"Invalid protocol '{protocol}'. Supported protocols are 'http' and 'https'.")
-
+
             self.client = VLLMClient(
-                host=host, port=port, protocol=protocol, connection_timeout=args.vllm_server_timeout
+                host=host, port=port, connection_timeout=args.vllm_server_timeout
             )
             self.client.init_communicator()
-            vllm_base_url = f"{protocol}://{host}:{port}/v1"
+            vllm_base_url = f"http://{host}:{port}/v1"
             self.orchestrator = Orchestrator(
                 env=env,
                 client_base_url=vllm_base_url,

From e8953a49d2253253b5dcfad83b914c6edb78a0a2 Mon Sep 17 00:00:00 2001
From: FilippoBoni1921
Date: Wed, 7 Jan 2026 22:16:15 +0100
Subject: [PATCH 8/8] modify dependencies

---
 pyproject.toml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index dbd25f33f..ce03ab9a2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,7 +42,6 @@ dependencies = [
     "textual",
     "tomli; python_version < '3.11'",
     "typing_extensions; python_version < '3.12'",
-    "unsloth>=2025.12.9",
     "wget>=3.2",
 ]
 
@@ -70,9 +69,11 @@ rl = [
     "peft",
     "wandb",
     "vllm>=0.10.0,<0.11.0",
-    #"liger-kernel>=0.5.10",
+    "liger-kernel>=0.5.10",
     "deepspeed>=0.17.6",
-    #"flash-attn>=2.8.3",
+    "flash-attn>=2.8.3",
+    "unsloth>=2025.12.9",
+
 ]
 envs = [
     "math-verify>=0.8.0",