From 3c876368ae8e28ac1e9bf6d09f5eb453a0ba2add Mon Sep 17 00:00:00 2001
From: Arsh Zahed
Date: Tue, 18 Mar 2025 13:40:11 -0700
Subject: [PATCH 1/8] Port cosine lr scheduler init

---
 src/together/cli/api/finetune.py   | 16 +++++++++++
 src/together/resources/finetune.py | 34 ++++++++++++++++++++---
 src/together/types/__init__.py     |  2 ++
 src/together/types/finetune.py     | 43 +++++++++++++++++++++++++-----
 4 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index b413f323..1c14de55 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -71,12 +71,24 @@ def fine_tuning(ctx: click.Context) -> None:
 )
 @click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
 @click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
+@click.option(
+    "--lr-scheduler-type",
+    type=click.Choice(["linear", "cosine"]),
+    default="linear",
+    help="Learning rate scheduler type",
+)
 @click.option(
     "--min-lr-ratio",
     type=float,
     default=0.0,
     help="The ratio of the final learning rate to the peak learning rate",
 )
+@click.option(
+    "--num-cycles",
+    type=float,
+    default=0.5,
+    help="Number of cycles for cosine learning rate scheduler.",
+)
 @click.option(
     "--warmup-ratio",
     type=float,
@@ -162,7 +174,9 @@ def create(
     n_checkpoints: int,
     batch_size: int | Literal["max"],
     learning_rate: float,
+    lr_scheduler_type: Literal["linear", "cosine"],
     min_lr_ratio: float,
+    num_cycles: float,
     warmup_ratio: float,
     max_grad_norm: float,
     weight_decay: float,
@@ -194,7 +208,9 @@ def create(
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
+        lr_scheduler_type=lr_scheduler_type,
         min_lr_ratio=min_lr_ratio,
+        num_cycles=num_cycles,
         warmup_ratio=warmup_ratio,
         max_grad_norm=max_grad_norm,
         weight_decay=weight_decay,
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 8cc48a17..5d278f5f 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -23,6 +23,7 @@
     TrainingType,
     FinetuneLRScheduler,
     FinetuneLinearLRSchedulerArgs,
+    FinetuneCosineLRSchedulerArgs,
     TrainingMethodDPO,
     TrainingMethodSFT,
     FinetuneCheckpoint,
@@ -57,7 +58,9 @@ def createFinetuneRequest(
     n_checkpoints: int | None = 1,
     batch_size: int | Literal["max"] = "max",
     learning_rate: float | None = 0.00001,
+    lr_scheduler_type: Literal["linear", "cosine"] = "linear",
     min_lr_ratio: float = 0.0,
+    num_cycles: float = 0.5,
     warmup_ratio: float = 0.0,
     max_grad_norm: float = 1.0,
     weight_decay: float = 0.0,
@@ -129,10 +132,21 @@ def createFinetuneRequest(
             f"training_method must be one of {', '.join(AVAILABLE_TRAINING_METHODS)}"
         )
 
-    lrScheduler = FinetuneLRScheduler(
-        lr_scheduler_type="linear",
-        lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
-    )
+    if lr_scheduler_type == "cosine":
+        if num_cycles <= 0.0:
+            raise ValueError("Number of cycles should be greater than 0")
+
+        lrScheduler = FinetuneLRScheduler(
+            lr_scheduler_type="cosine",
+            lr_scheduler_args=FinetuneCosineLRSchedulerArgs(
+                min_lr_ratio=min_lr_ratio, num_cycles=num_cycles
+            ),
+        )
+    else:
+        lrScheduler = FinetuneLRScheduler(
+            lr_scheduler_type="linear",
+            lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
+        )
 
     training_method_cls: TrainingMethodSFT | TrainingMethodDPO = TrainingMethodSFT()
     if training_method == "dpo":
@@ -244,7 +258,9 @@ def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
+        lr_scheduler_type: Literal["linear", "cosine"] = "linear",
         min_lr_ratio: float = 0.0,
+        num_cycles: float = 0.5,
         warmup_ratio: float = 0.0,
         max_grad_norm: float = 1.0,
         weight_decay: float = 0.0,
@@ -279,8 +295,10 @@ def create(
             batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
+            num_cycles (float, optional): Number of cycles for cosine learning rate scheduler. Defaults to 0.5.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
@@ -336,7 +354,9 @@ def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            lr_scheduler_type=lr_scheduler_type,
             min_lr_ratio=min_lr_ratio,
+            num_cycles=num_cycles,
             warmup_ratio=warmup_ratio,
             max_grad_norm=max_grad_norm,
             weight_decay=weight_decay,
@@ -617,7 +637,9 @@ async def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
+        lr_scheduler_type: Literal["linear", "cosine"] = "linear",
         min_lr_ratio: float = 0.0,
+        num_cycles: float = 0.5,
         warmup_ratio: float = 0.0,
         max_grad_norm: float = 1.0,
         weight_decay: float = 0.0,
@@ -652,8 +674,10 @@ async def create(
             batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
+            num_cycles (float, optional): Number of cycles for cosine learning rate scheduler. Defaults to 0.5.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
@@ -710,7 +734,9 @@ async def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            lr_scheduler_type=lr_scheduler_type,
             min_lr_ratio=min_lr_ratio,
+            num_cycles=num_cycles,
             warmup_ratio=warmup_ratio,
             max_grad_norm=max_grad_norm,
             weight_decay=weight_decay,
diff --git a/src/together/types/__init__.py b/src/together/types/__init__.py
index 47fed22b..f1dde652 100644
--- a/src/together/types/__init__.py
+++ b/src/together/types/__init__.py
@@ -34,6 +34,7 @@
     TrainingMethodDPO,
     TrainingMethodSFT,
     FinetuneCheckpoint,
+    FinetuneCosineLRSchedulerArgs,
     FinetuneDownloadResult,
     FinetuneLinearLRSchedulerArgs,
     FinetuneList,
@@ -70,6 +71,7 @@
     "FinetuneDownloadResult",
     "FinetuneLRScheduler",
     "FinetuneLinearLRSchedulerArgs",
+    "FinetuneCosineLRSchedulerArgs",
     "FileRequest",
     "FileResponse",
     "FileList",
diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index 94140a92..0d51f905 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
 from enum import Enum
-from typing import List, Literal
+from typing import List, Literal, Union
 
-from pydantic import StrictBool, Field, validator, field_validator
+from pydantic import StrictBool, Field, validator, field_validator, ValidationInfo
 
 from together.types.abstract import BaseModel
 from together.types.common import (
@@ -345,13 +345,44 @@ class FinetuneTrainingLimits(BaseModel):
     lora_training: FinetuneLoraTrainingLimits | None = None
 
 
-class FinetuneLRScheduler(BaseModel):
-    lr_scheduler_type: str
-    lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None
+class FinetuneLinearLRSchedulerArgs(BaseModel):
+    min_lr_ratio: float | None = 0.0
 
 
-class FinetuneLinearLRSchedulerArgs(BaseModel):
+class FinetuneCosineLRSchedulerArgs(BaseModel):
     min_lr_ratio: float | None = 0.0
+    num_cycles: float | None = 0.5
+
+
+LRSchedulerTypeToArgs = {
+    "linear": FinetuneLinearLRSchedulerArgs,
+    "cosine": FinetuneCosineLRSchedulerArgs,
+}
+
+FinetuneLRSchedulerArgs = Union[
+    FinetuneLinearLRSchedulerArgs, FinetuneCosineLRSchedulerArgs, None
+]
+
+
+class FinetuneLRScheduler(BaseModel):
+    lr_scheduler_type: Literal["linear", "cosine"]
+    lr_scheduler_args: FinetuneLRSchedulerArgs | None = None
+
+    @field_validator("lr_scheduler_args")
+    @classmethod
+    def validate_scheduler_args(
+        cls, v: FinetuneLRSchedulerArgs, info: ValidationInfo
+    ) -> FinetuneLRSchedulerArgs:
+        scheduler_type = info.data.get("lr_scheduler_type")
+
+        if v is None:
+            return v
+
+        expected_type = LRSchedulerTypeToArgs[str(scheduler_type)]
+        if not isinstance(v, expected_type):
+            raise ValueError(f"Expected {expected_type}, got {type(v)}")
+
+        return v
 
 
 class FinetuneCheckpoint(BaseModel):

From a8724380932c3aec5ec44045f18b0815934c30d3 Mon Sep 17 00:00:00 2001
From: Arsh Zahed
Date: Tue, 18 Mar 2025 13:41:16 -0700
Subject: [PATCH 2/8] Port cosine lr scheduler init

---
 src/together/types/finetune.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index 0d51f905..26f5ca13 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -365,20 +365,29 @@ class FinetuneCosineLRSchedulerArgs(BaseModel):
 
 class FinetuneLRScheduler(BaseModel):
-    lr_scheduler_type: Literal["linear", "cosine"]
+    lr_scheduler_type: str
     lr_scheduler_args: FinetuneLRSchedulerArgs | None = None
 
+    @field_validator("lr_scheduler_type")
+    @classmethod
+    def validate_scheduler_type(cls, v: str) -> str:
+        if v not in LRSchedulerTypeToArgs:
+            raise ValueError(
+                f"Scheduler type must be one of: {LRSchedulerTypeToArgs.keys()}"
+            )
+        return v
+
     @field_validator("lr_scheduler_args")
     @classmethod
     def validate_scheduler_args(
         cls, v: FinetuneLRSchedulerArgs, info: ValidationInfo
     ) -> FinetuneLRSchedulerArgs:
-        scheduler_type = info.data.get("lr_scheduler_type")
+        scheduler_type = str(info.data.get("lr_scheduler_type"))
 
         if v is None:
             return v
 
-        expected_type = LRSchedulerTypeToArgs[str(scheduler_type)]
+        expected_type = LRSchedulerTypeToArgs[scheduler_type]
         if not isinstance(v, expected_type):
             raise ValueError(f"Expected {expected_type}, got {type(v)}")
 
         return v

From c17a405e4db0415dab06f13f3f26defd175913dd Mon Sep 17 00:00:00 2001
From: Arsh Zahed
Date: Tue, 18 Mar 2025 14:00:06 -0700
Subject: [PATCH 3/8] Upgrade 1.4.7

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7e3a210d..39346629 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.4.6"
+version = "1.4.7"
 authors = [
     "Together AI "
 ]

From f6e2258097cbb1a94113b58d7186a310b5bae0bb Mon Sep 17 00:00:00 2001
From: Arsh Zahed
Date: Wed, 19 Mar 2025 15:02:14 -0700
Subject: [PATCH 4/8] Typos, type error

---
 src/together/cli/api/finetune.py   |  4 ++--
 src/together/resources/finetune.py |  8 ++++----
 src/together/types/finetune.py     | 18 +++++++++---------
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index 1c14de55..f602da34 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -87,13 +87,13 @@ def fine_tuning(ctx: click.Context) -> None:
     "--num-cycles",
     type=float,
     default=0.5,
-    help="Number of cycles for cosine learning rate scheduler.",
+    help="Number of cycles for the cosine learning rate scheduler.",
 )
 @click.option(
     "--warmup-ratio",
     type=float,
     default=0.0,
-    help="Warmup ratio for learning rate scheduler.",
+    help="Warmup ratio for the learning rate scheduler.",
 )
 @click.option(
     "--max-grad-norm",
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 5d278f5f..273a7db7 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -298,8 +298,8 @@ def create(
             lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
-            num_cycles (float, optional): Number of cycles for cosine learning rate scheduler. Defaults to 0.5.
-            warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            num_cycles (float, optional): Number of cycles for the cosine learning rate scheduler. Defaults to 0.5.
+            warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
@@ -677,8 +677,8 @@ async def create(
             lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
-            num_cycles (float, optional): Number of cycles for cosine learning rate scheduler. Defaults to 0.5.
-            warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            num_cycles (float, optional): Number of cycles for the cosine learning rate scheduler. Defaults to 0.5.
+            warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index 26f5ca13..f5b6758e 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -370,28 +370,28 @@ class FinetuneLRScheduler(BaseModel):
 
     @field_validator("lr_scheduler_type")
     @classmethod
-    def validate_scheduler_type(cls, v: str) -> str:
-        if v not in LRSchedulerTypeToArgs:
+    def validate_scheduler_type(cls, scheduler_type: str) -> str:
+        if scheduler_type not in LRSchedulerTypeToArgs:
             raise ValueError(
                 f"Scheduler type must be one of: {LRSchedulerTypeToArgs.keys()}"
             )
-        return v
+        return scheduler_type
 
     @field_validator("lr_scheduler_args")
     @classmethod
     def validate_scheduler_args(
-        cls, v: FinetuneLRSchedulerArgs, info: ValidationInfo
+        cls, args: FinetuneLRSchedulerArgs, info: ValidationInfo
     ) -> FinetuneLRSchedulerArgs:
         scheduler_type = str(info.data.get("lr_scheduler_type"))
 
-        if v is None:
-            return v
+        if args is None:
+            return args
 
         expected_type = LRSchedulerTypeToArgs[scheduler_type]
-        if not isinstance(v, expected_type):
-            raise ValueError(f"Expected {expected_type}, got {type(v)}")
+        if not isinstance(args, expected_type):
+            raise TypeError(f"Expected {expected_type}, got {type(args)}")
 
-        return v
+        return args
 
 
 class FinetuneCheckpoint(BaseModel):

From 76f616b3ea8d5d81821a86fbfe68cf81c9bf4c74 Mon Sep 17 00:00:00 2001
From: Arsh Zahed
Date: Wed, 19 Mar 2025 15:15:25 -0700
Subject: [PATCH 5/8] Use subclasses instead of validation

---
 src/together/resources/finetune.py | 11 +++++---
 src/together/types/__init__.py     |  6 ++++-
 src/together/types/finetune.py     | 42 ++++++------------------------
 3 files changed, 20 insertions(+), 39 deletions(-)

diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index 273a7db7..feb7308d 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -22,6 +22,8 @@
     TogetherRequest,
     TrainingType,
     FinetuneLRScheduler,
+    FinetuneLinearLRScheduler,
+    FinetuneCosineLRScheduler,
     FinetuneLinearLRSchedulerArgs,
     FinetuneCosineLRSchedulerArgs,
     TrainingMethodDPO,
     TrainingMethodSFT,
     FinetuneCheckpoint,
@@ -132,19 +134,20 @@ def createFinetuneRequest(
             f"training_method must be one of {', '.join(AVAILABLE_TRAINING_METHODS)}"
         )
 
+    # Default to generic lr scheduler
+    lrScheduler: FinetuneLRScheduler = FinetuneLRScheduler(lr_scheduler_type="linear")
+
     if lr_scheduler_type == "cosine":
         if num_cycles <= 0.0:
             raise ValueError("Number of cycles should be greater than 0")
 
-        lrScheduler = FinetuneLRScheduler(
-            lr_scheduler_type="cosine",
+        lrScheduler = FinetuneCosineLRScheduler(
             lr_scheduler_args=FinetuneCosineLRSchedulerArgs(
                 min_lr_ratio=min_lr_ratio, num_cycles=num_cycles
             ),
         )
     else:
-        lrScheduler = FinetuneLRScheduler(
-            lr_scheduler_type="linear",
+        lrScheduler = FinetuneLinearLRScheduler(
             lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
         )
 
diff --git a/src/together/types/__init__.py b/src/together/types/__init__.py
index f1dde652..53e1858e 100644
--- a/src/together/types/__init__.py
+++ b/src/together/types/__init__.py
@@ -34,12 +34,14 @@
     TrainingMethodDPO,
     TrainingMethodSFT,
     FinetuneCheckpoint,
+    FinetuneCosineLRScheduler,
     FinetuneCosineLRSchedulerArgs,
     FinetuneDownloadResult,
+    FinetuneLinearLRScheduler,
     FinetuneLinearLRSchedulerArgs,
+    FinetuneLRScheduler,
     FinetuneList,
     FinetuneListEvents,
-    FinetuneLRScheduler,
     FinetuneRequest,
     FinetuneResponse,
     FinetuneTrainingLimits,
@@ -70,7 +72,9 @@
     "FinetuneListEvents",
     "FinetuneDownloadResult",
     "FinetuneLRScheduler",
+    "FinetuneLinearLRScheduler",
     "FinetuneLinearLRSchedulerArgs",
+    "FinetuneCosineLRScheduler",
     "FinetuneCosineLRSchedulerArgs",
     "FileRequest",
     "FileResponse",
diff --git a/src/together/types/finetune.py b/src/together/types/finetune.py
index f5b6758e..7f085132 100644
--- a/src/together/types/finetune.py
+++ b/src/together/types/finetune.py
@@ -176,7 +176,7 @@ class FinetuneRequest(BaseModel):
     # training learning rate
     learning_rate: float
     # learning rate scheduler type and args
-    lr_scheduler: FinetuneLRScheduler | None = None
+    lr_scheduler: FinetuneLinearLRScheduler | FinetuneCosineLRScheduler | None = None
     # learning rate warmup ratio
     warmup_ratio: float
     # max gradient norm
@@ -239,7 +239,7 @@ class FinetuneResponse(BaseModel):
     # training learning rate
     learning_rate: float | None = None
     # learning rate scheduler type and args
-    lr_scheduler: FinetuneLRScheduler | None = None
+    lr_scheduler: FinetuneLinearLRScheduler | FinetuneCosineLRScheduler | None = None
     # learning rate warmup ratio
     warmup_ratio: float | None = None
     # max gradient norm
@@ -354,44 +354,18 @@ class FinetuneCosineLRSchedulerArgs(BaseModel):
     num_cycles: float | None = 0.5
 
 
-LRSchedulerTypeToArgs = {
-    "linear": FinetuneLinearLRSchedulerArgs,
-    "cosine": FinetuneCosineLRSchedulerArgs,
-}
-
-FinetuneLRSchedulerArgs = Union[
-    FinetuneLinearLRSchedulerArgs, FinetuneCosineLRSchedulerArgs, None
-]
-
-
 class FinetuneLRScheduler(BaseModel):
     lr_scheduler_type: str
-    lr_scheduler_args: FinetuneLRSchedulerArgs | None = None
 
-    @field_validator("lr_scheduler_type")
-    @classmethod
-    def validate_scheduler_type(cls, scheduler_type: str) -> str:
-        if scheduler_type not in LRSchedulerTypeToArgs:
-            raise ValueError(
-                f"Scheduler type must be one of: {LRSchedulerTypeToArgs.keys()}"
-            )
-        return scheduler_type
-
-    @field_validator("lr_scheduler_args")
-    @classmethod
-    def validate_scheduler_args(
-        cls, args: FinetuneLRSchedulerArgs, info: ValidationInfo
-    ) -> FinetuneLRSchedulerArgs:
-        scheduler_type = str(info.data.get("lr_scheduler_type"))
 
-        if args is None:
-            return args
+class FinetuneLinearLRScheduler(FinetuneLRScheduler):
+    lr_scheduler_type: Literal["linear"] = "linear"
+    lr_scheduler: FinetuneLinearLRSchedulerArgs | None = None
 
-        expected_type = LRSchedulerTypeToArgs[scheduler_type]
-        if not isinstance(args, expected_type):
-            raise TypeError(f"Expected {expected_type}, got {type(args)}")
 
-        return args
+class FinetuneCosineLRScheduler(FinetuneLRScheduler):
+    lr_scheduler_type: Literal["cosine"] = "cosine"
+    lr_scheduler: FinetuneCosineLRSchedulerArgs | None = None
 
 
 class FinetuneCheckpoint(BaseModel):

From 9a8418ed8b93638a12690bc2314fdd059a0d882c Mon Sep 17 00:00:00 2001
From: Arsh Zahed
Date: Wed, 19 Mar 2025 16:02:34 -0700
Subject: [PATCH 6/8] Update num_cycles description

---
 src/together/cli/api/finetune.py   | 2 +-
 src/together/resources/finetune.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index f602da34..bb6e6d5f 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -87,7 +87,7 @@ def fine_tuning(ctx: click.Context) -> None:
     "--num-cycles",
     type=float,
     default=0.5,
-    help="Number of cycles for the cosine learning rate scheduler.",
+    help="Number or fraction of cycles for the cosine learning rate scheduler.",
 )
 @click.option(
     "--warmup-ratio",
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index feb7308d..b3fc378d 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -301,7 +301,7 @@ def create(
             lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
-            num_cycles (float, optional): Number of cycles for the cosine learning rate scheduler. Defaults to 0.5.
+            num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
             warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
@@ -680,7 +680,7 @@ async def create(
             lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
-            num_cycles (float, optional): Number of cycles for the cosine learning rate scheduler. Defaults to 0.5.
+            num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
             warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.

From 968e5abfa01a0760611e083c91241c97be021a99 Mon Sep 17 00:00:00 2001
From: Arsh Zahed
Date: Wed, 19 Mar 2025 16:06:15 -0700
Subject: [PATCH 7/8] Change cli arg from num_cycles to scheduler_num_cycles

---
 src/together/cli/api/finetune.py   |  6 +++---
 src/together/resources/finetune.py | 18 +++++++++---------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/together/cli/api/finetune.py b/src/together/cli/api/finetune.py
index bb6e6d5f..f8032057 100644
--- a/src/together/cli/api/finetune.py
+++ b/src/together/cli/api/finetune.py
@@ -84,7 +84,7 @@ def fine_tuning(ctx: click.Context) -> None:
     help="The ratio of the final learning rate to the peak learning rate",
 )
 @click.option(
-    "--num-cycles",
+    "--scheduler-num-cycles",
     type=float,
     default=0.5,
     help="Number or fraction of cycles for the cosine learning rate scheduler.",
 )
 @click.option(
     "--warmup-ratio",
@@ -176,7 +176,7 @@ def create(
     learning_rate: float,
     lr_scheduler_type: Literal["linear", "cosine"],
     min_lr_ratio: float,
-    num_cycles: float,
+    scheduler_num_cycles: float,
     warmup_ratio: float,
     max_grad_norm: float,
     weight_decay: float,
@@ -210,7 +210,7 @@ def create(
         learning_rate=learning_rate,
         lr_scheduler_type=lr_scheduler_type,
         min_lr_ratio=min_lr_ratio,
-        num_cycles=num_cycles,
+        scheduler_num_cycles=scheduler_num_cycles,
         warmup_ratio=warmup_ratio,
         max_grad_norm=max_grad_norm,
         weight_decay=weight_decay,
diff --git a/src/together/resources/finetune.py b/src/together/resources/finetune.py
index b3fc378d..f6500c0f 100644
--- a/src/together/resources/finetune.py
+++ b/src/together/resources/finetune.py
@@ -62,7 +62,7 @@ def createFinetuneRequest(
     learning_rate: float | None = 0.00001,
     lr_scheduler_type: Literal["linear", "cosine"] = "linear",
     min_lr_ratio: float = 0.0,
-    num_cycles: float = 0.5,
+    scheduler_num_cycles: float = 0.5,
     warmup_ratio: float = 0.0,
     max_grad_norm: float = 1.0,
     weight_decay: float = 0.0,
@@ -138,12 +138,12 @@ def createFinetuneRequest(
     lrScheduler: FinetuneLRScheduler = FinetuneLRScheduler(lr_scheduler_type="linear")
 
     if lr_scheduler_type == "cosine":
-        if num_cycles <= 0.0:
+        if scheduler_num_cycles <= 0.0:
             raise ValueError("Number of cycles should be greater than 0")
 
         lrScheduler = FinetuneCosineLRScheduler(
             lr_scheduler_args=FinetuneCosineLRSchedulerArgs(
-                min_lr_ratio=min_lr_ratio, num_cycles=num_cycles
+                min_lr_ratio=min_lr_ratio, num_cycles=scheduler_num_cycles
             ),
         )
     else:
@@ -263,7 +263,7 @@ def create(
         learning_rate: float | None = 0.00001,
         lr_scheduler_type: Literal["linear", "cosine"] = "linear",
         min_lr_ratio: float = 0.0,
-        num_cycles: float = 0.5,
+        scheduler_num_cycles: float = 0.5,
         warmup_ratio: float = 0.0,
         max_grad_norm: float = 1.0,
         weight_decay: float = 0.0,
@@ -301,7 +301,7 @@ def create(
             lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
-            num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
+            scheduler_num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
             warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
@@ -359,7 +359,7 @@ def create(
             learning_rate=learning_rate,
             lr_scheduler_type=lr_scheduler_type,
             min_lr_ratio=min_lr_ratio,
-            num_cycles=num_cycles,
+            scheduler_num_cycles=scheduler_num_cycles,
             warmup_ratio=warmup_ratio,
             max_grad_norm=max_grad_norm,
             weight_decay=weight_decay,
@@ -642,7 +642,7 @@ async def create(
         learning_rate: float | None = 0.00001,
         lr_scheduler_type: Literal["linear", "cosine"] = "linear",
         min_lr_ratio: float = 0.0,
-        num_cycles: float = 0.5,
+        scheduler_num_cycles: float = 0.5,
         warmup_ratio: float = 0.0,
         max_grad_norm: float = 1.0,
         weight_decay: float = 0.0,
@@ -680,7 +680,7 @@ async def create(
             lr_scheduler_type (Literal["linear", "cosine"]): Learning rate scheduler type. Defaults to "linear".
             min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
                 the learning rate scheduler. Defaults to 0.0.
-            num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
+            scheduler_num_cycles (float, optional): Number or fraction of cycles for the cosine learning rate scheduler. Defaults to 0.5.
             warmup_ratio (float, optional): Warmup ratio for the learning rate scheduler.
             max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
             weight_decay (float, optional): Weight decay. Defaults to 0.0.
@@ -739,7 +739,7 @@ async def create(
             learning_rate=learning_rate,
             lr_scheduler_type=lr_scheduler_type,
             min_lr_ratio=min_lr_ratio,
-            num_cycles=num_cycles,
+            scheduler_num_cycles=scheduler_num_cycles,
             warmup_ratio=warmup_ratio,
             max_grad_norm=max_grad_norm,
             weight_decay=weight_decay,

From 840233e10f1d3940bde09c64c501b64613ba1c83 Mon Sep 17 00:00:00 2001
From: Arsh Zahed
Date: Tue, 25 Mar 2025 00:37:21 -0700
Subject: [PATCH 8/8] Update version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 82f8090c..c5683567 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.5.1"
+version = "1.5.2"
 authors = [
     "Together AI "
 ]
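
Taken together, the series wires the cosine schedule through every layer: the --lr-scheduler-type and --scheduler-num-cycles flags on the CLI, the lr_scheduler_type, min_lr_ratio, and scheduler_num_cycles arguments on create(), and the FinetuneCosineLRScheduler / FinetuneCosineLRSchedulerArgs types on the wire. A minimal usage sketch of the state after patch 7, assuming the usual Together() client entry point; the API key, training file ID, and model name below are placeholders rather than values taken from the patches:

    from together import Together

    client = Together()  # assumes TOGETHER_API_KEY is set in the environment

    # Request the cosine schedule added in this series; training_file and model
    # are placeholder values for illustration only.
    job = client.fine_tuning.create(
        training_file="file-xxxxxxxxxxxx",
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",
        learning_rate=1e-5,
        lr_scheduler_type="cosine",
        min_lr_ratio=0.1,           # decay to 10% of the peak learning rate
        scheduler_num_cycles=0.5,   # half a cosine cycle: one decay, no restarts
    )
    print(job.id)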
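
The schedule itself is computed server-side and is not part of this diff, but min_lr_ratio and num_cycles conventionally parameterize a cosine curve that decays from the peak learning rate toward min_lr_ratio * peak, with num_cycles=0.5 meaning a single decay over training and larger values adding restarts (hence the "number or fraction of cycles" wording and the num_cycles > 0 check). An illustrative sketch of that common formulation, stated as an assumption rather than the Together backend's exact formula:

    import math

    def cosine_lr(step, total_steps, peak_lr, min_lr_ratio=0.0, num_cycles=0.5, warmup_ratio=0.0):
        """Common cosine-with-floor shape; assumed for illustration, not the backend formula."""
        warmup_steps = int(total_steps * warmup_ratio)
        if warmup_steps and step < warmup_steps:
            return peak_lr * step / warmup_steps  # linear warmup to the peak
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        cosine = 0.5 * (1.0 + math.cos(2.0 * math.pi * num_cycles * progress))
        floor = peak_lr * min_lr_ratio
        return floor + (peak_lr - floor) * cosine

    print(cosine_lr(0, 100, 1e-5, min_lr_ratio=0.1))    # ~1e-05 at the peak
    print(cosine_lr(100, 100, 1e-5, min_lr_ratio=0.1))  # ~1e-06 at the floor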