From b22a58a57c983c201832aec1feede5e33c4b0650 Mon Sep 17 00:00:00 2001
From: Ives van Hoorne
Date: Wed, 12 Mar 2025 17:44:35 +0100
Subject: [PATCH 1/5] feat: support auto-shutdown

---
 src/together/cli/api/endpoints.py   | 21 ++++++++++++++++++++-
 src/together/resources/endpoints.py | 28 ++++++++++++++++++++++++----
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/together/cli/api/endpoints.py b/src/together/cli/api/endpoints.py
index 3d306063..f9634f9c 100644
--- a/src/together/cli/api/endpoints.py
+++ b/src/together/cli/api/endpoints.py
@@ -127,6 +127,11 @@ def endpoints(ctx: click.Context) -> None:
     is_flag=True,
     help="Create the endpoint in STOPPED state instead of auto-starting it",
 )
+@click.option(
+    "--inactive-timeout",
+    type=int,
+    help="Number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable.",
+)
 @click.option(
     "--wait",
     is_flag=True,
@@ -146,6 +151,7 @@ def create(
     no_prompt_cache: bool,
     no_speculative_decoding: bool,
     no_auto_start: bool,
+    inactive_timeout: int | None,
     wait: bool,
 ) -> None:
     """Create a new dedicated inference endpoint."""
@@ -170,6 +176,7 @@ def create(
             disable_prompt_cache=no_prompt_cache,
             disable_speculative_decoding=no_speculative_decoding,
             state="STOPPED" if no_auto_start else "STARTED",
+            inactive_timeout=inactive_timeout,
         )
     except InvalidRequestError as e:
         print_api_error(e)
@@ -194,6 +201,8 @@ def create(
         click.echo(" Speculative decoding: disabled", err=True)
     if no_auto_start:
         click.echo(" Auto-start: disabled", err=True)
+    if inactive_timeout is not None:
+        click.echo(f" Inactive timeout: {inactive_timeout} minutes", err=True)

     click.echo(f"Endpoint created successfully, id: {response.id}", err=True)

@@ -371,6 +380,11 @@ def list(
     type=int,
     help="New maximum number of replicas to scale up to",
 )
+@click.option(
+    "--inactive-timeout",
+    type=int,
+    help="Number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable.",
+)
 @click.pass_obj
 @handle_api_errors
 def update(
@@ -379,9 +393,10 @@ def update(
     display_name: str | None,
     min_replicas: int | None,
     max_replicas: int | None,
+    inactive_timeout: int | None,
 ) -> None:
     """Update a dedicated inference endpoint's configuration."""

-    if not any([display_name, min_replicas, max_replicas]):
+    if all(x is None for x in [display_name, min_replicas, max_replicas, inactive_timeout]):
         click.echo("Error: At least one update option must be specified", err=True)
         sys.exit(1)
@@ -400,6 +415,8 @@ def update(
     if min_replicas is not None and max_replicas is not None:
         kwargs["min_replicas"] = min_replicas
         kwargs["max_replicas"] = max_replicas
+    if inactive_timeout is not None:
+        kwargs["inactive_timeout"] = inactive_timeout

     _response = client.endpoints.update(endpoint_id, **kwargs)

@@ -410,6 +427,8 @@ def update(
     if min_replicas is not None and max_replicas is not None:
         click.echo(f" Min replicas: {min_replicas}", err=True)
         click.echo(f" Max replicas: {max_replicas}", err=True)
+    if inactive_timeout is not None:
+        click.echo(f" Inactive timeout: {inactive_timeout} minutes", err=True)

     click.echo("Successfully updated endpoint", err=True)
     click.echo(endpoint_id)
diff --git a/src/together/resources/endpoints.py b/src/together/resources/endpoints.py
index 176894f5..5d8f9a49 100644
--- a/src/together/resources/endpoints.py
+++ b/src/together/resources/endpoints.py
@@ -59,6 +59,7 @@ def create(
         disable_prompt_cache: bool = False,
         disable_speculative_decoding: bool = False,
         state: Literal["STARTED", "STOPPED"] = "STARTED",
+        inactive_timeout: Optional[int] = None,
     ) -> DedicatedEndpoint:
         """
         Create a new dedicated endpoint.
@@ -72,6 +73,7 @@ def create(
             disable_prompt_cache (bool, optional): Whether to disable the prompt cache. Defaults to False.
             disable_speculative_decoding (bool, optional): Whether to disable speculative decoding. Defaults to False.
             state (str, optional): The desired state of the endpoint. Defaults to "STARTED".
+            inactive_timeout (int, optional): The number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable automatic timeout.

         Returns:
             DedicatedEndpoint: Object containing endpoint information
@@ -80,7 +82,7 @@ def create(
             client=self._client,
         )

-        data: Dict[str, Union[str, bool, Dict[str, int]]] = {
+        data: Dict[str, Union[str, bool, Dict[str, int], int]] = {
             "model": model,
             "hardware": hardware,
             "autoscaling": {
@@ -95,6 +97,9 @@ def create(
         if display_name is not None:
             data["display_name"] = display_name

+        if inactive_timeout is not None:
+            data["inactive_timeout"] = inactive_timeout
+
         response, _, _ = requestor.request(
             options=TogetherRequest(
                 method="POST",
@@ -161,6 +166,7 @@ def update(
         max_replicas: Optional[int] = None,
         state: Optional[Literal["STARTED", "STOPPED"]] = None,
         display_name: Optional[str] = None,
+        inactive_timeout: Optional[int] = None,
     ) -> DedicatedEndpoint:
         """
         Update an endpoint's configuration.
@@ -171,6 +177,7 @@ def update(
             max_replicas (int, optional): The maximum number of replicas to scale up to
             state (str, optional): The desired state of the endpoint ("STARTED" or "STOPPED")
             display_name (str, optional): A human-readable name for the endpoint
+            inactive_timeout (int, optional): The number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable automatic timeout.

         Returns:
             DedicatedEndpoint: Object containing endpoint information
@@ -179,7 +186,7 @@ def update(
             client=self._client,
         )

-        data: Dict[str, Union[str, Dict[str, int]]] = {}
+        data: Dict[str, Union[str, Dict[str, int], int]] = {}

         if min_replicas is not None or max_replicas is not None:
             current_min = min_replicas
@@ -200,6 +207,9 @@ def update(
         if display_name is not None:
             data["display_name"] = display_name

+        if inactive_timeout is not None:
+            data["inactive_timeout"] = inactive_timeout
+
         response, _, _ = requestor.request(
             options=TogetherRequest(
                 method="PATCH",
@@ -297,6 +307,7 @@ async def create(
         disable_prompt_cache: bool = False,
         disable_speculative_decoding: bool = False,
         state: Literal["STARTED", "STOPPED"] = "STARTED",
+        inactive_timeout: Optional[int] = None,
     ) -> DedicatedEndpoint:
         """
         Create a new dedicated endpoint.
@@ -310,6 +321,7 @@ async def create(
             disable_prompt_cache (bool, optional): Whether to disable the prompt cache. Defaults to False.
             disable_speculative_decoding (bool, optional): Whether to disable speculative decoding. Defaults to False.
             state (str, optional): The desired state of the endpoint. Defaults to "STARTED".
+            inactive_timeout (int, optional): The number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable automatic timeout.

         Returns:
             DedicatedEndpoint: Object containing endpoint information
@@ -318,7 +330,7 @@ async def create(
             client=self._client,
         )

-        data: Dict[str, Union[str, bool, Dict[str, int]]] = {
+        data: Dict[str, Union[str, bool, Dict[str, int], int]] = {
             "model": model,
             "hardware": hardware,
             "autoscaling": {
@@ -333,6 +345,9 @@ async def create(
         if display_name is not None:
             data["display_name"] = display_name

+        if inactive_timeout is not None:
+            data["inactive_timeout"] = inactive_timeout
+
         response, _, _ = await requestor.arequest(
             options=TogetherRequest(
                 method="POST",
@@ -399,6 +414,7 @@ async def update(
         max_replicas: Optional[int] = None,
         state: Optional[Literal["STARTED", "STOPPED"]] = None,
         display_name: Optional[str] = None,
+        inactive_timeout: Optional[int] = None,
     ) -> DedicatedEndpoint:
         """
         Update an endpoint's configuration.
@@ -409,6 +425,7 @@ async def update(
             max_replicas (int, optional): The maximum number of replicas to scale up to
             state (str, optional): The desired state of the endpoint ("STARTED" or "STOPPED")
             display_name (str, optional): A human-readable name for the endpoint
+            inactive_timeout (int, optional): The number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable automatic timeout.

         Returns:
             DedicatedEndpoint: Object containing endpoint information
@@ -417,7 +434,7 @@ async def update(
             client=self._client,
         )

-        data: Dict[str, Union[str, Dict[str, int]]] = {}
+        data: Dict[str, Union[str, Dict[str, int], int]] = {}

         if min_replicas is not None or max_replicas is not None:
             current_min = min_replicas
@@ -438,6 +455,9 @@ async def update(
         if display_name is not None:
             data["display_name"] = display_name

+        if inactive_timeout is not None:
+            data["inactive_timeout"] = inactive_timeout
+
         response, _, _ = await requestor.arequest(
             options=TogetherRequest(
                 method="PATCH",

From 009b0b24a86de14fc9cd7b343a926bc9f3d5400a Mon Sep 17 00:00:00 2001
From: Ives van Hoorne
Date: Wed, 12 Mar 2025 17:58:20 +0100
Subject: [PATCH 2/5] feat: add support for --type dedicated

---
 src/together/cli/api/models.py   |  9 +++-
 src/together/resources/models.py | 71 ++++++++++++++++++++++++++++----
 2 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/src/together/cli/api/models.py b/src/together/cli/api/models.py
index 78c0f648..c4a110c2 100644
--- a/src/together/cli/api/models.py
+++ b/src/together/cli/api/models.py
@@ -15,12 +15,17 @@ def models(ctx: click.Context) -> None:


 @models.command()
+@click.option(
+    "--type",
+    type=click.Choice(["dedicated"]),
+    help="Filter models by type (dedicated: models that support autoscaling)",
+)
 @click.pass_context
-def list(ctx: click.Context) -> None:
+def list(ctx: click.Context, type: str | None) -> None:
     """List models"""
     client: Together = ctx.obj

-    response = client.models.list()
+    response = client.models.list(dedicated=(type == "dedicated"))

     display_list = []

diff --git a/src/together/resources/models.py b/src/together/resources/models.py
index 9a85e9bb..56241e67 100644
--- a/src/together/resources/models.py
+++ b/src/together/resources/models.py
@@ -11,20 +11,47 @@
 )


-class Models:
+class ModelsBase:
     def __init__(self, client: TogetherClient) -> None:
         self._client = client

+    def _filter_dedicated_models(
+        self, models: List[ModelObject], dedicated_response: TogetherResponse
+    ) -> List[ModelObject]:
+        """
+        Filter models based on dedicated model response.
+
+        Args:
+            models (List[ModelObject]): List of all models
+            dedicated_response (TogetherResponse): Response from autoscale models endpoint
+
+        Returns:
+            List[ModelObject]: Filtered list of models
+        """
+        assert isinstance(dedicated_response.data, list)
+
+        # Create a set of dedicated model names for efficient lookup
+        dedicated_model_names = {model["name"] for model in dedicated_response.data}
+
+        # Filter models to only include those in dedicated_model_names
+        # Note: The model.id from ModelObject matches the name field in the autoscale response
+        return [model for model in models if model.id in dedicated_model_names]
+
+
+class Models(ModelsBase):
     def list(
         self,
+        dedicated: bool = False,
     ) -> List[ModelObject]:
         """
         Method to return list of models on the API

+        Args:
+            dedicated (bool, optional): If True, returns only dedicated models. Defaults to False.
+
         Returns:
             List[ModelObject]: List of model objects
         """
-
         requestor = api_requestor.APIRequestor(
             client=self._client,
         )
@@ -40,23 +67,37 @@ def list(
         assert isinstance(response, TogetherResponse)
         assert isinstance(response.data, list)

-        return [ModelObject(**model) for model in response.data]
+        models = [ModelObject(**model) for model in response.data]

+        if dedicated:
+            # Get dedicated models
+            dedicated_response, _, _ = requestor.request(
+                options=TogetherRequest(
+                    method="GET",
+                    url="autoscale/models",
+                ),
+                stream=False,
+            )
+
+            models = self._filter_dedicated_models(models, dedicated_response)
+
+        return models


-class AsyncModels:
-    def __init__(self, client: TogetherClient) -> None:
-        self._client = client
-
+class AsyncModels(ModelsBase):
     async def list(
         self,
+        dedicated: bool = False,
     ) -> List[ModelObject]:
         """
         Async method to return list of models on API

+        Args:
+            dedicated (bool, optional): If True, returns only dedicated models. Defaults to False.
+
         Returns:
             List[ModelObject]: List of model objects
         """
-
         requestor = api_requestor.APIRequestor(
             client=self._client,
         )
@@ -72,4 +113,18 @@ async def list(
         assert isinstance(response, TogetherResponse)
         assert isinstance(response.data, list)

-        return [ModelObject(**model) for model in response.data]
+        models = [ModelObject(**model) for model in response.data]
+
+        if dedicated:
+            # Get dedicated models
+            dedicated_response, _, _ = await requestor.arequest(
+                options=TogetherRequest(
+                    method="GET",
+                    url="autoscale/models",
+                ),
+                stream=False,
+            )
+
+            models = self._filter_dedicated_models(models, dedicated_response)
+
+        return models

From 97874dad25b3c8516637e389169d6e949aaee87d Mon Sep 17 00:00:00 2001
From: Ives van Hoorne
Date: Thu, 13 Mar 2025 13:48:51 +0100
Subject: [PATCH 3/5] fix: make list readable by programs

---
 src/together/cli/api/models.py   | 8 ++++----
 src/together/resources/models.py | 4 ++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/together/cli/api/models.py b/src/together/cli/api/models.py
index c4a110c2..72ee03a7 100644
--- a/src/together/cli/api/models.py
+++ b/src/together/cli/api/models.py
@@ -33,15 +33,15 @@ def list(ctx: click.Context, type: str | None) -> None:
     for model in response:
         display_list.append(
             {
-                "ID": "\n".join(wrap(model.id or "", width=30)),
-                "Name": "\n".join(wrap(model.display_name or "", width=30)),
+                "ID": model.id,
+                "Name": model.display_name,
                 "Organization": model.organization,
                 "Type": model.type,
                 "Context Length": model.context_length,
-                "License": "\n".join(wrap(model.license or "", width=30)),
+                "License": model.license,
                 "Input per 1M token": model.pricing.input,
                 "Output per 1M token": model.pricing.output,
             }
         )

-    click.echo(tabulate(display_list, headers="keys", tablefmt="grid"))
+    click.echo(tabulate(display_list, headers="keys", tablefmt="plain"))
diff --git a/src/together/resources/models.py b/src/together/resources/models.py
index 56241e67..1e16c9a8 100644
--- a/src/together/resources/models.py
+++ b/src/together/resources/models.py
@@ -81,6 +81,8 @@ def list(

             models = self._filter_dedicated_models(models, dedicated_response)

+        models.sort(key=lambda x: x.id.lower())
+
         return models


@@ -127,4 +129,6 @@ async def list(

             models = self._filter_dedicated_models(models, dedicated_response)

+        models.sort(key=lambda x: x.id.lower())
+
         return models

From a8ff1c3a1e9ad56f3fd9f83ef489026fb227b886 Mon Sep 17 00:00:00 2001
From: Ives van Hoorne
Date: Thu, 13 Mar 2025 13:51:47 +0100
Subject: [PATCH 4/5] add --json output

---
 src/together/cli/api/models.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/together/cli/api/models.py b/src/together/cli/api/models.py
index 72ee03a7..5661d55f 100644
--- a/src/together/cli/api/models.py
+++ b/src/together/cli/api/models.py
@@ -1,4 +1,4 @@
-from textwrap import wrap
+import json as json_lib

 import click
 from tabulate import tabulate
@@ -20,8 +20,13 @@ def models(ctx: click.Context) -> None:
     type=click.Choice(["dedicated"]),
     help="Filter models by type (dedicated: models that support autoscaling)",
 )
+@click.option(
+    "--json",
+    is_flag=True,
+    help="Output in JSON format",
+)
 @click.pass_context
-def list(ctx: click.Context, type: str | None) -> None:
+def list(ctx: click.Context, type: str | None, json: bool) -> None:
     """List models"""
     client: Together = ctx.obj
@@ -44,4 +49,7 @@ def list(ctx: click.Context, type: str | None) -> None:
             }
         )

-    click.echo(tabulate(display_list, headers="keys", tablefmt="plain"))
+    if json:
+        click.echo(json_lib.dumps(display_list, indent=2))
+    else:
+        click.echo(tabulate(display_list, headers="keys", tablefmt="plain"))

From e43930315432a0ae79fd79e5d2a61b450c460fbe Mon Sep 17 00:00:00 2001
From: Ives van Hoorne
Date: Thu, 13 Mar 2025 14:22:30 +0100
Subject: [PATCH 5/5] fix description

---
 src/together/cli/api/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/together/cli/api/models.py b/src/together/cli/api/models.py
index 5661d55f..b807a1d4 100644
--- a/src/together/cli/api/models.py
+++ b/src/together/cli/api/models.py
@@ -18,7 +18,7 @@ def models(ctx: click.Context) -> None:
 @click.option(
     "--type",
     type=click.Choice(["dedicated"]),
-    help="Filter models by type (dedicated: models that support autoscaling)",
+    help="Filter models by type (dedicated: models that can be deployed as dedicated endpoints)",
 )
 @click.option(
     "--json",
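
A minimal SDK-side usage sketch of what these five patches add, for reference. The model and hardware strings are placeholder values, endpoints.create may require additional arguments (such as replica counts) that are not visible in these hunks, and a configured TOGETHER_API_KEY is assumed:

    from together import Together

    client = Together()

    # Only models that can back a dedicated endpoint (PATCH 2/5)
    dedicated_models = client.models.list(dedicated=True)

    # Create an endpoint that auto-stops after 30 idle minutes (PATCH 1/5);
    # the model and hardware identifiers below are placeholders
    endpoint = client.endpoints.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        hardware="1x_nvidia_a100_80gb_sxm",
        inactive_timeout=30,
    )

    # Per the docstrings, setting the timeout to 0 disables auto-shutdown
    client.endpoints.update(endpoint.id, inactive_timeout=0)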