From b22a58a57c983c201832aec1feede5e33c4b0650 Mon Sep 17 00:00:00 2001
From: Ives van Hoorne
Date: Wed, 12 Mar 2025 17:44:35 +0100
Subject: [PATCH 1/5] feat: support auto-shutdown

---
 src/together/cli/api/endpoints.py   | 21 ++++++++++++++++++++-
 src/together/resources/endpoints.py | 28 ++++++++++++++++++++++++----
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/together/cli/api/endpoints.py b/src/together/cli/api/endpoints.py
index 3d306063..f9634f9c 100644
--- a/src/together/cli/api/endpoints.py
+++ b/src/together/cli/api/endpoints.py
@@ -127,6 +127,11 @@ def endpoints(ctx: click.Context) -> None:
     is_flag=True,
     help="Create the endpoint in STOPPED state instead of auto-starting it",
 )
+@click.option(
+    "--inactive-timeout",
+    type=int,
+    help="Number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable.",
+)
 @click.option(
     "--wait",
     is_flag=True,
@@ -146,6 +151,7 @@ def create(
     no_prompt_cache: bool,
     no_speculative_decoding: bool,
     no_auto_start: bool,
+    inactive_timeout: int | None,
     wait: bool,
 ) -> None:
     """Create a new dedicated inference endpoint."""
@@ -170,6 +176,7 @@ def create(
             disable_prompt_cache=no_prompt_cache,
             disable_speculative_decoding=no_speculative_decoding,
             state="STOPPED" if no_auto_start else "STARTED",
+            inactive_timeout=inactive_timeout,
         )
     except InvalidRequestError as e:
         print_api_error(e)
@@ -194,6 +201,8 @@ def create(
         click.echo(" Speculative decoding: disabled", err=True)
     if no_auto_start:
         click.echo(" Auto-start: disabled", err=True)
+    if inactive_timeout is not None:
+        click.echo(f" Inactive timeout: {inactive_timeout} minutes", err=True)

     click.echo(f"Endpoint created successfully, id: {response.id}", err=True)

@@ -371,6 +380,11 @@ def list(
     type=int,
     help="New maximum number of replicas to scale up to",
 )
+@click.option(
+    "--inactive-timeout",
+    type=int,
+    help="Number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable.",
+)
 @click.pass_obj
 @handle_api_errors
 def update(
@@ -379,9 +393,10 @@ def update(
     display_name: str | None,
     min_replicas: int | None,
     max_replicas: int | None,
+    inactive_timeout: int | None,
 ) -> None:
     """Update a dedicated inference endpoint's configuration."""

-    if not any([display_name, min_replicas, max_replicas]):
+    if all(x is None for x in [display_name, min_replicas, max_replicas, inactive_timeout]):
         click.echo("Error: At least one update option must be specified", err=True)
         sys.exit(1)
@@ -400,6 +415,8 @@ def update(
     if min_replicas is not None and max_replicas is not None:
         kwargs["min_replicas"] = min_replicas
         kwargs["max_replicas"] = max_replicas
+    if inactive_timeout is not None:
+        kwargs["inactive_timeout"] = inactive_timeout

     _response = client.endpoints.update(endpoint_id, **kwargs)

@@ -410,6 +427,8 @@ def update(
     if min_replicas is not None and max_replicas is not None:
         click.echo(f" Min replicas: {min_replicas}", err=True)
         click.echo(f" Max replicas: {max_replicas}", err=True)
+    if inactive_timeout is not None:
+        click.echo(f" Inactive timeout: {inactive_timeout} minutes", err=True)

     click.echo("Successfully updated endpoint", err=True)
     click.echo(endpoint_id)
diff --git a/src/together/resources/endpoints.py b/src/together/resources/endpoints.py
index 176894f5..5d8f9a49 100644
--- a/src/together/resources/endpoints.py
+++ b/src/together/resources/endpoints.py
@@ -59,6 +59,7 @@ def create(
         disable_prompt_cache: bool = False,
         disable_speculative_decoding: bool = False,
         state: Literal["STARTED", "STOPPED"] = "STARTED",
+        inactive_timeout: Optional[int] = None,
     ) -> DedicatedEndpoint:
         """
         Create a new dedicated endpoint.
@@ -72,6 +73,7 @@ def create(
             disable_prompt_cache (bool, optional): Whether to disable the prompt cache. Defaults to False.
             disable_speculative_decoding (bool, optional): Whether to disable speculative decoding. Defaults to False.
             state (str, optional): The desired state of the endpoint. Defaults to "STARTED".
+            inactive_timeout (int, optional): The number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable automatic timeout.

         Returns:
             DedicatedEndpoint: Object containing endpoint information
@@ -80,7 +82,7 @@ def create(
             client=self._client,
         )

-        data: Dict[str, Union[str, bool, Dict[str, int]]] = {
+        data: Dict[str, Union[str, bool, Dict[str, int], int]] = {
             "model": model,
             "hardware": hardware,
             "autoscaling": {
@@ -95,6 +97,9 @@ def create(
         if display_name is not None:
             data["display_name"] = display_name

+        if inactive_timeout is not None:
+            data["inactive_timeout"] = inactive_timeout
+
         response, _, _ = requestor.request(
             options=TogetherRequest(
                 method="POST",
@@ -161,6 +166,7 @@ def update(
         max_replicas: Optional[int] = None,
         state: Optional[Literal["STARTED", "STOPPED"]] = None,
         display_name: Optional[str] = None,
+        inactive_timeout: Optional[int] = None,
     ) -> DedicatedEndpoint:
         """
         Update an endpoint's configuration.
@@ -171,6 +177,7 @@ def update(
             max_replicas (int, optional): The maximum number of replicas to scale up to
             state (str, optional): The desired state of the endpoint ("STARTED" or "STOPPED")
             display_name (str, optional): A human-readable name for the endpoint
+            inactive_timeout (int, optional): The number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable automatic timeout.

         Returns:
             DedicatedEndpoint: Object containing endpoint information
@@ -179,7 +186,7 @@ def update(
             client=self._client,
         )

-        data: Dict[str, Union[str, Dict[str, int]]] = {}
+        data: Dict[str, Union[str, Dict[str, int], int]] = {}

         if min_replicas is not None or max_replicas is not None:
             current_min = min_replicas
@@ -200,6 +207,9 @@ def update(
         if display_name is not None:
             data["display_name"] = display_name

+        if inactive_timeout is not None:
+            data["inactive_timeout"] = inactive_timeout
+
         response, _, _ = requestor.request(
             options=TogetherRequest(
                 method="PATCH",
@@ -297,6 +307,7 @@ async def create(
         disable_prompt_cache: bool = False,
         disable_speculative_decoding: bool = False,
         state: Literal["STARTED", "STOPPED"] = "STARTED",
+        inactive_timeout: Optional[int] = None,
     ) -> DedicatedEndpoint:
         """
         Create a new dedicated endpoint.
@@ -310,6 +321,7 @@ async def create(
             disable_prompt_cache (bool, optional): Whether to disable the prompt cache. Defaults to False.
             disable_speculative_decoding (bool, optional): Whether to disable speculative decoding. Defaults to False.
             state (str, optional): The desired state of the endpoint. Defaults to "STARTED".
+            inactive_timeout (int, optional): The number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable automatic timeout.

         Returns:
             DedicatedEndpoint: Object containing endpoint information
@@ -318,7 +330,7 @@ async def create(
             client=self._client,
         )

-        data: Dict[str, Union[str, bool, Dict[str, int]]] = {
+        data: Dict[str, Union[str, bool, Dict[str, int], int]] = {
             "model": model,
             "hardware": hardware,
             "autoscaling": {
@@ -333,6 +345,9 @@ async def create(
         if display_name is not None:
             data["display_name"] = display_name

+        if inactive_timeout is not None:
+            data["inactive_timeout"] = inactive_timeout
+
         response, _, _ = await requestor.arequest(
             options=TogetherRequest(
                 method="POST",
@@ -399,6 +414,7 @@ async def update(
         max_replicas: Optional[int] = None,
         state: Optional[Literal["STARTED", "STOPPED"]] = None,
         display_name: Optional[str] = None,
+        inactive_timeout: Optional[int] = None,
     ) -> DedicatedEndpoint:
         """
         Update an endpoint's configuration.
@@ -409,6 +425,7 @@ async def update(
             max_replicas (int, optional): The maximum number of replicas to scale up to
             state (str, optional): The desired state of the endpoint ("STARTED" or "STOPPED")
             display_name (str, optional): A human-readable name for the endpoint
+            inactive_timeout (int, optional): The number of minutes of inactivity after which the endpoint will be automatically stopped. Set to 0 to disable automatic timeout.

         Returns:
             DedicatedEndpoint: Object containing endpoint information
@@ -417,7 +434,7 @@ async def update(
             client=self._client,
         )

-        data: Dict[str, Union[str, Dict[str, int]]] = {}
+        data: Dict[str, Union[str, Dict[str, int], int]] = {}

         if min_replicas is not None or max_replicas is not None:
             current_min = min_replicas
@@ -438,6 +455,9 @@ async def update(
         if display_name is not None:
             data["display_name"] = display_name

+        if inactive_timeout is not None:
+            data["inactive_timeout"] = inactive_timeout
+
         response, _, _ = await requestor.arequest(
             options=TogetherRequest(
                 method="PATCH",

From 009b0b24a86de14fc9cd7b343a926bc9f3d5400a Mon Sep 17 00:00:00 2001
From: Ives van Hoorne
Date: Wed, 12 Mar 2025 17:58:20 +0100
Subject: [PATCH 2/5] feat: add support for --type dedicated

---
 src/together/cli/api/models.py   |  9 +++-
 src/together/resources/models.py | 71 ++++++++++++++++++++++++++++----
 2 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/src/together/cli/api/models.py b/src/together/cli/api/models.py
index 78c0f648..c4a110c2 100644
--- a/src/together/cli/api/models.py
+++ b/src/together/cli/api/models.py
@@ -15,12 +15,17 @@ def models(ctx: click.Context) -> None:


 @models.command()
+@click.option(
+    "--type",
+    type=click.Choice(["dedicated"]),
+    help="Filter models by type (dedicated: models that support autoscaling)",
+)
 @click.pass_context
-def list(ctx: click.Context) -> None:
+def list(ctx: click.Context, type: str | None) -> None:
     """List models"""
     client: Together = ctx.obj

-    response = client.models.list()
+    response = client.models.list(dedicated=(type == "dedicated"))

     display_list = []

diff --git a/src/together/resources/models.py b/src/together/resources/models.py
index 9a85e9bb..56241e67 100644
--- a/src/together/resources/models.py
+++ b/src/together/resources/models.py
@@ -11,20 +11,47 @@
 )


-class Models:
+class ModelsBase:
     def __init__(self, client: TogetherClient) -> None:
         self._client = client

+    def _filter_dedicated_models(
+        self, models: List[ModelObject], dedicated_response: TogetherResponse
+    ) -> List[ModelObject]:
+        """
+        Filter models based on dedicated model response.
+
+        Args:
+            models (List[ModelObject]): List of all models
+            dedicated_response (TogetherResponse): Response from autoscale models endpoint
+
+        Returns:
+            List[ModelObject]: Filtered list of models
+        """
+        assert isinstance(dedicated_response.data, list)
+
+        # Create a set of dedicated model names for efficient lookup
+        dedicated_model_names = {model["name"] for model in dedicated_response.data}
+
+        # Filter models to only include those in dedicated_model_names
+        # Note: The model.id from ModelObject matches the name field in the autoscale response
+        return [model for model in models if model.id in dedicated_model_names]
+
+
+class Models(ModelsBase):
     def list(
         self,
+        dedicated: bool = False,
     ) -> List[ModelObject]:
         """
         Method to return list of models on the API

+        Args:
+            dedicated (bool, optional): If True, returns only dedicated models. Defaults to False.
+
         Returns:
             List[ModelObject]: List of model objects
         """
-
         requestor = api_requestor.APIRequestor(
             client=self._client,
         )
@@ -40,23 +67,37 @@ def list(
         assert isinstance(response, TogetherResponse)
         assert isinstance(response.data, list)

-        return [ModelObject(**model) for model in response.data]
+        models = [ModelObject(**model) for model in response.data]

+        if dedicated:
+            # Get dedicated models
+            dedicated_response, _, _ = requestor.request(
+                options=TogetherRequest(
+                    method="GET",
+                    url="autoscale/models",
+                ),
+                stream=False,
+            )
+
+            models = self._filter_dedicated_models(models, dedicated_response)
+
+        return models


-class AsyncModels:
-    def __init__(self, client: TogetherClient) -> None:
-        self._client = client
-
+class AsyncModels(ModelsBase):
     async def list(
         self,
+        dedicated: bool = False,
     ) -> List[ModelObject]:
         """
         Async method to return list of models on API

+        Args:
+            dedicated (bool, optional): If True, returns only dedicated models. Defaults to False.
+
         Returns:
             List[ModelObject]: List of model objects
         """
-
         requestor = api_requestor.APIRequestor(
             client=self._client,
         )
@@ -72,4 +113,18 @@ async def list(
         assert isinstance(response, TogetherResponse)
         assert isinstance(response.data, list)

-        return [ModelObject(**model) for model in response.data]
+        models = [ModelObject(**model) for model in response.data]
+
+        if dedicated:
+            # Get dedicated models
+            dedicated_response, _, _ = await requestor.arequest(
+                options=TogetherRequest(
+                    method="GET",
+                    url="autoscale/models",
+                ),
+                stream=False,
+            )
+
+            models = self._filter_dedicated_models(models, dedicated_response)
+
+        return models

From 97874dad25b3c8516637e389169d6e949aaee87d Mon Sep 17 00:00:00 2001
From: Ives van Hoorne
Date: Thu, 13 Mar 2025 13:48:51 +0100
Subject: [PATCH 3/5] fix: make list readable by programs

---
 src/together/cli/api/models.py   | 8 ++++----
 src/together/resources/models.py | 4 ++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/together/cli/api/models.py b/src/together/cli/api/models.py
index c4a110c2..72ee03a7 100644
--- a/src/together/cli/api/models.py
+++ b/src/together/cli/api/models.py
@@ -33,15 +33,15 @@ def list(ctx: click.Context, type: str | None) -> None:
     for model in response:
         display_list.append(
             {
-                "ID": "\n".join(wrap(model.id or "", width=30)),
-                "Name": "\n".join(wrap(model.display_name or "", width=30)),
+                "ID": model.id,
+                "Name": model.display_name,
                 "Organization": model.organization,
                 "Type": model.type,
                 "Context Length": model.context_length,
-                "License": "\n".join(wrap(model.license or "", width=30)),
+                "License": model.license,
                 "Input per 1M token": model.pricing.input,
                 "Output per 1M token": model.pricing.output,
             }
         )

-    click.echo(tabulate(display_list, headers="keys", tablefmt="grid"))
+    click.echo(tabulate(display_list, headers="keys", tablefmt="plain"))
diff --git a/src/together/resources/models.py b/src/together/resources/models.py
index 56241e67..1e16c9a8 100644
--- a/src/together/resources/models.py
+++ b/src/together/resources/models.py
@@ -81,6 +81,8 @@ def list(

             models = self._filter_dedicated_models(models, dedicated_response)

+        models.sort(key=lambda x: x.id.lower())
+
         return models


@@ -127,4 +129,6 @@ async def list(

             models = self._filter_dedicated_models(models, dedicated_response)

+        models.sort(key=lambda x: x.id.lower())
+
         return models

From a8ff1c3a1e9ad56f3fd9f83ef489026fb227b886 Mon Sep 17 00:00:00 2001
From: Ives van Hoorne
Date: Thu, 13 Mar 2025 13:51:47 +0100
Subject: [PATCH 4/5] add --json output

---
 src/together/cli/api/models.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/together/cli/api/models.py b/src/together/cli/api/models.py
index 72ee03a7..5661d55f 100644
--- a/src/together/cli/api/models.py
+++ b/src/together/cli/api/models.py
@@ -1,4 +1,4 @@
-from textwrap import wrap
+import json as json_lib

 import click
 from tabulate import tabulate
@@ -20,8 +20,13 @@ def models(ctx: click.Context) -> None:
     type=click.Choice(["dedicated"]),
     help="Filter models by type (dedicated: models that support autoscaling)",
 )
+@click.option(
+    "--json",
+    is_flag=True,
+    help="Output in JSON format",
+)
 @click.pass_context
-def list(ctx: click.Context, type: str | None) -> None:
+def list(ctx: click.Context, type: str | None, json: bool) -> None:
     """List models"""
     client: Together = ctx.obj
@@ -44,4 +49,7 @@ def list(ctx: click.Context, type: str | None) -> None:
             }
         )

-    click.echo(tabulate(display_list, headers="keys", tablefmt="plain"))
+    if json:
+        click.echo(json_lib.dumps(display_list, indent=2))
+    else:
+        click.echo(tabulate(display_list, headers="keys", tablefmt="plain"))

From e43930315432a0ae79fd79e5d2a61b450c460fbe Mon Sep 17 00:00:00 2001
From: Ives van Hoorne
Date: Thu, 13 Mar 2025 14:22:30 +0100
Subject: [PATCH 5/5] fix description

---
 src/together/cli/api/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/together/cli/api/models.py b/src/together/cli/api/models.py
index 5661d55f..b807a1d4 100644
--- a/src/together/cli/api/models.py
+++ b/src/together/cli/api/models.py
@@ -18,7 +18,7 @@ def models(ctx: click.Context) -> None:
 @click.option(
     "--type",
     type=click.Choice(["dedicated"]),
-    help="Filter models by type (dedicated: models that support autoscaling)",
+    help="Filter models by type (dedicated: models that can be deployed as dedicated endpoints)",
 )
 @click.option(
     "--json",
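
A minimal SDK-side usage sketch of what these five patches add, for reference. The model and hardware strings are placeholder values, endpoints.create may require additional arguments (such as replica counts) that are not visible in these hunks, and a configured TOGETHER_API_KEY is assumed:

    from together import Together

    client = Together()

    # Only models that can back a dedicated endpoint (PATCH 2/5)
    dedicated_models = client.models.list(dedicated=True)

    # Create an endpoint that auto-stops after 30 idle minutes (PATCH 1/5);
    # the model and hardware identifiers below are placeholders
    endpoint = client.endpoints.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        hardware="1x_nvidia_a100_80gb_sxm",
        inactive_timeout=30,
    )

    # Per the docstrings, setting the timeout to 0 disables auto-shutdown
    client.endpoints.update(endpoint.id, inactive_timeout=0)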