From a369cf9fc975981a96649aedf20ebbfa17e67f5a Mon Sep 17 00:00:00 2001
From: peterschmidt85
Date: Wed, 18 Feb 2026 14:21:15 +0100
Subject: [PATCH] Clarify why GPU vendor default inference is split between
 client and server; add TODOs on how this should change in the future (move
 resource defaults to the server).

---
 .../cli/services/configurators/run.py      | 19 ++++++++++---------
 .../_internal/server/services/resources.py | 14 +++++++++++---
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py
index 0322cac22..6fc427a38 100644
--- a/src/dstack/_internal/cli/services/configurators/run.py
+++ b/src/dstack/_internal/cli/services/configurators/run.py
@@ -391,10 +391,14 @@ def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None:
         Infers GPU vendor if not set. Defaults to Nvidia when using the default CUDA image.
         Requires explicit `image` if the vendor is AMD or Tenstorrent.
 
-        NOTE: We don't set the inferred vendor on gpu_spec for compatibility with
-        older servers. Servers set the vendor using the same logic in
-        set_resources_defaults(). The inferred vendor is used here only for
-        validation and display (see _infer_gpu_vendor).
+        When vendor is inferred from GPU name (e.g. A100 -> nvidia), it is written to
+        gpu_spec. When vendor is inferred from image context (no name, no vendor, default
+        CUDA image -> nvidia), it is NOT written to gpu_spec because 0.19.x servers
+        (gpuhunt <0.1.12) break on vendor=nvidia + min_gpu_count=0. The server applies
+        the same default in set_gpu_vendor_default().
+
+        TODO: This entire method should move to the server (set_resources_defaults)
+        so that defaults and validation are consistent for CLI and API users.
         """
         gpu_spec = conf.resources.gpu
         if gpu_spec is None:
@@ -439,11 +443,8 @@ def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None:
                 # Set vendor inferred from name on the spec (server needs it for filtering).
                 gpu_spec.vendor = vendor
             else:
-                # No vendor or name specified. Default to Nvidia if using the default
-                # CUDA image, since it's only compatible with Nvidia GPUs.
-                # We don't set the inferred vendor on the spec — the server does the
-                # same inference in set_resources_defaults() for compatibility with
-                # older servers that don't handle vendor + count.min=0 correctly.
+                # No vendor or name specified. Default to Nvidia if using the
+                # default CUDA image, since it's only compatible with Nvidia GPUs.
                 if conf.image is None and conf.docker is not True:
                     vendor = gpuhunt.AcceleratorVendor.NVIDIA
                     has_amd_gpu = False
diff --git a/src/dstack/_internal/server/services/resources.py b/src/dstack/_internal/server/services/resources.py
index aab47de21..8b38f92f4 100644
--- a/src/dstack/_internal/server/services/resources.py
+++ b/src/dstack/_internal/server/services/resources.py
@@ -29,9 +29,17 @@ def set_gpu_vendor_default(
     docker: Optional[bool],
 ) -> None:
     """Default GPU vendor to Nvidia when using the default CUDA image,
-    since it's only compatible with Nvidia GPUs.
-    Mirrors the client-side logic in validate_gpu_vendor_and_image().
-    Should only be called for runs (not fleets) since fleets don't have image context."""
+    since it's only compatible with Nvidia GPUs. Only called for runs
+    (not fleets) since fleets don't have image context.
+
+    The client infers the same default for display and validation
+    (see validate_gpu_vendor_and_image) but does not write it to the spec
+    for 0.19.x server compatibility. This server-side function is what
+    actually sets the vendor before offer matching.
+
+    TODO: All resource defaults and validation (gpu vendor, cpu arch, memory,
+    disk, etc.) should be set here on the server, not split between client
+    and model-level defaults."""
     gpu = resources.gpu
     if (
         gpu is not None
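
For reference, the defaulting rule that both docstrings describe can be summarized in a few lines. The sketch below is illustrative only and is not dstack code: GpuSpec, Vendor, and default_gpu_vendor are simplified stand-ins for the real models, and the actual behavior lives in validate_gpu_vendor_and_image() on the client and set_gpu_vendor_default() on the server.

    # Minimal sketch of the defaulting rule, under simplified assumptions.
    # GpuSpec, Vendor, and default_gpu_vendor are hypothetical stand-ins,
    # not dstack's actual models or API.
    from dataclasses import dataclass, field
    from enum import Enum
    from typing import List, Optional


    class Vendor(str, Enum):
        NVIDIA = "nvidia"
        AMD = "amd"
        TENSTORRENT = "tenstorrent"


    @dataclass
    class GpuSpec:
        name: List[str] = field(default_factory=list)  # e.g. ["A100"]
        vendor: Optional[Vendor] = None


    def default_gpu_vendor(
        gpu: Optional[GpuSpec], image: Optional[str], docker: Optional[bool]
    ) -> None:
        # If the user named a GPU (e.g. A100), the vendor is inferred from the
        # name elsewhere; this default only covers the "no name, no vendor" case.
        if gpu is None or gpu.vendor is not None or gpu.name:
            return
        # Default CUDA image (no custom image, docker not enabled) implies Nvidia.
        if image is None and docker is not True:
            gpu.vendor = Vendor.NVIDIA


    # Default image, nothing specified -> nvidia.
    spec = GpuSpec()
    default_gpu_vendor(spec, image=None, docker=None)
    assert spec.vendor == Vendor.NVIDIA

    # Custom image -> vendor stays unset, offers are not restricted.
    spec = GpuSpec()
    default_gpu_vendor(spec, image="rocm/pytorch:latest", docker=None)
    assert spec.vendor is None

Per the TODO in the patch, this rule would eventually run only on the server, so CLI and API submissions get identical defaults.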