Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions src/dstack/_internal/cli/services/configurators/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,10 +391,14 @@ def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None:
Infers GPU vendor if not set. Defaults to Nvidia when using the default
CUDA image. Requires explicit `image` if the vendor is AMD or Tenstorrent.

- NOTE: We don't set the inferred vendor on gpu_spec for compatibility with
- older servers. Servers set the vendor using the same logic in
- set_resources_defaults(). The inferred vendor is used here only for
- validation and display (see _infer_gpu_vendor).
+ When vendor is inferred from GPU name (e.g. A100 -> nvidia), it is written to
+ gpu_spec. When vendor is inferred from image context (no name, no vendor, default
+ CUDA image -> nvidia), it is NOT written to gpu_spec because 0.19.x servers
+ (gpuhunt <0.1.12) break on vendor=nvidia + min_gpu_count=0. The server applies
+ the same default in set_gpu_vendor_default().
+
+ TODO: This entire method should move to the server (set_resources_defaults)
+ so that defaults and validation are equal for CLI and API users.
"""
gpu_spec = conf.resources.gpu
if gpu_spec is None:
Expand Down Expand Up @@ -439,11 +443,8 @@ def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None:
# Set vendor inferred from name on the spec (server needs it for filtering).
gpu_spec.vendor = vendor
else:
- # No vendor or name specified. Default to Nvidia if using the default
- # CUDA image, since it's only compatible with Nvidia GPUs.
- # We don't set the inferred vendor on the spec — the server does the
- # same inference in set_resources_defaults() for compatibility with
- # older servers that don't handle vendor + count.min=0 correctly.
+ # No vendor or name specified. Default to Nvidia if using the
+ # default CUDA image, since it's only compatible with Nvidia GPUs.
if conf.image is None and conf.docker is not True:
vendor = gpuhunt.AcceleratorVendor.NVIDIA
has_amd_gpu = False
Expand Down
14 changes: 11 additions & 3 deletions src/dstack/_internal/server/services/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,17 @@ def set_gpu_vendor_default(
docker: Optional[bool],
) -> None:
"""Default GPU vendor to Nvidia when using the default CUDA image,
- since it's only compatible with Nvidia GPUs.
- Mirrors the client-side logic in validate_gpu_vendor_and_image().
- Should only be called for runs (not fleets) since fleets don't have image context."""
+ since it's only compatible with Nvidia GPUs. Only called for runs
+ (not fleets) since fleets don't have image context.
+
+ The client infers the same default for display and validation
+ (see validate_gpu_vendor_and_image) but does not write it to the spec
+ for 0.19.x server compatibility. This server-side function is what
+ actually sets the vendor before offer matching.
+
+ TODO: All resource defaults and validation (gpu vendor, cpu arch, memory,
+ disk, etc.) should be set here on the server, not split between client
+ and model-level defaults."""
gpu = resources.gpu
if (
gpu is not None
Expand Down