Skip to content

Commit 93678d8

Browse files
pohly, oxxenix, bart0sh
committed
DRA kubelet: adapt to v1alpha3 API
This adds the ability to select specific requests inside a claim for a container. NodePrepareResources is always called, even if the claim is not used by any container. This could be useful for drivers where that call has some effect other than injecting CDI device IDs into containers. It also ensures that drivers can validate configs. The pod resource API can no longer report a class for each claim because there is no such 1:1 relationship anymore. Instead, that API reports the claim, API devices (with driver/pool/device as ID) and CDI device IDs. The kubelet itself doesn't extract that information from the claim. Instead, it relies on drivers to report this information when the claim gets prepared. This isolates the kubelet from API changes. Because of a faulty E2E test, kubelet was told to contact the wrong driver for a claim. This was not visible in the kubelet log output. Now, changes to the claim info cache are logged. While at it, the naming of variables and some existing log output are harmonized. Co-authored-by: Oksana Baranova <oksana.baranova@intel.com> Co-authored-by: Ed Bartosh <eduard.bartosh@intel.com>
1 parent 5668764 commit 93678d8

File tree

19 files changed

+1482
-1810
lines changed

19 files changed

+1482
-1810
lines changed

pkg/kubelet/apis/podresources/server_v1_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,12 +85,14 @@ func TestListPodResourcesV1(t *testing.T) {
8585
}
8686

8787
pluginCDIDevices := []*podresourcesapi.CDIDevice{{Name: "dra-dev0"}, {Name: "dra-dev1"}}
88+
draDriverName := "dra.example.com"
89+
poolName := "worker-1-pool"
90+
deviceName := "gpu-1"
8891
draDevs := []*podresourcesapi.DynamicResource{
8992
{
90-
ClassName: "resource-class",
9193
ClaimName: "claim-name",
9294
ClaimNamespace: "default",
93-
ClaimResources: []*podresourcesapi.ClaimResource{{CDIDevices: pluginCDIDevices}},
95+
ClaimResources: []*podresourcesapi.ClaimResource{{CDIDevices: pluginCDIDevices, DriverName: draDriverName, PoolName: poolName, DeviceName: deviceName}},
9496
},
9597
}
9698

@@ -893,7 +895,6 @@ func TestGetPodResourcesV1(t *testing.T) {
893895
pluginCDIDevices := []*podresourcesapi.CDIDevice{{Name: "dra-dev0"}, {Name: "dra-dev1"}}
894896
draDevs := []*podresourcesapi.DynamicResource{
895897
{
896-
ClassName: "resource-class",
897898
ClaimName: "claim-name",
898899
ClaimNamespace: "default",
899900
ClaimResources: []*podresourcesapi.ClaimResource{{CDIDevices: pluginCDIDevices}},

pkg/kubelet/cm/container_manager_linux.go

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -661,10 +661,6 @@ func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Containe
661661
if err != nil {
662662
return nil, err
663663
}
664-
// NOTE: Passing CDI device names as annotations is a temporary solution
665-
// It will be removed after all runtimes are updated
666-
// to get CDI device names from the ContainerConfig.CDIDevices field
667-
opts.Annotations = append(opts.Annotations, resOpts.Annotations...)
668664
opts.CDIDevices = append(opts.CDIDevices, resOpts.CDIDevices...)
669665
}
670666
// Allocate should already be called during predicateAdmitHandler.Admit(),
@@ -965,19 +961,22 @@ func (cm *containerManagerImpl) GetDynamicResources(pod *v1.Pod, container *v1.C
965961
}
966962
for _, containerClaimInfo := range containerClaimInfos {
967963
var claimResources []*podresourcesapi.ClaimResource
968-
// TODO: Currently we maintain a list of ClaimResources, each of which contains
969-
// a set of CDIDevices from a different kubelet plugin. In the future we may want to
970-
// include the name of the kubelet plugin and/or other types of resources that are
971-
// not CDIDevices (assuming the DRAmanager supports this).
972-
for _, klPluginCdiDevices := range containerClaimInfo.CDIDevices {
964+
for driverName, driverState := range containerClaimInfo.DriverState {
973965
var cdiDevices []*podresourcesapi.CDIDevice
974-
for _, cdiDevice := range klPluginCdiDevices {
975-
cdiDevices = append(cdiDevices, &podresourcesapi.CDIDevice{Name: cdiDevice})
966+
for _, device := range driverState.Devices {
967+
for _, cdiDeviceID := range device.CDIDeviceIDs {
968+
cdiDevices = append(cdiDevices, &podresourcesapi.CDIDevice{Name: cdiDeviceID})
969+
}
970+
resources := &podresourcesapi.ClaimResource{
971+
CDIDevices: cdiDevices,
972+
DriverName: driverName,
973+
PoolName: device.PoolName,
974+
DeviceName: device.DeviceName,
975+
}
976+
claimResources = append(claimResources, resources)
976977
}
977-
claimResources = append(claimResources, &podresourcesapi.ClaimResource{CDIDevices: cdiDevices})
978978
}
979979
containerDynamicResource := podresourcesapi.DynamicResource{
980-
ClassName: containerClaimInfo.ClassName,
981980
ClaimName: containerClaimInfo.ClaimName,
982981
ClaimNamespace: containerClaimInfo.Namespace,
983982
ClaimResources: claimResources,

pkg/kubelet/cm/dra/claiminfo.go

Lines changed: 40 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,15 @@ limitations under the License.
1717
package dra
1818

1919
import (
20+
"errors"
2021
"fmt"
22+
"slices"
2123
"sync"
2224

2325
resourceapi "k8s.io/api/resource/v1alpha3"
2426
"k8s.io/apimachinery/pkg/types"
2527
"k8s.io/apimachinery/pkg/util/sets"
2628
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
27-
"k8s.io/kubernetes/pkg/kubelet/cm/util/cdi"
2829
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
2930
)
3031

@@ -33,10 +34,7 @@ import (
3334
// +k8s:deepcopy-gen=true
3435
type ClaimInfo struct {
3536
state.ClaimInfoState
36-
// annotations is a mapping of container annotations per DRA plugin associated with
37-
// a prepared resource
38-
annotations map[string][]kubecontainer.Annotation
39-
prepared bool
37+
prepared bool
4038
}
4139

4240
// claimInfoCache is a cache of processed resource claims keyed by namespace/claimname.
@@ -47,89 +45,45 @@ type claimInfoCache struct {
4745
}
4846

4947
// newClaimInfoFromClaim creates a new claim info from a resource claim.
50-
func newClaimInfoFromClaim(claim *resourceapi.ResourceClaim) *ClaimInfo {
51-
// Grab the allocation.resourceHandles. If there are no
52-
// allocation.resourceHandles, create a single resourceHandle with no
53-
// content. This will trigger processing of this claim by a single
54-
// kubelet plugin whose name matches resourceClaim.Status.DriverName.
55-
resourceHandles := claim.Status.Allocation.ResourceHandles
56-
if len(resourceHandles) == 0 {
57-
resourceHandles = make([]resourceapi.ResourceHandle, 1)
58-
}
48+
// It verifies that the kubelet can handle the claim.
49+
func newClaimInfoFromClaim(claim *resourceapi.ResourceClaim) (*ClaimInfo, error) {
5950
claimInfoState := state.ClaimInfoState{
60-
DriverName: claim.Status.DriverName,
61-
ClassName: claim.Spec.ResourceClassName,
62-
ClaimUID: claim.UID,
63-
ClaimName: claim.Name,
64-
Namespace: claim.Namespace,
65-
PodUIDs: sets.New[string](),
66-
ResourceHandles: resourceHandles,
67-
CDIDevices: make(map[string][]string),
51+
ClaimUID: claim.UID,
52+
ClaimName: claim.Name,
53+
Namespace: claim.Namespace,
54+
PodUIDs: sets.New[string](),
55+
DriverState: make(map[string]state.DriverState),
56+
}
57+
if claim.Status.Allocation == nil {
58+
return nil, errors.New("not allocated")
59+
}
60+
for _, result := range claim.Status.Allocation.Devices.Results {
61+
claimInfoState.DriverState[result.Driver] = state.DriverState{}
6862
}
6963
info := &ClaimInfo{
7064
ClaimInfoState: claimInfoState,
71-
annotations: make(map[string][]kubecontainer.Annotation),
7265
prepared: false,
7366
}
74-
return info
67+
return info, nil
7568
}
7669

7770
// newClaimInfoFromState creates a new claim info from a checkpointed claim info state object.
7871
func newClaimInfoFromState(state *state.ClaimInfoState) *ClaimInfo {
7972
info := &ClaimInfo{
8073
ClaimInfoState: *state.DeepCopy(),
81-
annotations: make(map[string][]kubecontainer.Annotation),
8274
prepared: false,
8375
}
84-
for pluginName, devices := range info.CDIDevices {
85-
annotations, _ := cdi.GenerateAnnotations(info.ClaimUID, info.DriverName, devices)
86-
info.annotations[pluginName] = append(info.annotations[pluginName], annotations...)
87-
}
8876
return info
8977
}
9078

9179
// addDevice appends a device to the given driver's state in the claim info.
92-
func (info *ClaimInfo) setCDIDevices(pluginName string, cdiDevices []string) error {
93-
// NOTE: Passing CDI device names as annotations is a temporary solution
94-
// It will be removed after all runtimes are updated
95-
// to get CDI device names from the ContainerConfig.CDIDevices field
96-
annotations, err := cdi.GenerateAnnotations(info.ClaimUID, info.DriverName, cdiDevices)
97-
if err != nil {
98-
return fmt.Errorf("failed to generate container annotations, err: %+v", err)
99-
}
100-
101-
if info.CDIDevices == nil {
102-
info.CDIDevices = make(map[string][]string)
103-
}
104-
105-
if info.annotations == nil {
106-
info.annotations = make(map[string][]kubecontainer.Annotation)
107-
}
108-
109-
info.CDIDevices[pluginName] = cdiDevices
110-
info.annotations[pluginName] = annotations
111-
112-
return nil
113-
}
114-
115-
// annotationsAsList returns container annotations as a single list.
116-
func (info *ClaimInfo) annotationsAsList() []kubecontainer.Annotation {
117-
var lst []kubecontainer.Annotation
118-
for _, v := range info.annotations {
119-
lst = append(lst, v...)
80+
func (info *ClaimInfo) addDevice(driverName string, device state.Device) {
81+
if info.DriverState == nil {
82+
info.DriverState = make(map[string]state.DriverState)
12083
}
121-
return lst
122-
}
123-
124-
// cdiDevicesAsList returns a list of CDIDevices from the provided claim info.
125-
func (info *ClaimInfo) cdiDevicesAsList() []kubecontainer.CDIDevice {
126-
var cdiDevices []kubecontainer.CDIDevice
127-
for _, devices := range info.CDIDevices {
128-
for _, device := range devices {
129-
cdiDevices = append(cdiDevices, kubecontainer.CDIDevice{Name: device})
130-
}
131-
}
132-
return cdiDevices
84+
driverState := info.DriverState[driverName]
85+
driverState.Devices = append(driverState.Devices, device)
86+
info.DriverState[driverName] = driverState
13387
}
13488

13589
// addPodReference adds a pod reference to the claim info.
@@ -240,3 +194,20 @@ func (cache *claimInfoCache) syncToCheckpoint() error {
240194
}
241195
return cache.state.Store(claimInfoStateList)
242196
}
197+
198+
// cdiDevicesAsList returns a list of CDIDevices from the provided claim info.
199+
// When the request name is non-empty, only devices relevant for that request
200+
// are returned. Devices that list no request names are always included.
201+
func (info *ClaimInfo) cdiDevicesAsList(requestName string) []kubecontainer.CDIDevice {
202+
var cdiDevices []kubecontainer.CDIDevice
203+
for _, driverData := range info.DriverState {
204+
for _, device := range driverData.Devices {
205+
if requestName == "" || len(device.RequestNames) == 0 || slices.Contains(device.RequestNames, requestName) {
206+
for _, cdiDeviceID := range device.CDIDeviceIDs {
207+
cdiDevices = append(cdiDevices, kubecontainer.CDIDevice{Name: cdiDeviceID})
208+
}
209+
}
210+
}
211+
}
212+
return cdiDevices
213+
}

0 commit comments

Comments
 (0)