From d4c462292e2742a9494f3e0fd4939eadb7b6d4e1 Mon Sep 17 00:00:00 2001 From: "Abhishek Singh (Manifold)" Date: Fri, 15 Aug 2025 17:15:38 +0000 Subject: [PATCH 1/3] support multipod scenarios with VirtualPodID annotation This is in continuation of the azcri changes https://msazure.visualstudio.com/ContainerPlatform/_git/azcri/pullrequest/12968264 - Add VirtualPodID, TenantSandboxID, and SkipPodNetworking annotations to pkg/annotations - Update create.go to treat containers with VirtualPodID equal to container ID as sandboxes for networking to support separate Network namespace for each Pod in the UVM. (cherry picked from commit c196086161b65682e0d923e3bb8b3c5ed789a497) Signed-off-by: Hamza El-Saawy --- internal/hcsoci/create.go | 7 ++++++- pkg/annotations/annotations.go | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/internal/hcsoci/create.go b/internal/hcsoci/create.go index 317eef6629..bfc5342b65 100644 --- a/internal/hcsoci/create.go +++ b/internal/hcsoci/create.go @@ -27,6 +27,7 @@ import ( "github.com/Microsoft/hcsshim/internal/resources" "github.com/Microsoft/hcsshim/internal/schemaversion" "github.com/Microsoft/hcsshim/internal/uvm" + "github.com/Microsoft/hcsshim/pkg/annotations" ) var ( @@ -148,10 +149,14 @@ func configureSandboxNetwork(ctx context.Context, coi *createOptionsInternal, r coi.actualNetworkNamespace = r.NetNS() if coi.HostingSystem != nil { + // Check for virtual pod first containers: if containerID == virtualPodID, treat as sandbox for networking configuration + virtualPodID := coi.Spec.Annotations[annotations.VirtualPodID] + isVirtualPodFirstContainer := virtualPodID != "" && coi.actualID == virtualPodID + // Only add the network namespace to a standalone or sandbox // container but not a workload container in a sandbox that inherits // the namespace. - if ct == oci.KubernetesContainerTypeNone || ct == oci.KubernetesContainerTypeSandbox { + if ct == oci.KubernetesContainerTypeNone || ct == oci.KubernetesContainerTypeSandbox || isVirtualPodFirstContainer { if err := coi.HostingSystem.ConfigureNetworking(ctx, coi.actualNetworkNamespace); err != nil { // No network setup type was specified for this UVM. Create and assign one here unless // we received a different error. diff --git a/pkg/annotations/annotations.go b/pkg/annotations/annotations.go index 6e76e59423..8d914d3040 100644 --- a/pkg/annotations/annotations.go +++ b/pkg/annotations/annotations.go @@ -107,6 +107,21 @@ const ( LCOWPrivileged = "io.microsoft.virtualmachine.lcow.privileged" ) +// LCOW multipod annotations enables multipod and warmpooling. +const ( + // SkipPodNetworking is the annotation to skip networking setup for the pod. + // This prevents errors from being raised when the pod is created without endpoints. Boolean. + SkipPodNetworking = "io.microsoft.cri.skip-pod-networking" + + // TenantSandboxID is the annotation to specify the ID of an existing tenant sandbox + // to use for the pod sandbox. If present, the pod will join the specified tenant sandbox. String. + TenantSandboxID = "io.microsoft.cri.tenant-sandbox-id" + + // VirtualPodID is the annotation to specify the pod ID not associated with a shim + // that a container should be placed in. This is used for multipod scenarios. String. + VirtualPodID = "io.microsoft.cri.virtual-pod-id" +) + // LCOW integrity protection and confidential container annotations. 
const ( // DmVerityCreateArgs specifies the `dm-mod.create` parameters to kernel and enables integrity protection of From b0a9d5fa7d0ac533b7e630227b85ee0e7fc66e34 Mon Sep 17 00:00:00 2001 From: "Abhishek Singh (Manifold)" Date: Fri, 24 Oct 2025 00:53:11 +0000 Subject: [PATCH 2/3] multi pod changes for GCS Introduce cgroup changes and per pod mount changes to support multiple pod. (cherry picked from commit 7170f3fae8d26fef6a975cdff8300f9ca67691d1) Signed-off-by: Hamza El-Saawy --- cmd/gcs/main.go | 97 +++-- internal/guest/runtime/hcsv2/container.go | 17 +- .../guest/runtime/hcsv2/sandbox_container.go | 82 ++-- .../runtime/hcsv2/standalone_container.go | 51 ++- internal/guest/runtime/hcsv2/uvm.go | 355 ++++++++++++++++-- .../guest/runtime/hcsv2/workload_container.go | 68 ++-- internal/guest/spec/spec.go | 66 ++++ 7 files changed, 617 insertions(+), 119 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 25751763dd..36ae1991b6 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -10,6 +10,7 @@ import ( "os" "os/exec" "path/filepath" + "strings" "syscall" "time" @@ -67,7 +68,12 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre } count++ - msg := "memory usage for cgroup exceeded threshold" + var msg string + if strings.HasPrefix(cgName, "/virtual-pods") { + msg = "memory usage for virtual pods cgroup exceeded threshold" + } else { + msg = "memory usage for cgroup exceeded threshold" + } entry := logrus.WithFields(logrus.Fields{ "gcsStartTime": startTime, "time": time.Now(), @@ -294,40 +300,9 @@ func main() { // Continuously log /dev/kmsg go kmsg.ReadForever(kmsg.LogLevel(*kmsgLogLevel)) - tport := &transport.VsockTransport{} - rtime, err := runc.NewRuntime(baseLogPath) - if err != nil { - logrus.WithError(err).Fatal("failed to initialize new runc runtime") - } - mux := bridge.NewBridgeMux() - b := bridge.Bridge{ - Handler: mux, - EnableV4: *v4, - } - h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter) - b.AssignHandlers(mux, h) - - var bridgeIn io.ReadCloser - var bridgeOut io.WriteCloser - if *useInOutErr { - bridgeIn = os.Stdin - bridgeOut = os.Stdout - } else { - const commandPort uint32 = 0x40000000 - bridgeCon, err := tport.Dial(commandPort) - if err != nil { - logrus.WithFields(logrus.Fields{ - "port": commandPort, - logrus.ErrorKey: err, - }).Fatal("failed to dial host vsock connection") - } - bridgeIn = bridgeCon - bridgeOut = bridgeCon - } - // Setup the UVM cgroups to protect against a workload taking all available - // memory and causing the GCS to malfunction we create two cgroups: gcs, - // containers. + // memory and causing the GCS to malfunction we create cgroups: gcs, + // containers, and virtual-pods for multi-pod support. 
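// Illustrative layout (not part of the patch) of the cgroup hierarchy these changes
// aim for; the per-pod children under virtual-pods are created later by
// Host.CreateVirtualPod in uvm.go:
//
//	/gcs                                                    GCS process
//	/containers/<containerID>                               traditional containers
//	/containers/virtual-pods                                parent cgroup created here
//	/containers/virtual-pods/<virtualPodID>                 per virtual pod (its sandbox container)
//	/containers/virtual-pods/<virtualPodID>/<containerID>   workload containers in that virtual pod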
// // Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy @@ -357,6 +332,18 @@ func main() { } defer containersControl.Delete() //nolint:errcheck + // Create virtual-pods cgroup hierarchy for multi-pod support + // This will be the parent for all virtual pod cgroups: /containers/virtual-pods/{virtualSandboxID} + virtualPodsControl, err := cgroups.New(cgroups.StaticPath("/containers/virtual-pods"), &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &containersLimit, // Share the same limit as containers + }, + }) + if err != nil { + logrus.WithError(err).Fatal("failed to create containers/virtual-pods cgroup") + } + defer virtualPodsControl.Delete() //nolint:errcheck + gcsControl, err := cgroups.New(cgroups.StaticPath("/gcs"), &oci.LinuxResources{}) if err != nil { logrus.WithError(err).Fatal("failed to create gcs cgroup") @@ -366,6 +353,39 @@ func main() { logrus.WithError(err).Fatal("failed add gcs pid to gcs cgroup") } + tport := &transport.VsockTransport{} + rtime, err := runc.NewRuntime(baseLogPath) + if err != nil { + logrus.WithError(err).Fatal("failed to initialize new runc runtime") + } + mux := bridge.NewBridgeMux() + b := bridge.Bridge{ + Handler: mux, + EnableV4: *v4, + } + h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter) + // Initialize virtual pod support in the host + h.InitializeVirtualPodSupport(virtualPodsControl) + b.AssignHandlers(mux, h) + + var bridgeIn io.ReadCloser + var bridgeOut io.WriteCloser + if *useInOutErr { + bridgeIn = os.Stdin + bridgeOut = os.Stdout + } else { + const commandPort uint32 = 0x40000000 + bridgeCon, err := tport.Dial(commandPort) + if err != nil { + logrus.WithFields(logrus.Fields{ + "port": commandPort, + logrus.ErrorKey: err, + }).Fatal("failed to dial host vsock connection") + } + bridgeIn = bridgeCon + bridgeOut = bridgeCon + } + event := cgroups.MemoryThresholdEvent(*gcsMemLimitBytes, false) gefd, err := gcsControl.RegisterMemoryEvent(event) if err != nil { @@ -381,6 +401,14 @@ func main() { oomFile := os.NewFile(oom, "cefd") defer oomFile.Close() + // Setup OOM monitoring for virtual-pods cgroup + virtualPodsOom, err := virtualPodsControl.OOMEventFD() + if err != nil { + logrus.WithError(err).Fatal("failed to retrieve the virtual-pods cgroups oom eventfd") + } + virtualPodsOomFile := os.NewFile(virtualPodsOom, "vp-oomfd") + defer virtualPodsOomFile.Close() + // time synchronization service if !(*disableTimeSync) { if err = startTimeSyncService(); err != nil { @@ -390,6 +418,7 @@ func main() { go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl) go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl) + go readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, virtualPodsControl) err = b.ListenAndServe(bridgeIn, bridgeOut) if err != nil { logrus.WithFields(logrus.Fields{ diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 9627daf645..7d381301a5 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -30,6 +30,7 @@ import ( "github.com/Microsoft/hcsshim/internal/oc" "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" "github.com/Microsoft/hcsshim/internal/protocol/guestresource" + "github.com/Microsoft/hcsshim/pkg/annotations" ) // containerStatus has been introduced to enable parallel container creation @@ -193,13 +194,21 @@ func (c *Container) Delete(ctx context.Context) error { entity 
:= log.G(ctx).WithField(logfields.ContainerID, c.id) entity.Info("opengcs::Container::Delete") if c.isSandbox { - // remove user mounts in sandbox container - if err := storage.UnmountAllInPath(ctx, specGuest.SandboxMountsDir(c.id), true); err != nil { + // Check if this is a virtual pod + virtualSandboxID := "" + if c.spec != nil && c.spec.Annotations != nil { + virtualSandboxID = c.spec.Annotations[annotations.VirtualPodID] + } + + // remove user mounts in sandbox container - use virtual pod aware paths + mountsDir := specGuest.VirtualPodAwareSandboxMountsDir(c.id, virtualSandboxID) + if err := storage.UnmountAllInPath(ctx, mountsDir, true); err != nil { entity.WithError(err).Error("failed to unmount sandbox mounts") } - // remove hugepages mounts in sandbox container - if err := storage.UnmountAllInPath(ctx, specGuest.HugePagesMountsDir(c.id), true); err != nil { + // remove hugepages mounts in sandbox container - use virtual pod aware paths + hugePagesDir := specGuest.VirtualPodAwareHugePagesMountsDir(c.id, virtualSandboxID) + if err := storage.UnmountAllInPath(ctx, hugePagesDir, true); err != nil { entity.WithError(err).Error("failed to unmount hugepages mounts") } } diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go index 03f41afef6..3d1730e7e5 100644 --- a/internal/guest/runtime/hcsv2/sandbox_container.go +++ b/internal/guest/runtime/hcsv2/sandbox_container.go @@ -15,6 +15,7 @@ import ( "github.com/Microsoft/hcsshim/internal/guest/network" specGuest "github.com/Microsoft/hcsshim/internal/guest/spec" + "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/oc" "github.com/Microsoft/hcsshim/pkg/annotations" ) @@ -23,22 +24,37 @@ func getSandboxHostnamePath(id string) string { return filepath.Join(specGuest.SandboxRootDir(id), "hostname") } +func getVirtualPodAwareSandboxHostnamePath(id, virtualSandboxID string) string { + return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hostname") +} + func getSandboxHostsPath(id string) string { return filepath.Join(specGuest.SandboxRootDir(id), "hosts") } +func getVirtualPodAwareSandboxHostsPath(id, virtualSandboxID string) string { + return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hosts") +} + func getSandboxResolvPath(id string) string { return filepath.Join(specGuest.SandboxRootDir(id), "resolv.conf") } +func getVirtualPodAwareSandboxResolvPath(id, virtualSandboxID string) string { + return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "resolv.conf") +} + func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) { ctx, span := oc.StartSpan(ctx, "hcsv2::setupSandboxContainerSpec") defer span.End() defer func() { oc.SetSpanStatus(span, err) }() span.AddAttributes(trace.StringAttribute("cid", id)) - // Generate the sandbox root dir - rootDir := specGuest.SandboxRootDir(id) + // Check if this is a virtual pod to use appropriate root directory + virtualSandboxID := spec.Annotations[annotations.VirtualPodID] + + // Generate the sandbox root dir - virtual pod aware + rootDir := specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID) if err := os.MkdirAll(rootDir, 0755); err != nil { return errors.Wrapf(err, "failed to create sandbox root directory %q", rootDir) } @@ -58,39 +74,55 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) ( } } - sandboxHostnamePath := getSandboxHostnamePath(id) + 
sandboxHostnamePath := getVirtualPodAwareSandboxHostnamePath(id, virtualSandboxID) if err := os.WriteFile(sandboxHostnamePath, []byte(hostname+"\n"), 0644); err != nil { return errors.Wrapf(err, "failed to write hostname to %q", sandboxHostnamePath) } // Write the hosts sandboxHostsContent := network.GenerateEtcHostsContent(ctx, hostname) - sandboxHostsPath := getSandboxHostsPath(id) + sandboxHostsPath := getVirtualPodAwareSandboxHostsPath(id, virtualSandboxID) if err := os.WriteFile(sandboxHostsPath, []byte(sandboxHostsContent), 0644); err != nil { return errors.Wrapf(err, "failed to write sandbox hosts to %q", sandboxHostsPath) } + log.G(ctx).Debug("quick setup network namespace, cflick") + // Check if this is a virtual pod sandbox container by comparing container ID with virtual pod ID + isVirtualPodSandbox := virtualSandboxID != "" && id == virtualSandboxID + if strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") || isVirtualPodSandbox { + ns := GetOrAddNetworkNamespace(specGuest.GetNetworkNamespaceID(spec)) + err := ns.Sync(ctx) + if err != nil { + return err + } + } // Write resolv.conf + log.G(ctx).Debug("sandbox resolv.conf, cflick") ns, err := getNetworkNamespace(specGuest.GetNetworkNamespaceID(spec)) if err != nil { - return err - } - var searches, servers []string - for _, n := range ns.Adapters() { - if len(n.DNSSuffix) > 0 { - searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ",")) + if !strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") { + return err } - if len(n.DNSServerList) > 0 { - servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ",")) + // Networking is skipped, do not error out + log.G(ctx).Infof("setupSandboxContainerSpec: Did not find NS spec %v, err %v", spec, err) + } else { + var searches, servers []string + for _, n := range ns.Adapters() { + if len(n.DNSSuffix) > 0 { + searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ",")) + } + if len(n.DNSServerList) > 0 { + servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ",")) + } + } + resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil) + if err != nil { + return errors.Wrap(err, "failed to generate sandbox resolv.conf content") + } + sandboxResolvPath := getVirtualPodAwareSandboxResolvPath(id, virtualSandboxID) + if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil { + return errors.Wrap(err, "failed to write sandbox resolv.conf") } - } - resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil) - if err != nil { - return errors.Wrap(err, "failed to generate sandbox resolv.conf content") - } - sandboxResolvPath := getSandboxResolvPath(id) - if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil { - return errors.Wrap(err, "failed to write sandbox resolv.conf") } // User.Username is generally only used on Windows, but as there's no (easy/fast at least) way to grab @@ -113,8 +145,14 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) ( // also has a concept of a sandbox/shm file when the IPC NamespaceMode != // NODE. 
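// Sketch of the path split implied by the VirtualPodAware* helpers in
// internal/guest/spec (assuming guestpath.LCOWRootPrefixInUVM is "/run/gcs/c";
// names in angle brackets are placeholders):
//
//	no VirtualPodID annotation:   /run/gcs/c/<sandboxID>/{hostname,hosts,resolv.conf}
//	with VirtualPodID annotation: /run/gcs/c/virtual-pods/<virtualPodID>/{hostname,hosts,resolv.conf}
//
// The sandboxMounts and hugepages directories follow the same split (see the
// spec.go changes later in this patch).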
- // Force the parent cgroup into our /containers root - spec.Linux.CgroupsPath = "/containers/" + id + // Set cgroup path - check if this is a virtual pod + if virtualSandboxID != "" { + // Virtual pod sandbox gets its own cgroup under /containers/virtual-pods using the virtual pod ID + spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID + } else { + // Traditional sandbox goes under /containers + spec.Linux.CgroupsPath = "/containers/" + id + } // Clear the windows section as we dont want to forward to runc spec.Windows = nil diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go index eecf52e026..4171d30f96 100644 --- a/internal/guest/runtime/hcsv2/standalone_container.go +++ b/internal/guest/runtime/hcsv2/standalone_container.go @@ -17,32 +17,56 @@ import ( specGuest "github.com/Microsoft/hcsshim/internal/guest/spec" "github.com/Microsoft/hcsshim/internal/guestpath" "github.com/Microsoft/hcsshim/internal/oc" + "github.com/Microsoft/hcsshim/pkg/annotations" ) func getStandaloneRootDir(id string) string { return filepath.Join(guestpath.LCOWRootPrefixInUVM, id) } +func getVirtualPodAwareStandaloneRootDir(id, virtualSandboxID string) string { + if virtualSandboxID != "" { + // Standalone container in virtual pod gets its own subdir + return filepath.Join(guestpath.LCOWRootPrefixInUVM, "virtual-pods", virtualSandboxID, id) + } + return getStandaloneRootDir(id) +} + func getStandaloneHostnamePath(id string) string { return filepath.Join(getStandaloneRootDir(id), "hostname") } +func getVirtualPodAwareStandaloneHostnamePath(id, virtualSandboxID string) string { + return filepath.Join(getVirtualPodAwareStandaloneRootDir(id, virtualSandboxID), "hostname") +} + func getStandaloneHostsPath(id string) string { return filepath.Join(getStandaloneRootDir(id), "hosts") } +func getVirtualPodAwareStandaloneHostsPath(id, virtualSandboxID string) string { + return filepath.Join(getVirtualPodAwareStandaloneRootDir(id, virtualSandboxID), "hosts") +} + func getStandaloneResolvPath(id string) string { return filepath.Join(getStandaloneRootDir(id), "resolv.conf") } +func getVirtualPodAwareStandaloneResolvPath(id, virtualSandboxID string) string { + return filepath.Join(getVirtualPodAwareStandaloneRootDir(id, virtualSandboxID), "resolv.conf") +} + func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) { ctx, span := oc.StartSpan(ctx, "hcsv2::setupStandaloneContainerSpec") defer span.End() defer func() { oc.SetSpanStatus(span, err) }() span.AddAttributes(trace.StringAttribute("cid", id)) - // Generate the standalone root dir - rootDir := getStandaloneRootDir(id) + // Check if this is a virtual pod (unlikely for standalone) + virtualSandboxID := spec.Annotations[annotations.VirtualPodID] + + // Generate the standalone root dir - virtual pod aware + rootDir := getVirtualPodAwareStandaloneRootDir(id, virtualSandboxID) if err := os.MkdirAll(rootDir, 0755); err != nil { return errors.Wrapf(err, "failed to create container root directory %q", rootDir) } @@ -63,7 +87,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec // Write the hostname if !specGuest.MountPresent("/etc/hostname", spec.Mounts) { - standaloneHostnamePath := getStandaloneHostnamePath(id) + standaloneHostnamePath := getVirtualPodAwareStandaloneHostnamePath(id, virtualSandboxID) if err := os.WriteFile(standaloneHostnamePath, []byte(hostname+"\n"), 0644); err != nil { return errors.Wrapf(err, "failed 
to write hostname to %q", standaloneHostnamePath) } @@ -71,7 +95,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec mt := oci.Mount{ Destination: "/etc/hostname", Type: "bind", - Source: getStandaloneHostnamePath(id), + Source: getVirtualPodAwareStandaloneHostnamePath(id, virtualSandboxID), Options: []string{"bind"}, } if specGuest.IsRootReadonly(spec) { @@ -83,7 +107,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec // Write the hosts if !specGuest.MountPresent("/etc/hosts", spec.Mounts) { standaloneHostsContent := network.GenerateEtcHostsContent(ctx, hostname) - standaloneHostsPath := getStandaloneHostsPath(id) + standaloneHostsPath := getVirtualPodAwareStandaloneHostsPath(id, virtualSandboxID) if err := os.WriteFile(standaloneHostsPath, []byte(standaloneHostsContent), 0644); err != nil { return errors.Wrapf(err, "failed to write standalone hosts to %q", standaloneHostsPath) } @@ -91,7 +115,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec mt := oci.Mount{ Destination: "/etc/hosts", Type: "bind", - Source: getStandaloneHostsPath(id), + Source: getVirtualPodAwareStandaloneHostsPath(id, virtualSandboxID), Options: []string{"bind"}, } if specGuest.IsRootReadonly(spec) { @@ -116,7 +140,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec if err != nil { return errors.Wrap(err, "failed to generate standalone resolv.conf content") } - standaloneResolvPath := getStandaloneResolvPath(id) + standaloneResolvPath := getVirtualPodAwareStandaloneResolvPath(id, virtualSandboxID) if err := os.WriteFile(standaloneResolvPath, []byte(resolvContent), 0644); err != nil { return errors.Wrap(err, "failed to write standalone resolv.conf") } @@ -124,7 +148,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec mt := oci.Mount{ Destination: "/etc/resolv.conf", Type: "bind", - Source: getStandaloneResolvPath(id), + Source: getVirtualPodAwareStandaloneResolvPath(id, virtualSandboxID), Options: []string{"bind"}, } if specGuest.IsRootReadonly(spec) { @@ -133,8 +157,15 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec spec.Mounts = append(spec.Mounts, mt) } - // Force the parent cgroup into our /containers root - spec.Linux.CgroupsPath = "/containers/" + id + // Set cgroup path - check if this is part of a virtual pod (unlikely for standalone) + if virtualSandboxID != "" { + // Standalone container in virtual pod goes under /containers/virtual-pods/{virtualSandboxID}/{containerID} + // Each virtualSandboxID creates its own pod-level cgroup for all containers in that virtual pod + spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID + "/" + id + } else { + // Traditional standalone container goes under /containers + spec.Linux.CgroupsPath = "/containers/" + id + } // Clear the windows section as we dont want to forward to runc spec.Windows = nil diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 4b72e62f2c..0e596a8c9a 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -23,6 +23,14 @@ import ( "github.com/Microsoft/cosesign1go/pkg/cosesign1" didx509resolver "github.com/Microsoft/didx509go/pkg/did-x509-resolver" "github.com/Microsoft/hcsshim/internal/bridgeutils/gcserr" + cgroups "github.com/containerd/cgroups/v3/cgroup1" + cgroup1stats "github.com/containerd/cgroups/v3/cgroup1/stats" + 
"github.com/mattn/go-shellwords" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + "github.com/Microsoft/hcsshim/internal/debug" "github.com/Microsoft/hcsshim/internal/guest/policy" "github.com/Microsoft/hcsshim/internal/guest/prot" @@ -43,18 +51,23 @@ import ( "github.com/Microsoft/hcsshim/internal/verity" "github.com/Microsoft/hcsshim/pkg/annotations" "github.com/Microsoft/hcsshim/pkg/securitypolicy" - cgroup1stats "github.com/containerd/cgroups/v3/cgroup1/stats" - "github.com/mattn/go-shellwords" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" ) // UVMContainerID is the ContainerID that will be sent on any prot.MessageBase // for V2 where the specific message is targeted at the UVM itself. const UVMContainerID = "00000000-0000-0000-0000-000000000000" +// VirtualPod represents a virtual pod that shares a UVM/Sandbox with other pods +type VirtualPod struct { + VirtualSandboxID string + MasterSandboxID string + NetworkNamespace string + CgroupPath string + CgroupControl cgroups.Cgroup + Containers map[string]bool // containerID -> exists + CreatedAt time.Time +} + // Host is the structure tracking all UVM host state including all containers // and processes. type Host struct { @@ -64,7 +77,12 @@ type Host struct { externalProcessesMutex sync.Mutex externalProcesses map[int]*externalProcess - // Rtime is the Runtime interface used by the GCS core. + // Virtual pod support for multi-pod scenarios + virtualPodsMutex sync.Mutex + virtualPods map[string]*VirtualPod // virtualSandboxID -> VirtualPod + containerToVirtualPod map[string]string // containerID -> virtualSandboxID + virtualPodsCgroupParent cgroups.Cgroup // Parent cgroup for all virtual pods + rtime runtime.Runtime vsock transport.Transport devNullTransport transport.Transport @@ -86,6 +104,8 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s return &Host{ containers: make(map[string]*Container), externalProcesses: make(map[int]*externalProcess), + virtualPods: make(map[string]*VirtualPod), + containerToVirtualPod: make(map[string]string), rtime: rtime, vsock: vsock, devNullTransport: &transport.DevNullTransport{}, @@ -241,10 +261,22 @@ func (h *Host) RemoveContainer(id string) { return } - // delete the network namespace for standalone and sandbox containers - criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType] - if !isCRI || criType == "sandbox" { - _ = RemoveNetworkNamespace(context.Background(), id) + // Check if this container is part of a virtual pod + virtualPodID, isVirtualPod := c.spec.Annotations[annotations.VirtualPodID] + if isVirtualPod { + // Remove from virtual pod tracking + h.RemoveContainerFromVirtualPod(id) + // Network namespace cleanup is handled in virtual pod cleanup when last container is removed. 
+ logrus.WithFields(logrus.Fields{ + "containerID": id, + "virtualPodID": virtualPodID, + }).Info("Container removed from virtual pod") + } else { + // delete the network namespace for standalone and sandbox containers + criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType] + if !isCRI || criType == "sandbox" { + _ = RemoveNetworkNamespace(context.Background(), id) + } } delete(h.containers, id) @@ -326,6 +358,23 @@ func setupSandboxHugePageMountsPath(id string) error { func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VMHostedContainerSettingsV2) (_ *Container, err error) { criType, isCRI := settings.OCISpecification.Annotations[annotations.KubernetesContainerType] + + // Check for virtual pod annotation + virtualPodID, isVirtualPod := settings.OCISpecification.Annotations[annotations.VirtualPodID] + + // Special handling for virtual pod sandbox containers: + // The first container in a virtual pod (containerID == virtualPodID) should be treated as a sandbox + // even if the CRI annotation might indicate otherwise due to host-side UVM setup differences + if isVirtualPod && id == virtualPodID { + criType = "sandbox" + isCRI = true + logrus.WithFields(logrus.Fields{ + "containerID": id, + "virtualPodID": virtualPodID, + "originalCriType": settings.OCISpecification.Annotations[annotations.KubernetesContainerType], + }).Info("Virtual pod first container detected - treating as sandbox container") + } + c := &Container{ id: id, vsock: h.vsock, @@ -347,6 +396,55 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM } }() + // Handle virtual pod logic + if isVirtualPod && isCRI { + logrus.WithFields(logrus.Fields{ + "containerID": id, + "virtualPodID": virtualPodID, + "criType": criType, + }).Info("Processing container for virtual pod") + + if criType == "sandbox" { + // This is a virtual pod sandbox - create the virtual pod if it doesn't exist + if _, exists := h.GetVirtualPod(virtualPodID); !exists { + // Use the network namespace ID from the current container spec + // Virtual pods share the same network namespace + networkNamespace := specGuest.GetNetworkNamespaceID(settings.OCISpecification) + if networkNamespace == "" { + networkNamespace = fmt.Sprintf("virtual-pod-%s", virtualPodID) + } + + // Extract memory limit from sandbox container spec + var memoryLimit *int64 + if settings.OCISpecification.Linux != nil && + settings.OCISpecification.Linux.Resources != nil && + settings.OCISpecification.Linux.Resources.Memory != nil && + settings.OCISpecification.Linux.Resources.Memory.Limit != nil { + memoryLimit = settings.OCISpecification.Linux.Resources.Memory.Limit + logrus.WithFields(logrus.Fields{ + "containerID": id, + "virtualPodID": virtualPodID, + "memoryLimit": *memoryLimit, + }).Info("Extracted memory limit from sandbox container spec") + } else { + logrus.WithFields(logrus.Fields{ + "containerID": id, + "virtualPodID": virtualPodID, + }).Info("No memory limit found in sandbox container spec") + } + + if err := h.CreateVirtualPod(ctx, virtualPodID, virtualPodID, networkNamespace, memoryLimit); err != nil { + return nil, errors.Wrapf(err, "failed to create virtual pod %s", virtualPodID) + } + } + } + + // Add this container to the virtual pod + if err := h.AddContainerToVirtualPod(id, virtualPodID); err != nil { + return nil, errors.Wrapf(err, "failed to add container %s to virtual pod %s", id, virtualPodID) + } + } + // Normally we would be doing policy checking here at the start of our // "policy gated 
function". However, we can't for create container as we // need a properly correct sandboxID which might be changed by the code @@ -362,6 +460,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM case "sandbox": // Capture namespaceID if any because setupSandboxContainerSpec clears the Windows section. namespaceID = specGuest.GetNetworkNamespaceID(settings.OCISpecification) + err = setupSandboxContainerSpec(ctx, id, settings.OCISpecification) if err != nil { return nil, err @@ -372,16 +471,31 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM } }() - if err = setupSandboxMountsPath(id); err != nil { - return nil, err - } - - if err = setupSandboxTmpfsMountsPath(id); err != nil { - return nil, err - } - - if err = setupSandboxHugePageMountsPath(id); err != nil { - return nil, err + if isVirtualPod { + // For virtual pods, create virtual pod specific paths + err = setupVirtualPodMountsPath(virtualPodID, id) + if err != nil { + return nil, err + } + // Create hugepages path for virtual pod + mountPath := specGuest.VirtualPodHugePagesMountsDir(virtualPodID) + if err := os.MkdirAll(mountPath, 0755); err != nil { + return nil, errors.Wrapf(err, "failed to create virtual pod hugepage mounts dir %v", virtualPodID) + } + if err := storage.MountRShared(mountPath); err != nil { + return nil, err + } + } else { + // Traditional sandbox setup + if err = setupSandboxMountsPath(id); err != nil { + return nil, err + } + if err = setupSandboxTmpfsMountsPath(id); err != nil { + return nil, err + } + if err = setupSandboxHugePageMountsPath(id); err != nil { + return nil, err + } } if err := policy.ExtendPolicyWithNetworkingMounts(id, h.securityPolicyEnforcer, settings.OCISpecification); err != nil { @@ -393,7 +507,8 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM if !ok || sid == "" { return nil, errors.Errorf("unsupported 'io.kubernetes.cri.sandbox-id': '%s'", sid) } - if err := setupWorkloadContainerSpec(ctx, sid, id, settings.OCISpecification, settings.OCIBundlePath); err != nil { + err = setupWorkloadContainerSpec(ctx, sid, id, settings.OCISpecification, settings.OCIBundlePath) + if err != nil { return nil, err } @@ -555,10 +670,11 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM // Sandbox or standalone, move the networks to the container namespace if criType == "sandbox" || !isCRI { ns, err := getNetworkNamespace(namespaceID) - if isCRI && err != nil { + // skip network activity for sandbox containers marked with skip uvm networking annotation + if isCRI && err != nil && !strings.EqualFold(settings.OCISpecification.Annotations[annotations.SkipPodNetworking], "true") { + // return nil, err return nil, err } - // standalone is not required to have a networking namespace setup if ns != nil { if err := ns.AssignContainerPid(ctx, c.container.Pid()); err != nil { return nil, err @@ -1263,3 +1379,194 @@ func writeFileInDir(dir string, filename string, data []byte, perm os.FileMode) targetFilename := filepath.Join(dir, filename) return os.WriteFile(targetFilename, data, perm) } + +// Virtual Pod Management Methods + +// InitializeVirtualPodSupport sets up the parent cgroup for virtual pods +func (h *Host) InitializeVirtualPodSupport(virtualPodsCgroup cgroups.Cgroup) { + h.virtualPodsMutex.Lock() + defer h.virtualPodsMutex.Unlock() + + h.virtualPodsCgroupParent = virtualPodsCgroup + logrus.Info("Virtual pod support initialized") +} + +// CreateVirtualPod creates a new virtual 
pod with its own cgroup and network namespace +func (h *Host) CreateVirtualPod(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string, memoryLimit *int64) error { + h.virtualPodsMutex.Lock() + defer h.virtualPodsMutex.Unlock() + + // Check if virtual pod already exists + if _, exists := h.virtualPods[virtualSandboxID]; exists { + return fmt.Errorf("virtual pod %s already exists", virtualSandboxID) + } + + // Create cgroup path for this virtual pod under the parent cgroup + parentPath := "" + if h.virtualPodsCgroupParent != nil { + if pather, ok := h.virtualPodsCgroupParent.(interface{ Path() string }); ok { + parentPath = pather.Path() + } else { + parentPath = "/containers/virtual-pods" // fallback for default behavior + } + } else { + parentPath = "/containers/virtual-pods" // fallback for default behavior + } + cgroupPath := path.Join(parentPath, virtualSandboxID) + + // Create the cgroup for this virtual pod with memory limit if provided + resources := &specs.LinuxResources{} + if memoryLimit != nil { + resources.Memory = &specs.LinuxMemory{ + Limit: memoryLimit, + } + logrus.WithFields(logrus.Fields{ + "virtualSandboxID": virtualSandboxID, + "memoryLimit": *memoryLimit, + }).Info("Creating virtual pod with memory limit") + } else { + logrus.WithField("virtualSandboxID", virtualSandboxID).Info("Creating virtual pod without memory limit") + } + + cgroupControl, err := cgroups.New(cgroups.StaticPath(cgroupPath), resources) + if err != nil { + return errors.Wrapf(err, "failed to create cgroup for virtual pod %s", virtualSandboxID) + } + + // Create virtual pod structure + virtualPod := &VirtualPod{ + VirtualSandboxID: virtualSandboxID, + MasterSandboxID: masterSandboxID, + NetworkNamespace: networkNamespace, + CgroupPath: cgroupPath, + CgroupControl: cgroupControl, + Containers: make(map[string]bool), + CreatedAt: time.Now(), + } + + h.virtualPods[virtualSandboxID] = virtualPod + + logrus.WithFields(logrus.Fields{ + "virtualSandboxID": virtualSandboxID, + "masterSandboxID": masterSandboxID, + "cgroupPath": cgroupPath, + "networkNamespace": networkNamespace, + }).Info("Virtual pod created successfully") + + return nil +} + +// CreateVirtualPodWithoutMemoryLimit creates a virtual pod without memory limits (backward compatibility) +func (h *Host) CreateVirtualPodWithoutMemoryLimit(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string) error { + return h.CreateVirtualPod(ctx, virtualSandboxID, masterSandboxID, networkNamespace, nil) +} + +// GetVirtualPod retrieves a virtual pod by its virtualSandboxID +func (h *Host) GetVirtualPod(virtualSandboxID string) (*VirtualPod, bool) { + h.virtualPodsMutex.Lock() + defer h.virtualPodsMutex.Unlock() + + vp, exists := h.virtualPods[virtualSandboxID] + return vp, exists +} + +// AddContainerToVirtualPod associates a container with a virtual pod +func (h *Host) AddContainerToVirtualPod(containerID, virtualSandboxID string) error { + h.virtualPodsMutex.Lock() + defer h.virtualPodsMutex.Unlock() + + // Check if virtual pod exists + vp, exists := h.virtualPods[virtualSandboxID] + if !exists { + return fmt.Errorf("virtual pod %s does not exist", virtualSandboxID) + } + + // Add container to virtual pod + vp.Containers[containerID] = true + h.containerToVirtualPod[containerID] = virtualSandboxID + + logrus.WithFields(logrus.Fields{ + "containerID": containerID, + "virtualSandboxID": virtualSandboxID, + }).Info("Container added to virtual pod") + + return nil +} + +// RemoveContainerFromVirtualPod removes a 
container from a virtual pod +func (h *Host) RemoveContainerFromVirtualPod(containerID string) { + h.virtualPodsMutex.Lock() + defer h.virtualPodsMutex.Unlock() + + virtualSandboxID, exists := h.containerToVirtualPod[containerID] + if !exists { + return // Container not in any virtual pod + } + + // Remove from virtual pod + if vp, vpExists := h.virtualPods[virtualSandboxID]; vpExists { + delete(vp.Containers, containerID) + + // If this is the sandbox container, delete the network namespace + if containerID == virtualSandboxID && vp.NetworkNamespace != "" { + if err := RemoveNetworkNamespace(context.Background(), vp.NetworkNamespace); err != nil { + logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID). + Warn("Failed to remove virtual pod network namespace (sandbox container removal)") + } + } + + // If this was the last container, cleanup the virtual pod + if len(vp.Containers) == 0 { + h.cleanupVirtualPod(virtualSandboxID) + } + } + + delete(h.containerToVirtualPod, containerID) + + logrus.WithFields(logrus.Fields{ + "containerID": containerID, + "virtualSandboxID": virtualSandboxID, + }).Info("Container removed from virtual pod") +} + +// cleanupVirtualPod removes a virtual pod and its cgroup (should be called with mutex held) +func (h *Host) cleanupVirtualPod(virtualSandboxID string) { + if vp, exists := h.virtualPods[virtualSandboxID]; exists { + // Delete the cgroup + if err := vp.CgroupControl.Delete(); err != nil { + logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID). + Warn("Failed to delete virtual pod cgroup") + } + + // Clean up network namespace if this is the last virtual pod using it + // Only remove if this virtual pod was managing the network namespace + if vp.NetworkNamespace != "" { + // For virtual pods, the network namespace is shared, so we only clean it up + // when the virtual pod itself is being destroyed + if err := RemoveNetworkNamespace(context.Background(), vp.NetworkNamespace); err != nil { + logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID). 
+ Warn("Failed to remove virtual pod network namespace") + } + } + + delete(h.virtualPods, virtualSandboxID) + + logrus.WithField("virtualSandboxID", virtualSandboxID).Info("Virtual pod cleaned up") + } +} + +// setupVirtualPodMountsPath creates mount directories for virtual pods +func setupVirtualPodMountsPath(virtualSandboxID, masterSandboxID string) (err error) { + // Create virtual pod specific mount path using the new path generation functions + mountPath := specGuest.VirtualPodMountsDir(virtualSandboxID) + if err := os.MkdirAll(mountPath, 0755); err != nil { + return errors.Wrapf(err, "failed to create virtual pod sandboxMounts dir %v", virtualSandboxID) + } + defer func() { + if err != nil { + _ = os.RemoveAll(mountPath) + } + }() + + return storage.MountRShared(mountPath) +} diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go index 23fa3a42bc..ebfc0e0865 100644 --- a/internal/guest/runtime/hcsv2/workload_container.go +++ b/internal/guest/runtime/hcsv2/workload_container.go @@ -33,6 +33,9 @@ func mkdirAllModePerm(target string) error { } func updateSandboxMounts(sbid string, spec *oci.Spec) error { + // Check if this is a virtual pod + virtualSandboxID := spec.Annotations[annotations.VirtualPodID] + for i, m := range spec.Mounts { if !strings.HasPrefix(m.Source, guestpath.SandboxMountPrefix) && !strings.HasPrefix(m.Source, guestpath.SandboxTmpfsMountPrefix) { @@ -50,10 +53,13 @@ func updateSandboxMounts(sbid string, spec *oci.Spec) error { } } else { - sandboxSource = specGuest.SandboxMountSource(sbid, m.Source) + // Use virtual pod aware mount source + sandboxSource = specGuest.VirtualPodAwareSandboxMountSource(sbid, virtualSandboxID, m.Source) + expectedMountsDir := specGuest.VirtualPodAwareSandboxMountsDir(sbid, virtualSandboxID) + // filepath.Join cleans the resulting path before returning, so it would resolve the relative path if one was given. // Hence, we need to ensure that the resolved path is still under the correct directory - if !strings.HasPrefix(sandboxSource, specGuest.SandboxMountsDir(sbid)) { + if !strings.HasPrefix(sandboxSource, expectedMountsDir) { return errors.Errorf("mount path %v for mount %v is not within sandbox's mounts dir", sandboxSource, m.Source) } } @@ -71,30 +77,33 @@ func updateSandboxMounts(sbid string, spec *oci.Spec) error { } func updateHugePageMounts(sbid string, spec *oci.Spec) error { + // Check if this is a virtual pod + virtualSandboxID := spec.Annotations[annotations.VirtualPodID] + for i, m := range spec.Mounts { - if !strings.HasPrefix(m.Source, guestpath.HugePagesMountPrefix) { - continue - } - mountsDir := specGuest.HugePagesMountsDir(sbid) - subPath := strings.TrimPrefix(m.Source, guestpath.HugePagesMountPrefix) - pageSize := strings.Split(subPath, string(os.PathSeparator))[0] - hugePageMountSource := filepath.Join(mountsDir, subPath) - - // filepath.Join cleans the resulting path before returning so it would resolve the relative path if one was given. 
- // Hence, we need to ensure that the resolved path is still under the correct directory - if !strings.HasPrefix(hugePageMountSource, mountsDir) { - return errors.Errorf("mount path %v for mount %v is not within hugepages's mounts dir", hugePageMountSource, m.Source) - } + if strings.HasPrefix(m.Source, guestpath.HugePagesMountPrefix) { + // Use virtual pod aware hugepages directory + mountsDir := specGuest.VirtualPodAwareHugePagesMountsDir(sbid, virtualSandboxID) + subPath := strings.TrimPrefix(m.Source, guestpath.HugePagesMountPrefix) + pageSize := strings.Split(subPath, string(os.PathSeparator))[0] + hugePageMountSource := filepath.Join(mountsDir, subPath) + + // filepath.Join cleans the resulting path before returning so it would resolve the relative path if one was given. + // Hence, we need to ensure that the resolved path is still under the correct directory + if !strings.HasPrefix(hugePageMountSource, mountsDir) { + return errors.Errorf("mount path %v for mount %v is not within hugepages's mounts dir", hugePageMountSource, m.Source) + } - spec.Mounts[i].Source = hugePageMountSource + spec.Mounts[i].Source = hugePageMountSource - _, err := os.Stat(hugePageMountSource) - if os.IsNotExist(err) { - if err := mkdirAllModePerm(hugePageMountSource); err != nil { - return err - } - if err := unix.Mount("none", hugePageMountSource, "hugetlbfs", 0, "pagesize="+pageSize); err != nil { - return errors.Errorf("mount operation failed for %v failed with error %v", hugePageMountSource, err) + _, err := os.Stat(hugePageMountSource) + if os.IsNotExist(err) { + if err := mkdirAllModePerm(hugePageMountSource); err != nil { + return err + } + if err := unix.Mount("none", hugePageMountSource, "hugetlbfs", 0, "pagesize="+pageSize); err != nil { + return errors.Errorf("mount operation failed for %v failed with error %v", hugePageMountSource, err) + } } } } @@ -224,8 +233,17 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci. } } - // Force the parent cgroup into our /containers root - spec.Linux.CgroupsPath = "/containers/" + id + // Check if this is a virtual pod container + virtualPodID := spec.Annotations[annotations.VirtualPodID] + + // Set cgroup path - check if this is a virtual pod container + if virtualPodID != "" { + // Virtual pod containers go under /containers/virtual-pods/virtualPodID/containerID + spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualPodID + "/" + id + } else { + // Regular containers go under /containers + spec.Linux.CgroupsPath = "/containers/" + id + } if spec.Windows != nil { // we only support Nvidia gpus right now diff --git a/internal/guest/spec/spec.go b/internal/guest/spec/spec.go index 7b008a4b15..2a4efffaa7 100644 --- a/internal/guest/spec/spec.go +++ b/internal/guest/spec/spec.go @@ -79,11 +79,44 @@ func SandboxRootDir(sandboxID string) string { return filepath.Join(guestpath.LCOWRootPrefixInUVM, sandboxID) } +// VirtualPodRootDir returns the virtual pod root directory inside UVM/host. 
+// This is used when multiple pods share a UVM via virtualSandboxID +func VirtualPodRootDir(virtualSandboxID string) string { + // Ensure virtualSandboxID is a relative path to prevent directory traversal + sanitizedID := filepath.Clean(virtualSandboxID) + if filepath.IsAbs(sanitizedID) || strings.Contains(sanitizedID, "..") { + return "" + } + return filepath.Join(guestpath.LCOWRootPrefixInUVM, "virtual-pods", sanitizedID) +} + +// VirtualPodAwareSandboxRootDir returns the appropriate root directory based on whether +// the sandbox is part of a virtual pod or traditional single-pod setup +func VirtualPodAwareSandboxRootDir(sandboxID, virtualSandboxID string) string { + if virtualSandboxID != "" { + return VirtualPodRootDir(virtualSandboxID) + } + return SandboxRootDir(sandboxID) +} + // SandboxMountsDir returns sandbox mounts directory inside UVM/host. func SandboxMountsDir(sandboxID string) string { return filepath.Join(SandboxRootDir(sandboxID), "sandboxMounts") } +// VirtualPodMountsDir returns virtual pod mounts directory inside UVM/host. +func VirtualPodMountsDir(virtualSandboxID string) string { + return filepath.Join(VirtualPodRootDir(virtualSandboxID), "sandboxMounts") +} + +// VirtualPodAwareSandboxMountsDir returns the appropriate mounts directory +func VirtualPodAwareSandboxMountsDir(sandboxID, virtualSandboxID string) string { + if virtualSandboxID != "" { + return VirtualPodMountsDir(virtualSandboxID) + } + return SandboxMountsDir(sandboxID) +} + // SandboxTmpfsMountsDir returns sandbox tmpfs mounts directory inside UVM. func SandboxTmpfsMountsDir(sandboxID string) string { return filepath.Join(SandboxRootDir(sandboxID), "sandboxTmpfsMounts") @@ -94,6 +127,19 @@ func HugePagesMountsDir(sandboxID string) string { return filepath.Join(SandboxRootDir(sandboxID), "hugepages") } +// VirtualPodHugePagesMountsDir returns virtual pod hugepages mounts directory +func VirtualPodHugePagesMountsDir(virtualSandboxID string) string { + return filepath.Join(VirtualPodRootDir(virtualSandboxID), "hugepages") +} + +// VirtualPodAwareHugePagesMountsDir returns the appropriate hugepages directory +func VirtualPodAwareHugePagesMountsDir(sandboxID, virtualSandboxID string) string { + if virtualSandboxID != "" { + return VirtualPodHugePagesMountsDir(virtualSandboxID) + } + return HugePagesMountsDir(sandboxID) +} + // SandboxMountSource returns sandbox mount path inside UVM func SandboxMountSource(sandboxID, path string) string { mountsDir := SandboxMountsDir(sandboxID) @@ -101,6 +147,16 @@ func SandboxMountSource(sandboxID, path string) string { return filepath.Join(mountsDir, subPath) } +// VirtualPodAwareSandboxMountSource returns mount source path for virtual pod aware containers +func VirtualPodAwareSandboxMountSource(sandboxID, virtualSandboxID, path string) string { + if virtualSandboxID != "" { + mountsDir := VirtualPodMountsDir(virtualSandboxID) + subPath := strings.TrimPrefix(path, guestpath.SandboxMountPrefix) + return filepath.Join(mountsDir, subPath) + } + return SandboxMountSource(sandboxID, path) +} + // SandboxTmpfsMountSource returns sandbox tmpfs mount path inside UVM func SandboxTmpfsMountSource(sandboxID, path string) string { tmpfsMountDir := SandboxTmpfsMountsDir(sandboxID) @@ -115,6 +171,16 @@ func HugePagesMountSource(sandboxID, path string) string { return filepath.Join(mountsDir, subPath) } +// VirtualPodAwareHugePagesMountSource returns hugepages mount source for virtual pod aware containers +func VirtualPodAwareHugePagesMountSource(sandboxID, virtualSandboxID, path 
string) string { + if virtualSandboxID != "" { + mountsDir := VirtualPodHugePagesMountsDir(virtualSandboxID) + subPath := strings.TrimPrefix(path, guestpath.HugePagesMountPrefix) + return filepath.Join(mountsDir, subPath) + } + return HugePagesMountSource(sandboxID, path) +} + // GetNetworkNamespaceID returns the `ToLower` of // `spec.Windows.Network.NetworkNamespace` or `""`. func GetNetworkNamespaceID(spec *oci.Spec) string { From 72d8ecced2dadfed19d2982c23cfea2b8d32cecb Mon Sep 17 00:00:00 2001 From: Hamza El-Saawy Date: Tue, 28 Oct 2025 12:47:19 -0400 Subject: [PATCH 3/3] Add multi-pod tmpfs support; fix lint errors Remove unused functions `get(Sandbox|Standalone)(Hostname|Hosts|Resolv)Path` and replace them with their `VirtualPodAware` counterparts to satisfy linter. (The original functions are already replaced wholesale). Expand multi-pod functionality to include tmpfs-backed sandbox mounts. Signed-off-by: Hamza El-Saawy --- internal/guest/runtime/hcsv2/container.go | 11 ++-- .../guest/runtime/hcsv2/sandbox_container.go | 26 +++------ .../runtime/hcsv2/standalone_container.go | 46 ++++++---------- internal/guest/runtime/hcsv2/uvm.go | 54 ++++++++++++++----- .../guest/runtime/hcsv2/workload_container.go | 54 ++++++++++--------- internal/guest/spec/spec.go | 23 ++++++++ 6 files changed, 120 insertions(+), 94 deletions(-) diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 7d381301a5..58680c199f 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -201,14 +201,17 @@ func (c *Container) Delete(ctx context.Context) error { } // remove user mounts in sandbox container - use virtual pod aware paths - mountsDir := specGuest.VirtualPodAwareSandboxMountsDir(c.id, virtualSandboxID) - if err := storage.UnmountAllInPath(ctx, mountsDir, true); err != nil { + if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareSandboxMountsDir(c.id, virtualSandboxID), true); err != nil { entity.WithError(err).Error("failed to unmount sandbox mounts") } + // remove user mounts in tmpfs sandbox container - use virtual pod aware paths + if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareSandboxTmpfsMountsDir(c.id, virtualSandboxID), true); err != nil { + entity.WithError(err).Error("failed to unmount tmpfs sandbox mounts") + } + // remove hugepages mounts in sandbox container - use virtual pod aware paths - hugePagesDir := specGuest.VirtualPodAwareHugePagesMountsDir(c.id, virtualSandboxID) - if err := storage.UnmountAllInPath(ctx, hugePagesDir, true); err != nil { + if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareHugePagesMountsDir(c.id, virtualSandboxID), true); err != nil { entity.WithError(err).Error("failed to unmount hugepages mounts") } } diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go index 3d1730e7e5..7456e1462a 100644 --- a/internal/guest/runtime/hcsv2/sandbox_container.go +++ b/internal/guest/runtime/hcsv2/sandbox_container.go @@ -20,27 +20,15 @@ import ( "github.com/Microsoft/hcsshim/pkg/annotations" ) -func getSandboxHostnamePath(id string) string { - return filepath.Join(specGuest.SandboxRootDir(id), "hostname") -} - -func getVirtualPodAwareSandboxHostnamePath(id, virtualSandboxID string) string { +func getSandboxHostnamePath(id, virtualSandboxID string) string { return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hostname") } -func getSandboxHostsPath(id 
string) string { - return filepath.Join(specGuest.SandboxRootDir(id), "hosts") -} - -func getVirtualPodAwareSandboxHostsPath(id, virtualSandboxID string) string { +func getSandboxHostsPath(id, virtualSandboxID string) string { return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hosts") } -func getSandboxResolvPath(id string) string { - return filepath.Join(specGuest.SandboxRootDir(id), "resolv.conf") -} - -func getVirtualPodAwareSandboxResolvPath(id, virtualSandboxID string) string { +func getSandboxResolvPath(id, virtualSandboxID string) string { return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "resolv.conf") } @@ -74,19 +62,18 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) ( } } - sandboxHostnamePath := getVirtualPodAwareSandboxHostnamePath(id, virtualSandboxID) + sandboxHostnamePath := getSandboxHostnamePath(id, virtualSandboxID) if err := os.WriteFile(sandboxHostnamePath, []byte(hostname+"\n"), 0644); err != nil { return errors.Wrapf(err, "failed to write hostname to %q", sandboxHostnamePath) } // Write the hosts sandboxHostsContent := network.GenerateEtcHostsContent(ctx, hostname) - sandboxHostsPath := getVirtualPodAwareSandboxHostsPath(id, virtualSandboxID) + sandboxHostsPath := getSandboxHostsPath(id, virtualSandboxID) if err := os.WriteFile(sandboxHostsPath, []byte(sandboxHostsContent), 0644); err != nil { return errors.Wrapf(err, "failed to write sandbox hosts to %q", sandboxHostsPath) } - log.G(ctx).Debug("quick setup network namespace, cflick") // Check if this is a virtual pod sandbox container by comparing container ID with virtual pod ID isVirtualPodSandbox := virtualSandboxID != "" && id == virtualSandboxID if strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") || isVirtualPodSandbox { @@ -97,7 +84,6 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) ( } } // Write resolv.conf - log.G(ctx).Debug("sandbox resolv.conf, cflick") ns, err := getNetworkNamespace(specGuest.GetNetworkNamespaceID(spec)) if err != nil { if !strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") { @@ -119,7 +105,7 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) ( if err != nil { return errors.Wrap(err, "failed to generate sandbox resolv.conf content") } - sandboxResolvPath := getVirtualPodAwareSandboxResolvPath(id, virtualSandboxID) + sandboxResolvPath := getSandboxResolvPath(id, virtualSandboxID) if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil { return errors.Wrap(err, "failed to write sandbox resolv.conf") } diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go index 4171d30f96..bb1c5ad390 100644 --- a/internal/guest/runtime/hcsv2/standalone_container.go +++ b/internal/guest/runtime/hcsv2/standalone_container.go @@ -20,40 +20,24 @@ import ( "github.com/Microsoft/hcsshim/pkg/annotations" ) -func getStandaloneRootDir(id string) string { - return filepath.Join(guestpath.LCOWRootPrefixInUVM, id) -} - -func getVirtualPodAwareStandaloneRootDir(id, virtualSandboxID string) string { +func getStandaloneRootDir(id, virtualSandboxID string) string { if virtualSandboxID != "" { // Standalone container in virtual pod gets its own subdir return filepath.Join(guestpath.LCOWRootPrefixInUVM, "virtual-pods", virtualSandboxID, id) } - return getStandaloneRootDir(id) -} - -func getStandaloneHostnamePath(id 
string) string { - return filepath.Join(getStandaloneRootDir(id), "hostname") -} - -func getVirtualPodAwareStandaloneHostnamePath(id, virtualSandboxID string) string { - return filepath.Join(getVirtualPodAwareStandaloneRootDir(id, virtualSandboxID), "hostname") -} - -func getStandaloneHostsPath(id string) string { - return filepath.Join(getStandaloneRootDir(id), "hosts") + return filepath.Join(guestpath.LCOWRootPrefixInUVM, id) } -func getVirtualPodAwareStandaloneHostsPath(id, virtualSandboxID string) string { - return filepath.Join(getVirtualPodAwareStandaloneRootDir(id, virtualSandboxID), "hosts") +func getStandaloneHostnamePath(id, virtualSandboxID string) string { + return filepath.Join(getStandaloneRootDir(id, virtualSandboxID), "hostname") } -func getStandaloneResolvPath(id string) string { - return filepath.Join(getStandaloneRootDir(id), "resolv.conf") +func getStandaloneHostsPath(id, virtualSandboxID string) string { + return filepath.Join(getStandaloneRootDir(id, virtualSandboxID), "hosts") } -func getVirtualPodAwareStandaloneResolvPath(id, virtualSandboxID string) string { - return filepath.Join(getVirtualPodAwareStandaloneRootDir(id, virtualSandboxID), "resolv.conf") +func getStandaloneResolvPath(id, virtualSandboxID string) string { + return filepath.Join(getStandaloneRootDir(id, virtualSandboxID), "resolv.conf") } func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) { @@ -66,7 +50,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec virtualSandboxID := spec.Annotations[annotations.VirtualPodID] // Generate the standalone root dir - virtual pod aware - rootDir := getVirtualPodAwareStandaloneRootDir(id, virtualSandboxID) + rootDir := getStandaloneRootDir(id, virtualSandboxID) if err := os.MkdirAll(rootDir, 0755); err != nil { return errors.Wrapf(err, "failed to create container root directory %q", rootDir) } @@ -87,7 +71,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec // Write the hostname if !specGuest.MountPresent("/etc/hostname", spec.Mounts) { - standaloneHostnamePath := getVirtualPodAwareStandaloneHostnamePath(id, virtualSandboxID) + standaloneHostnamePath := getStandaloneHostnamePath(id, virtualSandboxID) if err := os.WriteFile(standaloneHostnamePath, []byte(hostname+"\n"), 0644); err != nil { return errors.Wrapf(err, "failed to write hostname to %q", standaloneHostnamePath) } @@ -95,7 +79,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec mt := oci.Mount{ Destination: "/etc/hostname", Type: "bind", - Source: getVirtualPodAwareStandaloneHostnamePath(id, virtualSandboxID), + Source: getStandaloneHostnamePath(id, virtualSandboxID), Options: []string{"bind"}, } if specGuest.IsRootReadonly(spec) { @@ -107,7 +91,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec // Write the hosts if !specGuest.MountPresent("/etc/hosts", spec.Mounts) { standaloneHostsContent := network.GenerateEtcHostsContent(ctx, hostname) - standaloneHostsPath := getVirtualPodAwareStandaloneHostsPath(id, virtualSandboxID) + standaloneHostsPath := getStandaloneHostsPath(id, virtualSandboxID) if err := os.WriteFile(standaloneHostsPath, []byte(standaloneHostsContent), 0644); err != nil { return errors.Wrapf(err, "failed to write standalone hosts to %q", standaloneHostsPath) } @@ -115,7 +99,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec mt := oci.Mount{ Destination: "/etc/hosts", Type: "bind", - 
Source: getVirtualPodAwareStandaloneHostsPath(id, virtualSandboxID), + Source: getStandaloneHostsPath(id, virtualSandboxID), Options: []string{"bind"}, } if specGuest.IsRootReadonly(spec) { @@ -140,7 +124,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec if err != nil { return errors.Wrap(err, "failed to generate standalone resolv.conf content") } - standaloneResolvPath := getVirtualPodAwareStandaloneResolvPath(id, virtualSandboxID) + standaloneResolvPath := getStandaloneResolvPath(id, virtualSandboxID) if err := os.WriteFile(standaloneResolvPath, []byte(resolvContent), 0644); err != nil { return errors.Wrap(err, "failed to write standalone resolv.conf") } @@ -148,7 +132,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec mt := oci.Mount{ Destination: "/etc/resolv.conf", Type: "bind", - Source: getVirtualPodAwareStandaloneResolvPath(id, virtualSandboxID), + Source: getStandaloneResolvPath(id, virtualSandboxID), Options: []string{"bind"}, } if specGuest.IsRootReadonly(spec) { diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 0e596a8c9a..b575e7ef30 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -322,8 +322,7 @@ func setupSandboxMountsPath(id string) (err error) { return storage.MountRShared(mountPath) } -func setupSandboxTmpfsMountsPath(id string) error { - var err error +func setupSandboxTmpfsMountsPath(id string) (err error) { tmpfsDir := specGuest.SandboxTmpfsMountsDir(id) if err := os.MkdirAll(tmpfsDir, 0755); err != nil { return errors.Wrapf(err, "failed to create sandbox tmpfs mounts dir in sandbox %v", id) @@ -473,16 +472,13 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM if isVirtualPod { // For virtual pods, create virtual pod specific paths - err = setupVirtualPodMountsPath(virtualPodID, id) - if err != nil { + if err = setupVirtualPodMountsPath(virtualPodID); err != nil { return nil, err } - // Create hugepages path for virtual pod - mountPath := specGuest.VirtualPodHugePagesMountsDir(virtualPodID) - if err := os.MkdirAll(mountPath, 0755); err != nil { - return nil, errors.Wrapf(err, "failed to create virtual pod hugepage mounts dir %v", virtualPodID) + if err = setupVirtualPodTmpfsMountsPath(virtualPodID); err != nil { + return nil, err } - if err := storage.MountRShared(mountPath); err != nil { + if err = setupVirtualPodHugePageMountsPath(virtualPodID); err != nil { return nil, err } } else { @@ -507,8 +503,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM if !ok || sid == "" { return nil, errors.Errorf("unsupported 'io.kubernetes.cri.sandbox-id': '%s'", sid) } - err = setupWorkloadContainerSpec(ctx, sid, id, settings.OCISpecification, settings.OCIBundlePath) - if err != nil { + if err = setupWorkloadContainerSpec(ctx, sid, id, settings.OCISpecification, settings.OCIBundlePath); err != nil { return nil, err } @@ -672,9 +667,9 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM ns, err := getNetworkNamespace(namespaceID) // skip network activity for sandbox containers marked with skip uvm networking annotation if isCRI && err != nil && !strings.EqualFold(settings.OCISpecification.Annotations[annotations.SkipPodNetworking], "true") { - // return nil, err return nil, err } + // standalone is not required to have a networking namespace setup if ns != nil { if err := ns.AssignContainerPid(ctx, c.container.Pid()); err != nil 
{ return nil, err @@ -1556,11 +1551,11 @@ func (h *Host) cleanupVirtualPod(virtualSandboxID string) { } // setupVirtualPodMountsPath creates mount directories for virtual pods -func setupVirtualPodMountsPath(virtualSandboxID, masterSandboxID string) (err error) { +func setupVirtualPodMountsPath(virtualSandboxID string) (err error) { // Create virtual pod specific mount path using the new path generation functions mountPath := specGuest.VirtualPodMountsDir(virtualSandboxID) if err := os.MkdirAll(mountPath, 0755); err != nil { - return errors.Wrapf(err, "failed to create virtual pod sandboxMounts dir %v", virtualSandboxID) + return errors.Wrapf(err, "failed to create virtual pod mounts dir in sandbox %v", virtualSandboxID) } defer func() { if err != nil { @@ -1570,3 +1565,34 @@ func setupVirtualPodMountsPath(virtualSandboxID, masterSandboxID string) (err er return storage.MountRShared(mountPath) } + +func setupVirtualPodTmpfsMountsPath(virtualSandboxID string) (err error) { + tmpfsDir := specGuest.VirtualPodTmpfsMountsDir(virtualSandboxID) + if err := os.MkdirAll(tmpfsDir, 0755); err != nil { + return errors.Wrapf(err, "failed to create virtual pod tmpfs mounts dir in sandbox %v", virtualSandboxID) + } + + defer func() { + if err != nil { + _ = os.RemoveAll(tmpfsDir) + } + }() + + // mount a tmpfs at the tmpfsDir + // this ensures that the tmpfsDir is a mount point and not just a directory + // we don't care if it is already mounted, so ignore EBUSY + if err := unix.Mount("tmpfs", tmpfsDir, "tmpfs", 0, ""); err != nil && !errors.Is(err, unix.EBUSY) { + return errors.Wrapf(err, "failed to mount tmpfs at %s", tmpfsDir) + } + + return storage.MountRShared(tmpfsDir) +} + +func setupVirtualPodHugePageMountsPath(virtualSandboxID string) error { + mountPath := specGuest.VirtualPodHugePagesMountsDir(virtualSandboxID) + if err := os.MkdirAll(mountPath, 0755); err != nil { + return errors.Wrapf(err, "failed to create virtual pod hugepage mounts dir %v", virtualSandboxID) + } + + return storage.MountRShared(mountPath) +} diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go index ebfc0e0865..683c076e2c 100644 --- a/internal/guest/runtime/hcsv2/workload_container.go +++ b/internal/guest/runtime/hcsv2/workload_container.go @@ -45,13 +45,15 @@ func updateSandboxMounts(sbid string, spec *oci.Spec) error { var sandboxSource string // if using `sandbox-tmp://` prefix, we mount a tmpfs in sandboxTmpfsMountsDir if strings.HasPrefix(m.Source, guestpath.SandboxTmpfsMountPrefix) { - sandboxSource = specGuest.SandboxTmpfsMountSource(sbid, m.Source) + // Use virtual pod aware mount source + sandboxSource = specGuest.VirtualPodAwareSandboxTmpfsMountSource(sbid, virtualSandboxID, m.Source) + expectedMountsDir := specGuest.VirtualPodAwareSandboxTmpfsMountsDir(sbid, virtualSandboxID) + // filepath.Join cleans the resulting path before returning, so it would resolve the relative path if one was given. 
// Hence, we need to ensure that the resolved path is still under the correct directory - if !strings.HasPrefix(sandboxSource, specGuest.SandboxTmpfsMountsDir(sbid)) { - return errors.Errorf("mount path %v for mount %v is not within sandboxTmpfsMountsDir", sandboxSource, m.Source) + if !strings.HasPrefix(sandboxSource, expectedMountsDir) { + return errors.Errorf("mount path %v for mount %v is not within sandbox's tmpfs mounts dir", sandboxSource, m.Source) } - } else { // Use virtual pod aware mount source sandboxSource = specGuest.VirtualPodAwareSandboxMountSource(sbid, virtualSandboxID, m.Source) @@ -81,29 +83,31 @@ func updateHugePageMounts(sbid string, spec *oci.Spec) error { virtualSandboxID := spec.Annotations[annotations.VirtualPodID] for i, m := range spec.Mounts { - if strings.HasPrefix(m.Source, guestpath.HugePagesMountPrefix) { - // Use virtual pod aware hugepages directory - mountsDir := specGuest.VirtualPodAwareHugePagesMountsDir(sbid, virtualSandboxID) - subPath := strings.TrimPrefix(m.Source, guestpath.HugePagesMountPrefix) - pageSize := strings.Split(subPath, string(os.PathSeparator))[0] - hugePageMountSource := filepath.Join(mountsDir, subPath) - - // filepath.Join cleans the resulting path before returning so it would resolve the relative path if one was given. - // Hence, we need to ensure that the resolved path is still under the correct directory - if !strings.HasPrefix(hugePageMountSource, mountsDir) { - return errors.Errorf("mount path %v for mount %v is not within hugepages's mounts dir", hugePageMountSource, m.Source) - } + if !strings.HasPrefix(m.Source, guestpath.HugePagesMountPrefix) { + continue + } + + // Use virtual pod aware hugepages directory + mountsDir := specGuest.VirtualPodAwareHugePagesMountsDir(sbid, virtualSandboxID) + subPath := strings.TrimPrefix(m.Source, guestpath.HugePagesMountPrefix) + pageSize := strings.Split(subPath, string(os.PathSeparator))[0] + hugePageMountSource := filepath.Join(mountsDir, subPath) + + // filepath.Join cleans the resulting path before returning so it would resolve the relative path if one was given. 
+		// Hence, we need to ensure that the resolved path is still under the correct directory
+		if !strings.HasPrefix(hugePageMountSource, mountsDir) {
+			return errors.Errorf("mount path %v for mount %v is not within hugepages's mounts dir", hugePageMountSource, m.Source)
+		}
 
-			spec.Mounts[i].Source = hugePageMountSource
+		spec.Mounts[i].Source = hugePageMountSource
 
-			_, err := os.Stat(hugePageMountSource)
-			if os.IsNotExist(err) {
-				if err := mkdirAllModePerm(hugePageMountSource); err != nil {
-					return err
-				}
-				if err := unix.Mount("none", hugePageMountSource, "hugetlbfs", 0, "pagesize="+pageSize); err != nil {
-					return errors.Errorf("mount operation failed for %v failed with error %v", hugePageMountSource, err)
-				}
+		_, err := os.Stat(hugePageMountSource)
+		if os.IsNotExist(err) {
+			if err := mkdirAllModePerm(hugePageMountSource); err != nil {
+				return err
+			}
+			if err := unix.Mount("none", hugePageMountSource, "hugetlbfs", 0, "pagesize="+pageSize); err != nil {
+				return errors.Errorf("mount operation for %v failed with error %v", hugePageMountSource, err)
 			}
 		}
 	}
 }
diff --git a/internal/guest/spec/spec.go b/internal/guest/spec/spec.go
index 2a4efffaa7..5518ae4166 100644
--- a/internal/guest/spec/spec.go
+++ b/internal/guest/spec/spec.go
@@ -122,6 +122,19 @@ func SandboxTmpfsMountsDir(sandboxID string) string {
 	return filepath.Join(SandboxRootDir(sandboxID), "sandboxTmpfsMounts")
 }
 
+// VirtualPodTmpfsMountsDir returns virtual pod tmpfs mounts directory inside UVM/host.
+func VirtualPodTmpfsMountsDir(virtualSandboxID string) string {
+	return filepath.Join(VirtualPodRootDir(virtualSandboxID), "sandboxTmpfsMounts")
+}
+
+// VirtualPodAwareSandboxTmpfsMountsDir returns the virtual pod tmpfs mounts dir when virtualSandboxID is set, and the sandbox tmpfs mounts dir otherwise.
+func VirtualPodAwareSandboxTmpfsMountsDir(sandboxID, virtualSandboxID string) string {
+	if virtualSandboxID != "" {
+		return VirtualPodTmpfsMountsDir(virtualSandboxID)
+	}
+	return SandboxTmpfsMountsDir(sandboxID)
+}
+
 // HugePagesMountsDir returns hugepages mounts directory inside UVM.
 func HugePagesMountsDir(sandboxID string) string {
 	return filepath.Join(SandboxRootDir(sandboxID), "hugepages")
@@ -164,6 +177,16 @@ func SandboxTmpfsMountSource(sandboxID, path string) string {
 	return filepath.Join(tmpfsMountDir, subPath)
 }
 
+// VirtualPodAwareSandboxTmpfsMountSource returns the tmpfs mount source path, using the virtual pod tmpfs mounts dir when virtualSandboxID is set and the sandbox tmpfs mounts dir otherwise.
+func VirtualPodAwareSandboxTmpfsMountSource(sandboxID, virtualSandboxID, path string) string {
+	if virtualSandboxID != "" {
+		mountsDir := VirtualPodTmpfsMountsDir(virtualSandboxID)
+		subPath := strings.TrimPrefix(path, guestpath.SandboxTmpfsMountPrefix)
+		return filepath.Join(mountsDir, subPath)
+	}
+	return SandboxTmpfsMountSource(sandboxID, path)
+}
+
 // HugePagesMountSource returns hugepages mount path inside UVM
 func HugePagesMountSource(sandboxID, path string) string {
 	mountsDir := HugePagesMountsDir(sandboxID)