diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go
index 25751763dd..36ae1991b6 100644
--- a/cmd/gcs/main.go
+++ b/cmd/gcs/main.go
@@ -10,6 +10,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"strings"
 	"syscall"
 	"time"
 
@@ -67,7 +68,12 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre
 		}
 		count++
 
-		msg := "memory usage for cgroup exceeded threshold"
+		var msg string
+		if strings.HasPrefix(cgName, "/containers/virtual-pods") {
+			msg = "memory usage for virtual pods cgroup exceeded threshold"
+		} else {
+			msg = "memory usage for cgroup exceeded threshold"
+		}
 		entry := logrus.WithFields(logrus.Fields{
 			"gcsStartTime": startTime,
 			"time":         time.Now(),
@@ -294,40 +300,9 @@ func main() {
 	// Continuously log /dev/kmsg
 	go kmsg.ReadForever(kmsg.LogLevel(*kmsgLogLevel))
 
-	tport := &transport.VsockTransport{}
-	rtime, err := runc.NewRuntime(baseLogPath)
-	if err != nil {
-		logrus.WithError(err).Fatal("failed to initialize new runc runtime")
-	}
-	mux := bridge.NewBridgeMux()
-	b := bridge.Bridge{
-		Handler:  mux,
-		EnableV4: *v4,
-	}
-	h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter)
-	b.AssignHandlers(mux, h)
-
-	var bridgeIn io.ReadCloser
-	var bridgeOut io.WriteCloser
-	if *useInOutErr {
-		bridgeIn = os.Stdin
-		bridgeOut = os.Stdout
-	} else {
-		const commandPort uint32 = 0x40000000
-		bridgeCon, err := tport.Dial(commandPort)
-		if err != nil {
-			logrus.WithFields(logrus.Fields{
-				"port":          commandPort,
-				logrus.ErrorKey: err,
-			}).Fatal("failed to dial host vsock connection")
-		}
-		bridgeIn = bridgeCon
-		bridgeOut = bridgeCon
-	}
-
 	// Setup the UVM cgroups to protect against a workload taking all available
-	// memory and causing the GCS to malfunction we create two cgroups: gcs,
-	// containers.
+	// memory and causing the GCS to malfunction, we create cgroups: gcs,
+	// containers, and virtual-pods for multi-pod support.
 	//
 	// Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy
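The threshold events drained in readMemoryEvents come from an eventfd that cgroup1 hands back at registration time. A minimal sketch of that pattern, assuming the containerd/cgroups v3 cgroup1 API already used in this file; `watchThreshold` is an illustrative name, not part of this change:

```go
package main

import (
	"encoding/binary"
	"os"

	cgroups "github.com/containerd/cgroups/v3/cgroup1"
	"github.com/sirupsen/logrus"
)

// watchThreshold registers a memory-threshold event on a cgroup and blocks on
// the returned eventfd; each successful 8-byte read means usage crossed the
// threshold at least once since the last read.
func watchThreshold(cg cgroups.Cgroup, cgName string, threshold uint64) error {
	efd, err := cg.RegisterMemoryEvent(cgroups.MemoryThresholdEvent(threshold, false))
	if err != nil {
		return err
	}
	efdFile := os.NewFile(efd, "efd")
	defer efdFile.Close()

	for {
		var count uint64
		// Blocks until the kernel signals a threshold crossing.
		if err := binary.Read(efdFile, binary.LittleEndian, &count); err != nil {
			return err
		}
		logrus.WithField("cgroup", cgName).Warn("memory usage for cgroup exceeded threshold")
	}
}
```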
@@ -357,6 +332,18 @@ func main() {
 	}
 	defer containersControl.Delete() //nolint:errcheck
 
+	// Create virtual-pods cgroup hierarchy for multi-pod support
+	// This will be the parent for all virtual pod cgroups: /containers/virtual-pods/{virtualSandboxID}
+	virtualPodsControl, err := cgroups.New(cgroups.StaticPath("/containers/virtual-pods"), &oci.LinuxResources{
+		Memory: &oci.LinuxMemory{
+			Limit: &containersLimit, // Share the same limit as containers
+		},
+	})
+	if err != nil {
+		logrus.WithError(err).Fatal("failed to create containers/virtual-pods cgroup")
+	}
+	defer virtualPodsControl.Delete() //nolint:errcheck
+
 	gcsControl, err := cgroups.New(cgroups.StaticPath("/gcs"), &oci.LinuxResources{})
 	if err != nil {
 		logrus.WithError(err).Fatal("failed to create gcs cgroup")
@@ -366,6 +353,39 @@ func main() {
 		logrus.WithError(err).Fatal("failed add gcs pid to gcs cgroup")
 	}
 
+	tport := &transport.VsockTransport{}
+	rtime, err := runc.NewRuntime(baseLogPath)
+	if err != nil {
+		logrus.WithError(err).Fatal("failed to initialize new runc runtime")
+	}
+	mux := bridge.NewBridgeMux()
+	b := bridge.Bridge{
+		Handler:  mux,
+		EnableV4: *v4,
+	}
+	h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter)
+	// Initialize virtual pod support in the host
+	h.InitializeVirtualPodSupport(virtualPodsControl)
+	b.AssignHandlers(mux, h)
+
+	var bridgeIn io.ReadCloser
+	var bridgeOut io.WriteCloser
+	if *useInOutErr {
+		bridgeIn = os.Stdin
+		bridgeOut = os.Stdout
+	} else {
+		const commandPort uint32 = 0x40000000
+		bridgeCon, err := tport.Dial(commandPort)
+		if err != nil {
+			logrus.WithFields(logrus.Fields{
+				"port":          commandPort,
+				logrus.ErrorKey: err,
+			}).Fatal("failed to dial host vsock connection")
+		}
+		bridgeIn = bridgeCon
+		bridgeOut = bridgeCon
+	}
+
 	event := cgroups.MemoryThresholdEvent(*gcsMemLimitBytes, false)
 	gefd, err := gcsControl.RegisterMemoryEvent(event)
 	if err != nil {
@@ -381,6 +401,14 @@ func main() {
 	oomFile := os.NewFile(oom, "cefd")
 	defer oomFile.Close()
 
+	// Setup OOM monitoring for virtual-pods cgroup
+	virtualPodsOom, err := virtualPodsControl.OOMEventFD()
+	if err != nil {
+		logrus.WithError(err).Fatal("failed to retrieve the virtual-pods cgroups oom eventfd")
+	}
+	virtualPodsOomFile := os.NewFile(virtualPodsOom, "vp-oomfd")
+	defer virtualPodsOomFile.Close()
+
 	// time synchronization service
 	if !(*disableTimeSync) {
 		if err = startTimeSyncService(); err != nil {
@@ -390,6 +418,7 @@ func main() {
 	go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl)
 	go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl)
+	go readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, virtualPodsControl)
 	err = b.ListenAndServe(bridgeIn, bridgeOut)
 	if err != nil {
 		logrus.WithFields(logrus.Fields{
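For orientation, this is the guest cgroup tree main() now provisions, restated from the hunks above (the per-pod children are created later by CreateVirtualPod):

```go
// Sketch of the resulting cgroup1 layout inside the UVM:
//
//   /gcs                                       GCS itself (gcsMemLimitBytes threshold)
//   /containers                                traditional containers (containersLimit)
//   /containers/virtual-pods                   shared parent, same containersLimit
//   /containers/virtual-pods/<virtualPodID>    one per virtual pod
//   /containers/virtual-pods/<virtualPodID>/<containerID>
const (
	gcsCgroup         = "/gcs"
	containersCgroup  = "/containers"
	virtualPodsCgroup = "/containers/virtual-pods"
)
```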
diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go
index 9627daf645..58680c199f 100644
--- a/internal/guest/runtime/hcsv2/container.go
+++ b/internal/guest/runtime/hcsv2/container.go
@@ -30,6 +30,7 @@ import (
 	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/internal/protocol/guestrequest"
 	"github.com/Microsoft/hcsshim/internal/protocol/guestresource"
+	"github.com/Microsoft/hcsshim/pkg/annotations"
 )
 
 // containerStatus has been introduced to enable parallel container creation
@@ -193,13 +194,24 @@ func (c *Container) Delete(ctx context.Context) error {
 	entity := log.G(ctx).WithField(logfields.ContainerID, c.id)
 	entity.Info("opengcs::Container::Delete")
 	if c.isSandbox {
-		// remove user mounts in sandbox container
-		if err := storage.UnmountAllInPath(ctx, specGuest.SandboxMountsDir(c.id), true); err != nil {
+		// Check if this is a virtual pod
+		virtualSandboxID := ""
+		if c.spec != nil && c.spec.Annotations != nil {
+			virtualSandboxID = c.spec.Annotations[annotations.VirtualPodID]
+		}
+
+		// remove user mounts in sandbox container - use virtual pod aware paths
+		if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareSandboxMountsDir(c.id, virtualSandboxID), true); err != nil {
 			entity.WithError(err).Error("failed to unmount sandbox mounts")
 		}
-		// remove hugepages mounts in sandbox container
-		if err := storage.UnmountAllInPath(ctx, specGuest.HugePagesMountsDir(c.id), true); err != nil {
+		// remove user mounts in tmpfs sandbox container - use virtual pod aware paths
+		if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareSandboxTmpfsMountsDir(c.id, virtualSandboxID), true); err != nil {
+			entity.WithError(err).Error("failed to unmount tmpfs sandbox mounts")
+		}
+
+		// remove hugepages mounts in sandbox container - use virtual pod aware paths
+		if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareHugePagesMountsDir(c.id, virtualSandboxID), true); err != nil {
 			entity.WithError(err).Error("failed to unmount hugepages mounts")
 		}
 	}
diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go
index 03f41afef6..7456e1462a 100644
--- a/internal/guest/runtime/hcsv2/sandbox_container.go
+++ b/internal/guest/runtime/hcsv2/sandbox_container.go
@@ -15,20 +15,21 @@ import (
 	"github.com/Microsoft/hcsshim/internal/guest/network"
 	specGuest "github.com/Microsoft/hcsshim/internal/guest/spec"
+	"github.com/Microsoft/hcsshim/internal/log"
 	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/pkg/annotations"
 )
 
-func getSandboxHostnamePath(id string) string {
-	return filepath.Join(specGuest.SandboxRootDir(id), "hostname")
+func getSandboxHostnamePath(id, virtualSandboxID string) string {
+	return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hostname")
 }
 
-func getSandboxHostsPath(id string) string {
-	return filepath.Join(specGuest.SandboxRootDir(id), "hosts")
+func getSandboxHostsPath(id, virtualSandboxID string) string {
+	return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hosts")
 }
 
-func getSandboxResolvPath(id string) string {
-	return filepath.Join(specGuest.SandboxRootDir(id), "resolv.conf")
+func getSandboxResolvPath(id, virtualSandboxID string) string {
+	return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "resolv.conf")
 }
 
 func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) {
@@ -37,8 +38,11 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (
 	defer func() { oc.SetSpanStatus(span, err) }()
 	span.AddAttributes(trace.StringAttribute("cid", id))
 
-	// Generate the sandbox root dir
-	rootDir := specGuest.SandboxRootDir(id)
+	// Check if this is a virtual pod to use appropriate root directory
+	virtualSandboxID := spec.Annotations[annotations.VirtualPodID]
+
+	// Generate the sandbox root dir - virtual pod aware
+	rootDir := specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID)
 	if err := os.MkdirAll(rootDir, 0755); err != nil {
 		return errors.Wrapf(err, "failed to create sandbox root directory %q", rootDir)
directory %q", rootDir) } @@ -58,39 +62,53 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) ( } } - sandboxHostnamePath := getSandboxHostnamePath(id) + sandboxHostnamePath := getSandboxHostnamePath(id, virtualSandboxID) if err := os.WriteFile(sandboxHostnamePath, []byte(hostname+"\n"), 0644); err != nil { return errors.Wrapf(err, "failed to write hostname to %q", sandboxHostnamePath) } // Write the hosts sandboxHostsContent := network.GenerateEtcHostsContent(ctx, hostname) - sandboxHostsPath := getSandboxHostsPath(id) + sandboxHostsPath := getSandboxHostsPath(id, virtualSandboxID) if err := os.WriteFile(sandboxHostsPath, []byte(sandboxHostsContent), 0644); err != nil { return errors.Wrapf(err, "failed to write sandbox hosts to %q", sandboxHostsPath) } + // Check if this is a virtual pod sandbox container by comparing container ID with virtual pod ID + isVirtualPodSandbox := virtualSandboxID != "" && id == virtualSandboxID + if strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") || isVirtualPodSandbox { + ns := GetOrAddNetworkNamespace(specGuest.GetNetworkNamespaceID(spec)) + err := ns.Sync(ctx) + if err != nil { + return err + } + } // Write resolv.conf ns, err := getNetworkNamespace(specGuest.GetNetworkNamespaceID(spec)) if err != nil { - return err - } - var searches, servers []string - for _, n := range ns.Adapters() { - if len(n.DNSSuffix) > 0 { - searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ",")) + if !strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") { + return err } - if len(n.DNSServerList) > 0 { - servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ",")) + // Networking is skipped, do not error out + log.G(ctx).Infof("setupSandboxContainerSpec: Did not find NS spec %v, err %v", spec, err) + } else { + var searches, servers []string + for _, n := range ns.Adapters() { + if len(n.DNSSuffix) > 0 { + searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ",")) + } + if len(n.DNSServerList) > 0 { + servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ",")) + } + } + resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil) + if err != nil { + return errors.Wrap(err, "failed to generate sandbox resolv.conf content") + } + sandboxResolvPath := getSandboxResolvPath(id, virtualSandboxID) + if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil { + return errors.Wrap(err, "failed to write sandbox resolv.conf") } - } - resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil) - if err != nil { - return errors.Wrap(err, "failed to generate sandbox resolv.conf content") - } - sandboxResolvPath := getSandboxResolvPath(id) - if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil { - return errors.Wrap(err, "failed to write sandbox resolv.conf") } // User.Username is generally only used on Windows, but as there's no (easy/fast at least) way to grab @@ -113,8 +131,14 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) ( // also has a concept of a sandbox/shm file when the IPC NamespaceMode != // NODE. 
-	// Force the parent cgroup into our /containers root
-	spec.Linux.CgroupsPath = "/containers/" + id
+	// Set cgroup path - check if this is a virtual pod
+	if virtualSandboxID != "" {
+		// Virtual pod sandbox gets its own cgroup under /containers/virtual-pods using the virtual pod ID
+		spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID
+	} else {
+		// Traditional sandbox goes under /containers
+		spec.Linux.CgroupsPath = "/containers/" + id
+	}
 
 	// Clear the windows section as we dont want to forward to runc
 	spec.Windows = nil
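To see what the new two-argument helpers buy, here is how they resolve for a traditional pod versus a virtual pod. A sketch with illustrative IDs; it assumes guestpath.LCOWRootPrefixInUVM is "/run/gcs/c", which matches current hcsshim sources but should be verified:

```go
// Illustration of how getSandboxHostnamePath pivots on virtualSandboxID.
func exampleSandboxPaths() {
	// Traditional pod: files live under the sandbox's own ID.
	_ = getSandboxHostnamePath("pod-a", "")
	// -> /run/gcs/c/pod-a/hostname

	// Virtual pod: every sandbox tagged with virtual pod "vpod-1" resolves to
	// the same shared root, regardless of its own container ID.
	_ = getSandboxHostnamePath("pod-a", "vpod-1")
	// -> /run/gcs/c/virtual-pods/vpod-1/hostname
}
```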
diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go
index eecf52e026..bb1c5ad390 100644
--- a/internal/guest/runtime/hcsv2/standalone_container.go
+++ b/internal/guest/runtime/hcsv2/standalone_container.go
@@ -17,22 +17,27 @@ import (
 	specGuest "github.com/Microsoft/hcsshim/internal/guest/spec"
 	"github.com/Microsoft/hcsshim/internal/guestpath"
 	"github.com/Microsoft/hcsshim/internal/oc"
+	"github.com/Microsoft/hcsshim/pkg/annotations"
 )
 
-func getStandaloneRootDir(id string) string {
+func getStandaloneRootDir(id, virtualSandboxID string) string {
+	if virtualSandboxID != "" {
+		// Standalone container in virtual pod gets its own subdir
+		return filepath.Join(guestpath.LCOWRootPrefixInUVM, "virtual-pods", virtualSandboxID, id)
+	}
 	return filepath.Join(guestpath.LCOWRootPrefixInUVM, id)
 }
 
-func getStandaloneHostnamePath(id string) string {
-	return filepath.Join(getStandaloneRootDir(id), "hostname")
+func getStandaloneHostnamePath(id, virtualSandboxID string) string {
+	return filepath.Join(getStandaloneRootDir(id, virtualSandboxID), "hostname")
 }
 
-func getStandaloneHostsPath(id string) string {
-	return filepath.Join(getStandaloneRootDir(id), "hosts")
+func getStandaloneHostsPath(id, virtualSandboxID string) string {
+	return filepath.Join(getStandaloneRootDir(id, virtualSandboxID), "hosts")
 }
 
-func getStandaloneResolvPath(id string) string {
-	return filepath.Join(getStandaloneRootDir(id), "resolv.conf")
+func getStandaloneResolvPath(id, virtualSandboxID string) string {
+	return filepath.Join(getStandaloneRootDir(id, virtualSandboxID), "resolv.conf")
 }
 
 func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) {
@@ -41,8 +46,11 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec
 	defer func() { oc.SetSpanStatus(span, err) }()
 	span.AddAttributes(trace.StringAttribute("cid", id))
 
-	// Generate the standalone root dir
-	rootDir := getStandaloneRootDir(id)
+	// Check if this is a virtual pod (unlikely for standalone)
+	virtualSandboxID := spec.Annotations[annotations.VirtualPodID]
+
+	// Generate the standalone root dir - virtual pod aware
+	rootDir := getStandaloneRootDir(id, virtualSandboxID)
 	if err := os.MkdirAll(rootDir, 0755); err != nil {
 		return errors.Wrapf(err, "failed to create container root directory %q", rootDir)
 	}
@@ -63,7 +71,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec
 	// Write the hostname
 	if !specGuest.MountPresent("/etc/hostname", spec.Mounts) {
-		standaloneHostnamePath := getStandaloneHostnamePath(id)
+		standaloneHostnamePath := getStandaloneHostnamePath(id, virtualSandboxID)
 		if err := os.WriteFile(standaloneHostnamePath, []byte(hostname+"\n"), 0644); err != nil {
 			return errors.Wrapf(err, "failed to write hostname to %q", standaloneHostnamePath)
 		}
@@ -71,7 +79,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec
 		mt := oci.Mount{
 			Destination: "/etc/hostname",
 			Type:        "bind",
-			Source:      getStandaloneHostnamePath(id),
+			Source:      getStandaloneHostnamePath(id, virtualSandboxID),
 			Options:     []string{"bind"},
 		}
 		if specGuest.IsRootReadonly(spec) {
@@ -83,7 +91,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec
 	// Write the hosts
 	if !specGuest.MountPresent("/etc/hosts", spec.Mounts) {
 		standaloneHostsContent := network.GenerateEtcHostsContent(ctx, hostname)
-		standaloneHostsPath := getStandaloneHostsPath(id)
+		standaloneHostsPath := getStandaloneHostsPath(id, virtualSandboxID)
 		if err := os.WriteFile(standaloneHostsPath, []byte(standaloneHostsContent), 0644); err != nil {
 			return errors.Wrapf(err, "failed to write standalone hosts to %q", standaloneHostsPath)
 		}
@@ -91,7 +99,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec
 		mt := oci.Mount{
 			Destination: "/etc/hosts",
 			Type:        "bind",
-			Source:      getStandaloneHostsPath(id),
+			Source:      getStandaloneHostsPath(id, virtualSandboxID),
 			Options:     []string{"bind"},
 		}
 		if specGuest.IsRootReadonly(spec) {
@@ -116,7 +124,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec
 		if err != nil {
 			return errors.Wrap(err, "failed to generate standalone resolv.conf content")
 		}
-		standaloneResolvPath := getStandaloneResolvPath(id)
+		standaloneResolvPath := getStandaloneResolvPath(id, virtualSandboxID)
 		if err := os.WriteFile(standaloneResolvPath, []byte(resolvContent), 0644); err != nil {
 			return errors.Wrap(err, "failed to write standalone resolv.conf")
 		}
@@ -124,7 +132,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec
 		mt := oci.Mount{
 			Destination: "/etc/resolv.conf",
 			Type:        "bind",
-			Source:      getStandaloneResolvPath(id),
+			Source:      getStandaloneResolvPath(id, virtualSandboxID),
 			Options:     []string{"bind"},
 		}
 		if specGuest.IsRootReadonly(spec) {
@@ -133,8 +141,15 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec
 		spec.Mounts = append(spec.Mounts, mt)
 	}
 
-	// Force the parent cgroup into our /containers root
-	spec.Linux.CgroupsPath = "/containers/" + id
+	// Set cgroup path - check if this is part of a virtual pod (unlikely for standalone)
+	if virtualSandboxID != "" {
+		// Standalone container in virtual pod goes under /containers/virtual-pods/{virtualSandboxID}/{containerID}
+		// Each virtualSandboxID creates its own pod-level cgroup for all containers in that virtual pod
+		spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID + "/" + id
+	} else {
+		// Traditional standalone container goes under /containers
+		spec.Linux.CgroupsPath = "/containers/" + id
+	}
 
 	// Clear the windows section as we dont want to forward to runc
 	spec.Windows = nil
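The standalone setup above repeats one pattern three times for /etc/hostname, /etc/hosts, and /etc/resolv.conf. A hypothetical condensation of that pattern, for illustration only; the hunks elide what happens in the read-only branch, assumed here to append "ro":

```go
package hcsv2

import (
	"os"

	oci "github.com/opencontainers/runtime-spec/specs-go"
)

// bindEtcFile writes generated content under the container root dir, then
// bind-mounts it to its /etc destination, read-only when the rootfs is
// read-only. Name and signature are illustrative, not part of this change.
func bindEtcFile(spec *oci.Spec, source, destination string, content []byte, rootReadonly bool) error {
	if err := os.WriteFile(source, content, 0644); err != nil {
		return err
	}
	mt := oci.Mount{
		Destination: destination, // e.g. "/etc/hostname"
		Type:        "bind",
		Source:      source,
		Options:     []string{"bind"},
	}
	if rootReadonly {
		mt.Options = append(mt.Options, "ro") // assumption: mirrors the elided IsRootReadonly branch
	}
	spec.Mounts = append(spec.Mounts, mt)
	return nil
}
```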
"github.com/Microsoft/hcsshim/internal/guest/policy" "github.com/Microsoft/hcsshim/internal/guest/prot" @@ -43,18 +51,23 @@ import ( "github.com/Microsoft/hcsshim/internal/verity" "github.com/Microsoft/hcsshim/pkg/annotations" "github.com/Microsoft/hcsshim/pkg/securitypolicy" - cgroup1stats "github.com/containerd/cgroups/v3/cgroup1/stats" - "github.com/mattn/go-shellwords" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" ) // UVMContainerID is the ContainerID that will be sent on any prot.MessageBase // for V2 where the specific message is targeted at the UVM itself. const UVMContainerID = "00000000-0000-0000-0000-000000000000" +// VirtualPod represents a virtual pod that shares a UVM/Sandbox with other pods +type VirtualPod struct { + VirtualSandboxID string + MasterSandboxID string + NetworkNamespace string + CgroupPath string + CgroupControl cgroups.Cgroup + Containers map[string]bool // containerID -> exists + CreatedAt time.Time +} + // Host is the structure tracking all UVM host state including all containers // and processes. type Host struct { @@ -64,7 +77,12 @@ type Host struct { externalProcessesMutex sync.Mutex externalProcesses map[int]*externalProcess - // Rtime is the Runtime interface used by the GCS core. + // Virtual pod support for multi-pod scenarios + virtualPodsMutex sync.Mutex + virtualPods map[string]*VirtualPod // virtualSandboxID -> VirtualPod + containerToVirtualPod map[string]string // containerID -> virtualSandboxID + virtualPodsCgroupParent cgroups.Cgroup // Parent cgroup for all virtual pods + rtime runtime.Runtime vsock transport.Transport devNullTransport transport.Transport @@ -86,6 +104,8 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s return &Host{ containers: make(map[string]*Container), externalProcesses: make(map[int]*externalProcess), + virtualPods: make(map[string]*VirtualPod), + containerToVirtualPod: make(map[string]string), rtime: rtime, vsock: vsock, devNullTransport: &transport.DevNullTransport{}, @@ -241,10 +261,22 @@ func (h *Host) RemoveContainer(id string) { return } - // delete the network namespace for standalone and sandbox containers - criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType] - if !isCRI || criType == "sandbox" { - _ = RemoveNetworkNamespace(context.Background(), id) + // Check if this container is part of a virtual pod + virtualPodID, isVirtualPod := c.spec.Annotations[annotations.VirtualPodID] + if isVirtualPod { + // Remove from virtual pod tracking + h.RemoveContainerFromVirtualPod(id) + // Network namespace cleanup is handled in virtual pod cleanup when last container is removed. 
@@ -241,10 +261,22 @@ func (h *Host) RemoveContainer(id string) {
 		return
 	}
 
-	// delete the network namespace for standalone and sandbox containers
-	criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType]
-	if !isCRI || criType == "sandbox" {
-		_ = RemoveNetworkNamespace(context.Background(), id)
+	// Check if this container is part of a virtual pod
+	virtualPodID, isVirtualPod := c.spec.Annotations[annotations.VirtualPodID]
+	if isVirtualPod {
+		// Remove from virtual pod tracking
+		h.RemoveContainerFromVirtualPod(id)
+		// Network namespace cleanup is handled in virtual pod cleanup when last container is removed.
+		logrus.WithFields(logrus.Fields{
+			"containerID":  id,
+			"virtualPodID": virtualPodID,
+		}).Info("Container removed from virtual pod")
+	} else {
+		// delete the network namespace for standalone and sandbox containers
+		criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType]
+		if !isCRI || criType == "sandbox" {
+			_ = RemoveNetworkNamespace(context.Background(), id)
+		}
 	}
 
 	delete(h.containers, id)
@@ -290,8 +322,7 @@ func setupSandboxMountsPath(id string) (err error) {
 	return storage.MountRShared(mountPath)
 }
 
-func setupSandboxTmpfsMountsPath(id string) error {
-	var err error
+func setupSandboxTmpfsMountsPath(id string) (err error) {
 	tmpfsDir := specGuest.SandboxTmpfsMountsDir(id)
 	if err := os.MkdirAll(tmpfsDir, 0755); err != nil {
 		return errors.Wrapf(err, "failed to create sandbox tmpfs mounts dir in sandbox %v", id)
 	}
@@ -326,6 +357,23 @@ func setupSandboxHugePageMountsPath(id string) error {
 
 func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VMHostedContainerSettingsV2) (_ *Container, err error) {
 	criType, isCRI := settings.OCISpecification.Annotations[annotations.KubernetesContainerType]
+
+	// Check for virtual pod annotation
+	virtualPodID, isVirtualPod := settings.OCISpecification.Annotations[annotations.VirtualPodID]
+
+	// Special handling for virtual pod sandbox containers:
+	// The first container in a virtual pod (containerID == virtualPodID) should be treated as a sandbox
+	// even if the CRI annotation might indicate otherwise due to host-side UVM setup differences
+	if isVirtualPod && id == virtualPodID {
+		criType = "sandbox"
+		isCRI = true
+		logrus.WithFields(logrus.Fields{
+			"containerID":     id,
+			"virtualPodID":    virtualPodID,
+			"originalCriType": settings.OCISpecification.Annotations[annotations.KubernetesContainerType],
+		}).Info("Virtual pod first container detected - treating as sandbox container")
+	}
+
 	c := &Container{
 		id:    id,
 		vsock: h.vsock,
@@ -347,6 +395,55 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 		}
 	}()
 
+	// Handle virtual pod logic
+	if isVirtualPod && isCRI {
+		logrus.WithFields(logrus.Fields{
+			"containerID":  id,
+			"virtualPodID": virtualPodID,
+			"criType":      criType,
+		}).Info("Processing container for virtual pod")
+
+		if criType == "sandbox" {
+			// This is a virtual pod sandbox - create the virtual pod if it doesn't exist
+			if _, exists := h.GetVirtualPod(virtualPodID); !exists {
+				// Use the network namespace ID from the current container spec
+				// Virtual pods share the same network namespace
+				networkNamespace := specGuest.GetNetworkNamespaceID(settings.OCISpecification)
+				if networkNamespace == "" {
+					networkNamespace = fmt.Sprintf("virtual-pod-%s", virtualPodID)
+				}
+
+				// Extract memory limit from sandbox container spec
+				var memoryLimit *int64
+				if settings.OCISpecification.Linux != nil &&
+					settings.OCISpecification.Linux.Resources != nil &&
+					settings.OCISpecification.Linux.Resources.Memory != nil &&
+					settings.OCISpecification.Linux.Resources.Memory.Limit != nil {
+					memoryLimit = settings.OCISpecification.Linux.Resources.Memory.Limit
+					logrus.WithFields(logrus.Fields{
+						"containerID":  id,
+						"virtualPodID": virtualPodID,
+						"memoryLimit":  *memoryLimit,
+					}).Info("Extracted memory limit from sandbox container spec")
+				} else {
+					logrus.WithFields(logrus.Fields{
+						"containerID":  id,
+						"virtualPodID": virtualPodID,
+					}).Info("No memory limit found in sandbox container spec")
+				}
+
+				if err := h.CreateVirtualPod(ctx, virtualPodID, virtualPodID, networkNamespace, memoryLimit); err != nil {
+					return nil, errors.Wrapf(err, "failed to create virtual pod %s", virtualPodID)
+				}
+			}
+		}
+
+		// Add this container to the virtual pod
+		if err := h.AddContainerToVirtualPod(id, virtualPodID); err != nil {
+			return nil, errors.Wrapf(err, "failed to add container %s to virtual pod %s", id, virtualPodID)
+		}
+	}
+
 	// Normally we would be doing policy checking here at the start of our
 	// "policy gated function". However, we can't for create container as we
 	// need a properly correct sandboxID which might be changed by the code
@@ -362,6 +459,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 	case "sandbox":
 		// Capture namespaceID if any because setupSandboxContainerSpec clears the Windows section.
 		namespaceID = specGuest.GetNetworkNamespaceID(settings.OCISpecification)
+
 		err = setupSandboxContainerSpec(ctx, id, settings.OCISpecification)
 		if err != nil {
 			return nil, err
@@ -372,16 +470,28 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 			}
 		}()
 
-		if err = setupSandboxMountsPath(id); err != nil {
-			return nil, err
-		}
-
-		if err = setupSandboxTmpfsMountsPath(id); err != nil {
-			return nil, err
-		}
-
-		if err = setupSandboxHugePageMountsPath(id); err != nil {
-			return nil, err
+		if isVirtualPod {
+			// For virtual pods, create virtual pod specific paths
+			if err = setupVirtualPodMountsPath(virtualPodID); err != nil {
+				return nil, err
+			}
+			if err = setupVirtualPodTmpfsMountsPath(virtualPodID); err != nil {
+				return nil, err
+			}
+			if err = setupVirtualPodHugePageMountsPath(virtualPodID); err != nil {
+				return nil, err
+			}
+		} else {
+			// Traditional sandbox setup
+			if err = setupSandboxMountsPath(id); err != nil {
+				return nil, err
+			}
+			if err = setupSandboxTmpfsMountsPath(id); err != nil {
+				return nil, err
+			}
+			if err = setupSandboxHugePageMountsPath(id); err != nil {
+				return nil, err
+			}
 		}
 
 		if err := policy.ExtendPolicyWithNetworkingMounts(id, h.securityPolicyEnforcer, settings.OCISpecification); err != nil {
@@ -393,7 +503,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 		if !ok || sid == "" {
 			return nil, errors.Errorf("unsupported 'io.kubernetes.cri.sandbox-id': '%s'", sid)
 		}
-		if err := setupWorkloadContainerSpec(ctx, sid, id, settings.OCISpecification, settings.OCIBundlePath); err != nil {
+		if err = setupWorkloadContainerSpec(ctx, sid, id, settings.OCISpecification, settings.OCIBundlePath); err != nil {
 			return nil, err
 		}
 
@@ -555,7 +665,8 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 	// Sandbox or standalone, move the networks to the container namespace
 	if criType == "sandbox" || !isCRI {
 		ns, err := getNetworkNamespace(namespaceID)
-		if isCRI && err != nil {
+		// skip network activity for sandbox containers marked with skip uvm networking annotation
+		if isCRI && err != nil && !strings.EqualFold(settings.OCISpecification.Annotations[annotations.SkipPodNetworking], "true") {
 			return nil, err
 		}
 		// standalone is not required to have a networking namespace setup
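The sandbox-coercion rule that CreateContainer applies inline can be stated as a small predicate. A hypothetical helper, purely a restatement of the logic above; the name effectiveCRIType is illustrative:

```go
package hcsv2

import "github.com/Microsoft/hcsshim/pkg/annotations"

// effectiveCRIType mirrors CreateContainer's decision: the first container of
// a virtual pod (its ID equals the VirtualPodID annotation) is coerced to a
// sandbox, overriding the CRI container-type annotation set by the host.
func effectiveCRIType(id string, annots map[string]string) (criType string, isCRI bool) {
	criType, isCRI = annots[annotations.KubernetesContainerType]
	if vpID := annots[annotations.VirtualPodID]; vpID != "" && id == vpID {
		return "sandbox", true
	}
	return criType, isCRI
}
```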
@@ -1263,3 +1374,225 @@ func writeFileInDir(dir string, filename string, data []byte, perm os.FileMode)
 	targetFilename := filepath.Join(dir, filename)
 	return os.WriteFile(targetFilename, data, perm)
 }
+
+// Virtual Pod Management Methods
+
+// InitializeVirtualPodSupport sets up the parent cgroup for virtual pods
+func (h *Host) InitializeVirtualPodSupport(virtualPodsCgroup cgroups.Cgroup) {
+	h.virtualPodsMutex.Lock()
+	defer h.virtualPodsMutex.Unlock()
+
+	h.virtualPodsCgroupParent = virtualPodsCgroup
+	logrus.Info("Virtual pod support initialized")
+}
+
+// CreateVirtualPod creates a new virtual pod with its own cgroup and network namespace
+func (h *Host) CreateVirtualPod(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string, memoryLimit *int64) error {
+	h.virtualPodsMutex.Lock()
+	defer h.virtualPodsMutex.Unlock()
+
+	// Check if virtual pod already exists
+	if _, exists := h.virtualPods[virtualSandboxID]; exists {
+		return fmt.Errorf("virtual pod %s already exists", virtualSandboxID)
+	}
+
+	// Create cgroup path for this virtual pod under the parent cgroup
+	parentPath := ""
+	if h.virtualPodsCgroupParent != nil {
+		if pather, ok := h.virtualPodsCgroupParent.(interface{ Path() string }); ok {
+			parentPath = pather.Path()
+		} else {
+			parentPath = "/containers/virtual-pods" // fallback for default behavior
+		}
+	} else {
+		parentPath = "/containers/virtual-pods" // fallback for default behavior
+	}
+	cgroupPath := path.Join(parentPath, virtualSandboxID)
+
+	// Create the cgroup for this virtual pod with memory limit if provided
+	resources := &specs.LinuxResources{}
+	if memoryLimit != nil {
+		resources.Memory = &specs.LinuxMemory{
+			Limit: memoryLimit,
+		}
+		logrus.WithFields(logrus.Fields{
+			"virtualSandboxID": virtualSandboxID,
+			"memoryLimit":      *memoryLimit,
+		}).Info("Creating virtual pod with memory limit")
+	} else {
+		logrus.WithField("virtualSandboxID", virtualSandboxID).Info("Creating virtual pod without memory limit")
+	}
+
+	cgroupControl, err := cgroups.New(cgroups.StaticPath(cgroupPath), resources)
+	if err != nil {
+		return errors.Wrapf(err, "failed to create cgroup for virtual pod %s", virtualSandboxID)
+	}
+
+	// Create virtual pod structure
+	virtualPod := &VirtualPod{
+		VirtualSandboxID: virtualSandboxID,
+		MasterSandboxID:  masterSandboxID,
+		NetworkNamespace: networkNamespace,
+		CgroupPath:       cgroupPath,
+		CgroupControl:    cgroupControl,
+		Containers:       make(map[string]bool),
+		CreatedAt:        time.Now(),
+	}
+
+	h.virtualPods[virtualSandboxID] = virtualPod
+
+	logrus.WithFields(logrus.Fields{
+		"virtualSandboxID": virtualSandboxID,
+		"masterSandboxID":  masterSandboxID,
+		"cgroupPath":       cgroupPath,
+		"networkNamespace": networkNamespace,
+	}).Info("Virtual pod created successfully")
+
+	return nil
+}
+
+// CreateVirtualPodWithoutMemoryLimit creates a virtual pod without memory limits (backward compatibility)
+func (h *Host) CreateVirtualPodWithoutMemoryLimit(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string) error {
+	return h.CreateVirtualPod(ctx, virtualSandboxID, masterSandboxID, networkNamespace, nil)
+}
+
+// GetVirtualPod retrieves a virtual pod by its virtualSandboxID
+func (h *Host) GetVirtualPod(virtualSandboxID string) (*VirtualPod, bool) {
+	h.virtualPodsMutex.Lock()
+	defer h.virtualPodsMutex.Unlock()
+
+	vp, exists := h.virtualPods[virtualSandboxID]
+	return vp, exists
+}
+
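CreateVirtualPod probes the parent cgroup for a Path() method through a type assertion because the stored interface value is not guaranteed to expose one; this is Go's optional-interface idiom. A generic restatement, with cgroupPathOf as an illustrative name:

```go
// cgroupPathOf returns cg's Path() if the concrete type provides that method,
// else the fallback. Within this diff the fallback is the static
// /containers/virtual-pods parent created in main().
func cgroupPathOf(cg interface{}, fallback string) string {
	if p, ok := cg.(interface{ Path() string }); ok {
		return p.Path()
	}
	return fallback
}
```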
"virtualSandboxID": virtualSandboxID, + }).Info("Container added to virtual pod") + + return nil +} + +// RemoveContainerFromVirtualPod removes a container from a virtual pod +func (h *Host) RemoveContainerFromVirtualPod(containerID string) { + h.virtualPodsMutex.Lock() + defer h.virtualPodsMutex.Unlock() + + virtualSandboxID, exists := h.containerToVirtualPod[containerID] + if !exists { + return // Container not in any virtual pod + } + + // Remove from virtual pod + if vp, vpExists := h.virtualPods[virtualSandboxID]; vpExists { + delete(vp.Containers, containerID) + + // If this is the sandbox container, delete the network namespace + if containerID == virtualSandboxID && vp.NetworkNamespace != "" { + if err := RemoveNetworkNamespace(context.Background(), vp.NetworkNamespace); err != nil { + logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID). + Warn("Failed to remove virtual pod network namespace (sandbox container removal)") + } + } + + // If this was the last container, cleanup the virtual pod + if len(vp.Containers) == 0 { + h.cleanupVirtualPod(virtualSandboxID) + } + } + + delete(h.containerToVirtualPod, containerID) + + logrus.WithFields(logrus.Fields{ + "containerID": containerID, + "virtualSandboxID": virtualSandboxID, + }).Info("Container removed from virtual pod") +} + +// cleanupVirtualPod removes a virtual pod and its cgroup (should be called with mutex held) +func (h *Host) cleanupVirtualPod(virtualSandboxID string) { + if vp, exists := h.virtualPods[virtualSandboxID]; exists { + // Delete the cgroup + if err := vp.CgroupControl.Delete(); err != nil { + logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID). + Warn("Failed to delete virtual pod cgroup") + } + + // Clean up network namespace if this is the last virtual pod using it + // Only remove if this virtual pod was managing the network namespace + if vp.NetworkNamespace != "" { + // For virtual pods, the network namespace is shared, so we only clean it up + // when the virtual pod itself is being destroyed + if err := RemoveNetworkNamespace(context.Background(), vp.NetworkNamespace); err != nil { + logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID). 
+ Warn("Failed to remove virtual pod network namespace") + } + } + + delete(h.virtualPods, virtualSandboxID) + + logrus.WithField("virtualSandboxID", virtualSandboxID).Info("Virtual pod cleaned up") + } +} + +// setupVirtualPodMountsPath creates mount directories for virtual pods +func setupVirtualPodMountsPath(virtualSandboxID string) (err error) { + // Create virtual pod specific mount path using the new path generation functions + mountPath := specGuest.VirtualPodMountsDir(virtualSandboxID) + if err := os.MkdirAll(mountPath, 0755); err != nil { + return errors.Wrapf(err, "failed to create virtual pod mounts dir in sandbox %v", virtualSandboxID) + } + defer func() { + if err != nil { + _ = os.RemoveAll(mountPath) + } + }() + + return storage.MountRShared(mountPath) +} + +func setupVirtualPodTmpfsMountsPath(virtualSandboxID string) (err error) { + tmpfsDir := specGuest.VirtualPodTmpfsMountsDir(virtualSandboxID) + if err := os.MkdirAll(tmpfsDir, 0755); err != nil { + return errors.Wrapf(err, "failed to create virtual pod tmpfs mounts dir in sandbox %v", virtualSandboxID) + } + + defer func() { + if err != nil { + _ = os.RemoveAll(tmpfsDir) + } + }() + + // mount a tmpfs at the tmpfsDir + // this ensures that the tmpfsDir is a mount point and not just a directory + // we don't care if it is already mounted, so ignore EBUSY + if err := unix.Mount("tmpfs", tmpfsDir, "tmpfs", 0, ""); err != nil && !errors.Is(err, unix.EBUSY) { + return errors.Wrapf(err, "failed to mount tmpfs at %s", tmpfsDir) + } + + return storage.MountRShared(tmpfsDir) +} + +func setupVirtualPodHugePageMountsPath(virtualSandboxID string) error { + mountPath := specGuest.VirtualPodHugePagesMountsDir(virtualSandboxID) + if err := os.MkdirAll(mountPath, 0755); err != nil { + return errors.Wrapf(err, "failed to create virtual pod hugepage mounts dir %v", virtualSandboxID) + } + + return storage.MountRShared(mountPath) +} diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go index 23fa3a42bc..683c076e2c 100644 --- a/internal/guest/runtime/hcsv2/workload_container.go +++ b/internal/guest/runtime/hcsv2/workload_container.go @@ -33,6 +33,9 @@ func mkdirAllModePerm(target string) error { } func updateSandboxMounts(sbid string, spec *oci.Spec) error { + // Check if this is a virtual pod + virtualSandboxID := spec.Annotations[annotations.VirtualPodID] + for i, m := range spec.Mounts { if !strings.HasPrefix(m.Source, guestpath.SandboxMountPrefix) && !strings.HasPrefix(m.Source, guestpath.SandboxTmpfsMountPrefix) { @@ -42,18 +45,23 @@ func updateSandboxMounts(sbid string, spec *oci.Spec) error { var sandboxSource string // if using `sandbox-tmp://` prefix, we mount a tmpfs in sandboxTmpfsMountsDir if strings.HasPrefix(m.Source, guestpath.SandboxTmpfsMountPrefix) { - sandboxSource = specGuest.SandboxTmpfsMountSource(sbid, m.Source) + // Use virtual pod aware mount source + sandboxSource = specGuest.VirtualPodAwareSandboxTmpfsMountSource(sbid, virtualSandboxID, m.Source) + expectedMountsDir := specGuest.VirtualPodAwareSandboxTmpfsMountsDir(sbid, virtualSandboxID) + // filepath.Join cleans the resulting path before returning, so it would resolve the relative path if one was given. 
diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go
index 23fa3a42bc..683c076e2c 100644
--- a/internal/guest/runtime/hcsv2/workload_container.go
+++ b/internal/guest/runtime/hcsv2/workload_container.go
@@ -33,6 +33,9 @@ func mkdirAllModePerm(target string) error {
 }
 
 func updateSandboxMounts(sbid string, spec *oci.Spec) error {
+	// Check if this is a virtual pod
+	virtualSandboxID := spec.Annotations[annotations.VirtualPodID]
+
 	for i, m := range spec.Mounts {
 		if !strings.HasPrefix(m.Source, guestpath.SandboxMountPrefix) &&
 			!strings.HasPrefix(m.Source, guestpath.SandboxTmpfsMountPrefix) {
@@ -42,18 +45,23 @@ func updateSandboxMounts(sbid string, spec *oci.Spec) error {
 		var sandboxSource string
 		// if using `sandbox-tmp://` prefix, we mount a tmpfs in sandboxTmpfsMountsDir
 		if strings.HasPrefix(m.Source, guestpath.SandboxTmpfsMountPrefix) {
-			sandboxSource = specGuest.SandboxTmpfsMountSource(sbid, m.Source)
+			// Use virtual pod aware mount source
+			sandboxSource = specGuest.VirtualPodAwareSandboxTmpfsMountSource(sbid, virtualSandboxID, m.Source)
+			expectedMountsDir := specGuest.VirtualPodAwareSandboxTmpfsMountsDir(sbid, virtualSandboxID)
+
 			// filepath.Join cleans the resulting path before returning, so it would resolve the relative path if one was given.
 			// Hence, we need to ensure that the resolved path is still under the correct directory
-			if !strings.HasPrefix(sandboxSource, specGuest.SandboxTmpfsMountsDir(sbid)) {
-				return errors.Errorf("mount path %v for mount %v is not within sandboxTmpfsMountsDir", sandboxSource, m.Source)
+			if !strings.HasPrefix(sandboxSource, expectedMountsDir) {
+				return errors.Errorf("mount path %v for mount %v is not within sandbox's tmpfs mounts dir", sandboxSource, m.Source)
 			}
 		} else {
-			sandboxSource = specGuest.SandboxMountSource(sbid, m.Source)
+			// Use virtual pod aware mount source
+			sandboxSource = specGuest.VirtualPodAwareSandboxMountSource(sbid, virtualSandboxID, m.Source)
+			expectedMountsDir := specGuest.VirtualPodAwareSandboxMountsDir(sbid, virtualSandboxID)
+
 			// filepath.Join cleans the resulting path before returning, so it would resolve the relative path if one was given.
 			// Hence, we need to ensure that the resolved path is still under the correct directory
-			if !strings.HasPrefix(sandboxSource, specGuest.SandboxMountsDir(sbid)) {
+			if !strings.HasPrefix(sandboxSource, expectedMountsDir) {
 				return errors.Errorf("mount path %v for mount %v is not within sandbox's mounts dir", sandboxSource, m.Source)
 			}
 		}
@@ -71,11 +79,16 @@ func updateSandboxMounts(sbid string, spec *oci.Spec) error {
 }
 
 func updateHugePageMounts(sbid string, spec *oci.Spec) error {
+	// Check if this is a virtual pod
+	virtualSandboxID := spec.Annotations[annotations.VirtualPodID]
+
 	for i, m := range spec.Mounts {
 		if !strings.HasPrefix(m.Source, guestpath.HugePagesMountPrefix) {
 			continue
 		}
-		mountsDir := specGuest.HugePagesMountsDir(sbid)
+
+		// Use virtual pod aware hugepages directory
+		mountsDir := specGuest.VirtualPodAwareHugePagesMountsDir(sbid, virtualSandboxID)
 		subPath := strings.TrimPrefix(m.Source, guestpath.HugePagesMountPrefix)
 		pageSize := strings.Split(subPath, string(os.PathSeparator))[0]
 		hugePageMountSource := filepath.Join(mountsDir, subPath)
@@ -224,8 +237,17 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.
 		}
 	}
 
-	// Force the parent cgroup into our /containers root
-	spec.Linux.CgroupsPath = "/containers/" + id
+	// Check if this is a virtual pod container
+	virtualPodID := spec.Annotations[annotations.VirtualPodID]
+
+	// Set cgroup path - check if this is a virtual pod container
+	if virtualPodID != "" {
+		// Virtual pod containers go under /containers/virtual-pods/virtualPodID/containerID
+		spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualPodID + "/" + id
+	} else {
+		// Regular containers go under /containers
+		spec.Linux.CgroupsPath = "/containers/" + id
+	}
 
 	if spec.Windows != nil {
 		// we only support Nvidia gpus right now
diff --git a/internal/guest/spec/spec.go b/internal/guest/spec/spec.go
index 7b008a4b15..5518ae4166 100644
--- a/internal/guest/spec/spec.go
+++ b/internal/guest/spec/spec.go
@@ -79,21 +79,80 @@ func SandboxRootDir(sandboxID string) string {
 	return filepath.Join(guestpath.LCOWRootPrefixInUVM, sandboxID)
 }
 
+// VirtualPodRootDir returns the virtual pod root directory inside UVM/host.
+// This is used when multiple pods share a UVM via virtualSandboxID
+func VirtualPodRootDir(virtualSandboxID string) string {
+	// Ensure virtualSandboxID is a relative path to prevent directory traversal
+	sanitizedID := filepath.Clean(virtualSandboxID)
+	if filepath.IsAbs(sanitizedID) || strings.Contains(sanitizedID, "..") {
+		return ""
+	}
+	return filepath.Join(guestpath.LCOWRootPrefixInUVM, "virtual-pods", sanitizedID)
+}
+
+// VirtualPodAwareSandboxRootDir returns the appropriate root directory based on whether
+// the sandbox is part of a virtual pod or traditional single-pod setup
+func VirtualPodAwareSandboxRootDir(sandboxID, virtualSandboxID string) string {
+	if virtualSandboxID != "" {
+		return VirtualPodRootDir(virtualSandboxID)
+	}
+	return SandboxRootDir(sandboxID)
+}
+
 // SandboxMountsDir returns sandbox mounts directory inside UVM/host.
 func SandboxMountsDir(sandboxID string) string {
 	return filepath.Join(SandboxRootDir(sandboxID), "sandboxMounts")
 }
 
+// VirtualPodMountsDir returns virtual pod mounts directory inside UVM/host.
+func VirtualPodMountsDir(virtualSandboxID string) string {
+	return filepath.Join(VirtualPodRootDir(virtualSandboxID), "sandboxMounts")
+}
+
+// VirtualPodAwareSandboxMountsDir returns the appropriate mounts directory
+func VirtualPodAwareSandboxMountsDir(sandboxID, virtualSandboxID string) string {
+	if virtualSandboxID != "" {
+		return VirtualPodMountsDir(virtualSandboxID)
+	}
+	return SandboxMountsDir(sandboxID)
+}
+
 // SandboxTmpfsMountsDir returns sandbox tmpfs mounts directory inside UVM.
 func SandboxTmpfsMountsDir(sandboxID string) string {
 	return filepath.Join(SandboxRootDir(sandboxID), "sandboxTmpfsMounts")
 }
 
+// VirtualPodTmpfsMountsDir returns virtual pod tmpfs mounts directory inside UVM/host.
+func VirtualPodTmpfsMountsDir(virtualSandboxID string) string {
+	return filepath.Join(VirtualPodRootDir(virtualSandboxID), "sandboxTmpfsMounts")
+}
+
+// VirtualPodAwareSandboxTmpfsMountsDir returns the appropriate tmpfs mounts directory
+func VirtualPodAwareSandboxTmpfsMountsDir(sandboxID, virtualSandboxID string) string {
+	if virtualSandboxID != "" {
+		return VirtualPodTmpfsMountsDir(virtualSandboxID)
+	}
+	return SandboxTmpfsMountsDir(sandboxID)
+}
+
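The sanitization in VirtualPodRootDir is worth pinning down with concrete inputs. An illustrative example, again assuming guestpath.LCOWRootPrefixInUVM is "/run/gcs/c":

```go
func exampleVirtualPodRootDir() {
	_ = VirtualPodRootDir("vpod-1")    // "/run/gcs/c/virtual-pods/vpod-1"
	_ = VirtualPodRootDir("../../etc") // "" - ".." rejected to prevent directory traversal
	_ = VirtualPodRootDir("/etc")      // "" - absolute IDs rejected
}
```

Worth noting: the derived helpers (VirtualPodMountsDir and friends) join onto this result without checking for the empty string, so callers should treat "" as an error rather than pass it on.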
 // HugePagesMountsDir returns hugepages mounts directory inside UVM.
 func HugePagesMountsDir(sandboxID string) string {
 	return filepath.Join(SandboxRootDir(sandboxID), "hugepages")
 }
 
+// VirtualPodHugePagesMountsDir returns virtual pod hugepages mounts directory
+func VirtualPodHugePagesMountsDir(virtualSandboxID string) string {
+	return filepath.Join(VirtualPodRootDir(virtualSandboxID), "hugepages")
+}
+
+// VirtualPodAwareHugePagesMountsDir returns the appropriate hugepages directory
+func VirtualPodAwareHugePagesMountsDir(sandboxID, virtualSandboxID string) string {
+	if virtualSandboxID != "" {
+		return VirtualPodHugePagesMountsDir(virtualSandboxID)
+	}
+	return HugePagesMountsDir(sandboxID)
+}
+
 // SandboxMountSource returns sandbox mount path inside UVM
 func SandboxMountSource(sandboxID, path string) string {
 	mountsDir := SandboxMountsDir(sandboxID)
@@ -101,6 +160,16 @@ func SandboxMountSource(sandboxID, path string) string {
 	return filepath.Join(mountsDir, subPath)
 }
 
+// VirtualPodAwareSandboxMountSource returns mount source path for virtual pod aware containers
+func VirtualPodAwareSandboxMountSource(sandboxID, virtualSandboxID, path string) string {
+	if virtualSandboxID != "" {
+		mountsDir := VirtualPodMountsDir(virtualSandboxID)
+		subPath := strings.TrimPrefix(path, guestpath.SandboxMountPrefix)
+		return filepath.Join(mountsDir, subPath)
+	}
+	return SandboxMountSource(sandboxID, path)
+}
+
 // SandboxTmpfsMountSource returns sandbox tmpfs mount path inside UVM
 func SandboxTmpfsMountSource(sandboxID, path string) string {
 	tmpfsMountDir := SandboxTmpfsMountsDir(sandboxID)
@@ -108,6 +177,16 @@ func SandboxTmpfsMountSource(sandboxID, path string) string {
 	return filepath.Join(tmpfsMountDir, subPath)
 }
 
+// VirtualPodAwareSandboxTmpfsMountSource returns tmpfs mount source path for virtual pod aware containers
+func VirtualPodAwareSandboxTmpfsMountSource(sandboxID, virtualSandboxID, path string) string {
+	if virtualSandboxID != "" {
+		mountsDir := VirtualPodTmpfsMountsDir(virtualSandboxID)
+		subPath := strings.TrimPrefix(path, guestpath.SandboxTmpfsMountPrefix)
+		return filepath.Join(mountsDir, subPath)
+	}
+	return SandboxTmpfsMountSource(sandboxID, path)
+}
+
 // HugePagesMountSource returns hugepages mount path inside UVM
 func HugePagesMountSource(sandboxID, path string) string {
 	mountsDir := HugePagesMountsDir(sandboxID)
@@ -115,6 +194,16 @@ func HugePagesMountSource(sandboxID, path string) string {
 	return filepath.Join(mountsDir, subPath)
 }
 
+// VirtualPodAwareHugePagesMountSource returns hugepages mount source for virtual pod aware containers
+func VirtualPodAwareHugePagesMountSource(sandboxID, virtualSandboxID, path string) string {
+	if virtualSandboxID != "" {
+		mountsDir := VirtualPodHugePagesMountsDir(virtualSandboxID)
+		subPath := strings.TrimPrefix(path, guestpath.HugePagesMountPrefix)
+		return filepath.Join(mountsDir, subPath)
+	}
+	return HugePagesMountSource(sandboxID, path)
+}
+
 // GetNetworkNamespaceID returns the `ToLower` of
 // `spec.Windows.Network.NetworkNamespace` or `""`.
 func GetNetworkNamespaceID(spec *oci.Spec) string {
diff --git a/internal/hcsoci/create.go b/internal/hcsoci/create.go
index 317eef6629..bfc5342b65 100644
--- a/internal/hcsoci/create.go
+++ b/internal/hcsoci/create.go
@@ -27,6 +27,7 @@ import (
 	"github.com/Microsoft/hcsshim/internal/resources"
 	"github.com/Microsoft/hcsshim/internal/schemaversion"
 	"github.com/Microsoft/hcsshim/internal/uvm"
+	"github.com/Microsoft/hcsshim/pkg/annotations"
 )
 
 var (
@@ -148,10 +149,14 @@ func configureSandboxNetwork(ctx context.Context, coi *createOptionsInternal, r
 	coi.actualNetworkNamespace = r.NetNS()
 	if coi.HostingSystem != nil {
+		// Check for virtual pod first containers: if containerID == virtualPodID, treat as sandbox for networking configuration
+		virtualPodID := coi.Spec.Annotations[annotations.VirtualPodID]
+		isVirtualPodFirstContainer := virtualPodID != "" && coi.actualID == virtualPodID
+
 		// Only add the network namespace to a standalone or sandbox
 		// container but not a workload container in a sandbox that inherits
 		// the namespace.
-		if ct == oci.KubernetesContainerTypeNone || ct == oci.KubernetesContainerTypeSandbox {
+		if ct == oci.KubernetesContainerTypeNone || ct == oci.KubernetesContainerTypeSandbox || isVirtualPodFirstContainer {
 			if err := coi.HostingSystem.ConfigureNetworking(ctx, coi.actualNetworkNamespace); err != nil {
 				// No network setup type was specified for this UVM. Create and assign one here unless
 				// we received a different error.
diff --git a/pkg/annotations/annotations.go b/pkg/annotations/annotations.go
index 6e76e59423..8d914d3040 100644
--- a/pkg/annotations/annotations.go
+++ b/pkg/annotations/annotations.go
@@ -107,6 +107,21 @@ const (
 	LCOWPrivileged = "io.microsoft.virtualmachine.lcow.privileged"
 )
 
+// LCOW multipod annotations enable multipod scenarios and warm pooling.
+const (
+	// SkipPodNetworking is the annotation to skip networking setup for the pod.
+	// This prevents errors from being raised when the pod is created without endpoints. Boolean.
+	SkipPodNetworking = "io.microsoft.cri.skip-pod-networking"
+
+	// TenantSandboxID is the annotation to specify the ID of an existing tenant sandbox
+	// to use for the pod sandbox. If present, the pod will join the specified tenant sandbox. String.
+	TenantSandboxID = "io.microsoft.cri.tenant-sandbox-id"
+
+	// VirtualPodID is the annotation to specify the ID of a pod, not associated with its own
+	// shim, in which a container should be placed. This is used for multipod scenarios. String.
+	VirtualPodID = "io.microsoft.cri.virtual-pod-id"
+)
+
 // LCOW integrity protection and confidential container annotations.
 const (
 	// DmVerityCreateArgs specifies the `dm-mod.create` parameters to kernel and enables integrity protection of
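Tying the annotations back to the guest-side behavior above, this is how a caller would opt a container into a virtual pod. A hypothetical usage sketch; the pod ID is illustrative:

```go
package main

import (
	oci "github.com/opencontainers/runtime-spec/specs-go"

	"github.com/Microsoft/hcsshim/pkg/annotations"
)

// tagForVirtualPod places a container into virtual pod "vpod-1". The first
// container whose ID equals this value is treated as the pod's sandbox and
// gets the /containers/virtual-pods/vpod-1 cgroup; later containers land in
// per-container children underneath it.
func tagForVirtualPod(spec *oci.Spec) {
	if spec.Annotations == nil {
		spec.Annotations = map[string]string{}
	}
	spec.Annotations[annotations.VirtualPodID] = "vpod-1"
	// Optional: suppress endpoint errors for pods created without networking.
	spec.Annotations[annotations.SkipPodNetworking] = "true"
}
```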