97 changes: 63 additions & 34 deletions cmd/gcs/main.go
@@ -10,6 +10,7 @@ import (
"os"
"os/exec"
"path/filepath"
"strings"
"syscall"
"time"

@@ -67,7 +68,12 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre
}

count++
msg := "memory usage for cgroup exceeded threshold"
var msg string
if strings.HasPrefix(cgName, "/virtual-pods") {
msg = "memory usage for virtual pods cgroup exceeded threshold"
} else {
msg = "memory usage for cgroup exceeded threshold"
}
entry := logrus.WithFields(logrus.Fields{
"gcsStartTime": startTime,
"time": time.Now(),
@@ -294,40 +300,9 @@ func main() {
// Continuously log /dev/kmsg
go kmsg.ReadForever(kmsg.LogLevel(*kmsgLogLevel))

tport := &transport.VsockTransport{}
rtime, err := runc.NewRuntime(baseLogPath)
if err != nil {
logrus.WithError(err).Fatal("failed to initialize new runc runtime")
}
mux := bridge.NewBridgeMux()
b := bridge.Bridge{
Handler: mux,
EnableV4: *v4,
}
h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter)
b.AssignHandlers(mux, h)

var bridgeIn io.ReadCloser
var bridgeOut io.WriteCloser
if *useInOutErr {
bridgeIn = os.Stdin
bridgeOut = os.Stdout
} else {
const commandPort uint32 = 0x40000000
bridgeCon, err := tport.Dial(commandPort)
if err != nil {
logrus.WithFields(logrus.Fields{
"port": commandPort,
logrus.ErrorKey: err,
}).Fatal("failed to dial host vsock connection")
}
bridgeIn = bridgeCon
bridgeOut = bridgeCon
}

// Setup the UVM cgroups to protect against a workload taking all available
// memory and causing the GCS to malfunction we create two cgroups: gcs,
// containers.
// memory and causing the GCS to malfunction we create cgroups: gcs,
// containers, and virtual-pods for multi-pod support.
//

// Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy
@@ -357,6 +332,18 @@ func main() {
}
defer containersControl.Delete() //nolint:errcheck

// Create virtual-pods cgroup hierarchy for multi-pod support
// This will be the parent for all virtual pod cgroups: /containers/virtual-pods/{virtualSandboxID}
virtualPodsControl, err := cgroups.New(cgroups.StaticPath("/containers/virtual-pods"), &oci.LinuxResources{
Memory: &oci.LinuxMemory{
Limit: &containersLimit, // Share the same limit as containers
},
})
if err != nil {
logrus.WithError(err).Fatal("failed to create containers/virtual-pods cgroup")
}
defer virtualPodsControl.Delete() //nolint:errcheck

gcsControl, err := cgroups.New(cgroups.StaticPath("/gcs"), &oci.LinuxResources{})
if err != nil {
logrus.WithError(err).Fatal("failed to create gcs cgroup")
@@ -366,6 +353,39 @@ func main() {
logrus.WithError(err).Fatal("failed add gcs pid to gcs cgroup")
}

tport := &transport.VsockTransport{}
rtime, err := runc.NewRuntime(baseLogPath)
if err != nil {
logrus.WithError(err).Fatal("failed to initialize new runc runtime")
}
mux := bridge.NewBridgeMux()
b := bridge.Bridge{
Handler: mux,
EnableV4: *v4,
}
h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter)
// Initialize virtual pod support in the host
h.InitializeVirtualPodSupport(virtualPodsControl)
b.AssignHandlers(mux, h)

var bridgeIn io.ReadCloser
var bridgeOut io.WriteCloser
if *useInOutErr {
bridgeIn = os.Stdin
bridgeOut = os.Stdout
} else {
const commandPort uint32 = 0x40000000
bridgeCon, err := tport.Dial(commandPort)
if err != nil {
logrus.WithFields(logrus.Fields{
"port": commandPort,
logrus.ErrorKey: err,
}).Fatal("failed to dial host vsock connection")
}
bridgeIn = bridgeCon
bridgeOut = bridgeCon
}

event := cgroups.MemoryThresholdEvent(*gcsMemLimitBytes, false)
gefd, err := gcsControl.RegisterMemoryEvent(event)
if err != nil {
@@ -381,6 +401,14 @@ func main() {
oomFile := os.NewFile(oom, "cefd")
defer oomFile.Close()

// Setup OOM monitoring for virtual-pods cgroup
virtualPodsOom, err := virtualPodsControl.OOMEventFD()
if err != nil {
logrus.WithError(err).Fatal("failed to retrieve the virtual-pods cgroups oom eventfd")
}
virtualPodsOomFile := os.NewFile(virtualPodsOom, "vp-oomfd")
defer virtualPodsOomFile.Close()

// time synchronization service
if !(*disableTimeSync) {
if err = startTimeSyncService(); err != nil {
@@ -390,6 +418,7 @@ func main() {

go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl)
go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl)
go readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, virtualPodsControl)
err = b.ListenAndServe(bridgeIn, bridgeOut)
if err != nil {
logrus.WithFields(logrus.Fields{
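For reference, the log-message selection added to readMemoryEvents above keys purely off the cgroup name prefix, and main now starts one memory-event reader per cgroup (/gcs, /containers, and /containers/virtual-pods). A minimal standalone sketch of that branch; the helper name pickMessage is illustrative only and not part of the PR:

package main

import (
	"fmt"
	"strings"
)

// pickMessage mirrors the branch added to readMemoryEvents: cgroup names that
// begin with "/virtual-pods" get a dedicated message so virtual-pod memory
// pressure stands out in the GCS logs.
func pickMessage(cgName string) string {
	if strings.HasPrefix(cgName, "/virtual-pods") {
		return "memory usage for virtual pods cgroup exceeded threshold"
	}
	return "memory usage for cgroup exceeded threshold"
}

func main() {
	for _, cg := range []string{"/gcs", "/containers", "/virtual-pods/sandbox-a"} {
		fmt.Printf("%-26s -> %s\n", cg, pickMessage(cg))
	}
}
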
20 changes: 16 additions & 4 deletions internal/guest/runtime/hcsv2/container.go
@@ -30,6 +30,7 @@ import (
"github.com/Microsoft/hcsshim/internal/oc"
"github.com/Microsoft/hcsshim/internal/protocol/guestrequest"
"github.com/Microsoft/hcsshim/internal/protocol/guestresource"
"github.com/Microsoft/hcsshim/pkg/annotations"
)

// containerStatus has been introduced to enable parallel container creation
@@ -193,13 +194,24 @@ func (c *Container) Delete(ctx context.Context) error {
entity := log.G(ctx).WithField(logfields.ContainerID, c.id)
entity.Info("opengcs::Container::Delete")
if c.isSandbox {
// remove user mounts in sandbox container
if err := storage.UnmountAllInPath(ctx, specGuest.SandboxMountsDir(c.id), true); err != nil {
// Check if this is a virtual pod
virtualSandboxID := ""
if c.spec != nil && c.spec.Annotations != nil {
virtualSandboxID = c.spec.Annotations[annotations.VirtualPodID]
}

// remove user mounts in sandbox container - use virtual pod aware paths
if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareSandboxMountsDir(c.id, virtualSandboxID), true); err != nil {
entity.WithError(err).Error("failed to unmount sandbox mounts")
}

// remove hugepages mounts in sandbox container
if err := storage.UnmountAllInPath(ctx, specGuest.HugePagesMountsDir(c.id), true); err != nil {
// remove user mounts in tmpfs sandbox container - use virtual pod aware paths
if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareSandboxTmpfsMountsDir(c.id, virtualSandboxID), true); err != nil {
entity.WithError(err).Error("failed to unmount tmpfs sandbox mounts")
}

// remove hugepages mounts in sandbox container - use virtual pod aware paths
if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareHugePagesMountsDir(c.id, virtualSandboxID), true); err != nil {
entity.WithError(err).Error("failed to unmount hugepages mounts")
}
}
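The VirtualPodAware* path helpers used by Delete above live in internal/guest/spec; their bodies are not part of this diff. A hedged sketch of the shape such a helper could take, assuming the conventional /run/gcs/c guest state root and per-pod nesting under the virtual sandbox ID; both assumptions are mine, not taken from the PR:

package spec

import "path/filepath"

// guestStateRoot is an assumption for this sketch; the real constant lives in
// the hcsshim guest code and may differ.
const guestStateRoot = "/run/gcs/c"

// VirtualPodAwareSandboxRootDir is a hypothetical reconstruction: a container
// that belongs to a virtual pod nests its sandbox state under the virtual
// sandbox ID, otherwise the classic per-container layout is used.
func VirtualPodAwareSandboxRootDir(id, virtualSandboxID string) string {
	if virtualSandboxID != "" && virtualSandboxID != id {
		return filepath.Join(guestStateRoot, virtualSandboxID, id)
	}
	return filepath.Join(guestStateRoot, id)
}

// VirtualPodAwareSandboxMountsDir applies the same rule to the user-mounts
// directory that Delete unmounts.
func VirtualPodAwareSandboxMountsDir(id, virtualSandboxID string) string {
	return filepath.Join(VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "sandboxMounts")
}
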
80 changes: 52 additions & 28 deletions internal/guest/runtime/hcsv2/sandbox_container.go
@@ -15,20 +15,21 @@ import (

"github.com/Microsoft/hcsshim/internal/guest/network"
specGuest "github.com/Microsoft/hcsshim/internal/guest/spec"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/internal/oc"
"github.com/Microsoft/hcsshim/pkg/annotations"
)

func getSandboxHostnamePath(id string) string {
return filepath.Join(specGuest.SandboxRootDir(id), "hostname")
func getSandboxHostnamePath(id, virtualSandboxID string) string {
return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hostname")
}

func getSandboxHostsPath(id string) string {
return filepath.Join(specGuest.SandboxRootDir(id), "hosts")
func getSandboxHostsPath(id, virtualSandboxID string) string {
return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hosts")
}

func getSandboxResolvPath(id string) string {
return filepath.Join(specGuest.SandboxRootDir(id), "resolv.conf")
func getSandboxResolvPath(id, virtualSandboxID string) string {
return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "resolv.conf")
}

func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) {
Expand All @@ -37,8 +38,11 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (
defer func() { oc.SetSpanStatus(span, err) }()
span.AddAttributes(trace.StringAttribute("cid", id))

// Generate the sandbox root dir
rootDir := specGuest.SandboxRootDir(id)
// Check if this is a virtual pod to use appropriate root directory
virtualSandboxID := spec.Annotations[annotations.VirtualPodID]

// Generate the sandbox root dir - virtual pod aware
rootDir := specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID)
if err := os.MkdirAll(rootDir, 0755); err != nil {
return errors.Wrapf(err, "failed to create sandbox root directory %q", rootDir)
}
Expand All @@ -58,39 +62,53 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (
}
}

sandboxHostnamePath := getSandboxHostnamePath(id)
sandboxHostnamePath := getSandboxHostnamePath(id, virtualSandboxID)
if err := os.WriteFile(sandboxHostnamePath, []byte(hostname+"\n"), 0644); err != nil {
return errors.Wrapf(err, "failed to write hostname to %q", sandboxHostnamePath)
}

// Write the hosts
sandboxHostsContent := network.GenerateEtcHostsContent(ctx, hostname)
sandboxHostsPath := getSandboxHostsPath(id)
sandboxHostsPath := getSandboxHostsPath(id, virtualSandboxID)
if err := os.WriteFile(sandboxHostsPath, []byte(sandboxHostsContent), 0644); err != nil {
return errors.Wrapf(err, "failed to write sandbox hosts to %q", sandboxHostsPath)
}

// Check if this is a virtual pod sandbox container by comparing container ID with virtual pod ID
isVirtualPodSandbox := virtualSandboxID != "" && id == virtualSandboxID
if strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") || isVirtualPodSandbox {
ns := GetOrAddNetworkNamespace(specGuest.GetNetworkNamespaceID(spec))
err := ns.Sync(ctx)
if err != nil {
return err
}
}
// Write resolv.conf
ns, err := getNetworkNamespace(specGuest.GetNetworkNamespaceID(spec))
if err != nil {
return err
}
var searches, servers []string
for _, n := range ns.Adapters() {
if len(n.DNSSuffix) > 0 {
searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ","))
if !strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") {
return err
}
if len(n.DNSServerList) > 0 {
servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ","))
// Networking is skipped, do not error out
log.G(ctx).Infof("setupSandboxContainerSpec: Did not find NS spec %v, err %v", spec, err)
} else {
var searches, servers []string
for _, n := range ns.Adapters() {
if len(n.DNSSuffix) > 0 {
searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ","))
}
if len(n.DNSServerList) > 0 {
servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ","))
}
}
resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil)
if err != nil {
return errors.Wrap(err, "failed to generate sandbox resolv.conf content")
}
sandboxResolvPath := getSandboxResolvPath(id, virtualSandboxID)
if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil {
return errors.Wrap(err, "failed to write sandbox resolv.conf")
}
}
resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil)
if err != nil {
return errors.Wrap(err, "failed to generate sandbox resolv.conf content")
}
sandboxResolvPath := getSandboxResolvPath(id)
if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil {
return errors.Wrap(err, "failed to write sandbox resolv.conf")
}

// User.Username is generally only used on Windows, but as there's no (easy/fast at least) way to grab
Expand All @@ -113,8 +131,14 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (
// also has a concept of a sandbox/shm file when the IPC NamespaceMode !=
// NODE.

// Force the parent cgroup into our /containers root
spec.Linux.CgroupsPath = "/containers/" + id
// Set cgroup path - check if this is a virtual pod
if virtualSandboxID != "" {
// Virtual pod sandbox gets its own cgroup under /containers/virtual-pods using the virtual pod ID
spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID
} else {
// Traditional sandbox goes under /containers
spec.Linux.CgroupsPath = "/containers/" + id
}

// Clear the windows section as we dont want to forward to runc
spec.Windows = nil
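The cgroup placement rule at the end of setupSandboxContainerSpec reduces to a single branch on the VirtualPodID annotation. A minimal standalone sketch; the helper name sandboxCgroupPath is illustrative only:

package main

import "fmt"

// sandboxCgroupPath mirrors the decision above: a sandbox carrying a virtual
// pod ID is parented under /containers/virtual-pods using that ID, while a
// traditional sandbox keeps /containers/<id>.
func sandboxCgroupPath(id, virtualSandboxID string) string {
	if virtualSandboxID != "" {
		return "/containers/virtual-pods/" + virtualSandboxID
	}
	return "/containers/" + id
}

func main() {
	fmt.Println(sandboxCgroupPath("pod-123", ""))        // /containers/pod-123
	fmt.Println(sandboxCgroupPath("pod-123", "vpod-42")) // /containers/virtual-pods/vpod-42
}
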