diff --git a/init/init.c b/init/init.c index a468531c6b..864bbf5a53 100644 --- a/init/init.c +++ b/init/init.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -138,6 +139,52 @@ const struct InitOp ops[] = { {OpMount, .mount = {"cgroup_root", "/sys/fs/cgroup", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755"}}, }; +/* +rootfs VHDs are mounted as read-only, which can cause issues for binaries running in the +uVM (e.g., syslogd, (GPU) drivers) that expect to be able to write to /etc/ +(e.g., syslogd is configured by /etc/syslog.conf) or /var/ (e.g., syslogd (typically) writes to /var/log/messages). + +Make /var and /etc writable by creating an overlay with tmpfs-backed upper (and work) directories. + +Use /run for overlay directories since that shouldn't be as volatile as /tmp. +/run is already tmpfs backed, but create a new (smaller) tmpfs mount to prevent contention +with container-specific files under /run/gcs/c/ (e.g., the container config file and overlay work directory). + +Note: tmpfs is backed by virtual memory and can be swapped out, but the uVM is, itself, virtual memory +backed on the host. +Hence limiting the total size of tmpfs mounts will prevent the virtual machine's worker +thread on the host from growing egregiously. 
+ +See: +- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s07.html +- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch05.html +- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch05s10.html +- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s15.html +*/ +#define OVERLAY_PATH "/run/over" +#define VAR_OVERLAY_PATH OVERLAY_PATH "/var" +#define ETC_OVERLAY_PATH OVERLAY_PATH "/etc" + +const struct InitOp overlay_ops[] = { + // /run should already exist + {OpMkdir, .mkdir = {OVERLAY_PATH, 0755}}, + {OpMount, .mount = {"tmpfs", OVERLAY_PATH, "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "size=40\%,mode=0755"}}, + + // /etc + {OpMkdir, .mkdir = {ETC_OVERLAY_PATH, 0755}}, + {OpMkdir, .mkdir = {(ETC_OVERLAY_PATH "/upper"), 0755}}, + {OpMkdir, .mkdir = {(ETC_OVERLAY_PATH "/work"), 0755}}, + {OpMount, .mount = {"overlay", "/etc", "overlay", MS_NODEV | MS_NOSUID | MS_NOEXEC, + "lowerdir=/etc,upperdir=" ETC_OVERLAY_PATH "/upper,workdir=" ETC_OVERLAY_PATH "/work"}}, + + // /var + {OpMkdir, .mkdir = {VAR_OVERLAY_PATH, 0755}}, + {OpMkdir, .mkdir = {VAR_OVERLAY_PATH "/upper", 0755}}, + {OpMkdir, .mkdir = {VAR_OVERLAY_PATH "/work", 0755}}, + {OpMount, .mount = {"overlay", "/var", "overlay", MS_NODEV | MS_NOSUID, // allow execs from the /var + "lowerdir=/var,upperdir=" VAR_OVERLAY_PATH "/upper,workdir=" VAR_OVERLAY_PATH "/work"}}, +}; + void warn(const char* msg) { int error = errno; perror(msg); @@ -592,6 +639,8 @@ int debug_main(int argc, char** argv) { close(sockets[i]); } } + + return 0; } #endif @@ -637,24 +686,27 @@ void start_services() { int main(int argc, char** argv) { #ifdef DEBUG - debug_main(argc, argv); + if (debug_main(argc, argv) != 0) { + dmesgWarn("failed to connect debug sockets"); + } printf("Running init\n"); #endif char* debug_shell = NULL; int entropy_port = 0; + bool overlay_mount = false; if (argc <= 1) { argv = (char**)default_argv; argc = sizeof(default_argv) / sizeof(default_argv[0]); optind = 0; debug_shell = (char*)default_shell; } else { - 
for (int opt; (opt = getopt(argc, argv, "+d:e:")) >= 0;) { + for (int opt; (opt = getopt(argc, argv, "+d:e:w")) >= 0;) { switch (opt) { - case 'd': + case 'd': // [d]ebug debug_shell = optarg; break; - case 'e': + case 'e': // [e]ntropy port entropy_port = atoi(optarg); #ifdef DEBUG printf("entropy port %d\n", entropy_port); @@ -666,6 +718,10 @@ int main(int argc, char** argv) { break; + case 'w': // [w]ritable overlay mounts + overlay_mount = true; + break; + default: exit(1); } @@ -702,6 +758,13 @@ int main(int argc, char** argv) { #endif init_fs(ops, sizeof(ops) / sizeof(ops[0])); + if (overlay_mount) { +#ifdef DEBUG + printf("init_fs for overlay mounts\n"); +#endif + init_fs(overlay_ops, sizeof(overlay_ops) / sizeof(overlay_ops[0])); + } + #ifdef DEBUG printf("init_cgroups\n"); #endif diff --git a/internal/annotations/annotations.go b/internal/annotations/annotations.go index e97b77a225..85165acd65 100644 --- a/internal/annotations/annotations.go +++ b/internal/annotations/annotations.go @@ -39,6 +39,11 @@ const ( // ExtraVSockPorts adds additional ports to the list of ports that the UVM is allowed to use. ExtraVSockPorts = "io.microsoft.virtualmachine.lcow.extra-vsock-ports" + // WritableOverlayDirs creates writable overlay mounts for the /var and /etc directories. + // + // This will nop if the LCOW uVM rootfs is already writable (e.g., initramfs-backed initrd). + WritableOverlayDirs = "io.microsoft.virtualmachine.lcow.writable-overlay-directories" + // NetworkingPolicyBasedRouting toggles on the ability to set policy based routing in the // guest for LCOW. 
// diff --git a/internal/oci/uvm.go b/internal/oci/uvm.go index 204456da46..7d1da10bf3 100644 --- a/internal/oci/uvm.go +++ b/internal/oci/uvm.go @@ -318,6 +318,7 @@ func specToUVMCreateOptionsCommon(ctx context.Context, opts *uvm.Options, s *spe opts.ProcessDumpLocation = ParseAnnotationsString(s.Annotations, annotations.ContainerProcessDumpLocation, opts.ProcessDumpLocation) opts.NoWritableFileShares = ParseAnnotationsBool(ctx, s.Annotations, annotations.DisableWritableFileShares, opts.NoWritableFileShares) opts.DumpDirectoryPath = ParseAnnotationsString(s.Annotations, annotations.DumpDirectoryPath, opts.DumpDirectoryPath) + opts.ConsolePipe = ParseAnnotationsString(s.Annotations, iannotations.UVMConsolePipe, opts.ConsolePipe) // NUMA settings opts.MaxProcessorsPerNumaNode = ParseAnnotationsUint32(ctx, s.Annotations, annotations.NumaMaximumProcessorsPerNode, opts.MaxProcessorsPerNumaNode) @@ -330,7 +331,6 @@ func specToUVMCreateOptionsCommon(ctx context.Context, opts *uvm.Options, s *spe opts.NumaProcessorCounts) opts.NumaMemoryBlocksCounts = ParseAnnotationCommaSeparatedUint64(ctx, s.Annotations, annotations.NumaCountOfMemoryBlocks, opts.NumaMemoryBlocksCounts) - opts.ConsolePipe = ParseAnnotationsString(s.Annotations, iannotations.UVMConsolePipe, opts.ConsolePipe) maps.Copy(opts.AdditionalHyperVConfig, parseHVSocketServiceTable(ctx, s.Annotations)) @@ -377,6 +377,7 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) ( lopts.UVMReferenceInfoFile = ParseAnnotationsString(s.Annotations, annotations.LCOWReferenceInfoFile, lopts.UVMReferenceInfoFile) lopts.KernelBootOptions = ParseAnnotationsString(s.Annotations, annotations.KernelBootOptions, lopts.KernelBootOptions) lopts.DisableTimeSyncService = ParseAnnotationsBool(ctx, s.Annotations, annotations.DisableLCOWTimeSyncService, lopts.DisableTimeSyncService) + lopts.WritableOverlayDirs = ParseAnnotationsBool(ctx, s.Annotations, iannotations.WritableOverlayDirs, lopts.WritableOverlayDirs) 
handleAnnotationPreferredRootFSType(ctx, s.Annotations, lopts) handleAnnotationKernelDirectBoot(ctx, s.Annotations, lopts) handleAnnotationFullyPhysicallyBacked(ctx, s.Annotations, lopts) diff --git a/internal/uvm/create_lcow.go b/internal/uvm/create_lcow.go index d6d9b4f2d3..d0c0a93574 100644 --- a/internal/uvm/create_lcow.go +++ b/internal/uvm/create_lcow.go @@ -132,6 +132,7 @@ type OptionsLCOW struct { ExtraVSockPorts []uint32 // Extra vsock ports to allow AssignedDevices []VPCIDeviceID // AssignedDevices are devices to add on pod boot PolicyBasedRouting bool // Whether we should use policy based routing when configuring net interfaces in guest + WritableOverlayDirs bool // Whether init should create writable overlay mounts for /var and /etc } // defaultLCOWOSBootFilesPath returns the default path used to locate the LCOW @@ -579,7 +580,9 @@ Example JSON document produced once the hcsschema.ComputeSytem returned by makeL // Make the ComputeSystem document object that will be serialized to json to be presented to the HCS api. 
func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcsschema.ComputeSystem, err error) { - logrus.Tracef("makeLCOWDoc %v\n", opts) + if logrus.IsLevelEnabled(logrus.TraceLevel) { + log.G(ctx).WithField("options", log.Format(ctx, opts)).Trace("makeLCOWDoc") + } kernelFullPath := filepath.Join(opts.BootFilesPath, opts.KernelFile) if _, err := os.Stat(kernelFullPath); os.IsNotExist(err) { @@ -868,10 +871,20 @@ func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcs execCmdArgs += " -core-dump-location " + opts.ProcessDumpLocation } - initArgs := fmt.Sprintf("%s %s", entropyArgs, execCmdArgs) + initArgs := entropyArgs + if opts.WritableOverlayDirs { + switch opts.PreferredRootFSType { + case PreferredRootFSTypeInitRd: + log.G(ctx).Warn("ignoring `WritableOverlayDirs` option since rootfs is already writable") + case PreferredRootFSTypeVHD: + initArgs += " -w" + } + } if vmDebugging { // Launch a shell on the console. - initArgs = entropyArgs + ` sh -c "` + execCmdArgs + ` & exec sh"` + initArgs += ` sh -c "` + execCmdArgs + ` & exec sh"` + } else { + initArgs += " " + execCmdArgs } kernelArgs += fmt.Sprintf(" nr_cpus=%d", opts.ProcessorCount) @@ -915,7 +928,9 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error } span.AddAttributes(trace.StringAttribute(logfields.UVMID, opts.ID)) - log.G(ctx).WithField("options", log.Format(ctx, opts)).Debug("uvm::CreateLCOW options") + if logrus.IsLevelEnabled(logrus.DebugLevel) { + log.G(ctx).WithField("options", log.Format(ctx, opts)).Debug("uvm::CreateLCOW options") + } // We don't serialize OutputHandlerCreator so if it is missing we need to put it back to the default. 
if opts.OutputHandlerCreator == nil { @@ -960,10 +975,20 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error var doc *hcsschema.ComputeSystem if opts.SecurityPolicyEnabled { doc, err = makeLCOWSecurityDoc(ctx, opts, uvm) - log.G(ctx).Tracef("create_lcow::CreateLCOW makeLCOWSecurityDoc result doc: %v err %v", doc, err) + if logrus.IsLevelEnabled(logrus.TraceLevel) { + log.G(ctx).WithFields(logrus.Fields{ + "doc": log.Format(ctx, doc), + logrus.ErrorKey: err, + }).Trace("create_lcow::CreateLCOW makeLCOWSecurityDoc result") + } } else { doc, err = makeLCOWDoc(ctx, opts, uvm) - log.G(ctx).Tracef("create_lcow::CreateLCOW makeLCOWDoc result doc: %v err %v", doc, err) + if logrus.IsLevelEnabled(logrus.TraceLevel) { + log.G(ctx).WithFields(logrus.Fields{ + "doc": log.Format(ctx, doc), + logrus.ErrorKey: err, + }).Trace("create_lcow::CreateLCOW makeLCOWDoc result") + } } if err != nil { return nil, err @@ -972,7 +997,9 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error if err = uvm.create(ctx, doc); err != nil { return nil, fmt.Errorf("error while creating the compute system: %w", err) } - log.G(ctx).WithField("uvm", uvm).Trace("create_lcow::CreateLCOW uvm.create result") + if logrus.IsLevelEnabled(logrus.TraceLevel) { + log.G(ctx).WithField("uvm", log.Format(ctx, uvm)).Trace("create_lcow::CreateLCOW uvm.create result") + } // Create a socket to inject entropy during boot. 
uvm.entropyListener, err = uvm.listenVsock(entropyVsockPort) diff --git a/test/functional/lcow_uvm_test.go b/test/functional/lcow_uvm_test.go index 6aa5416229..a703a4b969 100644 --- a/test/functional/lcow_uvm_test.go +++ b/test/functional/lcow_uvm_test.go @@ -6,6 +6,9 @@ package functional import ( "context" "fmt" + "path" + "regexp" + "strings" "testing" "github.com/opencontainers/runtime-spec/specs-go" @@ -174,7 +177,7 @@ func TestLCOW_UVM_KernelArgs(t *testing.T) { ioArgs.TestStdOutContains(t, tc.wantArgs, tc.notWantArgs) - // some boot options (notably using initrd) need to validated by looking at dmesg logs + // some boot options (notably using initrd) need to be validated by looking at dmesg logs // dmesg will output the kernel command line as // // [ 0.000000] Command line: <...> @@ -191,7 +194,7 @@ func TestLCOW_UVM_KernelArgs(t *testing.T) { } } -// TestLCOWUVM_Boot starts and terminates a utility VM multiple times using different boot options. +// TestLCOW_UVM_Boot starts and terminates a utility VM multiple times using different boot options. 
func TestLCOW_UVM_Boot(t *testing.T) { require.Build(t, osversion.RS5) requireFeatures(t, featureLCOW, featureUVM) @@ -224,7 +227,7 @@ func TestLCOW_UVM_Boot(t *testing.T) { opts.RootFSFile = uvm.InitrdFile opts.PreferredRootFSType = uvm.PreferredRootFSTypeInitRd - opts.VPMemDeviceCount = 32 + opts.VPMemDeviceCount = uvm.DefaultVPMEMCount }, }, { @@ -236,7 +239,7 @@ func TestLCOW_UVM_Boot(t *testing.T) { opts.RootFSFile = uvm.VhdFile opts.PreferredRootFSType = uvm.PreferredRootFSTypeVHD - opts.VPMemDeviceCount = 32 + opts.VPMemDeviceCount = uvm.DefaultVPMEMCount }, }, { @@ -248,7 +251,7 @@ func TestLCOW_UVM_Boot(t *testing.T) { opts.PreferredRootFSType = uvm.PreferredRootFSTypeVHD opts.RootFSFile = uvm.VhdFile - opts.VPMemDeviceCount = 32 + opts.VPMemDeviceCount = uvm.DefaultVPMEMCount }, }, } { @@ -269,3 +272,183 @@ func TestLCOW_UVM_Boot(t *testing.T) { }) } } + +func TestLCOW_UVM_WritableOverlay(t *testing.T) { + require.Build(t, osversion.RS5) + requireFeatures(t, featureLCOW, featureUVM) + + ctx := util.Context(context.Background(), t) + + // validate the init flags are as expected + // there's some weirdness with getting the exact init command line + // the kernel's command line will have init args after the `--` (via `/proc/cmdline`) + // + // init's command line is under `/proc/1/cmdline`, but with `\0` as separator + // between args (which makes reading from the command line awkward). + // (could use `ps -o args | sed -n '2{p;q}'`, which has the appropriate parsing) + // + // we already rely on `/proc/cmdline` above, so stick with that. 
+ // (potentially) match against uVM debugging scenarios, which execs a shell before vsockexec + re := regexp.MustCompile(`-- (.*) (?:sh -c ")?/bin/vsockexec`) + + for _, tc := range []struct { + name string + optsFn func(*uvm.OptionsLCOW) + }{ + { + name: "no kernel direct initrd", + optsFn: func(opts *uvm.OptionsLCOW) { + opts.KernelDirect = false + opts.KernelFile = uvm.KernelFile + + opts.RootFSFile = uvm.InitrdFile + opts.PreferredRootFSType = uvm.PreferredRootFSTypeInitRd + }, + }, + { + name: "kernel direct initrd", + optsFn: func(opts *uvm.OptionsLCOW) { + opts.KernelDirect = true + opts.KernelFile = uvm.UncompressedKernelFile + + opts.RootFSFile = uvm.InitrdFile + opts.PreferredRootFSType = uvm.PreferredRootFSTypeInitRd + }, + }, + { + name: "no kernel direct VHD", + optsFn: func(opts *uvm.OptionsLCOW) { + opts.KernelDirect = false + opts.KernelFile = uvm.KernelFile + + opts.RootFSFile = uvm.VhdFile + opts.PreferredRootFSType = uvm.PreferredRootFSTypeVHD + }, + }, + { + name: "kernel direct VHD", + optsFn: func(opts *uvm.OptionsLCOW) { + opts.KernelDirect = true + opts.KernelFile = uvm.UncompressedKernelFile + + opts.PreferredRootFSType = uvm.PreferredRootFSTypeVHD + opts.RootFSFile = uvm.VhdFile + }, + }, + } { + for _, writable := range []bool{false, true} { + n := tc.name + if writable { + n += " writable" + } + t.Run(n, func(t *testing.T) { + // create new options every time, in case they are modified during uVM creation + opts := defaultLCOWOptions(ctx, t) + tc.optsFn(opts) + opts.WritableOverlayDirs = writable + + if opts.KernelDirect { + require.Build(t, 18286) + } + + // mounts are only added for VHD rootfs + overlay := writable && (opts.PreferredRootFSType == uvm.PreferredRootFSTypeVHD) + + vm := testuvm.CreateAndStartLCOWFromOpts(ctx, t, opts) + + // subtests just to namespace variables + + // check for correct init args + t.Run("init args", func(t *testing.T) { + io := testcmd.NewBufferedIO() + c := testcmd.Create(ctx, t, vm, 
&specs.Process{Args: []string{"cat", "/proc/cmdline"}}, io) + testcmd.Start(ctx, t, c) + testcmd.WaitExitCode(ctx, t, c, 0) + + out, err := io.Output() + out = strings.TrimSpace(out) + if err != nil { + t.Fatalf("got stderr: %v", err) + } + t.Logf("stdout:\n%s\n", out) + + ms := re.FindStringSubmatch(out) + if len(ms) != 2 { + t.Fatalf("failed to match %v: %v", re, ms) + } + + args := ms[1] + if found := strings.Contains(args, " -w"); overlay && !found { + t.Fatalf("expected '-w' flag in: %s", args) + } else if !overlay && found { + t.Fatalf("unexpected '-w' flag in: %s", args) + } + }) + + // validate /var and /etc are writable + for _, dir := range []string{"var", "etc"} { + t.Run("writable "+dir, func(t *testing.T) { + const hello = "hello world" + f := path.Join("/", dir, "t.txt") + + ec := 0 + outWant := hello + var errWant error + if !writable && (opts.PreferredRootFSType == uvm.PreferredRootFSTypeVHD) { + ec = 1 + outWant = "" + errWant = fmt.Errorf("sh: %s: Read-only file system", f) + } + + io := testcmd.NewBufferedIO() + c := testcmd.Create(ctx, t, vm, &specs.Process{Args: []string{ + "sh", "-c", + fmt.Sprintf("echo %s>%s&&cat %s", hello, f, f), + }}, io) + testcmd.Start(ctx, t, c) + testcmd.WaitExitCode(ctx, t, c, ec) + + io.TestOutput(t, outWant, errWant) + }) + } + + // parse mounts + if overlay { + for _, tcc := range []struct { + mType string + dir string + want []string + }{ + { + mType: "tmpfs", + dir: "/run/over", + want: []string{"rw", "nosuid", "nodev", "noexec", "mode=755"}, + }, + { + mType: "overlay", + dir: "/etc", + want: []string{"rw", "nosuid", "nodev", "noexec", "mode=755"}, + }, + { + mType: "overlay", + dir: "/var", + want: []string{"rw", "nosuid", "nodev", "mode=755"}, + }, + } { + io := testcmd.NewBufferedIO() + c := testcmd.Create(ctx, t, vm, &specs.Process{Args: []string{ + "sh", "-c", + fmt.Sprintf("mount -t %s | grep %s", tcc.mType, tcc.dir), + }}, io) + testcmd.Start(ctx, t, c) + testcmd.WaitExitCode(ctx, t, c, 0) + + 
io.TestStdOutContains(t, tcc.want, nil) + } + } + + testuvm.Close(ctx, t, vm) + }) + } + } +} diff --git a/test/internal/cmd/io.go b/test/internal/cmd/io.go index 8a5d05d5da..cb4b815667 100644 --- a/test/internal/cmd/io.go +++ b/test/internal/cmd/io.go @@ -42,8 +42,17 @@ func (b *BufferedIO) TestOutput(tb testing.TB, out string, err error) { tb.Helper() outGot, errGot := b.Output() - if !errors.Is(errGot, err) { + + // don't use [errors.Is] since errGot will always be created via [errors.New] and + // therefore never match non-nil errors. + if (err == nil && errGot != nil) || (err != nil && errGot == nil) { tb.Fatalf("got stderr: %v; wanted: %v", errGot, err) + } else if err != nil && errGot != nil { + errStr := strings.ToLower(strings.TrimSpace(err.Error())) + errGotStr := strings.ToLower(strings.TrimSpace(errGot.Error())) + if diff := cmp.Diff(errStr, errGotStr); diff != "" { + tb.Fatalf("stderr mismatch (-want +got):\n%s", diff) + } } out = strings.ToLower(strings.TrimSpace(out))