Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 67 additions & 4 deletions init/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <net/if.h>
#include <netinet/ip.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -138,6 +139,52 @@ const struct InitOp ops[] = {
{OpMount, .mount = {"cgroup_root", "/sys/fs/cgroup", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755"}},
};

/*
rootfs VHDs are mounted as read-only, which can cause issues for binaries running in the
uVM (e.g., syslogd, (GPU) drivers) that expect to be able to write to /etc/
(e.g., syslogd is configured by /etc/syslog.conf) or /var/ (e.g., syslogd (typically) writes to /var/log/messages).

Make /var and /etc writable by creating an overlay for each, with tmpfs-backed upper (and work) directories.

Use /run for overlay directories since that shouldn't be as volatile as /tmp.
/run is already tmpfs-backed, but create a new (smaller) tmpfs mount to prevent contention
with container-specific files under /run/gcs/c/ (e.g., the container config file and overlay work directory).

Note: tmpfs is backed by virtual memory and can be swapped out, but the uVM is, itself, virtual memory
backed on the host.
Hence limiting the total size of tmpfs mounts will prevent the virtual machine's worker
thread on the host from growing egregiously.

See:
- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s07.html
- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch05.html
- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch05s10.html
- https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s15.html
*/
// Root directory under which the overlay upper/work directories are created.
#define OVERLAY_PATH "/run/over"
// Per-directory overlay roots, built by string-literal concatenation onto OVERLAY_PATH.
#define VAR_OVERLAY_PATH OVERLAY_PATH "/var"
#define ETC_OVERLAY_PATH OVERLAY_PATH "/etc"

const struct InitOp overlay_ops[] = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would hard-coding this be restrictive?
Say tomorrow somebody wants to write logs to /tmp and wants it to be an overlay.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, but the flipside would be the complexity of C code needed to handle parsing and validating a list of user-provided directories from the command line

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, but we would future-proof this so that we don't need a code change every time a new directory needs to be writable.
Should we pass something like "--overlay-dirs=/etc,/var,/opt" to init process?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can always add that in the future, but i think the long option parsing and additional path validation would over-complicate this already complicated PR

// Operations that make /etc and /var writable on a read-only rootfs.
// Order matters: each directory is created before anything is mounted on or under it.

// Backing store for all overlays: a dedicated tmpfs mounted at OVERLAY_PATH.
// /run should already exist.
{OpMkdir, .mkdir = {OVERLAY_PATH, 0755}},
// '%' needs no escaping in a C string literal ("\%" is not a valid escape sequence).
{OpMount, .mount = {"tmpfs", OVERLAY_PATH, "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "size=40%,mode=0755"}},

// /etc: overlay the existing /etc (lower) with tmpfs-backed upper and work directories.
{OpMkdir, .mkdir = {ETC_OVERLAY_PATH, 0755}},
{OpMkdir, .mkdir = {ETC_OVERLAY_PATH "/upper", 0755}},
{OpMkdir, .mkdir = {ETC_OVERLAY_PATH "/work", 0755}},
{OpMount, .mount = {"overlay", "/etc", "overlay", MS_NODEV | MS_NOSUID | MS_NOEXEC,
"lowerdir=/etc,upperdir=" ETC_OVERLAY_PATH "/upper,workdir=" ETC_OVERLAY_PATH "/work"}},

// /var: same layout as /etc, but without MS_NOEXEC.
{OpMkdir, .mkdir = {VAR_OVERLAY_PATH, 0755}},
{OpMkdir, .mkdir = {VAR_OVERLAY_PATH "/upper", 0755}},
{OpMkdir, .mkdir = {VAR_OVERLAY_PATH "/work", 0755}},
{OpMount, .mount = {"overlay", "/var", "overlay", MS_NODEV | MS_NOSUID, // allow execs from the /var
"lowerdir=/var,upperdir=" VAR_OVERLAY_PATH "/upper,workdir=" VAR_OVERLAY_PATH "/work"}},
};

void warn(const char* msg) {
int error = errno;
perror(msg);
Expand Down Expand Up @@ -592,6 +639,8 @@ int debug_main(int argc, char** argv) {
close(sockets[i]);
}
}

return 0;
}
#endif

Expand Down Expand Up @@ -637,24 +686,27 @@ void start_services() {

int main(int argc, char** argv) {
#ifdef DEBUG
debug_main(argc, argv);
if (debug_main(argc, argv) != 0) {
dmesgWarn("failed to connect debug sockets");
}
printf("Running init\n");
#endif
char* debug_shell = NULL;
int entropy_port = 0;
bool overlay_mount = false;
if (argc <= 1) {
argv = (char**)default_argv;
argc = sizeof(default_argv) / sizeof(default_argv[0]);
optind = 0;
debug_shell = (char*)default_shell;
} else {
for (int opt; (opt = getopt(argc, argv, "+d:e:")) >= 0;) {
for (int opt; (opt = getopt(argc, argv, "+d:e:w")) >= 0;) {
switch (opt) {
case 'd':
case 'd': // [d]ebug
debug_shell = optarg;
break;

case 'e':
case 'e': // [e]ntropy port
entropy_port = atoi(optarg);
#ifdef DEBUG
printf("entropy port %d\n", entropy_port);
Expand All @@ -666,6 +718,10 @@ int main(int argc, char** argv) {

break;

case 'w': // [w]ritable overlay mounts
overlay_mount = true;
break;

default:
exit(1);
}
Expand Down Expand Up @@ -702,6 +758,13 @@ int main(int argc, char** argv) {
#endif
init_fs(ops, sizeof(ops) / sizeof(ops[0]));

if (overlay_mount) {
#ifdef DEBUG
printf("init_fs for overlay mounts\n");
#endif
init_fs(overlay_ops, sizeof(overlay_ops) / sizeof(overlay_ops[0]));
}

#ifdef DEBUG
printf("init_cgroups\n");
#endif
Expand Down
5 changes: 5 additions & 0 deletions internal/annotations/annotations.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ const (
// ExtraVSockPorts adds additional ports to the list of ports that the UVM is allowed to use.
ExtraVSockPorts = "io.microsoft.virtualmachine.lcow.extra-vsock-ports"

// WritableOverlayDirs creates writable overlay mounts for the /var and /etc directories.
//
// This is a no-op if the LCOW uVM rootfs is already writable (e.g., initramfs-backed initrd).
WritableOverlayDirs = "io.microsoft.virtualmachine.lcow.writable-overlay-directories"

// NetworkingPolicyBasedRouting toggles on the ability to set policy based routing in the
// guest for LCOW.
//
Expand Down
3 changes: 2 additions & 1 deletion internal/oci/uvm.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ func specToUVMCreateOptionsCommon(ctx context.Context, opts *uvm.Options, s *spe
opts.ProcessDumpLocation = ParseAnnotationsString(s.Annotations, annotations.ContainerProcessDumpLocation, opts.ProcessDumpLocation)
opts.NoWritableFileShares = ParseAnnotationsBool(ctx, s.Annotations, annotations.DisableWritableFileShares, opts.NoWritableFileShares)
opts.DumpDirectoryPath = ParseAnnotationsString(s.Annotations, annotations.DumpDirectoryPath, opts.DumpDirectoryPath)
opts.ConsolePipe = ParseAnnotationsString(s.Annotations, iannotations.UVMConsolePipe, opts.ConsolePipe)

// NUMA settings
opts.MaxProcessorsPerNumaNode = ParseAnnotationsUint32(ctx, s.Annotations, annotations.NumaMaximumProcessorsPerNode, opts.MaxProcessorsPerNumaNode)
Expand All @@ -330,7 +331,6 @@ func specToUVMCreateOptionsCommon(ctx context.Context, opts *uvm.Options, s *spe
opts.NumaProcessorCounts)
opts.NumaMemoryBlocksCounts = ParseAnnotationCommaSeparatedUint64(ctx, s.Annotations, annotations.NumaCountOfMemoryBlocks,
opts.NumaMemoryBlocksCounts)
opts.ConsolePipe = ParseAnnotationsString(s.Annotations, iannotations.UVMConsolePipe, opts.ConsolePipe)

maps.Copy(opts.AdditionalHyperVConfig, parseHVSocketServiceTable(ctx, s.Annotations))

Expand Down Expand Up @@ -377,6 +377,7 @@ func SpecToUVMCreateOpts(ctx context.Context, s *specs.Spec, id, owner string) (
lopts.UVMReferenceInfoFile = ParseAnnotationsString(s.Annotations, annotations.LCOWReferenceInfoFile, lopts.UVMReferenceInfoFile)
lopts.KernelBootOptions = ParseAnnotationsString(s.Annotations, annotations.KernelBootOptions, lopts.KernelBootOptions)
lopts.DisableTimeSyncService = ParseAnnotationsBool(ctx, s.Annotations, annotations.DisableLCOWTimeSyncService, lopts.DisableTimeSyncService)
lopts.WritableOverlayDirs = ParseAnnotationsBool(ctx, s.Annotations, iannotations.WritableOverlayDirs, lopts.WritableOverlayDirs)
handleAnnotationPreferredRootFSType(ctx, s.Annotations, lopts)
handleAnnotationKernelDirectBoot(ctx, s.Annotations, lopts)
handleAnnotationFullyPhysicallyBacked(ctx, s.Annotations, lopts)
Expand Down
41 changes: 34 additions & 7 deletions internal/uvm/create_lcow.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ type OptionsLCOW struct {
ExtraVSockPorts []uint32 // Extra vsock ports to allow
AssignedDevices []VPCIDeviceID // AssignedDevices are devices to add on pod boot
PolicyBasedRouting bool // Whether we should use policy based routing when configuring net interfaces in guest
WritableOverlayDirs bool // Whether init should create writable overlay mounts for /var and /etc
}

// defaultLCOWOSBootFilesPath returns the default path used to locate the LCOW
Expand Down Expand Up @@ -579,7 +580,9 @@ Example JSON document produced once the hcsschema.ComputeSytem returned by makeL

// Make the ComputeSystem document object that will be serialized to json to be presented to the HCS api.
func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcsschema.ComputeSystem, err error) {
logrus.Tracef("makeLCOWDoc %v\n", opts)
if logrus.IsLevelEnabled(logrus.TraceLevel) {
log.G(ctx).WithField("options", log.Format(ctx, opts)).Trace("makeLCOWDoc")
}

kernelFullPath := filepath.Join(opts.BootFilesPath, opts.KernelFile)
if _, err := os.Stat(kernelFullPath); os.IsNotExist(err) {
Expand Down Expand Up @@ -868,10 +871,20 @@ func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcs
execCmdArgs += " -core-dump-location " + opts.ProcessDumpLocation
}

initArgs := fmt.Sprintf("%s %s", entropyArgs, execCmdArgs)
initArgs := entropyArgs
if opts.WritableOverlayDirs {
switch opts.PreferredRootFSType {
case PreferredRootFSTypeInitRd:
log.G(ctx).Warn("ignoring `WritableOverlayDirs` option since rootfs is already writable")
case PreferredRootFSTypeVHD:
initArgs += " -w"
}
}
if vmDebugging {
// Launch a shell on the console.
initArgs = entropyArgs + ` sh -c "` + execCmdArgs + ` & exec sh"`
initArgs += ` sh -c "` + execCmdArgs + ` & exec sh"`
} else {
initArgs += " " + execCmdArgs
}

kernelArgs += fmt.Sprintf(" nr_cpus=%d", opts.ProcessorCount)
Expand Down Expand Up @@ -915,7 +928,9 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error
}

span.AddAttributes(trace.StringAttribute(logfields.UVMID, opts.ID))
log.G(ctx).WithField("options", log.Format(ctx, opts)).Debug("uvm::CreateLCOW options")
if logrus.IsLevelEnabled(logrus.DebugLevel) {
log.G(ctx).WithField("options", log.Format(ctx, opts)).Debug("uvm::CreateLCOW options")
}

// We don't serialize OutputHandlerCreator so if it is missing we need to put it back to the default.
if opts.OutputHandlerCreator == nil {
Expand Down Expand Up @@ -960,10 +975,20 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error
var doc *hcsschema.ComputeSystem
if opts.SecurityPolicyEnabled {
doc, err = makeLCOWSecurityDoc(ctx, opts, uvm)
log.G(ctx).Tracef("create_lcow::CreateLCOW makeLCOWSecurityDoc result doc: %v err %v", doc, err)
if logrus.IsLevelEnabled(logrus.TraceLevel) {
log.G(ctx).WithFields(logrus.Fields{
"doc": log.Format(ctx, doc),
logrus.ErrorKey: err,
}).Trace("create_lcow::CreateLCOW makeLCOWSecurityDoc result")
}
} else {
doc, err = makeLCOWDoc(ctx, opts, uvm)
log.G(ctx).Tracef("create_lcow::CreateLCOW makeLCOWDoc result doc: %v err %v", doc, err)
if logrus.IsLevelEnabled(logrus.TraceLevel) {
log.G(ctx).WithFields(logrus.Fields{
"doc": log.Format(ctx, doc),
logrus.ErrorKey: err,
}).Trace("create_lcow::CreateLCOW makeLCOWDoc result")
}
}
if err != nil {
return nil, err
Expand All @@ -972,7 +997,9 @@ func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error
if err = uvm.create(ctx, doc); err != nil {
return nil, fmt.Errorf("error while creating the compute system: %w", err)
}
log.G(ctx).WithField("uvm", uvm).Trace("create_lcow::CreateLCOW uvm.create result")
if logrus.IsLevelEnabled(logrus.TraceLevel) {
log.G(ctx).WithField("uvm", log.Format(ctx, uvm)).Trace("create_lcow::CreateLCOW uvm.create result")
}

// Create a socket to inject entropy during boot.
uvm.entropyListener, err = uvm.listenVsock(entropyVsockPort)
Expand Down
Loading