diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 242a1387e4..b22eb10f19 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -100,6 +100,56 @@ const ( UMOUNT_NOFOLLOW = 0x8 ) +// Constants for fsopen(2). +const ( + FSOPEN_CLOEXEC = 0x1 +) + +// Constants for fsconfig(2): FSCONFIG_SET_* commands set a parameter on the filesystem context, FSCONFIG_CMD_* commands act on the context itself. +const ( + FSCONFIG_SET_FLAG = 0x0 + FSCONFIG_SET_STRING = 0x1 + FSCONFIG_SET_BINARY = 0x2 + FSCONFIG_SET_PATH = 0x3 + FSCONFIG_SET_PATH_EMPTY = 0x4 + FSCONFIG_SET_FD = 0x5 + FSCONFIG_CMD_CREATE = 0x6 + FSCONFIG_CMD_RECONFIGURE = 0x7 + FSCONFIG_CMD_CREATE_EXCL = 0x8 +) + +// Constants for fsmount(2). +const ( + FSMOUNT_CLOEXEC = 0x1 +) + +// Constants for move_mount(2): MOVE_MOUNT_F_* flags apply to the source ("from") path, MOVE_MOUNT_T_* flags to the target ("to") path. +const ( + MOVE_MOUNT_F_SYMLINKS = 0x00000001 + MOVE_MOUNT_F_AUTOMOUNTS = 0x00000002 + MOVE_MOUNT_F_EMPTY_PATH = 0x00000004 + MOVE_MOUNT_T_SYMLINKS = 0x00000010 + MOVE_MOUNT_T_AUTOMOUNTS = 0x00000020 + MOVE_MOUNT_T_EMPTY_PATH = 0x00000040 + MOVE_MOUNT_SET_GROUP = 0x00000100 + MOVE_MOUNT_BENEATH = 0x00000200 +) + +// Constants for mount_setattr(2); fsmount(2) takes the same bits as its attribute flags. MOUNT_ATTR__ATIME is the mask of the atime-mode field, and MOUNT_ATTR_RELATIME, MOUNT_ATTR_NOATIME and MOUNT_ATTR_STRICTATIME are values of that field rather than independent bits. +const ( + MOUNT_ATTR_RDONLY = 0x00000001 + MOUNT_ATTR_NOSUID = 0x00000002 + MOUNT_ATTR_NODEV = 0x00000004 + MOUNT_ATTR_NOEXEC = 0x00000008 + MOUNT_ATTR__ATIME = 0x00000070 + MOUNT_ATTR_RELATIME = 0x00000000 + MOUNT_ATTR_NOATIME = 0x00000010 + MOUNT_ATTR_STRICTATIME = 0x00000020 + MOUNT_ATTR_NODIRATIME = 0x00000080 + MOUNT_ATTR_IDMAP = 0x00100000 + MOUNT_ATTR_NOSYMFOLLOW = 0x00200000 +) + // Constants for unlinkat(2).
const ( AT_REMOVEDIR = 0x200 diff --git a/pkg/sentry/fsimpl/fsconfigfd/BUILD b/pkg/sentry/fsimpl/fsconfigfd/BUILD new file mode 100644 index 0000000000..25fe2651b3 --- /dev/null +++ b/pkg/sentry/fsimpl/fsconfigfd/BUILD @@ -0,0 +1,19 @@ +load("//tools:defs.bzl", "go_library") + +package(default_applicable_licenses = ["//:license"]) + +licenses(["notice"]) + +go_library( + name = "fsconfigfd", + srcs = ["fsconfigfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/errors/linuxerr", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/sync", + ], +) diff --git a/pkg/sentry/fsimpl/fsconfigfd/fsconfigfd.go b/pkg/sentry/fsimpl/fsconfigfd/fsconfigfd.go new file mode 100644 index 0000000000..18b3c08c2b --- /dev/null +++ b/pkg/sentry/fsimpl/fsconfigfd/fsconfigfd.go @@ -0,0 +1,364 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fsconfigfd provides an implementation of a *filesystem creation context*, +// part of the new file-descriptor-based mount API. +// +// Applications can create and mount a filesystem separately from placing it on the +// real mount tree. An fs context is created using fsopen() and configured with fsconfig(), +// and a mount file descriptor (fsimpl/mountfd) is created with fsmount(). +// +// The implementation is currently a work-in-progress. 
Currently, the primary differences +// to the Linux implementation are: +// - Filesystem parameters are parsed (and errors handled) at FSCONFIG_CMD_CREATE time rather +// than when parameters are set +// - FSCONFIG_CMD_CREATE_EXCL and FSCONFIG_CMD_RECONFIGURE are not supported (see sys_mount_fd.go) +// - Only FLAG and STRING arguments are supported +package fsconfigfd + +import ( + "fmt" + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" +) + +// Fd represents a filesystem configuration context. +// +// +stateify savable +type Fd struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + // contextMu protects context. + contextMu sync.Mutex `state:"nosave"` + + // The filesystem configuration context. + // context is protected by contextMu. + context fsContext +} + +// FSValue represents the value assigned to a parameter passed to the filesystem. There are 5 +// implementations: FSValueFlag, FSValueString, FSValueBlob, FSValuePath, and FSValueFd. +type FSValue interface { + isFSValue() +} + +// FSValueFlag represents a flag passed to the filesystem, i.e. a parameter that carries no value. +// +// +stateify savable +type FSValueFlag struct{} + +// FSValueString represents a string value passed to the filesystem. +// +// +stateify savable +type FSValueString string + +// FSValueBlob represents a binary blob value passed to the filesystem. +// +// +stateify savable +type FSValueBlob []byte + +// FSValuePath represents a path value passed to the filesystem. +// +// +stateify savable +type FSValuePath string + +// FSValueFd represents a file descriptor value passed to the filesystem.
+// +// +stateify savable +type FSValueFd struct { + FileDescription *vfs.FileDescription +} + +func (FSValueFlag) isFSValue() {} +func (FSValueString) isFSValue() {} +func (FSValueBlob) isFSValue() {} +func (FSValuePath) isFSValue() {} +func (FSValueFd) isFSValue() {} + +// FSParameter (together with a key) represents a parameter passed to the filesystem. +// +// +stateify savable +type FSParameter struct { + // Value is the parameter's value. + Value FSValue + // DirFd is presumably the directory fd that path-valued parameters are resolved against; + // nothing in this package reads it yet — TODO confirm once such parameters are supported. + DirFd int +} + +// An fsconfigfd has different "states", and in each state, only some operations +// are valid. We use different types for each state to make logic bugs harder to +// write. +// +// Analogous to include/linux/fs_context.h:fs_context_phase. +// +// +stateify savable +type fsContext interface { + isFSContext() +} + +// createParamsContext represents a filesystem configuration context that is awaiting +// parameters to be set using fsconfig(2). +// +// +stateify savable +type createParamsContext struct { + // The filesystem type, e.g. "tmpfs" + fsName string + + // The mount source; nil until a "source" parameter has been set. + source *string + + // A mutable set of parameters to be passed to the filesystem, keyed by parameter name. + params map[string]FSParameter + + // The credentials of the process mounting the filesystem. + creds *auth.Credentials +} + +// parseMountOptions translates the accumulated parameters into vfs.MountOptions. +// The "ro" key sets MountOptions.ReadOnly; every other key is forwarded to the filesystem +// in GetFilesystemOptions.Data as a comma-separated list, using "key" for a flag and +// "key=value" for a string. Any other value type fails with EINVAL. +// +// Entries are emitted in map iteration order, which Go leaves unspecified. +func (c *createParamsContext) parseMountOptions() (*vfs.MountOptions, error) { + params := c.params + + var opts vfs.MountOptions + + // Handle mount-specific options + _, ro := params["ro"] + if ro { + opts.ReadOnly = true + } + + var data []string + for key, param := range params { + if key == "ro" { + continue + } + value := param.Value + switch v := value.(type) { + case FSValueFlag: + data = append(data, key) + case FSValueString: + data = append(data, fmt.Sprintf("%s=%s", key, string(v))) + default: + // TODO(b/513024543): when filesystems are refactored to parse options at + // fsconfig() time, we should also support non-flag/non-string mount options.
+ return nil, linuxerr.EINVAL + } + } + + opts.GetFilesystemOptions.Data = strings.Join(data, ",") + + return &opts, nil +} + +func (createParamsContext) isFSContext() {} + +// awaitingMountContext represents a filesystem configuration context that is waiting +// to be mounted with fsmount(2). +// +// +stateify savable +type awaitingMountContext struct { + // The filesystem created using FSCONFIG_CMD_CREATE. + filesystem *vfs.Filesystem + + // The root of the filesystem created using FSCONFIG_CMD_CREATE. + root *vfs.Dentry + + // The MountOptions which will be used to mount the filesystem by fsmount(2). + // Note that the filesystem-specific options have already been processed by + // FSCONFIG_CMD_CREATE, so only the mount-specific options are relevant here. + opts *vfs.MountOptions +} + +func (awaitingMountContext) isFSContext() {} + +// doneContext represents a filesystem configuration context after fsmount(2) has been called. +// TODO(b/513024543): this should be removed once reconfiguration support is added. +// +// doneContext is stored in Fd.context like every other state, and Fd is savable, so it +// needs to be savable as well. +// +// +stateify savable +type doneContext struct { +} + +func (doneContext) isFSContext() {} + +// failedContext represents a filesystem configuration context after an operation has failed +// and left an unrecoverable state. +// +// +stateify savable +type failedContext struct { +} + +func (failedContext) isFSContext() {} + +// New returns a new filesystem configuration context fd. +// The context's credentials are saved as they are used for the eventual mount.
+func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, fsname string, fileFlags uint32) (*vfs.FileDescription, error) { + creds := auth.CredentialsFromContext(ctx) + fd := &Fd{ + context: &createParamsContext{ + fsName: fsname, + creds: creds, + params: make(map[string]FSParameter), + }, + } + + vd := vfsObj.NewAnonVirtualDentry("[fscontext]") + defer vd.DecRef(ctx) + + err := fd.vfsfd.Init(fd, fileFlags, creds, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }) + if err != nil { + return nil, err + } + + return &fd.vfsfd, nil +} + +// clearFlags maps a parameter key to the flag it clears: setting such a key removes the +// mapped flag from the context instead of storing the key itself. +// clearFlags is immutable. +var clearFlags = map[string]string{ + "rw": "ro", +} + +// SetParam sets the parameter named key to param. +// +// It fails with EBUSY unless the context is still accepting parameters. The "source" key +// must carry a string value and may be set only once (EINVAL otherwise). A key listed in +// clearFlags is not stored; it removes the flag it maps to. Any other key is stored without +// validation and replaces an earlier value for the same key. +func (fd *Fd) SetParam(key string, param FSParameter) error { + fd.contextMu.Lock() + defer fd.contextMu.Unlock() + + fdContext, ok := fd.context.(*createParamsContext) + if !ok { + // Filesystem context is in the wrong state to add a param + return linuxerr.EBUSY + } + + if key == "source" { + // source= is handled separately + source, ok := param.Value.(FSValueString) + if !ok { + // source= must be a string + return linuxerr.EINVAL + } + if fdContext.source != nil { + // source= can only be set once + return linuxerr.EINVAL + } + src := string(source) + fdContext.source = &src + } else if clearFlag, ok := clearFlags[key]; ok { + // e.g. "rw" removes a previously set "ro"; the value passed along with it is ignored. + delete(fdContext.params, clearFlag) + } else { + // TODO(b/513024543): refactor filesystems to parse options at fsconfig() time + // rather than at mount time + fdContext.params[key] = param + } + + return nil +} + +// DoCmdCreate instantiates an instance of the requested filesystem, including permission checks. +// If filesystem instantiation fails, an error will be returned and the context +// may be placed in a failed state.
+func (fd *Fd) DoCmdCreate(ctx context.Context, vfsObj *vfs.VirtualFilesystem) error { + fd.contextMu.Lock() + defer fd.contextMu.Unlock() + + fdContext, ok := fd.context.(*createParamsContext) + if !ok { + // Filesystem context is in the wrong state to create the fs + return linuxerr.EBUSY + } + + if fdContext.source == nil { + // Source was not specified + // NOTE(review): Linux appears to allow some filesystems (e.g. tmpfs) to be created + // without a source; confirm that rejecting a missing source for every filesystem + // is intended. + return linuxerr.EINVAL + } + + // Check for CAP_SYS_ADMIN in the fd origin's user ns. + // Analogous to fs/super.c:mount_capable(). + // + // Note that unlike in Linux, all filesystems marked with registeredFilesystemType.AllowUserMount + // can be mounted by CAP_SYS_ADMIN in a non-initial user namespace. This matches the behavior + // of the traditional mount(2) API in gVisor. + creds := fdContext.creds + if !creds.HasSelfCapability(linux.CAP_SYS_ADMIN) { + return linuxerr.EPERM + } + + // Create the filesystem. The checks above and a parameter parsing error leave the + // context in the parameter-setting state, so FSCONFIG_CMD_CREATE may be attempted again; + // only a failure of NewFilesystem itself is terminal. + opts, err := fdContext.parseMountOptions() + if err != nil { + return err + } + fs, root, err := vfsObj.NewFilesystem(ctx, creds, *fdContext.source, fdContext.fsName, opts) + if err != nil { + // Transition into the failed state (i.e. no retries for this error) + fd.context = &failedContext{} + return err + } + + // Transition into the awaitingMountContext state. The context now holds fs and root + // until GetFilesystem hands them out or Release drops them. + fd.context = &awaitingMountContext{ + filesystem: fs, + root: root, + opts: opts, + } + + return nil +} + +// GetFilesystem returns the filesystem and transitions the fd into the appropriate state if the +// fd is in "awaiting-mount" mode. If the fd is not in "awaiting-mount" mode, returns an error.
+func (fd *Fd) GetFilesystem() (*vfs.Filesystem, *vfs.Dentry, *vfs.MountOptions, error) { + fd.contextMu.Lock() + defer fd.contextMu.Unlock() + + fdContext, ok := fd.context.(*awaitingMountContext) + if !ok { + // Filesystem context is in the wrong state + var err error + switch fd.context.(type) { + case *createParamsContext: + // Linux returns EINVAL in this case for some reason + err = linuxerr.EINVAL + default: + err = linuxerr.EBUSY + } + return nil, nil, nil, err + } + + fs := fdContext.filesystem + root := fdContext.root + opts := fdContext.opts + + // Transition into doneContext. Release only drops fs and root while the fd is still + // awaiting mount, so from here on the caller is responsible for them. + fd.context = &doneContext{} + + return fs, root, opts, nil +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *Fd) Release(ctx context.Context) { + // fd.context is read without contextMu here; presumably safe because Release only runs + // once nothing else can reach the fd — TODO confirm. + fdContext, ok := fd.context.(*awaitingMountContext) + if ok { + // Destroy the created filesystem if the fd is dropped without + // calling fsmount(2). + fdContext.root.DecRef(ctx) + fdContext.filesystem.DecRef(ctx) + } +} diff --git a/pkg/sentry/fsimpl/mountfd/BUILD b/pkg/sentry/fsimpl/mountfd/BUILD new file mode 100644 index 0000000000..34f061a3e5 --- /dev/null +++ b/pkg/sentry/fsimpl/mountfd/BUILD @@ -0,0 +1,16 @@ +load("//tools:defs.bzl", "go_library") + +package(default_applicable_licenses = ["//:license"]) + +licenses(["notice"]) + +go_library( + name = "mountfd", + srcs = ["mountfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + ], +) diff --git a/pkg/sentry/fsimpl/mountfd/mountfd.go b/pkg/sentry/fsimpl/mountfd/mountfd.go new file mode 100644 index 0000000000..b75d810b5f --- /dev/null +++ b/pkg/sentry/fsimpl/mountfd/mountfd.go @@ -0,0 +1,68 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mountfd provides a "mount object" file descriptor, as returned by fsmount(2) +// or open_tree(2) with OPEN_TREE_CLONE. +// +// A mount object fd can be used as an argument to move_mount(2) to place the mount +// on the directory tree, or it can be used as a dirfd argument to the *at() syscalls. +// If a mount object fd is closed without calling move_mount(2), the mount is unmounted. +package mountfd + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Fd represents a mount object file descriptor. +// +// Fd is installed in task FD tables like any other file description implementation, so +// it is marked savable to take part in checkpoint/restore. +// +// +stateify savable +type Fd struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + // anonNS is the anonymous mount namespace created for backing this mount. + anonNS *vfs.MountNamespace +} + +// New creates a new mount object file descriptor from the anonymous mount namespace anonNS +// and the mount at the root of anonNS.
+func New(ctx context.Context, anonNS *vfs.MountNamespace, fileFlags uint32) (*vfs.FileDescription, error) { + fd := &Fd{ + anonNS: anonNS, + } + + // The mount comes from the root of the anonymous mount namespace. + // + // Reference ownership: on success the returned fd keeps anonNS and drops one reference + // on it in Release; on error nothing here drops a reference on anonNS, so it stays with + // the caller. + root := anonNS.Root(ctx) + defer root.DecRef(ctx) + + err := fd.vfsfd.Init(fd, fileFlags, auth.CredentialsFromContext(ctx), root.Mount(), root.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }) + if err != nil { + return nil, err + } + + return &fd.vfsfd, nil +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *Fd) Release(ctx context.Context) { + // Decrement the references on the mount's anonymous namespace. + // If move_mount(2) was not called, this will also result in the filesystem being unmounted. + fd.anonNS.DecRef(ctx) +} diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 77d9a359b2..5b9e817ded 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -31,6 +31,7 @@ go_library( "sys_mempolicy.go", "sys_mmap.go", "sys_mount.go", + "sys_mount_fd.go", "sys_mq.go", "sys_msgqueue.go", "sys_pipe.go", @@ -86,9 +87,11 @@ go_library( "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fsimpl/eventfd", + "//pkg/sentry/fsimpl/fsconfigfd", "//pkg/sentry/fsimpl/host", "//pkg/sentry/fsimpl/iouringfs", "//pkg/sentry/fsimpl/lock", + "//pkg/sentry/fsimpl/mountfd", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/signalfd", "//pkg/sentry/fsimpl/timerfd", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 33fdddee7c..3c68015221 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -387,10 +387,10 @@ var AMD64 = &kernel.SyscallTable{ 426: syscalls.PartiallySupported("io_uring_enter", IOUringEnter, "Not all flags and functionality supported.", nil), 427: syscalls.ErrorWithEvent("io_uring_register", linuxerr.ENOSYS, "", nil), 428:
syscalls.ErrorWithEvent("open_tree", linuxerr.ENOSYS, "", nil), - 429: syscalls.ErrorWithEvent("move_mount", linuxerr.ENOSYS, "", nil), - 430: syscalls.ErrorWithEvent("fsopen", linuxerr.ENOSYS, "", nil), - 431: syscalls.ErrorWithEvent("fsconfig", linuxerr.ENOSYS, "", nil), - 432: syscalls.ErrorWithEvent("fsmount", linuxerr.ENOSYS, "", nil), + 429: syscalls.PartiallySupported("move_mount", MoveMount, "Options MOVE_MOUNT_SET_GROUP and MOVE_MOUNT_BENEATH are not supported.", nil), + 430: syscalls.PartiallySupported("fsopen", FSOpen, "Message retrieval interface not supported.", nil), + 431: syscalls.PartiallySupported("fsconfig", FSConfig, "Only options FSCONFIG_SET_FLAG, FSCONFIG_SET_STRING, and FSCONFIG_CMD_CREATE are supported. All option parsing and error handling happens upon FSCONFIG_CMD_CREATE.", nil), + 432: syscalls.PartiallySupported("fsmount", FSMount, "Options MOUNT_ATTR_NOSYMFOLLOW and MOUNT_ATTR_NODIRATIME are not supported.", nil), 433: syscalls.ErrorWithEvent("fspick", linuxerr.ENOSYS, "", nil), 434: syscalls.Supported("pidfd_open", PIDFDOpen), 435: syscalls.PartiallySupported("clone3", Clone3, "Options CLONE_NEWCGROUP, CLONE_INTO_CGROUP, CLONE_NEWTIME, CLONE_SYSVSEM and, SetTid are not supported.", nil), @@ -709,10 +709,10 @@ var ARM64 = &kernel.SyscallTable{ 426: syscalls.PartiallySupported("io_uring_enter", IOUringEnter, "Not all flags and functionality supported.", nil), 427: syscalls.ErrorWithEvent("io_uring_register", linuxerr.ENOSYS, "", nil), 428: syscalls.ErrorWithEvent("open_tree", linuxerr.ENOSYS, "", nil), - 429: syscalls.ErrorWithEvent("move_mount", linuxerr.ENOSYS, "", nil), - 430: syscalls.ErrorWithEvent("fsopen", linuxerr.ENOSYS, "", nil), - 431: syscalls.ErrorWithEvent("fsconfig", linuxerr.ENOSYS, "", nil), - 432: syscalls.ErrorWithEvent("fsmount", linuxerr.ENOSYS, "", nil), + 429: syscalls.PartiallySupported("move_mount", MoveMount, "Options MOVE_MOUNT_SET_GROUP and MOVE_MOUNT_BENEATH are not supported.", nil), + 430: 
syscalls.PartiallySupported("fsopen", FSOpen, "Message retrieval interface not supported.", nil), + 431: syscalls.PartiallySupported("fsconfig", FSConfig, "Only options FSCONFIG_SET_FLAG, FSCONFIG_SET_STRING, and FSCONFIG_CMD_CREATE are supported. All option parsing and error handling happens upon FSCONFIG_CMD_CREATE.", nil), + 432: syscalls.PartiallySupported("fsmount", FSMount, "Options MOUNT_ATTR_NOSYMFOLLOW and MOUNT_ATTR_NODIRATIME are not supported.", nil), 433: syscalls.ErrorWithEvent("fspick", linuxerr.ENOSYS, "", nil), 434: syscalls.Supported("pidfd_open", PIDFDOpen), 435: syscalls.PartiallySupported("clone3", Clone3, "Options CLONE_NEWCGROUP, CLONE_INTO_CGROUP, CLONE_NEWTIME, CLONE_SYSVSEM and clone_args.set_tid are not supported.", nil), diff --git a/pkg/sentry/syscalls/linux/sys_mount_fd.go b/pkg/sentry/syscalls/linux/sys_mount_fd.go new file mode 100644 index 0000000000..3683e01396 --- /dev/null +++ b/pkg/sentry/syscalls/linux/sys_mount_fd.go @@ -0,0 +1,293 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package linux + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/fsconfigfd" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/mountfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// FSOpen implements Linux syscall fsopen(2). +// +// The filesystem name is copied in and handed to the new context fd as-is; this function +// does not check it against the registered filesystem types. +func FSOpen(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fsnameAddr := args[0].Pointer() + flags := args[1].Uint() + + if flags&^linux.FSOPEN_CLOEXEC != 0 { + return 0, nil, linuxerr.EINVAL + } + + // Must have CAP_SYS_ADMIN in the current mount namespace's associated user + // namespace. + creds := t.Credentials() + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().Owner) { + return 0, nil, linuxerr.EPERM + } + + fsname, err := t.CopyInString(fsnameAddr, hostarch.PageSize) + if err != nil { + return 0, nil, err + } + + vfsObj := t.Kernel().VFS() + fileFlags := uint32(linux.O_RDWR) + file, err := fsconfigfd.New(t, vfsObj, fsname, fileFlags) + if err != nil { + return 0, nil, err + } + defer file.DecRef(t) + fsfd, err := t.NewFDFrom(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.FSOPEN_CLOEXEC == linux.FSOPEN_CLOEXEC, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fsfd), nil, nil +} + +// paramFromFSConfigArgs converts the cmd, value and aux arguments of fsconfig(2) into an +// FSParameter. FSCONFIG_SET_FLAG and FSCONFIG_SET_STRING yield a parameter; the +// FSCONFIG_CMD_* commands carry none and yield (nil, nil); the remaining FSCONFIG_SET_* +// commands and unknown commands fail with EINVAL. String values are copied in from +// valueAddr, bounded by hostarch.PageSize. +func paramFromFSConfigArgs(t *kernel.Task, cmd uint32, valueAddr hostarch.Addr, aux int32) (*fsconfigfd.FSParameter, error) { + var value fsconfigfd.FSValue + switch cmd { + case linux.FSCONFIG_SET_FLAG: + if valueAddr != 0 || aux != 0 { + return nil, linuxerr.EINVAL + } + + value = fsconfigfd.FSValueFlag{} + case linux.FSCONFIG_SET_STRING: + if aux != 0 { + return nil, linuxerr.EINVAL + } + str, err := t.CopyInString(valueAddr, hostarch.PageSize) + if err != nil { + return nil, err + } + + value = fsconfigfd.FSValueString(str) + case linux.FSCONFIG_SET_BINARY: + fallthrough + case
linux.FSCONFIG_SET_FD: + fallthrough + case linux.FSCONFIG_SET_PATH: + fallthrough + case linux.FSCONFIG_SET_PATH_EMPTY: + return nil, linuxerr.EINVAL + case linux.FSCONFIG_CMD_CREATE: + fallthrough + case linux.FSCONFIG_CMD_CREATE_EXCL: + fallthrough + case linux.FSCONFIG_CMD_RECONFIGURE: + // Commands carry no parameter; the caller tells this apart by the nil result. + return nil, nil + default: + return nil, linuxerr.EINVAL + } + + param := fsconfigfd.FSParameter{ + Value: value, + // TODO(b/513024543): support non-flag/non-string parameters, which will include using DirFd + DirFd: -1, + } + + return &param, nil +} + +// FSConfig implements Linux syscall fsconfig(2). +// +// FSCONFIG_SET_* commands are turned into an FSParameter and stored on the context fd, +// FSCONFIG_CMD_CREATE instantiates the filesystem, and every other command fails with EINVAL. +func FSConfig(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + cmd := args[1].Uint() + keyAddr := args[2].Pointer() + valueAddr := args[3].Pointer() + aux := args[4].Int() + + file := t.GetFile(fd) + if file == nil { + return 0, nil, linuxerr.EBADF + } + defer file.DecRef(t) + + fsfd, ok := file.Impl().(*fsconfigfd.Fd) + if !ok { + return 0, nil, linuxerr.EINVAL + } + + // First, is this a request to set a parameter (FSCONFIG_SET_*)? + param, err := paramFromFSConfigArgs(t, cmd, valueAddr, aux) + if err != nil { + return 0, nil, err + } + + if param != nil { + // FSCONFIG_SET_*: copy in the key and set the parameter + key, err := t.CopyInString(keyAddr, hostarch.PageSize) + if err != nil { + return 0, nil, err + } + err = fsfd.SetParam(key, *param) + return 0, nil, err + } + + if cmd == linux.FSCONFIG_CMD_CREATE { + // FSCONFIG_CMD_CREATE: create a detached mount + + // CAP_SYS_ADMIN check performed in fsfd.DoCmdCreate().
+ vfsObj := t.Kernel().VFS() + err := fsfd.DoCmdCreate(t, vfsObj) + return 0, nil, err + } + + // TODO(b/513024543): support FSCONFIG_CMD_CREATE_EXCL and FSCONFIG_CMD_RECONFIGURE + + return 0, nil, linuxerr.EINVAL +} + +// fsmountValidAttrFlags is the set of MOUNT_ATTR_* bits that FSMount accepts in its +// attribute flags argument; any other bit is rejected with EINVAL. +var fsmountValidAttrFlags = uint32(linux.MOUNT_ATTR_RDONLY | linux.MOUNT_ATTR_NOSUID | linux.MOUNT_ATTR_NODEV | linux.MOUNT_ATTR_NOEXEC | linux.MOUNT_ATTR__ATIME) + +// parseAttrFlagsIntoMountOpts applies the MOUNT_ATTR_* bits in attrFlags to opts. It only +// ever sets options and never clears one that is already set in opts. Within the atime +// field only MOUNT_ATTR_NOATIME has an effect. +func parseAttrFlagsIntoMountOpts(attrFlags uint32, opts *vfs.MountOptions) { + if attrFlags&linux.MOUNT_ATTR_RDONLY == linux.MOUNT_ATTR_RDONLY { + opts.ReadOnly = true + } + if attrFlags&linux.MOUNT_ATTR_NOSUID == linux.MOUNT_ATTR_NOSUID { + opts.Flags.NoSUID = true + } + if attrFlags&linux.MOUNT_ATTR_NODEV == linux.MOUNT_ATTR_NODEV { + opts.Flags.NoDev = true + } + if attrFlags&linux.MOUNT_ATTR_NOEXEC == linux.MOUNT_ATTR_NOEXEC { + opts.Flags.NoExec = true + } + if attrFlags&linux.MOUNT_ATTR__ATIME == linux.MOUNT_ATTR_NOATIME { + opts.Flags.NoATime = true + } +} + +// FSMount implements Linux syscall fsmount(2). +func FSMount(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + flags := args[1].Uint() + attrFlags := args[2].Uint() + + if flags&^linux.FSMOUNT_CLOEXEC != 0 { + return 0, nil, linuxerr.EINVAL + } + + if attrFlags&^fsmountValidAttrFlags != 0 { + return 0, nil, linuxerr.EINVAL + } + + // MOUNT_ATTR__ATIME is a multi-bit field rather than a set of independent flags, and only + // the RELATIME, NOATIME and STRICTATIME values are defined. Reject any other value, as + // Linux's fsmount(2) does, instead of silently ignoring it. + switch attrFlags & linux.MOUNT_ATTR__ATIME { + case linux.MOUNT_ATTR_RELATIME, linux.MOUNT_ATTR_NOATIME, linux.MOUNT_ATTR_STRICTATIME: + default: + return 0, nil, linuxerr.EINVAL + } + + // Must have CAP_SYS_ADMIN in the current mount namespace's associated user + // namespace.
+ creds := t.Credentials() + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().Owner) { + return 0, nil, linuxerr.EPERM + } + + file := t.GetFile(fd) + if file == nil { + return 0, nil, linuxerr.EBADF + } + defer file.DecRef(t) + + fsfd, ok := file.Impl().(*fsconfigfd.Fd) + if !ok { + return 0, nil, linuxerr.EINVAL + } + + // Fetch the previously-instantiated filesystem + fs, root, opts, err := fsfd.GetFilesystem() + if err != nil { + return 0, nil, err + } + + // Parse mount options specified in attrFlags + parseAttrFlagsIntoMountOpts(attrFlags, opts) + + // Create the mount, which we will place at the root of a new anonymous mount namespace + mountNs := t.Kernel().VFS().NewMountNamespaceFrom(t, creds, fs, root, opts, t.Kernel(), true /* anon */) + + // Create the mount object fd + mountFile, err := mountfd.New(t, mountNs, linux.O_RDONLY) + if err != nil { + // mountfd.New only takes over our reference on mountNs when it succeeds, so drop it + // here; otherwise the anonymous mount namespace would be leaked. + mountNs.DecRef(t) + return 0, nil, err + } + // From here on mountFile owns mountNs: releasing the file drops the namespace reference. + defer mountFile.DecRef(t) + mountFd, err := t.NewFDFrom(0, mountFile, kernel.FDFlags{ + CloseOnExec: flags&linux.FSMOUNT_CLOEXEC == linux.FSMOUNT_CLOEXEC, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(mountFd), nil, nil +} + +// supportedMoveMountFlags is the set of move_mount(2) flags that MoveMount acts on. The +// MOVE_MOUNT_*_AUTOMOUNTS flags are accepted as well, but MoveMount masks them off before +// checking against this set. +const supportedMoveMountFlags = linux.MOVE_MOUNT_F_EMPTY_PATH | linux.MOVE_MOUNT_T_EMPTY_PATH | linux.MOVE_MOUNT_F_SYMLINKS | linux.MOVE_MOUNT_T_SYMLINKS + +// MoveMount implements Linux syscall move_mount(2). +func MoveMount(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fromDirfd := args[0].Int() + fromAddr := args[1].Pointer() + toDirfd := args[2].Int() + toAddr := args[3].Pointer() + flags := args[4].Uint() + + // TODO(b/270247637): gVisor does not yet support automount, so + // MOVE_MOUNT_*_AUTOMOUNTS flags are a no-op.
+ flags &= ^(uint32(linux.MOVE_MOUNT_F_AUTOMOUNTS | linux.MOVE_MOUNT_T_AUTOMOUNTS)) + + if flags&^supportedMoveMountFlags != 0 { + return 0, nil, linuxerr.EINVAL + } + + // Must have CAP_SYS_ADMIN in the current mount namespace's associated user + // namespace. + creds := t.Credentials() + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespace().Owner) { + return 0, nil, linuxerr.EPERM + } + + // Resolve the source, i.e. the mount to move. The MOVE_MOUNT_F_* flags decide whether an + // empty path is allowed and whether a final symlink is followed. + fromPath, err := copyInPath(t, fromAddr) + if err != nil { + return 0, nil, err + } + from, err := getTaskPathOperation(t, fromDirfd, fromPath, shouldAllowEmptyPath(flags&linux.MOVE_MOUNT_F_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.MOVE_MOUNT_F_SYMLINKS != 0)) + if err != nil { + return 0, nil, err + } + defer from.Release(t) + // Resolve the target, i.e. the new mountpoint, using the MOVE_MOUNT_T_* flags the same way. + toPath, err := copyInPath(t, toAddr) + if err != nil { + return 0, nil, err + } + to, err := getTaskPathOperation(t, toDirfd, toPath, shouldAllowEmptyPath(flags&linux.MOVE_MOUNT_T_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.MOVE_MOUNT_T_SYMLINKS != 0)) + if err != nil { + return 0, nil, err + } + defer to.Release(t) + + // Re-attach the mount to the destination mountpoint + vfsObj := t.Kernel().VFS() + err = vfsObj.MoveMountAt(t, creds, &from.pop, &to.pop) + if err != nil { + return 0, nil, err + } + + return 0, nil, nil +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 4ebcf89562..c8c720db8d 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -279,9 +279,11 @@ func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth // peers and followers. This method consumes the reference on mp. It is analogous to // fs/namespace.c:attach_recursive_mnt() in Linux. The mount point mp must have its dentry locked // before calling attachTreeLocked. +// If tryMove is set to true, mnt is first detached from its parent mount, or severed from its +// namespace if it is the root of an anonymous mount namespace, before performing the attach operation.
// // +checklocks:vfs.mountMu -func (vfs *VirtualFilesystem) attachTreeLocked(ctx context.Context, mnt *Mount, mp VirtualDentry) error { +func (vfs *VirtualFilesystem) attachTreeLocked(ctx context.Context, mnt *Mount, mp VirtualDentry, tryMove bool) error { cleanup := cleanup.Make(func() { vfs.cleanupGroupIDs(mnt.submountsLocked()) // +checklocksforce mp.dentry.mu.Unlock() @@ -325,6 +327,19 @@ func (vfs *VirtualFilesystem) attachTreeLocked(ctx context.Context, mnt *Mount, } } vfs.mounts.seq.BeginWrite() + if tryMove { + if mnt.parent() != nil { + // Moving a submount: disconnect from old parent mount + oldMp := vfs.disconnectLocked(mnt) + vfs.delayDecRef(oldMp) + vfs.delayDecRef(mnt) + } else if mnt.ns != nil && mnt.ns.anon && mnt.ns.root == mnt { + // Moving the root of an anonymous mount namespace: sever the mount from its ns + mnt.ns.root = nil + vfs.delayDecRef(mnt) + } + vfs.migrateChildrenNs(mnt, mp.mount.ns) + } vfs.connectLocked(mnt, mp, mp.mount.ns) vfs.mounts.seq.EndWrite() mp.dentry.mu.Unlock() @@ -389,7 +404,125 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr vfs.delayDecRef(mp) return linuxerr.EINVAL } - return vfs.attachTreeLocked(ctx, mnt, mp) + return vfs.attachTreeLocked(ctx, mnt, mp, false) +} + +// MoveMountAt connects source's mount at the path represented by target, possibly first disconnecting source's mnt. +// The target's mountpoint may or may not be connected. +// The path lookups for source and target checks traversal permissions against creds. +// +// Roughly analogous to Linux fs/namespace.c:do_move_mount(). +func (vfs *VirtualFilesystem) MoveMountAt(ctx context.Context, creds *auth.Credentials, source *PathOperation, target *PathOperation) error { + // Lookup the source path + sourceVd, err := vfs.GetDentryAt(ctx, creds, source, &GetDentryOptions{CheckSearchable: true}) + if err != nil { + return err + } + defer sourceVd.DecRef(ctx) + + // The source path must be the root of its mount. 
+ sourceMnt := sourceVd.mount + if sourceVd.dentry != sourceMnt.root { + return linuxerr.EINVAL + } + + // Fetch source stat info + sourceStat, err := vfs.StatAt(ctx, creds, &PathOperation{ + Root: sourceVd, + Start: sourceVd, + }, &StatOptions{ + Mask: linux.STATX_MODE, + }) + if err != nil { + return err + } + + // Lookup the target path + targetVd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{CheckSearchable: true}) + if err != nil { + return err + } + targetCleanup := cleanup.Make(func() { + targetVd.DecRef(ctx) + }) + defer targetCleanup.Clean() + + // Fetch target stat info + targetStat, err := vfs.StatAt(ctx, creds, &PathOperation{ + Root: targetVd, + Start: targetVd, + }, &StatOptions{ + Mask: linux.STATX_MODE, + }) + if err != nil { + return err + } + + // Only allow directory-to-directory or file-to-file mount moves + if linux.FileMode(sourceStat.Mode).IsDir() != linux.FileMode(targetStat.Mode).IsDir() { + return linuxerr.EINVAL + } + + // We can't hold vfs.mountMu while calling FilesystemImpl methods due to + // lock ordering. + vfs.lockMounts() + defer vfs.unlockMounts(ctx) + + targetCleanup.Release() + mp, err := vfs.lockMountpoint(targetVd) + if err != nil { + return err + } + mpCleanup := cleanup.Make(func() { + mp.dentry.mu.Unlock() + vfs.delayDecRef(mp) // +checklocksforce + }) + defer mpCleanup.Clean() + + // Is the source mount in our mount namespace? 
+ if vfs.validInMountNS(ctx, sourceMnt) { + // Source mount must: + // - Not be locked + // - Have a non-shared parent + if sourceMnt.locked || sourceMnt.parent() == nil || sourceMnt.parent().isShared { + return linuxerr.EINVAL + } + // Target mount must: + // - Also be in our namespace + if !vfs.validInMountNS(ctx, mp.mount) { + return linuxerr.EINVAL + } + } else { + // If source is not in our mount ns, it must be: + // - Mounted at the root of anonymous mount ns + if sourceMnt.umounted || sourceMnt.ns == nil || !sourceMnt.ns.anon || sourceMnt != sourceMnt.ns.root { + return linuxerr.EINVAL + } + // - In a *different* mount ns from the destination (i.e. not moving within its own anon mount ns) + if sourceMnt.ns == mp.mount.ns || mp.mount.ns == nil { + return linuxerr.EINVAL + } + // And the destination, if not in our mount ns, must be: + // - Mounted + // - In an anonymous mount ns + if !vfs.validInMountNS(ctx, mp.mount) { + if mp.mount.umounted || !mp.mount.ns.anon { + return linuxerr.EINVAL + } + // TODO(b/513024543): when open_tree(2) is implemented, we may need to start tracking + // and checking the mount namespace's "owner" + } + } + + // Verify that source is not an ancestor of destination + for p := mp.mount; p != nil; p = p.parent() { + if p == sourceMnt { + return linuxerr.ELOOP + } + } + + mpCleanup.Release() + return vfs.attachTreeLocked(ctx, sourceVd.mount, mp, true /* tryMove */) } // lockMountpoint returns VirtualDentry with a locked Dentry. 
If vd is a @@ -625,7 +758,7 @@ func (vfs *VirtualFilesystem) BindAt(ctx context.Context, creds *auth.Credential vfs.delayDecRef(clone) clone.locked = false - if err := vfs.attachTreeLocked(ctx, clone, mp); err != nil { + if err := vfs.attachTreeLocked(ctx, clone, mp, false); err != nil { vfs.abortUncomittedChildren(ctx, clone) return err } @@ -854,6 +987,29 @@ func (vfs *VirtualFilesystem) changeMountpoint(mnt *Mount, mp VirtualDentry) { mp.dentry.mu.Unlock() } +// migrateChildrenNs recursively migrates mnt's children into newNs. +// +// Preconditions: +// - vfs.mountMu must be locked. +func (vfs *VirtualFilesystem) migrateChildrenNs(mnt *Mount, newNs *MountNamespace) { + for c := range mnt.children { + vd := c.getKey() + oldNs := c.ns + + oldNs.mountpoints[vd.dentry]-- + oldNs.mounts-- + if oldNs.mountpoints[vd.dentry] == 0 { + delete(oldNs.mountpoints, vd.dentry) + } + + c.ns = newNs + newNs.mountpoints[vd.dentry]++ + newNs.mounts++ + + vfs.migrateChildrenNs(c, newNs) + } +} + // connectLocked makes vd the mount parent/point for mnt. It consumes // references held by vd. // diff --git a/pkg/sentry/vfs/namespace.go b/pkg/sentry/vfs/namespace.go index 6a2eb5a32f..b3188632f5 100644 --- a/pkg/sentry/vfs/namespace.go +++ b/pkg/sentry/vfs/namespace.go @@ -35,6 +35,10 @@ type MountNamespace struct { // Owner is the usernamespace that owns this mount namespace. Owner *auth.UserNamespace + // vfs is the vfs this namespace belongs to. + // vfs is immutable. + vfs *VirtualFilesystem + // root is the MountNamespace's root mount. root *Mount @@ -57,6 +61,9 @@ type MountNamespace struct { // pending is the total number of pending mounts in this mount namespace. pending uint32 + + // anon indicates whether the mount namespace is anonymous. + anon bool } // Namespace is the namespace interface. 
@@ -87,7 +94,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace( if err != nil { return nil, err } - return vfs.NewMountNamespaceFrom(ctx, creds, fs, root, opts, nsfs), nil + return vfs.NewMountNamespaceFrom(ctx, creds, fs, root, opts, nsfs, false /* anon */), nil } type namespaceDefaultRefs struct { @@ -113,10 +120,13 @@ func (vfs *VirtualFilesystem) NewMountNamespaceFrom( root *Dentry, opts *MountOptions, nsfs NamespaceInodeGetter, + anon bool, ) *MountNamespace { mntns := &MountNamespace{ + vfs: vfs, Owner: creds.UserNamespace, mountpoints: make(map[*Dentry]uint32), + anon: anon, } if nsfs == nil { refs := &namespaceDefaultRefs{destroy: mntns.Destroy} @@ -166,6 +176,7 @@ func (vfs *VirtualFilesystem) CloneMountNamespace( nsfs NamespaceInodeGetter, ) (*MountNamespace, error) { newns := &MountNamespace{ + vfs: vfs, Owner: uns, mountpoints: make(map[*Dentry]uint32), } @@ -197,11 +208,13 @@ func (vfs *VirtualFilesystem) CloneMountNamespace( // Destroy implements nsfs.Namespace.Destroy. func (mntns *MountNamespace) Destroy(ctx context.Context) { - vfs := mntns.root.fs.VirtualFilesystem() + vfs := mntns.vfs vfs.lockMounts() - vfs.umountTreeLocked(mntns.root, &umountRecursiveOptions{ - disconnectHierarchy: true, - }) + if mntns.root != nil { + vfs.umountTreeLocked(mntns.root, &umountRecursiveOptions{ + disconnectHierarchy: true, + }) + } vfs.unlockMounts(ctx) } @@ -232,7 +245,12 @@ func (mntns *MountNamespace) TryIncRef() bool { // Root returns mntns' root. If the root is over-mounted, it returns the top // mount. +// May return an empty virtual dentry if mntns is an anonymous mount namespace and its root +// has been moved to another mountpoint. 
func (mntns *MountNamespace) Root(ctx context.Context) VirtualDentry { + if mntns.root == nil { + return VirtualDentry{} + } vfs := mntns.root.fs.VirtualFilesystem() vd := VirtualDentry{ mount: mntns.root, diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index c9baecbc80..ce62184c7e 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -419,6 +419,12 @@ syscall_test( test = "//test/syscalls/linux:mount_test", ) +syscall_test( + add_overlay = True, + save = False, + test = "//test/syscalls/linux:mount_fd_test", +) + syscall_test( test = "//test/syscalls/linux:mq_test", ) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index ee505aaa39..2cee03ecdc 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1533,6 +1533,35 @@ cc_binary( ], ) +cc_binary( + name = "mount_fd_test", + testonly = 1, + srcs = ["mount_fd.cc"], + linkstatic = 1, + malloc = "//test/util:errno_safe_allocator", + deps = select_gtest() + [ + "//test/util:capability_util", + "//test/util:cleanup", + "//test/util:eventfd_util", + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:logging", + "//test/util:memory_util", + "//test/util:mount_util", + "//test/util:multiprocess_util", + "//test/util:posix_error", + "//test/util:save_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/time", + ], +) + cc_binary( name = "mremap_test", testonly = 1, diff --git a/test/syscalls/linux/mount_fd.cc b/test/syscalls/linux/mount_fd.cc new file mode 100644 index 0000000000..8f58a42660 --- /dev/null +++ b/test/syscalls/linux/mount_fd.cc @@ -0,0 +1,618 @@ +// Copyright 2026 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "test/util/cleanup.h" +#include "test/util/fs_util.h" +#include "test/util/linux_capability_util.h" +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +// Syscall Definitions/Constants + +#ifndef FSOPEN_CLOEXEC +#define FSOPEN_CLOEXEC 0x1 +#endif + +#ifndef FSCONFIG_SET_FLAG +#define FSCONFIG_SET_FLAG 0x0 +#define FSCONFIG_SET_STRING 0x1 +#define FSCONFIG_SET_BINARY 0x2 +#define FSCONFIG_SET_PATH 0x3 +#define FSCONFIG_SET_PATH_EMPTY 0x4 +#define FSCONFIG_SET_FD 0x5 +#define FSCONFIG_CMD_CREATE 0x6 +#define FSCONFIG_CMD_RECONFIGURE 0x7 +#define FSCONFIG_CMD_CREATE_EXCL 0x8 +#endif + +#ifndef FSMOUNT_CLOEXEC +#define FSMOUNT_CLOEXEC 0x1 +#endif + +#ifndef MOUNT_ATTR_RDONLY +#define MOUNT_ATTR_RDONLY 0x00000001 +#define MOUNT_ATTR_NOSUID 0x00000002 +#define MOUNT_ATTR_NODEV 0x00000004 +#define MOUNT_ATTR_NOEXEC 0x00000008 +#define MOUNT_ATTR__ATIME 0x00000070 +#define MOUNT_ATTR_RELATIME 0x00000000 +#define MOUNT_ATTR_NOATIME 0x00000010 +#define MOUNT_ATTR_STRICTATIME 0x00000020 +#define MOUNT_ATTR_NODIRATIME 0x00000080 +#endif + +#ifndef MOVE_MOUNT_F_SYMLINKS +#define MOVE_MOUNT_F_SYMLINKS 0x00000001 +#define MOVE_MOUNT_F_AUTOMOUNTS 
0x00000002 +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 +#define MOVE_MOUNT_T_SYMLINKS 0x00000010 +#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 +#define MOVE_MOUNT__MASK 0x00000077 +#endif + +#ifndef ST_RDONLY +#define ST_RDONLY 0x1 +#endif +#ifndef ST_NODEV +#define ST_NODEV 0x4 +#endif +#ifndef ST_NOEXEC +#define ST_NOEXEC 0x8 +#endif + +#ifndef SYS_fsopen +#if defined(__x86_64__) +#define SYS_move_mount 429 +#define SYS_fsopen 430 +#define SYS_fsconfig 431 +#define SYS_fsmount 432 +#define SYS_fspick 433 +#elif defined(__aarch64__) +#define SYS_move_mount 429 +#define SYS_fsopen 430 +#define SYS_fsconfig 431 +#define SYS_fsmount 432 +#define SYS_fspick 433 +#else +#error "Unknown architecture" +#endif +#endif + +inline int fsopen(const char* fsname, unsigned int flags) { + return syscall(SYS_fsopen, fsname, flags); +} +inline int fsconfig(int fd, unsigned int cmd, const char* key, + const void* value, int aux) { + return syscall(SYS_fsconfig, fd, cmd, key, value, aux); +} +inline int fsmount(int fsfd, unsigned int flags, unsigned int attr_flags) { + return syscall(SYS_fsmount, fsfd, flags, attr_flags); +} +inline int move_mount(int from_dirfd, const char* from_pathname, int to_dirfd, + const char* to_pathname, unsigned int flags) { + return syscall(SYS_move_mount, from_dirfd, from_pathname, to_dirfd, + to_pathname, flags); +} + +namespace gvisor { +namespace testing { + +namespace { + +constexpr char kTmpfs[] = "tmpfs"; + +// fsopen(2) tests + +TEST(FsOpenTest, FsOpenSuccess) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fd1 = fsopen(kTmpfs, 0); + ASSERT_THAT(fd1, SyscallSucceeds()); + EXPECT_THAT(close(fd1), SyscallSucceeds()); + + int fd2 = fsopen(kTmpfs, FSOPEN_CLOEXEC); + ASSERT_THAT(fd2, SyscallSucceeds()); + EXPECT_THAT(close(fd2), SyscallSucceeds()); +} + +TEST(FsOpenTest, FsOpenInvalidFlags) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + 
EXPECT_THAT(fsopen(kTmpfs, -1), SyscallFailsWithErrno(EINVAL)); +} + +TEST(FsOpenTest, FsOpenPermDenied) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + // Drop privileges in another thread to verify CAP_SYS_ADMIN check. + ScopedThread([&]() { + EXPECT_NO_ERRNO(SetCapability(CAP_SYS_ADMIN, false)); + EXPECT_THAT(fsopen(kTmpfs, 0), SyscallFailsWithErrno(EPERM)); + }); +} + +// fsconfig(2) tests + +TEST(FsConfigTest, FsConfigSetSource) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "my_source", 0), + SyscallSucceeds()); +} + +TEST(FsConfigTest, FsConfigSetParams) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_FLAG, "ro", NULL, 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "mode", "755", 0), + SyscallSucceeds()); +} + +TEST(FsConfigTest, FsConfigSetSourceTwice) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "src1", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "src2", 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(FsConfigTest, FsConfigCreateWithoutSource) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + // Note: Skipped on Linux because Linux tmpfs does not require a source. + // Currently, gVisor requires a source for all mounts. 
+ SKIP_IF(!IsRunningOnGvisor()); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(FsConfigTest, FsConfigBadFilesystem) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + // Note: Skipped on Linux because Linux fails immediately at fsopen() time + // with ENODEV. Currently on gVisor, fsopen() succeeds and the ENODEV failure + // occurs during fsconfig(FSCONFIG_CMD_CREATE). + SKIP_IF(!IsRunningOnGvisor()); + + int fsfd = fsopen("invalid_fs_name", 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "src", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallFailsWithErrno(ENODEV)); +} + +TEST(FsConfigTest, FsConfigCreateSuccess) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "my_source", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); +} + +TEST(FsConfigTest, FsConfigCreateTwice) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "my_source", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallFailsWithErrno(EBUSY)); +} + +TEST(FsConfigTest, FsConfigUnsupportedCmds) { + 
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + // Note: Skipped on Linux because gVisor currently does not support + // FSCONFIG_CMD_CREATE_EXCL or FSCONFIG_CMD_RECONFIGURE (returns EINVAL). + SKIP_IF(!IsRunningOnGvisor()); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE_EXCL, NULL, NULL, 0), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(FsConfigTest, FsConfigUnsupportedTypes) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + // Note: Skipped on Linux because gVisor currently does not support + // BINARY, FD, or PATH parameter types (returns EINVAL). + SKIP_IF(!IsRunningOnGvisor()); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_BINARY, "key", "val", 0), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_FD, "key", NULL, 0), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_PATH, "key", "val", 0), + SyscallFailsWithErrno(EINVAL)); +} + +// fsmount(2) tests + +TEST(FsMountTest, FsMountSuccess) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup_fs = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "my_source", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "nr_inodes", "12345", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_FLAG, "ro", NULL, 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + + int mntfd = + fsmount(fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_NODEV | 
MOUNT_ATTR_NOEXEC); + ASSERT_THAT(mntfd, SyscallSucceeds()); + auto cleanup_mnt = Cleanup([&]() { close(mntfd); }); + + struct statfs st; + ASSERT_THAT(fstatfs(mntfd, &st), SyscallSucceeds()); + EXPECT_EQ(st.f_files, 12345); + EXPECT_EQ(st.f_flags & (ST_NODEV | ST_NOEXEC | ST_RDONLY), + ST_NODEV | ST_NOEXEC | ST_RDONLY); +} + +TEST(FsMountTest, FsMountBeforeCreate) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "my_source", 0), + SyscallSucceeds()); + EXPECT_THAT(fsmount(fsfd, 0, 0), SyscallFailsWithErrno(EINVAL)); +} + +TEST(FsMountTest, FsMountTwice) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup_fs = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "my_source", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + + int mntfd = fsmount(fsfd, 0, 0); + ASSERT_THAT(mntfd, SyscallSucceeds()); + auto cleanup_mnt = Cleanup([&]() { close(mntfd); }); + + EXPECT_THAT(fsmount(fsfd, 0, 0), SyscallFailsWithErrno(EBUSY)); +} + +TEST(FsMountTest, FsMountInvalidFlags) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup_fs = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "my_source", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + + EXPECT_THAT(fsmount(fsfd, -1, 0), SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(fsmount(fsfd, 0, -1), SyscallFailsWithErrno(EINVAL)); +} + +TEST(FsMountTest, FsMountDetachedUsage) { + 
SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup_fs = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "my_source", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + + int mntfd = fsmount(fsfd, 0, 0); + ASSERT_THAT(mntfd, SyscallSucceeds()); + auto cleanup_mnt = Cleanup([&]() { close(mntfd); }); + + // Verify that detached mount FD acts as a valid root dirfd before attachment. + EXPECT_THAT(mkdirat(mntfd, "subdir", 0755), SyscallSucceeds()); + + int filefd = openat(mntfd, "file", O_CREAT | O_RDWR, 0644); + ASSERT_THAT(filefd, SyscallSucceeds()); + EXPECT_THAT(close(filefd), SyscallSucceeds()); +} + +// move_mount(2) tests + +TEST(MoveMountTest, MoveMountAttachSuccess) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup_fs = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "my_source", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + + int mntfd = fsmount(fsfd, 0, 0); + ASSERT_THAT(mntfd, SyscallSucceeds()); + auto cleanup_mnt = Cleanup([&]() { close(mntfd); }); + + // Pre-move creation: Create files and directories in detached mntfd. + EXPECT_THAT(mkdirat(mntfd, "pre_move_dir", 0755), SyscallSucceeds()); + int filefd = openat(mntfd, "pre_move_file", O_CREAT | O_RDWR, 0644); + ASSERT_THAT(filefd, SyscallSucceeds()); + EXPECT_THAT(close(filefd), SyscallSucceeds()); + + // Create a mountpoint on the existing root filesystem + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // Move the detached mount onto the new mount directory. 
+ EXPECT_THAT(move_mount(mntfd, "", AT_FDCWD, dir.path().c_str(), + MOVE_MOUNT_F_EMPTY_PATH), + SyscallSucceeds()); + cleanup_mnt.Release()(); + + // Verify that pre-move contents are fully preserved and accessible at target. + EXPECT_THAT(access(JoinPath(dir.path(), "pre_move_dir").c_str(), F_OK), + SyscallSucceeds()); + EXPECT_THAT(access(JoinPath(dir.path(), "pre_move_file").c_str(), F_OK), + SyscallSucceeds()); + + EXPECT_THAT(umount(dir.path().c_str()), SyscallSucceeds()); +} + +TEST(MoveMountTest, MoveMountFromAttached) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup_fs = Cleanup([&]() { close(fsfd); }); + + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "my_source", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + + int mntfd = fsmount(fsfd, 0, 0); + ASSERT_THAT(mntfd, SyscallSucceeds()); + auto cleanup_mnt = Cleanup([&]() { close(mntfd); }); + + auto const dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // First attach to dir1. + EXPECT_THAT(move_mount(mntfd, "", AT_FDCWD, dir1.path().c_str(), + MOVE_MOUNT_F_EMPTY_PATH), + SyscallSucceeds()); + cleanup_mnt.Release()(); + + // Now move from dir1 to dir2. + EXPECT_THAT(move_mount(AT_FDCWD, dir1.path().c_str(), AT_FDCWD, + dir2.path().c_str(), 0), + SyscallSucceeds()); + + EXPECT_THAT(umount(dir2.path().c_str()), SyscallSucceeds()); +} + +TEST(MoveMountTest, MoveMountOntoDetached) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + // Create first detached mount mntfd1. 
+ int fsfd1 = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd1, SyscallSucceeds()); + auto cleanup_fs1 = Cleanup([&]() { close(fsfd1); }); + EXPECT_THAT(fsconfig(fsfd1, FSCONFIG_SET_STRING, "source", "src1", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd1, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + int mntfd1 = fsmount(fsfd1, 0, 0); + ASSERT_THAT(mntfd1, SyscallSucceeds()); + auto cleanup_mnt1 = Cleanup([&]() { close(mntfd1); }); + + EXPECT_THAT(mkdirat(mntfd1, "submnt", 0755), SyscallSucceeds()); + + // Create second detached mount mntfd2. + int fsfd2 = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd2, SyscallSucceeds()); + auto cleanup_fs2 = Cleanup([&]() { close(fsfd2); }); + EXPECT_THAT(fsconfig(fsfd2, FSCONFIG_SET_STRING, "source", "src2", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd2, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + int mntfd2 = fsmount(fsfd2, 0, 0); + ASSERT_THAT(mntfd2, SyscallSucceeds()); + auto cleanup_mnt2 = Cleanup([&]() { close(mntfd2); }); + + // Move mntfd2 onto submnt of mntfd1. + EXPECT_THAT(move_mount(mntfd2, "", mntfd1, "submnt", MOVE_MOUNT_F_EMPTY_PATH), + SyscallSucceeds()); + cleanup_mnt2.Release()(); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(move_mount(mntfd1, "", AT_FDCWD, dir.path().c_str(), + MOVE_MOUNT_F_EMPTY_PATH), + SyscallSucceeds()); + cleanup_mnt1.Release()(); + + EXPECT_THAT(umount(JoinPath(dir.path(), "submnt").c_str()), + SyscallSucceeds()); + EXPECT_THAT(umount(dir.path().c_str()), SyscallSucceeds()); +} + +TEST(MoveMountTest, MoveMountSubmountToFs) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + // To move a mount out of an anonymous mount namespace, it must be + // the *root* of that namespace. + + // Create first detached mount mntfd1. 
+ int fsfd1 = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd1, SyscallSucceeds()); + auto cleanup_fs1 = Cleanup([&]() { close(fsfd1); }); + EXPECT_THAT(fsconfig(fsfd1, FSCONFIG_SET_STRING, "source", "src1", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd1, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + int mntfd1 = fsmount(fsfd1, 0, 0); + ASSERT_THAT(mntfd1, SyscallSucceeds()); + auto cleanup_mnt1 = Cleanup([&]() { close(mntfd1); }); + + EXPECT_THAT(mkdirat(mntfd1, "submnt", 0755), SyscallSucceeds()); + + // Create second detached mount mntfd2. + int fsfd2 = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd2, SyscallSucceeds()); + auto cleanup_fs2 = Cleanup([&]() { close(fsfd2); }); + EXPECT_THAT(fsconfig(fsfd2, FSCONFIG_SET_STRING, "source", "src2", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd2, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + int mntfd2 = fsmount(fsfd2, 0, 0); + ASSERT_THAT(mntfd2, SyscallSucceeds()); + auto cleanup_mnt2 = Cleanup([&]() { close(mntfd2); }); + + // Move mntfd2 onto submnt of mntfd1. + EXPECT_THAT(move_mount(mntfd2, "", mntfd1, "submnt", MOVE_MOUNT_F_EMPTY_PATH), + SyscallSucceeds()); + cleanup_mnt2.Release()(); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // Attempt to move *only* the submount (submnt of mntfd1) onto the real tree + // (which should fail). 
+ EXPECT_THAT(move_mount(mntfd1, "submnt", AT_FDCWD, dir.path().c_str(), 0), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(MoveMountTest, MoveMountWithoutEmptyPath) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup_fs = Cleanup([&]() { close(fsfd); }); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "src", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + int mntfd = fsmount(fsfd, 0, 0); + ASSERT_THAT(mntfd, SyscallSucceeds()); + auto cleanup_mnt = Cleanup([&]() { close(mntfd); }); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + // Omit MOVE_MOUNT_F_EMPTY_PATH when passing mntfd and "". + EXPECT_THAT(move_mount(mntfd, "", AT_FDCWD, dir.path().c_str(), 0), + SyscallFailsWithErrno(ENOENT)); +} + +TEST(MoveMountTest, MoveMountLoop) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int fsfd = fsopen(kTmpfs, 0); + ASSERT_THAT(fsfd, SyscallSucceeds()); + auto cleanup_fs = Cleanup([&]() { close(fsfd); }); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "src", 0), + SyscallSucceeds()); + EXPECT_THAT(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), + SyscallSucceeds()); + int mntfd = fsmount(fsfd, 0, 0); + ASSERT_THAT(mntfd, SyscallSucceeds()); + auto cleanup_mnt = Cleanup([&]() { close(mntfd); }); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + + EXPECT_THAT(move_mount(mntfd, "", AT_FDCWD, dir.path().c_str(), + MOVE_MOUNT_F_EMPTY_PATH), + SyscallSucceeds()); + cleanup_mnt.Release(); + close(mntfd); + + EXPECT_THAT(mkdir(JoinPath(dir.path(), "child").c_str(), 0755), + SyscallSucceeds()); + + // Attempt to move parent mount under its own child directory. 
+ EXPECT_THAT(move_mount(AT_FDCWD, dir.path().c_str(), AT_FDCWD, + JoinPath(dir.path(), "child").c_str(), 0), + SyscallFailsWithErrno(ELOOP)); + + EXPECT_THAT(umount(dir.path().c_str()), SyscallSucceeds()); +} + +} // namespace +} // namespace testing +} // namespace gvisor