Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions g3doc/user_guide/containerd/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,87 @@ log_level = "debug"
EOF
```

## User Namespace Injection

The shim can inject a Linux user namespace and uid/gid mappings into a
sandbox container's OCI spec when the pod opts in via an annotation, so the
workload runs in a user namespace without the caller having to set one up
explicitly. This is useful on Kubernetes nodes whose runtime stack does not
yet support `pod.spec.hostUsers: false` ([KEP-127][KEP-127]) for runsc; once
that path is plumbed through, drop the annotation and use `hostUsers: false`
instead. See [issue #13303](https://github.com/google/gvisor/issues/13303).

Two gates must both be true for injection to happen:

1. The operator enables the feature in `runsc.toml` with
`enable_user_namespace_annotation = true`.
2. The pod sets `metadata.annotations["dev.gvisor.spec.user-namespace"] =
"true"`. Containerd propagates `dev.gvisor.*` pod annotations to the
sandbox OCI spec via the `pod_annotations` match list.

The operator gate exists so a misconfigured pod cannot unilaterally request
a userns on a runtime that is not provisioned for one.

Application/exec containers within the same pod inherit the sandbox's user
namespace from runsc; only the sandbox container's spec is modified. If the
caller already declared a user namespace or uid/gid mappings (e.g. via
`hostUsers: false`), the shim leaves the spec untouched.

Each opted-in sandbox is assigned a contiguous, non-overlapping block of
host UIDs and GIDs from a per-node pool. Allocations are persisted under
`user_namespace_state_dir` (default `/run/runsc/userns-pool`) so they
survive shim restarts, and are freed when the sandbox is deleted.

Enable it in `runsc.toml`:

```shell
cat <<EOF | sudo tee /etc/containerd/runsc.toml
enable_user_namespace_annotation = true
user_namespace_host_uid_base = 100000
user_namespace_host_gid_base = 100000
# Optional, with defaults shown:
user_namespace_range_size = 65536 # UIDs/GIDs per sandbox
user_namespace_pool_size = 1000 # max concurrent sandboxes
user_namespace_state_dir = "/run/runsc/userns-pool"
EOF
```

Ensure the runtime registration in containerd's config has
`pod_annotations = ["dev.gvisor.*"]` so the opt-in annotation reaches the
shim:

```shell
cat <<EOF | sudo tee /etc/containerd/config.toml
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runsc]
runtime_type = "io.containerd.runsc.v1"
pod_annotations = ["dev.gvisor.*"]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runsc.options]
TypeUrl = "io.containerd.runsc.v1.options"
ConfigPath = "/etc/containerd/runsc.toml"
EOF
```

A pod opts in by setting the annotation:

```yaml
apiVersion: v1
kind: Pod
metadata:
  annotations:
    dev.gvisor.spec.user-namespace: "true"
spec:
  runtimeClassName: gvisor
  containers:
  - name: app
    image: ...
```

The host UID/GID range used by the pool must not overlap with system or
kubelet-managed UIDs/GIDs. With the example values above, the pool occupies
host IDs `[100000, 100000 + 1000*65536)`, i.e. `[100000, 65636000)`; size
the base, range size, and pool size accordingly for your node.

[KEP-127]: https://github.com/kubernetes/enhancements/issues/127

## NVIDIA Container Runtime

If you want to use
Expand Down
29 changes: 28 additions & 1 deletion pkg/shim/v1/runsc/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import (
"gvisor.dev/gvisor/pkg/shim/v1/proc"
"gvisor.dev/gvisor/pkg/shim/v1/runsccmd"
"gvisor.dev/gvisor/pkg/shim/v1/runtimeoptions"
"gvisor.dev/gvisor/pkg/shim/v1/utils"
)

// CgroupMode is the cgroups mode that is being used by the container.
Expand Down Expand Up @@ -69,6 +70,13 @@ type Container struct {

// cgroup is the cgroups mode that is being used by the container.
cgroup CgroupMode

// userNS is the user-namespace allocator config that owns this
// container's UID/GID slot, set when newInit injected a user namespace
// into the spec via UserNamespaceConfig. Nil otherwise (sandbox without
// shim-side userns, or non-sandbox container that inherits its
// sandbox's userns).
userNS *utils.UserNamespaceConfig
}

// NewContainer returns a new runsc container
Expand Down Expand Up @@ -197,10 +205,20 @@ func NewContainer(ctx context.Context, platform stdio.Platform, r *task.CreateTa
FSRestoreDirect: FSRestoreDirect,
}

process, err := newInit(filepath.Join(r.Bundle, "work"), ns, platform, config, &opts, st.Rootfs)
process, userNS, err := newInit(filepath.Join(r.Bundle, "work"), ns, platform, config, &opts, st.Rootfs)
if err != nil {
return nil, err
}
// Release the user namespace slot if anything from this point on fails.
// On success cu.Release() below cancels the cleanup.
if userNS != nil {
sandboxID := r.ID
cu.Add(func() {
if err := utils.ReleaseUserNamespaceSlot(userNS, sandboxID); err != nil {
log.L.Warningf("failed to release user namespace slot for %s: %v", sandboxID, err)
}
})
}
if err := process.Create(ctx, config); err != nil {
return nil, err
}
Expand All @@ -218,6 +236,7 @@ func NewContainer(ctx context.Context, platform stdio.Platform, r *task.CreateTa
task: process,
cgroup: cgroupMode,
processes: make(map[string]extension.Process),
userNS: userNS,
}
return &c, nil
}
Expand Down Expand Up @@ -312,6 +331,14 @@ func (c *Container) Delete(ctx context.Context, r *task.DeleteRequest) (extensio
// When ExecID is empty, it removes the init task in the container.
if r.ExecID != "" {
c.ProcessRemove(r.ExecID)
} else if c.userNS != nil {
// Sandbox init container is being deleted; release its user namespace
// slot. Best-effort: a leaked slot is an operator nuisance, not a
// correctness issue (cleared on reboot since /run is tmpfs), so log
// and continue.
if err := utils.ReleaseUserNamespaceSlot(c.userNS, c.ID); err != nil {
log.L.Warningf("failed to release user namespace slot for %s: %v", c.ID, err)
}
}
return p, nil
}
Expand Down
48 changes: 48 additions & 0 deletions pkg/shim/v1/runsc/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,54 @@ type Options struct {
// EnableHibernateServer indicates if the hibernate server should be started.
EnableHibernateServer bool `toml:"enable_hibernate_server" json:"enableHibernateServer"`

// EnableUserNamespaceAnnotation is the operator-side gate that allows
// pods to opt into shim-side user namespace injection via the pod
// annotation "dev.gvisor.spec.user-namespace": "true" (see
// utils.UserNamespaceRequestAnnotation). When true, sandbox containers
// whose pod annotations contain that key get a user namespace plus
// contiguous, non-overlapping uid/gid mappings injected into their OCI
// spec before runsc is invoked. Application/exec containers within the
// same pod inherit the sandbox's user namespace.
//
// This exists to let runsc workloads run inside a user namespace on
// nodes whose kubelet+containerd stack does not yet plumb
// pod.spec.hostUsers (KEP-127) through to runsc. When that path lands
// upstream, drop the annotation and use hostUsers: false on the pod
// spec instead.
// See https://github.com/google/gvisor/issues/13303.
//
// The shim respects caller-supplied user namespaces and uid/gid
// mappings: if the OCI spec already declares them (e.g. via
// hostUsers: false), the shim leaves the spec untouched and does not
// allocate a slot.
//
// Pods can only request the userns when this option is true, so a
// misconfigured workload cannot unilaterally enable it.
EnableUserNamespaceAnnotation bool `toml:"enable_user_namespace_annotation" json:"enableUserNamespaceAnnotation"`

// UserNamespaceHostUIDBase is the lowest host UID used by the
// per-node UID pool. Each sandbox that opts in receives a contiguous
// block of UserNamespaceRangeSize UIDs starting at
// UserNamespaceHostUIDBase + slot*UserNamespaceRangeSize.
UserNamespaceHostUIDBase uint32 `toml:"user_namespace_host_uid_base" json:"userNamespaceHostUidBase"`

// UserNamespaceHostGIDBase is the GID equivalent of
// UserNamespaceHostUIDBase.
UserNamespaceHostGIDBase uint32 `toml:"user_namespace_host_gid_base" json:"userNamespaceHostGidBase"`

// UserNamespaceRangeSize is the number of UIDs/GIDs each sandbox
// receives. Defaults to 65536 when the annotation gate is enabled and
// this field is unset.
UserNamespaceRangeSize uint32 `toml:"user_namespace_range_size" json:"userNamespaceRangeSize"`

// UserNamespacePoolSize is the maximum number of concurrent sandboxes
// that can hold non-overlapping UID/GID ranges on this node. Defaults
// to 1000 when the annotation gate is enabled and this field is unset.
UserNamespacePoolSize uint32 `toml:"user_namespace_pool_size" json:"userNamespacePoolSize"`

// UserNamespaceStateDir is the directory used to persist slot
// allocations across shim restarts. Defaults to /run/runsc/userns-pool.
UserNamespaceStateDir string `toml:"user_namespace_state_dir" json:"userNamespaceStateDir"`

// RunscConfig is a key/value map of all runsc flags.
RunscConfig map[string]string `toml:"runsc_config" json:"runscConfig"`
}
54 changes: 49 additions & 5 deletions pkg/shim/v1/runsc/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -657,21 +657,65 @@ func getTopic(e any) string {
return runtime.TaskUnknownTopic
}

func newInit(workDir, namespace string, platform stdio.Platform, r *proc.CreateConfig, options *Options, rootfs string) (*proc.Init, error) {
func newInit(workDir, namespace string, platform stdio.Platform, r *proc.CreateConfig, options *Options, rootfs string) (*proc.Init, *utils.UserNamespaceConfig, error) {
spec, err := utils.ReadSpec(r.Bundle)
if err != nil {
return nil, fmt.Errorf("read oci spec: %w", err)
return nil, nil, fmt.Errorf("read oci spec: %w", err)
}

updated, err := utils.UpdateVolumeAnnotations(spec)
if err != nil {
return nil, fmt.Errorf("update volume annotations: %w", err)
return nil, nil, fmt.Errorf("update volume annotations: %w", err)
}
updated = setPodCgroup(spec) || updated

// Shim-side user namespace injection.
//
// Two gates must both be true: the runtime operator opted in via
// enable_user_namespace_annotation, AND the pod's metadata.annotations
// requested it via "dev.gvisor.spec.user-namespace": "true". The
// operator gate exists so a misconfigured workload cannot unilaterally
// enable a userns when the runtime is not configured to support one.
//
// Only applied to sandbox containers; application/exec containers within
// the pod inherit the sandbox's user namespace from runsc. The caller's
// pre-existing user namespace or uid/gid mappings (e.g. from kubelet's
// pod.spec.hostUsers: false plumbing) take precedence: InjectUserNamespace
// returns updated=false in that case and we drop the slot we just claimed.
var userNS *utils.UserNamespaceConfig
if options.EnableUserNamespaceAnnotation && utils.IsSandbox(spec) && utils.HasUserNamespaceRequest(spec) {
userNS = &utils.UserNamespaceConfig{
HostUIDBase: options.UserNamespaceHostUIDBase,
HostGIDBase: options.UserNamespaceHostGIDBase,
RangeSize: options.UserNamespaceRangeSize,
PoolSize: options.UserNamespacePoolSize,
StateDir: options.UserNamespaceStateDir,
}
slot, err := utils.AllocateUserNamespaceSlot(userNS, r.ID)
if err != nil {
return nil, nil, fmt.Errorf("allocate user namespace slot: %w", err)
}
injected, err := utils.InjectUserNamespace(spec, userNS, slot)
if err != nil {
_ = utils.ReleaseUserNamespaceSlot(userNS, r.ID)
return nil, nil, fmt.Errorf("inject user namespace: %w", err)
}
if injected {
updated = true
} else {
// Caller already configured a user namespace; release the slot
// we claimed and let the caller's spec stand.
_ = utils.ReleaseUserNamespaceSlot(userNS, r.ID)
userNS = nil
}
}

if updated {
if err := utils.WriteSpec(r.Bundle, spec); err != nil {
return nil, err
if userNS != nil {
_ = utils.ReleaseUserNamespaceSlot(userNS, r.ID)
}
return nil, nil, err
}
}

Expand All @@ -691,7 +735,7 @@ func newInit(workDir, namespace string, platform stdio.Platform, r *proc.CreateC
p.Sandbox = specutils.SpecContainerType(spec) == specutils.ContainerTypeSandbox
p.UserLog = utils.UserLogPath(spec)
p.Monitor = reaper.Default
return p, nil
return p, userNS, nil
}

// setPodCgroup searches for the pod cgroup path inside the container's cgroup
Expand Down
6 changes: 5 additions & 1 deletion pkg/shim/v1/utils/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ go_library(
name = "utils",
srcs = [
"annotations.go",
"userns.go",
"utils.go",
"volumes.go",
],
Expand All @@ -25,7 +26,10 @@ go_library(
go_test(
name = "utils_test",
size = "small",
srcs = ["volumes_test.go"],
srcs = [
"userns_test.go",
"volumes_test.go",
],
library = ":utils",
deps = [
"@com_github_mohae_deepcopy//:go_default_library",
Expand Down
Loading