Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 27 additions & 12 deletions pkg/shim/v1/runsccmd/runsc.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
package runsccmd

import (
"bytes"
"context"
"encoding/json"
"fmt"
Expand Down Expand Up @@ -610,16 +609,29 @@ func (r *Runsc) command(context context.Context, args ...string) *exec.Cmd {
}

func cmdOutput(cmd *exec.Cmd, combined bool) ([]byte, []byte, error) {
stdout := getBuf()
defer putBuf(stdout)
cmd.Stdout = stdout
cmd.Stderr = stdout
// Use *os.File-backed temp files instead of bytes.Buffer. With a
// non-*os.File writer Go's exec package creates an internal pipe and
// blocks cmd.Wait() until the read-side sees EOF. When runsc daemonizes
// a sandbox child, the sandbox inherits the pipe's write end, EOF never
// arrives, and the shim's TaskService.Create hangs forever.
stdoutFile, err := os.CreateTemp("", "runsc-stdout-*")
if err != nil {
return nil, nil, err
}
defer os.Remove(stdoutFile.Name())
defer stdoutFile.Close()
cmd.Stdout = stdoutFile
cmd.Stderr = stdoutFile

var stderr *bytes.Buffer
var stderrFile *os.File
if !combined {
stderr = getBuf()
defer putBuf(stderr)
cmd.Stderr = stderr
stderrFile, err = os.CreateTemp("", "runsc-stderr-*")
if err != nil {
return nil, nil, err
}
defer os.Remove(stderrFile.Name())
defer stderrFile.Close()
cmd.Stderr = stderrFile
}
ec, err := Monitor.Start(cmd)
if err != nil {
Expand All @@ -630,8 +642,11 @@ func cmdOutput(cmd *exec.Cmd, combined bool) ([]byte, []byte, error) {
if err == nil && status != 0 {
err = fmt.Errorf("%q did not terminate successfully", cmd.Args[0])
}
if stderr == nil {
return stdout.Bytes(), nil, err

stdoutBytes, _ := os.ReadFile(stdoutFile.Name())
if stderrFile == nil {
return stdoutBytes, nil, err
}
return stdout.Bytes(), stderr.Bytes(), err
stderrBytes, _ := os.ReadFile(stderrFile.Name())
return stdoutBytes, stderrBytes, err
}
6 changes: 4 additions & 2 deletions runsc/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -981,8 +981,10 @@ type Overlay2 struct {
}

func defaultOverlay2() *Overlay2 {
// Rootfs overlay is enabled by default and backed by a file in rootfs itself.
return &Overlay2{rootMount: true, subMounts: false, medium: SelfOverlay}
// PatchNoDefaultRootfsOverlay: default to no overlay so that host-side
// rootfs writes after sandbox start (e.g. Garden streamInToBundleRoot
// for CF /tmp/app) are visible inside the sandbox via the gofer.
return &Overlay2{rootMount: false, subMounts: false, medium: NoOverlay}
}

func setOverlay2Err(v string) error {
Expand Down
44 changes: 40 additions & 4 deletions runsc/container/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -1570,10 +1570,18 @@ func (c *Container) createGoferProcess(conf *config.Config, mountHints *boot.Pod
// users in the sandbox.
if !rootlessEUID {
if userNS, ok := specutils.GetNS(specs.UserNamespace, c.Spec); ok {
nss = append(nss, userNS)
specutils.SetUIDGIDMappings(cmd, c.Spec)
// We need to set UID and GID to have capabilities in a new user namespace.
cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
// PatchGoferUserns: only join the sandbox userns if our host uid is
// mapped into it. With unprivileged userns where host uid 0 is
// unmapped, setns() fails (no CAP_SYS_ADMIN in target). Run the
// gofer in the init userns as root in that case.
if goferCanJoinUserNS(userNS) {
nss = append(nss, userNS)
specutils.SetUIDGIDMappings(cmd, c.Spec)
// We need to set UID and GID to have capabilities in a new user namespace.
cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
} else {
log.Warningf("PatchGoferUserns: host uid 0 not mapped in %q; running gofer in init userns", userNS.Path)
}
}
} else {
userNS, ok := specutils.GetNS(specs.UserNamespace, c.Spec)
Expand Down Expand Up @@ -2292,3 +2300,31 @@ func (c *Container) GetNetworkConfig() (*boot.CreateLinksAndRoutesArgs, error) {
}
return c.Sandbox.GetNetworkConfig()
}

// PatchGoferUserns: returns true if it is safe for the gofer to setns() into
// the sandbox userns referenced by ns. When the userns has a Path (existing
// NS), we read its uid_map and verify host uid 0 falls inside any mapped
// range. If we can't determine, default to true (preserve original behaviour).
func goferCanJoinUserNS(ns specs.LinuxNamespace) bool {
if ns.Path == "" {
return true // a new NS will be created via clone — always fine
}
// ns.Path is typically "/proc/<pid>/ns/user"; resolve sibling uid_map.
parts := strings.Split(strings.Trim(ns.Path, "/"), "/")
if len(parts) < 4 || parts[0] != "proc" || parts[2] != "ns" {
return true
}
data, err := os.ReadFile("/" + parts[0] + "/" + parts[1] + "/uid_map")
if err != nil {
return true // can't tell — let StartInNS try
}
for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") {
var inID, outID, sz int
if _, err := fmt.Sscanf(line, "%d %d %d", &inID, &outID, &sz); err == nil {
if outID <= 0 && 0 < outID+sz {
return true
}
}
}
return false
}
26 changes: 26 additions & 0 deletions runsc/container/state_file.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,15 @@ func LoadSandbox(rootDir, id string, opts LoadOpts) ([]*Container, error) {
}

func findContainerID(rootDir, partialID string) (FullID, error) {
// PatchExactMatch-state-file: prefer an EXACT container-id match first.
// Without this, a lookup for an id that is itself a prefix of other
// container ids (e.g. Garden creates child exec containers named
// "<id>-envoy-startup-healthcheck-N" alongside "<id>") falls into the
// prefix branch below and is rejected as ambiguous.
exactPattern := buildPath(rootDir, FullID{SandboxID: "*", ContainerID: partialID}, stateFileExtension)
if exact, err := filepath.Glob(exactPattern); err == nil && len(exact) == 1 {
return parseFileName(filepath.Base(exact[0]))
}
// Check whether the id fully specifies an existing container.
pattern := buildPath(rootDir, FullID{SandboxID: "*", ContainerID: partialID + "*"}, stateFileExtension)
list, err := filepath.Glob(pattern)
Expand All @@ -210,6 +219,23 @@ func findContainerID(rootDir, partialID string) (FullID, error) {
return parseFileName(filepath.Base(list[0]))
}

// PatchPrefixNoMatch-state-file: if the exact-match step above missed
// AND every prefix hit is a "<partialID>-..." child id, then partialID
// itself does not exist (typical Garden post-cleanup case: parent's
// state file removed but pea exec children still on disk). Return
// ErrNotExist instead of falling through to the ambiguity branch below.
allChildren := true
for _, p := range list {
full, perr := parseFileName(filepath.Base(p))
if perr != nil || !strings.HasPrefix(full.ContainerID, partialID+"-") {
allChildren = false
break
}
}
if allChildren {
return FullID{}, os.ErrNotExist
}

// Now see whether id could be an abbreviation of exactly 1 of the
// container ids. If id is ambiguous (it could match more than 1
// container), it is an error.
Expand Down
107 changes: 91 additions & 16 deletions runsc/sandbox/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"path/filepath"
"runtime"
"strconv"
"time"

specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/vishvananda/netlink"
Expand Down Expand Up @@ -129,21 +130,45 @@ func addrBitLength(ip net.IP) int {
}

// removeLinkAddresses removes IP addresses from the host NIC for a link.
// PatchPeerAddrDel: list addrs with full netlink attrs (Peer/Scope/Flags) and
// AddrDel the actual struct. Required for point-to-point links (e.g. CF silk
// CNI sets eth0 with IFA_LOCAL=local, IFA_ADDRESS=peer); ParseAddr-based
// AddrDel uses IFA_ADDRESS=local and the kernel rejects with EADDRNOTAVAIL.
func removeLinkAddresses(linkName string, addresses []boot.IPWithPrefix) error {
ifaceLink, err := netlink.LinkByName(linkName)
if err != nil {
return fmt.Errorf("getting link for interface %q: %w", linkName, err)
}
existing, err := netlink.AddrList(ifaceLink, netlink.FAMILY_ALL)
if err != nil {
return fmt.Errorf("listing addresses for interface %q: %w", linkName, err)
}
for _, addr := range addresses {
ipNet := &net.IPNet{
IP: addr.Address,
Mask: net.CIDRMask(addr.PrefixLen, addrBitLength(addr.Address)),
}
if err := removeAddress(ifaceLink, ipNet.String()); err != nil {
// If we encounter an error while deleting the ip,
// verify the ip is still present on the interface.
if present, err := isAddressOnInterface(linkName, ipNet); err != nil {
return fmt.Errorf("checking if address %v is on interface %q: %w", ipNet, linkName, err)
var nlAddr *netlink.Addr
for i := range existing {
e := &existing[i]
if e.IPNet == nil || !e.IPNet.IP.Equal(ipNet.IP) {
continue
}
ep, _ := e.IPNet.Mask.Size()
lp, _ := ipNet.Mask.Size()
if ep != lp {
continue
}
nlAddr = e
break
}
if nlAddr == nil {
// Already gone.
continue
}
if err := netlink.AddrDel(ifaceLink, nlAddr); err != nil {
if present, perr := isAddressOnInterface(linkName, ipNet); perr != nil {
return fmt.Errorf("checking if address %v is on interface %q: %w", ipNet, linkName, perr)
} else if !present {
continue
}
Expand Down Expand Up @@ -212,17 +237,6 @@ func collectLinksAndRoutes(conf *config.Config, disableIPv6 bool) (boot.CreateLi
return boot.CreateLinksAndRoutesArgs{}, fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err)
}

var neighbors []boot.Neighbor
for _, n := range dump {
// There are only two "good" states NUD_PERMANENT and NUD_REACHABLE,
// but NUD_REACHABLE is fully dynamic and will be re-probed anyway.
if n.State == netlink.NUD_PERMANENT {
log.Debugf("Copying a static ARP entry: %+v %+v", n.IP, n.HardwareAddr)
// No flags are copied because Stack.AddStaticNeighbor does not support flags right now.
neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr})
}
}

// Scrape the routes before removing the address, since that
// will remove the routes as well.
routes, defv4, defv6, err := routesForIface(iface, disableIPv6)
Expand All @@ -245,6 +259,67 @@ func collectLinksAndRoutes(conf *config.Config, disableIPv6 bool) (boot.CreateLi
args.Defaultv6Gateway.Name = iface.Name
}

// PatchArpWarmCache: warm the kernel neighbor cache for all gateway
// IPs we will hand to the sandbox, then copy "live" entries to
// boot.Neighbor so gVisor's netstack starts with a static cache.
// Required for silk-cni peer-style /32 setups where gVisor's own ARP
// through the FDBased link is unreliable.
gwSet := map[string]net.IP{}
for _, r := range routes {
if r.Gateway != nil && !r.Gateway.IsUnspecified() {
gwSet[r.Gateway.String()] = r.Gateway
}
}
if defv4 != nil && defv4.Gateway != nil && !defv4.Gateway.IsUnspecified() {
gwSet[defv4.Gateway.String()] = defv4.Gateway
}
if defv6 != nil && defv6.Gateway != nil && !defv6.Gateway.IsUnspecified() {
gwSet[defv6.Gateway.String()] = defv6.Gateway
}
haveEntry := func(ip net.IP) bool {
for _, n := range dump {
if n.IP != nil && n.IP.Equal(ip) && n.HardwareAddr != nil && len(n.HardwareAddr) > 0 {
return true
}
}
return false
}
warmed := false
for _, gw := range gwSet {
if haveEntry(gw) {
continue
}
network := "udp4"
if gw.To4() == nil {
network = "udp6"
}
c, derr := net.DialUDP(network, nil, &net.UDPAddr{IP: gw, Port: 9})
if derr != nil {
log.Debugf("PatchArpWarmCache: DialUDP(%s) failed: %v", gw, derr)
continue
}
c.Write([]byte{})
c.Close()
warmed = true
}
if warmed {
time.Sleep(75 * time.Millisecond)
if redump, rerr := netlink.NeighList(iface.Index, 0); rerr == nil {
dump = redump
}
}
const liveStates = netlink.NUD_PERMANENT | netlink.NUD_REACHABLE | netlink.NUD_STALE | netlink.NUD_DELAY | netlink.NUD_PROBE
var neighbors []boot.Neighbor
for _, n := range dump {
if n.HardwareAddr == nil || len(n.HardwareAddr) == 0 {
continue
}
if n.State&liveStates == 0 {
continue
}
neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr})
}

// Get the link for the interface.
ifaceLink, err := netlink.LinkByName(iface.Name)
if err != nil {
Expand Down