diff --git a/pkg/shim/v1/runsccmd/runsc.go b/pkg/shim/v1/runsccmd/runsc.go index 64b590bd98..6b2a91e934 100644 --- a/pkg/shim/v1/runsccmd/runsc.go +++ b/pkg/shim/v1/runsccmd/runsc.go @@ -17,7 +17,6 @@ package runsccmd import ( - "bytes" "context" "encoding/json" "fmt" @@ -610,16 +609,29 @@ func (r *Runsc) command(context context.Context, args ...string) *exec.Cmd { } func cmdOutput(cmd *exec.Cmd, combined bool) ([]byte, []byte, error) { - stdout := getBuf() - defer putBuf(stdout) - cmd.Stdout = stdout - cmd.Stderr = stdout + // Use *os.File-backed temp files instead of bytes.Buffer. With a + // non-*os.File writer Go's exec package creates an internal pipe and + // blocks cmd.Wait() until the read-side sees EOF. When runsc daemonizes + // a sandbox child, the sandbox inherits the pipe's write end, EOF never + // arrives, and the shim's TaskService.Create hangs forever. + stdoutFile, err := os.CreateTemp("", "runsc-stdout-*") + if err != nil { + return nil, nil, err + } + defer os.Remove(stdoutFile.Name()) + defer stdoutFile.Close() + cmd.Stdout = stdoutFile + cmd.Stderr = stdoutFile - var stderr *bytes.Buffer + var stderrFile *os.File if !combined { - stderr = getBuf() - defer putBuf(stderr) - cmd.Stderr = stderr + stderrFile, err = os.CreateTemp("", "runsc-stderr-*") + if err != nil { + return nil, nil, err + } + defer os.Remove(stderrFile.Name()) + defer stderrFile.Close() + cmd.Stderr = stderrFile } ec, err := Monitor.Start(cmd) if err != nil { @@ -630,8 +642,11 @@ func cmdOutput(cmd *exec.Cmd, combined bool) ([]byte, []byte, error) { if err == nil && status != 0 { err = fmt.Errorf("%q did not terminate successfully", cmd.Args[0]) } - if stderr == nil { - return stdout.Bytes(), nil, err + + stdoutBytes, _ := os.ReadFile(stdoutFile.Name()) + if stderrFile == nil { + return stdoutBytes, nil, err } - return stdout.Bytes(), stderr.Bytes(), err + stderrBytes, _ := os.ReadFile(stderrFile.Name()) + return stdoutBytes, stderrBytes, err } diff --git a/runsc/config/config.go b/runsc/config/config.go index fd6268015d..3fbddba8c1 100644 --- a/runsc/config/config.go +++ b/runsc/config/config.go @@ -981,8 +981,10 @@ type Overlay2 struct { } func defaultOverlay2() *Overlay2 { - // Rootfs overlay is enabled by default and backed by a file in rootfs itself. - return &Overlay2{rootMount: true, subMounts: false, medium: SelfOverlay} + // PatchNoDefaultRootfsOverlay: default to no overlay so that host-side + // rootfs writes after sandbox start (e.g. Garden streamInToBundleRoot + // for CF /tmp/app) are visible inside the sandbox via the gofer. + return &Overlay2{rootMount: false, subMounts: false, medium: NoOverlay} } func setOverlay2Err(v string) error { diff --git a/runsc/container/container.go b/runsc/container/container.go index b99bb2e849..179a701f8c 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -1570,10 +1570,18 @@ func (c *Container) createGoferProcess(conf *config.Config, mountHints *boot.Pod // users in the sandbox. if !rootlessEUID { if userNS, ok := specutils.GetNS(specs.UserNamespace, c.Spec); ok { - nss = append(nss, userNS) - specutils.SetUIDGIDMappings(cmd, c.Spec) - // We need to set UID and GID to have capabilities in a new user namespace. - cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} + // PatchGoferUserns: only join the sandbox userns if our host uid is + // mapped into it. With unprivileged userns where host uid 0 is + // unmapped, setns() fails (no CAP_SYS_ADMIN in target). Run the + // gofer in the init userns as root in that case. + if goferCanJoinUserNS(userNS) { + nss = append(nss, userNS) + specutils.SetUIDGIDMappings(cmd, c.Spec) + // We need to set UID and GID to have capabilities in a new user namespace. + cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} + } else { + log.Warningf("PatchGoferUserns: host uid 0 not mapped in %q; running gofer in init userns", userNS.Path) + } } } else { userNS, ok := specutils.GetNS(specs.UserNamespace, c.Spec) @@ -2292,3 +2300,31 @@ func (c *Container) GetNetworkConfig() (*boot.CreateLinksAndRoutesArgs, error) { } return c.Sandbox.GetNetworkConfig() } + +// PatchGoferUserns: returns true if it is safe for the gofer to setns() into +// the sandbox userns referenced by ns. When the userns has a Path (existing +// NS), we read its uid_map and verify host uid 0 falls inside any mapped +// range. If we can't determine, default to true (preserve original behaviour). +func goferCanJoinUserNS(ns specs.LinuxNamespace) bool { + if ns.Path == "" { + return true // a new NS will be created via clone — always fine + } + // ns.Path is typically "/proc//ns/user"; resolve sibling uid_map. + parts := strings.Split(strings.Trim(ns.Path, "/"), "/") + if len(parts) < 4 || parts[0] != "proc" || parts[2] != "ns" { + return true + } + data, err := os.ReadFile("/" + parts[0] + "/" + parts[1] + "/uid_map") + if err != nil { + return true // can't tell — let StartInNS try + } + for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") { + var inID, outID, sz int + if _, err := fmt.Sscanf(line, "%d %d %d", &inID, &outID, &sz); err == nil { + if outID <= 0 && 0 < outID+sz { + return true + } + } + } + return false +} diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go index 0fcd70dff3..93909b3ab9 100644 --- a/runsc/container/state_file.go +++ b/runsc/container/state_file.go @@ -197,6 +197,15 @@ func LoadSandbox(rootDir, id string, opts LoadOpts) ([]*Container, error) { } func findContainerID(rootDir, partialID string) (FullID, error) { + // PatchExactMatch-state-file: prefer an EXACT container-id match first. + // Without this, a lookup for an id that is itself a prefix of other + // container ids (e.g. Garden creates child exec containers named + // "-envoy-startup-healthcheck-N" alongside "") falls into the + // prefix branch below and is rejected as ambiguous. + exactPattern := buildPath(rootDir, FullID{SandboxID: "*", ContainerID: partialID}, stateFileExtension) + if exact, err := filepath.Glob(exactPattern); err == nil && len(exact) == 1 { + return parseFileName(filepath.Base(exact[0])) + } // Check whether the id fully specifies an existing container. pattern := buildPath(rootDir, FullID{SandboxID: "*", ContainerID: partialID + "*"}, stateFileExtension) list, err := filepath.Glob(pattern) @@ -210,6 +219,23 @@ func findContainerID(rootDir, partialID string) (FullID, error) { return parseFileName(filepath.Base(list[0])) } + // PatchPrefixNoMatch-state-file: if the exact-match step above missed + // AND every prefix hit is a "-..." child id, then partialID + // itself does not exist (typical Garden post-cleanup case: parent's + // state file removed but pea exec children still on disk). Return + // ErrNotExist instead of falling through to the ambiguity branch below. + allChildren := true + for _, p := range list { + full, perr := parseFileName(filepath.Base(p)) + if perr != nil || !strings.HasPrefix(full.ContainerID, partialID+"-") { + allChildren = false + break + } + } + if allChildren { + return FullID{}, os.ErrNotExist + } + // Now see whether id could be an abbreviation of exactly 1 of the // container ids. If id is ambiguous (it could match more than 1 // container), it is an error. diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 760728ea99..7c2e4f5918 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -22,6 +22,7 @@ import ( "path/filepath" "runtime" "strconv" + "time" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/vishvananda/netlink" @@ -129,21 +130,45 @@ func addrBitLength(ip net.IP) int { } // removeLinkAddresses removes IP addresses from the host NIC for a link. +// PatchPeerAddrDel: list addrs with full netlink attrs (Peer/Scope/Flags) and +// AddrDel the actual struct. Required for point-to-point links (e.g. CF silk +// CNI sets eth0 with IFA_LOCAL=local, IFA_ADDRESS=peer); ParseAddr-based +// AddrDel uses IFA_ADDRESS=local and the kernel rejects with EADDRNOTAVAIL. func removeLinkAddresses(linkName string, addresses []boot.IPWithPrefix) error { ifaceLink, err := netlink.LinkByName(linkName) if err != nil { return fmt.Errorf("getting link for interface %q: %w", linkName, err) } + existing, err := netlink.AddrList(ifaceLink, netlink.FAMILY_ALL) + if err != nil { + return fmt.Errorf("listing addresses for interface %q: %w", linkName, err) + } for _, addr := range addresses { ipNet := &net.IPNet{ IP: addr.Address, Mask: net.CIDRMask(addr.PrefixLen, addrBitLength(addr.Address)), } - if err := removeAddress(ifaceLink, ipNet.String()); err != nil { - // If we encounter an error while deleting the ip, - // verify the ip is still present on the interface. - if present, err := isAddressOnInterface(linkName, ipNet); err != nil { - return fmt.Errorf("checking if address %v is on interface %q: %w", ipNet, linkName, err) + var nlAddr *netlink.Addr + for i := range existing { + e := &existing[i] + if e.IPNet == nil || !e.IPNet.IP.Equal(ipNet.IP) { + continue + } + ep, _ := e.IPNet.Mask.Size() + lp, _ := ipNet.Mask.Size() + if ep != lp { + continue + } + nlAddr = e + break + } + if nlAddr == nil { + // Already gone. + continue + } + if err := netlink.AddrDel(ifaceLink, nlAddr); err != nil { + if present, perr := isAddressOnInterface(linkName, ipNet); perr != nil { + return fmt.Errorf("checking if address %v is on interface %q: %w", ipNet, linkName, perr) } else if !present { continue } @@ -212,17 +237,6 @@ func collectLinksAndRoutes(conf *config.Config, disableIPv6 bool) (boot.CreateLi return boot.CreateLinksAndRoutesArgs{}, fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err) } - var neighbors []boot.Neighbor - for _, n := range dump { - // There are only two "good" states NUD_PERMANENT and NUD_REACHABLE, - // but NUD_REACHABLE is fully dynamic and will be re-probed anyway. - if n.State == netlink.NUD_PERMANENT { - log.Debugf("Copying a static ARP entry: %+v %+v", n.IP, n.HardwareAddr) - // No flags are copied because Stack.AddStaticNeighbor does not support flags right now. - neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr}) - } - } - // Scrape the routes before removing the address, since that // will remove the routes as well. routes, defv4, defv6, err := routesForIface(iface, disableIPv6) @@ -245,6 +259,67 @@ func collectLinksAndRoutes(conf *config.Config, disableIPv6 bool) (boot.CreateLi args.Defaultv6Gateway.Name = iface.Name } + // PatchArpWarmCache: warm the kernel neighbor cache for all gateway + // IPs we will hand to the sandbox, then copy "live" entries to + // boot.Neighbor so gVisor's netstack starts with a static cache. + // Required for silk-cni peer-style /32 setups where gVisor's own ARP + // through the FDBased link is unreliable. + gwSet := map[string]net.IP{} + for _, r := range routes { + if r.Gateway != nil && !r.Gateway.IsUnspecified() { + gwSet[r.Gateway.String()] = r.Gateway + } + } + if defv4 != nil && defv4.Gateway != nil && !defv4.Gateway.IsUnspecified() { + gwSet[defv4.Gateway.String()] = defv4.Gateway + } + if defv6 != nil && defv6.Gateway != nil && !defv6.Gateway.IsUnspecified() { + gwSet[defv6.Gateway.String()] = defv6.Gateway + } + haveEntry := func(ip net.IP) bool { + for _, n := range dump { + if n.IP != nil && n.IP.Equal(ip) && n.HardwareAddr != nil && len(n.HardwareAddr) > 0 { + return true + } + } + return false + } + warmed := false + for _, gw := range gwSet { + if haveEntry(gw) { + continue + } + network := "udp4" + if gw.To4() == nil { + network = "udp6" + } + c, derr := net.DialUDP(network, nil, &net.UDPAddr{IP: gw, Port: 9}) + if derr != nil { + log.Debugf("PatchArpWarmCache: DialUDP(%s) failed: %v", gw, derr) + continue + } + c.Write([]byte{}) + c.Close() + warmed = true + } + if warmed { + time.Sleep(75 * time.Millisecond) + if redump, rerr := netlink.NeighList(iface.Index, 0); rerr == nil { + dump = redump + } + } + const liveStates = netlink.NUD_PERMANENT | netlink.NUD_REACHABLE | netlink.NUD_STALE | netlink.NUD_DELAY | netlink.NUD_PROBE + var neighbors []boot.Neighbor + for _, n := range dump { + if n.HardwareAddr == nil || len(n.HardwareAddr) == 0 { + continue + } + if n.State&liveStates == 0 { + continue + } + neighbors = append(neighbors, boot.Neighbor{IP: n.IP, HardwareAddr: n.HardwareAddr}) + } + // Get the link for the interface. ifaceLink, err := netlink.LinkByName(iface.Name) if err != nil {