diff --git a/pkg/hostarch/hostarch_arm64.go b/pkg/hostarch/hostarch_arm64.go index f5d85dde45..d6600f9234 100644 --- a/pkg/hostarch/hostarch_arm64.go +++ b/pkg/hostarch/hostarch_arm64.go @@ -29,9 +29,15 @@ const ( // HugePageSize is the system huge page size. HugePageSize = 1 << HugePageShift + // JumboPageSize is the 1GB jumbo page size. + JumboPageSize = 1 << JumboPageShift + // CacheLineSize is the size of the cache line. CacheLineSize = 1 << CacheLineShift + // JumboPageShift is the binary log of jumbo page whose size is 1GB. + JumboPageShift = 30 + // CacheLineShift is the binary log of the cache line size. CacheLineShift = 6 ) diff --git a/pkg/hostarch/hostarch_x86.go b/pkg/hostarch/hostarch_x86.go index f79a101efb..24f152f90d 100644 --- a/pkg/hostarch/hostarch_x86.go +++ b/pkg/hostarch/hostarch_x86.go @@ -26,6 +26,9 @@ const ( // HugePageSize is the system huge page size. HugePageSize = 1 << HugePageShift + // JumboPageSize is the 1GB jumbo page size. + JumboPageSize = 1 << JumboPageShift + // CacheLineSize is the size of the cache line. CacheLineSize = 1 << CacheLineShift @@ -35,6 +38,9 @@ const ( // HugePageShift is the binary log of the system huge page size. HugePageShift = 21 + // JumboPageShift is the binary log of jumbo page whose size is 1GB. + JumboPageShift = 30 + // CacheLineShift is the binary log of the cache line size. 
CacheLineShift = 6 ) diff --git a/pkg/hostarch/sizes_util.go b/pkg/hostarch/sizes_util.go index 548d2617f0..d53c9b1270 100644 --- a/pkg/hostarch/sizes_util.go +++ b/pkg/hostarch/sizes_util.go @@ -11,6 +11,7 @@ const ( PageMask = PageSize - 1 HugePageMask = HugePageSize - 1 CacheLineMask = CacheLineSize - 1 + JumboPageMask = ^uintptr(JumboPageSize - 1) ) type bytecount interface { diff --git a/pkg/ring0/aarch64.go b/pkg/ring0/aarch64.go index 2c357f59ee..0f6a2306d0 100644 --- a/pkg/ring0/aarch64.go +++ b/pkg/ring0/aarch64.go @@ -102,5 +102,6 @@ const ( const ( Syscall Vector = El0SyncSVC PageFault Vector = El0SyncDa + OOMException Vector = El0Fiq VirtualizationException Vector = El0ErrBounce ) diff --git a/pkg/ring0/defs.go b/pkg/ring0/defs.go index 38ce9be1e6..e821cacebb 100644 --- a/pkg/ring0/defs.go +++ b/pkg/ring0/defs.go @@ -20,6 +20,18 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch/fpu" ) +const ( + // CPUIntel is Intel CPU. + CPUIntel uint64 = iota + + // CPUAMD is AMD (and compatible) CPU. + CPUAMD +) + +var ( + CPUVendor uint64 +) + // Kernel is a global kernel object. // // This contains global state, shared by multiple CPUs. diff --git a/pkg/ring0/defs_amd64.go b/pkg/ring0/defs_amd64.go index cb928327b8..bb6af2680f 100644 --- a/pkg/ring0/defs_amd64.go +++ b/pkg/ring0/defs_amd64.go @@ -108,6 +108,9 @@ type kernelEntry struct { // kernelCR3 is the cr3 used for sentry kernel. kernelCR3 uintptr + // whether enable VMCALL + enableVMCALL uint64 + // gdt is the CPU's descriptor table. gdt descriptorTable @@ -180,6 +183,14 @@ func (c *CPU) FaultAddr() uintptr { return c.faultAddr } +func (c *CPU) EnableVMCALL() { + c.enableVMCALL = 1 +} + +func (c *CPU) DisableVMCALL() { + c.enableVMCALL = 0 +} + // SwitchArchOpts are embedded in SwitchOpts. 
type SwitchArchOpts struct { // UserPCID indicates that the application PCID to be used on switch, diff --git a/pkg/ring0/entry_amd64.go b/pkg/ring0/entry_amd64.go index 399d72d95a..205a1aa20d 100644 --- a/pkg/ring0/entry_amd64.go +++ b/pkg/ring0/entry_amd64.go @@ -117,6 +117,7 @@ func machineCheck() func simdFloatingPointException() func virtualizationException() func securityException() +func oomException() func syscallInt80() // These returns the start address of the functions above. @@ -145,6 +146,7 @@ func addrOfMachineCheck() uintptr func addrOfSimdFloatingPointException() uintptr func addrOfVirtualizationException() uintptr func addrOfSecurityException() uintptr +func addrOfOOMException() uintptr func addrOfSyscallInt80() uintptr // Exception handler index. @@ -170,5 +172,6 @@ var handlers = map[Vector]uintptr{ SIMDFloatingPointException: addrOfSimdFloatingPointException(), VirtualizationException: addrOfVirtualizationException(), SecurityException: addrOfSecurityException(), + OOMException: addrOfOOMException(), SyscallInt80: addrOfSyscallInt80(), } diff --git a/pkg/ring0/entry_amd64.s b/pkg/ring0/entry_amd64.s index a804b5d694..9467aacfd6 100644 --- a/pkg/ring0/entry_amd64.s +++ b/pkg/ring0/entry_amd64.s @@ -33,6 +33,7 @@ #define ENTRY_STACK_TOP 264 // +checkoffset . kernelEntry.stackTop #define ENTRY_CPU_SELF 272 // +checkoffset . kernelEntry.cpuSelf #define ENTRY_KERNEL_CR3 280 // +checkoffset . kernelEntry.kernelCR3 +#define ENTRY_ENABLE_VMCALL 288 // +checkoffset . kernelEntry.enableVMCALL // Bits. #define _RFLAGS_IF 512 // +checkconst . _RFLAGS_IF @@ -61,9 +62,16 @@ #define SIMDFloatingPointException 19 // +checkconst . SIMDFloatingPointException #define VirtualizationException 20 // +checkconst . VirtualizationException #define SecurityException 30 // +checkconst . SecurityException +#define OOMException 32 // +checkconst . OOMException #define SyscallInt80 128 // +checkconst . SyscallInt80 #define Syscall 256 // +checkconst . 
Syscall +#define SyscallExit 60 // +checkconst . SyscallExit +#define SyscallExitGroup 231 // +checkconst . SyscallExitGroup +#define SyscallRedPill 4294967295 // +checkconst . SyscallRedPill + +#define CPUIntel 0 // +checkconst . CPUIntel + #define PTRACE_R15 0 // +checkoffset linux PtraceRegs.R15 #define PTRACE_R14 8 // +checkoffset linux PtraceRegs.R14 #define PTRACE_R13 16 // +checkoffset linux PtraceRegs.R13 @@ -160,6 +168,15 @@ #define LOAD_KERNEL_STACK(entry) \ MOVQ ENTRY_STACK_TOP(entry), SP; +// VMCALL do vmcall/vmmcal instruction +#define VMCALL() \ + CMPQ ·CPUVendor(SB), $CPUIntel; \ + JE 2(PC); \ + JMP 5(PC); \ // vmmcall and vmcall will be treated as 3 independent instructions + BYTE $0x0F; BYTE $0x01; BYTE $0xC1; \ + JMP 4(PC); \ // vmmcall and vmcall will be treated as 3 independent instructions + BYTE $0x0F; BYTE $0x01; BYTE $0xD9; + // ADDR_OF_FUNC defines a function named 'name' that returns the address of // 'symbol'. #define ADDR_OF_FUNC(name, symbol) \ @@ -488,6 +505,45 @@ sysenter_skip_gs: RET kernel: + // Handle any syscalls from GR0 in HR3 when EnableVMCALL is false. + // Currently there are 2 use cases: + // 1. Using KVM platform. + // 2. Upgrading SlimVM platform. This is one such method to return M to + // user mode (HR3) for upgrading platform. 
+ CMPQ ENTRY_ENABLE_VMCALL(GS), $0 + JE hr3_do_syscall + + CMPQ AX, $SyscallRedPill + JE hr3_do_syscall + + CMPQ AX, $SyscallExit + JE hr3_do_syscall + + CMPQ AX, $SyscallExitGroup + JE hr3_do_syscall + +vmcall: + // handle syscall from GR0 in host kernel + // copy from "handle system calls from G0" part of __dune_syscall in libdune/dune.S + PUSHQ R11 + POPFQ + + CMPQ AX, $158 // arch_prctl syscall + JNE 3(PC) + CMPQ DI, $0x1002 //ARCH_SET_FS + JE arch_prctl_vmcall + + VMCALL() + JMP *CX + +arch_prctl_vmcall: + VMCALL() + CMPQ AX, $0 + JNE 2(PC) + MOVQ SI, CPU_REGISTERS+PTRACE_FS_BASE(GS) + JMP *CX + +hr3_do_syscall: // We can't restore the original stack, but we can access the registers // in the CPU state directly. No need for temporary juggling. MOVQ AX, ENTRY_SCRATCH0(GS) @@ -705,4 +761,5 @@ EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB), ·addrOfMachineCheck(S EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB), ·addrOfSimdFloatingPointException(SB)) EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB), ·addrOfVirtualizationException(SB)) EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB), ·addrOfSecurityException(SB)) +EXCEPTION_WITH_ERROR(OOMException, ·oomException(SB), ·addrOfOOMException(SB)) EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB), ·addrOfSyscallInt80(SB)) diff --git a/pkg/ring0/entry_arm64.s b/pkg/ring0/entry_arm64.s index cce7862303..2057ef0b62 100644 --- a/pkg/ring0/entry_arm64.s +++ b/pkg/ring0/entry_arm64.s @@ -64,6 +64,7 @@ #define El0ErrNMI 34 // +checkconst . El0ErrNMI #define PageFault 23 // +checkconst . PageFault #define Syscall 22 // +checkconst . Syscall +#define OOMException 10 // +checkconst . OOMException #define VirtualizationException 35 // +checkconst . 
VirtualizationException #define PTRACE_REGS 0 // +checkoffset linux PtraceRegs.Regs @@ -765,7 +766,10 @@ TEXT ·El0_irq(SB),NOSPLIT,$0 B ·Shutdown(SB) TEXT ·El0_fiq(SB),NOSPLIT,$0 - B ·Shutdown(SB) + KERNEL_ENTRY_FROM_EL0 + MOVD $0x8400000a, R8 + HVC $0 + EXCEPTION_EL0(OOMException) TEXT ·El0_error(SB),NOSPLIT,$0 KERNEL_ENTRY_FROM_EL0 diff --git a/pkg/ring0/kernel_amd64.go b/pkg/ring0/kernel_amd64.go index aa6ac8eb4c..f4b0587d99 100644 --- a/pkg/ring0/kernel_amd64.go +++ b/pkg/ring0/kernel_amd64.go @@ -324,3 +324,30 @@ func startGo(c *CPU) { func ReadCR2() uintptr { return readCR2() } + +//go:noinline +//go:nosplit +func (c *CPU) PrefaultIDT() uint32 { + return c.kernel.globalIDT[0].bits[0] + c.kernel.globalIDT[_NR_INTERRUPTS-1].bits[3] +} + +// SetCPUIDFaulting sets CPUID faulting per the boolean value. +// +// True is returned if faulting could be set. +// +//go:nosplit +func SetCPUIDFaulting(on bool) bool { + // Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support + // for CPUID faulting, and we enable and disable via the MISC_FEATURES MSR. + if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT != 0 { + features := rdmsr(_MSR_MISC_FEATURES) + if on { + features |= _MISC_FEATURE_CPUID_TRAP + } else { + features &^= _MISC_FEATURE_CPUID_TRAP + } + wrmsr(_MSR_MISC_FEATURES, features) + return true // Setting successful. + } + return false +} diff --git a/pkg/ring0/lib_amd64.go b/pkg/ring0/lib_amd64.go index d683d8bfbb..f9a3139328 100644 --- a/pkg/ring0/lib_amd64.go +++ b/pkg/ring0/lib_amd64.go @@ -128,3 +128,21 @@ func InitDefault() { cpuid.Initialize() Init(cpuid.HostFeatureSet()) } + +// DisableLA57 forces ring0 to behave as if the host CPU did not advertise +// 5-level paging: hasLA57 is cleared so CR4.LA57 stays 0, and the address- +// space sizes are clamped to a 4-level layout (48-bit VA, 2^47 userspace). +// +// Must be called after Init/InitDefault and before any vCPU loads CR4 or +// any PageTables are created. 
Use this if the platform's hardware- +// virtualization layer cannot follow a 5-level page table walk (e.g. an +// EPT implementation limited to 4 levels) regardless of host CPUID. +func DisableLA57() { + hasLA57 = false + if VirtualAddressBits > 48 { + VirtualAddressBits = 48 + UserspaceSize = uintptr(1) << (VirtualAddressBits - 1) + MaximumUserAddress = (UserspaceSize - 1) &^ uintptr(hostarch.PageSize-1) + KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) + } +} diff --git a/pkg/ring0/pagetables/BUILD b/pkg/ring0/pagetables/BUILD index 2137a9520b..384d975c50 100644 --- a/pkg/ring0/pagetables/BUILD +++ b/pkg/ring0/pagetables/BUILD @@ -76,6 +76,7 @@ go_library( visibility = [ "//pkg/ring0:__subpackages__", "//pkg/sentry/platform/kvm:__subpackages__", + "//pkg/sentry/platform/slimvm:__subpackages__", ], deps = [ "//pkg/cpuid", diff --git a/pkg/ring0/pagetables/pagetables.go b/pkg/ring0/pagetables/pagetables.go index 04f9195d71..1d2b96c139 100644 --- a/pkg/ring0/pagetables/pagetables.go +++ b/pkg/ring0/pagetables/pagetables.go @@ -331,3 +331,17 @@ func (p *PageTables) Lookup(addr hostarch.Addr, findFirst bool) (virtual hostarc func (p *PageTables) MarkReadOnlyShared() { p.readOnlyShared = true } + +// PrefaultRootTable touches the root table page to be sure that its physical +// page is mapped. The runtime allocator backs PTEs with plain Go heap pages +// (new(PTEs), no mlock / MAP_POPULATE / memfile pinning), so Linux can +// reclaim the root page under memory pressure. Touching it from sentry +// context right before SwitchToUser guarantees the page is resident when +// iret/sysret loads CR3, avoiding rare host page faults that have been +// observed to manifest as vCPU bounce stalls (state=7, userExits stuck). 
+// +//go:nosplit +//go:noinline +func (p *PageTables) PrefaultRootTable() PTE { + return p.root[0] +} diff --git a/pkg/ring0/pagetables/pagetables_amd64.go b/pkg/ring0/pagetables/pagetables_amd64.go index e33a63b8c4..d4d8058a0f 100644 --- a/pkg/ring0/pagetables/pagetables_amd64.go +++ b/pkg/ring0/pagetables/pagetables_amd64.go @@ -25,8 +25,22 @@ var ( pgdShift = 39 pgdMask uintptr = 0x1ff << pgdShift pgdSize uintptr = 1 << pgdShift + + // la57Enabled gates whether InitArch promotes new PageTables to a + // 5-level layout when the host CPU advertises LA57. Defaults to true + // so existing behavior is unchanged; platforms whose hardware- + // virtualization layer cannot walk a 5-level page table call + // DisableLA57 once at startup to force 4-level. + la57Enabled = true ) +// DisableLA57 forces all subsequently-created PageTables to use a 4-level +// layout regardless of host CPUID. Must be called before any PageTables is +// created. +func DisableLA57() { + la57Enabled = false +} + const ( pteShift = 12 pmdShift = 21 @@ -54,7 +68,7 @@ const ( //go:nosplit func (p *PageTables) InitArch(allocator Allocator) { featureSet := cpuid.HostFeatureSet() - if featureSet.HasFeature(cpuid.X86FeatureLA57) { + if la57Enabled && featureSet.HasFeature(cpuid.X86FeatureLA57) { p.largeAddressesEnabled = true lowerTop = 0x00FFFFFFFFFFFFFF upperBottom = 0xFF00000000000000 diff --git a/pkg/ring0/x86.go b/pkg/ring0/x86.go index ad1b9f4893..0a0f66503e 100644 --- a/pkg/ring0/x86.go +++ b/pkg/ring0/x86.go @@ -53,10 +53,16 @@ const ( _EFER_LMA = 0x400 _EFER_NX = 0x800 - _MSR_STAR = 0xc0000081 - _MSR_LSTAR = 0xc0000082 - _MSR_CSTAR = 0xc0000083 - _MSR_SYSCALL_MASK = 0xc0000084 + _MSR_STAR = 0xc0000081 + _MSR_LSTAR = 0xc0000082 + _MSR_CSTAR = 0xc0000083 + _MSR_SYSCALL_MASK = 0xc0000084 + _MSR_PLATFORM_INFO = 0xce + _MSR_MISC_FEATURES = 0x140 + + _PLATFORM_INFO_CPUID_FAULT = 1 << 31 + + _MISC_FEATURE_CPUID_TRAP = 0x1 ) const ( @@ -130,6 +136,7 @@ const ( SIMDFloatingPointException 
VirtualizationException SecurityException = 0x1e + OOMException = 0x20 SyscallInt80 = 0x80 _NR_INTERRUPTS = 0x100 ) @@ -139,6 +146,13 @@ const ( Syscall Vector = _NR_INTERRUPTS ) +// System call number +const ( + SyscallExit uint32 = 0x3c + SyscallExitGroup uint32 = 0xe7 + SyscallRedPill uint32 = ^uint32(0) +) + // Selector is a segment Selector. type Selector uint16 diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index eb266fc67e..d47c3d5e5e 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -262,6 +262,15 @@ func (app *runApp) execute(t *Task) taskRunState { // loop to figure out why. return (*runApp)(nil) + case platform.ErrContextOOM: + // The SlimVM kernel module injects OOMException (vector 32) + // when EPT memory allocation fails. A proper OOM killer that + // selects a container/process to reclaim memory is not yet + // implemented. Fail loudly so the issue is visible. + t.Warningf("OOM event received but OOM handler is not implemented") + t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGKILL)) + return (*runExit)(nil) + case platform.ErrContextSignal: // Looks like a signal has been delivered to us. If it's a synchronous // signal (SEGV, SIGBUS, etc.), it should be sent to the application diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index db5d4a00a4..2670b18d47 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -326,6 +326,10 @@ var ( // Context was interrupted by a call to Context.Interrupt(). ErrContextInterrupt = fmt.Errorf("interrupted by platform.Context.Interrupt()") + // ErrContextOOM is returned by Context.Switch() to indicate that the + // OOM is invoked. + ErrContextOOM = fmt.Errorf("interrupted by OOM") + // ErrContextCPUPreempted is returned by Context.Switch() to indicate that // one of the following occurred: // @@ -592,6 +596,10 @@ type Options struct { // as CPU numbers in the sentry. 
This is necessary to support features like // rseq UseCPUNums bool + + // SandboxID is the sandbox identifier, used by slimvm to pass to the + // host kernel module for sandbox identification. + SandboxID string } // Constructor represents a platform type. diff --git a/pkg/sentry/platform/platforms/platforms.go b/pkg/sentry/platform/platforms/platforms.go index 5e3f161552..f52ea6c0d1 100644 --- a/pkg/sentry/platform/platforms/platforms.go +++ b/pkg/sentry/platform/platforms/platforms.go @@ -21,6 +21,7 @@ package platforms import ( // Import platforms that runsc might use. _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + _ "gvisor.dev/gvisor/pkg/sentry/platform/slimvm" _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" _ "gvisor.dev/gvisor/pkg/sentry/platform/systrap" ) diff --git a/pkg/sentry/platform/slimvm/BUILD b/pkg/sentry/platform/slimvm/BUILD new file mode 100644 index 0000000000..1b20021c35 --- /dev/null +++ b/pkg/sentry/platform/slimvm/BUILD @@ -0,0 +1,80 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) # Apache 2.0 + +go_library( + name = "slimvm", + srcs = [ + "address_space.go", + "address_space_x86.go", + "allocator.go", + "bluepill.go", + "bluepill_amd64.go", + "bluepill_amd64.s", + "bluepill_amd64_unsafe.go", + "bluepill_fault.go", + "bluepill_unsafe.go", + "context.go", + "filters_amd64.go", + "slimvm.go", + "slimvm_amd64.go", + "slimvm_amd64_unsafe.go", + "slimvm_const.go", + "slimvm_const_amd64.go", + "machine.go", + "machine_amd64.go", + "machine_amd64_unsafe.go", + "machine_unsafe.go", + "pcids_x86.go", + "physical_map.go", + "thread_control.go", + "physical_map_amd64.go", + "virtual_map.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/atomicbitops", + "//pkg/context", + "//pkg/cpuid", + "//pkg/fd", + "//pkg/hostarch", + "//pkg/log", + "//pkg/seccomp", + "//pkg/sentry/arch", + "//pkg/sentry/arch/fpu", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + 
"//pkg/sentry/platform/interrupt", + "//pkg/sighandling", + "//pkg/spinlock", + "//pkg/hosttid", + "//pkg/ring0", + "//pkg/ring0/pagetables", + "//pkg/sentry/time", + "//pkg/sync", + "//pkg/usermem", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "slimvm_test", + srcs = [ + "slimvm_test.go", + "virtual_map_test.go", + ], + embed = [":slimvm"], + tags = [ + "nogotsan", + "requires-slimvm", + ], + deps = [ + "//pkg/sentry/arch", + "//pkg/sentry/platform", + "//pkg/sentry/platform/slimvm/testutil", + "//pkg/ring0", + "//pkg/ring0/pagetables", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/platform/slimvm/address_space.go b/pkg/sentry/platform/slimvm/address_space.go new file mode 100644 index 0000000000..9e94f4a39b --- /dev/null +++ b/pkg/sentry/platform/slimvm/address_space.go @@ -0,0 +1,215 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/ring0/pagetables" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/platform" +) + +type vCPUBitArray [(_SLIMVM_NR_VCPUS + 63) / 64]atomicbitops.Uint64 +type vCPUBitArrayLocal [(_SLIMVM_NR_VCPUS + 63) / 64]uint64 + +// dirtySet tracks vCPUs for invalidation. +type dirtySet struct { + vCPUs vCPUBitArray +} + +// forEach iterates over all CPUs in the dirty set. 
+func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) { + var localSet vCPUBitArrayLocal + for index := 0; index < len(ds.vCPUs); index++ { + // Clear the dirty set, copy to the local one. + localSet[index] = ds.vCPUs[index].Swap(0) + } + + m.mu.RLock() + defer m.mu.RUnlock() + + for _, c := range m.vCPUs { + index := uint64(c.id) / 64 + bit := uint64(1) << uint(c.id%64) + + // Call the function if it was set. + if localSet[index]&bit != 0 { + fn(c) + } + } +} + +// mark marks the given vCPU as dirty and returns whether it was previously +// clean. Being previously clean implies that a flush is needed on entry. +func (ds *dirtySet) mark(c *vCPU) bool { + index := uint64(c.id) / 64 + bit := uint64(1) << uint(c.id%64) + + oldValue := ds.vCPUs[index].Load() + if oldValue&bit != 0 { + return false // Not clean. + } + + // Set the bit unilaterally, and ensure that a flush takes place. Note + // that it's possible for races to occur here, but since the flush is + // taking place long after these lines there's no race in practice. + atomicbitops.OrUint64(&ds.vCPUs[index], bit) + return true // Previously clean. +} + +// addressSpace is a wrapper for PageTables. +type addressSpace struct { + platform.NoAddressSpaceIO + + // mu is the lock for modifications to the address space. + // + // Note that the page tables themselves are not locked. + mu sync.Mutex + + // machine is the underlying machine. + machine *machine + + // pageTables are for this particular address space. + pageTables *pagetables.PageTables + + // dirtySet is the set of dirty vCPUs. + dirtySet dirtySet + + // pcid associated with this address space. + pcid uint16 +} + +// invalidate is the implementation for Invalidate. +func (as *addressSpace) invalidate() { + as.dirtySet.forEach(as.machine, func(c *vCPU) { + if c.active.get() == as { // If this happens to be active, + c.BounceToKernel() // ... force a kernel transition. + } + }) +} + +// Invalidate interrupts all dirty contexts. 
+func (as *addressSpace) Invalidate() { + as.mu.Lock() + defer as.mu.Unlock() + as.invalidate() +} + +// Touch adds the given vCPU to the dirty list. +// +// The return value indicates whether a flush is required. +func (as *addressSpace) Touch(c *vCPU) bool { + return as.dirtySet.mark(c) +} + +type hostMapEntry struct { + addr uintptr + length uintptr +} + +func (as *addressSpace) mapHost(addr hostarch.Addr, m hostMapEntry, at hostarch.AccessType) (inv bool) { + for m.length > 0 { + physical, length, ok := translateToPhysical(m.addr) + if !ok { + panic("unable to translate segment") + } + if length > m.length { + length = m.length + } + + inv = as.pageTables.Map(addr, length, pagetables.MapOpts{ + AccessType: at, + User: true, + }, physical) || inv + m.addr += length + m.length -= length + addr += hostarch.Addr(length) + } + + return inv +} + +// MapFile implements platform.AddressSpace.MapFile. +func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error { + as.mu.Lock() + defer as.mu.Unlock() + + // Get mappings in the sentry's address space, which are guaranteed to be + // valid as long as a reference is held on the mapped pages (which is in + // turn required by AddressSpace.MapFile precondition). + // + // If precommit is true, we will touch mappings to commit them, so ensure + // that mappings are readable from sentry context. + // + // We don't execute from application file-mapped memory, and guest page + // tables don't care if we have execute permission (but they do need pages + // to be readable). + bs, err := f.MapInternal(fr, hostarch.AccessType{ + Read: at.Read || at.Execute || precommit, + Write: at.Write, + }) + if err != nil { + return err + } + + // Map the mappings in the sentry's address space (guest physical memory) + // into the application's address space (guest virtual memory). 
+ inv := false + for !bs.IsEmpty() { + b := bs.Head() + bs = bs.Tail() + // Since fr was page-aligned, b should also be page-aligned. We do the + // lookup in our host page tables for this translation. + if precommit { + s := b.ToSlice() + for i := 0; i < len(s); i += hostarch.PageSize { + _ = s[i] // Touch to commit. + } + } + prev := as.mapHost(addr, hostMapEntry{ + addr: b.Addr(), + length: uintptr(b.Len()), + }, at) + inv = inv || prev + addr += hostarch.Addr(b.Len()) + } + if inv { + as.invalidate() + } + + return nil +} + +// Unmap unmaps the given range by calling pagetables.PageTables.Unmap. +func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) { + as.mu.Lock() + defer as.mu.Unlock() + + if as.pageTables.Unmap(addr, uintptr(length)) { + as.invalidate() + + // Recycle any freed intermediate pages. + as.pageTables.Allocator.Recycle() + } +} + +// PreFork implements platform.AddressSpace.PreFork. +func (as *addressSpace) PreFork() {} + +// PostFork implements platform.AddressSpace.PostFork. +func (as *addressSpace) PostFork() {} diff --git a/pkg/sentry/platform/slimvm/address_space_x86.go b/pkg/sentry/platform/slimvm/address_space_x86.go new file mode 100644 index 0000000000..fe4ef707c5 --- /dev/null +++ b/pkg/sentry/platform/slimvm/address_space_x86.go @@ -0,0 +1,29 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build amd64 +// +build amd64 + +package slimvm + +// Release releases the page tables. +func (as *addressSpace) Release() { + as.Unmap(0, ^uint64(0)) + + // Free all pages from the allocator. + as.pageTables.Allocator.(allocator).base.Drain() + + // Drop all cached machine references. + as.machine.dropPageTables(as.pcid) +} diff --git a/pkg/sentry/platform/slimvm/allocator.go b/pkg/sentry/platform/slimvm/allocator.go new file mode 100644 index 0000000000..e604aae924 --- /dev/null +++ b/pkg/sentry/platform/slimvm/allocator.go @@ -0,0 +1,76 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/ring0/pagetables" +) + +type allocator struct { + base *pagetables.RuntimeAllocator +} + +// newAllocator is used to define the allocator. +func newAllocator() allocator { + return allocator{ + base: pagetables.NewRuntimeAllocator(), + } +} + +// NewPTEs implements pagetables.Allocator.NewPTEs. +// +//go:nosplit +func (a allocator) NewPTEs() *pagetables.PTEs { + return a.base.NewPTEs() +} + +// PhysicalFor returns the physical address for a set of PTEs. 
+// +//go:nosplit +func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr { + virtual := a.base.PhysicalFor(ptes) + physical, _, ok := translateToPhysical(virtual) + if !ok { + panic(fmt.Sprintf("PhysicalFor failed for %p", ptes)) + } + return physical +} + +// LookupPTEs implements pagetables.Allocator.LookupPTEs. +// +//go:nosplit +func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs { + virtualStart, physicalStart, _, ok := calculateBluepillFault(physical) + if !ok { + panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) + } + return a.base.LookupPTEs(virtualStart + (physical - physicalStart)) +} + +// FreePTEs implements pagetables.Allocator.FreePTEs. +// +//go:nosplit +func (a allocator) FreePTEs(ptes *pagetables.PTEs) { + a.base.FreePTEs(ptes) +} + +// Recycle implements pagetables.Allocator.Recycle. +// +//go:nosplit +func (a allocator) Recycle() { + a.base.Recycle() +} diff --git a/pkg/sentry/platform/slimvm/bluepill.go b/pkg/sentry/platform/slimvm/bluepill.go new file mode 100644 index 0000000000..1d4f056fb2 --- /dev/null +++ b/pkg/sentry/platform/slimvm/bluepill.go @@ -0,0 +1,72 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +import ( + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/sighandling" +) + +var ( + // bounceSignal is the signal used for bouncing KVM. 
+ // + // We use SIGCHLD because it is not masked by the runtime, and + // it will be ignored properly by other parts of the kernel. + bounceSignal = syscall.SIGCHLD + + // bounceSignalMask has only bounceSignal set. + bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1)) + + // bounce is the interrupt vector used to return to the kernel. + bounce = uint32(ring0.VirtualizationException) + + // guestOOM is the private interrupt vector used to process OOM. + guestOOM = uint32(ring0.OOMException) + + // savedHandler is a pointer to the previous handler. + // + // This is called by bluepillHandler. + savedHandler uintptr +) + +// bluepill enters guest mode. +func bluepill(*vCPU) + +// sighandler is the signal entry point. +func sighandler() + +// Return the start address of the functions above. +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func addrOfSighandler() uintptr + +// redpill on amd64/arm64 invokes a syscall with -1. +// +//go:nosplit +func redpill() { + syscall.RawSyscall(^uintptr(0), 0, 0, 0) +} + +func saveSignalHandler() { + // Install the handler. + if err := sighandling.ReplaceSignalHandler(bluepillSignal, addrOfSighandler(), &savedHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) + } +} diff --git a/pkg/sentry/platform/slimvm/bluepill_amd64.go b/pkg/sentry/platform/slimvm/bluepill_amd64.go new file mode 100644 index 0000000000..370835a8a2 --- /dev/null +++ b/pkg/sentry/platform/slimvm/bluepill_amd64.go @@ -0,0 +1,135 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build amd64 +// +build amd64 + +package slimvm + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +var ( + // The action for bluepillSignal is changed by sigaction(). + bluepillSignal = syscall.SIGSEGV +) + +// bluepillArchEnter is called during bluepillEnter. +// +//go:nosplit +func bluepillArchEnter(context *arch.SignalContext64) *vCPU { + c := vCPUPtr(uintptr(context.Rax)) + regs := c.CPU.Registers() + regs.R8 = context.R8 + regs.R9 = context.R9 + regs.R10 = context.R10 + regs.R11 = context.R11 + regs.R12 = context.R12 + regs.R13 = context.R13 + regs.R14 = context.R14 + regs.R15 = context.R15 + regs.Rdi = context.Rdi + regs.Rsi = context.Rsi + regs.Rbp = context.Rbp + regs.Rbx = context.Rbx + regs.Rdx = context.Rdx + regs.Rax = context.Rax + regs.Rcx = context.Rcx + regs.Rsp = context.Rsp + regs.Rip = context.Rip + regs.Eflags = context.Eflags + regs.Eflags &^= uint64(ring0.KernelFlagsClear) + regs.Eflags |= ring0.KernelFlagsSet + regs.Cs = uint64(ring0.Kcode) + regs.Ds = uint64(ring0.Udata) + regs.Es = uint64(ring0.Udata) + regs.Ss = uint64(ring0.Kdata) + + return c +} + +// KernelSyscall handles kernel syscalls. +// +//go:nosplit +func (c *vCPU) KernelSyscall() { + regs := c.Registers() + if regs.Rax != ^uint64(0) { + regs.Rip -= 2 // Rewind. + } + // Syscall/exception handling in SlimVM: + // + // When enableVMCALL is set (normal SlimVM operation), most syscalls + // from GR0 (guest ring 0, i.e. 
the sentry) are forwarded to the host + // kernel directly via VMCALL/VMMCALL in entry_amd64.s, without exiting + // to HR3. Only special syscalls (exit, exit_group, and + // RedPill/0xFFFFFFFF) fall through to the HLT path below. + // + // When enableVMCALL is not set, all syscalls go through the HLT path + // (same as KVM, see kvm/bluepill_amd64.go for details). This happens + // in two cases: + // - KVM platform: enableVMCALL is never set. + // - SlimVM platform upgrade: DisableVMCALL() is called temporarily to + // return the M (machine thread) back to HR3. + ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. +} + +// KernelException handles kernel exceptions. +// +//go:nosplit +func (c *vCPU) KernelException(vector ring0.Vector) { + regs := c.Registers() + if vector == ring0.Vector(bounce) { + // This go-routine was saved in hr3 and resumed in gr0 with the + // userspace flags. Let's adjust flags and skip the interrupt. + regs.Eflags &^= uint64(ring0.KernelFlagsClear) + regs.Eflags |= ring0.KernelFlagsSet + return + } + // See above. + ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. +} + +// bluepillArchExit is called during bluepillEnter. +// +//go:nosplit +func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { + regs := c.CPU.Registers() + context.R8 = regs.R8 + context.R9 = regs.R9 + context.R10 = regs.R10 + context.R11 = regs.R11 + context.R12 = regs.R12 + context.R13 = regs.R13 + context.R14 = regs.R14 + context.R15 = regs.R15 + context.Rdi = regs.Rdi + context.Rsi = regs.Rsi + context.Rbp = regs.Rbp + context.Rbx = regs.Rbx + context.Rdx = regs.Rdx + context.Rax = regs.Rax + context.Rcx = regs.Rcx + context.Rsp = regs.Rsp + context.Rip = regs.Rip + context.Eflags = regs.Eflags + + // Set the context pointer to the saved floating point state. This is + // where the guest data has been serialized, the kernel will restore + // from this new pointer value. 
+ context.Fpstate = uint64(uintptrValue(c.FloatingPointState().BytePointer())) // escapes: no. +} diff --git a/pkg/sentry/platform/slimvm/bluepill_amd64.s b/pkg/sentry/platform/slimvm/bluepill_amd64.s new file mode 100644 index 0000000000..8ce2a5c2a2 --- /dev/null +++ b/pkg/sentry/platform/slimvm/bluepill_amd64.s @@ -0,0 +1,101 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// VCPU_CPU is the location of the CPU in the vCPU struct. +// +// This is guaranteed to be zero. +#define VCPU_CPU 0x0 + +// ENTRY_CPU_SELF is the location of the CPU in the entry struct. +// +// This is sourced from ring0. +#define ENTRY_CPU_SELF 272 // +checkoffset ring0 kernelEntry.cpuSelf + +// Context offsets. +// +// Only limited use of the context is done in the assembly stub below, most is +// done in the Go handlers. However, the RIP must be examined. +#define CONTEXT_RAX 0x90 +#define CONTEXT_RIP 0xa8 +#define CONTEXT_FP 0xe0 + +// CLI is the literal byte for the disable interrupts instruction. +// +// This is checked as the source of the fault. +#define CLI $0xfa + +// See bluepill.go. +TEXT ·bluepill(SB),NOSPLIT|NOFRAME,$0 +begin: + MOVQ arg+0(FP), AX + LEAQ VCPU_CPU(AX), BX + + // The gorountine stack will be changed in guest which renders + // the frame pointer outdated and misleads perf tools. 
+ // Disconnect the frame-chain with the zeroed frame pointer + // when it is saved in the frame in bluepillHandler(). + MOVQ BP, CX + MOVQ $0, BP + BYTE CLI; + MOVQ CX, BP +check_vcpu: + MOVQ ENTRY_CPU_SELF(GS), CX + CMPQ BX, CX + JE right_vCPU +wrong_vcpu: + CALL ·redpill(SB) + JMP begin +right_vCPU: + RET + +// sighandler: see bluepill.go for documentation. +// +// The arguments are the following: +// +// DI - The signal number. +// SI - Pointer to siginfo_t structure. +// DX - Pointer to ucontext structure. +// +TEXT ·sighandler(SB),NOSPLIT|NOFRAME,$0 + // Check if the signal is from the kernel. + MOVQ $0x80, CX + CMPL CX, 0x8(SI) + JNE fallback + + // Check if RIP is disable interrupts. + MOVQ CONTEXT_RIP(DX), CX + CMPQ CX, $0x0 + JE fallback + CMPB 0(CX), CLI + JNE fallback + + // Call the bluepillHandler. + PUSHQ DX // First argument (context). + CALL ·bluepillHandler(SB) // Call the handler. + POPQ DX // Discard the argument. + RET + +fallback: + // Jump to the previous signal handler. + XORQ CX, CX + MOVQ ·savedHandler(SB), AX + JMP AX + +// func addrOfSighandler() uintptr +TEXT ·addrOfSighandler(SB), $0-8 + MOVQ $·sighandler(SB), AX + MOVQ AX, ret+0(FP) + RET diff --git a/pkg/sentry/platform/slimvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/slimvm/bluepill_amd64_unsafe.go new file mode 100644 index 0000000000..645e1f1343 --- /dev/null +++ b/pkg/sentry/platform/slimvm/bluepill_amd64_unsafe.go @@ -0,0 +1,33 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build amd64 +// +build amd64 + +package slimvm + +import ( + "unsafe" + + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +//go:nosplit +func breakpoint(regs *arch.Registers) { + const INT3 = 0xcc + rip := regs.Rip - 1 + if *(*uint8)(unsafe.Pointer(uintptr(rip))) == INT3 { + regs.Rip = rip + } +} diff --git a/pkg/sentry/platform/slimvm/bluepill_fault.go b/pkg/sentry/platform/slimvm/bluepill_fault.go new file mode 100644 index 0000000000..10c7537edc --- /dev/null +++ b/pkg/sentry/platform/slimvm/bluepill_fault.go @@ -0,0 +1,57 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/hostarch" +) + +// yield yields the CPU. +// +//go:nosplit +func yield() { + syscall.RawSyscall(syscall.SYS_SCHED_YIELD, 0, 0, 0) +} + +// calculateBluepillFault calculates the fault address range. +// +//go:nosplit +func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) { + alignedPhysical := physical &^ uintptr(hostarch.PageSize-1) + for _, pr := range physicalRegions { + end := pr.physical + pr.length + if physical < pr.physical || physical >= end { + continue + } + + // Adjust the block to match our size. 
+ physicalStart = alignedPhysical + if physicalStart < pr.physical { + // Bound the starting point to the start of the region. + physicalStart = pr.physical + } + virtualStart = pr.virtual + (physicalStart - pr.physical) + physicalEnd := physicalStart + pr.length + if physicalEnd > end { + physicalEnd = end + } + length = physicalEnd - physicalStart + return virtualStart, physicalStart, length, true + } + + return 0, 0, 0, false +} diff --git a/pkg/sentry/platform/slimvm/bluepill_unsafe.go b/pkg/sentry/platform/slimvm/bluepill_unsafe.go new file mode 100644 index 0000000000..7f06f914f2 --- /dev/null +++ b/pkg/sentry/platform/slimvm/bluepill_unsafe.go @@ -0,0 +1,293 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +import ( + "syscall" + "unsafe" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +var ( + // Memory reserved for triggering OOM in HR3 when slimvm returns ENOMEM + dummyBytes []byte + + // We will try to recover when OOM happened only if we have at least + // 100MB memory available. + nrDummyBytes = 100 << 20 +) + +//go:linkname throw runtime.throw +func throw(string) + +var hexSyms = []byte("0123456789abcdef") + +// printHex writes title followed by val (as a 16-digit hex number) to stderr +// using only a raw write(2). It is async-signal-safe: it allocates nothing, +// takes no lock, and may run on the signal stack. 
+//
+//go:nosplit
+func printHex(title []byte, val uint64) {
+	// str[0] is a separator space, str[1..16] receive the hex digits (most
+	// significant first) and str[17] the newline; only 18 bytes are written.
+	var str [19]byte
+	str[0] = ' '
+	for i := 0; i < 16; i++ {
+		str[16-i] = hexSyms[val&0xf]
+		val = val >> 4
+	}
+	str[17] = '\n'
+	syscall.RawSyscall(syscall.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title)))
+	syscall.RawSyscall(syscall.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str[0])), 18)
+}
+
+// bluepillDieCleanly reports a fatal condition encountered inside
+// bluepillHandler and terminates the process.
+//
+// It must be used instead of throw() from within the signal handler context:
+// runtime.throw acquires the runtime print lock (debuglock), which is not
+// async-signal-safe and deadlocks when reached from the bluepill SIGSEGV
+// handler (see the BenchmarkApplicationSyscall hang). This helper only issues
+// raw system calls: it writes a diagnostic line to stderr and then exits the
+// whole process with status 99.
+//
+// msg must be non-empty, since its first byte is addressed directly.
+//
+//go:nosplit
+func bluepillDieCleanly(msg []byte, status uint64) {
+	syscall.RawSyscall(syscall.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&msg[0])), uintptr(len(msg)))
+	printHex(dieStatusTitle, status)
+	syscall.RawSyscall(syscall.SYS_EXIT_GROUP, 99, 0, 0)
+}
+
+// dieStatusTitle prefixes the hex status printed by bluepillDieCleanly.
+var dieStatusTitle = []byte("slimvm: fatal in bluepillHandler, status =")
+
+// Messages passed to bluepillDieCleanly. They are package-level byte slices
+// so that the signal handler never has to build a message at failure time.
+var (
+	dieMsgInvalidState     = []byte("slimvm: invalid vCPU state in bluepillHandler\n")
+	dieMsgPendingSignal    = []byte("slimvm: error waiting for pending signal\n")
+	dieMsgUnexpectedSignal = []byte("slimvm: unexpected signal\n")
+	dieMsgNMIInjection     = []byte("slimvm: NMI injection failed\n")
+	dieMsgOOMRelease       = []byte("slimvm: OOM: failed to release dummy bytes\n")
+	dieMsgOOMTime          = []byte("slimvm: OOM: failed to get current time\n")
+	dieMsgOOMRepeat        = []byte("slimvm: OOM: ENOMEM happened more than 5 times on this vCPU in last minute\n")
+	dieMsgRunFailed        = []byte("slimvm: run ioctl failed\n")
+	dieMsgException        = []byte("slimvm: unexpected exception exit\n")
+	dieMsgIO               = []byte("slimvm: unexpected I/O exit\n")
+	dieMsgHypercall        = []byte("slimvm: unexpected hypercall exit\n")
+	dieMsgDebug            = []byte("slimvm: unexpected debug exit\n")
+	dieMsgMMIO             = []byte("slimvm: VM exit MMIO, maybe a physical address is out of range\n")
+	dieMsgShutdown         = []byte("slimvm: unexpected shutdown exit\n")
+	dieMsgFailEntry        = []byte("slimvm: VM entry failed\n")
+	dieMsgMSRWrite         = []byte("slimvm: write msr failed\n")
+	dieMsgUnknown          = []byte("slimvm: unknown VM exit status\n")
+)
+
+// vCPUPtr returns a CPU for the given address.
+//
+//go:nosplit
+func vCPUPtr(addr uintptr) *vCPU {
+	return (*vCPU)(unsafe.Pointer(addr))
+}
+
+// bytePtr returns a bytePtr for the given address.
+//
+//go:nosplit
+func bytePtr(addr uintptr) *byte {
+	return (*byte)(unsafe.Pointer(addr))
+}
+
+// uintptrValue returns a uintptr for the given address.
+//
+//go:nosplit
+func uintptrValue(addr *byte) uintptr {
+	return (uintptr)(unsafe.Pointer(addr))
+}
+
+// bluepillArchContext returns the arch-specific context.
+//
+//go:nosplit
+func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
+	return &((*arch.UContext64)(context).MContext)
+}
+
+// bluepillHandler is called from the signal stub.
+//
+// The world may be stopped while this is executing, and it executes on the
+// signal stack. It should only execute raw system calls and functions that are
+// explicitly marked go:nosplit.
+//
+// The handler loops on the run ioctl until the guest exits with
+// _SLIMVM_EXIT_HLT, at which point the guest registers are copied back into
+// the signal context and the handler returns. Every unrecoverable condition
+// ends the process through bluepillDieCleanly.
+//
+//go:nosplit
+func bluepillHandler(context unsafe.Pointer) {
+	// Sanitize the registers; interrupts must always be disabled.
+	c := bluepillArchEnter(bluepillArchContext(context))
+
+	// Mark this as guest mode.
+	switch c.state.Swap(vCPUGuest | vCPUUser) {
+	case vCPUUser: // Expected case.
+	case vCPUUser | vCPUWaiter:
+		c.notify()
+	default:
+		bluepillDieCleanly(dieMsgInvalidState, uint64(c.state.Load()))
+	}
+
+	for {
+		switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, slimvmFD, _SLIMVM_RUN, uintptr(unsafe.Pointer(&c.vmxConfig))); errno {
+		case 0: // Expected case.
+		case syscall.EINTR:
+			// In SlimVM, bounce signals (SIG_BOUNCE) are consumed by
+			// the kernel module in slimvm_signal_handler() and never
+			// reach userspace. EINTR here is triggered by non-bounce
+			// signals (e.g. SIGPROF).
+			//
+			// Use zero timeout so rt_sigtimedwait is non-blocking:
+			// if there is no pending bounce signal, EAGAIN is returned
+			// and we simply rerun the vCPU.
+			timeout := unix.Timespec{}
+			sig, _, errno := syscall.RawSyscall6(
+				syscall.SYS_RT_SIGTIMEDWAIT,
+				uintptr(unsafe.Pointer(&bounceSignalMask)),
+				0,                                 // siginfo.
+				uintptr(unsafe.Pointer(&timeout)), // zero timeout.
+				8,                                 // sigset size.
+				0, 0)
+			if errno == unix.EAGAIN {
+				continue
+			}
+			if errno != 0 {
+				bluepillDieCleanly(dieMsgPendingSignal, uint64(errno))
+			}
+			if sig != uintptr(bounceSignal) {
+				bluepillDieCleanly(dieMsgUnexpectedSignal, uint64(sig))
+			}
+
+			// Check whether the current state of the vCPU is ready
+			// for interrupt injection. Because we don't have a
+			// PIC, we can't inject an interrupt while they are
+			// masked. We need to request a window if it's not
+			// ready.
+			if c.runData.readyForInterruptInjection == 0 {
+				c.runData.requestInterruptWindow = 1
+				continue // Rerun vCPU.
+			}
+			// Force injection below; the vCPU is ready.
+			//
+			// NOTE(review): this stores into runData.exitReason, but
+			// the dispatch below switches on vmxConfig.status. Confirm
+			// that the forced-injection path is still meant to take
+			// effect.
+			c.runData.exitReason = _SLIMVM_EXIT_IRQ_WINDOW_OPEN
+		case syscall.EFAULT:
+			// If a fault is not serviceable due to the host
+			// backing pages having page permissions, instead of an
+			// MMIO exit we receive EFAULT from the run ioctl. We
+			// always inject an NMI here since we may be in kernel
+			// mode and have interrupts disabled.
+			if _, _, errno := syscall.RawSyscall(
+				syscall.SYS_IOCTL,
+				slimvmFD,
+				_SLIMVM_NMI, uintptr(unsafe.Pointer(&c.vmxConfig.vcpu))); errno != 0 {
+				bluepillDieCleanly(dieMsgNMIInjection, uint64(errno))
+			}
+			continue // Rerun vCPU.
+		case syscall.ENOMEM:
+			// OOM happened. Trigger the OOM killer in HR3 by touching
+			// every page of the reserved buffer.
+			//
+			// The bound is len(dummyBytes), not nrDummyBytes: the
+			// reservation made in init is best-effort and leaves
+			// dummyBytes nil when mmap fails. Indexing it
+			// unconditionally would raise a runtime panic on the
+			// signal stack.
+			for i := 0; i < len(dummyBytes); i += 4096 {
+				dummyBytes[i] = 0xff
+			}
+
+			// We failed to trigger the OOM killer. It's possible that
+			// we have enough memory now. Release the dummy bytes (when
+			// the reservation exists) and try again.
+			if len(dummyBytes) > 0 {
+				if _, _, errno := syscall.RawSyscall(
+					syscall.SYS_MADVISE,
+					uintptr(unsafe.Pointer(&dummyBytes[0])),
+					uintptr(len(dummyBytes)),
+					syscall.MADV_DONTNEED); errno != 0 {
+					bluepillDieCleanly(dieMsgOOMRelease, uint64(errno))
+				}
+			}
+
+			var now syscall.Timeval
+			if _, _, errno := syscall.RawSyscall(
+				syscall.SYS_GETTIMEOFDAY,
+				uintptr(unsafe.Pointer(&now)),
+				0, 0); errno != 0 {
+				bluepillDieCleanly(dieMsgOOMTime, uint64(errno))
+			}
+
+			// Give up once ENOMEM has been seen more than 5 times
+			// within 60 seconds of the first occurrence in the window.
+			c.OOMCount++
+			elapsed := now.Sec - c.OOMLastTS
+			if elapsed > 60 {
+				// Reset the counter if it has expired (more than 60s).
+				c.OOMCount = 1
+				c.OOMLastTS = now.Sec
+			} else if c.OOMCount > 5 {
+				bluepillDieCleanly(dieMsgOOMRepeat, uint64(c.OOMCount))
+			}
+			continue // Rerun vCPU.
+		default:
+			bluepillDieCleanly(dieMsgRunFailed, uint64(errno))
+		}
+
+		switch c.vmxConfig.status {
+		case _SLIMVM_EXIT_EXCEPTION:
+			bluepillDieCleanly(dieMsgException, uint64(c.vmxConfig.status))
+		case _SLIMVM_EXIT_IO:
+			bluepillDieCleanly(dieMsgIO, uint64(c.vmxConfig.status))
+		case _SLIMVM_EXIT_INTERNAL_ERROR:
+			// An internal error is typically thrown when emulation
+			// fails. This can occur via the MMIO path below (and
+			// it might fail because we have multiple regions that
+			// are not mapped). We would actually prefer that no
+			// emulation occur, and don't mind at all if it fails.
+		case _SLIMVM_EXIT_HYPERCALL:
+			bluepillDieCleanly(dieMsgHypercall, uint64(c.vmxConfig.status))
+		case _SLIMVM_EXIT_DEBUG:
+			bluepillDieCleanly(dieMsgDebug, uint64(c.vmxConfig.status))
+		case _SLIMVM_EXIT_HLT:
+			// Increment our counter.
+			c.guestExits.Add(1)
+
+			// Copy out registers.
+			bluepillArchExit(c, bluepillArchContext(context))
+
+			// Return to the vCPUReady state; notify any waiters.
+			user := c.state.Load() & vCPUUser
+			switch c.state.Swap(user) {
+			case user | vCPUGuest: // Expected case.
+			case user | vCPUGuest | vCPUWaiter:
+				c.notify()
+			default:
+				bluepillDieCleanly(dieMsgInvalidState, uint64(c.state.Load()))
+			}
+			return
+		case _SLIMVM_EXIT_MMIO:
+			bluepillDieCleanly(dieMsgMMIO, uint64(c.vmxConfig.status))
+		case _SLIMVM_EXIT_IRQ_WINDOW_OPEN:
+			// NOTE: KVM-residual code. In SlimVM, interrupt injection
+			// (bounce, NMI) is handled entirely by the kernel module
+			// via vmx_inject_bounce()/vmx_inject_nmi() before VM
+			// entry. This exit reason should not occur in practice.
+			// Clear previous injection request.
+			c.runData.requestInterruptWindow = 0
+		case _SLIMVM_EXIT_SHUTDOWN:
+			bluepillDieCleanly(dieMsgShutdown, uint64(c.vmxConfig.status))
+		case _SLIMVM_EXIT_FAIL_ENTRY:
+			bluepillDieCleanly(dieMsgFailEntry, uint64(c.vmxConfig.status))
+		case _SLIMVM_EXIT_INTR:
+			/* Signal Handler */
+		case _SLIMVM_EXIT_MSR_WRITE:
+			bluepillDieCleanly(dieMsgMSRWrite, uint64(c.vmxConfig.status))
+		default:
+			bluepillDieCleanly(dieMsgUnknown, uint64(c.vmxConfig.status))
+		}
+	}
+}
+
+// init reserves the nrDummyBytes buffer that the ENOMEM path of
+// bluepillHandler touches.
+func init() {
+	// The reservation is best-effort, so the mmap error is deliberately
+	// dropped: on failure dummyBytes stays nil and bluepillHandler skips
+	// the touch/release steps instead of indexing an empty slice.
+	dummyBytes, _ = syscall.Mmap(-1, 0, nrDummyBytes, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE)
+}
diff --git a/pkg/sentry/platform/slimvm/context.go b/pkg/sentry/platform/slimvm/context.go
new file mode 100644
index 0000000000..9b97ae464d
--- /dev/null
+++ b/pkg/sentry/platform/slimvm/context.go
@@ -0,0 +1,145 @@
+// Copyright 2026 The gVisor Authors.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + pkgcontext "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" +) + +// context is an implementation of the platform context. +// +// This is a thin wrapper around the machine. +type context struct { + platform.NoCPUNumbers + + // machine is the parent machine, and is immutable. + machine *machine + + // info is the linux.SignalInfo cached for this context. + info linux.SignalInfo + + // interrupt is the interrupt context. + interrupt interrupt.Forwarder +} + +// tryCPUIDError indicates that CPUID emulation should occur. +type tryCPUIDError struct{} + +// Error implements error.Error. +func (tryCPUIDError) Error() string { return "cpuid emulation failed" } + +// Switch runs the provided context in the given address space. +func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac *arch.Context64, _ int32) (*linux.SignalInfo, hostarch.AccessType, error) { + as := mm.AddressSpace() + localAS := as.(*addressSpace) + +restart: + // Grab a vCPU. + cpu := c.machine.Get() + + // Enable interrupts (i.e. calls to vCPU.Notify). + if !c.interrupt.Enable(cpu) { + c.machine.Put(cpu) // Already preempted. 
+ return nil, hostarch.NoAccess, platform.ErrContextInterrupt + } + + // Set the active address space. + // + // This must be done prior to the call to Touch below. If the address + // space is invalidated between this line and the call below, we will + // flag on entry anyways. When the active address space below is + // cleared, it indicates that we don't need an explicit interrupt and + // that the flush can occur naturally on the next user entry. + cpu.active.set(localAS) + + // Prepare switch options. + switchOpts := ring0.SwitchOpts{ + Registers: &ac.StateData().Regs, + FloatingPointState: ac.FloatingPointData(), + PageTables: localAS.pageTables, + Flush: localAS.Touch(cpu), + FullRestore: cpu.FullRestore || ac.FullRestore(), + } + + // Reset the flag. + cpu.FullRestore = false + + // Take the blue pill. + at, err := cpu.SwitchToUser(switchOpts, &c.info) + + // Clear the address space. + cpu.active.set(nil) + + // Increment the number of user exits. + cpu.userExits.Add(1) + + // Release resources. + c.machine.Put(cpu) + + // All done. + c.interrupt.Disable() + + if err != nil { + if _, ok := err.(tryCPUIDError); ok { + // Does emulation work for the CPUID? + if platform.TryCPUIDEmulate(ctx, mm, ac) { + goto restart + } + // If not a valid CPUID, then the signal should be + // delivered as is and the information is filled. + err = platform.ErrContextSignal + } + } + + return &c.info, at, err +} + +// Interrupt interrupts the running context. +func (c *context) Interrupt() { + c.interrupt.NotifyInterrupt() +} + +// Release implements platform.Context.Release(). +func (c *context) Release() {} + +// FullStateChanged implements platform.Context.FullStateChanged. +func (c *context) FullStateChanged() {} + +// PullFullState implements platform.Context.PullFullState. +func (c *context) PullFullState(as platform.AddressSpace, ac *arch.Context64) error { return nil } + +// PrepareSleep implements platform.Context.PrepareSleep. 
+func (*context) PrepareSleep() {}
+
+// PrepareUninterruptibleSleep implements platform.Context.PrepareUninterruptibleSleep.
+func (*context) PrepareUninterruptibleSleep() {}
+
+// PrepareStop implements platform.Context.PrepareStop.
+func (*context) PrepareStop() {}
+
+// Preempt implements platform.Context.Preempt.
+func (*context) Preempt() {}
+
+// PrepareExecve implements platform.Context.PrepareExecve.
+func (*context) PrepareExecve() {}
+
+// PrepareExit implements platform.Context.PrepareExit.
+func (*context) PrepareExit() {}
diff --git a/pkg/sentry/platform/slimvm/filters_amd64.go b/pkg/sentry/platform/slimvm/filters_amd64.go
new file mode 100644
index 0000000000..fcd9d49254
--- /dev/null
+++ b/pkg/sentry/platform/slimvm/filters_amd64.go
@@ -0,0 +1,58 @@
+// Copyright 2026 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package slimvm
+
+import (
+	"golang.org/x/sys/unix"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/seccomp"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+)
+
+// SeccompInfo returns the seccomp syscall rules and hot-syscall list of the SlimVM platform.
+func (*SlimVM) SeccompInfo() platform.SeccompInfo {
+	return platform.StaticSeccompInfo{
+		PlatformName: "slimvm",
+		Filters: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{
+			unix.SYS_ARCH_PRCTL: seccomp.MatchAll{},
+			unix.SYS_FUTEX:      seccomp.MatchAll{},
+			unix.SYS_IOCTL:      seccomp.MatchAll{},
+			unix.SYS_MEMBARRIER: seccomp.PerArg{
+				seccomp.EqualTo(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED),
+				seccomp.EqualTo(0),
+			},
+			unix.SYS_MMAP:            seccomp.MatchAll{},
+			unix.SYS_RT_SIGSUSPEND:   seccomp.MatchAll{},
+			unix.SYS_RT_SIGTIMEDWAIT: seccomp.MatchAll{},
+			0xffffffffffffffff:       seccomp.MatchAll{}, // SlimVM uses syscall -1 to transition to host.
+		}),
+		HotSyscalls: hottestSyscalls(),
+	}
+}
+
+// hottestSyscalls returns the list of hot syscalls for the SlimVM platform.
+func hottestSyscalls() []uintptr {
+	return []uintptr{
+		unix.SYS_FUTEX,
+		unix.SYS_MMAP,
+	}
+}
+
+// PrecompiledSeccompInfo implements
+// platform.Constructor.PrecompiledSeccompInfo.
+func (*constructor) PrecompiledSeccompInfo() []platform.SeccompInfo {
+	return []platform.SeccompInfo{(*SlimVM)(nil).SeccompInfo()}
+}
diff --git a/pkg/sentry/platform/slimvm/machine.go b/pkg/sentry/platform/slimvm/machine.go
new file mode 100644
index 0000000000..5360d095d8
--- /dev/null
+++ b/pkg/sentry/platform/slimvm/machine.go
@@ -0,0 +1,665 @@
+// Copyright 2026 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package slimvm
+
+import (
+	"fmt"
+	"runtime"
+	"sync"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/atomicbitops"
+	"gvisor.dev/gvisor/pkg/hostarch"
+	"gvisor.dev/gvisor/pkg/hosttid"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/ring0"
+	"gvisor.dev/gvisor/pkg/ring0/pagetables"
+	"gvisor.dev/gvisor/pkg/spinlock"
+)
+
+const (
+	vCPUsPoolSize       = 64
+	bitMapBaseLen       = 64
+	vCPUsPoolBitmapSize = (vCPUsPoolSize + bitMapBaseLen - 1) / bitMapBaseLen
+)
+
+// machine contains state associated with the VM as a whole.
+type machine struct {
+	// upperSharedPageTables is the read-only upper half shared by all the pagetables.
+	upperSharedPageTables *pagetables.PageTables
+
+	// kernel is the set of global structures.
+	kernel ring0.Kernel
+
+	// mappingCache is used for mapPhysical.
+	mappingCache sync.Map
+
+	// mu protects vCPUs.
+	mu sync.RWMutex
+
+	// available is notified when vCPUs are available.
+	available sync.Cond
+
+	// vCPUs are the machine vCPUs.
+	//
+	// These are populated dynamically.
+	vCPUs map[uint64]*vCPU
+
+	vCPUsPoolSpin spinlock.Spinlock
+
+	vCPUsPool   [vCPUsPoolSize]*vCPU
+	vCPUSBitmap [vCPUsPoolBitmapSize]uint64
+
+	memoryRegions []userMemoryRegion
+
+	// sandboxID is the sandbox identifier passed to the host kernel module.
+	sandboxID int64
+}
+
+type slimvmConfig struct {
+	userRegs          userRegs
+	sysRegs           systemRegs
+	sandboxID         int64
+	status            int64
+	vcpu              uint64
+	pagefaultPhysical uint64
+	memoryRegionNum   uint64
+	memoryRegionAddr  uintptr
+}
+
+const (
+	// vCPUReady is an alias for all the below clear.
+	vCPUReady uint32 = 0
+
+	// vCPUUser indicates that the vCPU is in or about to enter user mode.
+	vCPUUser uint32 = 1 << 0
+
+	// vCPUGuest indicates the vCPU is in guest mode.
+	vCPUGuest uint32 = 1 << 1
+
+	// vCPUWaiter indicates that there is a waiter.
+	//
+	// If this is set, then notify must be called on any state transitions.
+	vCPUWaiter uint32 = 1 << 2
+)
+
+// vCPU is a single SlimVM vCPU.
+type vCPU struct { + // CPU is the kernel CPU data. + // + // This must be the first element of this structure, it is referenced + // by the bluepill code (see bluepill_amd64.s). + ring0.CPU + + // id is the vCPU id. + id int + + // tid is the last set tid. + tid atomicbitops.Uint64 + + // userExits is the count of user exits. + userExits atomicbitops.Uint64 + + // guestExits is the count of guest to host world switches. + guestExits atomicbitops.Uint64 + + // faults is a count of world faults (informational only). + faults uint32 + + // state is the vCPU state. + // + // This is a bitmask of the three fields (vCPU*) described above. + state atomicbitops.Uint32 + + vmxConfig slimvmConfig + + // runData for this vCPU. + runData *runData + + // machine associated with this vCPU. + machine *machine + + // active is the current addressSpace: this is set and read atomically, + // it is used to elide unnecessary interrupts due to invalidations. + active atomicAddressSpace + + // vCPUArchState is the architecture-specific state. + vCPUArchState + + // active PCIDs on this vCPU. + activePCIDs pcidBitmap + + // CPUID Faulting enable flag. + cpuidFaultingEnable int + + // Count the OOM happened recently. + OOMCount int + OOMLastTS int64 + + // FullRestore indicates whether this vCPU need + // iret-based restore before enter guest. + FullRestore bool +} + +// newVCPU creates a returns a new vCPU. +// +// Precondtion: mu must be held. +func (m *machine) newVCPU() *vCPU { + // TODO: add VCPU limit in sentry side. + // Create the vCPU. + c := &vCPU{ + machine: m, + id: len(m.vCPUs), + } + c.CPU.Init(&m.kernel, c.id, c) + + // SlimVM platform support VMCALL + c.CPU.EnableVMCALL() + + // Ensure the signal mask is correct. + if err := c.setSignalMask(); err != nil { + panic(fmt.Sprintf("error setting signal mask: %v", err)) + } + + // Initialize architecture state. 
+ if err := c.initArchState(); err != nil { + panic(fmt.Sprintf("error initialization vCPU state: %v", err)) + } + + id, _, errno := c.createVCPU(m.memoryRegions) + if errno != 0 { + panic(fmt.Sprintf("error creating new vCPU: %v", errno)) + } + + c.vmxConfig.vcpu = uint64(id) + + c.vmxConfig.sandboxID = m.sandboxID + + return c // Done. +} + +func (m *machine) vCPUsPoolInit() { + for i := 0; i < vCPUsPoolSize; i++ { + m.vCPUsPool[i] = nil + } +} + +// newMachine returns a new VM context. +func newMachine(sandboxID int64) (*machine, error) { + // Create the machine. + m := &machine{ + vCPUs: make(map[uint64]*vCPU), + sandboxID: sandboxID, + } + m.available.L = &m.mu + + // TODO: more general max vcpu number. + m.kernel.Init(_SLIMVM_NR_VCPUS) + + // Create the upper shared pagetables and kernel(sentry) pagetables. + m.upperSharedPageTables = pagetables.New(newAllocator()) + m.mapUpperHalf(m.upperSharedPageTables) + m.upperSharedPageTables.Allocator.(allocator).base.Drain() + m.upperSharedPageTables.MarkReadOnlyShared() + m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress) + + // Apply the physical mappings. Note that these mappings may point to + // guest physical addresses that are not actually available. These + // physical pages are mapped on demand, see kernel_unsafe.go. + applyPhysicalRegions(func(pr physicalRegion) bool { + // Map everything in the lower half. + m.kernel.PageTables.Map( + hostarch.Addr(pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: hostarch.AnyAccess}, + pr.physical) + + m.mapPhysical(pr.physical, pr.length) + + return true // Keep iterating. + }) + + // Initialize architecture state. + if err := m.initArchState(); err != nil { + m.Destroy() + return nil, err + } + + // Ensure the machine is cleaned up properly. 
+ runtime.SetFinalizer(m, (*machine).Destroy) + + return m, nil +} + +func (m *machine) getVCPUFromPool() *vCPU { + m.vCPUsPoolSpin.Lock() + for i := 0; i < vCPUsPoolSize; i += bitMapBaseLen { + bitmap := m.vCPUSBitmap[i/bitMapBaseLen] + if bitmap == 0 { + continue + } + var j int + for j = 0; j < bitMapBaseLen; j++ { + if bitmap&(1< 0 { + c.sendSignal() + } + } +} + +// BounceToKernel ensures that the vCPU bounces back to the kernel. +// +//go:nosplit +func (c *vCPU) BounceToKernel() { + c.bounce(false) +} + +// BounceToHost ensures that the vCPU is in host mode. +// +//go:nosplit +func (c *vCPU) BounceToHost() { + c.bounce(true) +} + +func (m *machine) dumpVCPUStats() { + var ( + nrReady uint + nrUser uint + nrGuest uint + nrGuestUser uint + ) + + m.mu.RLock() + for _, c := range m.vCPUs { + switch c.state.Load() &^ vCPUWaiter { + case vCPUReady: + nrReady++ + + case vCPUUser: + nrUser++ + + case vCPUGuest: + nrGuest++ + + case vCPUGuest | vCPUUser: + nrGuestUser++ + } + } + m.mu.RUnlock() + + log.Infof("vCPU stats: Ready=%d, User=%d, Guest=%d, GuestUser=%d", nrReady, nrUser, nrGuest, nrGuestUser) +} diff --git a/pkg/sentry/platform/slimvm/machine_amd64.go b/pkg/sentry/platform/slimvm/machine_amd64.go new file mode 100644 index 0000000000..d7c54a1842 --- /dev/null +++ b/pkg/sentry/platform/slimvm/machine_amd64.go @@ -0,0 +1,393 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
// initArchState initializes architecture-specific state.
//
// It returns the ioctl errno if the legacy TSS address cannot be set.
func (m *machine) initArchState() error {
	// Set the legacy TSS address. This address is covered by the reserved
	// range (up to 4GB). In fact, this is a main reason it exists.
	if _, _, errno := syscall.RawSyscall(
		syscall.SYS_IOCTL,
		slimvmFD,
		_SLIMVM_SET_TSS_ADDR,
		uintptr(reservedMemory-(3*hostarch.PageSize))); errno != 0 {
		return errno
	}

	// NOTE(review): nothing between here and the return enters guest mode
	// or can fault, so this panic-on-fault/recover pair currently guards no
	// code; it looks inherited from a platform sanity test that enabled
	// CPUID faulting at this point. CPUID faulting is enabled lazily in
	// SwitchToUser instead.
	old := debug.SetPanicOnFault(true)
	defer func() {
		recover()
		debug.SetPanicOnFault(old)
	}()

	return nil
}

// vCPUArchState is the amd64-specific part of a vCPU.
type vCPUArchState struct {
	// floatingPointState is the floating point state buffer used in guest
	// to host transitions. See usage in bluepill_amd64.go.
	floatingPointState fpu.State
}

// dropPageTables drops cached page table entries.
//
// The given PCID is cleared from the active set of every vCPU and then
// returned to the PCID cache.
func (m *machine) dropPageTables(pcid uint16) {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Clear on all vCPUs.
	for _, c := range m.vCPUs {
		c.activePCIDs.clear(pcid)
	}
	dropPCID(pcid)
}

// initArchState initializes architecture-specific state.
//
// It records the CPU vendor, fills in the initial kernel system and user
// registers, allocates the floating point save area and finally sets the
// system time. The first error from the register/CPUID/time setters is
// returned.
func (c *vCPU) initArchState() error {
	var (
		kernelSystemRegs systemRegs
		kernelUserRegs   userRegs
	)

	// AMD and compatible CPUs do not support the CPUID faulting feature.
	// cpuidFaultingEnable is set to 1 there so that SwitchToUser skips
	// enabling CPUID faulting.
	fs := cpuid.HostFeatureSet()
	if fs.AMD() {
		ring0.CPUVendor = ring0.CPUAMD
		c.cpuidFaultingEnable = 1
	} else {
		ring0.CPUVendor = ring0.CPUIntel
		c.cpuidFaultingEnable = 0
	}

	// Set base control registers.
	kernelSystemRegs.CR0 = c.CR0()
	kernelSystemRegs.CR4 = c.CR4()
	kernelSystemRegs.EFER = c.EFER()

	// Set the IDT & GDT in the registers.
	kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT()
	kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT()
	kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode)
	kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata)
	kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata)
	kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata)
	tssBase, tssLimit, tss := c.TSS()
	kernelSystemRegs.TR.Load(tss, ring0.Tss)
	kernelSystemRegs.TR.base = tssBase
	kernelSystemRegs.TR.limit = uint32(tssLimit)

	// Set up an empty LDT descriptor that is still marked present.
	kernelSystemRegs.LDT.base = 0
	kernelSystemRegs.LDT.limit = 0
	kernelSystemRegs.LDT.selector = 0
	kernelSystemRegs.LDT.typ = 0x2
	kernelSystemRegs.LDT.present = 0x1

	// Point to kernel page tables, with no initial PCID.
	kernelSystemRegs.CR3 = c.machine.kernel.PageTables.CR3(false, 0)

	// Set the CPUID before the system registers. This ordering is kept
	// from the KVM platform, where several CR4 bits are rejected if the
	// CPUID does not indicate the support is available.
	if err := c.setCPUID(); err != nil {
		return err
	}

	// Set the entrypoint for the kernel.
	kernelUserRegs.RIP = uint64(ring0.AddrOfStart())
	kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
	kernelUserRegs.RSP = c.StackTop()
	kernelUserRegs.RFLAGS = ring0.KernelFlagsSet

	// Set the system registers.
	if err := c.setSystemRegisters(&kernelSystemRegs); err != nil {
		return err
	}

	// Set the user registers.
	if err := c.setUserRegisters(&kernelUserRegs); err != nil {
		return err
	}

	// Allocate some floating point state save area for the local vCPU.
	// This will be saved prior to leaving the guest, and we restore from
	// this always. We cannot use the pointer in the context alone because
	// we don't know how large the area there is in reality.
	c.floatingPointState = fpu.NewState()

	// Set the time offset to the host native time.
	return c.setSystemTime()
}

// nonCanonical generates the return for a non-canonical address.
//
// info is overwritten with the given signal, code SI_KERNEL and addr as the
// faulting address; the result is always (NoAccess, ErrContextSignal).
//
//go:nosplit
func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	*info = linux.SignalInfo{
		Signo: signal,
		Code:  linux.SI_KERNEL,
	}
	info.SetAddr(addr) // Include address.
	return hostarch.NoAccess, platform.ErrContextSignal
}

// fault generates an appropriate fault return.
//
// If the last fault serviced by this vCPU was not a user fault,
// (NoAccess, ErrContextInterrupt) is returned and info is left untouched.
// Otherwise info is reset to the given signal with the CR2 fault address and
// ErrContextSignal is returned together with the decoded access type.
//
//go:nosplit
func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	bluepill(c) // Probably no-op, but may not be.
	faultAddr := ring0.ReadCR2()
	code, user := c.ErrorCode()
	if !user {
		// The last fault serviced by this CPU was not a user
		// fault, so we can't reliably trust the faultAddr or
		// the code provided here. We need to re-execute.
		return hostarch.NoAccess, platform.ErrContextInterrupt
	}
	// Reset the pointed SignalInfo.
	*info = linux.SignalInfo{Signo: signal}
	info.SetAddr(uint64(faultAddr))
	accessType := hostarch.AccessType{}
	if signal == int32(unix.SIGSEGV) {
		// Bit 1 of the error code selects write vs. read, bit 4 marks
		// an instruction fetch.
		accessType = hostarch.AccessType{
			Read:    code&(1<<1) == 0,
			Write:   code&(1<<1) != 0,
			Execute: code&(1<<4) != 0,
		}
	}
	if !accessType.Write && !accessType.Execute {
		info.Code = 1 // SEGV_MAPERR.
	} else {
		info.Code = 2 // SEGV_ACCERR.
	}
	return accessType, platform.ErrContextSignal
}
+func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) { + // Check for canonical addresses. + if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) { + return nonCanonical(regs.Rip, int32(syscall.SIGSEGV), info) + } else if !ring0.IsCanonical(regs.Rsp) { + return nonCanonical(regs.Rsp, int32(syscall.SIGBUS), info) + } else if !ring0.IsCanonical(regs.Fs_base) { + return nonCanonical(regs.Fs_base, int32(syscall.SIGBUS), info) + } else if !ring0.IsCanonical(regs.Gs_base) { + return nonCanonical(regs.Gs_base, int32(syscall.SIGBUS), info) + } + + localAS := c.active.get() + if hasGuestPCID && (localAS != nil) { + pcid := localAS.pcid + if pcid == 0 { + // As an optimization, we use pcidMu to protect + // the update of localAS.pcid. + // + // Note that, a snapshot of localAS.pcid is needed + // as localAS.pcid can be modified by other vCPUs + // when it is 0. + pcid = assignPCID(&localAS.pcid) + } + if pcid != 0 && !c.activePCIDs.test(pcid) { + switchOpts.Flush = true + c.activePCIDs.set(pcid) + } + switchOpts.KernelPCID = fixedKernelPCID + switchOpts.UserPCID = pcid + } + + // See below. + var vector ring0.Vector + + entersyscall() + bluepill(c) + + // The root table physical page has to be mapped to not fault in iret + // or sysret after switching into a user address space. sysret and iret + // are in the upper half that is global and already mapped. PTEs come + // from the runtime allocator's Go heap (no mlock / memfile pinning), + // so the root page can be reclaimed under memory pressure and + // re-faulting it from inside the iret/sysret window has been observed + // to produce vCPU bounce stalls. + switchOpts.PageTables.PrefaultRootTable() + + // Enable CPUID Faulting featue if the CPU supported. 
+ if c.cpuidFaultingEnable == 0 { + ring0.SetCPUIDFaulting(true) + c.cpuidFaultingEnable = 1 + } + + c.PrefaultIDT() + vector = c.CPU.SwitchToUser(switchOpts) + exitsyscall() + + switch vector { + case ring0.Syscall, ring0.SyscallInt80: + // Fast path: system call executed. + return hostarch.NoAccess, nil + + case ring0.PageFault: + return c.fault(int32(syscall.SIGSEGV), info) + + case ring0.Debug: + // Vector #DB. SlimVM does not currently expose DR6 to the sentry, so + // we can't distinguish single-step from hardware breakpoints. App-level + // PTRACE_SINGLESTEP is the only path that should hit this in practice; + // report TRAP_TRACE so ptrace observes the expected si_code. + c.FullRestore = true + *info = linux.SignalInfo{ + Signo: int32(syscall.SIGTRAP), + Code: 2, // TRAP_TRACE + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return hostarch.AccessType{}, platform.ErrContextSignal + + case ring0.Breakpoint: + c.FullRestore = true + *info = linux.SignalInfo{ + Signo: int32(syscall.SIGTRAP), + Code: linux.SI_KERNEL, + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return hostarch.AccessType{}, platform.ErrContextSignal + + case ring0.GeneralProtectionFault, + ring0.SegmentNotPresent, + ring0.BoundRangeExceeded, + ring0.InvalidTSS, + ring0.StackSegmentFault: + c.FullRestore = true + *info = linux.SignalInfo{ + Signo: int32(syscall.SIGSEGV), + Code: linux.SI_KERNEL, + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + if vector == ring0.GeneralProtectionFault { + // When CPUID faulting is enabled, we will generate a #GP(0) when + // userspace executes a CPUID instruction. This is handled above, + // because we need to be able to map and read user memory. 
+ return hostarch.AccessType{}, tryCPUIDError{} + } + return hostarch.AccessType{}, platform.ErrContextSignal + + case ring0.InvalidOpcode: + c.FullRestore = true + *info = linux.SignalInfo{ + Signo: int32(syscall.SIGILL), + Code: 1, // ILL_ILLOPC (illegal opcode). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return hostarch.AccessType{}, platform.ErrContextSignal + + case ring0.DivideByZero: + c.FullRestore = true + *info = linux.SignalInfo{ + Signo: int32(syscall.SIGFPE), + Code: 1, // FPE_INTDIV (divide by zero). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return hostarch.AccessType{}, platform.ErrContextSignal + + case ring0.Overflow: + c.FullRestore = true + *info = linux.SignalInfo{ + Signo: int32(syscall.SIGFPE), + Code: 1, // FPE_INTOVF (integer overflow). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return hostarch.AccessType{}, platform.ErrContextSignal + + case ring0.X87FloatingPointException, + ring0.SIMDFloatingPointException: + c.FullRestore = true + *info = linux.SignalInfo{ + Signo: int32(syscall.SIGFPE), + Code: 7, // FPE_FLTINV (invalid operation). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return hostarch.AccessType{}, platform.ErrContextSignal + + case ring0.Vector(bounce): // ring0.VirtualizationException + return hostarch.NoAccess, platform.ErrContextInterrupt + + case ring0.Vector(guestOOM): // ring0.OOMException + c.FullRestore = true + return hostarch.NoAccess, platform.ErrContextOOM + + case ring0.AlignmentCheck: + c.FullRestore = true + *info = linux.SignalInfo{ + Signo: int32(syscall.SIGBUS), + Code: 2, // BUS_ADRERR (physical address does not exist). + } + return hostarch.NoAccess, platform.ErrContextSignal + + case ring0.NMI: + // An NMI is generated only when a fault is not servicable by + // KVM itself, so we think some mapping is writeable but it's + // really not. This could happen, e.g. 
if some file is + // truncated (and would generate a SIGBUS) and we map it + // directly into the instance. + c.FullRestore = true + return c.fault(int32(syscall.SIGBUS), info) + + case ring0.DeviceNotAvailable, + ring0.DoubleFault, + ring0.CoprocessorSegmentOverrun, + ring0.MachineCheck, + ring0.SecurityException: + fallthrough + default: + panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) + } +} + +func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { + applyPhysicalRegions(func(pr physicalRegion) bool { + pageTable.Map( + hostarch.Addr(ring0.KernelStartAddress|pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: hostarch.AnyAccess}, + pr.physical) + + return true // Keep iterating. + }) +} diff --git a/pkg/sentry/platform/slimvm/machine_amd64_unsafe.go b/pkg/sentry/platform/slimvm/machine_amd64_unsafe.go new file mode 100644 index 0000000000..8a484d1a76 --- /dev/null +++ b/pkg/sentry/platform/slimvm/machine_amd64_unsafe.go @@ -0,0 +1,86 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build amd64 +// +build amd64 + +package slimvm + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" +) + +// loadSegments copies the current segments. +// +// This may be called from within the signal context and throws on error. 
+// +//go:nosplit +func (c *vCPU) loadSegments(tid uint64) { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_ARCH_PRCTL, + linux.ARCH_GET_FS, + uintptr(unsafe.Pointer(&c.CPU.Registers().Fs_base)), + 0); errno != 0 { + throw("getting FS segment") + } + if _, _, errno := syscall.RawSyscall( + syscall.SYS_ARCH_PRCTL, + linux.ARCH_GET_GS, + uintptr(unsafe.Pointer(&c.CPU.Registers().Gs_base)), + 0); errno != 0 { + throw("getting GS segment") + } + c.tid.Store(tid) +} + +// setUserRegisters sets user registers in the vCPU. +func (c *vCPU) setUserRegisters(uregs *userRegs) error { + c.vmxConfig.userRegs = *uregs + return nil +} + +// setSystemRegisters sets system registers. +func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { + c.vmxConfig.sysRegs = *sregs + return nil +} + +// setCPUID sets the CPUID to be used by the guest. +func (c *vCPU) setCPUID() error { + return nil +} + +// setSystemTime sets the TSC for the vCPU. +// +// In SlimVM, the guest shares the host TSC directly, so no TSC +// synchronization is needed and there is no host/guest TSC offset. +func (c *vCPU) setSystemTime() error { + return nil +} + +// setSignalMask sets the vCPU signal mask. +// +// This must be called prior to running the vCPU. +// +// In KVM, this uses KVM_SET_SIGNAL_MASK ioctl to configure which +// signals are blocked while the vCPU is running. In SlimVM, the +// kernel module manages the signal mask internally during +// _SLIMVM_RUN (blocking all signals except SIGKILL, SIGSTOP, +// SIG_BOUNCE, and SIGPROF), so no userspace configuration is needed. +func (c *vCPU) setSignalMask() error { + return nil +} diff --git a/pkg/sentry/platform/slimvm/machine_unsafe.go b/pkg/sentry/platform/slimvm/machine_unsafe.go new file mode 100644 index 0000000000..075a1bf528 --- /dev/null +++ b/pkg/sentry/platform/slimvm/machine_unsafe.go @@ -0,0 +1,110 @@ +// Copyright 2026 The gVisor Authors. 
// entersyscall is linked to the Go runtime's entersyscall; SwitchToUser
// calls it before entering the guest.
//
//go:linkname entersyscall runtime.entersyscall
func entersyscall()

// exitsyscall is linked to the Go runtime's exitsyscall; SwitchToUser calls
// it after returning from the guest.
//
//go:linkname exitsyscall runtime.exitsyscall
func exitsyscall()

// atomicAddressSpace is an atomic address space pointer.
type atomicAddressSpace struct {
	// pointer holds a *addressSpace and is only accessed atomically.
	pointer unsafe.Pointer
}

// set sets the address space value.
//
//go:nosplit
func (a *atomicAddressSpace) set(as *addressSpace) {
	atomic.StorePointer(&a.pointer, unsafe.Pointer(as))
}

// get gets the address space value.
//
// Note that this should be considered best-effort, and may have changed by the
// time this function returns.
//
//go:nosplit
func (a *atomicAddressSpace) get() *addressSpace {
	return (*addressSpace)(atomic.LoadPointer(&a.pointer))
}

// notify notifies that the vCPU has transitioned modes.
//
// It issues a FUTEX_WAKE on the vCPU state word for up to math.MaxInt32
// waiters.
//
// This may be called by a signal handler and therefore throws on error.
//
//go:nosplit
func (c *vCPU) notify() {
	_, _, errno := syscall.RawSyscall6(
		syscall.SYS_FUTEX,
		uintptr(unsafe.Pointer(&c.state)),
		linux.FUTEX_WAKE,
		math.MaxInt32, // Number of waiters.
		0, 0, 0)
	if errno != 0 {
		throw("futex wake error")
	}
}
+// +// The state should have been previously set to vCPUWaiter after performing an +// appropriate action to cause a transition (e.g. interrupt injection). +// +// This panics on error. +func (c *vCPU) waitUntilNot(state uint32) bool { + // Check transition mode before to issue a SYS_FUTEX. + if c.state.Load() != state { + return false + } + ts := linux.Timespec{Sec: 1} + _, _, errno := syscall.Syscall6( + syscall.SYS_FUTEX, + uintptr(unsafe.Pointer(&c.state)), + linux.FUTEX_WAIT, + uintptr(state), + uintptr(unsafe.Pointer(&ts)), + 0, 0) + if errno != 0 && errno != syscall.EINTR && errno != syscall.EAGAIN && errno != syscall.ETIMEDOUT { + panic("futex wait error") + } + return errno == syscall.ETIMEDOUT +} + +// createVCPU create VCPU in slimvm. +func (c *vCPU) createVCPU(memoryRegions []userMemoryRegion) (uintptr, uintptr, syscall.Errno) { + c.vmxConfig.memoryRegionNum = uint64(len(memoryRegions)) + c.vmxConfig.memoryRegionAddr = uintptr(unsafe.Pointer(&memoryRegions[0])) + + return syscall.RawSyscall(syscall.SYS_IOCTL, slimvmFD, _SLIMVM_CREATE_VCPU, uintptr(unsafe.Pointer(&c.vmxConfig))) +} + +func (c *vCPU) releaseVCPU() { + _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, slimvmFD, _SLIMVM_RELEASE_VCPU, uintptr(unsafe.Pointer(&c.vmxConfig.vcpu))) + if errno != 0 { + panic(fmt.Sprintf("error release free vCPU: %v %d", errno, c.vmxConfig.vcpu)) + } +} diff --git a/pkg/sentry/platform/slimvm/pcids_x86.go b/pkg/sentry/platform/slimvm/pcids_x86.go new file mode 100644 index 0000000000..b3e29fc5db --- /dev/null +++ b/pkg/sentry/platform/slimvm/pcids_x86.go @@ -0,0 +1,82 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build amd64 +// +build amd64 + +package slimvm + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/atomicbitops" +) + +const ( + limitPCID = 4096 + fixedKernelPCID = 1 +) + +var ( + pcidCache []uint16 + pcidMu sync.Mutex +) + +type pcidBitmap [(limitPCID + 64 - 1) / 64]atomicbitops.Uint64 + +func (bm *pcidBitmap) set(pcid uint16) { + atomicbitops.OrUint64(&bm[pcid/64], 1<<(pcid%64)) +} + +func (bm *pcidBitmap) clear(pcid uint16) { + atomicbitops.AndUint64(&bm[pcid/64], ^(1 << (pcid % 64))) +} + +func (bm *pcidBitmap) test(pcid uint16) bool { + return bm[pcid/64].Load()&(1<<(pcid%64)) != 0 +} + +func (bm *pcidBitmap) reset() { + for i := range bm { + bm[i].Store(0) + } +} + +func initPCIDs() { + if !hasGuestPCID { + return + } + for pcid := fixedKernelPCID + 1; pcid < limitPCID; pcid++ { + pcidCache = append(pcidCache, uint16(pcid)) + } +} + +func assignPCID(pcid *uint16) uint16 { + pcidMu.Lock() + if *pcid == 0 && len(pcidCache) > 0 { + *pcid = pcidCache[len(pcidCache)-1] + pcidCache = pcidCache[:len(pcidCache)-1] + } + pcidMu.Unlock() + return *pcid +} + +func dropPCID(pcid uint16) { + if pcid == 0 { + return + } + pcidMu.Lock() + pcidCache = append(pcidCache, pcid) + pcidMu.Unlock() +} diff --git a/pkg/sentry/platform/slimvm/physical_map.go b/pkg/sentry/platform/slimvm/physical_map.go new file mode 100644 index 0000000000..9eb9efd6f7 --- /dev/null +++ b/pkg/sentry/platform/slimvm/physical_map.go @@ -0,0 +1,207 @@ +// Copyright 2026 The gVisor Authors. 
// region is a contiguous range of host virtual addresses.
type region struct {
	virtual uintptr
	length  uintptr
}

// physicalRegion is a region together with the guest physical address it is
// mapped at.
type physicalRegion struct {
	region
	physical uintptr
}

// physicalRegions contains a list of available physical regions.
//
// The physical value used in physicalRegions is a number indicating the
// physical offset, aligned appropriately and starting above reservedMemory.
var physicalRegions []physicalRegion

// fillAddressSpace fills the host address space with PROT_NONE mappings
// until the host address space that remains available is no larger than the
// physical address space.
//
// The excluded regions are returned, sorted by virtual address. Nothing is
// excluded if the virtual address space is already smaller than the
// physical one. This panics if the required amount cannot be mapped.
func fillAddressSpace() (excludedRegions []region) {
	// We can cut vSize in half, because the kernel will be using the top
	// half and we ignore it while constructing mappings. It's as if we've
	// already excluded half the possible addresses.
	vSize := ring0.UserspaceSize

	// We exclude reservedMemory below from our physical memory size, so it
	// needs to be dropped here as well. Otherwise, we could end up with
	// physical addresses that are beyond what is mapped.
	pSize := uintptr(1) << ring0.PhysicalAddressBits
	pSize -= reservedMemory

	// Sanity check.
	if vSize < pSize {
		return excludedRegions
	}

	// Calculate the required space and fill it.
	//
	// The virtual and physical address ranges should be aligned with the
	// huge page size.
	requiredAddr, ok := hostarch.Addr(vSize - pSize + hostarch.HugePageSize).RoundUp()
	if !ok {
		panic(fmt.Sprintf(
			"overflow for vSize (%x) - pSize (%x) + HugePageSize(%x)",
			vSize, pSize, hostarch.HugePageSize))
	}
	required := uintptr(requiredAddr)
	current := required // Attempted mmap size.
	for filled := uintptr(0); filled < required && current > 0; {
		addr, _, errno := syscall.RawSyscall6(
			syscall.SYS_MMAP,
			0, // Suggested address.
			current,
			syscall.PROT_NONE,
			syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE|syscall.MAP_NORESERVE,
			0, 0)
		if errno != 0 {
			// One page is the smallest mapping that can be allocated.
			// Without this guard, hostarch.Addr(PageSize>>1).RoundUp()
			// rounds back up to PageSize, leaving current stuck and the
			// loop never terminating (mirrors the equivalent guard in
			// the kvm platform's fillAddressSpace).
			if current == hostarch.PageSize {
				current = 0
				continue
			}
			// Attempt half the size; overflow not possible.
			currentAddr, _ := hostarch.Addr(current >> 1).RoundUp()
			current = uintptr(currentAddr)
			continue
		}
		// We filled a block.
		filled += current
		excludedRegions = append(excludedRegions, region{
			virtual: addr,
			length:  current,
		})
		// More mappings will follow, so grow the target by one more huge
		// page. NOTE(review): presumably this budgets for the alignment
		// padding that computePhysicalRegions may add per region — confirm.
		if filled != required {
			required += hostarch.HugePageSize
		}
	}
	if current == 0 {
		panic("filling address space failed")
	}
	sort.Slice(excludedRegions, func(i, j int) bool {
		return excludedRegions[i].virtual < excludedRegions[j].virtual
	})
	for _, r := range excludedRegions {
		log.Infof("region: virtual [%x,%x)", r.virtual, r.virtual+r.length)
	}
	return excludedRegions
}
+func computePhysicalRegions(excludedRegions []region) (physicalRegions []physicalRegion) { + physical := uintptr(reservedMemory) + addValidRegion := func(virtual, length uintptr) { + if length == 0 { + return + } + if virtual == 0 { + virtual += hostarch.PageSize + length -= hostarch.PageSize + } + if end := virtual + length; end > ring0.MaximumUserAddress { + length -= (end - ring0.MaximumUserAddress) + } + if length == 0 { + return + } + // Round physical up to the same alignment as the virtual + // address (with respect to faultBlockSize). + if offset := virtual &^ hostarch.JumboPageMask; physical&^hostarch.JumboPageMask != offset { + if newPhysical := (physical & hostarch.JumboPageMask) + offset; newPhysical > physical { + physical = newPhysical // Round up by only a little bit. + } else { + physical = ((physical + hostarch.JumboPageSize) & hostarch.JumboPageMask) + offset + } + } + physicalRegions = append(physicalRegions, physicalRegion{ + region: region{ + virtual: virtual, + length: length, + }, + physical: physical, + }) + physical += length + } + lastExcludedEnd := uintptr(0) + for _, r := range excludedRegions { + addValidRegion(lastExcludedEnd, r.virtual-lastExcludedEnd) + lastExcludedEnd = r.virtual + r.length + } + addValidRegion(lastExcludedEnd, ring0.MaximumUserAddress-lastExcludedEnd) + + // Dump our all physical regions. + for _, r := range physicalRegions { + log.Infof("physicalRegion: virtual [%x,%x) => physical [%x,%x)", + r.virtual, r.virtual+r.length, r.physical, r.physical+r.length) + } + return physicalRegions +} + +// physicalInit initializes physical address mappings. +func physicalInit() { + physicalRegions = computePhysicalRegions(fillAddressSpace()) +} + +// applyPhysicalRegions applies the given function on physical regions. +// +// Iteration continues as long as true is returned. The return value is the +// return from the last call to fn, or true if there are no entries. +// +// Precondition: physicalInit must have been called. 
+func applyPhysicalRegions(fn func(pr physicalRegion) bool) bool { + for _, pr := range physicalRegions { + if !fn(pr) { + return false + } + } + return true +} + +// translateToPhysical translates the given virtual address. +// +// Precondition: physicalInit must have been called. +// +//go:nosplit +func translateToPhysical(virtual uintptr) (physical uintptr, length uintptr, ok bool) { + for _, pr := range physicalRegions { + if pr.virtual <= virtual && virtual < pr.virtual+pr.length { + physical = pr.physical + (virtual - pr.virtual) + length = pr.length - (virtual - pr.virtual) + ok = true + return + } + } + return +} diff --git a/pkg/sentry/platform/slimvm/physical_map_amd64.go b/pkg/sentry/platform/slimvm/physical_map_amd64.go new file mode 100644 index 0000000000..d607c86882 --- /dev/null +++ b/pkg/sentry/platform/slimvm/physical_map_amd64.go @@ -0,0 +1,22 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +const ( + // reservedMemory is a chunk of physical memory reserved starting at + // physical address zero. There are some special pages in this region, + // so we just call the whole thing off. + reservedMemory = 0x100000000 +) diff --git a/pkg/sentry/platform/slimvm/slimvm.go b/pkg/sentry/platform/slimvm/slimvm.go new file mode 100644 index 0000000000..dfd609d381 --- /dev/null +++ b/pkg/sentry/platform/slimvm/slimvm.go @@ -0,0 +1,192 @@ +// Copyright 2026 The gVisor Authors. 
// SlimVM represents a lightweight VM context.
type SlimVM struct {
	// TODO: support preemption as KVM.
	platform.NoCPUPreemptionDetection

	platform.UseHostProcessMemoryBarrier

	// TODO: follow commit 0b9bde06d0 to add RSEQ support for SlimVM.
	platform.NoCPUNumbers

	// machine is the backing VM.
	machine *machine
}

// runData is the run structure. This may be mapped for synchronous register
// access (although that doesn't appear to be supported by my kernel at least).
//
// This mirrors kvm_run, so field order, sizes and padding must not change.
type runData struct {
	requestInterruptWindow uint8
	_                      [7]uint8

	exitReason                 uint32
	readyForInterruptInjection uint8
	ifFlag                     uint8
	_                          [2]uint8

	cr8      uint64
	apicBase uint64

	// This is the union data for exits. Interpretation depends entirely on
	// the exitReason above (see vCPU code for more information).
	data [32]uint64
}

var (
	// globalOnce guards the one-time global initialization run by New.
	globalOnce sync.Once

	// globalErr is checked by New after the global initialization; a
	// non-nil value makes New fail.
	globalErr error

	// slimvmFD is the file descriptor of slimvmFile; it is the descriptor
	// used for SlimVM ioctls.
	slimvmFD uintptr

	// slimvmFile is the device file most recently passed to New.
	slimvmFile *fd.FD
)
+func OpenDevice(devicePath string) (*fd.FD, error) { + if devicePath == "" { + devicePath = "/dev/slimvm" + } + f, err := fd.Open(devicePath, syscall.O_RDWR, 0) + if err != nil { + return nil, fmt.Errorf("error opening %s: %v", devicePath, err) + } + return f, nil +} + +// New returns a new SlimVM-based implementation of the platform interface. +func New(deviceFile *fd.FD, sandboxID string) (*SlimVM, error) { + slimvmFile = deviceFile + slimvmFD = uintptr(slimvmFile.FD()) + + // Ensure global initialization is done. + globalOnce.Do(func() { + updateGlobalOnce(int(slimvmFD)) + }) + if globalErr != nil { + return nil, globalErr + } + + // Parse sandbox ID for the host kernel module. + sid, _ := strconv.ParseInt(sandboxID[:min(8, len(sandboxID))], 16, 64) + + // Create a VM context. + machine, err := newMachine(sid) + if err != nil { + return nil, err + } + + StartReclaimDaemon(machine) + + // All set. + return &SlimVM{ + machine: machine, + }, nil +} + +// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. +func (*SlimVM) SupportsAddressSpaceIO() bool { + return false +} + +// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. +func (*SlimVM) CooperativelySchedulesAddressSpace() bool { + return false +} + +// MapUnit implements platform.Platform.MapUnit. +func (*SlimVM) MapUnit() uint64 { + // We greedily creates PTEs in MapFile, so extremely large mappings can + // be expensive. Not _that_ expensive since we allow super pages, but + // even though can get out of hand if you're creating multi-terabyte + // mappings. For this reason, we limit mappings to an arbitrary 16MB. + return 16 << 20 +} + +// MinUserAddress returns the lowest available address. +func (*SlimVM) MinUserAddress() hostarch.Addr { + return hostarch.PageSize +} + +// MaxUserAddress returns the first address that may not be used. 
+func (*SlimVM) MaxUserAddress() hostarch.Addr { + return hostarch.Addr(ring0.MaximumUserAddress) +} + +// NewAddressSpace returns a new pagetable root. +func (k *SlimVM) NewAddressSpace() (platform.AddressSpace, error) { + // Allocate page tables and install system mappings. + pageTables := pagetables.NewWithUpper(newAllocator(), k.machine.upperSharedPageTables, ring0.KernelStartAddress) + + // Return the new address space. + return &addressSpace{ + machine: k.machine, + pageTables: pageTables, + }, nil +} + +// NewContext returns an interruptible context. +func (k *SlimVM) NewContext(pkgcontext.Context) platform.Context { + return &context{ + machine: k.machine, + } +} + +// ConcurrencyCount implements platform.Platform.ConcurrencyCount. +func (k *SlimVM) ConcurrencyCount() int { + return int(MaxThreads) +} + +// HealthCheck implements platform.Platform.HealthCheck. +func (k *SlimVM) HealthCheck() { + k.machine.mu.RLock() + k.machine.mu.RUnlock() +} + +type constructor struct{} + +func (*constructor) New(opts platform.Options) (platform.Platform, error) { + // TODO: add support to pass ApplicationCores + return New(opts.DeviceFile, opts.SandboxID) +} + +func (*constructor) OpenDevice(devicePath string) (*fd.FD, error) { + return OpenDevice(devicePath) +} + +func (*constructor) Requirements() platform.Requirements { + return platform.Requirements{} +} + +func init() { + platform.Register("slimvm", &constructor{}) +} diff --git a/pkg/sentry/platform/slimvm/slimvm_amd64.go b/pkg/sentry/platform/slimvm/slimvm_amd64.go new file mode 100644 index 0000000000..b5b69addf3 --- /dev/null +++ b/pkg/sentry/platform/slimvm/slimvm_amd64.go @@ -0,0 +1,224 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build amd64 +// +build amd64 + +package slimvm + +import ( + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/ring0/pagetables" +) + +// userMemoryRegion is a region of physical memory. +// +// This mirrors kvm_memory_region. +type userMemoryRegion struct { + userspaceAddr uint64 + guestPhysAddr uint64 + memorySize uint64 +} + +// userRegs represents KVM user registers. +// +// This mirrors kvm_regs. +type userRegs struct { + RAX uint64 + RBX uint64 + RCX uint64 + RDX uint64 + RSI uint64 + RDI uint64 + RSP uint64 + RBP uint64 + R8 uint64 + R9 uint64 + R10 uint64 + R11 uint64 + R12 uint64 + R13 uint64 + R14 uint64 + R15 uint64 + RIP uint64 + RFLAGS uint64 +} + +// systemRegs represents KVM system registers. +// +// This mirrors kvm_sregs. +type systemRegs struct { + CS segment + DS segment + ES segment + FS segment + GS segment + SS segment + TR segment + LDT segment + GDT descriptor + IDT descriptor + CR0 uint64 + CR2 uint64 + CR3 uint64 + CR4 uint64 + CR8 uint64 + EFER uint64 + apicBase uint64 + interruptBitmap [(_SLIMVM_NR_INTERRUPTS + 63) / 64]uint64 +} + +// segment is the expanded form of a segment register. +// +// This mirrors kvm_segment. +type segment struct { + base uint64 + limit uint32 + selector uint16 + typ uint8 + present uint8 + DPL uint8 + DB uint8 + S uint8 + L uint8 + G uint8 + AVL uint8 + unusable uint8 + _ uint8 +} + +// Clear clears the segment and marks it unusable. +func (s *segment) Clear() { + *s = segment{unusable: 1} +} + +// selector is a segment selector. 
type selector uint16

// tobool converts a masked descriptor-flag value into the 0/1 byte form used
// by the segment fields: any non-zero value yields 1, zero yields 0.
func tobool(x ring0.SegmentDescriptorFlags) uint8 {
	if x != 0 {
		return 1
	}
	return 0
}

// Load loads the segment described by d into the segment s.
//
// The argument sel is recorded as the segment selector index.
//
// If d is not marked present, s is reset via Clear and nothing else is
// copied from d.
func (s *segment) Load(d *ring0.SegmentDescriptor, sel ring0.Selector) {
	flag := d.Flags()
	if flag&ring0.SegmentDescriptorPresent == 0 {
		// Non-present descriptors are not loaded at all.
		s.Clear()
		return
	}
	s.base = uint64(d.Base())
	s.limit = d.Limit()
	// The type is taken from bits 8-11 of the flags, with the lowest bit
	// always forced on.
	// NOTE(review): presumably that lowest bit is the x86 "accessed" bit —
	// confirm.
	s.typ = uint8((flag>>8)&0xF) | 1
	s.S = tobool(flag & ring0.SegmentDescriptorSystem)
	s.DPL = uint8(d.DPL())
	s.present = tobool(flag & ring0.SegmentDescriptorPresent)
	s.AVL = tobool(flag & ring0.SegmentDescriptorAVL)
	s.L = tobool(flag & ring0.SegmentDescriptorLong)
	s.DB = tobool(flag & ring0.SegmentDescriptorDB)
	s.G = tobool(flag & ring0.SegmentDescriptorG)
	if s.L != 0 {
		// Segments with the long flag set ignore the descriptor's own
		// limit and use the maximum 32-bit value instead.
		s.limit = 0xffffffff
	}
	s.unusable = 0
	s.selector = uint16(sel)
}

// descriptor describes a region of physical memory.
//
// It corresponds to the pseudo-descriptor used in the x86 LGDT and LIDT
// instructions, and mirrors kvm_dtable.
type descriptor struct {
	base  uint64
	limit uint16
	// Unnamed filler; presumably padding that keeps the layout in line with
	// kvm_dtable — confirm against the kernel header.
	_ [3]uint16
}

// modelControlRegister is an MSR entry.
//
// This mirrors kvm_msr_entry.
type modelControlRegister struct {
	index uint32
	_     uint32
	data  uint64
}

// modelControlRegisters is a collection of MSRs.
//
// This mirrors kvm_msrs. The entries array has a fixed capacity of 16; nmsrs
// presumably records how many of them are in use — confirm against callers.
type modelControlRegisters struct {
	nmsrs   uint32
	_       uint32
	entries [16]modelControlRegister
}

// cpuidEntry is a single CPUID entry.
//
// This mirrors kvm_cpuid_entry2.
type cpuidEntry struct {
	function uint32
	index    uint32
	flags    uint32
	eax      uint32
	ebx      uint32
	ecx      uint32
	edx      uint32
	_        [3]uint32
}

// cpuidEntries is a collection of CPUID entries.
//
// This mirrors kvm_cpuid2.
+type cpuidEntries struct { + nr uint32 + _ uint32 + entries [_SLIMVM_NR_CPUID_ENTRIES]cpuidEntry +} + +// Query implements cpuid.Function.Query. +func (c *cpuidEntries) Query(in cpuid.In) (out cpuid.Out) { + for i := 0; i < int(c.nr); i++ { + if c.entries[i].function == in.Eax && c.entries[i].index == in.Ecx { + out.Eax = c.entries[i].eax + out.Ebx = c.entries[i].ebx + out.Ecx = c.entries[i].ecx + out.Edx = c.entries[i].edx + return + } + } + return +} + +// updateGlobalOnce does global initialization. It has to be called only once. +func updateGlobalOnce(fd int) error { + err := updateSystemValues(int(fd)) + // TODO: static feature set like kvm + ring0.Init(cpuid.HostFeatureSet()) + // slimvm.ko's EPT only supports 4-level page tables. Clamp ring0 and + // pagetables to a 4-level layout regardless of host CPUID so that + // fillAddressSpace doesn't try to mmap a 2^56 userspace and so that + // hardware never attempts a 5-level walk against a 4-level EPT. + ring0.DisableLA57() + pagetables.DisableLA57() + physicalInit() + initPCIDs() + saveSignalHandler() + return err +} diff --git a/pkg/sentry/platform/slimvm/slimvm_amd64_unsafe.go b/pkg/sentry/platform/slimvm/slimvm_amd64_unsafe.go new file mode 100644 index 0000000000..e8c0c902b5 --- /dev/null +++ b/pkg/sentry/platform/slimvm/slimvm_amd64_unsafe.go @@ -0,0 +1,37 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build amd64 +// +build amd64 + +package slimvm + +import ( + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/log" +) + +var ( + runDataSize int + hasGuestPCID bool + cpuidSupported = cpuidEntries{nr: _SLIMVM_NR_CPUID_ENTRIES} +) + +func updateSystemValues(fd int) error { + featureSet := cpuid.HostFeatureSet() + hasGuestPCID = featureSet.HasFeature(cpuid.X86FeaturePCID) + log.Infof("CPU support GuestPCID: %t", hasGuestPCID) + + return nil +} diff --git a/pkg/sentry/platform/slimvm/slimvm_const.go b/pkg/sentry/platform/slimvm/slimvm_const.go new file mode 100644 index 0000000000..f8ffabb362 --- /dev/null +++ b/pkg/sentry/platform/slimvm/slimvm_const.go @@ -0,0 +1,38 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +// SlimVM exit reasons (matching SLIMVM_RET_* in slimvm kernel module). +const ( + _SLIMVM_EXIT_EXCEPTION = 0x1 + _SLIMVM_EXIT_IO = 0x2 + _SLIMVM_EXIT_HYPERCALL = 0x3 + _SLIMVM_EXIT_DEBUG = 0x4 + _SLIMVM_EXIT_HLT = 0x5 + _SLIMVM_EXIT_MMIO = 0x6 + _SLIMVM_EXIT_IRQ_WINDOW_OPEN = 0x7 + _SLIMVM_EXIT_SHUTDOWN = 0x8 + _SLIMVM_EXIT_FAIL_ENTRY = 0x9 + _SLIMVM_EXIT_INTR = 0xa + _SLIMVM_EXIT_INTERNAL_ERROR = 0x11 + _SLIMVM_EXIT_MSR_WRITE = 0x20 +) + +// SlimVM limits. 
+const ( + _SLIMVM_NR_VCPUS = 0x800 + _SLIMVM_NR_INTERRUPTS = 0x100 + _SLIMVM_NR_CPUID_ENTRIES = 0x100 +) diff --git a/pkg/sentry/platform/slimvm/slimvm_const_amd64.go b/pkg/sentry/platform/slimvm/slimvm_const_amd64.go new file mode 100644 index 0000000000..365a70d486 --- /dev/null +++ b/pkg/sentry/platform/slimvm/slimvm_const_amd64.go @@ -0,0 +1,34 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build amd64 +// +build amd64 + +package slimvm + +// SlimVM ioctls. +// +// Only the ioctls we need in Go appear here; some additional ioctls are used +// within the assembly stubs (KVM_INTERRUPT, etc.). +// 1. open(/dev/slimvm) +// 2. ioctl(_SLIMVM_CREATE_VCPU) +// 3. ioctl(_SLIMVM_RUN) +// 4. close(/dev/slimvm) +const ( + _SLIMVM_RUN = 0x81f8e901 + _SLIMVM_SET_TSS_ADDR = 0xe907 + _SLIMVM_CREATE_VCPU = 0xe908 + _SLIMVM_RELEASE_VCPU = 0xe909 + _SLIMVM_NMI = 0xe90a +) diff --git a/pkg/sentry/platform/slimvm/slimvm_test.go b/pkg/sentry/platform/slimvm/slimvm_test.go new file mode 100644 index 0000000000..1d45eef22f --- /dev/null +++ b/pkg/sentry/platform/slimvm/slimvm_test.go @@ -0,0 +1,528 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +import ( + "math/rand" + "os" + "reflect" + "testing" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/ring0" + "gvisor.dev/gvisor/pkg/ring0/pagetables" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/slimvm/testutil" + ktime "gvisor.dev/gvisor/pkg/sentry/time" +) + +// dummyFPState is initialized in TestMain. +var dummyFPState fpu.State + +type testHarness interface { + Errorf(format string, args ...any) + Fatalf(format string, args ...any) +} + +// testSandboxID is an arbitrary fixed sandbox ID for tests. The slimvm host +// kernel module uses this value to bucket per-sandbox state. +const testSandboxID = "deadbeef" + +func slimvmTest(t testHarness, setup func(*SlimVM), fn func(*vCPU) bool) { + // Create the machine. + deviceFile, err := OpenDevice("") + if err != nil { + t.Fatalf("error opening device file: %v", err) + } + k, err := New(deviceFile, testSandboxID) + if err != nil { + t.Fatalf("error creating SlimVM instance: %v", err) + } + defer k.machine.Destroy() + + // Call additional setup. + if setup != nil { + setup(k) + } + + var c *vCPU // For recovery. + defer func() { + redpill() + if c != nil { + k.machine.Put(c) + } + }() + for { + c = k.machine.Get() + if !fn(c) { + break + } + + // We put the vCPU here and clear the value so that the + // deferred recovery will not re-put it above. 
+ k.machine.Put(c) + c = nil + } +} + +func bluepillTest(t testHarness, fn func(*vCPU)) { + slimvmTest(t, nil, func(c *vCPU) bool { + bluepill(c) + fn(c) + return false + }) +} + +func TestKernelSyscall(t *testing.T) { + bluepillTest(t, func(c *vCPU) { + redpill() // Leave guest mode. + if got := c.state.Load(); got != vCPUUser { + t.Errorf("vCPU not in ready state: got %v", got) + } + }) +} + +func hostFault() { + defer func() { + recover() + }() + var foo *int + *foo = 0 +} + +func TestKernelFault(t *testing.T) { + hostFault() // Ensure recovery works. + bluepillTest(t, func(c *vCPU) { + hostFault() + if got := c.state.Load(); got != vCPUUser { + t.Errorf("vCPU not in ready state: got %v", got) + } + }) +} + +func TestKernelFloatingPoint(t *testing.T) { + bluepillTest(t, func(c *vCPU) { + if !testutil.FloatingPointWorks() { + t.Errorf("floating point does not work, and it should!") + } + }) +} + +func applicationTest(t testHarness, useHostMappings bool, targetFn uintptr, fn func(*vCPU, *arch.Registers, *pagetables.PageTables) bool) { + // Initialize registers & page tables. + var ( + regs arch.Registers + pt *pagetables.PageTables + ) + testutil.SetTestTarget(&regs, targetFn) + + slimvmTest(t, func(k *SlimVM) { + // Create new page tables. + as, err := k.NewAddressSpace() + if err != nil { + t.Fatalf("can't create new address space: %v", err) + } + pt = as.(*addressSpace).pageTables + + if useHostMappings { + // Apply the physical mappings to these page tables. + // (This is normally dangerous, since they point to + // physical pages that may not exist. This shouldn't be + // done for regular user code, but is fine for test + // purposes.) + applyPhysicalRegions(func(pr physicalRegion) bool { + pt.Map(hostarch.Addr(pr.virtual), pr.length, pagetables.MapOpts{ + AccessType: hostarch.AnyAccess, + User: true, + }, pr.physical) + return true // Keep iterating. + }) + } + }, func(c *vCPU) bool { + // Invoke the function with the extra data.
+ return fn(c, &regs, pt) + }) +} + +func TestApplicationSyscall(t *testing.T) { + applicationTest(t, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + FullRestore: true, + }, &si); err == platform.ErrContextInterrupt { + return true // Retry. + } else if err != nil { + t.Errorf("application syscall with full restore failed: %v", err) + } + return false + }) + applicationTest(t, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + }, &si); err == platform.ErrContextInterrupt { + return true // Retry. + } else if err != nil { + t.Errorf("application syscall with partial restore failed: %v", err) + } + return false + }) +} + +func TestApplicationFault(t *testing.T) { + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + testutil.SetTouchTarget(regs, nil) // Cause fault. + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + FullRestore: true, + }, &si); err == platform.ErrContextInterrupt { + return true // Retry. + } else if err != platform.ErrContextSignal || si.Signo != int32(unix.SIGSEGV) { + t.Errorf("application fault with full restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) + } + return false + }) + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + testutil.SetTouchTarget(regs, nil) // Cause fault.
+ var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + }, &si); err == platform.ErrContextInterrupt { + return true // Retry. + } else if err != platform.ErrContextSignal || si.Signo != int32(unix.SIGSEGV) { + t.Errorf("application fault with partial restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) + } + return false + }) +} + +func TestRegistersSyscall(t *testing.T) { + applicationTest(t, true, testutil.AddrOfTwiddleRegsSyscall(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + testutil.SetTestRegs(regs) // Fill values for all registers. + for { + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + }, &si); err == platform.ErrContextInterrupt { + continue // Retry. + } else if err != nil { + t.Errorf("application register check with partial restore got unexpected error: %v", err) + } + if err := testutil.CheckTestRegs(regs, false); err != nil { + t.Errorf("application register check with partial restore failed: %v", err) + } + break // Done. + } + return false + }) +} + +func TestRegistersFault(t *testing.T) { + applicationTest(t, true, testutil.AddrOfTwiddleRegsFault(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + testutil.SetTestRegs(regs) // Fill values for all registers. + for { + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + FullRestore: true, + }, &si); err == platform.ErrContextInterrupt { + continue // Retry. 
+ } else if err != platform.ErrContextSignal || si.Signo != int32(unix.SIGSEGV) { + t.Errorf("application register check with full restore got unexpected error: %v", err) + } + if err := testutil.CheckTestRegs(regs, true); err != nil { + t.Errorf("application register check with full restore failed: %v", err) + } + break // Done. + } + return false + }) +} + +func TestSegments(t *testing.T) { + applicationTest(t, true, testutil.AddrOfTwiddleSegments(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + testutil.SetTestSegments(regs) + for { + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + FullRestore: true, + }, &si); err == platform.ErrContextInterrupt { + continue // Retry. + } else if err != nil { + t.Errorf("application segment check with full restore got unexpected error: %v", err) + } + if err := testutil.CheckTestSegments(regs); err != nil { + t.Errorf("application segment check with full restore failed: %v", err) + } + break // Done. 
+ } + return false + }) +} + +func TestBounce(t *testing.T) { + applicationTest(t, true, testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + go func() { + time.Sleep(time.Millisecond) + c.BounceToKernel() + }() + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + }, &si); err != platform.ErrContextInterrupt { + t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) + } + return false + }) + applicationTest(t, true, testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + go func() { + time.Sleep(time.Millisecond) + c.BounceToKernel() + }() + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + FullRestore: true, + }, &si); err != platform.ErrContextInterrupt { + t.Errorf("application full restore: got %v, wanted %v", err, platform.ErrContextInterrupt) + } + return false + }) +} + +func TestBounceStress(t *testing.T) { + applicationTest(t, true, testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + randomSleep := func() { + // O(hundreds of microseconds) is appropriate to ensure + // different overlaps and different schedules. + if n := rand.Intn(1000); n > 100 { + time.Sleep(time.Duration(n) * time.Microsecond) + } + } + for i := 0; i < 1000; i++ { + // Start an asynchronously executing goroutine that + // calls Bounce at pseudo-random point in time. + // This should wind up calling Bounce when the + // kernel is in various stages of the switch. 
+ go func() { + randomSleep() + c.BounceToKernel() + }() + randomSleep() + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + }, &si); err != platform.ErrContextInterrupt { + t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) + } + c.unlock() + randomSleep() + c.lock() + } + return false + }) +} + +func TestInvalidate(t *testing.T) { + var data uintptr // Used below. + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + testutil.SetTouchTarget(regs, &data) // Read legitimate value. + for { + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + }, &si); err == platform.ErrContextInterrupt { + continue // Retry. + } else if err != nil { + t.Errorf("application partial restore: got %v, wanted nil", err) + } + break // Done. + } + // Unmap the page containing data & invalidate. + pt.Unmap(hostarch.Addr(reflect.ValueOf(&data).Pointer() & ^uintptr(hostarch.PageSize-1)), hostarch.PageSize) + for { + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + Flush: true, + }, &si); err == platform.ErrContextInterrupt { + continue // Retry. + } else if err != platform.ErrContextSignal { + t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextSignal) + } + break // Success. + } + return false + }) +} + +// IsFault returns true iff the given signal represents a fault. 
+func IsFault(err error, si *linux.SignalInfo) bool { + return err == platform.ErrContextSignal && si.Signo == int32(unix.SIGSEGV) +} + +func TestEmptyAddressSpace(t *testing.T) { + applicationTest(t, false, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + }, &si); err == platform.ErrContextInterrupt { + return true // Retry. + } else if !IsFault(err, &si) { + t.Errorf("first fault with partial restore failed got %v", err) + t.Logf("registers: %#v", &regs) + } + return false + }) + applicationTest(t, false, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + FullRestore: true, + }, &si); err == platform.ErrContextInterrupt { + return true // Retry. + } else if !IsFault(err, &si) { + t.Errorf("first fault with full restore failed got %v", err) + t.Logf("registers: %#v", &regs) + } + return false + }) +} + +func TestRdtsc(t *testing.T) { + var i int // Iteration count. + slimvmTest(t, nil, func(c *vCPU) bool { + start := ktime.Rdtsc() + bluepill(c) + guest := ktime.Rdtsc() + redpill() + end := ktime.Rdtsc() + if start > guest || guest > end { + t.Errorf("inconsistent time: start=%d, guest=%d, end=%d", start, guest, end) + } + i++ + return i < 100 + }) +} + +func BenchmarkApplicationSyscall(b *testing.B) { + var ( + i int // Iteration includes machine.Get() / machine.Put(). + a int // Count for ErrContextInterrupt.
+ ) + applicationTest(b, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + }, &si); err == platform.ErrContextInterrupt { + a++ + return true // Ignore. + } else if err != nil { + b.Fatalf("benchmark failed: %v", err) + } + i++ + return i < b.N + }) + if a != 0 { + b.Logf("ErrContextInterrupt occurred %d times (in %d iterations).", a, a+i) + } +} + +func BenchmarkKernelSyscall(b *testing.B) { + // Note that the target passed here is irrelevant, we never execute SwitchToUser. + applicationTest(b, true, testutil.AddrOfGetpid(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + // iteration does not include machine.Get() / machine.Put(). + for i := 0; i < b.N; i++ { + bluepill(c) + testutil.Getpid() + } + return false + }) +} + +func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { + // see BenchmarkApplicationSyscall. + var ( + i int + a int + ) + applicationTest(b, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + var si linux.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: &dummyFPState, + PageTables: pt, + }, &si); err == platform.ErrContextInterrupt { + a++ + return true // Ignore. + } else if err != nil { + b.Fatalf("benchmark failed: %v", err) + } + // This will intentionally cause the world switch. By executing + // a host syscall here, we force the transition between guest + // and host mode. 
+ testutil.Getpid() + i++ + return i < b.N + }) + if a != 0 { + b.Logf("ErrContextInterrupt occurred %d times (in %d iterations).", a, a+i) + } +} + +func TestMain(m *testing.M) { + cpuid.Initialize() + dummyFPState = fpu.NewState() + os.Exit(m.Run()) +} diff --git a/pkg/sentry/platform/slimvm/testutil/BUILD b/pkg/sentry/platform/slimvm/testutil/BUILD new file mode 100644 index 0000000000..4c62937286 --- /dev/null +++ b/pkg/sentry/platform/slimvm/testutil/BUILD @@ -0,0 +1,18 @@ +load("//tools:defs.bzl", "go_library") + +package( + default_applicable_licenses = ["//:license"], + licenses = ["notice"], +) + +go_library( + name = "testutil", + testonly = 1, + srcs = [ + "testutil.go", + "testutil_amd64.go", + "testutil_amd64.s", + ], + visibility = ["//pkg/sentry/platform/slimvm:__pkg__"], + deps = ["//pkg/sentry/arch"], +) diff --git a/pkg/sentry/platform/slimvm/testutil/testutil.go b/pkg/sentry/platform/slimvm/testutil/testutil.go new file mode 100644 index 0000000000..5d4040ec05 --- /dev/null +++ b/pkg/sentry/platform/slimvm/testutil/testutil.go @@ -0,0 +1,90 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package testutil provides common assembly stubs for testing. +package testutil + +import ( + "fmt" + "strings" +) + +// Getpid executes a trivial system call. +func Getpid() + +// AddrOfGetpid returns the address of Getpid. 
+// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func AddrOfGetpid() uintptr + +// AddrOfTouch returns the address of a function that touches the value in the +// first register. +func AddrOfTouch() uintptr +func touch() + +// AddrOfSyscallLoop returns the address of a function that executes a syscall +// and loops. +func AddrOfSyscallLoop() uintptr +func syscallLoop() + +// AddrOfSpinLoop returns the address of a function that spins on the CPU. +func AddrOfSpinLoop() uintptr +func spinLoop() + +// AddrOfHaltLoop returns the address of a function that immediately halts and +// loops. +func AddrOfHaltLoop() uintptr +func haltLoop() + +// AddrOfTwiddleRegsFault returns the address of a function that twiddles +// registers then faults. +func AddrOfTwiddleRegsFault() uintptr +func twiddleRegsFault() + +// AddrOfTwiddleRegsSyscall returns the address of a function that twiddles +// registers then executes a syscall. +func AddrOfTwiddleRegsSyscall() uintptr +func twiddleRegsSyscall() + +// FloatingPointWorks is a floating point test. +// +// It returns true or false. +func FloatingPointWorks() bool + +// RegisterMismatchError is used for checking registers. +type RegisterMismatchError []string + +// Error returns a human-readable error. +func (r RegisterMismatchError) Error() string { + return strings.Join([]string(r), ";") +} + +// addRegisterMisatch allows simple chaining of register mismatches. +func addRegisterMismatch(err error, reg string, got, expected any) error { + errStr := fmt.Sprintf("%s got %08x, expected %08x", reg, got, expected) + switch r := err.(type) { + case nil: + // Return a new register mismatch. + return RegisterMismatchError{errStr} + case RegisterMismatchError: + // Append the error. + r = append(r, errStr) + return r + default: + // Leave as is. 
+ return err + } +} diff --git a/pkg/sentry/platform/slimvm/testutil/testutil_amd64.go b/pkg/sentry/platform/slimvm/testutil/testutil_amd64.go new file mode 100644 index 0000000000..38998deb8e --- /dev/null +++ b/pkg/sentry/platform/slimvm/testutil/testutil_amd64.go @@ -0,0 +1,142 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build amd64 +// +build amd64 + +package testutil + +import ( + "reflect" + + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +// AddrOfTwiddleSegments returns the address of a function that reads segments +// into known registers. +func AddrOfTwiddleSegments() uintptr +func twiddleSegments() + +// SetTestTarget sets the rip appropriately. +func SetTestTarget(regs *arch.Registers, fn uintptr) { + regs.Rip = uint64(fn) +} + +// SetTouchTarget sets rax appropriately. +func SetTouchTarget(regs *arch.Registers, target *uintptr) { + if target != nil { + regs.Rax = uint64(reflect.ValueOf(target).Pointer()) + } else { + regs.Rax = 0 + } +} + +// RewindSyscall rewinds a syscall RIP. +func RewindSyscall(regs *arch.Registers) { + regs.Rip -= 2 +} + +// SetTestRegs initializes registers to known values. 
+func SetTestRegs(regs *arch.Registers) { + regs.R15 = 0x15 + regs.R14 = 0x14 + regs.R13 = 0x13 + regs.R12 = 0x12 + regs.Rbp = 0xb9 + regs.Rbx = 0xb4 + regs.R11 = 0x11 + regs.R10 = 0x10 + regs.R9 = 0x09 + regs.R8 = 0x08 + regs.Rax = 0x44 + regs.Rcx = 0xc4 + regs.Rdx = 0xd4 + regs.Rsi = 0x51 + regs.Rdi = 0xd1 + regs.Rsp = 0x59 +} + +// CheckTestRegs checks that registers were twiddled per TwiddleRegs. +func CheckTestRegs(regs *arch.Registers, full bool) (err error) { + if need := ^uint64(0x15); regs.R15 != need { + err = addRegisterMismatch(err, "R15", regs.R15, need) + } + if need := ^uint64(0x14); regs.R14 != need { + err = addRegisterMismatch(err, "R14", regs.R14, need) + } + if need := ^uint64(0x13); regs.R13 != need { + err = addRegisterMismatch(err, "R13", regs.R13, need) + } + if need := ^uint64(0x12); regs.R12 != need { + err = addRegisterMismatch(err, "R12", regs.R12, need) + } + if need := ^uint64(0xb9); regs.Rbp != need { + err = addRegisterMismatch(err, "Rbp", regs.Rbp, need) + } + if need := ^uint64(0xb4); regs.Rbx != need { + err = addRegisterMismatch(err, "Rbx", regs.Rbx, need) + } + if need := ^uint64(0x10); regs.R10 != need { + err = addRegisterMismatch(err, "R10", regs.R10, need) + } + if need := ^uint64(0x09); regs.R9 != need { + err = addRegisterMismatch(err, "R9", regs.R9, need) + } + if need := ^uint64(0x08); regs.R8 != need { + err = addRegisterMismatch(err, "R8", regs.R8, need) + } + if need := ^uint64(0x44); regs.Rax != need { + err = addRegisterMismatch(err, "Rax", regs.Rax, need) + } + if need := ^uint64(0xd4); regs.Rdx != need { + err = addRegisterMismatch(err, "Rdx", regs.Rdx, need) + } + if need := ^uint64(0x51); regs.Rsi != need { + err = addRegisterMismatch(err, "Rsi", regs.Rsi, need) + } + if need := ^uint64(0xd1); regs.Rdi != need { + err = addRegisterMismatch(err, "Rdi", regs.Rdi, need) + } + if need := ^uint64(0x59); regs.Rsp != need { + err = addRegisterMismatch(err, "Rsp", regs.Rsp, need) + } + // Rcx & R11 are ignored if !full 
is set. + if need := ^uint64(0x11); full && regs.R11 != need { + err = addRegisterMismatch(err, "R11", regs.R11, need) + } + if need := ^uint64(0xc4); full && regs.Rcx != need { + err = addRegisterMismatch(err, "Rcx", regs.Rcx, need) + } + return +} + +var fsData uint64 = 0x55 +var gsData uint64 = 0x85 + +// SetTestSegments initializes segments to known values. +func SetTestSegments(regs *arch.Registers) { + regs.Fs_base = uint64(reflect.ValueOf(&fsData).Pointer()) + regs.Gs_base = uint64(reflect.ValueOf(&gsData).Pointer()) +} + +// CheckTestSegments checks that Rax and Rbx hold the fsData and gsData values. +func CheckTestSegments(regs *arch.Registers) (err error) { + if regs.Rax != fsData { + err = addRegisterMismatch(err, "Rax", regs.Rax, fsData) + } + if regs.Rbx != gsData { + err = addRegisterMismatch(err, "Rbx", regs.Rbx, gsData) + } + return +} diff --git a/pkg/sentry/platform/slimvm/testutil/testutil_amd64.s b/pkg/sentry/platform/slimvm/testutil/testutil_amd64.s new file mode 100644 index 0000000000..834fee1b5a --- /dev/null +++ b/pkg/sentry/platform/slimvm/testutil/testutil_amd64.s @@ -0,0 +1,135 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +// testutil_amd64.s provides AMD64 test functions. 
+ +#include "funcdata.h" +#include "textflag.h" + +TEXT ·Getpid(SB),NOSPLIT|NOFRAME,$0 + NO_LOCAL_POINTERS + MOVQ $39, AX // getpid + SYSCALL + RET + +// func AddrOfGetpid() uintptr +TEXT ·AddrOfGetpid(SB), $0-8 + MOVQ $·Getpid(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·touch(SB),NOSPLIT|NOFRAME,$0 +start: + MOVQ 0(AX), BX // deref AX + MOVQ $39, AX // getpid + SYSCALL + JMP start + +// func AddrOfTouch() uintptr +TEXT ·AddrOfTouch(SB), $0-8 + MOVQ $·touch(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·syscallLoop(SB),NOSPLIT|NOFRAME,$0 +start: + SYSCALL + JMP start + +// func AddrOfSyscallLoop() uintptr +TEXT ·AddrOfSyscallLoop(SB), $0-8 + MOVQ $·syscallLoop(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·spinLoop(SB),NOSPLIT|NOFRAME,$0 +start: + JMP start + +// func AddrOfSpinLoop() uintptr +TEXT ·AddrOfSpinLoop(SB), $0-8 + MOVQ $·spinLoop(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·FloatingPointWorks(SB),NOSPLIT|NOFRAME,$0 + NO_LOCAL_POINTERS + MOVQ $1, AX + MOVQ AX, X0 + MOVQ $39, AX // getpid + SYSCALL + MOVQ X0, AX + CMPQ AX, $1 + SETEQ ret+0(FP) + RET + +#define TWIDDLE_REGS() \ + NOTQ R15; \ + NOTQ R14; \ + NOTQ R13; \ + NOTQ R12; \ + NOTQ BP; \ + NOTQ BX; \ + NOTQ R11; \ + NOTQ R10; \ + NOTQ R9; \ + NOTQ R8; \ + NOTQ AX; \ + NOTQ CX; \ + NOTQ DX; \ + NOTQ SI; \ + NOTQ DI; \ + NOTQ SP; + +TEXT ·twiddleRegsSyscall(SB),NOSPLIT|NOFRAME,$0 + TWIDDLE_REGS() + SYSCALL + RET // never reached + +// func AddrOfTwiddleRegsSyscall() uintptr +TEXT ·AddrOfTwiddleRegsSyscall(SB), $0-8 + MOVQ $·twiddleRegsSyscall(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·twiddleRegsFault(SB),NOSPLIT|NOFRAME,$0 + TWIDDLE_REGS() + JMP AX // must fault + RET // never reached + +// func AddrOfTwiddleRegsFault() uintptr +TEXT ·AddrOfTwiddleRegsFault(SB), $0-8 + MOVQ $·twiddleRegsFault(SB), AX + MOVQ AX, ret+0(FP) + RET + +#define READ_FS() BYTE $0x64; BYTE $0x48; BYTE $0x8b; BYTE $0x00; +#define READ_GS() BYTE $0x65; BYTE $0x48; BYTE $0x8b; BYTE $0x00; + +TEXT 
·twiddleSegments(SB),NOSPLIT|NOFRAME,$0 + MOVQ $0x0, AX + READ_GS() + MOVQ AX, BX + MOVQ $0x0, AX + READ_FS() + SYSCALL + RET // never reached + +// func AddrOfTwiddleSegments() uintptr +TEXT ·AddrOfTwiddleSegments(SB), $0-8 + MOVQ $·twiddleSegments(SB), AX + MOVQ AX, ret+0(FP) + RET diff --git a/pkg/sentry/platform/slimvm/thread_control.go b/pkg/sentry/platform/slimvm/thread_control.go new file mode 100644 index 0000000000..cde2068745 --- /dev/null +++ b/pkg/sentry/platform/slimvm/thread_control.go @@ -0,0 +1,71 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +import ( + "runtime" + "sync" + "time" + + "gvisor.dev/gvisor/pkg/log" +) + +// TODO: make thread reclaim configurable. +const ( + ReclaimPeriod = 5 + MaxThreads = uint32(256) +) + +func reclaimThreads(m *machine, max uint32, n int) { + log.Infof("Exceed the max thread limit (%d), will reclaim %d.", max, n) + + wg := sync.WaitGroup{} + wg.Add(n) + for i := 0; i < n; i++ { + go func() { + runtime.LockOSThread() + m.PutVCPU() + wg.Done() + + // The M exits when this goroutine exits while still locked to it; + // see https://golang.org/pkg/runtime/#LockOSThread. + }() + } + wg.Wait() +} + +const defaultPeriod = 5 // seconds + +// StartReclaimDaemon starts a goroutine that periodically checks for extra Go Ms. 
+func StartReclaimDaemon(m *machine) { + go func() { + for { + period := ReclaimPeriod + if period == 0 { + period = defaultPeriod + } + time.Sleep(time.Duration(period) * time.Second) + + maxThreads := MaxThreads + curThreads, _ := runtime.ThreadCreateProfile(nil) + + extra := curThreads - int(maxThreads) + if extra > 0 { + m.dumpVCPUStats() + reclaimThreads(m, maxThreads, extra) + } + } + }() +} diff --git a/pkg/sentry/platform/slimvm/virtual_map.go b/pkg/sentry/platform/slimvm/virtual_map.go new file mode 100644 index 0000000000..d5792b715b --- /dev/null +++ b/pkg/sentry/platform/slimvm/virtual_map.go @@ -0,0 +1,99 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +import ( + "bufio" + "fmt" + "io" + "os" + "regexp" + "strconv" + + "gvisor.dev/gvisor/pkg/hostarch" +) + +type virtualRegion struct { + region + accessType hostarch.AccessType + shared bool + offset uintptr + filename string +} + +// mapsLine matches a single line from /proc/PID/maps. +var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2,3}:[0-9a-f]{2,} [0-9]+\\s+(.*)") + +// applyVirtualRegions parses the process maps file. +// +// Unlike mappedRegions, these are not consistent over time. +func applyVirtualRegions(fn func(vr virtualRegion)) error { + // Open /proc/self/maps. 
+ f, err := os.Open("/proc/self/maps") + if err != nil { + return err + } + defer f.Close() + + // Parse all entries. + r := bufio.NewReader(f) + for { + b, err := r.ReadBytes('\n') + if len(b) > 0 { + m := mapsLine.FindSubmatch(b) + if m == nil { + // This should not happen: kernel bug? + return fmt.Errorf("badly formed line: %v", string(b)) + } + start, err := strconv.ParseUint(string(m[1]), 16, 64) + if err != nil { + return fmt.Errorf("bad start address: %v", string(b)) + } + end, err := strconv.ParseUint(string(m[2]), 16, 64) + if err != nil { + return fmt.Errorf("bad end address: %v", string(b)) + } + read := m[3][0] == 'r' + write := m[3][1] == 'w' + execute := m[3][2] == 'x' + shared := m[3][3] == 's' + offset, err := strconv.ParseUint(string(m[4]), 16, 64) + if err != nil { + return fmt.Errorf("bad offset: %v", string(b)) + } + fn(virtualRegion{ + region: region{ + virtual: uintptr(start), + length: uintptr(end - start), + }, + accessType: hostarch.AccessType{ + Read: read, + Write: write, + Execute: execute, + }, + shared: shared, + offset: uintptr(offset), + filename: string(m[5]), + }) + } + if err == io.EOF { + break + } else if err != nil { + return err + } + } + + return nil +} diff --git a/pkg/sentry/platform/slimvm/virtual_map_test.go b/pkg/sentry/platform/slimvm/virtual_map_test.go new file mode 100644 index 0000000000..c8df3af257 --- /dev/null +++ b/pkg/sentry/platform/slimvm/virtual_map_test.go @@ -0,0 +1,93 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package slimvm + +import ( + "testing" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/hostarch" +) + +type checker struct { + ok bool + accessType hostarch.AccessType +} + +func (c *checker) Containing(addr uintptr) func(virtualRegion) { + c.ok = false // Reset for below calls. + return func(vr virtualRegion) { + if vr.virtual <= addr && addr < vr.virtual+vr.length { + c.ok = true + c.accessType = vr.accessType + } + } +} + +func TestParseMaps(t *testing.T) { + c := new(checker) + + // Simple test. + if err := applyVirtualRegions(c.Containing(0)); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // MMap a new page. + addr, _, errno := unix.RawSyscall6( + unix.SYS_MMAP, 0, hostarch.PageSize, + unix.PROT_READ|unix.PROT_WRITE, + unix.MAP_ANONYMOUS|unix.MAP_PRIVATE, 0, 0) + if errno != 0 { + t.Fatalf("unexpected map error: %v", errno) + } + + // Re-parse maps. + if err := applyVirtualRegions(c.Containing(addr)); err != nil { + unix.RawSyscall(unix.SYS_MUNMAP, addr, hostarch.PageSize, 0) + t.Fatalf("unexpected error: %v", err) + } + + // Assert that it now does contain the region. + if !c.ok { + unix.RawSyscall(unix.SYS_MUNMAP, addr, hostarch.PageSize, 0) + t.Fatalf("updated map does not contain 0x%08x, expected true", addr) + } + + // Map the region as PROT_NONE. + newAddr, _, errno := unix.RawSyscall6( + unix.SYS_MMAP, addr, hostarch.PageSize, + unix.PROT_NONE, + unix.MAP_ANONYMOUS|unix.MAP_FIXED|unix.MAP_PRIVATE, 0, 0) + if errno != 0 { + t.Fatalf("unexpected map error: %v", errno) + } + if newAddr != addr { + t.Fatalf("unable to remap address: got 0x%08x, wanted 0x%08x", newAddr, addr) + } + + // Re-parse maps. 
+ if err := applyVirtualRegions(c.Containing(addr)); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !c.ok { + t.Fatalf("final map does not contain 0x%08x, expected true", addr) + } + if c.accessType.Read || c.accessType.Write || c.accessType.Execute { + t.Fatalf("final map has incorrect permissions for 0x%08x", addr) + } + + // Unmap the region. + unix.RawSyscall(unix.SYS_MUNMAP, addr, hostarch.PageSize, 0) +} diff --git a/pkg/spinlock/BUILD b/pkg/spinlock/BUILD new file mode 100644 index 0000000000..730b072f87 --- /dev/null +++ b/pkg/spinlock/BUILD @@ -0,0 +1,11 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "spinlock", + srcs = [ + "spinlock.go", + ], + visibility = ["//visibility:public"], +) diff --git a/pkg/spinlock/spinlock.go b/pkg/spinlock/spinlock.go new file mode 100644 index 0000000000..bfd99639e7 --- /dev/null +++ b/pkg/spinlock/spinlock.go @@ -0,0 +1,51 @@ +// Copyright 2026 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package spinlock + +import ( + "runtime" + "sync/atomic" +) + +// A Spinlock is a spin exclusion lock, just like kernel spinlock. 
+// value: +// +// 0, unlocked +// 1, locked +type Spinlock struct { + state uint32 +} + +//go:nosplit +func (s *Spinlock) Lock() { + for !atomic.CompareAndSwapUint32(&s.state, 0, 1) { + } +} + +//go:nosplit +func (s *Spinlock) LockYield() { + count := 1000 + for !atomic.CompareAndSwapUint32(&s.state, 0, 1) { + if count--; count == 0 { + runtime.Gosched() + count = 1000 + } + } +} + +//go:nosplit +func (s *Spinlock) Unlock() { + atomic.StoreUint32(&s.state, 0) +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index bf632f4c7b..e6b07426d9 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -607,7 +607,7 @@ func New(args Args) (*Loader, error) { } // Create kernel and platform. - p, err := createPlatform(args.Conf, args.NumCPU, args.Device) + p, err := createPlatform(args.Conf, args.NumCPU, args.Device, args.ID) if err != nil { return nil, fmt.Errorf("creating platform: %w", err) } @@ -904,7 +904,7 @@ func (l *Loader) Destroy() { refs.OnExit() } -func createPlatform(conf *config.Config, numCPU int, deviceFile *fd.FD) (platform.Platform, error) { +func createPlatform(conf *config.Config, numCPU int, deviceFile *fd.FD, sandboxID string) (platform.Platform, error) { platformName := conf.Platform p, err := platform.Lookup(conf.Platform) if err != nil { @@ -917,6 +917,7 @@ func createPlatform(conf *config.Config, numCPU int, deviceFile *fd.FD) (platfor DisableSyscallPatching: platformName == "systrap" && conf.SystrapDisableSyscallPatching, ApplicationCores: numCPU, UseCPUNums: platformName == "kvm" && conf.UseCPUNums, + SandboxID: sandboxID, }) } diff --git a/runsc/boot/restore.go b/runsc/boot/restore.go index dd048d7c0a..83e08b4a10 100644 --- a/runsc/boot/restore.go +++ b/runsc/boot/restore.go @@ -211,7 +211,7 @@ func (r *restorer) restore(l *Loader) error { // old kernel is released. 
l.k.RootNetworkNamespace().ResetStack() - p, err := createPlatform(l.root.conf, l.root.applicationCores, r.deviceFile) + p, err := createPlatform(l.root.conf, l.root.applicationCores, r.deviceFile, l.sandboxID) if err != nil { return fmt.Errorf("creating platform: %v", err) } diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 311c027725..5a4b42e704 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -1046,7 +1046,7 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn // TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff // isn't set. - if conf.Platform == "kvm" { + if conf.Platform == "kvm" || conf.Platform == "slimvm" { cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1") } diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc index b82d632994..1e7f2c842e 100644 --- a/test/syscalls/linux/itimer.cc +++ b/test/syscalls/linux/itimer.cc @@ -276,7 +276,7 @@ TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive) { // many" signals are delivered to the thread group leader, so these tests are // flaky on these platforms. const auto gvisor_platform = GvisorPlatform(); - SKIP_IF(gvisor_platform == Platform::kPtrace); + SKIP_IF(gvisor_platform == Platform::kPtrace || gvisor_platform == Platform::kSlimVM); pid_t child; int execve_errno; @@ -301,7 +301,7 @@ TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive) { TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyIdle) { // See comment in DeliversSIGPROFToThreadsRoughlyFairlyActive. 
const auto gvisor_platform = GvisorPlatform(); - SKIP_IF(gvisor_platform == Platform::kPtrace); + SKIP_IF(gvisor_platform == Platform::kPtrace || gvisor_platform == Platform::kSlimVM); pid_t child; int execve_errno; diff --git a/test/util/test_util.h b/test/util/test_util.h index adb84d31e3..dc01e91a91 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -226,6 +226,7 @@ namespace Platform { constexpr char kNative[] = "native"; constexpr char kPtrace[] = "ptrace"; constexpr char kKVM[] = "kvm"; +constexpr char kSlimVM[] = "slimvm"; constexpr char kFuchsia[] = "fuchsia"; constexpr char kStarnix[] = "starnix"; constexpr char kSystrap[] = "systrap"; diff --git a/tools/bazeldefs/platforms.bzl b/tools/bazeldefs/platforms.bzl index 0b295b2d4c..f866798ef7 100644 --- a/tools/bazeldefs/platforms.bzl +++ b/tools/bazeldefs/platforms.bzl @@ -4,6 +4,7 @@ platforms = { "ptrace": [], "kvm": [], + "slimvm": [], "systrap": [], } @@ -39,6 +40,13 @@ platform_capabilities = { _CAPABILITY_INT3: False, _CAPABILITY_VSYSCALL: True, }, + "slimvm": { + _CAPABILITY_32BIT: False, + _CAPABILITY_ALIGNMENT_CHECK: True, + _CAPABILITY_MULTIPROCESS: True, + _CAPABILITY_INT3: False, + _CAPABILITY_VSYSCALL: True, + }, } default_platform = "systrap"