Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pkg/hostarch/hostarch_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,15 @@ const (
// HugePageSize is the system huge page size.
HugePageSize = 1 << HugePageShift

// JumboPageSize is the 1GB jumbo page size.
JumboPageSize = 1 << JumboPageShift

// CacheLineSize is the size of the cache line.
CacheLineSize = 1 << CacheLineShift

// JumboPageShift is the binary log of the jumbo page size (1GB).
JumboPageShift = 30

// CacheLineShift is the binary log of the cache line size.
CacheLineShift = 6
)
Expand Down
6 changes: 6 additions & 0 deletions pkg/hostarch/hostarch_x86.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ const (
// HugePageSize is the system huge page size.
HugePageSize = 1 << HugePageShift

// JumboPageSize is the 1GB jumbo page size.
JumboPageSize = 1 << JumboPageShift

// CacheLineSize is the size of the cache line.
CacheLineSize = 1 << CacheLineShift

Expand All @@ -35,6 +38,9 @@ const (
// HugePageShift is the binary log of the system huge page size.
HugePageShift = 21

// JumboPageShift is the binary log of the jumbo page size (1GB).
JumboPageShift = 30

// CacheLineShift is the binary log of the cache line size.
CacheLineShift = 6
)
Expand Down
1 change: 1 addition & 0 deletions pkg/hostarch/sizes_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ const (
PageMask = PageSize - 1
HugePageMask = HugePageSize - 1
CacheLineMask = CacheLineSize - 1
// JumboPageMask clears the offset bits within a jumbo page. Unlike PageMask,
// HugePageMask and CacheLineMask above, which are "size - 1" offset masks,
// this is the inverted form and is explicitly typed uintptr, so
// addr & JumboPageMask rounds addr down to a jumbo page boundary instead of
// extracting the offset.
JumboPageMask = ^uintptr(JumboPageSize - 1)
)

type bytecount interface {
Expand Down
1 change: 1 addition & 0 deletions pkg/ring0/aarch64.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,5 +102,6 @@ const (
// Generic Vector names for the EL0 exception vectors they alias.
const (
// Syscall is an alias for El0SyncSVC.
Syscall Vector = El0SyncSVC
// PageFault is an alias for El0SyncDa.
PageFault Vector = El0SyncDa
// OOMException is an alias for El0Fiq; the El0_fiq entry point in
// entry_arm64.s reports it through EXCEPTION_EL0(OOMException).
OOMException Vector = El0Fiq
// VirtualizationException is an alias for El0ErrBounce.
VirtualizationException Vector = El0ErrBounce
)
12 changes: 12 additions & 0 deletions pkg/ring0/defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch/fpu"
)

// CPU vendor identifiers. CPUVendor holds one of these values.
const (
	// CPUIntel identifies an Intel CPU.
	CPUIntel uint64 = 0

	// CPUAMD identifies an AMD (or compatible) CPU.
	CPUAMD uint64 = 1
)

// CPUVendor is the vendor of the CPU in use, expressed as one of the CPU*
// constants above. Its zero value is CPUIntel. The amd64 entry assembly
// compares it against CPUIntel to choose between the VMCALL and VMMCALL
// instruction encodings.
var CPUVendor uint64

// Kernel is a global kernel object.
//
// This contains global state, shared by multiple CPUs.
Expand Down
11 changes: 11 additions & 0 deletions pkg/ring0/defs_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ type kernelEntry struct {
// kernelCR3 is the cr3 used for sentry kernel.
kernelCR3 uintptr

// enableVMCALL is nonzero when syscalls reaching the kernel entry path
// should be forwarded with the VMCALL macro instead of being handled in
// place. It is set and cleared by CPU.EnableVMCALL and CPU.DisableVMCALL
// and read by the entry assembly at offset ENTRY_ENABLE_VMCALL.
enableVMCALL uint64

// gdt is the CPU's descriptor table.
gdt descriptorTable

Expand Down Expand Up @@ -180,6 +183,14 @@ func (c *CPU) FaultAddr() uintptr {
return c.faultAddr
}

// EnableVMCALL sets this CPU's enableVMCALL flag to 1. While the flag is
// nonzero, the amd64 syscall entry path forwards every syscall other than
// SyscallRedPill, SyscallExit and SyscallExitGroup through the VMCALL macro.
func (c *CPU) EnableVMCALL() {
c.enableVMCALL = 1
}

// DisableVMCALL clears this CPU's enableVMCALL flag. With the flag at zero,
// the amd64 syscall entry path branches to hr3_do_syscall for every syscall
// and never issues the VMCALL macro.
func (c *CPU) DisableVMCALL() {
c.enableVMCALL = 0
}

// SwitchArchOpts are embedded in SwitchOpts.
type SwitchArchOpts struct {
// UserPCID indicates that the application PCID to be used on switch,
Expand Down
3 changes: 3 additions & 0 deletions pkg/ring0/entry_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ func machineCheck()
func simdFloatingPointException()
func virtualizationException()
func securityException()
func oomException()
func syscallInt80()

// These return the start addresses of the functions above.
Expand Down Expand Up @@ -145,6 +146,7 @@ func addrOfMachineCheck() uintptr
func addrOfSimdFloatingPointException() uintptr
func addrOfVirtualizationException() uintptr
func addrOfSecurityException() uintptr
func addrOfOOMException() uintptr
func addrOfSyscallInt80() uintptr

// Exception handler index.
Expand All @@ -170,5 +172,6 @@ var handlers = map[Vector]uintptr{
SIMDFloatingPointException: addrOfSimdFloatingPointException(),
VirtualizationException: addrOfVirtualizationException(),
SecurityException: addrOfSecurityException(),
OOMException: addrOfOOMException(),
SyscallInt80: addrOfSyscallInt80(),
}
57 changes: 57 additions & 0 deletions pkg/ring0/entry_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#define ENTRY_STACK_TOP 264 // +checkoffset . kernelEntry.stackTop
#define ENTRY_CPU_SELF 272 // +checkoffset . kernelEntry.cpuSelf
#define ENTRY_KERNEL_CR3 280 // +checkoffset . kernelEntry.kernelCR3
#define ENTRY_ENABLE_VMCALL 288 // +checkoffset . kernelEntry.enableVMCALL

// Bits.
#define _RFLAGS_IF 512 // +checkconst . _RFLAGS_IF
Expand Down Expand Up @@ -61,9 +62,16 @@
#define SIMDFloatingPointException 19 // +checkconst . SIMDFloatingPointException
#define VirtualizationException 20 // +checkconst . VirtualizationException
#define SecurityException 30 // +checkconst . SecurityException
#define OOMException 32 // +checkconst . OOMException
#define SyscallInt80 128 // +checkconst . SyscallInt80
#define Syscall 256 // +checkconst . Syscall

#define SyscallExit 60 // +checkconst . SyscallExit
#define SyscallExitGroup 231 // +checkconst . SyscallExitGroup
#define SyscallRedPill 4294967295 // +checkconst . SyscallRedPill

#define CPUIntel 0 // +checkconst . CPUIntel

#define PTRACE_R15 0 // +checkoffset linux PtraceRegs.R15
#define PTRACE_R14 8 // +checkoffset linux PtraceRegs.R14
#define PTRACE_R13 16 // +checkoffset linux PtraceRegs.R13
Expand Down Expand Up @@ -160,6 +168,15 @@
#define LOAD_KERNEL_STACK(entry) \
MOVQ ENTRY_STACK_TOP(entry), SP;

// VMCALL issues a hypercall with the instruction matching ·CPUVendor:
// vmcall (0F 01 C1) when it equals CPUIntel, vmmcall (0F 01 D9) otherwise.
// Both instructions are emitted as raw bytes, and every BYTE directive counts
// as one instruction for the PC-relative jumps below. The leading CMPQ
// modifies the flags.
#define VMCALL() \
CMPQ ·CPUVendor(SB), $CPUIntel; \
JE 2(PC); \
JMP 5(PC); \ // not Intel: skip the 3 BYTEs of vmcall and the JMP after them, landing on vmmcall
BYTE $0x0F; BYTE $0x01; BYTE $0xC1; \
JMP 4(PC); \ // Intel path done: skip the 3 BYTEs of vmmcall
BYTE $0x0F; BYTE $0x01; BYTE $0xD9;

// ADDR_OF_FUNC defines a function named 'name' that returns the address of
// 'symbol'.
#define ADDR_OF_FUNC(name, symbol) \
Expand Down Expand Up @@ -488,6 +505,45 @@ sysenter_skip_gs:
RET

kernel:
// Syscalls from GR0 are handled in HR3 (hr3_do_syscall) when enableVMCALL
// is zero. Currently there are 2 use cases:
// 1. Using the KVM platform.
// 2. Upgrading the SlimVM platform, where this path is one way to return to
//    user mode (HR3) for the upgrade.
CMPQ ENTRY_ENABLE_VMCALL(GS), $0
JE hr3_do_syscall

// The red pill, exit and exit_group are always handled by hr3_do_syscall,
// even when enableVMCALL is nonzero.
CMPQ AX, $SyscallRedPill
JE hr3_do_syscall

CMPQ AX, $SyscallExit
JE hr3_do_syscall

CMPQ AX, $SyscallExitGroup
JE hr3_do_syscall

vmcall:
// Handle the syscall from GR0 in the host kernel by issuing a hypercall.
// Adapted from the "handle system calls from G0" part of __dune_syscall in
// libdune/dune.S.
// Load RFLAGS from R11.
// NOTE(review): the CMPQ instructions that follow (here and inside VMCALL)
// modify the arithmetic flags just loaded — confirm nothing relies on them.
PUSHQ R11
POPFQ

// arch_prctl(ARCH_SET_FS) takes a separate path so the FS base can be
// recorded after the hypercall.
CMPQ AX, $158 // arch_prctl syscall
JNE 3(PC) // not arch_prctl: skip the next two instructions and fall into VMCALL()
CMPQ DI, $0x1002 //ARCH_SET_FS
JE arch_prctl_vmcall

VMCALL()
JMP *CX // continue at the address held in CX

arch_prctl_vmcall:
VMCALL()
CMPQ AX, $0
JNE 2(PC) // nonzero AX: skip the FS base update
MOVQ SI, CPU_REGISTERS+PTRACE_FS_BASE(GS) // record SI as the saved FS base
JMP *CX

hr3_do_syscall:
// We can't restore the original stack, but we can access the registers
// in the CPU state directly. No need for temporary juggling.
MOVQ AX, ENTRY_SCRATCH0(GS)
Expand Down Expand Up @@ -705,4 +761,5 @@ EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB), ·addrOfMachineCheck(S
EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB), ·addrOfSimdFloatingPointException(SB))
EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB), ·addrOfVirtualizationException(SB))
EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB), ·addrOfSecurityException(SB))
EXCEPTION_WITH_ERROR(OOMException, ·oomException(SB), ·addrOfOOMException(SB))
EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB), ·addrOfSyscallInt80(SB))
6 changes: 5 additions & 1 deletion pkg/ring0/entry_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
#define El0ErrNMI 34 // +checkconst . El0ErrNMI
#define PageFault 23 // +checkconst . PageFault
#define Syscall 22 // +checkconst . Syscall
#define OOMException 10 // +checkconst . OOMException
#define VirtualizationException 35 // +checkconst . VirtualizationException

#define PTRACE_REGS 0 // +checkoffset linux PtraceRegs.Regs
Expand Down Expand Up @@ -765,7 +766,10 @@ TEXT ·El0_irq(SB),NOSPLIT,$0
B ·Shutdown(SB)

// El0_fiq is the El0 FIQ entry point. After KERNEL_ENTRY_FROM_EL0 it loads
// 0x8400000a into R8, issues HVC $0 and then reports OOMException through
// EXCEPTION_EL0. The meaning of the 0x8400000a identifier is defined by the
// hypervisor side and is not visible here — TODO confirm.
// NOTE(review): this hunk is listed as "5 additions & 1 deletion", so the
// `B ·Shutdown(SB)` line below looks like the pre-change body shown next to
// its replacement; if both were kept, everything after the branch would be
// unreachable.
TEXT ·El0_fiq(SB),NOSPLIT,$0
B ·Shutdown(SB)
KERNEL_ENTRY_FROM_EL0
MOVD $0x8400000a, R8
HVC $0
EXCEPTION_EL0(OOMException)

TEXT ·El0_error(SB),NOSPLIT,$0
KERNEL_ENTRY_FROM_EL0
Expand Down
27 changes: 27 additions & 0 deletions pkg/ring0/kernel_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -324,3 +324,30 @@ func startGo(c *CPU) {
func ReadCR2() uintptr {
return readCR2()
}

// PrefaultIDT reads one word from entry 0 and one word from entry
// _NR_INTERRUPTS-1 of the kernel's global IDT and returns their sum.
// Presumably the reads exist to touch the memory backing the table, and the
// sum is returned only so that the loads are kept — TODO confirm with callers.
//
//go:noinline
//go:nosplit
func (c *CPU) PrefaultIDT() uint32 {
return c.kernel.globalIDT[0].bits[0] + c.kernel.globalIDT[_NR_INTERRUPTS-1].bits[3]
}

// SetCPUIDFaulting turns CPUID faulting on or off.
//
// It reports whether the setting could be applied: false means the
// PLATFORM_INFO MSR does not advertise CPUID faulting and no MSR was written.
//
//go:nosplit
func SetCPUIDFaulting(on bool) bool {
	// Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support
	// for CPUID faulting.
	if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT == 0 {
		return false
	}

	// Faulting is controlled through the MISC_FEATURES MSR: start from the
	// current value with the trap bit cleared and set it again only when
	// faulting is requested.
	misc := rdmsr(_MSR_MISC_FEATURES) &^ _MISC_FEATURE_CPUID_TRAP
	if on {
		misc |= _MISC_FEATURE_CPUID_TRAP
	}
	wrmsr(_MSR_MISC_FEATURES, misc)
	return true
}
18 changes: 18 additions & 0 deletions pkg/ring0/lib_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,21 @@ func InitDefault() {
cpuid.Initialize()
Init(cpuid.HostFeatureSet())
}

// DisableLA57 makes ring0 behave as if the host CPU did not advertise 5-level
// paging. It clears hasLA57 so that CR4.LA57 stays 0 and, if the virtual
// address width exceeds 48 bits, clamps the address-space sizes to a 4-level
// layout (48-bit VA, 2^47 bytes of userspace).
//
// It must be called after Init/InitDefault and before any vCPU loads CR4 or
// any PageTables are created. Use it when the platform's hardware-
// virtualization layer cannot follow a 5-level page table walk (e.g. an EPT
// implementation limited to 4 levels), regardless of host CPUID.
//
// The pagetables package keeps its own la57Enabled gate behind a separate
// DisableLA57; this function does not change it.
func DisableLA57() {
	hasLA57 = false
	if VirtualAddressBits <= 48 {
		// Already at or below the 4-level width; nothing to clamp.
		return
	}

	VirtualAddressBits = 48
	UserspaceSize = uintptr(1) << (VirtualAddressBits - 1)

	top := UserspaceSize - 1
	pageOffsetBits := uintptr(hostarch.PageSize - 1)
	MaximumUserAddress = top &^ pageOffsetBits
	KernelStartAddress = ^uintptr(0) - top
}
1 change: 1 addition & 0 deletions pkg/ring0/pagetables/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ go_library(
visibility = [
"//pkg/ring0:__subpackages__",
"//pkg/sentry/platform/kvm:__subpackages__",
"//pkg/sentry/platform/slimvm:__subpackages__",
],
deps = [
"//pkg/cpuid",
Expand Down
14 changes: 14 additions & 0 deletions pkg/ring0/pagetables/pagetables.go
Original file line number Diff line number Diff line change
Expand Up @@ -331,3 +331,17 @@ func (p *PageTables) Lookup(addr hostarch.Addr, findFirst bool) (virtual hostarc
func (p *PageTables) MarkReadOnlyShared() {
p.readOnlyShared = true
}

// PrefaultRootTable reads the first entry of the root table and returns it,
// touching the root table page so that its physical page is mapped.
//
// The runtime allocator backs PTEs with plain Go heap pages (new(PTEs), no
// mlock / MAP_POPULATE / memfile pinning), so Linux can reclaim the root
// page under memory pressure. Touching it from sentry context right before
// SwitchToUser is intended to make the page resident by the time CR3 is
// loaded, avoiding rare host page faults that have been observed to manifest
// as vCPU bounce stalls (state=7, userExits stuck).
//
//go:nosplit
//go:noinline
func (p *PageTables) PrefaultRootTable() PTE {
return p.root[0]
}
16 changes: 15 additions & 1 deletion pkg/ring0/pagetables/pagetables_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,22 @@ var (
pgdShift = 39
pgdMask uintptr = 0x1ff << pgdShift
pgdSize uintptr = 1 << pgdShift

// la57Enabled gates whether InitArch promotes new PageTables to a
// 5-level layout when the host CPU advertises LA57. Defaults to true
// so existing behavior is unchanged; platforms whose hardware-
// virtualization layer cannot walk a 5-level page table call
// DisableLA57 once at startup to force 4-level.
la57Enabled = true
)

// DisableLA57 forces all subsequently-created PageTables to use a 4-level
// layout regardless of host CPUID. Must be called before any PageTables is
// created.
//
// It only clears la57Enabled, the flag InitArch checks before enabling large
// addresses; the write is a plain, unsynchronized store to a package
// variable.
func DisableLA57() {
la57Enabled = false
}

const (
pteShift = 12
pmdShift = 21
Expand Down Expand Up @@ -54,7 +68,7 @@ const (
//go:nosplit
func (p *PageTables) InitArch(allocator Allocator) {
featureSet := cpuid.HostFeatureSet()
if featureSet.HasFeature(cpuid.X86FeatureLA57) {
if la57Enabled && featureSet.HasFeature(cpuid.X86FeatureLA57) {
p.largeAddressesEnabled = true
lowerTop = 0x00FFFFFFFFFFFFFF
upperBottom = 0xFF00000000000000
Expand Down
22 changes: 18 additions & 4 deletions pkg/ring0/x86.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,16 @@ const (
_EFER_LMA = 0x400
_EFER_NX = 0x800

_MSR_STAR = 0xc0000081
_MSR_LSTAR = 0xc0000082
_MSR_CSTAR = 0xc0000083
_MSR_SYSCALL_MASK = 0xc0000084
_MSR_STAR = 0xc0000081
_MSR_LSTAR = 0xc0000082
_MSR_CSTAR = 0xc0000083
_MSR_SYSCALL_MASK = 0xc0000084
_MSR_PLATFORM_INFO = 0xce
_MSR_MISC_FEATURES = 0x140

_PLATFORM_INFO_CPUID_FAULT = 1 << 31

_MISC_FEATURE_CPUID_TRAP = 0x1
)

const (
Expand Down Expand Up @@ -130,6 +136,7 @@ const (
SIMDFloatingPointException
VirtualizationException
SecurityException = 0x1e
OOMException = 0x20
SyscallInt80 = 0x80
_NR_INTERRUPTS = 0x100
)
Expand All @@ -139,6 +146,13 @@ const (
Syscall Vector = _NR_INTERRUPTS
)

// System call numbers that the amd64 syscall entry path compares AX against
// before deciding how to handle a syscall.
const (
	// SyscallExit is the number of the exit system call.
	SyscallExit uint32 = 60

	// SyscallExitGroup is the number of the exit_group system call.
	SyscallExitGroup uint32 = 231

	// SyscallRedPill is the all-ones 32-bit value used as the red pill
	// system call number.
	SyscallRedPill uint32 = 0xffffffff
)

// Selector is a segment Selector.
type Selector uint16

Expand Down
Loading