From a8dc3fcaf1be25bc29f32382d2ada504b268e9c9 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Mon, 8 Feb 2021 08:09:20 +0100 Subject: [PATCH 01/10] Begin using sysretq in the system call handler. --- src/arch/x86_64/gdt.rs | 17 ++++++++------ src/arch/x86_64/idt.rs | 2 +- src/arch/x86_64/interrupt/handler.rs | 5 +++++ src/arch/x86_64/interrupt/syscall.rs | 33 +++++++++++++++++++++------- src/arch/x86_64/pti.rs | 4 ++-- 5 files changed, 43 insertions(+), 18 deletions(-) diff --git a/src/arch/x86_64/gdt.rs b/src/arch/x86_64/gdt.rs index c171314..7878451 100644 --- a/src/arch/x86_64/gdt.rs +++ b/src/arch/x86_64/gdt.rs @@ -14,11 +14,12 @@ pub const GDT_NULL: usize = 0; pub const GDT_KERNEL_CODE: usize = 1; pub const GDT_KERNEL_DATA: usize = 2; pub const GDT_KERNEL_TLS: usize = 3; -pub const GDT_USER_CODE: usize = 4; +pub const GDT_USER_CODE32_UNUSED: usize = 4; pub const GDT_USER_DATA: usize = 5; -pub const GDT_USER_TLS: usize = 6; -pub const GDT_TSS: usize = 7; -pub const GDT_TSS_HIGH: usize = 8; +pub const GDT_USER_CODE: usize = 6; +pub const GDT_USER_TLS: usize = 7; +pub const GDT_TSS: usize = 8; +pub const GDT_TSS_HIGH: usize = 9; pub const GDT_A_PRESENT: u8 = 1 << 7; pub const GDT_A_RING_0: u8 = 0 << 5; @@ -61,7 +62,7 @@ pub static mut GDTR: DescriptorTablePointer = DescriptorTable }; #[thread_local] -pub static mut GDT: [GdtEntry; 9] = [ +pub static mut GDT: [GdtEntry; 10] = [ // Null GdtEntry::new(0, 0, 0, 0), // Kernel code @@ -70,10 +71,12 @@ pub static mut GDT: [GdtEntry; 9] = [ GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE), // Kernel TLS GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE), - // User code - GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_LONG_MODE), + // Dummy 32-bit user code - apparently necessary for SYSEXIT. We restrict it to ring 0 anyway. + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE), // User data GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE), + // User (64-bit) code + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_LONG_MODE), // User TLS GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE), // TSS diff --git a/src/arch/x86_64/idt.rs b/src/arch/x86_64/idt.rs index bb0eb29..4a85aa8 100644 --- a/src/arch/x86_64/idt.rs +++ b/src/arch/x86_64/idt.rs @@ -285,6 +285,6 @@ impl IdtEntry { // A function to set the offset more easily pub fn set_func(&mut self, func: unsafe extern fn()) { self.set_flags(IdtFlags::PRESENT | IdtFlags::RING_0 | IdtFlags::INTERRUPT); - self.set_offset(8, func as usize); + self.set_offset((crate::gdt::GDT_KERNEL_CODE as u16) << 3, func as usize); } } diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs index 9e75a22..696be7e 100644 --- a/src/arch/x86_64/interrupt/handler.rs +++ b/src/arch/x86_64/interrupt/handler.rs @@ -75,6 +75,11 @@ impl IretRegisters { println!("RFLAG: {:>016X}", { self.rflags }); println!("CS: {:>016X}", { self.cs }); println!("RIP: {:>016X}", { self.rip }); + + if self.cs & 0b11 != 0b00 { + println!("RSP: {:>016X}", { self.rsp }); + println!("SS: {:>016X}", { self.ss }); + } } } diff --git a/src/arch/x86_64/interrupt/syscall.rs b/src/arch/x86_64/interrupt/syscall.rs index 9c5364e..b526263 100644 --- a/src/arch/x86_64/interrupt/syscall.rs +++ b/src/arch/x86_64/interrupt/syscall.rs @@ -8,7 +8,18 @@ use crate::{ use x86::msr; pub unsafe fn init() { - msr::wrmsr(msr::IA32_STAR, ((gdt::GDT_KERNEL_CODE as u64) << 3) << 32); + // IA32_STAR[31:0] are reserved. + + // The base selector of the two consecutive segments for kernel code and the immediately + // suceeding stack (data). + let syscall_cs_ss_base = (gdt::GDT_KERNEL_CODE as u16) << 3; + // The base selector of the three consecutive segments (of which two are used) for user code + // and user data. It points to a 32-bit code segment, which must be followed by a data segment + // (stack), and a 64-bit code segment. + let sysret_cs_ss_base = ((gdt::GDT_USER_CODE32_UNUSED as u16) << 3) | u16::from(gdt::GDT_A_RING_3); + let star_high = u32::from(syscall_cs_ss_base) | (u32::from(sysret_cs_ss_base) << 16); + + msr::wrmsr(msr::IA32_STAR, u64::from(star_high) << 32); msr::wrmsr(msr::IA32_LSTAR, syscall_instruction as u64); msr::wrmsr(msr::IA32_FMASK, 0x0300); // Clear trap flag and interrupt enable msr::wrmsr(msr::IA32_KERNEL_GSBASE, &gdt::TSS as *const _ as u64); @@ -52,15 +63,13 @@ function!(syscall_instruction => { // Yes, this is magic. No, you don't need to understand " swapgs // Set gs segment to TSS - mov gs:[28], rsp // Save userspace rsp - mov rsp, gs:[4] // Load kernel rsp - push 5 * 8 + 3 // Push userspace data segment + mov gs:[28], rsp // Save userspace stack pointer + mov rsp, gs:[4] // Load kernel stack pointer + push WORD PTR 5 * 8 + 3 // Push fake userspace SS (resembling iret frame) push QWORD PTR gs:[28] // Push userspace rsp - mov QWORD PTR gs:[28], 0 // Clear userspace rsp push r11 // Push rflags - push 4 * 8 + 3 // Push userspace code segment + push WORD PTR 6 * 8 + 3 // Push fake userspace CS (resembling iret frame) push rcx // Push userspace return pointer - swapgs // Restore gs ", // Push context registers @@ -85,7 +94,15 @@ function!(syscall_instruction => { pop_scratch!(), // Return - "iretq\n", + " + pop rcx // Pop userspace return pointer + add rsp, 2 + pop r11 // Pop rflags + add rsp, 10 // Pop SS and rsp + mov rsp, gs:[28] // Restore userspace stack pointer + swapgs // Restore gs from TSS to kernel data + sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS + ", }); interrupt_stack!(syscall, |stack| { diff --git a/src/arch/x86_64/pti.rs b/src/arch/x86_64/pti.rs index eb5841c..0f5cc07 100644 --- a/src/arch/x86_64/pti.rs +++ b/src/arch/x86_64/pti.rs @@ -58,7 +58,7 @@ pub unsafe fn map() { #[cfg(feature = "pti")] #[inline(always)] -pub unsafe fn unmap() { +pub unsafe extern "C" fn unmap() { // Switch to per-CPU stack switch_stack(PTI_CONTEXT_STACK, PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len()); @@ -83,4 +83,4 @@ pub unsafe fn map() {} #[cfg(not(feature = "pti"))] #[inline(always)] -pub unsafe fn unmap() {} +pub unsafe extern "C" fn unmap() {} From c913c3be804fb77625069a0e6cd8dbdde0bc3df6 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Mon, 8 Feb 2021 08:44:48 +0100 Subject: [PATCH 02/10] Use sysretq in usermode(). --- src/arch/x86_64/start.rs | 95 +++++++++++++++++++++++++--------------- src/context/signal.rs | 2 +- src/syscall/process.rs | 2 +- 3 files changed, 61 insertions(+), 38 deletions(-) diff --git a/src/arch/x86_64/start.rs b/src/arch/x86_64/start.rs index 7f28eaa..24874d8 100644 --- a/src/arch/x86_64/start.rs +++ b/src/arch/x86_64/start.rs @@ -240,13 +240,10 @@ pub unsafe extern fn kstart_ap(args_ptr: *const KernelArgsAp) -> ! { } #[naked] -pub unsafe fn usermode(ip: usize, sp: usize, arg: usize, singlestep: bool) -> ! { - let mut flags = FLAG_INTERRUPTS; - if singlestep { - flags |= FLAG_SINGLESTEP; - } - - asm!("push r10 +#[inline(never)] +// TODO: AbiCompatBool +pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _singlestep: u32) -> ! { + /*asm!("push r10 push r11 push r12 push r13 @@ -258,36 +255,62 @@ pub unsafe fn usermode(ip: usize, sp: usize, arg: usize, singlestep: bool) -> ! in("r13") (gdt::GDT_USER_CODE << 3 | 3), // Code segment in("r14") ip, // IP in("r15") arg, // Argument - ); + );*/ + // rdi, rsi, rdx, rcx + asm!( + " + mov rbx, {flag_interrupts} + test ecx, ecx + jz .after_singlestep_branch + or rbx, {flag_singlestep} - // Unmap kernel - pti::unmap(); + .after_singlestep_branch: + mov r12, rdi + mov r13, rsi + mov rdi, rdx + call {pti_unmap} - // Go to usermode - asm!("mov ds, r14d - mov es, r14d - mov fs, r15d - mov gs, r14d - xor rax, rax - xor rbx, rbx - xor rcx, rcx - xor rdx, rdx - xor rsi, rsi - xor rdi, rdi - xor rbp, rbp - xor r8, r8 - xor r9, r9 - xor r10, r10 - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - xor r15, r15 - fninit - pop rdi - iretq", - in("r14") (gdt::GDT_USER_DATA << 3 | 3), // Data segment - in("r15") (gdt::GDT_USER_TLS << 3 | 3), // TLS segment - options(noreturn), + // Go to usermode + mov r14, {user_data_seg_selector} + mov r15, {user_tls_seg_selector} + mov ds, r14d + mov es, r14d + mov fs, r15d + mov gs, r14d + + // Target RFLAGS + mov r11, rbx + // Target instruction pointer + mov rcx, r12 + // Target stack pointer + mov rsp, r13 + + xor rax, rax + xor rbx, rbx + // Don't zero rcx; it's used for `ip`. + xor rdx, rdx + // Don't zero rdi; it's used for `arg`. + xor rsi, rsi + xor rbp, rbp + // Don't zero rsp, obviously. + xor r8, r8 + xor r9, r9 + xor r10, r10 + // Don't zero r11; it's used for `rflags`. + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + + fninit + sysretq + ", + + flag_interrupts = const(FLAG_INTERRUPTS), + flag_singlestep = const(FLAG_SINGLESTEP), + pti_unmap = sym pti::unmap, + user_data_seg_selector = const(gdt::GDT_USER_DATA << 3 | 3), + user_tls_seg_selector = const(gdt::GDT_USER_TLS << 3 | 3), + options(noreturn), ); } diff --git a/src/context/signal.rs b/src/context/signal.rs index bc5add1..3fe1687 100644 --- a/src/context/signal.rs +++ b/src/context/signal.rs @@ -122,7 +122,7 @@ pub extern "C" fn signal_handler(sig: usize) { sp -= mem::size_of::(); *(sp as *mut usize) = restorer; - usermode(handler, sp, sig, singlestep); + usermode(handler, sp, sig, u32::from(singlestep)); } } diff --git a/src/syscall/process.rs b/src/syscall/process.rs index 995ead9..08d0407 100644 --- a/src/syscall/process.rs +++ b/src/syscall/process.rs @@ -845,7 +845,7 @@ fn fexec_noreturn( } // Go to usermode - unsafe { usermode(entry, sp, 0, singlestep) } + unsafe { usermode(entry, sp, 0, u32::from(singlestep)) } } pub fn fexec_kernel(fd: FileHandle, args: Box<[Box<[u8]>]>, vars: Box<[Box<[u8]>]>, name_override_opt: Option>, auxv: Option>) -> Result { From 5a638691e0d505013eb827beeb56c497c12fe0c6 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Mon, 8 Feb 2021 10:58:10 +0100 Subject: [PATCH 03/10] Treat GS as always pointing to TSS in kernel space. --- src/arch/x86_64/gdt.rs | 44 +++++++++++++++++++++----- src/arch/x86_64/interrupt/exception.rs | 2 ++ src/arch/x86_64/interrupt/handler.rs | 17 ++++++++++ src/arch/x86_64/interrupt/syscall.rs | 20 +++++++----- src/arch/x86_64/start.rs | 44 ++++++++++++-------------- 5 files changed, 87 insertions(+), 40 deletions(-) diff --git a/src/arch/x86_64/gdt.rs b/src/arch/x86_64/gdt.rs index 7878451..a2ad7d5 100644 --- a/src/arch/x86_64/gdt.rs +++ b/src/arch/x86_64/gdt.rs @@ -85,15 +85,39 @@ pub static mut GDT: [GdtEntry; 10] = [ GdtEntry::new(0, 0, 0, 0), ]; +#[repr(packed)] +pub struct TssWrapper { + base: TaskStateSegment, + _pad: u64, + _user_stack: u64, +} +impl core::ops::Deref for TssWrapper { + type Target = TaskStateSegment; + + fn deref(&self) -> &Self::Target { + &self.base + } +} +impl core::ops::DerefMut for TssWrapper { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.base + } +} + #[thread_local] -pub static mut TSS: TaskStateSegment = TaskStateSegment { - reserved: 0, - rsp: [0; 3], - reserved2: 0, - ist: [0; 7], - reserved3: 0, - reserved4: 0, - iomap_base: 0xFFFF +pub static mut TSS: TssWrapper = TssWrapper { + base: TaskStateSegment { + reserved: 0, + rsp: [0; 3], + reserved2: 0, + ist: [0; 7], + reserved3: 0, + reserved4: 0, + iomap_base: 0xFFFF + }, + _pad: 0_u64, + // Accessed only from assembly, at `gs:[0x70]` + _user_stack: 0_u64, }; pub unsafe fn set_tcb(pid: usize) { @@ -167,7 +191,11 @@ pub unsafe fn init_paging(tcb_offset: usize, stack_offset: usize) { segmentation::load_ds(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); segmentation::load_es(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); segmentation::load_fs(SegmentSelector::new(GDT_KERNEL_TLS as u16, Ring::Ring0)); + segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); + // Ensure that GS always points to the TSS segment in kernel space. + x86::msr::wrmsr(x86::msr::IA32_GS_BASE, &TSS as *const _ as usize as u64); + segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); // Load the task register diff --git a/src/arch/x86_64/interrupt/exception.rs b/src/arch/x86_64/interrupt/exception.rs index 18b4f06..9a1e526 100644 --- a/src/arch/x86_64/interrupt/exception.rs +++ b/src/arch/x86_64/interrupt/exception.rs @@ -41,6 +41,8 @@ interrupt_stack!(debug, |stack| { } }); +// TODO: Give NMI and double faults a separate stack, as they can trigger a triple fault due to the +// way swapgs is used. interrupt_stack!(non_maskable, |stack| { println!("Non-maskable interrupt"); stack.dump(); diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs index 696be7e..18a8f8a 100644 --- a/src/arch/x86_64/interrupt/handler.rs +++ b/src/arch/x86_64/interrupt/handler.rs @@ -309,6 +309,17 @@ macro_rules! pop_fs { " }; } +macro_rules! swapgs_if_ring3 { + () => { " + // Check whether the last two bits RSP+8 (code segment) are equal to zero. + test BYTE PTR [rsp + 8], 0x03 + // Skip the SWAPGS instruction if CS & 0b11 == 0b00. + jz 1f + swapgs + 1: + " } +} + #[macro_export] macro_rules! interrupt_stack { ($name:ident, |$stack:ident| $code:block) => { @@ -327,6 +338,7 @@ macro_rules! interrupt_stack { function!($name => { // Backup all userspace registers to stack + swapgs_if_ring3!(), "push rax\n", push_scratch!(), push_preserved!(), @@ -347,6 +359,7 @@ macro_rules! interrupt_stack { pop_preserved!(), pop_scratch!(), + swapgs_if_ring3!(), "iretq\n", }); } @@ -364,6 +377,7 @@ macro_rules! interrupt { function!($name => { // Backup all userspace registers to stack + swapgs_if_ring3!(), "push rax\n", push_scratch!(), push_fs!(), @@ -381,6 +395,7 @@ macro_rules! interrupt { pop_fs!(), pop_scratch!(), + swapgs_if_ring3!(), "iretq\n", }); } @@ -404,6 +419,7 @@ macro_rules! interrupt_error { } function!($name => { + swapgs_if_ring3!(), // Move rax into code's place, put code in last instead (to be // compatible with InterruptStack) "xchg [rsp], rax\n", @@ -434,6 +450,7 @@ macro_rules! interrupt_error { pop_preserved!(), pop_scratch!(), + swapgs_if_ring3!(), "iretq\n", }); } diff --git a/src/arch/x86_64/interrupt/syscall.rs b/src/arch/x86_64/interrupt/syscall.rs index b526263..cc84460 100644 --- a/src/arch/x86_64/interrupt/syscall.rs +++ b/src/arch/x86_64/interrupt/syscall.rs @@ -16,13 +16,16 @@ pub unsafe fn init() { // The base selector of the three consecutive segments (of which two are used) for user code // and user data. It points to a 32-bit code segment, which must be followed by a data segment // (stack), and a 64-bit code segment. - let sysret_cs_ss_base = ((gdt::GDT_USER_CODE32_UNUSED as u16) << 3) | u16::from(gdt::GDT_A_RING_3); + let sysret_cs_ss_base = ((gdt::GDT_USER_CODE32_UNUSED as u16) << 3) | 3; let star_high = u32::from(syscall_cs_ss_base) | (u32::from(sysret_cs_ss_base) << 16); msr::wrmsr(msr::IA32_STAR, u64::from(star_high) << 32); msr::wrmsr(msr::IA32_LSTAR, syscall_instruction as u64); msr::wrmsr(msr::IA32_FMASK, 0x0300); // Clear trap flag and interrupt enable - msr::wrmsr(msr::IA32_KERNEL_GSBASE, &gdt::TSS as *const _ as u64); + + // Inside kernel space, GS should _always_ point to the TSS. When leaving userspace, `swapgs` + // is called again, making the userspace GS always point to user data. + msr::wrmsr(msr::IA32_KERNEL_GSBASE, 0); let efer = msr::rdmsr(msr::IA32_EFER); msr::wrmsr(msr::IA32_EFER, efer | 1); @@ -63,10 +66,10 @@ function!(syscall_instruction => { // Yes, this is magic. No, you don't need to understand " swapgs // Set gs segment to TSS - mov gs:[28], rsp // Save userspace stack pointer + mov gs:[0x70], rsp // Save userspace stack pointer mov rsp, gs:[4] // Load kernel stack pointer push WORD PTR 5 * 8 + 3 // Push fake userspace SS (resembling iret frame) - push QWORD PTR gs:[28] // Push userspace rsp + push QWORD PTR gs:[0x70] // Push userspace rsp push r11 // Push rflags push WORD PTR 6 * 8 + 3 // Push fake userspace CS (resembling iret frame) push rcx // Push userspace return pointer @@ -96,11 +99,12 @@ function!(syscall_instruction => { // Return " pop rcx // Pop userspace return pointer - add rsp, 2 + add rsp, 2 // Pop CS pop r11 // Pop rflags - add rsp, 10 // Pop SS and rsp - mov rsp, gs:[28] // Restore userspace stack pointer - swapgs // Restore gs from TSS to kernel data + pop QWORD PTR gs:[0x70] // Pop userspace stack pointer + add rsp, 2 // Pop SS + mov rsp, gs:[0x70] // Restore userspace stack pointer + swapgs // Restore gs from TSS to user data sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS ", }); diff --git a/src/arch/x86_64/start.rs b/src/arch/x86_64/start.rs index 24874d8..d330082 100644 --- a/src/arch/x86_64/start.rs +++ b/src/arch/x86_64/start.rs @@ -243,19 +243,6 @@ pub unsafe extern fn kstart_ap(args_ptr: *const KernelArgsAp) -> ! { #[inline(never)] // TODO: AbiCompatBool pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _singlestep: u32) -> ! { - /*asm!("push r10 - push r11 - push r12 - push r13 - push r14 - push r15", - in("r10") (gdt::GDT_USER_DATA << 3 | 3), // Data segment - in("r11") sp, // Stack pointer - in("r12") flags, // Flags - in("r13") (gdt::GDT_USER_CODE << 3 | 3), // Code segment - in("r14") ip, // IP - in("r15") arg, // Argument - );*/ // rdi, rsi, rdx, rcx asm!( " @@ -265,25 +252,34 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _singlest or rbx, {flag_singlestep} .after_singlestep_branch: - mov r12, rdi - mov r13, rsi - mov rdi, rdx + + // save `ip` (rdi), `sp` (rsi), and `arg` (rdx) in callee-preserved registers, so that + // they are not modified by `pti_unmap` + + mov r13, rdi + mov r14, rsi + mov r15, rdx call {pti_unmap} // Go to usermode - mov r14, {user_data_seg_selector} - mov r15, {user_tls_seg_selector} - mov ds, r14d - mov es, r14d - mov fs, r15d - mov gs, r14d + mov r8, {user_data_seg_selector} + mov r9, {user_tls_seg_selector} + mov ds, r8d + mov es, r8d + mov fs, r9d + // Exchange the old kernel GS (pointing to TSS) and kernel data + swapgs + // Replace kernel data segment with user data segment + mov gs, r8d // Target RFLAGS mov r11, rbx // Target instruction pointer - mov rcx, r12 + mov rcx, r13 // Target stack pointer - mov rsp, r13 + mov rsp, r14 + // Target argument + mov rdi, r13 xor rax, rax xor rbx, rbx From 1a8016b985f81442e6303e095f356ea0461eb438 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Mon, 8 Feb 2021 11:27:22 +0100 Subject: [PATCH 04/10] Give NMI, #DF, and #MC handlers a special stack. This is done by allocating an extra 64 KiB per CPU, and putting it in the Interrupt Stack Table. --- src/arch/x86_64/idt.rs | 61 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/src/arch/x86_64/idt.rs b/src/arch/x86_64/idt.rs index 4a85aa8..407db03 100644 --- a/src/arch/x86_64/idt.rs +++ b/src/arch/x86_64/idt.rs @@ -154,10 +154,63 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) { IDTR.limit = (current_idt.len() * mem::size_of::() - 1) as u16; IDTR.base = current_idt.as_ptr() as *const X86IdtEntry; + let backup_ist = { + // A problem with SWAPGS, is that if a non-maskable interrupt were to occur in the middle + // of swapping, the CS would now point to the new kernel CS from the kernel-triggered + // interrupt, and no swap would occur. Thus, we give the NMI handler a separate stack. This + // is also true for Machine Check, and for Double Faults, but for other reasons. + // + // Note that each CPU has its own "backup interrupt stack". + let index = 1_u8; + + // Allocate 64 KiB of stack space for the backup stack. + const BACKUP_STACK_SIZE: usize = 65536; + assert_eq!(BACKUP_STACK_SIZE % crate::memory::PAGE_SIZE, 0); + let page_count = BACKUP_STACK_SIZE / crate::memory::PAGE_SIZE; + let frames = crate::memory::allocate_frames(page_count) + .expect("failed to allocate pages for backup interrupt stack"); + + // Map them linearly, i.e. KERNEL_OFFSET + physaddr. + let base_address = { + use crate::memory::{Frame, PhysicalAddress}; + use crate::paging::{ActivePageTable, Page, VirtualAddress}; + use crate::paging::entry::EntryFlags; + + let mut active_table = ActivePageTable::new(); + let base_virtual_address = VirtualAddress::new(frames.start_address().data() + crate::KERNEL_OFFSET); + + for i in 0..page_count { + let virtual_address = VirtualAddress::new(base_virtual_address.data() + i * crate::memory::PAGE_SIZE); + let physical_address = PhysicalAddress::new(frames.start_address().data() + i * crate::memory::PAGE_SIZE); + let page = Page::containing_address(virtual_address); + + let flags = EntryFlags::PRESENT | EntryFlags::WRITABLE | EntryFlags::NO_EXECUTE; + + let flusher = if let Some(already_mapped) = active_table.translate_page(page) { + assert_eq!(already_mapped.start_address(), physical_address, "address already mapped, but non-linearly"); + active_table.remap(page, flags) + } else { + active_table.map_to(page, Frame::containing_address(physical_address), flags) + }; + flusher.flush(&mut active_table); + } + + base_virtual_address + }; + // Stack always grows downwards. + let address = base_address.data() + BACKUP_STACK_SIZE; + + // Put them in the 1st entry of the IST. + crate::gdt::TSS.ist[usize::from(index - 1)] = address as u64; + + index + }; + // Set up exceptions current_idt[0].set_func(exception::divide_by_zero); current_idt[1].set_func(exception::debug); current_idt[2].set_func(exception::non_maskable); + current_idt[2].set_ist(backup_ist); current_idt[3].set_func(exception::breakpoint); current_idt[3].set_flags(IdtFlags::PRESENT | IdtFlags::RING_3 | IdtFlags::INTERRUPT); current_idt[4].set_func(exception::overflow); @@ -165,6 +218,7 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) { current_idt[6].set_func(exception::invalid_opcode); current_idt[7].set_func(exception::device_not_available); current_idt[8].set_func(exception::double_fault); + current_idt[8].set_ist(backup_ist); // 9 no longer available current_idt[10].set_func(exception::invalid_tss); current_idt[11].set_func(exception::segment_not_present); @@ -175,6 +229,7 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) { current_idt[16].set_func(exception::fpu_fault); current_idt[17].set_func(exception::alignment_check); current_idt[18].set_func(exception::machine_check); + current_idt[18].set_ist(backup_ist); current_idt[19].set_func(exception::simd); current_idt[20].set_func(exception::virtualization); // 21 through 29 reserved @@ -275,6 +330,12 @@ impl IdtEntry { self.attribute = flags.bits; } + pub fn set_ist(&mut self, ist: u8) { + assert_eq!(ist & 0x07, ist, "interrupt stack table must be within 0..=7"); + self.zero &= 0xF8; + self.zero |= ist; + } + pub fn set_offset(&mut self, selector: u16, base: usize) { self.selector = selector; self.offsetl = base as u16; From 05db0f59775ce6d44b1491542d364c8d3bc6be31 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Mon, 8 Feb 2021 13:46:49 +0100 Subject: [PATCH 05/10] Temporarily fix sysretq by swapping gs 4 times. In order words, it swaps gs both directly at the start of the syscall handler, then swaps it back, and the at the end of the syscall handler. I cannot tell for sure why this is necessary, but probably since some interrupt handler will execute swapgs in the wrong order or something. --- src/arch/x86_64/gdt.rs | 2 +- src/arch/x86_64/interrupt/exception.rs | 2 -- src/arch/x86_64/interrupt/handler.rs | 42 ++++++++++++++++++++------ src/arch/x86_64/interrupt/syscall.rs | 12 +++++--- src/arch/x86_64/start.rs | 5 +-- 5 files changed, 42 insertions(+), 21 deletions(-) diff --git a/src/arch/x86_64/gdt.rs b/src/arch/x86_64/gdt.rs index a2ad7d5..a285d9a 100644 --- a/src/arch/x86_64/gdt.rs +++ b/src/arch/x86_64/gdt.rs @@ -194,7 +194,7 @@ pub unsafe fn init_paging(tcb_offset: usize, stack_offset: usize) { segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); // Ensure that GS always points to the TSS segment in kernel space. - x86::msr::wrmsr(x86::msr::IA32_GS_BASE, &TSS as *const _ as usize as u64); + //x86::msr::wrmsr(x86::msr::IA32_GS_BASE, &TSS as *const _ as usize as u64); segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); diff --git a/src/arch/x86_64/interrupt/exception.rs b/src/arch/x86_64/interrupt/exception.rs index 9a1e526..18b4f06 100644 --- a/src/arch/x86_64/interrupt/exception.rs +++ b/src/arch/x86_64/interrupt/exception.rs @@ -41,8 +41,6 @@ interrupt_stack!(debug, |stack| { } }); -// TODO: Give NMI and double faults a separate stack, as they can trigger a triple fault due to the -// way swapgs is used. interrupt_stack!(non_maskable, |stack| { println!("Non-maskable interrupt"); stack.dump(); diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs index 18a8f8a..776b98d 100644 --- a/src/arch/x86_64/interrupt/handler.rs +++ b/src/arch/x86_64/interrupt/handler.rs @@ -309,14 +309,38 @@ macro_rules! pop_fs { " }; } -macro_rules! swapgs_if_ring3 { - () => { " +macro_rules! swapgs_iff_ring3 { + (error_code: true) => { " // Check whether the last two bits RSP+8 (code segment) are equal to zero. - test BYTE PTR [rsp + 8], 0x03 + test QWORD PTR [rsp + 8], 0x3 // Skip the SWAPGS instruction if CS & 0b11 == 0b00. jz 1f swapgs 1: + " }; + (error_code: false) => { " + test QWORD PTR [rsp + 16], 0x3 + jz 1f + swapgs + 1: + " }; +} +macro_rules! swapgs_iff_ring3_slow { + () => { " + push eax + push edx + push ecx + mov ecx, 0xC000_0102 + rdmsr + shl rdx, 32 + or eax, edx + test rdx, rdx + jnz 1f + swapgs + 1: + pop ecx + pop edx + pop eax " } } @@ -338,7 +362,7 @@ macro_rules! interrupt_stack { function!($name => { // Backup all userspace registers to stack - swapgs_if_ring3!(), + //swapgs_if_ring3!(error_code: false), "push rax\n", push_scratch!(), push_preserved!(), @@ -359,7 +383,7 @@ macro_rules! interrupt_stack { pop_preserved!(), pop_scratch!(), - swapgs_if_ring3!(), + //swapgs_if_ring3!(error_code: false), "iretq\n", }); } @@ -377,7 +401,7 @@ macro_rules! interrupt { function!($name => { // Backup all userspace registers to stack - swapgs_if_ring3!(), + //swapgs_if_ring3!(error_code: false), "push rax\n", push_scratch!(), push_fs!(), @@ -395,7 +419,7 @@ macro_rules! interrupt { pop_fs!(), pop_scratch!(), - swapgs_if_ring3!(), + //swapgs_if_ring3!(error_code: false), "iretq\n", }); } @@ -419,7 +443,7 @@ macro_rules! interrupt_error { } function!($name => { - swapgs_if_ring3!(), + //swapgs_if_ring3!(error_code: true), // Move rax into code's place, put code in last instead (to be // compatible with InterruptStack) "xchg [rsp], rax\n", @@ -450,7 +474,7 @@ macro_rules! interrupt_error { pop_preserved!(), pop_scratch!(), - swapgs_if_ring3!(), + //swapgs_if_ring3!(error_code: true), "iretq\n", }); } diff --git a/src/arch/x86_64/interrupt/syscall.rs b/src/arch/x86_64/interrupt/syscall.rs index cc84460..1df5906 100644 --- a/src/arch/x86_64/interrupt/syscall.rs +++ b/src/arch/x86_64/interrupt/syscall.rs @@ -25,7 +25,7 @@ pub unsafe fn init() { // Inside kernel space, GS should _always_ point to the TSS. When leaving userspace, `swapgs` // is called again, making the userspace GS always point to user data. - msr::wrmsr(msr::IA32_KERNEL_GSBASE, 0); + x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, &gdt::TSS as *const _ as usize as u64); let efer = msr::rdmsr(msr::IA32_EFER); msr::wrmsr(msr::IA32_EFER, efer | 1); @@ -68,11 +68,12 @@ function!(syscall_instruction => { swapgs // Set gs segment to TSS mov gs:[0x70], rsp // Save userspace stack pointer mov rsp, gs:[4] // Load kernel stack pointer - push WORD PTR 5 * 8 + 3 // Push fake userspace SS (resembling iret frame) + push QWORD PTR 5 * 8 + 3 // Push fake userspace SS (resembling iret frame) push QWORD PTR gs:[0x70] // Push userspace rsp push r11 // Push rflags - push WORD PTR 6 * 8 + 3 // Push fake userspace CS (resembling iret frame) + push QWORD PTR 6 * 8 + 3 // Push fake userspace CS (resembling iret frame) push rcx // Push userspace return pointer + swapgs ", // Push context registers @@ -99,10 +100,11 @@ function!(syscall_instruction => { // Return " pop rcx // Pop userspace return pointer - add rsp, 2 // Pop CS + add rsp, 8 // Pop CS pop r11 // Pop rflags + swapgs pop QWORD PTR gs:[0x70] // Pop userspace stack pointer - add rsp, 2 // Pop SS + add rsp, 8 // Pop SS mov rsp, gs:[0x70] // Restore userspace stack pointer swapgs // Restore gs from TSS to user data sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS diff --git a/src/arch/x86_64/start.rs b/src/arch/x86_64/start.rs index d330082..f13aeb9 100644 --- a/src/arch/x86_64/start.rs +++ b/src/arch/x86_64/start.rs @@ -267,9 +267,6 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _singlest mov ds, r8d mov es, r8d mov fs, r9d - // Exchange the old kernel GS (pointing to TSS) and kernel data - swapgs - // Replace kernel data segment with user data segment mov gs, r8d // Target RFLAGS @@ -279,7 +276,7 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _singlest // Target stack pointer mov rsp, r14 // Target argument - mov rdi, r13 + mov rdi, r15 xor rax, rax xor rbx, rbx From a3583a10ce44f70e7df9e23f5c9296a5482da474 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Fri, 12 Feb 2021 23:55:23 +0100 Subject: [PATCH 06/10] Only swapgs when leaving/entering userspace code. --- src/arch/x86_64/gdt.rs | 9 ++++-- src/arch/x86_64/interrupt/exception.rs | 6 ++-- src/arch/x86_64/interrupt/handler.rs | 40 +++++++++++++++----------- src/arch/x86_64/interrupt/syscall.rs | 6 ---- src/arch/x86_64/start.rs | 1 + 5 files changed, 33 insertions(+), 29 deletions(-) diff --git a/src/arch/x86_64/gdt.rs b/src/arch/x86_64/gdt.rs index a285d9a..e999abe 100644 --- a/src/arch/x86_64/gdt.rs +++ b/src/arch/x86_64/gdt.rs @@ -193,13 +193,16 @@ pub unsafe fn init_paging(tcb_offset: usize, stack_offset: usize) { segmentation::load_fs(SegmentSelector::new(GDT_KERNEL_TLS as u16, Ring::Ring0)); segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); - // Ensure that GS always points to the TSS segment in kernel space. - //x86::msr::wrmsr(x86::msr::IA32_GS_BASE, &TSS as *const _ as usize as u64); - segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); // Load the task register task::load_tr(SegmentSelector::new(GDT_TSS as u16, Ring::Ring0)); + + // Ensure that GS always points to the TSS segment in kernel space. + x86::msr::wrmsr(x86::msr::IA32_GS_BASE, &TSS as *const _ as usize as u64); + // Inside kernel space, GS should _always_ point to the TSS. When leaving userspace, `swapgs` + // is called again, making the userspace GS always point to user data. + x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0); } #[derive(Copy, Clone, Debug)] diff --git a/src/arch/x86_64/interrupt/exception.rs b/src/arch/x86_64/interrupt/exception.rs index 18b4f06..621379f 100644 --- a/src/arch/x86_64/interrupt/exception.rs +++ b/src/arch/x86_64/interrupt/exception.rs @@ -18,7 +18,7 @@ interrupt_stack!(divide_by_zero, |stack| { ksignal(SIGFPE); }); -interrupt_stack!(debug, |stack| { +interrupt_stack!(debug, super_atomic: swapgs_iff_ring3_slow!, |stack| { let mut handled = false; // Disable singlestep before there is a breakpoint, since the breakpoint @@ -41,7 +41,7 @@ interrupt_stack!(debug, |stack| { } }); -interrupt_stack!(non_maskable, |stack| { +interrupt_stack!(non_maskable, super_atomic: swapgs_iff_ring3_slow!, |stack| { println!("Non-maskable interrupt"); stack.dump(); }); @@ -153,7 +153,7 @@ interrupt_error!(alignment_check, |stack| { ksignal(SIGBUS); }); -interrupt_stack!(machine_check, |stack| { +interrupt_stack!(machine_check, super_atomic: swapgs_iff_ring3_slow!, |stack| { println!("Machine check fault"); stack.dump(); stack_trace(); diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs index 776b98d..626e1e3 100644 --- a/src/arch/x86_64/interrupt/handler.rs +++ b/src/arch/x86_64/interrupt/handler.rs @@ -309,8 +309,8 @@ macro_rules! pop_fs { " }; } -macro_rules! swapgs_iff_ring3 { - (error_code: true) => { " +macro_rules! swapgs_iff_ring3_fast { + () => { " // Check whether the last two bits RSP+8 (code segment) are equal to zero. test QWORD PTR [rsp + 8], 0x3 // Skip the SWAPGS instruction if CS & 0b11 == 0b00. @@ -318,19 +318,22 @@ macro_rules! swapgs_iff_ring3 { swapgs 1: " }; - (error_code: false) => { " +} +macro_rules! swapgs_iff_ring3_fast_errorcode { + () => { " test QWORD PTR [rsp + 16], 0x3 jz 1f swapgs 1: " }; } +#[macro_export] macro_rules! swapgs_iff_ring3_slow { () => { " - push eax - push edx - push ecx - mov ecx, 0xC000_0102 + push rax + push rdx + push rcx + mov ecx, 0xC0000102 rdmsr shl rdx, 32 or eax, edx @@ -338,15 +341,17 @@ macro_rules! swapgs_iff_ring3_slow { jnz 1f swapgs 1: - pop ecx - pop edx - pop eax + pop rcx + pop rdx + pop rax " } } #[macro_export] macro_rules! interrupt_stack { - ($name:ident, |$stack:ident| $code:block) => { + // XXX: Apparently we cannot use $expr and check for bool exhaustiveness, so we will have to + // use idents directly instead. + ($name:ident, super_atomic: $is_super_atomic:ident!, |$stack:ident| $code:block) => { paste::item! { #[no_mangle] unsafe extern "C" fn [<__interrupt_ $name>](stack: *mut $crate::arch::x86_64::interrupt::InterruptStack) { @@ -362,7 +367,7 @@ macro_rules! interrupt_stack { function!($name => { // Backup all userspace registers to stack - //swapgs_if_ring3!(error_code: false), + $is_super_atomic!(), "push rax\n", push_scratch!(), push_preserved!(), @@ -383,11 +388,12 @@ macro_rules! interrupt_stack { pop_preserved!(), pop_scratch!(), - //swapgs_if_ring3!(error_code: false), + $is_super_atomic!(), "iretq\n", }); } }; + ($name:ident, |$stack:ident| $code:block) => { interrupt_stack!($name, super_atomic: swapgs_iff_ring3_fast!, |$stack| $code); }; } #[macro_export] @@ -401,7 +407,7 @@ macro_rules! interrupt { function!($name => { // Backup all userspace registers to stack - //swapgs_if_ring3!(error_code: false), + swapgs_iff_ring3_fast!(), "push rax\n", push_scratch!(), push_fs!(), @@ -419,7 +425,7 @@ macro_rules! interrupt { pop_fs!(), pop_scratch!(), - //swapgs_if_ring3!(error_code: false), + swapgs_iff_ring3_fast!(), "iretq\n", }); } @@ -443,7 +449,7 @@ macro_rules! interrupt_error { } function!($name => { - //swapgs_if_ring3!(error_code: true), + swapgs_iff_ring3_fast_errorcode!(), // Move rax into code's place, put code in last instead (to be // compatible with InterruptStack) "xchg [rsp], rax\n", @@ -474,7 +480,7 @@ macro_rules! interrupt_error { pop_preserved!(), pop_scratch!(), - //swapgs_if_ring3!(error_code: true), + swapgs_iff_ring3_fast_errorcode!(), "iretq\n", }); } diff --git a/src/arch/x86_64/interrupt/syscall.rs b/src/arch/x86_64/interrupt/syscall.rs index 1df5906..ef8ec59 100644 --- a/src/arch/x86_64/interrupt/syscall.rs +++ b/src/arch/x86_64/interrupt/syscall.rs @@ -23,10 +23,6 @@ pub unsafe fn init() { msr::wrmsr(msr::IA32_LSTAR, syscall_instruction as u64); msr::wrmsr(msr::IA32_FMASK, 0x0300); // Clear trap flag and interrupt enable - // Inside kernel space, GS should _always_ point to the TSS. When leaving userspace, `swapgs` - // is called again, making the userspace GS always point to user data. - x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, &gdt::TSS as *const _ as usize as u64); - let efer = msr::rdmsr(msr::IA32_EFER); msr::wrmsr(msr::IA32_EFER, efer | 1); } @@ -73,7 +69,6 @@ function!(syscall_instruction => { push r11 // Push rflags push QWORD PTR 6 * 8 + 3 // Push fake userspace CS (resembling iret frame) push rcx // Push userspace return pointer - swapgs ", // Push context registers @@ -102,7 +97,6 @@ function!(syscall_instruction => { pop rcx // Pop userspace return pointer add rsp, 8 // Pop CS pop r11 // Pop rflags - swapgs pop QWORD PTR gs:[0x70] // Pop userspace stack pointer add rsp, 8 // Pop SS mov rsp, gs:[0x70] // Restore userspace stack pointer diff --git a/src/arch/x86_64/start.rs b/src/arch/x86_64/start.rs index f13aeb9..3ebe195 100644 --- a/src/arch/x86_64/start.rs +++ b/src/arch/x86_64/start.rs @@ -262,6 +262,7 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _singlest call {pti_unmap} // Go to usermode + swapgs mov r8, {user_data_seg_selector} mov r9, {user_tls_seg_selector} mov ds, r8d From a183953ee898c7cfe2266e405d35235872bfecd0 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Sat, 13 Feb 2021 00:31:46 +0100 Subject: [PATCH 07/10] Motivate usage of the IST without SWAPGS involved. --- src/arch/x86_64/idt.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/arch/x86_64/idt.rs b/src/arch/x86_64/idt.rs index 407db03..06c7171 100644 --- a/src/arch/x86_64/idt.rs +++ b/src/arch/x86_64/idt.rs @@ -155,10 +155,11 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) { IDTR.base = current_idt.as_ptr() as *const X86IdtEntry; let backup_ist = { - // A problem with SWAPGS, is that if a non-maskable interrupt were to occur in the middle - // of swapping, the CS would now point to the new kernel CS from the kernel-triggered - // interrupt, and no swap would occur. Thus, we give the NMI handler a separate stack. This - // is also true for Machine Check, and for Double Faults, but for other reasons. + // We give Non-Maskable Interrupts, Double Fault, and Machine Check exceptions separate + // stacks, since these (unless we are going to set up NMI watchdogs like Linux does) are + // considered the most fatal, especially Double Faults which are caused by errors __when + // accessing the system IDT__. If that goes wrong, then kernel memory may be partially + // corrupt, and we want a separate stack. // // Note that each CPU has its own "backup interrupt stack". let index = 1_u8; From ff33090fd09dc9a0617a39ce2cee2ec9028e985d Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Mon, 15 Feb 2021 16:48:34 +0100 Subject: [PATCH 08/10] Check whether RCX is canonical in sysretq. --- src/arch/x86_64/interrupt/syscall.rs | 73 ++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/src/arch/x86_64/interrupt/syscall.rs b/src/arch/x86_64/interrupt/syscall.rs index ef8ec59..a404728 100644 --- a/src/arch/x86_64/interrupt/syscall.rs +++ b/src/arch/x86_64/interrupt/syscall.rs @@ -58,16 +58,29 @@ pub unsafe extern "C" fn __inner_syscall_instruction(stack: *mut InterruptStack) }); } +macro_rules! fast_sysretq { + () => { " + pop rcx // Pop userspace return pointer + sub rsp, 8 // Pop fake userspace CS + pop r11 // Pop rflags + pop QWORD PTR gs:[0x70] // Pop userspace stack pointer + mov rsp, gs:[0x70] // Restore userspace stack pointer + swapgs // Restore gs from TSS to user data + ud2 + sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS + " } +} + function!(syscall_instruction => { // Yes, this is magic. No, you don't need to understand " swapgs // Set gs segment to TSS mov gs:[0x70], rsp // Save userspace stack pointer mov rsp, gs:[4] // Load kernel stack pointer - push QWORD PTR 5 * 8 + 3 // Push fake userspace SS (resembling iret frame) + push QWORD PTR 5 * 8 + 3 // Push fake SS (resembling iret stack frame) push QWORD PTR gs:[0x70] // Push userspace rsp push r11 // Push rflags - push QWORD PTR 6 * 8 + 3 // Push fake userspace CS (resembling iret frame) + push QWORD PTR 6 * 8 + 3 // Push fake CS (resembling iret stack frame) push rcx // Push userspace return pointer ", @@ -94,14 +107,54 @@ function!(syscall_instruction => { // Return " - pop rcx // Pop userspace return pointer - add rsp, 8 // Pop CS - pop r11 // Pop rflags - pop QWORD PTR gs:[0x70] // Pop userspace stack pointer - add rsp, 8 // Pop SS - mov rsp, gs:[0x70] // Restore userspace stack pointer - swapgs // Restore gs from TSS to user data - sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS + // Test whether RCX is canonical. This is not strictly necessary, but could be fatal if + // some kernel bug would allow RCX to be modified by user code. + + // Peek at the preserved RCX, to double-check that it is canonical. + // Set ZF iff bit 47 (i.e. the bit that must be sign extended) is set. + + // TODO: Any hacky one-instruction canonicalness check? + + bt QWORD PTR [rsp], 47 + + // Jump to the canonicalness check. + jnz 2f + + // Otherwise, continue with the fast return. + 1: + // Fast sysretq + ", + + fast_sysretq!(), + + " + // Jumps to aligned addresses are faster, and it is not always the case the userspace + // doesn't call from higher half (though unlikely). + .align 8, 0x90 + + 2: + + // Load userspace return RIP. + mov rcx, [rsp] + + // Shift it to the right by 47, only getting the bits which must all be zero or one. + shr rcx, 47 + // Since we have already checked that RCX[47] == 1, RCX[63:47] must all be one. + cmp rcx, 0x1FFFF + // If the address is canonical, go to the slower iretq. + jne 3f + ", + + fast_sysretq!(), + + " + 3: + // Slow iretq + + xor rcx, rcx + xor r11, r11 + swapgs + iretq ", }); From 5b2df9f504f624da784a90bfa0fe2cdaed604ae2 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Mon, 15 Feb 2021 17:27:19 +0100 Subject: [PATCH 09/10] Document why usermode() can omit rcx check. --- src/arch/x86_64/start.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/arch/x86_64/start.rs b/src/arch/x86_64/start.rs index 3ebe195..c0a2ac4 100644 --- a/src/arch/x86_64/start.rs +++ b/src/arch/x86_64/start.rs @@ -297,6 +297,12 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _singlest xor r15, r15 fninit + + // NOTE: Regarding the sysretq vulnerability, this is safe as we cannot modify RCX, + // even though the caller can give us the wrong address. But, it's marked unsafe, so + // the caller is responsible for this! (And, the likelihood of rcx being changed in the + // middle here, is minimal, unless the attacker already has partial control of kernel + // memory.) sysretq ", From 8eb58891aac0cbf192458f7cb72df35560c57888 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Mon, 15 Feb 2021 19:04:43 +0100 Subject: [PATCH 10/10] Simplify sysretq code. --- src/arch/x86_64/interrupt/syscall.rs | 72 +++++++++------------------- 1 file changed, 22 insertions(+), 50 deletions(-) diff --git a/src/arch/x86_64/interrupt/syscall.rs b/src/arch/x86_64/interrupt/syscall.rs index a404728..0b0c514 100644 --- a/src/arch/x86_64/interrupt/syscall.rs +++ b/src/arch/x86_64/interrupt/syscall.rs @@ -58,19 +58,6 @@ pub unsafe extern "C" fn __inner_syscall_instruction(stack: *mut InterruptStack) }); } -macro_rules! fast_sysretq { - () => { " - pop rcx // Pop userspace return pointer - sub rsp, 8 // Pop fake userspace CS - pop r11 // Pop rflags - pop QWORD PTR gs:[0x70] // Pop userspace stack pointer - mov rsp, gs:[0x70] // Restore userspace stack pointer - swapgs // Restore gs from TSS to user data - ud2 - sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS - " } -} - function!(syscall_instruction => { // Yes, this is magic. No, you don't need to understand " @@ -106,51 +93,36 @@ function!(syscall_instruction => { pop_scratch!(), // Return + // + // We must test whether RCX is canonical. This is not strictly necessary, but could be + // fatal if some kernel bug would allow RCX to be modified by user code. + // + // See https://xenproject.org/2012/06/13/the-intel-sysret-privilege-escalation/. + // + // This is not just theoretical; ptrace allows userspace to change rcx of target processes. " - // Test whether RCX is canonical. This is not strictly necessary, but could be fatal if - // some kernel bug would allow RCX to be modified by user code. + pop rcx // Pop userspace return pointer - // Peek at the preserved RCX, to double-check that it is canonical. - // Set ZF iff bit 47 (i.e. the bit that must be sign extended) is set. + // Set ZF iff forbidden bit 47 (i.e. the bit that must be sign extended) is set. + bt rcx, 47 - // TODO: Any hacky one-instruction canonicalness check? + // If ZF was set, i.e. the address was invalid higher-half, so jump to the slower iretq and + // handle the error without being able to execute attacker-controlled code! + jmp 1f - bt QWORD PTR [rsp], 47 + // Otherwise, continue with the fast sysretq. - // Jump to the canonicalness check. - jnz 2f + sub rsp, 8 // Pop fake userspace CS + pop r11 // Pop rflags + pop QWORD PTR gs:[0x70] // Pop userspace stack pointer + mov rsp, gs:[0x70] // Restore userspace stack pointer + swapgs // Restore gs from TSS to user data + sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS - // Otherwise, continue with the fast return. - 1: - // Fast sysretq - ", +1: - fast_sysretq!(), - - " - // Jumps to aligned addresses are faster, and it is not always the case the userspace - // doesn't call from higher half (though unlikely). - .align 8, 0x90 - - 2: - - // Load userspace return RIP. - mov rcx, [rsp] - - // Shift it to the right by 47, only getting the bits which must all be zero or one. - shr rcx, 47 - // Since we have already checked that RCX[47] == 1, RCX[63:47] must all be one. - cmp rcx, 0x1FFFF - // If the address is canonical, go to the slower iretq. - jne 3f - ", - - fast_sysretq!(), - - " - 3: // Slow iretq - + push rcx xor rcx, rcx xor r11, r11 swapgs