diff --git a/Cargo.lock b/Cargo.lock index 6c52bb1..0d6c719 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,7 +1,5 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 - [[package]] name = "autocfg" version = "1.0.1" diff --git a/Cargo.toml b/Cargo.toml index 766f1bd..9f651a6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,6 +56,9 @@ serial_debug = [] system76_ec_debug = [] slab = ["slab_allocator"] +# TODO: Either wait for LLVM 12 and use target_feature, or use another system for cpu features +x86_fsgsbase = [] + [profile.dev] # Kernel doesn't yet work great with debug mode :( opt-level = 3 diff --git a/src/arch/x86_64/gdt.rs b/src/arch/x86_64/gdt.rs index 32553b0..94ad795 100644 --- a/src/arch/x86_64/gdt.rs +++ b/src/arch/x86_64/gdt.rs @@ -19,9 +19,8 @@ pub const GDT_KERNEL_KPCR: usize = 3; pub const GDT_USER_CODE32_UNUSED: usize = 4; pub const GDT_USER_DATA: usize = 5; pub const GDT_USER_CODE: usize = 6; -pub const GDT_USER_TLS: usize = 7; -pub const GDT_TSS: usize = 8; -pub const GDT_TSS_HIGH: usize = 9; +pub const GDT_TSS: usize = 7; +pub const GDT_TSS_HIGH: usize = 8; pub const GDT_A_PRESENT: u8 = 1 << 7; pub const GDT_A_RING_0: u8 = 0 << 5; @@ -53,7 +52,7 @@ static mut INIT_GDT: [GdtEntry; 4] = [ ]; #[thread_local] -pub static mut GDT: [GdtEntry; 10] = [ +pub static mut GDT: [GdtEntry; 9] = [ // Null GdtEntry::new(0, 0, 0, 0), // Kernel code @@ -68,8 +67,6 @@ pub static mut GDT: [GdtEntry; 10] = [ GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE), // User (64-bit) code GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_LONG_MODE), - // User TLS - GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE), // TSS GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_TSS_AVAIL, 0), // TSS must be 16 bytes long, twice the normal size @@ -106,11 +103,6 @@ pub 
static mut KPCR: ProcessorControlRegion = ProcessorControlRegion { }), }; -pub unsafe fn set_tcb(pid: usize) { - GDT[GDT_USER_TLS].set_offset((crate::USER_TCB_OFFSET + pid * PAGE_SIZE) as u32); - x86::segmentation::load_fs(SegmentSelector::new(GDT_USER_TLS as u16, Ring::Ring3)); -} - #[cfg(feature = "pti")] pub unsafe fn set_tss_stack(stack: usize) { use super::pti::{PTI_CPU_STACK, PTI_CONTEXT_STACK}; @@ -199,8 +191,8 @@ pub unsafe fn init_paging(tcb_offset: usize, stack_offset: usize) { // is called again, making the userspace GS always point to user data. x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0); - // Set the User TLS segment to the offset of the user TCB - set_tcb(0); + // Set the User TLS segment to zero, before we create any contexts and start scheduling. + x86::msr::wrmsr(x86::msr::IA32_FS_BASE, 0); // Reload the segment descriptors load_cs(SegmentSelector::new(GDT_KERNEL_CODE as u16, Ring::Ring0)); @@ -214,6 +206,18 @@ pub unsafe fn init_paging(tcb_offset: usize, stack_offset: usize) { // Load the task register task::load_tr(SegmentSelector::new(GDT_TSS as u16, Ring::Ring0)); + + let has_fsgsbase = raw_cpuid::CpuId::new() + .get_extended_feature_info() + .map_or(false, |extended_features| extended_features.has_fsgsbase()); + + if cfg!(feature = "x86_fsgsbase") { + assert!(has_fsgsbase, "running kernel with features not supported by the current CPU"); + } + + if has_fsgsbase { + x86::controlregs::cr4_write(x86::controlregs::cr4() | x86::controlregs::Cr4::CR4_ENABLE_FSGSBASE); + } } #[derive(Copy, Clone, Debug)] diff --git a/src/arch/x86_64/mod.rs b/src/arch/x86_64/mod.rs index 88e5073..0811d26 100644 --- a/src/arch/x86_64/mod.rs +++ b/src/arch/x86_64/mod.rs @@ -45,6 +45,7 @@ pub use ::rmm::X8664Arch as CurrentRmmArch; // Flags pub mod flags { - pub const FLAG_SINGLESTEP: usize = 1 << 8; + pub const SHIFT_SINGLESTEP: usize = 8; + pub const FLAG_SINGLESTEP: usize = 1 << SHIFT_SINGLESTEP; pub const FLAG_INTERRUPTS: usize = 1 << 9; } diff --git 
a/src/arch/x86_64/start.rs b/src/arch/x86_64/start.rs index c0a2ac4..8f82b98 100644 --- a/src/arch/x86_64/start.rs +++ b/src/arch/x86_64/start.rs @@ -239,45 +239,130 @@ pub unsafe extern fn kstart_ap(args_ptr: *const KernelArgsAp) -> ! { crate::kmain_ap(cpu_id); } +#[cfg(not(feature = "pti"))] +macro_rules! inner_pit_unmap( + () => { + " + // unused: {pti_unmap} + " + } +); +#[cfg(feature = "pti")] +macro_rules! inner_pit_unmap( + () => { + " + push rdi + push rsi + push rdx + push rcx + sub rsp, 8 + + call {pti_unmap} + + add rsp, 8 + pop rcx + pop rdx + pop rsi + pop rdi + " + } +); + +#[cfg(not(feature = "x86_fsgsbase"))] +macro_rules! save_fsgsbase( + () => { + " + mov ecx, {MSR_FSBASE} + rdmsr + shl rdx, 32 + or rdx, rax + mov r14, rdx + // rdmsr clears the upper halves of RAX/RDX, so OR merges EDX:EAX into RDX without dropping the high dword + mov ecx, {MSR_GSBASE} + rdmsr + shl rdx, 32 + or rdx, rax + mov r13, rdx + " + } +); +#[cfg(feature = "x86_fsgsbase")] +macro_rules! save_fsgsbase( + () => { + " + // placeholder: {MSR_FSBASE} {MSR_GSBASE} + rdfsbase r14 + rdgsbase r13 + " + } +); + +#[cfg(feature = "x86_fsgsbase")] +macro_rules! restore_fsgsbase( + () => { + " + wrfsbase r14 + wrgsbase r13 + " + } +); + +#[cfg(not(feature = "x86_fsgsbase"))] +macro_rules! restore_fsgsbase( + () => { + " + mov ecx, {MSR_FSBASE} + mov rdx, r14 + mov eax, edx + shr rdx, 32 + wrmsr + + mov ecx, {MSR_GSBASE} + mov rdx, r13 + mov eax, edx + shr rdx, 32 + wrmsr + " + } +); + #[naked] #[inline(never)] // TODO: AbiCompatBool -pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _singlestep: u32) -> ! +pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _is_singlestep: usize) -> !
{ // rdi, rsi, rdx, rcx asm!( - " - mov rbx, {flag_interrupts} - test ecx, ecx - jz .after_singlestep_branch - or rbx, {flag_singlestep} + concat!(" + shl rcx, {shift_singlestep} + or rcx, {flag_interrupts} - .after_singlestep_branch: + ", inner_pit_unmap!(), " - // save `ip` (rdi), `sp` (rsi), and `arg` (rdx) in callee-preserved registers, so that - // they are not modified by `pti_unmap` + // Save rdx for later + mov r12, rdx - mov r13, rdi - mov r14, rsi - mov r15, rdx - call {pti_unmap} + // Target RFLAGS + mov r11, rcx // Go to usermode swapgs - mov r8, {user_data_seg_selector} - mov r9, {user_tls_seg_selector} - mov ds, r8d - mov es, r8d - mov fs, r9d - mov gs, r8d - // Target RFLAGS - mov r11, rbx + ", save_fsgsbase!(), " + + mov r15, {user_data_seg_selector} + mov ds, r15d + mov es, r15d + mov fs, r15d + mov gs, r15d + + ", restore_fsgsbase!(), " + // Target instruction pointer - mov rcx, r13 + mov rcx, rdi // Target stack pointer - mov rsp, r14 + mov rsp, rsi // Target argument - mov rdi, r15 + mov rdi, r12 xor rax, rax xor rbx, rbx @@ -304,13 +389,16 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _singlest // middle here, is minimal, unless the attacker already has partial control of kernel // memory.) 
sysretq - ", + "), flag_interrupts = const(FLAG_INTERRUPTS), - flag_singlestep = const(FLAG_SINGLESTEP), + shift_singlestep = const(SHIFT_SINGLESTEP), pti_unmap = sym pti::unmap, user_data_seg_selector = const(gdt::GDT_USER_DATA << 3 | 3), - user_tls_seg_selector = const(gdt::GDT_USER_TLS << 3 | 3), + + MSR_FSBASE = const(x86::msr::IA32_FS_BASE), + MSR_GSBASE = const(x86::msr::IA32_GS_BASE), + options(noreturn), ); } diff --git a/src/context/arch/x86_64.rs b/src/context/arch/x86_64.rs index b895965..bf35ad5 100644 --- a/src/context/arch/x86_64.rs +++ b/src/context/arch/x86_64.rs @@ -36,6 +36,10 @@ pub struct Context { rbp: usize, /// Stack pointer rsp: usize, + /// FSBASE + pub fsbase: usize, + /// GSBASE + gsbase: usize, /// FX valid? loadable: AbiCompatBool, } @@ -48,7 +52,7 @@ enum AbiCompatBool { } impl Context { - pub fn new() -> Context { + pub fn new(pid: usize) -> Context { Context { loadable: AbiCompatBool::False, fx: 0, @@ -60,9 +64,14 @@ impl Context { r14: 0, r15: 0, rbp: 0, - rsp: 0 + rsp: 0, + fsbase: crate::USER_TCB_OFFSET + pid * crate::memory::PAGE_SIZE, + gsbase: 0, } } + pub fn update_tcb(&mut self, pid: usize) { + self.fsbase = crate::USER_TCB_OFFSET + pid * crate::memory::PAGE_SIZE; + } pub fn get_page_utable(&mut self) -> usize { self.cr3 @@ -138,6 +147,66 @@ impl Context { } } +macro_rules! switch_msr( + ($name:literal, $offset:literal) => { + concat!(" + // RDX <= MSR (rdmsr returns EDX:EAX with the upper halves of RAX/RDX cleared, so OR keeps the high dword) + + mov ecx, {", $name, "} + rdmsr + shl rdx, 32 + or rdx, rax + + // Save old, load new. + + mov [rdi + {", $offset, "}], rdx + mov rdx, [rsi + {", $offset, "}] + mov eax, edx + shr rdx, 32 + + // MSR <= EDX:EAX + wrmsr + ") + } +); + +// NOTE: RAX is a scratch register and can be set to whatever. There is also no return +// value in switch_to, so it will also never be read. The same goes for RDX, and RCX.
+// TODO: Use runtime code patching (perhaps in the bootloader) by pushing alternative code +// sequences into a specialized section, with some macro resembling Linux's `.ALTERNATIVE`. +#[cfg(feature = "x86_fsgsbase")] +macro_rules! switch_fsgsbase( + () => { + " + // placeholder: {MSR_FSBASE} {MSR_KERNELGSBASE} + + rdfsbase rax + mov [rdi + {off_fsbase}], rax + mov rax, [rsi + {off_fsbase}] + wrfsbase rax + + swapgs + rdgsbase rax + mov [rdi + {off_gsbase}], rax + mov rax, [rsi + {off_gsbase}] + wrgsbase rax + swapgs + " + } +); + +#[cfg(not(feature = "x86_fsgsbase"))] +macro_rules! switch_fsgsbase( + () => { + // TODO: Is it faster to perform two 32-bit memory accesses, rather than shifting? + concat!( + switch_msr!("MSR_FSBASE", "off_fsbase"), + switch_msr!("MSR_KERNELGSBASE", "off_gsbase"), + ) + } +); + + /// Switch to the next context by restoring its stack and registers /// Check disassembly! #[inline(never)] @@ -152,7 +221,7 @@ pub unsafe extern "C" fn switch_to(_prev: &mut Context, _next: &mut Context) { // - we can modify scratch registers, e.g. rax // - we cannot change callee-preserved registers arbitrarily, e.g. rbx, which is why we // store them here in the first place. 
- " + concat!(" // load `prev.fx` mov rax, [rdi + {off_fx}] @@ -163,26 +232,26 @@ pub unsafe extern "C" fn switch_to(_prev: &mut Context, _next: &mut Context) { mov BYTE PTR [rdi + {off_loadable}], {true} // compare `next.loadable` with true cmp BYTE PTR [rsi + {off_loadable}], {true} - je switch_to.next_is_loadable + je 2f fninit - jmp switch_to.after_fx + jmp 3f - switch_to.next_is_loadable: +2: mov rax, [rsi + {off_fx}] fxrstor64 [rax] - switch_to.after_fx: +3: // Save the current CR3, and load the next CR3 if not identical mov rcx, cr3 mov [rdi + {off_cr3}], rcx mov rax, [rsi + {off_cr3}] cmp rax, rcx - je switch_to.same_cr3 + je 4f mov cr3, rax - switch_to.same_cr3: +4: // Save old registers, and load new ones mov [rdi + {off_rbx}], rbx mov rbx, [rsi + {off_rbx}] @@ -205,6 +274,10 @@ pub unsafe extern "C" fn switch_to(_prev: &mut Context, _next: &mut Context) { mov [rdi + {off_rsp}], rsp mov rsp, [rsi + {off_rsp}] + ", + switch_fsgsbase!(), + " + // push RFLAGS (can only be modified via stack) pushfq // pop RFLAGS into `self.rflags` @@ -222,7 +295,7 @@ pub unsafe extern "C" fn switch_to(_prev: &mut Context, _next: &mut Context) { // Note that switch_finish_hook will be responsible for executing `ret`.
jmp {switch_hook} - ", + "), off_fx = const(offset_of!(Cx, fx)), off_cr3 = const(offset_of!(Cx, cr3)), @@ -237,12 +310,17 @@ pub unsafe extern "C" fn switch_to(_prev: &mut Context, _next: &mut Context) { off_rbp = const(offset_of!(Cx, rbp)), off_rsp = const(offset_of!(Cx, rsp)), + off_fsbase = const(offset_of!(Cx, fsbase)), + off_gsbase = const(offset_of!(Cx, gsbase)), + + MSR_FSBASE = const(x86::msr::IA32_FS_BASE), + MSR_KERNELGSBASE = const(x86::msr::IA32_KERNEL_GSBASE), + true = const(AbiCompatBool::True as u8), switch_hook = sym crate::context::switch_finish_hook, options(noreturn), ); } - #[allow(dead_code)] #[repr(packed)] pub struct SignalHandlerStack { diff --git a/src/context/context.rs b/src/context/context.rs index 299e303..a6e366c 100644 --- a/src/context/context.rs +++ b/src/context/context.rs @@ -282,7 +282,7 @@ impl Context { waitpid: Arc::new(WaitMap::new()), pending: VecDeque::new(), wake: None, - arch: arch::Context::new(), + arch: arch::Context::new(id.into()), kfx: None, kstack: None, ksig: None, diff --git a/src/context/signal.rs b/src/context/signal.rs index 3fe1687..ae6b252 100644 --- a/src/context/signal.rs +++ b/src/context/signal.rs @@ -122,7 +122,7 @@ pub extern "C" fn signal_handler(sig: usize) { sp -= mem::size_of::(); *(sp as *mut usize) = restorer; - usermode(handler, sp, sig, u32::from(singlestep)); + usermode(handler, sp, sig, usize::from(singlestep)); } } diff --git a/src/context/switch.rs b/src/context/switch.rs index a823854..9d7c38c 100644 --- a/src/context/switch.rs +++ b/src/context/switch.rs @@ -173,7 +173,6 @@ pub unsafe fn switch() -> bool { if let Some(ref stack) = to_context.kstack { gdt::set_tss_stack(stack.as_ptr() as usize + stack.len()); } - gdt::set_tcb(to_context.id.into()); } #[cfg(target_arch = "aarch64")] { diff --git a/src/syscall/process.rs b/src/syscall/process.rs index 850dfeb..e0ff324 100644 --- a/src/syscall/process.rs +++ b/src/syscall/process.rs @@ -378,6 +378,10 @@ pub fn clone(flags: CloneFlags, 
stack_base: usize) -> Result { let mut new_ktable = unsafe { InactivePageTable::from_address(new_utable.address()) }; + #[cfg(target_arch = "x86_64")] + { + context.arch.update_tcb(pid.into()); + } // Copy kernel image mapping { @@ -904,7 +908,7 @@ fn fexec_noreturn( } // Go to usermode - unsafe { usermode(entry, sp, 0, u32::from(singlestep)) } + unsafe { usermode(entry, sp, 0, usize::from(singlestep)) } } pub fn fexec_kernel(fd: FileHandle, args: Box<[Box<[u8]>]>, vars: Box<[Box<[u8]>]>, name_override_opt: Option>, auxv: Option>) -> Result {