From 1047728f3542e55048a19fe4ecf6d1c40dbfae3f Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Fri, 6 Aug 2021 15:41:07 +0200 Subject: [PATCH 1/4] Only set process regs for faults from ring 3. This fixes a deadlock that might occur if a page fault is triggered while a lock to the current context is held. --- Cargo.lock | 4 +++- src/arch/x86_64/interrupt/handler.rs | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0d6c719..706ed5e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,7 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +version = 3 + [[package]] name = "autocfg" version = "1.0.1" @@ -165,7 +167,7 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.9" +version = "0.2.10" dependencies = [ "bitflags", ] diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs index 877792d..d3921f0 100644 --- a/src/arch/x86_64/interrupt/handler.rs +++ b/src/arch/x86_64/interrupt/handler.rs @@ -331,6 +331,9 @@ macro_rules! interrupt_stack { paste::item! { #[no_mangle] unsafe extern "C" fn [<__interrupt_ $name>]($stack: &mut $crate::arch::x86_64::interrupt::InterruptStack) { + // Deadlock safety: interrupts are not normally enabled in the kernel, except in + // kmain. However, no locks for context list nor even individual context locks, are + // ever meant to be acquired there. let _guard = $crate::ptrace::set_process_regs($stack); // TODO: Force the declarations to specify unsafe? @@ -410,7 +413,18 @@ macro_rules! interrupt_error { paste::item! { #[no_mangle] unsafe extern "C" fn [<__interrupt_ $name>]($stack: &mut $crate::arch::x86_64::interrupt::handler::InterruptErrorStack) { - let _guard = $crate::ptrace::set_process_regs(&mut $stack.inner); + let _guard; + + // Only set_ptrace_process_regs if this error occured from userspace. If this fault + // originated from kernel mode, we have no idea what it might have locked (and + // kernel mode faults are never meant to occur unless something is wrong, and will + // not context switch anyway, rendering that statement useless in such a case + // anyway). + // + // Check the privilege level of CS against ring 3. + if $stack.inner.iret.cs & 0b11 == 0b11 { + _guard = $crate::ptrace::set_process_regs(&mut $stack.inner); + } #[allow(unused_unsafe)] unsafe { From d7a1c6255b863935988b8b7a88baa7b6df306cc5 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Fri, 6 Aug 2021 17:34:56 +0200 Subject: [PATCH 2/4] Fix possible race condition in paranoid ISRs. Additionally, because it turned out to be infeasible to rely on link-time constants in global_asm! code, I have also converted the interrupt handlers to naked fns. This removes the proc-macro-reliant "paste" dependency, but inserts a tiny ud2 at the end of every ISR. --- Cargo.lock | 7 - Cargo.toml | 1 - src/arch/x86_64/consts.rs | 3 +- src/arch/x86_64/gdt.rs | 5 +- src/arch/x86_64/interrupt/exception.rs | 6 +- src/arch/x86_64/interrupt/handler.rs | 206 +++++++++++++++++++------ 6 files changed, 169 insertions(+), 59 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 706ed5e..6184caf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,7 +75,6 @@ dependencies = [ "linked_list_allocator 0.9.0", "log", "memoffset", - "paste", "raw-cpuid 8.1.2", "redox_syscall", "rmm", @@ -131,12 +130,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "paste" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf547ad0c65e31259204bd90935776d1c693cec2f4ff7abb7a1bbbd40dfe58" - [[package]] name = "plain" version = "0.2.3" diff --git a/Cargo.toml b/Cargo.toml index 9f651a6..c58a7ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,6 @@ memoffset = { version = "0.6", features = ["unstable_const"] } redox_syscall = { path = "syscall" } slab_allocator = { path = "slab_allocator", optional = true } spin = "0.9" -paste = "1" rmm = { path = "rmm", default-features = false } [dependencies.goblin] diff --git a/src/arch/x86_64/consts.rs b/src/arch/x86_64/consts.rs index 5589106..2d9ac40 100644 --- a/src/arch/x86_64/consts.rs +++ b/src/arch/x86_64/consts.rs @@ -28,7 +28,8 @@ pub const KERNEL_PERCPU_OFFSET: usize = KERNEL_TMP_MISC_OFFSET - PML4_SIZE; pub const KERNEL_PERCPU_PML4: usize = (KERNEL_PERCPU_OFFSET & PML4_MASK)/PML4_SIZE; /// Size of kernel percpu variables - pub const KERNEL_PERCPU_SIZE: usize = 64 * 1024; // 64 KB + pub const KERNEL_PERCPU_SHIFT: u8 = 16; // 2^16 = 64 KiB + pub const KERNEL_PERCPU_SIZE: usize = 1_usize << KERNEL_PERCPU_SHIFT; /// Offset of physmap // This needs to match RMM's PHYS_OFFSET diff --git a/src/arch/x86_64/gdt.rs b/src/arch/x86_64/gdt.rs index 94ad795..5e974c3 100644 --- a/src/arch/x86_64/gdt.rs +++ b/src/arch/x86_64/gdt.rs @@ -52,7 +52,7 @@ static mut INIT_GDT: [GdtEntry; 4] = [ ]; #[thread_local] -pub static mut GDT: [GdtEntry; 9] = [ +pub static mut GDT: [GdtEntry; 10] = [ // Null GdtEntry::new(0, 0, 0, 0), // Kernel code @@ -71,6 +71,9 @@ pub static mut GDT: [GdtEntry; 9] = [ GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_TSS_AVAIL, 0), // TSS must be 16 bytes long, twice the normal size GdtEntry::new(0, 0, 0, 0), + // Unused entry which stores the CPU ID. This is necessary for paranoid interrupts as they have + // no other way of determining it. + GdtEntry::new(0, 0, 0, 0), ]; #[repr(C, align(16))] diff --git a/src/arch/x86_64/interrupt/exception.rs b/src/arch/x86_64/interrupt/exception.rs index 48c0353..242c7cf 100644 --- a/src/arch/x86_64/interrupt/exception.rs +++ b/src/arch/x86_64/interrupt/exception.rs @@ -18,7 +18,7 @@ interrupt_stack!(divide_by_zero, |stack| { ksignal(SIGFPE); }); -interrupt_stack!(debug, super_atomic: swapgs_iff_ring3_slow!, |stack| { +interrupt_stack!(debug, @paranoid, |stack| { let mut handled = false; // Disable singlestep before there is a breakpoint, since the breakpoint @@ -41,7 +41,7 @@ interrupt_stack!(debug, super_atomic: swapgs_iff_ring3_slow!, |stack| { } }); -interrupt_stack!(non_maskable, super_atomic: swapgs_iff_ring3_slow!, |stack| { +interrupt_stack!(non_maskable, @paranoid, |stack| { println!("Non-maskable interrupt"); stack.dump(); }); @@ -158,7 +158,7 @@ interrupt_error!(alignment_check, |stack| { ksignal(SIGBUS); }); -interrupt_stack!(machine_check, super_atomic: swapgs_iff_ring3_slow!, |stack| { +interrupt_stack!(machine_check, @paranoid, |stack| { println!("Machine check fault"); stack.dump(); stack_trace(); diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs index d3921f0..58ee4dd 100644 --- a/src/arch/x86_64/interrupt/handler.rs +++ b/src/arch/x86_64/interrupt/handler.rs @@ -303,38 +303,125 @@ macro_rules! swapgs_iff_ring3_fast_errorcode { 1: " }; } -#[macro_export] -macro_rules! swapgs_iff_ring3_slow { + +#[cfg(feature = "x86_fsgsbase")] +macro_rules! save_gsbase_paranoid { () => { " - push rax - push rdx - push rcx - mov ecx, 0xC0000102 - rdmsr - shl rdx, 32 - or eax, edx - test rdx, rdx - jnz 1f - swapgs - 1: - pop rcx - pop rdx - pop rax + // Unused: {IA32_GS_BASE} + rdgsbase rax + push rax " } } +#[cfg(feature = "x86_fsgsbase")] +macro_rules! restore_gsbase_paranoid { + () => { " + // Unused: {IA32_GS_BASE} + pop rax + wrgsbase rax + " } +} +#[cfg(not(feature = "x86_fsgsbase"))] +macro_rules! save_gsbase_paranoid { + () => { " + mov ecx, {IA32_GS_BASE} + rdmsr + shl rdx, 32 + or rax, rdx + + push rax + " } +} +#[cfg(not(feature = "x86_fsgsbase"))] +macro_rules! restore_gsbase_paranoid { + () => { " + pop rdx + + mov ecx, {IA32_GS_BASE} + mov eax, edx + shr rdx, 32 + wrmsr + " } +} + +#[cfg(feature = "x86_fsgsbase")] +macro_rules! set_gsbase_paranoid { + () => { " + // Unused: {IA32_GS_BASE} + wrgsbase rax + " } +} +#[cfg(not(feature = "x86_fsgsbase"))] +macro_rules! set_gsbase_paranoid { + () => { " + mov ecx, {IA32_GS_BASE} + mov rdx, rax + shr rdx, 32 + wrmsr + " } +} + +macro_rules! save_and_set_gsbase_paranoid { + // For paranoid interrupt entries, we have to be extremely careful with how we use IA32_GS_BASE + // and IA32_KERNEL_GS_BASE. If FSGSBASE is enabled, then we have no way to differentiate these + // two, as paranoid interrupts (e.g. NMIs) can occur even in kernel mode. In fact, they can + // even occur within another IRQ, so we cannot check the the privilege level via the stack. + // + // TODO: Linux uses the Interrupt Stack Table to figure out which NMIs were nested. Perhaps + // this could be done here, because if nested (sp > initial_sp), that means the NMI could not + // have come from userspace. But then, knowing the initial sp would somehow have to involve + // percpu, which brings us back to square one. But it might be useful if we would allow faults + // in NMIs. + // + // What we do instead, is using a special entry in the GDT, since we know that the GDT will + // always be thread local, as it contains the TSS. This gives us more than 32 bits to work + // with, which already is the largest x2APIC ID that an x86 CPU can handle. Luckily we can also + // use the stack, even though there might be interrupts in between. + () => { concat!( + save_gsbase_paranoid!(), + + // Allocate stack space for 8 bytes GDT base and 2 bytes size (ignored). + "sub rsp, 16\n", + // Set it to the GDT base. + "sgdt [rsp + 6]\n", + // Get the base pointer + " + mov rax, [rsp + 8] + add rsp, 16 + ", + // Load the lower 32 bits of that GDT entry. + "mov eax, [rax]\n", + // Calculate the percpu offset. + " + mov rbx, {KERNEL_PERCPU_OFFSET} + shl rax, {KERNEL_PERCPU_SHIFT} + add rax, rbx + ", + // Set GSBASE to RAX accordingly + set_gsbase_paranoid!(), + ) } +} +macro_rules! nop { + () => { " + // Unused: {IA32_GS_BASE} {KERNEL_PERCPU_OFFSET} {KERNEL_PERCPU_SHIFT} + " } +} #[macro_export] macro_rules! interrupt_stack { // XXX: Apparently we cannot use $expr and check for bool exhaustiveness, so we will have to // use idents directly instead. - ($name:ident, super_atomic: $is_super_atomic:ident!, |$stack:ident| $code:block) => { - paste::item! { - #[no_mangle] - unsafe extern "C" fn [<__interrupt_ $name>]($stack: &mut $crate::arch::x86_64::interrupt::InterruptStack) { - // Deadlock safety: interrupts are not normally enabled in the kernel, except in - // kmain. However, no locks for context list nor even individual context locks, are - // ever meant to be acquired there. - let _guard = $crate::ptrace::set_process_regs($stack); + ($name:ident, $save1:ident!, $save2:ident!, $rstor2:ident!, $rstor1:ident!, is_paranoid: $is_paranoid:expr, |$stack:ident| $code:block) => { + #[naked] + pub unsafe extern "C" fn $name() { + unsafe extern "C" fn inner($stack: &mut $crate::arch::x86_64::interrupt::InterruptStack) { + let _guard; + + if !$is_paranoid { + // Deadlock safety: (non-paranoid) interrupts are not normally enabled in the + // kernel, except in kmain. However, no locks for context list nor even + // individual context locks, are ever meant to be acquired there. + _guard = $crate::ptrace::set_process_regs($stack); + } // TODO: Force the declarations to specify unsafe? @@ -343,46 +430,61 @@ macro_rules! interrupt_stack { $code } } - - function!($name => { + asm!(concat!( // Backup all userspace registers to stack - $is_super_atomic!(), + $save1!(), "push rax\n", push_scratch!(), push_preserved!(), + $save2!(), + // TODO: Map PTI // $crate::arch::x86_64::pti::map(); // Call inner function with pointer to stack - "mov rdi, rsp\n", - "call __interrupt_", stringify!($name), "\n", + " + mov rdi, rsp + call {inner} + ", // TODO: Unmap PTI // $crate::arch::x86_64::pti::unmap(); + $rstor2!(), + // Restore all userspace registers pop_preserved!(), pop_scratch!(), - $is_super_atomic!(), + $rstor1!(), "iretq\n", - }); + ), + + inner = sym inner, + IA32_GS_BASE = const(x86::msr::IA32_GS_BASE), + KERNEL_PERCPU_SHIFT = const(crate::KERNEL_PERCPU_SHIFT), + KERNEL_PERCPU_OFFSET = const(crate::KERNEL_PERCPU_OFFSET), + + options(noreturn), + + ); } }; - ($name:ident, |$stack:ident| $code:block) => { interrupt_stack!($name, super_atomic: swapgs_iff_ring3_fast!, |$stack| $code); }; + ($name:ident, |$stack:ident| $code:block) => { interrupt_stack!($name, swapgs_iff_ring3_fast!, nop!, nop!, swapgs_iff_ring3_fast!, is_paranoid: false, |$stack| $code); }; + ($name:ident, @paranoid, |$stack:ident| $code:block) => { interrupt_stack!($name, nop!, save_and_set_gsbase_paranoid!, restore_gsbase_paranoid!, nop!, is_paranoid: true, |$stack| $code); } } #[macro_export] macro_rules! interrupt { ($name:ident, || $code:block) => { - paste::item! { - #[no_mangle] - unsafe extern "C" fn [<__interrupt_ $name>]() { + #[naked] + pub unsafe extern "C" fn $name() { + unsafe extern "C" fn inner() { $code } - function!($name => { + asm!(concat!( // Backup all userspace registers to stack swapgs_iff_ring3_fast!(), "push rax\n", @@ -392,7 +494,7 @@ macro_rules! interrupt { // $crate::arch::x86_64::pti::map(); // Call inner function with pointer to stack - "call __interrupt_", stringify!($name), "\n", + "call {inner}\n", // TODO: Unmap PTI // $crate::arch::x86_64::pti::unmap(); @@ -402,7 +504,12 @@ macro_rules! interrupt { swapgs_iff_ring3_fast!(), "iretq\n", - }); + ), + + inner = sym inner, + + options(noreturn), + ); } }; } @@ -410,9 +517,9 @@ macro_rules! interrupt { #[macro_export] macro_rules! interrupt_error { ($name:ident, |$stack:ident| $code:block) => { - paste::item! { - #[no_mangle] - unsafe extern "C" fn [<__interrupt_ $name>]($stack: &mut $crate::arch::x86_64::interrupt::handler::InterruptErrorStack) { + #[naked] + pub unsafe extern "C" fn $name() { + unsafe extern "C" fn inner($stack: &mut $crate::arch::x86_64::interrupt::handler::InterruptErrorStack) { let _guard; // Only set_ptrace_process_regs if this error occured from userspace. If this fault @@ -432,7 +539,7 @@ macro_rules! interrupt_error { } } - function!($name => { + asm!(concat!( swapgs_iff_ring3_fast_errorcode!(), // Move rax into code's place, put code in last instead (to be // compatible with InterruptStack) @@ -449,8 +556,10 @@ macro_rules! interrupt_error { // $crate::arch::x86_64::pti::map(); // Call inner function with pointer to stack - "mov rdi, rsp\n", - "call __interrupt_", stringify!($name), "\n", + " + mov rdi, rsp + call {inner} + ", // TODO: Unmap PTI // $crate::arch::x86_64::pti::unmap(); @@ -462,9 +571,14 @@ macro_rules! interrupt_error { pop_preserved!(), pop_scratch!(), - swapgs_iff_ring3_fast_errorcode!(), + // The error code has already been popped, so use the regular macro. + swapgs_iff_ring3_fast!(), "iretq\n", - }); + ), + + inner = sym inner, + + options(noreturn)); } }; } From 41d5a2a786726b3655b0dd444b77b76ab1aa45f3 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Fri, 6 Aug 2021 18:08:04 +0200 Subject: [PATCH 3/4] Use naked functions in syscall inst handler too. --- src/arch/x86_64/gdt.rs | 3 --- src/arch/x86_64/interrupt/handler.rs | 28 ------------------- src/arch/x86_64/interrupt/syscall.rs | 40 +++++++++++++++++++--------- 3 files changed, 27 insertions(+), 44 deletions(-) diff --git a/src/arch/x86_64/gdt.rs b/src/arch/x86_64/gdt.rs index 5e974c3..b55db05 100644 --- a/src/arch/x86_64/gdt.rs +++ b/src/arch/x86_64/gdt.rs @@ -78,9 +78,6 @@ pub static mut GDT: [GdtEntry; 10] = [ #[repr(C, align(16))] pub struct ProcessorControlRegion { - // NOTE: If you plan to change any fields here, please make sure that you also modify the - // offsets in the syscall instruction handler accordingly! - pub tcb_end: usize, pub user_rsp_tmp: usize, pub tss: TssWrapper, diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs index 58ee4dd..4cf27e8 100644 --- a/src/arch/x86_64/interrupt/handler.rs +++ b/src/arch/x86_64/interrupt/handler.rs @@ -203,34 +203,6 @@ impl InterruptErrorStack { } } -#[macro_export] -macro_rules! intel_asm { - ($($strings:expr,)+) => { - global_asm!(concat!( - $($strings),+, - )); - }; -} -#[macro_export] -macro_rules! function { - ($name:ident => { $($body:expr,)+ }) => { - intel_asm!( - ".global ", stringify!($name), "\n", - ".type ", stringify!($name), ", @function\n", - ".section .text.", stringify!($name), ", \"ax\", @progbits\n", - // Align the function to a 16-byte boundary, padding with multi-byte NOPs. - ".p2align 4,,15\n", - stringify!($name), ":\n", - $($body),+, - ".size ", stringify!($name), ", . - ", stringify!($name), "\n", - ".text\n", - ); - extern "C" { - pub fn $name(); - } - }; -} - #[macro_export] macro_rules! push_scratch { () => { " diff --git a/src/arch/x86_64/interrupt/syscall.rs b/src/arch/x86_64/interrupt/syscall.rs index 3fbc49a..38c78c2 100644 --- a/src/arch/x86_64/interrupt/syscall.rs +++ b/src/arch/x86_64/interrupt/syscall.rs @@ -5,7 +5,8 @@ use crate::{ syscall, syscall::flag::{PTRACE_FLAG_IGNORE, PTRACE_STOP_PRE_SYSCALL, PTRACE_STOP_POST_SYSCALL}, }; -use x86::msr; +use memoffset::offset_of; +use x86::{bits64::task::TaskStateSegment, msr, segmentation::SegmentSelector}; pub unsafe fn init() { // IA32_STAR[31:0] are reserved. @@ -58,16 +59,18 @@ pub unsafe extern "C" fn __inner_syscall_instruction(stack: *mut InterruptStack) }); } -function!(syscall_instruction => { +#[naked] +pub unsafe extern "C" fn syscall_instruction() { + asm!(concat!( // Yes, this is magic. No, you don't need to understand " swapgs // Set gs segment to TSS - mov gs:[0x08], rsp // Save userspace stack pointer - mov rsp, gs:[0x14] // Load kernel stack pointer - push QWORD PTR 5 * 8 + 3 // Push fake userspace SS (resembling iret frame) - push QWORD PTR gs:[0x08] // Push userspace rsp + mov gs:[{sp}], rsp // Save userspace stack pointer + mov rsp, gs:[{ksp}] // Load kernel stack pointer + push QWORD PTR {ss_sel} // Push fake userspace SS (resembling iret frame) + push QWORD PTR gs:[{sp}] // Push userspace rsp push r11 // Push rflags - push QWORD PTR 6 * 8 + 3 // Push fake CS (resembling iret stack frame) + push QWORD PTR {cs_sel} // Push fake CS (resembling iret stack frame) push rcx // Push userspace return pointer ", @@ -113,8 +116,8 @@ function!(syscall_instruction => { pop rcx // Pop userspace return pointer add rsp, 8 // Pop fake userspace CS pop r11 // Pop rflags - pop QWORD PTR gs:[0x08] // Pop userspace stack pointer - mov rsp, gs:[0x08] // Restore userspace stack pointer + pop QWORD PTR gs:[{sp}] // Pop userspace stack pointer + mov rsp, gs:[{sp}] // Restore userspace stack pointer swapgs // Restore gs from TSS to user data sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS @@ -125,8 +128,16 @@ function!(syscall_instruction => { xor r11, r11 swapgs iretq - ", -}); + "), + + sp = const(offset_of!(gdt::ProcessorControlRegion, user_rsp_tmp)), + ksp = const(offset_of!(gdt::ProcessorControlRegion, tss) + offset_of!(TaskStateSegment, rsp)), + ss_sel = const(SegmentSelector::new(gdt::GDT_USER_DATA as u16, x86::Ring::Ring3).bits()), + cs_sel = const(SegmentSelector::new(gdt::GDT_USER_CODE as u16, x86::Ring::Ring3).bits()), + + options(noreturn), + ); +} interrupt_stack!(syscall, |stack| { with_interrupt_stack!(|stack| { @@ -150,7 +161,9 @@ interrupt_stack!(syscall, |stack| { }) }); -function!(clone_ret => { +#[naked] +pub unsafe extern "C" fn clone_ret() { + asm!(concat!( // The address of this instruction is injected by `clone` in process.rs, on // top of the stack syscall->inner in this file, which is done using the rbp // register we save there. @@ -165,4 +178,5 @@ function!(clone_ret => { "pop rbp\n", // ...and we return to the address at the top of the stack "ret\n", -}); + ), options(noreturn)); +} From 1a80351a2c820bb85c208c86189c307275f952fb Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Mon, 9 Aug 2021 14:25:15 +0200 Subject: [PATCH 4/4] Fix TLS in paranoid entries. --- src/arch/x86_64/device/local_apic.rs | 14 ++++++++++-- src/arch/x86_64/gdt.rs | 7 +++++- src/arch/x86_64/interrupt/handler.rs | 34 ++++++++++++++++------------ src/arch/x86_64/start.rs | 4 ++-- 4 files changed, 40 insertions(+), 19 deletions(-) diff --git a/src/arch/x86_64/device/local_apic.rs b/src/arch/x86_64/device/local_apic.rs index 4f7c515..ab2c784 100644 --- a/src/arch/x86_64/device/local_apic.rs +++ b/src/arch/x86_64/device/local_apic.rs @@ -106,10 +106,15 @@ impl LocalApic { unsafe { wrmsr(IA32_X2APIC_ICR, value); } } else { unsafe { - while self.read(0x300) & 1 << 12 == 1 << 12 {} + const PENDING: u32 = 1 << 12; + while self.read(0x300) & PENDING == PENDING { + core::hint::spin_loop(); + } self.write(0x310, (value >> 32) as u32); self.write(0x300, value as u32); - while self.read(0x300) & 1 << 12 == 1 << 12 {} + while self.read(0x300) & PENDING == PENDING { + core::hint::spin_loop(); + } } } } @@ -123,6 +128,11 @@ impl LocalApic { } self.set_icr(icr); } + // Not used just yet, but allows triggering an NMI to another processor. + pub fn ipi_nmi(&mut self, apic_id: u32) { + let shift = if self.x2 { 32 } else { 56 }; + self.set_icr((u64::from(apic_id) << shift) | (1 << 14) | (0b100 << 8)); + } pub unsafe fn eoi(&mut self) { if self.x2 { diff --git a/src/arch/x86_64/gdt.rs b/src/arch/x86_64/gdt.rs index b55db05..6842dd9 100644 --- a/src/arch/x86_64/gdt.rs +++ b/src/arch/x86_64/gdt.rs @@ -21,6 +21,7 @@ pub const GDT_USER_DATA: usize = 5; pub const GDT_USER_CODE: usize = 6; pub const GDT_TSS: usize = 7; pub const GDT_TSS_HIGH: usize = 8; +pub const GDT_CPU_ID_CONTAINER: usize = 9; pub const GDT_A_PRESENT: u8 = 1 << 7; pub const GDT_A_RING_0: u8 = 0 << 5; @@ -145,7 +146,7 @@ pub unsafe fn init() { } /// Initialize GDT with TLS -pub unsafe fn init_paging(tcb_offset: usize, stack_offset: usize) { +pub unsafe fn init_paging(cpu_id: u32, tcb_offset: usize, stack_offset: usize) { // Set temporary TLS segment to the self-pointer of the Thread Control Block. x86::msr::wrmsr(x86::msr::IA32_GS_BASE, tcb_offset as u64); @@ -179,6 +180,10 @@ pub unsafe fn init_paging(tcb_offset: usize, stack_offset: usize) { (&mut GDT[GDT_TSS_HIGH] as *mut GdtEntry).cast::().write(tss_hi); } + // And finally, populate the last GDT entry with the current CPU ID, to allow paranoid + // interrupt handlers to safely use TLS. + (&mut GDT[GDT_CPU_ID_CONTAINER] as *mut GdtEntry).cast::().write(cpu_id); + // Set the stack pointer to use when coming back from userspace. set_tss_stack(stack_offset); diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs index 4cf27e8..609d7b2 100644 --- a/src/arch/x86_64/interrupt/handler.rs +++ b/src/arch/x86_64/interrupt/handler.rs @@ -83,7 +83,8 @@ impl IretRegisters { unsafe { let fsbase = x86::msr::rdmsr(x86::msr::IA32_FS_BASE); let gsbase = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE); - println!("FSBASE {:>016X}\nGSBASE {:016X}", fsbase, gsbase); + let kgsbase = x86::msr::rdmsr(x86::msr::IA32_GS_BASE); + println!("FSBASE {:>016X}\nGSBASE {:016X}\nKGSBASE {:016X}", fsbase, gsbase, kgsbase); } } } @@ -151,7 +152,6 @@ impl InterruptStack { pub fn load(&mut self, all: &IntRegisters) { // TODO: Which of these should be allowed to change? - // self.fs = all.fs; self.preserved.r15 = all.r15; self.preserved.r14 = all.r14; self.preserved.r13 = all.r13; @@ -319,14 +319,14 @@ macro_rules! restore_gsbase_paranoid { macro_rules! set_gsbase_paranoid { () => { " // Unused: {IA32_GS_BASE} - wrgsbase rax + wrgsbase rdx " } } #[cfg(not(feature = "x86_fsgsbase"))] macro_rules! set_gsbase_paranoid { () => { " mov ecx, {IA32_GS_BASE} - mov rdx, rax + mov eax, edx shr rdx, 32 wrmsr " } @@ -338,16 +338,20 @@ macro_rules! save_and_set_gsbase_paranoid { // two, as paranoid interrupts (e.g. NMIs) can occur even in kernel mode. In fact, they can // even occur within another IRQ, so we cannot check the the privilege level via the stack. // - // TODO: Linux uses the Interrupt Stack Table to figure out which NMIs were nested. Perhaps - // this could be done here, because if nested (sp > initial_sp), that means the NMI could not - // have come from userspace. But then, knowing the initial sp would somehow have to involve - // percpu, which brings us back to square one. But it might be useful if we would allow faults - // in NMIs. - // // What we do instead, is using a special entry in the GDT, since we know that the GDT will // always be thread local, as it contains the TSS. This gives us more than 32 bits to work // with, which already is the largest x2APIC ID that an x86 CPU can handle. Luckily we can also // use the stack, even though there might be interrupts in between. + // + // TODO: Linux uses the Interrupt Stack Table to figure out which NMIs were nested. Perhaps + // this could be done here, because if nested (sp > initial_sp), that means the NMI could not + // have come from userspace. But then, knowing the initial sp would somehow have to involve + // percpu, which brings us back to square one. But it might be useful if we would allow faults + // in NMIs. If we do detect a nested interrupt, then we can perform the iretq procedure + // ourselves, so that the newly nested NMI still blocks additional interrupts while still + // returning to the previously (faulting) NMI. See https://lwn.net/Articles/484932/, although I + // think the solution becomes a bit simpler when we cannot longer rely on GSBASE anymore. + () => { concat!( save_gsbase_paranoid!(), @@ -361,12 +365,12 @@ macro_rules! save_and_set_gsbase_paranoid { add rsp, 16 ", // Load the lower 32 bits of that GDT entry. - "mov eax, [rax]\n", + "mov edx, [rax + {gdt_cpu_id_offset}]\n", // Calculate the percpu offset. " mov rbx, {KERNEL_PERCPU_OFFSET} - shl rax, {KERNEL_PERCPU_SHIFT} - add rax, rbx + shl rdx, {KERNEL_PERCPU_SHIFT} + add rdx, rbx ", // Set GSBASE to RAX accordingly set_gsbase_paranoid!(), @@ -374,7 +378,7 @@ macro_rules! save_and_set_gsbase_paranoid { } macro_rules! nop { () => { " - // Unused: {IA32_GS_BASE} {KERNEL_PERCPU_OFFSET} {KERNEL_PERCPU_SHIFT} + // Unused: {IA32_GS_BASE} {KERNEL_PERCPU_OFFSET} {KERNEL_PERCPU_SHIFT} {gdt_cpu_id_offset} " } } @@ -438,6 +442,8 @@ macro_rules! interrupt_stack { KERNEL_PERCPU_SHIFT = const(crate::KERNEL_PERCPU_SHIFT), KERNEL_PERCPU_OFFSET = const(crate::KERNEL_PERCPU_OFFSET), + gdt_cpu_id_offset = const(crate::gdt::GDT_CPU_ID_CONTAINER * core::mem::size_of::()), + options(noreturn), ); diff --git a/src/arch/x86_64/start.rs b/src/arch/x86_64/start.rs index 9f94e08..b07901c 100644 --- a/src/arch/x86_64/start.rs +++ b/src/arch/x86_64/start.rs @@ -113,7 +113,7 @@ pub unsafe extern fn kstart(args_ptr: *const KernelArgs) -> ! { let (mut active_table, tcb_offset) = paging::init(0); // Set up GDT after paging with TLS - gdt::init_paging(tcb_offset, stack_base + stack_size); + gdt::init_paging(0, tcb_offset, stack_base + stack_size); // Set up IDT idt::init_paging_bsp(); @@ -206,7 +206,7 @@ pub unsafe extern fn kstart_ap(args_ptr: *const KernelArgsAp) -> ! { let tcb_offset = paging::init_ap(cpu_id, bsp_table); // Set up GDT with TLS - gdt::init_paging(tcb_offset, stack_end); + gdt::init_paging(cpu_id as u32, tcb_offset, stack_end); // Set up IDT for AP idt::init_paging_post_heap(false, cpu_id);