Merge branch 'sysretq' into 'master'

Use faster sysretq when returning from system calls

See merge request redox-os/kernel!168
This commit is contained in:
Jeremy Soller
2021-02-15 18:59:19 +00:00
9 changed files with 293 additions and 80 deletions

View File

@@ -14,11 +14,12 @@ pub const GDT_NULL: usize = 0;
pub const GDT_KERNEL_CODE: usize = 1;
pub const GDT_KERNEL_DATA: usize = 2;
pub const GDT_KERNEL_TLS: usize = 3;
pub const GDT_USER_CODE: usize = 4;
pub const GDT_USER_CODE32_UNUSED: usize = 4;
pub const GDT_USER_DATA: usize = 5;
pub const GDT_USER_TLS: usize = 6;
pub const GDT_TSS: usize = 7;
pub const GDT_TSS_HIGH: usize = 8;
pub const GDT_USER_CODE: usize = 6;
pub const GDT_USER_TLS: usize = 7;
pub const GDT_TSS: usize = 8;
pub const GDT_TSS_HIGH: usize = 9;
pub const GDT_A_PRESENT: u8 = 1 << 7;
pub const GDT_A_RING_0: u8 = 0 << 5;
@@ -61,7 +62,7 @@ pub static mut GDTR: DescriptorTablePointer<SegmentDescriptor> = DescriptorTable
};
#[thread_local]
pub static mut GDT: [GdtEntry; 9] = [
pub static mut GDT: [GdtEntry; 10] = [
// Null
GdtEntry::new(0, 0, 0, 0),
// Kernel code
@@ -70,10 +71,12 @@ pub static mut GDT: [GdtEntry; 9] = [
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
// Kernel TLS
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
// User code
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
// Dummy 32-bit user code segment - required by the IA32_STAR selector layout that SYSRET uses (32-bit CS, then SS, then 64-bit CS). We restrict it to ring 0 anyway.
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE),
// User data
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
// User (64-bit) code
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
// User TLS
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
// TSS
@@ -82,15 +85,39 @@ pub static mut GDT: [GdtEntry; 9] = [
GdtEntry::new(0, 0, 0, 0),
];
#[repr(packed)]
// Wrapper placing two extra u64 slots directly after the hardware TSS.
// `packed` keeps the layout exact: `TaskStateSegment` is 0x68 bytes
// (u32 + 3*u64 + u64 + 7*u64 + u64 + u16 + u16), so `_pad` sits at offset
// 0x68 and `_user_stack` at offset 0x70 -- the assembly entry paths address
// the latter as `gs:[0x70]` once GS base points at this struct.
// NOTE(review): `Deref` hands out `&self.base` from a packed struct, which
// may be an underaligned reference -- confirm this is tolerated here.
pub struct TssWrapper {
// The hardware task-state segment itself (at offset 0x0).
base: TaskStateSegment,
// Padding so that `_user_stack` lands exactly at offset 0x70.
_pad: u64,
// Scratch slot for the userspace stack pointer; read and written only from
// assembly via `gs:[0x70]` in the syscall entry/exit paths.
_user_stack: u64,
}
// Let `TSS` be used transparently as a `TaskStateSegment`
// (e.g. `TSS.ist[...]`, `TSS.rsp[...]`).
impl core::ops::Deref for TssWrapper {
type Target = TaskStateSegment;
fn deref(&self) -> &Self::Target {
// NOTE(review): `TssWrapper` is `repr(packed)`; this relies on `base`
// being at offset 0 and the reference being adequately aligned -- confirm.
&self.base
}
}
// Mutable access to the inner `TaskStateSegment`, so existing code that
// assigns through `TSS` (e.g. filling IST entries) keeps working unchanged.
impl core::ops::DerefMut for TssWrapper {
fn deref_mut(&mut self) -> &mut Self::Target {
// NOTE(review): mutable reference into a `repr(packed)` struct -- same
// alignment caveat as the `Deref` impl; confirm.
&mut self.base
}
}
#[thread_local]
pub static mut TSS: TaskStateSegment = TaskStateSegment {
reserved: 0,
rsp: [0; 3],
reserved2: 0,
ist: [0; 7],
reserved3: 0,
reserved4: 0,
iomap_base: 0xFFFF
// Per-CPU TSS, wrapped so the userspace-rsp scratch slot sits at a fixed
// offset (0x70) from the GS base, which is pointed at this struct while in
// kernel mode.
pub static mut TSS: TssWrapper = TssWrapper {
base: TaskStateSegment {
reserved: 0,
// rsp[0] (offset 4, i.e. `gs:[4]` in the syscall entry asm) holds the
// kernel stack pointer loaded on entry from userspace.
rsp: [0; 3],
reserved2: 0,
// Interrupt Stack Table; entry 0 is filled in later with the per-CPU
// backup interrupt stack (see idt init).
ist: [0; 7],
reserved3: 0,
reserved4: 0,
// Base past the segment limit: no I/O permission bitmap.
iomap_base: 0xFFFF
},
_pad: 0_u64,
// Accessed only from assembly, at `gs:[0x70]`
_user_stack: 0_u64,
};
pub unsafe fn set_tcb(pid: usize) {
@@ -164,11 +191,18 @@ pub unsafe fn init_paging(tcb_offset: usize, stack_offset: usize) {
segmentation::load_ds(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
segmentation::load_es(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
segmentation::load_fs(SegmentSelector::new(GDT_KERNEL_TLS as u16, Ring::Ring0));
segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
// Load the task register
task::load_tr(SegmentSelector::new(GDT_TSS as u16, Ring::Ring0));
// Ensure that GS always points to the TSS segment in kernel space.
x86::msr::wrmsr(x86::msr::IA32_GS_BASE, &TSS as *const _ as usize as u64);
// Inside kernel space, GS should _always_ point to the TSS. When leaving userspace, `swapgs`
// is called again, making the userspace GS always point to user data.
x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0);
}
#[derive(Copy, Clone, Debug)]

View File

@@ -154,10 +154,64 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) {
IDTR.limit = (current_idt.len() * mem::size_of::<IdtEntry>() - 1) as u16;
IDTR.base = current_idt.as_ptr() as *const X86IdtEntry;
let backup_ist = {
// We give Non-Maskable Interrupts, Double Fault, and Machine Check exceptions separate
// stacks, since these (unless we are going to set up NMI watchdogs like Linux does) are
// considered the most fatal, especially Double Faults which are caused by errors __when
// accessing the system IDT__. If that goes wrong, then kernel memory may be partially
// corrupt, and we want a separate stack.
//
// Note that each CPU has its own "backup interrupt stack".
let index = 1_u8;
// Allocate 64 KiB of stack space for the backup stack.
const BACKUP_STACK_SIZE: usize = 65536;
assert_eq!(BACKUP_STACK_SIZE % crate::memory::PAGE_SIZE, 0);
let page_count = BACKUP_STACK_SIZE / crate::memory::PAGE_SIZE;
let frames = crate::memory::allocate_frames(page_count)
.expect("failed to allocate pages for backup interrupt stack");
// Map them linearly, i.e. KERNEL_OFFSET + physaddr.
let base_address = {
use crate::memory::{Frame, PhysicalAddress};
use crate::paging::{ActivePageTable, Page, VirtualAddress};
use crate::paging::entry::EntryFlags;
let mut active_table = ActivePageTable::new();
let base_virtual_address = VirtualAddress::new(frames.start_address().data() + crate::KERNEL_OFFSET);
for i in 0..page_count {
let virtual_address = VirtualAddress::new(base_virtual_address.data() + i * crate::memory::PAGE_SIZE);
let physical_address = PhysicalAddress::new(frames.start_address().data() + i * crate::memory::PAGE_SIZE);
let page = Page::containing_address(virtual_address);
let flags = EntryFlags::PRESENT | EntryFlags::WRITABLE | EntryFlags::NO_EXECUTE;
let flusher = if let Some(already_mapped) = active_table.translate_page(page) {
assert_eq!(already_mapped.start_address(), physical_address, "address already mapped, but non-linearly");
active_table.remap(page, flags)
} else {
active_table.map_to(page, Frame::containing_address(physical_address), flags)
};
flusher.flush(&mut active_table);
}
base_virtual_address
};
// Stack always grows downwards.
let address = base_address.data() + BACKUP_STACK_SIZE;
// Put them in the 1st entry of the IST.
crate::gdt::TSS.ist[usize::from(index - 1)] = address as u64;
index
};
// Set up exceptions
current_idt[0].set_func(exception::divide_by_zero);
current_idt[1].set_func(exception::debug);
current_idt[2].set_func(exception::non_maskable);
current_idt[2].set_ist(backup_ist);
current_idt[3].set_func(exception::breakpoint);
current_idt[3].set_flags(IdtFlags::PRESENT | IdtFlags::RING_3 | IdtFlags::INTERRUPT);
current_idt[4].set_func(exception::overflow);
@@ -165,6 +219,7 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) {
current_idt[6].set_func(exception::invalid_opcode);
current_idt[7].set_func(exception::device_not_available);
current_idt[8].set_func(exception::double_fault);
current_idt[8].set_ist(backup_ist);
// 9 no longer available
current_idt[10].set_func(exception::invalid_tss);
current_idt[11].set_func(exception::segment_not_present);
@@ -175,6 +230,7 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) {
current_idt[16].set_func(exception::fpu_fault);
current_idt[17].set_func(exception::alignment_check);
current_idt[18].set_func(exception::machine_check);
current_idt[18].set_ist(backup_ist);
current_idt[19].set_func(exception::simd);
current_idt[20].set_func(exception::virtualization);
// 21 through 29 reserved
@@ -275,6 +331,12 @@ impl IdtEntry {
self.attribute = flags.bits;
}
/// Select which Interrupt Stack Table entry (0 = none, 1..=7) this IDT
/// vector switches to; the index lives in the low three bits of the
/// descriptor's `zero` byte, whose remaining bits must stay untouched.
pub fn set_ist(&mut self, ist: u8) {
// The IST field is only 3 bits wide; reject anything that would spill over.
assert_eq!(ist & 0x07, ist, "interrupt stack table must be within 0..=7");
// Drop the previous index and store the new one in a single expression.
self.zero = (self.zero & !0x07) | ist;
}
pub fn set_offset(&mut self, selector: u16, base: usize) {
self.selector = selector;
self.offsetl = base as u16;
@@ -285,6 +347,6 @@ impl IdtEntry {
// A function to set the offset more easily
pub fn set_func(&mut self, func: unsafe extern fn()) {
self.set_flags(IdtFlags::PRESENT | IdtFlags::RING_0 | IdtFlags::INTERRUPT);
self.set_offset(8, func as usize);
self.set_offset((crate::gdt::GDT_KERNEL_CODE as u16) << 3, func as usize);
}
}

View File

@@ -18,7 +18,7 @@ interrupt_stack!(divide_by_zero, |stack| {
ksignal(SIGFPE);
});
interrupt_stack!(debug, |stack| {
interrupt_stack!(debug, super_atomic: swapgs_iff_ring3_slow!, |stack| {
let mut handled = false;
// Disable singlestep before there is a breakpoint, since the breakpoint
@@ -41,7 +41,7 @@ interrupt_stack!(debug, |stack| {
}
});
interrupt_stack!(non_maskable, |stack| {
interrupt_stack!(non_maskable, super_atomic: swapgs_iff_ring3_slow!, |stack| {
println!("Non-maskable interrupt");
stack.dump();
});
@@ -153,7 +153,7 @@ interrupt_error!(alignment_check, |stack| {
ksignal(SIGBUS);
});
interrupt_stack!(machine_check, |stack| {
interrupt_stack!(machine_check, super_atomic: swapgs_iff_ring3_slow!, |stack| {
println!("Machine check fault");
stack.dump();
stack_trace();

View File

@@ -75,6 +75,11 @@ impl IretRegisters {
println!("RFLAG: {:>016X}", { self.rflags });
println!("CS: {:>016X}", { self.cs });
println!("RIP: {:>016X}", { self.rip });
if self.cs & 0b11 != 0b00 {
println!("RSP: {:>016X}", { self.rsp });
println!("SS: {:>016X}", { self.ss });
}
}
}
@@ -304,9 +309,49 @@ macro_rules! pop_fs {
" };
}
macro_rules! swapgs_iff_ring3_fast {
() => { "
// Check whether the last two bits RSP+8 (code segment) are equal to zero.
test QWORD PTR [rsp + 8], 0x3
// Skip the SWAPGS instruction if CS & 0b11 == 0b00.
jz 1f
swapgs
1:
" };
}
macro_rules! swapgs_iff_ring3_fast_errorcode {
() => { "
test QWORD PTR [rsp + 16], 0x3
jz 1f
swapgs
1:
" };
}
#[macro_export]
macro_rules! swapgs_iff_ring3_slow {
() => { "
push rax
push rdx
push rcx
mov ecx, 0xC0000102
rdmsr
shl rdx, 32
or eax, edx
test rdx, rdx
jnz 1f
swapgs
1:
pop rcx
pop rdx
pop rax
" }
}
#[macro_export]
macro_rules! interrupt_stack {
($name:ident, |$stack:ident| $code:block) => {
// XXX: Apparently we cannot use $expr and check for bool exhaustiveness, so we will have to
// use idents directly instead.
($name:ident, super_atomic: $is_super_atomic:ident!, |$stack:ident| $code:block) => {
paste::item! {
#[no_mangle]
unsafe extern "C" fn [<__interrupt_ $name>](stack: *mut $crate::arch::x86_64::interrupt::InterruptStack) {
@@ -322,6 +367,7 @@ macro_rules! interrupt_stack {
function!($name => {
// Backup all userspace registers to stack
$is_super_atomic!(),
"push rax\n",
push_scratch!(),
push_preserved!(),
@@ -342,10 +388,12 @@ macro_rules! interrupt_stack {
pop_preserved!(),
pop_scratch!(),
$is_super_atomic!(),
"iretq\n",
});
}
};
($name:ident, |$stack:ident| $code:block) => { interrupt_stack!($name, super_atomic: swapgs_iff_ring3_fast!, |$stack| $code); };
}
#[macro_export]
@@ -359,6 +407,7 @@ macro_rules! interrupt {
function!($name => {
// Backup all userspace registers to stack
swapgs_iff_ring3_fast!(),
"push rax\n",
push_scratch!(),
push_fs!(),
@@ -376,6 +425,7 @@ macro_rules! interrupt {
pop_fs!(),
pop_scratch!(),
swapgs_iff_ring3_fast!(),
"iretq\n",
});
}
@@ -399,6 +449,7 @@ macro_rules! interrupt_error {
}
function!($name => {
swapgs_iff_ring3_fast_errorcode!(),
// Move rax into code's place, put code in last instead (to be
// compatible with InterruptStack)
"xchg [rsp], rax\n",
@@ -429,6 +480,7 @@ macro_rules! interrupt_error {
pop_preserved!(),
pop_scratch!(),
swapgs_iff_ring3_fast_errorcode!(),
"iretq\n",
});
}

View File

@@ -8,10 +8,20 @@ use crate::{
use x86::msr;
pub unsafe fn init() {
msr::wrmsr(msr::IA32_STAR, ((gdt::GDT_KERNEL_CODE as u64) << 3) << 32);
// IA32_STAR[31:0] are reserved.
// The base selector of the two consecutive segments for kernel code and the immediately
// succeeding stack (data).
let syscall_cs_ss_base = (gdt::GDT_KERNEL_CODE as u16) << 3;
// The base selector of the three consecutive segments (of which two are used) for user code
// and user data. It points to a 32-bit code segment, which must be followed by a data segment
// (stack), and a 64-bit code segment.
let sysret_cs_ss_base = ((gdt::GDT_USER_CODE32_UNUSED as u16) << 3) | 3;
let star_high = u32::from(syscall_cs_ss_base) | (u32::from(sysret_cs_ss_base) << 16);
msr::wrmsr(msr::IA32_STAR, u64::from(star_high) << 32);
msr::wrmsr(msr::IA32_LSTAR, syscall_instruction as u64);
msr::wrmsr(msr::IA32_FMASK, 0x0300); // Clear trap flag and interrupt enable
msr::wrmsr(msr::IA32_KERNEL_GSBASE, &gdt::TSS as *const _ as u64);
let efer = msr::rdmsr(msr::IA32_EFER);
msr::wrmsr(msr::IA32_EFER, efer | 1);
@@ -52,15 +62,13 @@ function!(syscall_instruction => {
// Yes, this is magic. No, you don't need to understand
"
swapgs // Set gs segment to TSS
mov gs:[28], rsp // Save userspace rsp
mov rsp, gs:[4] // Load kernel rsp
push 5 * 8 + 3 // Push userspace data segment
push QWORD PTR gs:[28] // Push userspace rsp
mov QWORD PTR gs:[28], 0 // Clear userspace rsp
mov gs:[0x70], rsp // Save userspace stack pointer
mov rsp, gs:[4] // Load kernel stack pointer
push QWORD PTR 5 * 8 + 3 // Push fake SS (resembling iret stack frame)
push QWORD PTR gs:[0x70] // Push userspace rsp
push r11 // Push rflags
push 4 * 8 + 3 // Push userspace code segment
push QWORD PTR 6 * 8 + 3 // Push fake CS (resembling iret stack frame)
push rcx // Push userspace return pointer
swapgs // Restore gs
",
// Push context registers
@@ -85,7 +93,41 @@ function!(syscall_instruction => {
pop_scratch!(),
// Return
"iretq\n",
//
// We must test whether RCX is canonical. This is not strictly necessary, but could be
// fatal if some kernel bug would allow RCX to be modified by user code.
//
// See https://xenproject.org/2012/06/13/the-intel-sysret-privilege-escalation/.
//
// This is not just theoretical; ptrace allows userspace to change rcx of target processes.
"
pop rcx // Pop userspace return pointer
// Set CF (BT stores the tested bit in the carry flag) iff forbidden bit 47 (i.e. the bit that must be sign-extended) is set.
bt rcx, 47
// If the bit was set, the address was a non-canonical higher-half address, so jump to the
// slower iretq path and handle the error without executing attacker-controlled code!
jmp 1f
// Otherwise, continue with the fast sysretq.
sub rsp, 8 // Pop fake userspace CS
pop r11 // Pop rflags
pop QWORD PTR gs:[0x70] // Pop userspace stack pointer
mov rsp, gs:[0x70] // Restore userspace stack pointer
swapgs // Restore gs from TSS to user data
sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS
1:
// Slow iretq
push rcx
xor rcx, rcx
xor r11, r11
swapgs
iretq
",
});
interrupt_stack!(syscall, |stack| {

View File

@@ -58,7 +58,7 @@ pub unsafe fn map() {
#[cfg(feature = "pti")]
#[inline(always)]
pub unsafe fn unmap() {
pub unsafe extern "C" fn unmap() {
// Switch to per-CPU stack
switch_stack(PTI_CONTEXT_STACK, PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len());
@@ -83,4 +83,4 @@ pub unsafe fn map() {}
#[cfg(not(feature = "pti"))]
#[inline(always)]
pub unsafe fn unmap() {}
pub unsafe extern "C" fn unmap() {}

View File

@@ -240,54 +240,77 @@ pub unsafe extern fn kstart_ap(args_ptr: *const KernelArgsAp) -> ! {
}
#[naked]
pub unsafe fn usermode(ip: usize, sp: usize, arg: usize, singlestep: bool) -> ! {
let mut flags = FLAG_INTERRUPTS;
if singlestep {
flags |= FLAG_SINGLESTEP;
}
#[inline(never)]
// TODO: AbiCompatBool
pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _singlestep: u32) -> ! {
// rdi, rsi, rdx, rcx
asm!(
"
mov rbx, {flag_interrupts}
test ecx, ecx
jz .after_singlestep_branch
or rbx, {flag_singlestep}
asm!("push r10
push r11
push r12
push r13
push r14
push r15",
in("r10") (gdt::GDT_USER_DATA << 3 | 3), // Data segment
in("r11") sp, // Stack pointer
in("r12") flags, // Flags
in("r13") (gdt::GDT_USER_CODE << 3 | 3), // Code segment
in("r14") ip, // IP
in("r15") arg, // Argument
);
.after_singlestep_branch:
// Unmap kernel
pti::unmap();
// save `ip` (rdi), `sp` (rsi), and `arg` (rdx) in callee-preserved registers, so that
// they are not modified by `pti_unmap`
// Go to usermode
asm!("mov ds, r14d
mov es, r14d
mov fs, r15d
mov gs, r14d
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor rsi, rsi
xor rdi, rdi
xor rbp, rbp
xor r8, r8
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
fninit
pop rdi
iretq",
in("r14") (gdt::GDT_USER_DATA << 3 | 3), // Data segment
in("r15") (gdt::GDT_USER_TLS << 3 | 3), // TLS segment
options(noreturn),
mov r13, rdi
mov r14, rsi
mov r15, rdx
call {pti_unmap}
// Go to usermode
swapgs
mov r8, {user_data_seg_selector}
mov r9, {user_tls_seg_selector}
mov ds, r8d
mov es, r8d
mov fs, r9d
mov gs, r8d
// Target RFLAGS
mov r11, rbx
// Target instruction pointer
mov rcx, r13
// Target stack pointer
mov rsp, r14
// Target argument
mov rdi, r15
xor rax, rax
xor rbx, rbx
// Don't zero rcx; it's used for `ip`.
xor rdx, rdx
// Don't zero rdi; it's used for `arg`.
xor rsi, rsi
xor rbp, rbp
// Don't zero rsp, obviously.
xor r8, r8
xor r9, r9
xor r10, r10
// Don't zero r11; it's used for `rflags`.
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
fninit
// NOTE: Regarding the sysretq vulnerability, this is safe as we cannot modify RCX,
// even though the caller can give us the wrong address. But, it's marked unsafe, so
// the caller is responsible for this! (And, the likelihood of rcx being changed in the
// middle here, is minimal, unless the attacker already has partial control of kernel
// memory.)
sysretq
",
flag_interrupts = const(FLAG_INTERRUPTS),
flag_singlestep = const(FLAG_SINGLESTEP),
pti_unmap = sym pti::unmap,
user_data_seg_selector = const(gdt::GDT_USER_DATA << 3 | 3),
user_tls_seg_selector = const(gdt::GDT_USER_TLS << 3 | 3),
options(noreturn),
);
}

View File

@@ -122,7 +122,7 @@ pub extern "C" fn signal_handler(sig: usize) {
sp -= mem::size_of::<usize>();
*(sp as *mut usize) = restorer;
usermode(handler, sp, sig, singlestep);
usermode(handler, sp, sig, u32::from(singlestep));
}
}

View File

@@ -845,7 +845,7 @@ fn fexec_noreturn(
}
// Go to usermode
unsafe { usermode(entry, sp, 0, singlestep) }
unsafe { usermode(entry, sp, 0, u32::from(singlestep)) }
}
pub fn fexec_kernel(fd: FileHandle, args: Box<[Box<[u8]>]>, vars: Box<[Box<[u8]>]>, name_override_opt: Option<Box<str>>, auxv: Option<Vec<usize>>) -> Result<usize> {