Merge branch 'sysretq' into 'master'
Use faster sysretq when returning from system calls See merge request redox-os/kernel!168
This commit is contained in:
@@ -14,11 +14,12 @@ pub const GDT_NULL: usize = 0;
|
||||
pub const GDT_KERNEL_CODE: usize = 1;
|
||||
pub const GDT_KERNEL_DATA: usize = 2;
|
||||
pub const GDT_KERNEL_TLS: usize = 3;
|
||||
pub const GDT_USER_CODE: usize = 4;
|
||||
pub const GDT_USER_CODE32_UNUSED: usize = 4;
|
||||
pub const GDT_USER_DATA: usize = 5;
|
||||
pub const GDT_USER_TLS: usize = 6;
|
||||
pub const GDT_TSS: usize = 7;
|
||||
pub const GDT_TSS_HIGH: usize = 8;
|
||||
pub const GDT_USER_CODE: usize = 6;
|
||||
pub const GDT_USER_TLS: usize = 7;
|
||||
pub const GDT_TSS: usize = 8;
|
||||
pub const GDT_TSS_HIGH: usize = 9;
|
||||
|
||||
pub const GDT_A_PRESENT: u8 = 1 << 7;
|
||||
pub const GDT_A_RING_0: u8 = 0 << 5;
|
||||
@@ -61,7 +62,7 @@ pub static mut GDTR: DescriptorTablePointer<SegmentDescriptor> = DescriptorTable
|
||||
};
|
||||
|
||||
#[thread_local]
|
||||
pub static mut GDT: [GdtEntry; 9] = [
|
||||
pub static mut GDT: [GdtEntry; 10] = [
|
||||
// Null
|
||||
GdtEntry::new(0, 0, 0, 0),
|
||||
// Kernel code
|
||||
@@ -70,10 +71,12 @@ pub static mut GDT: [GdtEntry; 9] = [
|
||||
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
|
||||
// Kernel TLS
|
||||
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
|
||||
// User code
|
||||
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
|
||||
// Dummy 32-bit user code - apparently necessary for SYSEXIT. We restrict it to ring 0 anyway.
|
||||
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE),
|
||||
// User data
|
||||
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
|
||||
// User (64-bit) code
|
||||
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
|
||||
// User TLS
|
||||
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE),
|
||||
// TSS
|
||||
@@ -82,15 +85,39 @@ pub static mut GDT: [GdtEntry; 9] = [
|
||||
GdtEntry::new(0, 0, 0, 0),
|
||||
];
|
||||
|
||||
/// Task state segment followed by extra kernel-private storage.
///
/// `base` is the `TaskStateSegment` itself; the `Deref`/`DerefMut` impls forward to it, so
/// accesses such as `TSS.ist[..]` reach the inner fields directly. `#[repr(packed)]` keeps the
/// fields contiguous, with no padding inserted by the compiler.
#[repr(packed)]
|
||||
pub struct TssWrapper {
|
||||
// The wrapped task state segment.
base: TaskStateSegment,
|
||||
// Filler between `base` and `_user_stack`.
// NOTE(review): presumably sized so that `_user_stack` lands at offset 0x70 — confirm against
// `size_of::<TaskStateSegment>()`.
_pad: u64,
|
||||
// Slot that the `TSS` initializer describes as accessed only from assembly, at `gs:[0x70]`.
_user_stack: u64,
|
||||
}
|
||||
// Lets a `TssWrapper` be read as if it were the wrapped `TaskStateSegment`.
impl core::ops::Deref for TssWrapper {
|
||||
type Target = TaskStateSegment;
|
||||
|
||||
// Returns a shared reference to the inner `base` segment.
fn deref(&self) -> &Self::Target {
|
||||
&self.base
|
||||
}
|
||||
}
|
||||
// Lets a `TssWrapper` be mutated as if it were the wrapped `TaskStateSegment`.
impl core::ops::DerefMut for TssWrapper {
|
||||
// Returns a mutable reference to the inner `base` segment.
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.base
|
||||
}
|
||||
}
|
||||
|
||||
#[thread_local]
|
||||
pub static mut TSS: TaskStateSegment = TaskStateSegment {
|
||||
reserved: 0,
|
||||
rsp: [0; 3],
|
||||
reserved2: 0,
|
||||
ist: [0; 7],
|
||||
reserved3: 0,
|
||||
reserved4: 0,
|
||||
iomap_base: 0xFFFF
|
||||
pub static mut TSS: TssWrapper = TssWrapper {
|
||||
base: TaskStateSegment {
|
||||
reserved: 0,
|
||||
rsp: [0; 3],
|
||||
reserved2: 0,
|
||||
ist: [0; 7],
|
||||
reserved3: 0,
|
||||
reserved4: 0,
|
||||
iomap_base: 0xFFFF
|
||||
},
|
||||
_pad: 0_u64,
|
||||
// Accessed only from assembly, at `gs:[0x70]`
|
||||
_user_stack: 0_u64,
|
||||
};
|
||||
|
||||
pub unsafe fn set_tcb(pid: usize) {
|
||||
@@ -164,11 +191,18 @@ pub unsafe fn init_paging(tcb_offset: usize, stack_offset: usize) {
|
||||
segmentation::load_ds(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
segmentation::load_es(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
segmentation::load_fs(SegmentSelector::new(GDT_KERNEL_TLS as u16, Ring::Ring0));
|
||||
|
||||
segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
|
||||
// Load the task register
|
||||
task::load_tr(SegmentSelector::new(GDT_TSS as u16, Ring::Ring0));
|
||||
|
||||
// Ensure that GS always points to the TSS segment in kernel space.
|
||||
x86::msr::wrmsr(x86::msr::IA32_GS_BASE, &TSS as *const _ as usize as u64);
|
||||
// Inside kernel space, GS should _always_ point to the TSS. When leaving userspace, `swapgs`
|
||||
// is called again, making the userspace GS always point to user data.
|
||||
x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0);
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
|
||||
@@ -154,10 +154,64 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) {
|
||||
IDTR.limit = (current_idt.len() * mem::size_of::<IdtEntry>() - 1) as u16;
|
||||
IDTR.base = current_idt.as_ptr() as *const X86IdtEntry;
|
||||
|
||||
let backup_ist = {
|
||||
// We give Non-Maskable Interrupts, Double Fault, and Machine Check exceptions separate
|
||||
// stacks, since these (unless we are going to set up NMI watchdogs like Linux does) are
|
||||
// considered the most fatal, especially Double Faults which are caused by errors __when
|
||||
// accessing the system IDT__. If that goes wrong, then kernel memory may be partially
|
||||
// corrupt, and we want a separate stack.
|
||||
//
|
||||
// Note that each CPU has its own "backup interrupt stack".
|
||||
let index = 1_u8;
|
||||
|
||||
// Allocate 64 KiB of stack space for the backup stack.
|
||||
const BACKUP_STACK_SIZE: usize = 65536;
|
||||
assert_eq!(BACKUP_STACK_SIZE % crate::memory::PAGE_SIZE, 0);
|
||||
let page_count = BACKUP_STACK_SIZE / crate::memory::PAGE_SIZE;
|
||||
let frames = crate::memory::allocate_frames(page_count)
|
||||
.expect("failed to allocate pages for backup interrupt stack");
|
||||
|
||||
// Map them linearly, i.e. KERNEL_OFFSET + physaddr.
|
||||
let base_address = {
|
||||
use crate::memory::{Frame, PhysicalAddress};
|
||||
use crate::paging::{ActivePageTable, Page, VirtualAddress};
|
||||
use crate::paging::entry::EntryFlags;
|
||||
|
||||
let mut active_table = ActivePageTable::new();
|
||||
let base_virtual_address = VirtualAddress::new(frames.start_address().data() + crate::KERNEL_OFFSET);
|
||||
|
||||
for i in 0..page_count {
|
||||
let virtual_address = VirtualAddress::new(base_virtual_address.data() + i * crate::memory::PAGE_SIZE);
|
||||
let physical_address = PhysicalAddress::new(frames.start_address().data() + i * crate::memory::PAGE_SIZE);
|
||||
let page = Page::containing_address(virtual_address);
|
||||
|
||||
let flags = EntryFlags::PRESENT | EntryFlags::WRITABLE | EntryFlags::NO_EXECUTE;
|
||||
|
||||
let flusher = if let Some(already_mapped) = active_table.translate_page(page) {
|
||||
assert_eq!(already_mapped.start_address(), physical_address, "address already mapped, but non-linearly");
|
||||
active_table.remap(page, flags)
|
||||
} else {
|
||||
active_table.map_to(page, Frame::containing_address(physical_address), flags)
|
||||
};
|
||||
flusher.flush(&mut active_table);
|
||||
}
|
||||
|
||||
base_virtual_address
|
||||
};
|
||||
// Stack always grows downwards.
|
||||
let address = base_address.data() + BACKUP_STACK_SIZE;
|
||||
|
||||
// Put them in the 1st entry of the IST.
|
||||
crate::gdt::TSS.ist[usize::from(index - 1)] = address as u64;
|
||||
|
||||
index
|
||||
};
|
||||
|
||||
// Set up exceptions
|
||||
current_idt[0].set_func(exception::divide_by_zero);
|
||||
current_idt[1].set_func(exception::debug);
|
||||
current_idt[2].set_func(exception::non_maskable);
|
||||
current_idt[2].set_ist(backup_ist);
|
||||
current_idt[3].set_func(exception::breakpoint);
|
||||
current_idt[3].set_flags(IdtFlags::PRESENT | IdtFlags::RING_3 | IdtFlags::INTERRUPT);
|
||||
current_idt[4].set_func(exception::overflow);
|
||||
@@ -165,6 +219,7 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) {
|
||||
current_idt[6].set_func(exception::invalid_opcode);
|
||||
current_idt[7].set_func(exception::device_not_available);
|
||||
current_idt[8].set_func(exception::double_fault);
|
||||
current_idt[8].set_ist(backup_ist);
|
||||
// 9 no longer available
|
||||
current_idt[10].set_func(exception::invalid_tss);
|
||||
current_idt[11].set_func(exception::segment_not_present);
|
||||
@@ -175,6 +230,7 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) {
|
||||
current_idt[16].set_func(exception::fpu_fault);
|
||||
current_idt[17].set_func(exception::alignment_check);
|
||||
current_idt[18].set_func(exception::machine_check);
|
||||
current_idt[18].set_ist(backup_ist);
|
||||
current_idt[19].set_func(exception::simd);
|
||||
current_idt[20].set_func(exception::virtualization);
|
||||
// 21 through 29 reserved
|
||||
@@ -275,6 +331,12 @@ impl IdtEntry {
|
||||
self.attribute = flags.bits;
|
||||
}
|
||||
|
||||
/// Stores the interrupt stack table index `ist` in the low three bits of `self.zero`.
///
/// The upper five bits of `self.zero` are left untouched.
///
/// # Panics
///
/// Panics if `ist` does not fit in three bits (i.e. is greater than 7).
pub fn set_ist(&mut self, ist: u8) {
|
||||
assert_eq!(ist & 0x07, ist, "interrupt stack table must be within 0..=7");
|
||||
// Clear the previous index before merging in the new one.
self.zero &= 0xF8;
|
||||
self.zero |= ist;
|
||||
}
|
||||
|
||||
pub fn set_offset(&mut self, selector: u16, base: usize) {
|
||||
self.selector = selector;
|
||||
self.offsetl = base as u16;
|
||||
@@ -285,6 +347,6 @@ impl IdtEntry {
|
||||
// A function to set the offset more easily
|
||||
pub fn set_func(&mut self, func: unsafe extern fn()) {
|
||||
self.set_flags(IdtFlags::PRESENT | IdtFlags::RING_0 | IdtFlags::INTERRUPT);
|
||||
self.set_offset(8, func as usize);
|
||||
self.set_offset((crate::gdt::GDT_KERNEL_CODE as u16) << 3, func as usize);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ interrupt_stack!(divide_by_zero, |stack| {
|
||||
ksignal(SIGFPE);
|
||||
});
|
||||
|
||||
interrupt_stack!(debug, |stack| {
|
||||
interrupt_stack!(debug, super_atomic: swapgs_iff_ring3_slow!, |stack| {
|
||||
let mut handled = false;
|
||||
|
||||
// Disable singlestep before there is a breakpoint, since the breakpoint
|
||||
@@ -41,7 +41,7 @@ interrupt_stack!(debug, |stack| {
|
||||
}
|
||||
});
|
||||
|
||||
interrupt_stack!(non_maskable, |stack| {
|
||||
interrupt_stack!(non_maskable, super_atomic: swapgs_iff_ring3_slow!, |stack| {
|
||||
println!("Non-maskable interrupt");
|
||||
stack.dump();
|
||||
});
|
||||
@@ -153,7 +153,7 @@ interrupt_error!(alignment_check, |stack| {
|
||||
ksignal(SIGBUS);
|
||||
});
|
||||
|
||||
interrupt_stack!(machine_check, |stack| {
|
||||
interrupt_stack!(machine_check, super_atomic: swapgs_iff_ring3_slow!, |stack| {
|
||||
println!("Machine check fault");
|
||||
stack.dump();
|
||||
stack_trace();
|
||||
|
||||
@@ -75,6 +75,11 @@ impl IretRegisters {
|
||||
println!("RFLAG: {:>016X}", { self.rflags });
|
||||
println!("CS: {:>016X}", { self.cs });
|
||||
println!("RIP: {:>016X}", { self.rip });
|
||||
|
||||
if self.cs & 0b11 != 0b00 {
|
||||
println!("RSP: {:>016X}", { self.rsp });
|
||||
println!("SS: {:>016X}", { self.ss });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -304,9 +309,49 @@ macro_rules! pop_fs {
|
||||
" };
|
||||
}
|
||||
|
||||
// Expands to an assembly snippet that executes `swapgs` only when the low two bits of the
// quadword at `[rsp + 8]` (the saved code segment, per the inline comment) are non-zero.
//
// The snippet defines the local label `1:` and modifies the flags through `test`.
macro_rules! swapgs_iff_ring3_fast {
|
||||
() => { "
|
||||
// Check whether the last two bits RSP+8 (code segment) are equal to zero.
|
||||
test QWORD PTR [rsp + 8], 0x3
|
||||
// Skip the SWAPGS instruction if CS & 0b11 == 0b00.
|
||||
jz 1f
|
||||
swapgs
|
||||
1:
|
||||
" };
|
||||
}
|
||||
// Same check as `swapgs_iff_ring3_fast`, but the tested quadword is at `[rsp + 16]` instead of
// `[rsp + 8]`: `swapgs` runs only when its low two bits are non-zero.
//
// NOTE(review): the extra 8 bytes presumably account for an error code sitting on top of the
// interrupt frame — confirm that the stack has this shape at every expansion site.
macro_rules! swapgs_iff_ring3_fast_errorcode {
|
||||
() => { "
|
||||
test QWORD PTR [rsp + 16], 0x3
|
||||
jz 1f
|
||||
swapgs
|
||||
1:
|
||||
" };
|
||||
}
|
||||
// Expands to an assembly snippet that decides whether to execute `swapgs` from an MSR read
// instead of from the saved code segment.
//
// The snippet saves rax, rdx and rcx, reads MSR 0xC0000102 with `rdmsr`, and executes `swapgs`
// only when `rdx` is zero after `shl rdx, 32`, i.e. when the upper 32 bits of the MSR value are
// zero. The three registers are restored before the snippet ends.
//
// NOTE(review): 0xC0000102 is presumably IA32_KERNEL_GS_BASE — confirm against the MSR
// constants used elsewhere in the kernel.
// NOTE(review): after `shl rdx, 32` the low half `edx` is zero, so `or eax, edx` leaves `eax`
// unchanged and the low 32 bits of the MSR never influence the branch — confirm that this, and
// the `jnz` polarity, are what is intended.
#[macro_export]
|
||||
macro_rules! swapgs_iff_ring3_slow {
|
||||
() => { "
|
||||
push rax
|
||||
push rdx
|
||||
push rcx
|
||||
mov ecx, 0xC0000102
|
||||
rdmsr
|
||||
shl rdx, 32
|
||||
or eax, edx
|
||||
test rdx, rdx
|
||||
jnz 1f
|
||||
swapgs
|
||||
1:
|
||||
pop rcx
|
||||
pop rdx
|
||||
pop rax
|
||||
" }
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! interrupt_stack {
|
||||
($name:ident, |$stack:ident| $code:block) => {
|
||||
// XXX: Apparently we cannot use $expr and check for bool exhaustiveness, so we will have to
|
||||
// use idents directly instead.
|
||||
($name:ident, super_atomic: $is_super_atomic:ident!, |$stack:ident| $code:block) => {
|
||||
paste::item! {
|
||||
#[no_mangle]
|
||||
unsafe extern "C" fn [<__interrupt_ $name>](stack: *mut $crate::arch::x86_64::interrupt::InterruptStack) {
|
||||
@@ -322,6 +367,7 @@ macro_rules! interrupt_stack {
|
||||
|
||||
function!($name => {
|
||||
// Backup all userspace registers to stack
|
||||
$is_super_atomic!(),
|
||||
"push rax\n",
|
||||
push_scratch!(),
|
||||
push_preserved!(),
|
||||
@@ -342,10 +388,12 @@ macro_rules! interrupt_stack {
|
||||
pop_preserved!(),
|
||||
pop_scratch!(),
|
||||
|
||||
$is_super_atomic!(),
|
||||
"iretq\n",
|
||||
});
|
||||
}
|
||||
};
|
||||
($name:ident, |$stack:ident| $code:block) => { interrupt_stack!($name, super_atomic: swapgs_iff_ring3_fast!, |$stack| $code); };
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
@@ -359,6 +407,7 @@ macro_rules! interrupt {
|
||||
|
||||
function!($name => {
|
||||
// Backup all userspace registers to stack
|
||||
swapgs_iff_ring3_fast!(),
|
||||
"push rax\n",
|
||||
push_scratch!(),
|
||||
push_fs!(),
|
||||
@@ -376,6 +425,7 @@ macro_rules! interrupt {
|
||||
pop_fs!(),
|
||||
pop_scratch!(),
|
||||
|
||||
swapgs_iff_ring3_fast!(),
|
||||
"iretq\n",
|
||||
});
|
||||
}
|
||||
@@ -399,6 +449,7 @@ macro_rules! interrupt_error {
|
||||
}
|
||||
|
||||
function!($name => {
|
||||
swapgs_iff_ring3_fast_errorcode!(),
|
||||
// Move rax into code's place, put code in last instead (to be
|
||||
// compatible with InterruptStack)
|
||||
"xchg [rsp], rax\n",
|
||||
@@ -429,6 +480,7 @@ macro_rules! interrupt_error {
|
||||
pop_preserved!(),
|
||||
pop_scratch!(),
|
||||
|
||||
swapgs_iff_ring3_fast_errorcode!(),
|
||||
"iretq\n",
|
||||
});
|
||||
}
|
||||
|
||||
@@ -8,10 +8,20 @@ use crate::{
|
||||
use x86::msr;
|
||||
|
||||
pub unsafe fn init() {
|
||||
msr::wrmsr(msr::IA32_STAR, ((gdt::GDT_KERNEL_CODE as u64) << 3) << 32);
|
||||
// IA32_STAR[31:0] are reserved.
|
||||
|
||||
// The base selector of the two consecutive segments for kernel code and the immediately
|
||||
// succeeding stack (data).
|
||||
let syscall_cs_ss_base = (gdt::GDT_KERNEL_CODE as u16) << 3;
|
||||
// The base selector of the three consecutive segments (of which two are used) for user code
|
||||
// and user data. It points to a 32-bit code segment, which must be followed by a data segment
|
||||
// (stack), and a 64-bit code segment.
|
||||
let sysret_cs_ss_base = ((gdt::GDT_USER_CODE32_UNUSED as u16) << 3) | 3;
|
||||
let star_high = u32::from(syscall_cs_ss_base) | (u32::from(sysret_cs_ss_base) << 16);
|
||||
|
||||
msr::wrmsr(msr::IA32_STAR, u64::from(star_high) << 32);
|
||||
msr::wrmsr(msr::IA32_LSTAR, syscall_instruction as u64);
|
||||
msr::wrmsr(msr::IA32_FMASK, 0x0300); // Clear trap flag and interrupt enable
|
||||
msr::wrmsr(msr::IA32_KERNEL_GSBASE, &gdt::TSS as *const _ as u64);
|
||||
|
||||
let efer = msr::rdmsr(msr::IA32_EFER);
|
||||
msr::wrmsr(msr::IA32_EFER, efer | 1);
|
||||
@@ -52,15 +62,13 @@ function!(syscall_instruction => {
|
||||
// Yes, this is magic. No, you don't need to understand
|
||||
"
|
||||
swapgs // Set gs segment to TSS
|
||||
mov gs:[28], rsp // Save userspace rsp
|
||||
mov rsp, gs:[4] // Load kernel rsp
|
||||
push 5 * 8 + 3 // Push userspace data segment
|
||||
push QWORD PTR gs:[28] // Push userspace rsp
|
||||
mov QWORD PTR gs:[28], 0 // Clear userspace rsp
|
||||
mov gs:[0x70], rsp // Save userspace stack pointer
|
||||
mov rsp, gs:[4] // Load kernel stack pointer
|
||||
push QWORD PTR 5 * 8 + 3 // Push fake SS (resembling iret stack frame)
|
||||
push QWORD PTR gs:[0x70] // Push userspace rsp
|
||||
push r11 // Push rflags
|
||||
push 4 * 8 + 3 // Push userspace code segment
|
||||
push QWORD PTR 6 * 8 + 3 // Push fake CS (resembling iret stack frame)
|
||||
push rcx // Push userspace return pointer
|
||||
swapgs // Restore gs
|
||||
",
|
||||
|
||||
// Push context registers
|
||||
@@ -85,7 +93,41 @@ function!(syscall_instruction => {
|
||||
pop_scratch!(),
|
||||
|
||||
// Return
|
||||
"iretq\n",
|
||||
//
|
||||
// We must test whether RCX is canonical. This is not strictly necessary, but could be
|
||||
// fatal if some kernel bug would allow RCX to be modified by user code.
|
||||
//
|
||||
// See https://xenproject.org/2012/06/13/the-intel-sysret-privilege-escalation/.
|
||||
//
|
||||
// This is not just theoretical; ptrace allows userspace to change rcx of target processes.
|
||||
"
|
||||
pop rcx // Pop userspace return pointer
|
||||
|
||||
// Set ZF iff forbidden bit 47 (i.e. the bit that must be sign extended) is set.
|
||||
bt rcx, 47
|
||||
|
||||
// If ZF was set, i.e. the address was invalid higher-half, so jump to the slower iretq and
|
||||
// handle the error without being able to execute attacker-controlled code!
|
||||
jmp 1f
|
||||
|
||||
// Otherwise, continue with the fast sysretq.
|
||||
|
||||
sub rsp, 8 // Pop fake userspace CS
|
||||
pop r11 // Pop rflags
|
||||
pop QWORD PTR gs:[0x70] // Pop userspace stack pointer
|
||||
mov rsp, gs:[0x70] // Restore userspace stack pointer
|
||||
swapgs // Restore gs from TSS to user data
|
||||
sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS
|
||||
|
||||
1:
|
||||
|
||||
// Slow iretq
|
||||
push rcx
|
||||
xor rcx, rcx
|
||||
xor r11, r11
|
||||
swapgs
|
||||
iretq
|
||||
",
|
||||
});
|
||||
|
||||
interrupt_stack!(syscall, |stack| {
|
||||
|
||||
@@ -58,7 +58,7 @@ pub unsafe fn map() {
|
||||
|
||||
#[cfg(feature = "pti")]
|
||||
#[inline(always)]
|
||||
pub unsafe fn unmap() {
|
||||
pub unsafe extern "C" fn unmap() {
|
||||
// Switch to per-CPU stack
|
||||
switch_stack(PTI_CONTEXT_STACK, PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len());
|
||||
|
||||
@@ -83,4 +83,4 @@ pub unsafe fn map() {}
|
||||
|
||||
#[cfg(not(feature = "pti"))]
|
||||
#[inline(always)]
|
||||
pub unsafe fn unmap() {}
|
||||
pub unsafe extern "C" fn unmap() {}
|
||||
|
||||
@@ -240,54 +240,77 @@ pub unsafe extern fn kstart_ap(args_ptr: *const KernelArgsAp) -> ! {
|
||||
}
|
||||
|
||||
#[naked]
|
||||
pub unsafe fn usermode(ip: usize, sp: usize, arg: usize, singlestep: bool) -> ! {
|
||||
let mut flags = FLAG_INTERRUPTS;
|
||||
if singlestep {
|
||||
flags |= FLAG_SINGLESTEP;
|
||||
}
|
||||
#[inline(never)]
|
||||
// TODO: AbiCompatBool
|
||||
pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _singlestep: u32) -> ! {
|
||||
// rdi, rsi, rdx, rcx
|
||||
asm!(
|
||||
"
|
||||
mov rbx, {flag_interrupts}
|
||||
test ecx, ecx
|
||||
jz .after_singlestep_branch
|
||||
or rbx, {flag_singlestep}
|
||||
|
||||
asm!("push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15",
|
||||
in("r10") (gdt::GDT_USER_DATA << 3 | 3), // Data segment
|
||||
in("r11") sp, // Stack pointer
|
||||
in("r12") flags, // Flags
|
||||
in("r13") (gdt::GDT_USER_CODE << 3 | 3), // Code segment
|
||||
in("r14") ip, // IP
|
||||
in("r15") arg, // Argument
|
||||
);
|
||||
.after_singlestep_branch:
|
||||
|
||||
// Unmap kernel
|
||||
pti::unmap();
|
||||
// save `ip` (rdi), `sp` (rsi), and `arg` (rdx) in callee-preserved registers, so that
|
||||
// they are not modified by `pti_unmap`
|
||||
|
||||
// Go to usermode
|
||||
asm!("mov ds, r14d
|
||||
mov es, r14d
|
||||
mov fs, r15d
|
||||
mov gs, r14d
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor rsi, rsi
|
||||
xor rdi, rdi
|
||||
xor rbp, rbp
|
||||
xor r8, r8
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
fninit
|
||||
pop rdi
|
||||
iretq",
|
||||
in("r14") (gdt::GDT_USER_DATA << 3 | 3), // Data segment
|
||||
in("r15") (gdt::GDT_USER_TLS << 3 | 3), // TLS segment
|
||||
options(noreturn),
|
||||
mov r13, rdi
|
||||
mov r14, rsi
|
||||
mov r15, rdx
|
||||
call {pti_unmap}
|
||||
|
||||
// Go to usermode
|
||||
swapgs
|
||||
mov r8, {user_data_seg_selector}
|
||||
mov r9, {user_tls_seg_selector}
|
||||
mov ds, r8d
|
||||
mov es, r8d
|
||||
mov fs, r9d
|
||||
mov gs, r8d
|
||||
|
||||
// Target RFLAGS
|
||||
mov r11, rbx
|
||||
// Target instruction pointer
|
||||
mov rcx, r13
|
||||
// Target stack pointer
|
||||
mov rsp, r14
|
||||
// Target argument
|
||||
mov rdi, r15
|
||||
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
// Don't zero rcx; it's used for `ip`.
|
||||
xor rdx, rdx
|
||||
// Don't zero rdi; it's used for `arg`.
|
||||
xor rsi, rsi
|
||||
xor rbp, rbp
|
||||
// Don't zero rsp, obviously.
|
||||
xor r8, r8
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
// Don't zero r11; it's used for `rflags`.
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
|
||||
fninit
|
||||
|
||||
// NOTE: Regarding the sysretq vulnerability, this is safe as we cannot modify RCX,
|
||||
// even though the caller can give us the wrong address. But, it's marked unsafe, so
|
||||
// the caller is responsible for this! (And, the likelihood of rcx being changed in the
|
||||
// middle here, is minimal, unless the attacker already has partial control of kernel
|
||||
// memory.)
|
||||
sysretq
|
||||
",
|
||||
|
||||
flag_interrupts = const(FLAG_INTERRUPTS),
|
||||
flag_singlestep = const(FLAG_SINGLESTEP),
|
||||
pti_unmap = sym pti::unmap,
|
||||
user_data_seg_selector = const(gdt::GDT_USER_DATA << 3 | 3),
|
||||
user_tls_seg_selector = const(gdt::GDT_USER_TLS << 3 | 3),
|
||||
options(noreturn),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -122,7 +122,7 @@ pub extern "C" fn signal_handler(sig: usize) {
|
||||
sp -= mem::size_of::<usize>();
|
||||
*(sp as *mut usize) = restorer;
|
||||
|
||||
usermode(handler, sp, sig, singlestep);
|
||||
usermode(handler, sp, sig, u32::from(singlestep));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -845,7 +845,7 @@ fn fexec_noreturn(
|
||||
}
|
||||
|
||||
// Go to usermode
|
||||
unsafe { usermode(entry, sp, 0, singlestep) }
|
||||
unsafe { usermode(entry, sp, 0, u32::from(singlestep)) }
|
||||
}
|
||||
|
||||
pub fn fexec_kernel(fd: FileHandle, args: Box<[Box<[u8]>]>, vars: Box<[Box<[u8]>]>, name_override_opt: Option<Box<str>>, auxv: Option<Vec<usize>>) -> Result<usize> {
|
||||
|
||||
Reference in New Issue
Block a user