Use GS for TLS!
Previously, the kernel used the regular FS segment for Thread-Local Storage. The problem, however, is that userspace code also uses FS for TLS, meaning the kernel would have to switch the FS segment between user and kernel _on every syscall_. This is obviously suboptimal for performance (especially with fast syscalls such as futex, nanosleep, or yield). I had to search LLVM for hours just to find out that the insertion of the memory load with FS was actually done in the linker, so I added a flag for that. I haven't done any proper benchmarking, but the boot process seems to have gotten much faster!
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
//! Global descriptor table
|
||||
|
||||
use core::convert::TryInto;
|
||||
use core::mem;
|
||||
|
||||
use x86::segmentation::load_cs;
|
||||
use x86::bits64::task::TaskStateSegment;
|
||||
use x86::Ring;
|
||||
@@ -13,7 +15,7 @@ use crate::paging::PAGE_SIZE;
|
||||
pub const GDT_NULL: usize = 0;
|
||||
pub const GDT_KERNEL_CODE: usize = 1;
|
||||
pub const GDT_KERNEL_DATA: usize = 2;
|
||||
pub const GDT_KERNEL_TLS: usize = 3;
|
||||
pub const GDT_KERNEL_KPCR: usize = 3;
|
||||
pub const GDT_USER_CODE32_UNUSED: usize = 4;
|
||||
pub const GDT_USER_DATA: usize = 5;
|
||||
pub const GDT_USER_CODE: usize = 6;
|
||||
@@ -39,11 +41,6 @@ pub const GDT_F_PAGE_SIZE: u8 = 1 << 7;
|
||||
pub const GDT_F_PROTECTED_MODE: u8 = 1 << 6;
|
||||
pub const GDT_F_LONG_MODE: u8 = 1 << 5;
|
||||
|
||||
static mut INIT_GDTR: DescriptorTablePointer<SegmentDescriptor> = DescriptorTablePointer {
|
||||
limit: 0,
|
||||
base: 0 as *const SegmentDescriptor
|
||||
};
|
||||
|
||||
static mut INIT_GDT: [GdtEntry; 4] = [
|
||||
// Null
|
||||
GdtEntry::new(0, 0, 0, 0),
|
||||
@@ -55,12 +52,6 @@ static mut INIT_GDT: [GdtEntry; 4] = [
|
||||
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE)
|
||||
];
|
||||
|
||||
#[thread_local]
|
||||
pub static mut GDTR: DescriptorTablePointer<SegmentDescriptor> = DescriptorTablePointer {
|
||||
limit: 0,
|
||||
base: 0 as *const SegmentDescriptor
|
||||
};
|
||||
|
||||
#[thread_local]
|
||||
pub static mut GDT: [GdtEntry; 10] = [
|
||||
// Null
|
||||
@@ -85,28 +76,26 @@ pub static mut GDT: [GdtEntry; 10] = [
|
||||
GdtEntry::new(0, 0, 0, 0),
|
||||
];
|
||||
|
||||
#[repr(packed)]
|
||||
pub struct TssWrapper {
|
||||
base: TaskStateSegment,
|
||||
_pad: u64,
|
||||
_user_stack: u64,
|
||||
}
|
||||
impl core::ops::Deref for TssWrapper {
|
||||
type Target = TaskStateSegment;
|
||||
#[repr(C, align(16))]
|
||||
pub struct ProcessorControlRegion {
|
||||
// NOTE: If you plan to change any fields here, please make sure that you also modify the
|
||||
// offsets in the syscall instruction handler accordingly!
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.base
|
||||
}
|
||||
}
|
||||
impl core::ops::DerefMut for TssWrapper {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.base
|
||||
}
|
||||
pub tcb_end: usize,
|
||||
pub user_rsp_tmp: usize,
|
||||
pub tss: TssWrapper,
|
||||
}
|
||||
|
||||
// NOTE: Despite not using #[repr(packed)], we do know that while there may be some padding
|
||||
// inserted before and after the TSS, the main TSS structure will remain intact.
|
||||
#[repr(C, align(16))]
|
||||
pub struct TssWrapper(pub TaskStateSegment);
|
||||
|
||||
#[thread_local]
|
||||
pub static mut TSS: TssWrapper = TssWrapper {
|
||||
base: TaskStateSegment {
|
||||
pub static mut KPCR: ProcessorControlRegion = ProcessorControlRegion {
|
||||
tcb_end: 0,
|
||||
user_rsp_tmp: 0,
|
||||
tss: TssWrapper(TaskStateSegment {
|
||||
reserved: 0,
|
||||
rsp: [0; 3],
|
||||
reserved2: 0,
|
||||
@@ -114,95 +103,117 @@ pub static mut TSS: TssWrapper = TssWrapper {
|
||||
reserved3: 0,
|
||||
reserved4: 0,
|
||||
iomap_base: 0xFFFF
|
||||
},
|
||||
_pad: 0_u64,
|
||||
// Accessed only from assembly, at `gs:[0x70]`
|
||||
_user_stack: 0_u64,
|
||||
}),
|
||||
};
|
||||
|
||||
pub unsafe fn set_tcb(pid: usize) {
|
||||
GDT[GDT_USER_TLS].set_offset((crate::USER_TCB_OFFSET + pid * PAGE_SIZE) as u32);
|
||||
x86::segmentation::load_fs(SegmentSelector::new(GDT_USER_TLS as u16, Ring::Ring3));
|
||||
}
|
||||
|
||||
#[cfg(feature = "pti")]
|
||||
pub unsafe fn set_tss_stack(stack: usize) {
|
||||
use super::pti::{PTI_CPU_STACK, PTI_CONTEXT_STACK};
|
||||
TSS.rsp[0] = (PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len()) as u64;
|
||||
KPCR.tss.0.rsp[0] = (PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len()) as u64;
|
||||
PTI_CONTEXT_STACK = stack;
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "pti"))]
|
||||
pub unsafe fn set_tss_stack(stack: usize) {
|
||||
TSS.rsp[0] = stack as u64;
|
||||
KPCR.tss.0.rsp[0] = stack as u64;
|
||||
}
|
||||
|
||||
// Initialize GDT
|
||||
pub unsafe fn init() {
|
||||
// Setup the initial GDT with TLS, so we can setup the TLS GDT (a little confusing)
|
||||
// This means that each CPU will have its own GDT, but we only need to define it once as a thread local
|
||||
INIT_GDTR.limit = (INIT_GDT.len() * mem::size_of::<GdtEntry>() - 1) as u16;
|
||||
INIT_GDTR.base = INIT_GDT.as_ptr() as *const SegmentDescriptor;
|
||||
{
|
||||
// Setup the initial GDT with TLS, so we can setup the TLS GDT (a little confusing)
|
||||
// This means that each CPU will have its own GDT, but we only need to define it once as a thread local
|
||||
|
||||
// Load the initial GDT, before we have access to thread locals
|
||||
dtables::lgdt(&INIT_GDTR);
|
||||
let limit = (INIT_GDT.len() * mem::size_of::<GdtEntry>() - 1)
|
||||
.try_into()
|
||||
.expect("initial GDT way too large");
|
||||
let base = INIT_GDT.as_ptr() as *const SegmentDescriptor;
|
||||
|
||||
let init_gdtr: DescriptorTablePointer<SegmentDescriptor> = DescriptorTablePointer {
|
||||
limit,
|
||||
base,
|
||||
};
|
||||
|
||||
// Load the initial GDT, before we have access to thread locals
|
||||
dtables::lgdt(&init_gdtr);
|
||||
}
|
||||
|
||||
// Load the segment descriptors
|
||||
load_cs(SegmentSelector::new(GDT_KERNEL_CODE as u16, Ring::Ring0));
|
||||
segmentation::load_ds(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
segmentation::load_es(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
segmentation::load_fs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_KPCR as u16, Ring::Ring0));
|
||||
segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
}
|
||||
|
||||
/// Initialize GDT with TLS
|
||||
pub unsafe fn init_paging(tcb_offset: usize, stack_offset: usize) {
|
||||
// Set the TLS segment to the offset of the Thread Control Block
|
||||
INIT_GDT[GDT_KERNEL_TLS].set_offset(tcb_offset as u32);
|
||||
// Set temporary TLS segment to the self-pointer of the Thread Control Block.
|
||||
x86::msr::wrmsr(x86::msr::IA32_GS_BASE, tcb_offset as u64);
|
||||
|
||||
// Load the initial GDT, before we have access to thread locals
|
||||
dtables::lgdt(&INIT_GDTR);
|
||||
// Now that we have access to thread locals, begin by getting a pointer to the Processor
|
||||
// Control Region.
|
||||
let kpcr = &mut KPCR;
|
||||
|
||||
// Load the segment descriptors
|
||||
segmentation::load_fs(SegmentSelector::new(GDT_KERNEL_TLS as u16, Ring::Ring0));
|
||||
// Then, setup the AP's individual GDT
|
||||
let limit = (GDT.len() * mem::size_of::<GdtEntry>() - 1)
|
||||
.try_into()
|
||||
.expect("main GDT way too large");
|
||||
let base = GDT.as_ptr() as *const SegmentDescriptor;
|
||||
|
||||
// Now that we have access to thread locals, setup the AP's individual GDT
|
||||
GDTR.limit = (GDT.len() * mem::size_of::<GdtEntry>() - 1) as u16;
|
||||
GDTR.base = GDT.as_ptr() as *const SegmentDescriptor;
|
||||
let gdtr: DescriptorTablePointer<SegmentDescriptor> = DescriptorTablePointer {
|
||||
limit,
|
||||
base,
|
||||
};
|
||||
|
||||
// Set the TLS segment to the offset of the Thread Control Block
|
||||
GDT[GDT_KERNEL_TLS].set_offset(tcb_offset as u32);
|
||||
// Once we have fetched the real KPCR address, set the TLS segment to the TCB pointer there.
|
||||
kpcr.tcb_end = (tcb_offset as *const usize).read();
|
||||
|
||||
{
|
||||
// We can now access our TSS, via the KPCR, which is a thread local
|
||||
let tss = &kpcr.tss.0 as *const _ as usize as u64;
|
||||
let tss_lo = (tss & 0xFFFF_FFFF) as u32;
|
||||
let tss_hi = (tss >> 32) as u32;
|
||||
|
||||
GDT[GDT_TSS].set_offset(tss_lo);
|
||||
GDT[GDT_TSS].set_limit(mem::size_of::<TaskStateSegment>() as u32);
|
||||
|
||||
(&mut GDT[GDT_TSS_HIGH] as *mut GdtEntry).cast::<u32>().write(tss_hi);
|
||||
}
|
||||
|
||||
// Set the stack pointer to use when coming back from userspace.
|
||||
set_tss_stack(stack_offset);
|
||||
|
||||
// Load the new GDT, which is correctly located in thread local storage.
|
||||
dtables::lgdt(&gdtr);
|
||||
|
||||
// Ensure that GS always points to the KPCR in kernel space.
|
||||
x86::msr::wrmsr(x86::msr::IA32_GS_BASE, kpcr as *mut _ as usize as u64);
|
||||
// Inside kernel space, GS should _always_ point to the KPCR. When leaving userspace, `swapgs`
|
||||
// is called again, making the userspace GS always point to user data.
|
||||
x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0);
|
||||
|
||||
// Set the User TLS segment to the offset of the user TCB
|
||||
set_tcb(0);
|
||||
|
||||
// We can now access our TSS, which is a thread local
|
||||
GDT[GDT_TSS].set_offset(&TSS as *const _ as u32);
|
||||
GDT[GDT_TSS].set_limit(mem::size_of::<TaskStateSegment>() as u32);
|
||||
|
||||
// Set the stack pointer when coming back from userspace
|
||||
set_tss_stack(stack_offset);
|
||||
|
||||
// Load the new GDT, which is correctly located in thread local storage
|
||||
dtables::lgdt(&GDTR);
|
||||
|
||||
// Reload the segment descriptors
|
||||
load_cs(SegmentSelector::new(GDT_KERNEL_CODE as u16, Ring::Ring0));
|
||||
segmentation::load_ds(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
segmentation::load_es(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
segmentation::load_fs(SegmentSelector::new(GDT_KERNEL_TLS as u16, Ring::Ring0));
|
||||
|
||||
segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
|
||||
|
||||
// NOTE: FS has already been updated while calling set_tcb.
|
||||
// NOTE: We do not want to load GS again, since it has already been loaded into
|
||||
// GDT_KERNEL_KPCR. Instead, we use the base MSR to allow for a 64-bit offset.
|
||||
|
||||
// Load the task register
|
||||
task::load_tr(SegmentSelector::new(GDT_TSS as u16, Ring::Ring0));
|
||||
|
||||
// Ensure that GS always points to the TSS segment in kernel space.
|
||||
x86::msr::wrmsr(x86::msr::IA32_GS_BASE, &TSS as *const _ as usize as u64);
|
||||
// Inside kernel space, GS should _always_ point to the TSS. When leaving userspace, `swapgs`
|
||||
// is called again, making the userspace GS always point to user data.
|
||||
x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0);
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
|
||||
@@ -202,7 +202,7 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) {
|
||||
let address = base_address.data() + BACKUP_STACK_SIZE;
|
||||
|
||||
// Put them in the 1st entry of the IST.
|
||||
crate::gdt::TSS.ist[usize::from(index - 1)] = address as u64;
|
||||
crate::gdt::KPCR.tss.0.ist[usize::from(index - 1)] = address as u64;
|
||||
|
||||
index
|
||||
};
|
||||
|
||||
@@ -86,7 +86,6 @@ impl IretRegisters {
|
||||
#[derive(Default)]
|
||||
#[repr(packed)]
|
||||
pub struct InterruptStack {
|
||||
pub fs: usize,
|
||||
pub preserved: PreservedRegisters,
|
||||
pub scratch: ScratchRegisters,
|
||||
pub iret: IretRegisters,
|
||||
@@ -97,13 +96,10 @@ impl InterruptStack {
|
||||
self.iret.dump();
|
||||
self.scratch.dump();
|
||||
self.preserved.dump();
|
||||
println!("FS: {:>016X}", { self.fs });
|
||||
}
|
||||
/// Saves all registers to a struct used by the proc:
|
||||
/// scheme to read/write registers.
|
||||
pub fn save(&self, all: &mut IntRegisters) {
|
||||
all.fs = self.fs;
|
||||
|
||||
all.r15 = self.preserved.r15;
|
||||
all.r14 = self.preserved.r14;
|
||||
all.r13 = self.preserved.r13;
|
||||
@@ -284,31 +280,6 @@ macro_rules! pop_preserved {
|
||||
pop rbx
|
||||
" };
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! push_fs {
|
||||
() => { "
|
||||
// Push fs
|
||||
push fs
|
||||
|
||||
// Load kernel tls
|
||||
//
|
||||
// NOTE: We can't load the value directly into `fs`. So we need to use a
|
||||
// scratch register (as preserved registers aren't backed up by the
|
||||
// interrupt! macro) to store it. We also can't use `rax` as the temporary
|
||||
// value, as during errors that's already used for the error code.
|
||||
mov rcx, 0x18
|
||||
mov fs, cx
|
||||
" };
|
||||
}
|
||||
#[macro_export]
|
||||
macro_rules! pop_fs {
|
||||
() => { "
|
||||
// Pop fs
|
||||
pop fs
|
||||
" };
|
||||
}
|
||||
|
||||
macro_rules! swapgs_iff_ring3_fast {
|
||||
() => { "
|
||||
// Check whether the last two bits RSP+8 (code segment) are equal to zero.
|
||||
@@ -371,7 +342,6 @@ macro_rules! interrupt_stack {
|
||||
"push rax\n",
|
||||
push_scratch!(),
|
||||
push_preserved!(),
|
||||
push_fs!(),
|
||||
|
||||
// TODO: Map PTI
|
||||
// $crate::arch::x86_64::pti::map();
|
||||
@@ -384,7 +354,6 @@ macro_rules! interrupt_stack {
|
||||
// $crate::arch::x86_64::pti::unmap();
|
||||
|
||||
// Restore all userspace registers
|
||||
pop_fs!(),
|
||||
pop_preserved!(),
|
||||
pop_scratch!(),
|
||||
|
||||
@@ -410,7 +379,6 @@ macro_rules! interrupt {
|
||||
swapgs_iff_ring3_fast!(),
|
||||
"push rax\n",
|
||||
push_scratch!(),
|
||||
push_fs!(),
|
||||
|
||||
// TODO: Map PTI
|
||||
// $crate::arch::x86_64::pti::map();
|
||||
@@ -422,7 +390,6 @@ macro_rules! interrupt {
|
||||
// $crate::arch::x86_64::pti::unmap();
|
||||
|
||||
// Restore all userspace registers
|
||||
pop_fs!(),
|
||||
pop_scratch!(),
|
||||
|
||||
swapgs_iff_ring3_fast!(),
|
||||
@@ -457,7 +424,6 @@ macro_rules! interrupt_error {
|
||||
// Push all userspace registers
|
||||
push_scratch!(),
|
||||
push_preserved!(),
|
||||
push_fs!(),
|
||||
|
||||
// Put code in, it's now in rax
|
||||
"push rax\n",
|
||||
@@ -476,7 +442,6 @@ macro_rules! interrupt_error {
|
||||
"add rsp, 8\n",
|
||||
|
||||
// Restore all userspace registers
|
||||
pop_fs!(),
|
||||
pop_preserved!(),
|
||||
pop_scratch!(),
|
||||
|
||||
|
||||
@@ -62,10 +62,10 @@ function!(syscall_instruction => {
|
||||
// Yes, this is magic. No, you don't need to understand
|
||||
"
|
||||
swapgs // Set gs segment to the KPCR
|
||||
mov gs:[0x70], rsp // Save userspace stack pointer
|
||||
mov rsp, gs:[4] // Load kernel stack pointer
|
||||
push QWORD PTR 5 * 8 + 3 // Push fake SS (resembling iret stack frame)
|
||||
push QWORD PTR gs:[0x70] // Push userspace rsp
|
||||
mov gs:[0x08], rsp // Save userspace stack pointer
|
||||
mov rsp, gs:[0x14] // Load kernel stack pointer
|
||||
push QWORD PTR 5 * 8 + 3 // Push fake userspace SS (resembling iret frame)
|
||||
push QWORD PTR gs:[0x08] // Push userspace rsp
|
||||
push r11 // Push rflags
|
||||
push QWORD PTR 6 * 8 + 3 // Push fake CS (resembling iret stack frame)
|
||||
push rcx // Push userspace return pointer
|
||||
@@ -75,7 +75,6 @@ function!(syscall_instruction => {
|
||||
"push rax\n",
|
||||
push_scratch!(),
|
||||
push_preserved!(),
|
||||
push_fs!(),
|
||||
|
||||
// TODO: Map PTI
|
||||
// $crate::arch::x86_64::pti::map();
|
||||
@@ -88,7 +87,6 @@ function!(syscall_instruction => {
|
||||
// $crate::arch::x86_64::pti::unmap();
|
||||
|
||||
// Pop context registers
|
||||
pop_fs!(),
|
||||
pop_preserved!(),
|
||||
pop_scratch!(),
|
||||
|
||||
@@ -115,8 +113,8 @@ function!(syscall_instruction => {
|
||||
pop rcx // Pop userspace return pointer
|
||||
add rsp, 8 // Pop fake userspace CS
|
||||
pop r11 // Pop rflags
|
||||
pop QWORD PTR gs:[0x70] // Pop userspace stack pointer
|
||||
mov rsp, gs:[0x70] // Restore userspace stack pointer
|
||||
pop QWORD PTR gs:[0x08] // Pop userspace stack pointer
|
||||
mov rsp, gs:[0x08] // Restore userspace stack pointer
|
||||
swapgs // Restore gs from the KPCR to user data
|
||||
sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS
|
||||
|
||||
|
||||
Reference in New Issue
Block a user