Use GS for TLS!

Previously, the kernel used the regular FS segment for Thread-Local
Storage. The problem, however, is that userspace code also uses FS for
TLS, meaning that the kernel would have to switch the FS segment between
user and kernel, _upon every syscall_. This is obviously suboptimal for
performance (especially with fast syscalls such as futex, nanosleep, or
yield).

I had to search LLVM for hours just to find out that the insertion of
the FS-based memory load was actually done in the linker, so I added a
flag for that.

I haven't done any proper benchmarking, but the boot process seems to
have gotten much faster!
This commit is contained in:
4lDO2
2021-02-15 14:18:01 +01:00
parent a283160c14
commit bdc925d275
4 changed files with 91 additions and 117 deletions

View File

@@ -1,6 +1,8 @@
//! Global descriptor table
use core::convert::TryInto;
use core::mem;
use x86::segmentation::load_cs;
use x86::bits64::task::TaskStateSegment;
use x86::Ring;
@@ -13,7 +15,7 @@ use crate::paging::PAGE_SIZE;
pub const GDT_NULL: usize = 0;
pub const GDT_KERNEL_CODE: usize = 1;
pub const GDT_KERNEL_DATA: usize = 2;
pub const GDT_KERNEL_TLS: usize = 3;
pub const GDT_KERNEL_KPCR: usize = 3;
pub const GDT_USER_CODE32_UNUSED: usize = 4;
pub const GDT_USER_DATA: usize = 5;
pub const GDT_USER_CODE: usize = 6;
@@ -39,11 +41,6 @@ pub const GDT_F_PAGE_SIZE: u8 = 1 << 7;
pub const GDT_F_PROTECTED_MODE: u8 = 1 << 6;
pub const GDT_F_LONG_MODE: u8 = 1 << 5;
static mut INIT_GDTR: DescriptorTablePointer<SegmentDescriptor> = DescriptorTablePointer {
limit: 0,
base: 0 as *const SegmentDescriptor
};
static mut INIT_GDT: [GdtEntry; 4] = [
// Null
GdtEntry::new(0, 0, 0, 0),
@@ -55,12 +52,6 @@ static mut INIT_GDT: [GdtEntry; 4] = [
GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_LONG_MODE)
];
#[thread_local]
pub static mut GDTR: DescriptorTablePointer<SegmentDescriptor> = DescriptorTablePointer {
limit: 0,
base: 0 as *const SegmentDescriptor
};
#[thread_local]
pub static mut GDT: [GdtEntry; 10] = [
// Null
@@ -85,28 +76,26 @@ pub static mut GDT: [GdtEntry; 10] = [
GdtEntry::new(0, 0, 0, 0),
];
#[repr(packed)]
pub struct TssWrapper {
base: TaskStateSegment,
_pad: u64,
_user_stack: u64,
}
impl core::ops::Deref for TssWrapper {
type Target = TaskStateSegment;
#[repr(C, align(16))]
pub struct ProcessorControlRegion {
// NOTE: If you plan to change any fields here, please make sure that you also modify the
// offsets in the syscall instruction handler accordingly!
fn deref(&self) -> &Self::Target {
&self.base
}
}
impl core::ops::DerefMut for TssWrapper {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.base
}
pub tcb_end: usize,
pub user_rsp_tmp: usize,
pub tss: TssWrapper,
}
// NOTE: Despite not using #[repr(packed)], we do know that while there may be some padding
// inserted before and after the TSS, the main TSS structure will remain intact.
#[repr(C, align(16))]
pub struct TssWrapper(pub TaskStateSegment);
#[thread_local]
pub static mut TSS: TssWrapper = TssWrapper {
base: TaskStateSegment {
pub static mut KPCR: ProcessorControlRegion = ProcessorControlRegion {
tcb_end: 0,
user_rsp_tmp: 0,
tss: TssWrapper(TaskStateSegment {
reserved: 0,
rsp: [0; 3],
reserved2: 0,
@@ -114,95 +103,117 @@ pub static mut TSS: TssWrapper = TssWrapper {
reserved3: 0,
reserved4: 0,
iomap_base: 0xFFFF
},
_pad: 0_u64,
// Accessed only from assembly, at `gs:[0x70]`
_user_stack: 0_u64,
}),
};
pub unsafe fn set_tcb(pid: usize) {
GDT[GDT_USER_TLS].set_offset((crate::USER_TCB_OFFSET + pid * PAGE_SIZE) as u32);
x86::segmentation::load_fs(SegmentSelector::new(GDT_USER_TLS as u16, Ring::Ring3));
}
#[cfg(feature = "pti")]
pub unsafe fn set_tss_stack(stack: usize) {
use super::pti::{PTI_CPU_STACK, PTI_CONTEXT_STACK};
TSS.rsp[0] = (PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len()) as u64;
KPCR.tss.0.rsp[0] = (PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len()) as u64;
PTI_CONTEXT_STACK = stack;
}
#[cfg(not(feature = "pti"))]
pub unsafe fn set_tss_stack(stack: usize) {
TSS.rsp[0] = stack as u64;
KPCR.tss.0.rsp[0] = stack as u64;
}
// Initialize GDT
pub unsafe fn init() {
// Setup the initial GDT with TLS, so we can setup the TLS GDT (a little confusing)
// This means that each CPU will have its own GDT, but we only need to define it once as a thread local
INIT_GDTR.limit = (INIT_GDT.len() * mem::size_of::<GdtEntry>() - 1) as u16;
INIT_GDTR.base = INIT_GDT.as_ptr() as *const SegmentDescriptor;
{
// Setup the initial GDT with TLS, so we can setup the TLS GDT (a little confusing)
// This means that each CPU will have its own GDT, but we only need to define it once as a thread local
// Load the initial GDT, before we have access to thread locals
dtables::lgdt(&INIT_GDTR);
let limit = (INIT_GDT.len() * mem::size_of::<GdtEntry>() - 1)
.try_into()
.expect("initial GDT way too large");
let base = INIT_GDT.as_ptr() as *const SegmentDescriptor;
let init_gdtr: DescriptorTablePointer<SegmentDescriptor> = DescriptorTablePointer {
limit,
base,
};
// Load the initial GDT, before we have access to thread locals
dtables::lgdt(&init_gdtr);
}
// Load the segment descriptors
load_cs(SegmentSelector::new(GDT_KERNEL_CODE as u16, Ring::Ring0));
segmentation::load_ds(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
segmentation::load_es(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
segmentation::load_fs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_KPCR as u16, Ring::Ring0));
segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
}
/// Initialize GDT with TLS
pub unsafe fn init_paging(tcb_offset: usize, stack_offset: usize) {
// Set the TLS segment to the offset of the Thread Control Block
INIT_GDT[GDT_KERNEL_TLS].set_offset(tcb_offset as u32);
// Set temporary TLS segment to the self-pointer of the Thread Control Block.
x86::msr::wrmsr(x86::msr::IA32_GS_BASE, tcb_offset as u64);
// Load the initial GDT, before we have access to thread locals
dtables::lgdt(&INIT_GDTR);
// Now that we have access to thread locals, begin by getting a pointer to the Processor
// Control Region.
let kpcr = &mut KPCR;
// Load the segment descriptors
segmentation::load_fs(SegmentSelector::new(GDT_KERNEL_TLS as u16, Ring::Ring0));
// Then, setup the AP's individual GDT
let limit = (GDT.len() * mem::size_of::<GdtEntry>() - 1)
.try_into()
.expect("main GDT way too large");
let base = GDT.as_ptr() as *const SegmentDescriptor;
// Now that we have access to thread locals, setup the AP's individual GDT
GDTR.limit = (GDT.len() * mem::size_of::<GdtEntry>() - 1) as u16;
GDTR.base = GDT.as_ptr() as *const SegmentDescriptor;
let gdtr: DescriptorTablePointer<SegmentDescriptor> = DescriptorTablePointer {
limit,
base,
};
// Set the TLS segment to the offset of the Thread Control Block
GDT[GDT_KERNEL_TLS].set_offset(tcb_offset as u32);
// Once we have fetched the real KPCR address, set the TLS segment to the TCB pointer there.
kpcr.tcb_end = (tcb_offset as *const usize).read();
{
// We can now access our TSS, via the KPCR, which is a thread local
let tss = &kpcr.tss.0 as *const _ as usize as u64;
let tss_lo = (tss & 0xFFFF_FFFF) as u32;
let tss_hi = (tss >> 32) as u32;
GDT[GDT_TSS].set_offset(tss_lo);
GDT[GDT_TSS].set_limit(mem::size_of::<TaskStateSegment>() as u32);
(&mut GDT[GDT_TSS_HIGH] as *mut GdtEntry).cast::<u32>().write(tss_hi);
}
// Set the stack pointer to use when coming back from userspace.
set_tss_stack(stack_offset);
// Load the new GDT, which is correctly located in thread local storage.
dtables::lgdt(&gdtr);
// Ensure that GS always points to the KPCR in kernel space.
x86::msr::wrmsr(x86::msr::IA32_GS_BASE, kpcr as *mut _ as usize as u64);
// Inside kernel space, GS should _always_ point to the TSS. When leaving userspace, `swapgs`
// is called again, making the userspace GS always point to user data.
x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0);
// Set the User TLS segment to the offset of the user TCB
set_tcb(0);
// We can now access our TSS, which is a thread local
GDT[GDT_TSS].set_offset(&TSS as *const _ as u32);
GDT[GDT_TSS].set_limit(mem::size_of::<TaskStateSegment>() as u32);
// Set the stack pointer when coming back from userspace
set_tss_stack(stack_offset);
// Load the new GDT, which is correctly located in thread local storage
dtables::lgdt(&GDTR);
// Reload the segment descriptors
load_cs(SegmentSelector::new(GDT_KERNEL_CODE as u16, Ring::Ring0));
segmentation::load_ds(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
segmentation::load_es(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
segmentation::load_fs(SegmentSelector::new(GDT_KERNEL_TLS as u16, Ring::Ring0));
segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0));
// NOTE: FS has already been updated while calling set_tcb.
// NOTE: We do not want to load GS again, since it has already been loaded into
// GDT_KERNEL_KPCR. Instead, we use the base MSR to allow for a 64-bit offset.
// Load the task register
task::load_tr(SegmentSelector::new(GDT_TSS as u16, Ring::Ring0));
// Ensure that GS always points to the TSS segment in kernel space.
x86::msr::wrmsr(x86::msr::IA32_GS_BASE, &TSS as *const _ as usize as u64);
// Inside kernel space, GS should _always_ point to the TSS. When leaving userspace, `swapgs`
// is called again, making the userspace GS always point to user data.
x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0);
}
#[derive(Copy, Clone, Debug)]

View File

@@ -202,7 +202,7 @@ pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) {
let address = base_address.data() + BACKUP_STACK_SIZE;
// Put them in the 1st entry of the IST.
crate::gdt::TSS.ist[usize::from(index - 1)] = address as u64;
crate::gdt::KPCR.tss.0.ist[usize::from(index - 1)] = address as u64;
index
};

View File

@@ -86,7 +86,6 @@ impl IretRegisters {
#[derive(Default)]
#[repr(packed)]
pub struct InterruptStack {
pub fs: usize,
pub preserved: PreservedRegisters,
pub scratch: ScratchRegisters,
pub iret: IretRegisters,
@@ -97,13 +96,10 @@ impl InterruptStack {
self.iret.dump();
self.scratch.dump();
self.preserved.dump();
println!("FS: {:>016X}", { self.fs });
}
/// Saves all registers to a struct used by the proc:
/// scheme to read/write registers.
pub fn save(&self, all: &mut IntRegisters) {
all.fs = self.fs;
all.r15 = self.preserved.r15;
all.r14 = self.preserved.r14;
all.r13 = self.preserved.r13;
@@ -284,31 +280,6 @@ macro_rules! pop_preserved {
pop rbx
" };
}
#[macro_export]
macro_rules! push_fs {
() => { "
// Push fs
push fs
// Load kernel tls
//
// NOTE: We can't load the value directly into `fs`. So we need to use a
// scratch register (as preserved registers aren't backed up by the
// interrupt! macro) to store it. We also can't use `rax` as the temporary
// value, as during errors that's already used for the error code.
mov rcx, 0x18
mov fs, cx
" };
}
#[macro_export]
macro_rules! pop_fs {
() => { "
// Pop fs
pop fs
" };
}
macro_rules! swapgs_iff_ring3_fast {
() => { "
// Check whether the last two bits RSP+8 (code segment) are equal to zero.
@@ -371,7 +342,6 @@ macro_rules! interrupt_stack {
"push rax\n",
push_scratch!(),
push_preserved!(),
push_fs!(),
// TODO: Map PTI
// $crate::arch::x86_64::pti::map();
@@ -384,7 +354,6 @@ macro_rules! interrupt_stack {
// $crate::arch::x86_64::pti::unmap();
// Restore all userspace registers
pop_fs!(),
pop_preserved!(),
pop_scratch!(),
@@ -410,7 +379,6 @@ macro_rules! interrupt {
swapgs_iff_ring3_fast!(),
"push rax\n",
push_scratch!(),
push_fs!(),
// TODO: Map PTI
// $crate::arch::x86_64::pti::map();
@@ -422,7 +390,6 @@ macro_rules! interrupt {
// $crate::arch::x86_64::pti::unmap();
// Restore all userspace registers
pop_fs!(),
pop_scratch!(),
swapgs_iff_ring3_fast!(),
@@ -457,7 +424,6 @@ macro_rules! interrupt_error {
// Push all userspace registers
push_scratch!(),
push_preserved!(),
push_fs!(),
// Put code in, it's now in rax
"push rax\n",
@@ -476,7 +442,6 @@ macro_rules! interrupt_error {
"add rsp, 8\n",
// Restore all userspace registers
pop_fs!(),
pop_preserved!(),
pop_scratch!(),

View File

@@ -62,10 +62,10 @@ function!(syscall_instruction => {
// Yes, this is magic. No, you don't need to understand
"
swapgs // Set gs segment to TSS
mov gs:[0x70], rsp // Save userspace stack pointer
mov rsp, gs:[4] // Load kernel stack pointer
push QWORD PTR 5 * 8 + 3 // Push fake SS (resembling iret stack frame)
push QWORD PTR gs:[0x70] // Push userspace rsp
mov gs:[0x08], rsp // Save userspace stack pointer
mov rsp, gs:[0x14] // Load kernel stack pointer
push QWORD PTR 5 * 8 + 3 // Push fake userspace SS (resembling iret frame)
push QWORD PTR gs:[0x08] // Push userspace rsp
push r11 // Push rflags
push QWORD PTR 6 * 8 + 3 // Push fake CS (resembling iret stack frame)
push rcx // Push userspace return pointer
@@ -75,7 +75,6 @@ function!(syscall_instruction => {
"push rax\n",
push_scratch!(),
push_preserved!(),
push_fs!(),
// TODO: Map PTI
// $crate::arch::x86_64::pti::map();
@@ -88,7 +87,6 @@ function!(syscall_instruction => {
// $crate::arch::x86_64::pti::unmap();
// Pop context registers
pop_fs!(),
pop_preserved!(),
pop_scratch!(),
@@ -115,8 +113,8 @@ function!(syscall_instruction => {
pop rcx // Pop userspace return pointer
add rsp, 8 // Pop fake userspace CS
pop r11 // Pop rflags
pop QWORD PTR gs:[0x70] // Pop userspace stack pointer
mov rsp, gs:[0x70] // Restore userspace stack pointer
pop QWORD PTR gs:[0x08] // Pop userspace stack pointer
mov rsp, gs:[0x08] // Restore userspace stack pointer
swapgs // Restore gs from TSS to user data
sysretq // Return into userspace; RCX=>RIP,R11=>RFLAGS