diff --git a/src/arch/mod.rs b/src/arch/mod.rs
index 813ee20..376961f 100644
--- a/src/arch/mod.rs
+++ b/src/arch/mod.rs
@@ -4,6 +4,12 @@ pub mod aarch64;
 #[cfg(target_arch = "aarch64")]
 pub use self::aarch64::*;
 
+#[cfg(target_arch = "x86")]
+#[macro_use]
+pub mod x86;
+#[cfg(target_arch = "x86")]
+pub use self::x86::*;
+
 #[cfg(target_arch = "x86_64")]
 #[macro_use]
 pub mod x86_64;
diff --git a/src/arch/x86/consts.rs b/src/arch/x86/consts.rs
new file mode 100644
index 0000000..e05b275
--- /dev/null
+++ b/src/arch/x86/consts.rs
@@ -0,0 +1,28 @@
+// Because the memory map is so important to not be aliased, it is defined here, in one place.
+// The lower 2 GiB of the 32-bit address space (below PHYS_OFFSET) are reserved for userspace.
+// The physmap starts at PHYS_OFFSET and must match RMM's PHYS_OFFSET.
+// The kernel image, heap, and percpu regions live above KERNEL_OFFSET.
+
+/// Offset of kernel
+pub const KERNEL_OFFSET: usize = 0xC000_0000;
+
+/// Offset to kernel heap
+pub const KERNEL_HEAP_OFFSET: usize = 0xE000_0000;
+/// Size of kernel heap
+pub const KERNEL_HEAP_SIZE: usize = rmm::MEGABYTE;
+
+/// Offset to kernel percpu variables
+pub const KERNEL_PERCPU_OFFSET: usize = 0xF000_0000;
+/// Size of kernel percpu variables
+pub const KERNEL_PERCPU_SHIFT: u8 = 16; // 2^16 = 64 KiB
+pub const KERNEL_PERCPU_SIZE: usize = 1_usize << KERNEL_PERCPU_SHIFT;
+
+/// Offset of physmap
+// This needs to match RMM's PHYS_OFFSET
+pub const PHYS_OFFSET: usize = 0x8000_0000;
+
+/// Offset to user image
+pub const USER_OFFSET: usize = 0;
+
+/// End offset of the user image, i.e. kernel start
+pub const USER_END_OFFSET: usize = 0x8000_0000;
diff --git a/src/arch/x86/cpuid.rs b/src/arch/x86/cpuid.rs
new file mode 100644
index 0000000..760a851
--- /dev/null
+++ b/src/arch/x86/cpuid.rs
@@ -0,0 +1,14 @@
+use raw_cpuid::{CpuId, CpuIdResult};
+
+pub fn cpuid() -> Option<CpuId> {
+    //TODO: ensure that CPUID exists!
+    // https://wiki.osdev.org/CPUID#Checking_CPUID_availability
+    Some(CpuId::with_cpuid_fn(|a, c| {
+        let result = unsafe { core::arch::x86::__cpuid_count(a, c) };
+        CpuIdResult {
+            eax: result.eax,
+            ebx: result.ebx,
+            ecx: result.ecx,
+            edx: result.edx,
+        }
+    }))
+}
diff --git a/src/arch/x86/debug.rs b/src/arch/x86/debug.rs
new file mode 100644
index 0000000..70f79db
--- /dev/null
+++ b/src/arch/x86/debug.rs
@@ -0,0 +1,106 @@
+use core::fmt;
+#[cfg(feature = "qemu_debug")]
+use spin::Mutex;
+use spin::MutexGuard;
+
+use crate::log::{LOG, Log};
+#[cfg(feature = "qemu_debug")]
+use syscall::io::Io;
+#[cfg(any(feature = "qemu_debug", feature = "serial_debug"))]
+use crate::syscall::io::Pio;
+#[cfg(feature = "lpss_debug")]
+use crate::syscall::io::Mmio;
+#[cfg(any(feature = "lpss_debug", feature = "serial_debug"))]
+use crate::devices::uart_16550::SerialPort;
+
+#[cfg(feature = "graphical_debug")]
+use super::graphical_debug::{DEBUG_DISPLAY, DebugDisplay};
+#[cfg(feature = "lpss_debug")]
+use super::device::serial::LPSS;
+#[cfg(feature = "serial_debug")]
+use super::device::serial::COM1;
+#[cfg(feature = "system76_ec_debug")]
+use super::device::system76_ec::{SYSTEM76_EC, System76Ec};
+
+#[cfg(feature = "qemu_debug")]
+pub static QEMU: Mutex<Pio<u8>> = Mutex::new(Pio::<u8>::new(0x402));
+
+pub struct Writer<'a> {
+    log: MutexGuard<'a, Option<Log>>,
+    #[cfg(feature = "graphical_debug")]
+    display: MutexGuard<'a, Option<DebugDisplay>>,
+    #[cfg(feature = "lpss_debug")]
+    lpss: MutexGuard<'a, Option<&'static mut SerialPort<Mmio<u32>>>>,
+    #[cfg(feature = "qemu_debug")]
+    qemu: MutexGuard<'a, Pio<u8>>,
+    #[cfg(feature = "serial_debug")]
+    serial: MutexGuard<'a, SerialPort<Pio<u8>>>,
+    #[cfg(feature = "system76_ec_debug")]
+    system76_ec: MutexGuard<'a, Option<System76Ec>>,
+}
+
+impl<'a> Writer<'a> {
+    pub fn new() -> Writer<'a> {
+        Writer {
+            log: LOG.lock(),
+            #[cfg(feature = "graphical_debug")]
+            display: DEBUG_DISPLAY.lock(),
+            #[cfg(feature = "lpss_debug")]
+            lpss: LPSS.lock(),
+            #[cfg(feature = "qemu_debug")]
+            qemu: QEMU.lock(),
+            #[cfg(feature = "serial_debug")]
+            serial: COM1.lock(),
+            #[cfg(feature = "system76_ec_debug")]
+            system76_ec: SYSTEM76_EC.lock(),
+        }
+    }
+
+    pub fn write(&mut self, buf: &[u8]) {
+        {
+            if let Some(ref mut log) = *self.log {
+                log.write(buf);
+            }
+        }
+
+        #[cfg(feature = "graphical_debug")]
+        {
+            if let Some(ref mut display) = *self.display {
+                let _ = display.write(buf);
+            }
+        }
+
+        #[cfg(feature = "lpss_debug")]
+        {
+            if let Some(ref mut lpss) = *self.lpss {
+                lpss.write(buf);
+            }
+        }
+
+        #[cfg(feature = "qemu_debug")]
+        {
+            for &b in buf {
+                self.qemu.write(b);
+            }
+        }
+
+        #[cfg(feature = "serial_debug")]
+        {
+            self.serial.write(buf);
+        }
+
+        #[cfg(feature = "system76_ec_debug")]
+        {
+            if let Some(ref mut system76_ec) = *self.system76_ec {
+                system76_ec.print_slice(buf);
+            }
+        }
+    }
+}
+
+impl<'a> fmt::Write for Writer<'a> {
+    fn write_str(&mut self, s: &str) -> Result<(), fmt::Error> {
+        self.write(s.as_bytes());
+        Ok(())
+    }
+}
diff --git a/src/arch/x86/device/cpu.rs b/src/arch/x86/device/cpu.rs
new file mode 100644
index 0000000..4e7c875
--- /dev/null
+++ b/src/arch/x86/device/cpu.rs
@@ -0,0 +1,131 @@
+use core::fmt::{Result, Write};
+
+use super::super::cpuid::cpuid;
+
+pub fn cpu_info<W: Write>(w: &mut W) -> Result {
+    let cpuid = match cpuid() {
+        Some(some) => some,
+        None => {
+            writeln!(w, "CPUID instruction not supported")?;
+            return Ok(());
+        }
+    };
+
+    if let Some(info) = cpuid.get_vendor_info() {
+        writeln!(w, "Vendor: {}", info.as_string())?;
+    }
+
+    if let Some(brand) = cpuid.get_processor_brand_string() {
+        writeln!(w,
"Model: {}", brand.as_str())?; + } + + if let Some(info) = cpuid.get_processor_frequency_info() { + writeln!(w, "CPU Base MHz: {}", info.processor_base_frequency())?; + writeln!(w, "CPU Max MHz: {}", info.processor_max_frequency())?; + writeln!(w, "Bus MHz: {}", info.bus_frequency())?; + } + + write!(w, "Features:")?; + + if let Some(info) = cpuid.get_feature_info() { + if info.has_fpu() { write!(w, " fpu")? }; + if info.has_vme() { write!(w, " vme")? }; + if info.has_de() { write!(w, " de")? }; + if info.has_pse() { write!(w, " pse")? }; + if info.has_tsc() { write!(w, " tsc")? }; + if info.has_msr() { write!(w, " msr")? }; + if info.has_pae() { write!(w, " pae")? }; + if info.has_mce() { write!(w, " mce")? }; + + if info.has_cmpxchg8b() { write!(w, " cx8")? }; + if info.has_apic() { write!(w, " apic")? }; + if info.has_sysenter_sysexit() { write!(w, " sep")? }; + if info.has_mtrr() { write!(w, " mtrr")? }; + if info.has_pge() { write!(w, " pge")? }; + if info.has_mca() { write!(w, " mca")? }; + if info.has_cmov() { write!(w, " cmov")? }; + if info.has_pat() { write!(w, " pat")? }; + + if info.has_pse36() { write!(w, " pse36")? }; + if info.has_psn() { write!(w, " psn")? }; + if info.has_clflush() { write!(w, " clflush")? }; + if info.has_ds() { write!(w, " ds")? }; + if info.has_acpi() { write!(w, " acpi")? }; + if info.has_mmx() { write!(w, " mmx")? }; + if info.has_fxsave_fxstor() { write!(w, " fxsr")? }; + if info.has_sse() { write!(w, " sse")? }; + + if info.has_sse2() { write!(w, " sse2")? }; + if info.has_ss() { write!(w, " ss")? }; + if info.has_htt() { write!(w, " ht")? }; + if info.has_tm() { write!(w, " tm")? }; + if info.has_pbe() { write!(w, " pbe")? }; + + if info.has_sse3() { write!(w, " sse3")? }; + if info.has_pclmulqdq() { write!(w, " pclmulqdq")? }; + if info.has_ds_area() { write!(w, " dtes64")? }; + if info.has_monitor_mwait() { write!(w, " monitor")? }; + if info.has_cpl() { write!(w, " ds_cpl")? }; + if info.has_vmx() { write!(w, " vmx")? }; + if info.has_smx() { write!(w, " smx")? }; + if info.has_eist() { write!(w, " est")? }; + + if info.has_tm2() { write!(w, " tm2")? }; + if info.has_ssse3() { write!(w, " ssse3")? }; + if info.has_cnxtid() { write!(w, " cnxtid")? }; + if info.has_fma() { write!(w, " fma")? }; + if info.has_cmpxchg16b() { write!(w, " cx16")? }; + if info.has_pdcm() { write!(w, " pdcm")? }; + if info.has_pcid() { write!(w, " pcid")? }; + if info.has_dca() { write!(w, " dca")? }; + + if info.has_sse41() { write!(w, " sse4_1")? }; + if info.has_sse42() { write!(w, " sse4_2")? }; + if info.has_x2apic() { write!(w, " x2apic")? }; + if info.has_movbe() { write!(w, " movbe")? }; + if info.has_popcnt() { write!(w, " popcnt")? }; + if info.has_tsc_deadline() { write!(w, " tsc_deadline_timer")? }; + if info.has_aesni() { write!(w, " aes")? }; + if info.has_xsave() { write!(w, " xsave")? }; + + if info.has_oxsave() { write!(w, " xsaveopt")? }; + if info.has_avx() { write!(w, " avx")? }; + if info.has_f16c() { write!(w, " f16c")? }; + if info.has_rdrand() { write!(w, " rdrand")? }; + } + + if let Some(info) = cpuid.get_extended_processor_and_feature_identifiers() { + if info.has_64bit_mode() { write!(w, " lm")? }; + if info.has_rdtscp() { write!(w, " rdtscp")? }; + if info.has_1gib_pages() { write!(w, " pdpe1gb")? }; + if info.has_execute_disable() { write!(w, " nx")? }; + if info.has_syscall_sysret() { write!(w, " syscall")? }; + if info.has_prefetchw() { write!(w, " prefetchw")? }; + if info.has_lzcnt() { write!(w, " lzcnt")? 
}; + if info.has_lahf_sahf() { write!(w, " lahf_lm")? }; + } + + if let Some(info) = cpuid.get_advanced_power_mgmt_info() { + if info.has_invariant_tsc() { write!(w, " constant_tsc")? }; + } + + if let Some(info) = cpuid.get_extended_feature_info() { + if info.has_fsgsbase() { write!(w, " fsgsbase")? }; + if info.has_tsc_adjust_msr() { write!(w, " tsc_adjust")? }; + if info.has_bmi1() { write!(w, " bmi1")? }; + if info.has_hle() { write!(w, " hle")? }; + if info.has_avx2() { write!(w, " avx2")? }; + if info.has_smep() { write!(w, " smep")? }; + if info.has_bmi2() { write!(w, " bmi2")? }; + if info.has_rep_movsb_stosb() { write!(w, " erms")? }; + if info.has_invpcid() { write!(w, " invpcid")? }; + if info.has_rtm() { write!(w, " rtm")? }; + //if info.has_qm() { write!(w, " qm")? }; + if info.has_fpu_cs_ds_deprecated() { write!(w, " fpu_seg")? }; + if info.has_mpx() { write!(w, " mpx")? }; + } + + writeln!(w)?; + + Ok(()) +} diff --git a/src/arch/x86/device/hpet.rs b/src/arch/x86/device/hpet.rs new file mode 100644 index 0000000..1222779 --- /dev/null +++ b/src/arch/x86/device/hpet.rs @@ -0,0 +1,101 @@ +use crate::acpi::hpet::Hpet; + +static LEG_RT_CNF: u64 = 2; +static ENABLE_CNF: u64 = 1; + +static TN_VAL_SET_CNF: u64 = 0x40; +static TN_TYPE_CNF: u64 = 0x08; +static TN_INT_ENB_CNF: u64 = 0x04; + +static CAPABILITY_OFFSET: usize = 0x00; +static GENERAL_CONFIG_OFFSET: usize = 0x10; +static GENERAL_INTERRUPT_OFFSET: usize = 0x20; +static MAIN_COUNTER_OFFSET: usize = 0xF0; +// static NUM_TIMER_CAP_MASK: u64 = 0x0f00; +static LEG_RT_CAP: u64 = 0x8000; +static T0_CONFIG_CAPABILITY_OFFSET: usize = 0x100; +static T0_COMPARATOR_OFFSET: usize = 0x108; + +static PER_INT_CAP: u64 = 0x10; + +pub unsafe fn init(hpet: &mut Hpet) -> bool { + println!("HPET Before Init"); + debug(hpet); + + // Disable HPET + { + let mut config_word = hpet.base_address.read_u64(GENERAL_CONFIG_OFFSET); + config_word &= !(LEG_RT_CNF | ENABLE_CNF); + hpet.base_address.write_u64(GENERAL_CONFIG_OFFSET, config_word); + } + + let capability = hpet.base_address.read_u64(CAPABILITY_OFFSET); + if capability & LEG_RT_CAP == 0 { + log::warn!("HPET missing capability LEG_RT_CAP"); + return false; + } + + let counter_clk_period_fs = capability >> 32; + let desired_fs_period: u64 = 2_250_286 * 1_000_000; + + let clk_periods_per_kernel_tick: u64 = desired_fs_period / counter_clk_period_fs; + + let t0_capabilities = hpet.base_address.read_u64(T0_CONFIG_CAPABILITY_OFFSET); + if t0_capabilities & PER_INT_CAP == 0 { + log::warn!("HPET T0 missing capability PER_INT_CAP"); + return false; + } + + let counter = hpet.base_address.read_u64(MAIN_COUNTER_OFFSET); + + let t0_config_word: u64 = TN_VAL_SET_CNF | TN_TYPE_CNF | TN_INT_ENB_CNF; + hpet.base_address.write_u64(T0_CONFIG_CAPABILITY_OFFSET, t0_config_word); + // set accumulator value + hpet.base_address.write_u64(T0_COMPARATOR_OFFSET, counter + clk_periods_per_kernel_tick); + // set interval + hpet.base_address.write_u64(T0_COMPARATOR_OFFSET, clk_periods_per_kernel_tick); + + // Enable interrupts from the HPET + { + let mut config_word: u64 = hpet.base_address.read_u64(GENERAL_CONFIG_OFFSET); + config_word |= LEG_RT_CNF | ENABLE_CNF; + hpet.base_address.write_u64(GENERAL_CONFIG_OFFSET, config_word); + } + + println!("HPET After Init"); + debug(hpet); + + true +} + +pub unsafe fn debug(hpet: &mut Hpet) { + println!("HPET @ {:#x}", hpet.base_address.address); + + let capability = hpet.base_address.read_u64(CAPABILITY_OFFSET); + { + println!(" caps: {:#x}", capability); + println!(" clock period: 
{}", (capability >> 32) as u32); + println!(" ID: {:#x}", (capability >> 16) as u16); + println!(" LEG_RT_CAP: {}", capability & (1 << 15) == (1 << 15)); + println!(" COUNT_SIZE_CAP: {}", capability & (1 << 13) == (1 << 13)); + println!(" timers: {}", (capability >> 8) as u8 & 0x1F); + println!(" revision: {}", capability as u8); + } + + let config_word = hpet.base_address.read_u64(GENERAL_CONFIG_OFFSET); + println!(" config: {:#x}", config_word); + + let interrupt_status = hpet.base_address.read_u64(GENERAL_INTERRUPT_OFFSET); + println!(" interrupt status: {:#x}", interrupt_status); + + let counter = hpet.base_address.read_u64(MAIN_COUNTER_OFFSET); + println!(" counter: {:#x}", counter); + + let t0_capabilities = hpet.base_address.read_u64(T0_CONFIG_CAPABILITY_OFFSET); + println!(" T0 caps: {:#x}", t0_capabilities); + println!(" interrupt routing: {:#x}", (t0_capabilities >> 32) as u32); + println!(" flags: {:#x}", t0_capabilities as u16); + + let t0_comparator = hpet.base_address.read_u64(T0_COMPARATOR_OFFSET); + println!(" T0 comparator: {:#x}", t0_comparator); +} diff --git a/src/arch/x86/device/ioapic.rs b/src/arch/x86/device/ioapic.rs new file mode 100644 index 0000000..9fdf416 --- /dev/null +++ b/src/arch/x86/device/ioapic.rs @@ -0,0 +1,406 @@ +use core::{fmt, ptr}; + +use alloc::vec::Vec; +use spin::Mutex; + +#[cfg(feature = "acpi")] +use crate::acpi::madt::{self, Madt, MadtEntry, MadtIoApic, MadtIntSrcOverride}; + +use crate::arch::interrupt::irq; +use crate::memory::Frame; +use crate::paging::{KernelMapper, Page, PageFlags, PhysicalAddress, RmmA, RmmArch}; +use crate::paging::entry::EntryFlags; + +use super::super::cpuid::cpuid; +use super::pic; + +pub struct IoApicRegs { + pointer: *const u32, +} +impl IoApicRegs { + fn ioregsel(&self) -> *const u32 { + self.pointer + } + fn iowin(&self) -> *const u32 { + // offset 0x10 + unsafe { self.pointer.offset(4) } + } + fn write_ioregsel(&mut self, value: u32) { + unsafe { ptr::write_volatile::(self.ioregsel() as *mut u32, value) } + } + fn read_iowin(&self) -> u32 { + unsafe { ptr::read_volatile::(self.iowin()) } + } + fn write_iowin(&mut self, value: u32) { + unsafe { ptr::write_volatile::(self.iowin() as *mut u32, value) } + } + fn read_reg(&mut self, reg: u8) -> u32 { + self.write_ioregsel(reg.into()); + self.read_iowin() + } + fn write_reg(&mut self, reg: u8, value: u32) { + self.write_ioregsel(reg.into()); + self.write_iowin(value); + } + pub fn read_ioapicid(&mut self) -> u32 { + self.read_reg(0x00) + } + pub fn write_ioapicid(&mut self, value: u32) { + self.write_reg(0x00, value); + } + pub fn read_ioapicver(&mut self) -> u32 { + self.read_reg(0x01) + } + pub fn read_ioapicarb(&mut self) -> u32 { + self.read_reg(0x02) + } + pub fn read_ioredtbl(&mut self, idx: u8) -> u64 { + assert!(idx < 24); + let lo = self.read_reg(0x10 + idx * 2); + let hi = self.read_reg(0x10 + idx * 2 + 1); + + u64::from(lo) | (u64::from(hi) << 32) + } + pub fn write_ioredtbl(&mut self, idx: u8, value: u64) { + assert!(idx < 24); + + let lo = value as u32; + let hi = (value >> 32) as u32; + + self.write_reg(0x10 + idx * 2, lo); + self.write_reg(0x10 + idx * 2 + 1, hi); + } + + pub fn max_redirection_table_entries(&mut self) -> u8 { + let ver = self.read_ioapicver(); + ((ver & 0x00FF_0000) >> 16) as u8 + } + pub fn id(&mut self) -> u8 { + let id_reg = self.read_ioapicid(); + ((id_reg & 0x0F00_0000) >> 24) as u8 + } +} +pub struct IoApic { + regs: Mutex, + gsi_start: u32, + count: u8, +} +impl IoApic { + pub fn new(regs_base: *const u32, gsi_start: u32) -> 
Self { + let mut regs = IoApicRegs { pointer: regs_base }; + let count = regs.max_redirection_table_entries(); + + Self { + regs: Mutex::new(regs), + gsi_start, + count, + } + } + /// Map an interrupt vector to a physical local APIC ID of a processor (thus physical mode). + pub fn map(&self, idx: u8, info: MapInfo) { + self.regs.lock().write_ioredtbl(idx, info.as_raw()) + } + pub fn set_mask(&self, gsi: u32, mask: bool) { + let idx = (gsi - self.gsi_start) as u8; + let mut guard = self.regs.lock(); + + let mut reg = guard.read_ioredtbl(idx); + reg &= !(1 << 16); + reg |= u64::from(mask) << 16; + guard.write_ioredtbl(idx, reg); + } +} +#[repr(u8)] +#[derive(Clone, Copy, Debug)] +pub enum ApicTriggerMode { + Edge = 0, + Level = 1, +} +#[repr(u8)] +#[derive(Clone, Copy, Debug)] +pub enum ApicPolarity { + ActiveHigh = 0, + ActiveLow = 1, +} +#[repr(u8)] +#[derive(Clone, Copy, Debug)] +pub enum DestinationMode { + Physical = 0, + Logical = 1, +} +#[repr(u8)] +#[derive(Clone, Copy, Debug)] +pub enum DeliveryMode { + Fixed = 0b000, + LowestPriority = 0b001, + Smi = 0b010, + Nmi = 0b100, + Init = 0b101, + ExtInt = 0b111, +} + +#[derive(Clone, Copy, Debug)] +pub struct MapInfo { + pub dest: u8, + pub mask: bool, + pub trigger_mode: ApicTriggerMode, + pub polarity: ApicPolarity, + pub dest_mode: DestinationMode, + pub delivery_mode: DeliveryMode, + pub vector: u8, +} + +impl MapInfo { + pub fn as_raw(&self) -> u64 { + assert!(self.vector >= 0x20); + assert!(self.vector <= 0xFE); + + // TODO: Check for reserved fields. + + (u64::from(self.dest) << 56) + | (u64::from(self.mask) << 16) + | ((self.trigger_mode as u64) << 15) + | ((self.polarity as u64) << 13) + | ((self.dest_mode as u64) << 11) + | ((self.delivery_mode as u64) << 8) + | u64::from(self.vector) + } +} + +impl fmt::Debug for IoApic { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + struct RedirTable<'a>(&'a Mutex); + + impl<'a> fmt::Debug for RedirTable<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut guard = self.0.lock(); + + let count = guard.max_redirection_table_entries(); + f.debug_list().entries((0..count).map(|i| guard.read_ioredtbl(i))).finish() + } + } + + f.debug_struct("IoApic") + .field("redir_table", &RedirTable(&self.regs)) + .field("gsi_start", &self.gsi_start) + .field("count", &self.count) + .finish() + } +} + +#[derive(Clone, Copy, Debug)] +pub enum TriggerMode { + ConformsToSpecs, + Edge, + Level, +} + +#[derive(Clone, Copy, Debug)] +pub enum Polarity { + ConformsToSpecs, + ActiveHigh, + ActiveLow, +} + +#[derive(Clone, Copy, Debug)] +pub struct Override { + bus_irq: u8, + gsi: u32, + + trigger_mode: TriggerMode, + polarity: Polarity, +} + +// static mut because only the AP initializes the I/O Apic, and when that is done, it's solely +// accessed immutably. 
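+//
+// A minimal read-only usage sketch (hypothetical; it assumes init() below has already run,
+// after which the tables are frozen):
+//
+//     // find the controller responsible for a GSI, as find_ioapic() does at the bottom
+//     let apic = ioapics().iter()
+//         .find(|a| gsi >= a.gsi_start && gsi < a.gsi_start + u32::from(a.count));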
+static mut IOAPICS: Option> = None; + +// static mut for the same reason as above +static mut SRC_OVERRIDES: Option> = None; + +pub fn ioapics() -> &'static [IoApic] { + unsafe { + IOAPICS.as_ref().map_or(&[], |vector| &vector[..]) + } +} +pub fn src_overrides() -> &'static [Override] { + unsafe { + SRC_OVERRIDES.as_ref().map_or(&[], |vector| &vector[..]) + } +} + +#[cfg(feature = "acpi")] +pub unsafe fn handle_ioapic(mapper: &mut KernelMapper, madt_ioapic: &'static MadtIoApic) { + // map the I/O APIC registers + + let frame = Frame::containing_address(PhysicalAddress::new(madt_ioapic.address as usize)); + let page = Page::containing_address(RmmA::phys_to_virt(frame.start_address())); + + assert!(mapper.translate(page.start_address()).is_none()); + + mapper + .get_mut() + .expect("expected KernelMapper not to be locked re-entrant while mapping I/O APIC memory") + .map_phys(page.start_address(), frame.start_address(), PageFlags::new().write(true).custom_flag(EntryFlags::NO_CACHE.bits(), true)) + .expect("failed to map I/O APIC") + .flush(); + + let ioapic_registers = page.start_address().data() as *const u32; + let ioapic = IoApic::new(ioapic_registers, madt_ioapic.gsi_base); + + assert_eq!(ioapic.regs.lock().id(), madt_ioapic.id, "mismatched ACPI MADT I/O APIC ID, and the ID reported by the I/O APIC"); + + IOAPICS.get_or_insert_with(Vec::new).push(ioapic); +} +#[cfg(feature = "acpi")] +pub unsafe fn handle_src_override(src_override: &'static MadtIntSrcOverride) { + let flags = src_override.flags; + + let polarity_raw = (flags & 0x0003) as u8; + let trigger_mode_raw = ((flags & 0x000C) >> 2) as u8; + + let polarity = match polarity_raw { + 0b00 => Polarity::ConformsToSpecs, + 0b01 => Polarity::ActiveHigh, + 0b10 => return, // reserved + 0b11 => Polarity::ActiveLow, + + _ => unreachable!(), + }; + + let trigger_mode = match trigger_mode_raw { + 0b00 => TriggerMode::ConformsToSpecs, + 0b01 => TriggerMode::Edge, + 0b10 => return, // reserved + 0b11 => TriggerMode::Level, + _ => unreachable!(), + }; + + let over = Override { + bus_irq: src_override.irq_source, + gsi: src_override.gsi_base, + polarity, + trigger_mode, + }; + SRC_OVERRIDES.get_or_insert_with(Vec::new).push(over); +} + +pub unsafe fn init(active_table: &mut KernelMapper) { + let bsp_apic_id = cpuid().unwrap().get_feature_info().unwrap().initial_local_apic_id(); // TODO: remove unwraps + + // search the madt for all IOAPICs. + #[cfg(feature = "acpi")] + { + let madt: &'static Madt = match madt::MADT.as_ref() { + Some(m) => m, + // TODO: Parse MP tables too. + None => return, + }; + if madt.flags & madt::FLAG_PCAT != 0 { + pic::disable(); + } + + // find all I/O APICs (usually one). + + for entry in madt.iter() { + match entry { + MadtEntry::IoApic(ioapic) => handle_ioapic(active_table, ioapic), + MadtEntry::IntSrcOverride(src_override) => handle_src_override(src_override), + _ => (), + } + } + } + println!("I/O APICs: {:?}, overrides: {:?}", ioapics(), src_overrides()); + + // map the legacy PC-compatible IRQs (0-15) to 32-47, just like we did with 8259 PIC (if it + // wouldn't have been disabled due to this I/O APIC) + for legacy_irq in 0..=15 { + let (gsi, trigger_mode, polarity) = match get_override(legacy_irq) { + Some(over) => (over.gsi, over.trigger_mode, over.polarity), + None => { + if src_overrides().iter().any(|over| over.gsi == u32::from(legacy_irq) && over.bus_irq != legacy_irq) && !src_overrides().iter().any(|over| over.bus_irq == legacy_irq) { + // there's an IRQ conflict, making this legacy IRQ inaccessible. 
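+                    // (For example, with the common MADT override routing bus IRQ 0, the PIT,
+                    // to GSI 2: legacy IRQ 2 then has no override of its own, while its
+                    // identity-mapped GSI 2 is already claimed by IRQ 0, so IRQ 2 is skipped.)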
+ continue; + } + (legacy_irq.into(), TriggerMode::ConformsToSpecs, Polarity::ConformsToSpecs) + } + }; + let apic = match find_ioapic(gsi) { + Some(ioapic) => ioapic, + None => { + println!("Unable to find a suitable APIC for legacy IRQ {} (GSI {}). It will not be mapped.", legacy_irq, gsi); + continue; + } + }; + let redir_tbl_index = (gsi - apic.gsi_start) as u8; + + let map_info = MapInfo { + // only send to the BSP + dest: bsp_apic_id, + dest_mode: DestinationMode::Physical, + delivery_mode: DeliveryMode::Fixed, + mask: false, + polarity: match polarity { + Polarity::ActiveHigh => ApicPolarity::ActiveHigh, + Polarity::ActiveLow => ApicPolarity::ActiveLow, + Polarity::ConformsToSpecs => ApicPolarity::ActiveHigh, + }, + trigger_mode: match trigger_mode { + TriggerMode::Edge => ApicTriggerMode::Edge, + TriggerMode::Level => ApicTriggerMode::Level, + TriggerMode::ConformsToSpecs => ApicTriggerMode::Edge, + }, + vector: 32 + legacy_irq, + }; + apic.map(redir_tbl_index, map_info); + } + println!("I/O APICs: {:?}, overrides: {:?}", ioapics(), src_overrides()); + irq::set_irq_method(irq::IrqMethod::Apic); + + // tell the firmware that we're using APIC rather than the default 8259 PIC. + + // FIXME: With ACPI moved to userspace, we should instead allow userspace to check whether the + // IOAPIC has been initialized, and then subsequently let some ACPI driver call the AML from + // userspace. + + /*#[cfg(feature = "acpi")] + { + let method = { + let namespace_guard = crate::acpi::ACPI_TABLE.namespace.read(); + if let Some(value) = namespace_guard.as_ref().unwrap().get("\\_PIC") { + value.get_as_method().ok() + } else { + None + } + }; + if let Some(m) = method { + m.execute("\\_PIC".into(), vec!(crate::acpi::aml::AmlValue::Integer(1))); + } + }*/ +} +fn get_override(irq: u8) -> Option<&'static Override> { + src_overrides().iter().find(|over| over.bus_irq == irq) +} +fn resolve(irq: u8) -> u32 { + get_override(irq).map_or(u32::from(irq), |over| over.gsi) +} +fn find_ioapic(gsi: u32) -> Option<&'static IoApic> { + ioapics().iter().find(|apic| gsi >= apic.gsi_start && gsi < apic.gsi_start + u32::from(apic.count)) +} + +pub unsafe fn mask(irq: u8) { + let gsi = resolve(irq); + let apic = match find_ioapic(gsi) { + Some(a) => a, + None => return, + }; + apic.set_mask(gsi, true); +} +pub unsafe fn unmask(irq: u8) { + let gsi = resolve(irq); + let apic = match find_ioapic(gsi) { + Some(a) => a, + None => return, + }; + apic.set_mask(gsi, false); +} diff --git a/src/arch/x86/device/local_apic.rs b/src/arch/x86/device/local_apic.rs new file mode 100644 index 0000000..58745bc --- /dev/null +++ b/src/arch/x86/device/local_apic.rs @@ -0,0 +1,246 @@ +use core::sync::atomic::{self, AtomicU32}; +use core::intrinsics::{volatile_load, volatile_store}; +use x86::msr::*; + +use crate::paging::{KernelMapper, PhysicalAddress, PageFlags, RmmA, RmmArch}; + +use super::super::cpuid::cpuid; + +pub static mut LOCAL_APIC: LocalApic = LocalApic { + address: 0, + x2: false +}; + +pub unsafe fn init(active_table: &mut KernelMapper) { + LOCAL_APIC.init(active_table); +} + +pub unsafe fn init_ap() { + LOCAL_APIC.init_ap(); +} + +/// Local APIC +pub struct LocalApic { + pub address: usize, + pub x2: bool +} + +#[derive(Debug)] +struct NoFreqInfo; + +static BSP_APIC_ID: AtomicU32 = AtomicU32::new(u32::max_value()); + +#[no_mangle] +pub fn bsp_apic_id() -> Option { + let value = BSP_APIC_ID.load(atomic::Ordering::SeqCst); + if value < u32::max_value() { + Some(value as u32) + } else { + None + } +} + +impl LocalApic { + unsafe 
fn init(&mut self, mapper: &mut KernelMapper) { + let mapper = mapper.get_mut().expect("expected KernelMapper not to be locked re-entrant while initializing LAPIC"); + + let physaddr = PhysicalAddress::new(rdmsr(IA32_APIC_BASE) as usize & 0xFFFF_0000); + let virtaddr = RmmA::phys_to_virt(physaddr); + + self.address = virtaddr.data(); + self.x2 = cpuid().map_or(false, |cpuid| { + cpuid.get_feature_info().map_or(false, |feature_info| { + feature_info.has_x2apic() + }) + }); + + if ! self.x2 { + log::info!("Detected xAPIC at {:#x}", physaddr.data()); + if let Some((_entry, _, flush)) = mapper.unmap_phys(virtaddr, true) { + // Unmap xAPIC page if already mapped + flush.flush(); + } + mapper + .map_phys(virtaddr, physaddr, PageFlags::new().write(true)) + .expect("failed to map local APIC memory") + .flush(); + } else { + log::info!("Detected x2APIC"); + } + + self.init_ap(); + BSP_APIC_ID.store(self.id(), atomic::Ordering::SeqCst); + } + + unsafe fn init_ap(&mut self) { + if self.x2 { + wrmsr(IA32_APIC_BASE, rdmsr(IA32_APIC_BASE) | 1 << 10); + wrmsr(IA32_X2APIC_SIVR, 0x100); + } else { + self.write(0xF0, 0x100); + } + self.setup_error_int(); + //self.setup_timer(); + } + + unsafe fn read(&self, reg: u32) -> u32 { + volatile_load((self.address + reg as usize) as *const u32) + } + + unsafe fn write(&mut self, reg: u32, value: u32) { + volatile_store((self.address + reg as usize) as *mut u32, value); + } + + pub fn id(&self) -> u32 { + if self.x2 { + unsafe { rdmsr(IA32_X2APIC_APICID) as u32 } + } else { + unsafe { self.read(0x20) } + } + } + + pub fn version(&self) -> u32 { + if self.x2 { + unsafe { rdmsr(IA32_X2APIC_VERSION) as u32 } + } else { + unsafe { self.read(0x30) } + } + } + + pub fn icr(&self) -> u64 { + if self.x2 { + unsafe { rdmsr(IA32_X2APIC_ICR) } + } else { + unsafe { + (self.read(0x310) as u64) << 32 | self.read(0x300) as u64 + } + } + } + + pub fn set_icr(&mut self, value: u64) { + if self.x2 { + unsafe { wrmsr(IA32_X2APIC_ICR, value); } + } else { + unsafe { + const PENDING: u32 = 1 << 12; + while self.read(0x300) & PENDING == PENDING { + core::hint::spin_loop(); + } + self.write(0x310, (value >> 32) as u32); + self.write(0x300, value as u32); + while self.read(0x300) & PENDING == PENDING { + core::hint::spin_loop(); + } + } + } + } + + pub fn ipi(&mut self, apic_id: usize) { + let mut icr = 0x4040; + if self.x2 { + icr |= (apic_id as u64) << 32; + } else { + icr |= (apic_id as u64) << 56; + } + self.set_icr(icr); + } + // Not used just yet, but allows triggering an NMI to another processor. + pub fn ipi_nmi(&mut self, apic_id: u32) { + let shift = if self.x2 { 32 } else { 56 }; + self.set_icr((u64::from(apic_id) << shift) | (1 << 14) | (0b100 << 8)); + } + + pub unsafe fn eoi(&mut self) { + if self.x2 { + wrmsr(IA32_X2APIC_EOI, 0); + } else { + self.write(0xB0, 0); + } + } + /// Reads the Error Status Register. + pub unsafe fn esr(&mut self) -> u32 { + if self.x2 { + // update the ESR to the current state of the local apic. 
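+            // (The protocol is write-then-read: the write, which must be zero in x2APIC mode,
+            // latches the accumulated error bits so that the read below returns them.)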
+ wrmsr(IA32_X2APIC_ESR, 0); + // read the updated value + rdmsr(IA32_X2APIC_ESR) as u32 + } else { + self.write(0x280, 0); + self.read(0x280) + } + } + pub unsafe fn lvt_timer(&mut self) -> u32 { + if self.x2 { + rdmsr(IA32_X2APIC_LVT_TIMER) as u32 + } else { + self.read(0x320) + } + } + pub unsafe fn set_lvt_timer(&mut self, value: u32) { + if self.x2 { + wrmsr(IA32_X2APIC_LVT_TIMER, u64::from(value)); + } else { + self.write(0x320, value); + } + } + pub unsafe fn init_count(&mut self) -> u32 { + if self.x2 { + rdmsr(IA32_X2APIC_INIT_COUNT) as u32 + } else { + self.read(0x380) + } + } + pub unsafe fn set_init_count(&mut self, initial_count: u32) { + if self.x2 { + wrmsr(IA32_X2APIC_INIT_COUNT, u64::from(initial_count)); + } else { + self.write(0x380, initial_count); + } + } + pub unsafe fn cur_count(&mut self) -> u32 { + if self.x2 { + rdmsr(IA32_X2APIC_CUR_COUNT) as u32 + } else { + self.read(0x390) + } + } + pub unsafe fn div_conf(&mut self) -> u32 { + if self.x2 { + rdmsr(IA32_X2APIC_DIV_CONF) as u32 + } else { + self.read(0x3E0) + } + } + pub unsafe fn set_div_conf(&mut self, div_conf: u32) { + if self.x2 { + wrmsr(IA32_X2APIC_DIV_CONF, u64::from(div_conf)); + } else { + self.write(0x3E0, div_conf); + } + } + pub unsafe fn lvt_error(&mut self) -> u32 { + if self.x2 { + rdmsr(IA32_X2APIC_LVT_ERROR) as u32 + } else { + self.read(0x370) + } + } + pub unsafe fn set_lvt_error(&mut self, lvt_error: u32) { + if self.x2 { + wrmsr(IA32_X2APIC_LVT_ERROR, u64::from(lvt_error)); + } else { + self.write(0x370, lvt_error); + } + } + unsafe fn setup_error_int(&mut self) { + let vector = 49u32; + self.set_lvt_error(vector); + } +} + +#[repr(u8)] +pub enum LvtTimerMode { + OneShot = 0b00, + Periodic = 0b01, + TscDeadline = 0b10, +} diff --git a/src/arch/x86/device/mod.rs b/src/arch/x86/device/mod.rs new file mode 100644 index 0000000..eafa245 --- /dev/null +++ b/src/arch/x86/device/mod.rs @@ -0,0 +1,53 @@ +pub mod cpu; +pub mod ioapic; +pub mod local_apic; +pub mod pic; +pub mod pit; +pub mod rtc; +pub mod serial; +#[cfg(feature = "acpi")] +pub mod hpet; +#[cfg(feature = "system76_ec_debug")] +pub mod system76_ec; + +use crate::paging::KernelMapper; + +pub unsafe fn init() { + pic::init(); + local_apic::init(&mut KernelMapper::lock()); +} +pub unsafe fn init_after_acpi() { + // this will disable the IOAPIC if needed. 
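+    // (Strictly speaking, ioapic::init() is what masks the legacy 8259 PIC, via
+    // pic::disable(), when the MADT reports PC-AT compatibility; the call below is
+    // commented out for now.)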
+    //ioapic::init(mapper);
+}
+
+#[cfg(feature = "acpi")]
+unsafe fn init_hpet() -> bool {
+    use crate::acpi::ACPI_TABLE;
+    if let Some(ref mut hpet) = *ACPI_TABLE.hpet.write() {
+        hpet::init(hpet)
+    } else {
+        false
+    }
+}
+
+#[cfg(not(feature = "acpi"))]
+unsafe fn init_hpet() -> bool {
+    false
+}
+
+pub unsafe fn init_noncore() {
+    if init_hpet() {
+        log::info!("HPET used as system timer");
+    } else {
+        pit::init();
+        log::info!("PIT used as system timer");
+    }
+
+    rtc::init();
+    serial::init();
+}
+
+pub unsafe fn init_ap() {
+    local_apic::init_ap();
+}
diff --git a/src/arch/x86/device/pic.rs b/src/arch/x86/device/pic.rs
new file mode 100644
index 0000000..12e68a6
--- /dev/null
+++ b/src/arch/x86/device/pic.rs
@@ -0,0 +1,78 @@
+use crate::syscall::io::{Io, Pio};
+use crate::arch::interrupt::irq;
+
+pub static mut MASTER: Pic = Pic::new(0x20);
+pub static mut SLAVE: Pic = Pic::new(0xA0);
+
+pub unsafe fn init() {
+    // Start initialization
+    MASTER.cmd.write(0x11);
+    SLAVE.cmd.write(0x11);
+
+    // Set offsets
+    MASTER.data.write(0x20);
+    SLAVE.data.write(0x28);
+
+    // Set up cascade
+    MASTER.data.write(4);
+    SLAVE.data.write(2);
+
+    // Set up interrupt mode (1 is 8086/88 mode, 2 is auto EOI)
+    MASTER.data.write(1);
+    SLAVE.data.write(1);
+
+    // Unmask interrupts
+    MASTER.data.write(0);
+    SLAVE.data.write(0);
+
+    // Ack remaining interrupts
+    MASTER.ack();
+    SLAVE.ack();
+
+    // probably already set to PIC, but double-check
+    irq::set_irq_method(irq::IrqMethod::Pic);
+}
+
+pub unsafe fn disable() {
+    MASTER.data.write(0xFF);
+    SLAVE.data.write(0xFF);
+}
+
+pub struct Pic {
+    cmd: Pio<u8>,
+    data: Pio<u8>,
+}
+
+impl Pic {
+    pub const fn new(port: u16) -> Pic {
+        Pic {
+            cmd: Pio::new(port),
+            data: Pio::new(port + 1),
+        }
+    }
+
+    pub fn ack(&mut self) {
+        self.cmd.write(0x20);
+    }
+
+    pub fn mask_set(&mut self, irq: u8) {
+        assert!(irq < 8);
+
+        let mut mask = self.data.read();
+        mask |= 1 << irq;
+        self.data.write(mask);
+    }
+
+    pub fn mask_clear(&mut self, irq: u8) {
+        assert!(irq < 8);
+
+        let mut mask = self.data.read();
+        mask &= !(1 << irq);
+        self.data.write(mask);
+    }
+
+    /// A bitmap of all currently servicing IRQs. Spurious IRQs will not have this bit set.
+    pub fn isr(&mut self) -> u8 {
+        self.cmd.write(0x0A);
+        self.cmd.read() // note that cmd is read, rather than data
+    }
+}
diff --git a/src/arch/x86/device/pit.rs b/src/arch/x86/device/pit.rs
new file mode 100644
index 0000000..a39cd00
--- /dev/null
+++ b/src/arch/x86/device/pit.rs
@@ -0,0 +1,17 @@
+use crate::syscall::io::{Io, Pio};
+
+pub static mut CHAN0: Pio<u8> = Pio::new(0x40);
+pub static mut CHAN1: Pio<u8> = Pio::new(0x41);
+pub static mut CHAN2: Pio<u8> = Pio::new(0x42);
+pub static mut COMMAND: Pio<u8> = Pio::new(0x43);
+
+static SELECT_CHAN0: u8 = 0;
+static LOHI: u8 = 0x30;
+
+static CHAN0_DIVISOR: u16 = 2685;
+
+pub unsafe fn init() {
+    COMMAND.write(SELECT_CHAN0 | LOHI | 5);
+    CHAN0.write((CHAN0_DIVISOR & 0xFF) as u8);
+    CHAN0.write((CHAN0_DIVISOR >> 8) as u8);
+}
diff --git a/src/arch/x86/device/rtc.rs b/src/arch/x86/device/rtc.rs
new file mode 100644
index 0000000..6c20f67
--- /dev/null
+++ b/src/arch/x86/device/rtc.rs
@@ -0,0 +1,148 @@
+use crate::syscall::io::{Io, Pio};
+use crate::time;
+
+pub fn init() {
+    let mut rtc = Rtc::new();
+    time::START.lock().0 = rtc.time();
+}
+
+fn cvt_bcd(value: usize) -> usize {
+    (value & 0xF) + ((value / 16) * 10)
+}
+
+/// RTC
+pub struct Rtc {
+    addr: Pio<u8>,
+    data: Pio<u8>,
+    nmi: bool,
+}
+
+impl Rtc {
+    /// Create new empty RTC
+    pub fn new() -> Self {
+        Rtc {
+            addr: Pio::<u8>::new(0x70),
+            data: Pio::<u8>::new(0x71),
+            nmi: false,
+        }
+    }
+
+    /// Read
+    unsafe fn read(&mut self, reg: u8) -> u8 {
+        if self.nmi {
+            self.addr.write(reg & 0x7F);
+        } else {
+            self.addr.write(reg | 0x80);
+        }
+        self.data.read()
+    }
+
+    /// Write
+    #[allow(dead_code)]
+    unsafe fn write(&mut self, reg: u8, value: u8) {
+        if self.nmi {
+            self.addr.write(reg & 0x7F);
+        } else {
+            self.addr.write(reg | 0x80);
+        }
+        self.data.write(value);
+    }
+
+    /// Wait for an update, can take one second if full is specified!
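+    /// (This polls the update-in-progress flag, bit 7 of status register A: with `full` it
+    /// first waits for an update cycle to begin, then in either case waits for it to end.)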
+ unsafe fn wait(&mut self, full: bool) { + if full { + while self.read(0xA) & 0x80 != 0x80 {} + } + while self.read(0xA) & 0x80 == 0x80 {} + } + + /// Get time without waiting + pub unsafe fn time_no_wait(&mut self) -> u64 { + /*let century_register = if let Some(ref fadt) = acpi::ACPI_TABLE.lock().fadt { + Some(fadt.century) + } else { + None + };*/ + + let mut second = self.read(0) as usize; + let mut minute = self.read(2) as usize; + let mut hour = self.read(4) as usize; + let mut day = self.read(7) as usize; + let mut month = self.read(8) as usize; + let mut year = self.read(9) as usize; + let mut century = /* TODO: Fix invalid value from VirtualBox + if let Some(century_reg) = century_register { + self.read(century_reg) as usize + } else */ { + 20 + }; + let register_b = self.read(0xB); + + if register_b & 4 != 4 { + second = cvt_bcd(second); + minute = cvt_bcd(minute); + hour = cvt_bcd(hour & 0x7F) | (hour & 0x80); + day = cvt_bcd(day); + month = cvt_bcd(month); + year = cvt_bcd(year); + century = /* TODO: Fix invalid value from VirtualBox + if century_register.is_some() { + cvt_bcd(century) + } else */ { + century + }; + } + + if register_b & 2 != 2 || hour & 0x80 == 0x80 { + hour = ((hour & 0x7F) + 12) % 24; + } + + year += century * 100; + + // Unix time from clock + let mut secs: u64 = (year as u64 - 1970) * 31_536_000; + + let mut leap_days = (year as u64 - 1972) / 4 + 1; + if year % 4 == 0 && month <= 2 { + leap_days -= 1; + } + secs += leap_days * 86_400; + + match month { + 2 => secs += 2_678_400, + 3 => secs += 5_097_600, + 4 => secs += 7_776_000, + 5 => secs += 10_368_000, + 6 => secs += 13_046_400, + 7 => secs += 15_638_400, + 8 => secs += 18_316_800, + 9 => secs += 20_995_200, + 10 => secs += 23_587_200, + 11 => secs += 26_265_600, + 12 => secs += 28_857_600, + _ => (), + } + + secs += (day as u64 - 1) * 86_400; + secs += hour as u64 * 3600; + secs += minute as u64 * 60; + secs += second as u64; + + secs + } + + /// Get time + pub fn time(&mut self) -> u64 { + loop { + unsafe { + self.wait(false); + let time = self.time_no_wait(); + self.wait(false); + let next_time = self.time_no_wait(); + if time == next_time { + return time; + } + } + } + } +} diff --git a/src/arch/x86/device/serial.rs b/src/arch/x86/device/serial.rs new file mode 100644 index 0000000..280125b --- /dev/null +++ b/src/arch/x86/device/serial.rs @@ -0,0 +1,42 @@ +use crate::devices::uart_16550::SerialPort; +#[cfg(feature = "lpss_debug")] +use crate::syscall::io::Mmio; +use crate::syscall::io::Pio; +use spin::Mutex; + +pub static COM1: Mutex>> = Mutex::new(SerialPort::>::new(0x3F8)); +pub static COM2: Mutex>> = Mutex::new(SerialPort::>::new(0x2F8)); +pub static COM3: Mutex>> = Mutex::new(SerialPort::>::new(0x3E8)); +pub static COM4: Mutex>> = Mutex::new(SerialPort::>::new(0x2E8)); + +#[cfg(feature = "lpss_debug")] +pub static LPSS: Mutex>>> = Mutex::new(None); + +pub unsafe fn init() { + COM1.lock().init(); + COM2.lock().init(); + + #[cfg(feature = "lpss_debug")] + { + // TODO: Make this configurable + let address = crate::PHYS_OFFSET + 0xFE032000; + + { + use crate::paging::{ActivePageTable, Page, VirtualAddress, entry::EntryFlags}; + use crate::memory::{Frame, PhysicalAddress}; + + let mut active_table = ActivePageTable::new(); + let page = Page::containing_address(VirtualAddress::new(address)); + let frame = Frame::containing_address(PhysicalAddress::new(address - crate::PHYS_OFFSET)); + let result = active_table.map_to(page, frame, EntryFlags::PRESENT | EntryFlags::WRITABLE | EntryFlags::NO_EXECUTE); 
+ result.flush(&mut active_table); + } + + let lpss = SerialPort::>::new( + crate::PHYS_OFFSET + 0xFE032000 + ); + lpss.init(); + + *LPSS.lock() = Some(lpss); + } +} diff --git a/src/arch/x86/device/system76_ec.rs b/src/arch/x86/device/system76_ec.rs new file mode 100644 index 0000000..49f3a29 --- /dev/null +++ b/src/arch/x86/device/system76_ec.rs @@ -0,0 +1,91 @@ +use spin::Mutex; +use syscall::io::{Io, Pio}; + +pub static SYSTEM76_EC: Mutex> = Mutex::new(None); + +pub fn init() { + *SYSTEM76_EC.lock() = System76Ec::new(); +} + +pub struct System76Ec { + base: u16, +} + +impl System76Ec { + pub fn new() -> Option { + let mut system76_ec = Self { + base: 0x0E00, + }; + if system76_ec.probe() { + Some(system76_ec) + } else { + None + } + } + + #[inline(always)] + pub fn read(&mut self, addr: u8) -> u8 { + Pio::::new(self.base + addr as u16).read() + } + + #[inline(always)] + pub fn write(&mut self, addr: u8, data: u8) { + Pio::::new(self.base + addr as u16).write(data) + } + + pub fn probe(&mut self) -> bool { + // Send probe command + self.write(0, 1); + + // Wait for response + let mut timeout = 1_000_000; + while timeout > 0 { + if self.read(0) == 0 { + break; + } + timeout -= 1; + } + if timeout == 0 { + return false; + } + + // Return false on command error + if self.read(1) != 0 { + return false; + } + + // Must receive 0x76, 0xEC as signature + self.read(2) == 0x76 && self.read(3) == 0xEC + } + + pub fn flush(&mut self) { + // Send command + self.write(0, 4); + + // TODO: timeout + while self.read(0) != 0 {} + + // Clear length + self.write(3, 0); + } + + pub fn print(&mut self, byte: u8) { + // Read length + let len = self.read(3); + // Write data at offset + self.write(len + 4, byte); + // Update length + self.write(3, len + 1); + + // If we hit the end of the buffer, or were given a newline, flush + if byte == b'\n' || len >= 128 { + self.flush(); + } + } + + pub fn print_slice(&mut self, bytes: &[u8]) { + for &byte in bytes { + self.print(byte); + } + } +} diff --git a/src/arch/x86/gdt.rs b/src/arch/x86/gdt.rs new file mode 100644 index 0000000..04d5131 --- /dev/null +++ b/src/arch/x86/gdt.rs @@ -0,0 +1,269 @@ +//! 
Global descriptor table + +use core::convert::TryInto; +use core::mem; + +use x86::segmentation::load_cs; +use x86::bits64::task::TaskStateSegment; +use x86::Ring; +use x86::dtables::{self, DescriptorTablePointer}; +use x86::segmentation::{self, Descriptor as SegmentDescriptor, SegmentSelector}; +use x86::task; + +use super::cpuid::cpuid; + +pub const GDT_NULL: usize = 0; +pub const GDT_KERNEL_CODE: usize = 1; +pub const GDT_KERNEL_DATA: usize = 2; +pub const GDT_KERNEL_KPCR: usize = 3; +pub const GDT_USER_CODE32_UNUSED: usize = 4; +pub const GDT_USER_DATA: usize = 5; +pub const GDT_USER_CODE: usize = 6; +pub const GDT_TSS: usize = 7; +pub const GDT_TSS_HIGH: usize = 8; +pub const GDT_CPU_ID_CONTAINER: usize = 9; + +pub const GDT_A_PRESENT: u8 = 1 << 7; +pub const GDT_A_RING_0: u8 = 0 << 5; +pub const GDT_A_RING_1: u8 = 1 << 5; +pub const GDT_A_RING_2: u8 = 2 << 5; +pub const GDT_A_RING_3: u8 = 3 << 5; +pub const GDT_A_SYSTEM: u8 = 1 << 4; +pub const GDT_A_EXECUTABLE: u8 = 1 << 3; +pub const GDT_A_CONFORMING: u8 = 1 << 2; +pub const GDT_A_PRIVILEGE: u8 = 1 << 1; +pub const GDT_A_DIRTY: u8 = 1; + +pub const GDT_A_TSS_AVAIL: u8 = 0x9; +pub const GDT_A_TSS_BUSY: u8 = 0xB; + +pub const GDT_F_PAGE_SIZE: u8 = 1 << 7; +pub const GDT_F_PROTECTED_MODE: u8 = 1 << 6; +pub const GDT_F_LONG_MODE: u8 = 1 << 5; + +static mut INIT_GDT: [GdtEntry; 4] = [ + // Null + GdtEntry::new(0, 0, 0, 0), + // Kernel code + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE), + // Kernel data + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE), + // Kernel TLS + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE), +]; + +#[thread_local] +pub static mut GDT: [GdtEntry; 10] = [ + // Null + GdtEntry::new(0, 0, 0, 0), + // Kernel code + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE), + // Kernel data + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE), + // Kernel TLS + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE), + // Dummy 32-bit user code - apparently necessary for SYSEXIT. We restrict it to ring 0 anyway. + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE), + // User data + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE), + // User (64-bit) code + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, GDT_F_PROTECTED_MODE), + // TSS + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_TSS_AVAIL, 0), + // TSS must be 16 bytes long, twice the normal size + GdtEntry::new(0, 0, 0, 0), + // Unused entry which stores the CPU ID. This is necessary for paranoid interrupts as they have + // no other way of determining it. + GdtEntry::new(0, 0, 0, 0), +]; + +#[repr(C, align(16))] +pub struct ProcessorControlRegion { + pub tcb_end: usize, + pub user_rsp_tmp: usize, + pub tss: TssWrapper, +} + +// NOTE: Despite not using #[repr(packed)], we do know that while there may be some padding +// inserted before and after the TSS, the main TSS structure will remain intact. 
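+// The address of this TSS is what init_paging() below splits into tss_lo/tss_hi and writes
+// into the GDT_TSS descriptor before the task register is loaded.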
+#[repr(C, align(16))] +pub struct TssWrapper(pub TaskStateSegment); + +#[thread_local] +pub static mut KPCR: ProcessorControlRegion = ProcessorControlRegion { + tcb_end: 0, + user_rsp_tmp: 0, + tss: TssWrapper(TaskStateSegment { + reserved: 0, + rsp: [0; 3], + reserved2: 0, + ist: [0; 7], + reserved3: 0, + reserved4: 0, + iomap_base: 0xFFFF + }), +}; + +#[cfg(feature = "pti")] +pub unsafe fn set_tss_stack(stack: usize) { + use super::pti::{PTI_CPU_STACK, PTI_CONTEXT_STACK}; + KPCR.tss.0.rsp[0] = (PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len()) as u64; + PTI_CONTEXT_STACK = stack; +} + +#[cfg(not(feature = "pti"))] +pub unsafe fn set_tss_stack(stack: usize) { + KPCR.tss.0.rsp[0] = stack as u64; +} + +// Initialize GDT +pub unsafe fn init() { + { + // Setup the initial GDT with TLS, so we can setup the TLS GDT (a little confusing) + // This means that each CPU will have its own GDT, but we only need to define it once as a thread local + + let limit = (INIT_GDT.len() * mem::size_of::() - 1) + .try_into() + .expect("initial GDT way too large"); + let base = INIT_GDT.as_ptr() as *const SegmentDescriptor; + + let init_gdtr: DescriptorTablePointer = DescriptorTablePointer { + limit, + base, + }; + + // Load the initial GDT, before we have access to thread locals + dtables::lgdt(&init_gdtr); + } + + // Load the segment descriptors + load_cs(SegmentSelector::new(GDT_KERNEL_CODE as u16, Ring::Ring0)); + segmentation::load_ds(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); + segmentation::load_es(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); + segmentation::load_fs(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); + segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_KPCR as u16, Ring::Ring0)); + segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); +} + +/// Initialize GDT with TLS +pub unsafe fn init_paging(cpu_id: u32, tcb_offset: usize, stack_offset: usize) { + // Set temporary TLS segment to the self-pointer of the Thread Control Block. + x86::msr::wrmsr(x86::msr::IA32_GS_BASE, tcb_offset as u64); + + //TODO: will this work with multicore? + { + INIT_GDT[GDT_KERNEL_KPCR].set_offset(tcb_offset as u32); + segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_KPCR as u16, Ring::Ring0)); + } + + // Now that we have access to thread locals, begin by getting a pointer to the Processor + // Control Region. + let kpcr = &mut KPCR; + + // Then, setup the AP's individual GDT + let limit = (GDT.len() * mem::size_of::() - 1) + .try_into() + .expect("main GDT way too large"); + let base = GDT.as_ptr() as *const SegmentDescriptor; + + let gdtr: DescriptorTablePointer = DescriptorTablePointer { + limit, + base, + }; + + // Once we have fetched the real KPCR address, set the TLS segment to the TCB pointer there. + kpcr.tcb_end = (tcb_offset as *const usize).read(); + + { + // We can now access our TSS, via the KPCR, which is a thread local + let tss = &kpcr.tss.0 as *const _ as usize as u64; + let tss_lo = (tss & 0xFFFF_FFFF) as u32; + let tss_hi = (tss >> 32) as u32; + + GDT[GDT_TSS].set_offset(tss_lo); + GDT[GDT_TSS].set_limit(mem::size_of::() as u32); + + (&mut GDT[GDT_TSS_HIGH] as *mut GdtEntry).cast::().write(tss_hi); + } + + // And finally, populate the last GDT entry with the current CPU ID, to allow paranoid + // interrupt handlers to safely use TLS. + (&mut GDT[GDT_CPU_ID_CONTAINER] as *mut GdtEntry).cast::().write(cpu_id); + + // Set the stack pointer to use when coming back from userspace. 
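+    // (When an interrupt or exception arrives from ring 3, the CPU switches to the stack
+    // recorded in the TSS, rsp[0] here, so this must point at a valid kernel stack; with the
+    // "pti" feature it instead points at the per-CPU trampoline stack set up above.)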
+ set_tss_stack(stack_offset); + + // Load the new GDT, which is correctly located in thread local storage. + dtables::lgdt(&gdtr); + + // Ensure that GS always points to the KPCR in kernel space. + x86::msr::wrmsr(x86::msr::IA32_GS_BASE, kpcr as *mut _ as usize as u64); + // Inside kernel space, GS should _always_ point to the TSS. When leaving userspace, `swapgs` + // is called again, making the userspace GS always point to user data. + x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0); + + // Set the User TLS segment to zero, before we create any contexts and start scheduling. + x86::msr::wrmsr(x86::msr::IA32_FS_BASE, 0); + + // Reload the segment descriptors + load_cs(SegmentSelector::new(GDT_KERNEL_CODE as u16, Ring::Ring0)); + segmentation::load_ds(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); + segmentation::load_es(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); + segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); + + // NOTE: FS has already been updated while calling set_tcb. + // NOTE: We do not want to load GS again, since it has already been loaded into + // GDT_KERNEL_KPCR. Instead, we use the base MSR to allow for a 64-bit offset. + + // Load the task register + task::load_tr(SegmentSelector::new(GDT_TSS as u16, Ring::Ring0)); + + let has_fsgsbase = cpuid().map_or(false, |cpuid| { + cpuid.get_extended_feature_info().map_or(false, |extended_features| { + extended_features.has_fsgsbase() + }) + }); + + if cfg!(feature = "x86_fsgsbase") { + assert!(has_fsgsbase, "running kernel with features not supported by the current CPU"); + } + + if has_fsgsbase { + x86::controlregs::cr4_write(x86::controlregs::cr4() | x86::controlregs::Cr4::CR4_ENABLE_FSGSBASE); + } +} + +#[derive(Copy, Clone, Debug)] +#[repr(packed)] +pub struct GdtEntry { + pub limitl: u16, + pub offsetl: u16, + pub offsetm: u8, + pub access: u8, + pub flags_limith: u8, + pub offseth: u8 +} + +impl GdtEntry { + pub const fn new(offset: u32, limit: u32, access: u8, flags: u8) -> Self { + GdtEntry { + limitl: limit as u16, + offsetl: offset as u16, + offsetm: (offset >> 16) as u8, + access, + flags_limith: flags & 0xF0 | ((limit >> 16) as u8) & 0x0F, + offseth: (offset >> 24) as u8 + } + } + + pub fn set_offset(&mut self, offset: u32) { + self.offsetl = offset as u16; + self.offsetm = (offset >> 16) as u8; + self.offseth = (offset >> 24) as u8; + } + + pub fn set_limit(&mut self, limit: u32) { + self.limitl = limit as u16; + self.flags_limith = self.flags_limith & 0xF0 | ((limit >> 16) as u8) & 0x0F; + } +} diff --git a/src/arch/x86/idt.rs b/src/arch/x86/idt.rs new file mode 100644 index 0000000..fb08090 --- /dev/null +++ b/src/arch/x86/idt.rs @@ -0,0 +1,337 @@ +use core::num::NonZeroU8; +use core::sync::atomic::{AtomicU32, Ordering}; +use core::mem; + +use alloc::boxed::Box; +use alloc::collections::BTreeMap; + +use x86::segmentation::Descriptor as X86IdtEntry; +use x86::dtables::{self, DescriptorTablePointer}; + +use crate::interrupt::*; +use crate::ipi::IpiKind; + +use spin::RwLock; + +pub static mut INIT_IDTR: DescriptorTablePointer = DescriptorTablePointer { + limit: 0, + base: 0 as *const X86IdtEntry +}; + +#[thread_local] +pub static mut IDTR: DescriptorTablePointer = DescriptorTablePointer { + limit: 0, + base: 0 as *const X86IdtEntry +}; + +pub type IdtEntries = [IdtEntry; 256]; +pub type IdtReservations = [AtomicU32; 8]; + +#[repr(packed)] +pub struct Idt { + entries: IdtEntries, + reservations: IdtReservations, +} +impl Idt { + pub const fn new() -> Self { 
+ Self { + entries: [IdtEntry::new(); 256], + reservations: new_idt_reservations(), + } + } + #[inline] + pub fn is_reserved(&self, index: u8) -> bool { + let byte_index = index / 32; + let bit = index % 32; + + { &self.reservations[usize::from(byte_index)] }.load(Ordering::Acquire) & (1 << bit) != 0 + } + + #[inline] + pub fn set_reserved(&self, index: u8, reserved: bool) { + let byte_index = index / 32; + let bit = index % 32; + + { &self.reservations[usize::from(byte_index)] }.fetch_or(u32::from(reserved) << bit, Ordering::AcqRel); + } + #[inline] + pub fn is_reserved_mut(&mut self, index: u8) -> bool { + let byte_index = index / 32; + let bit = index % 32; + + *{ &mut self.reservations[usize::from(byte_index)] }.get_mut() & (1 << bit) != 0 + } + + #[inline] + pub fn set_reserved_mut(&mut self, index: u8, reserved: bool) { + let byte_index = index / 32; + let bit = index % 32; + + *{ &mut self.reservations[usize::from(byte_index)] }.get_mut() |= u32::from(reserved) << bit; + } +} + +static mut INIT_BSP_IDT: Idt = Idt::new(); + +// TODO: VecMap? +pub static IDTS: RwLock>> = RwLock::new(None); + +#[inline] +pub fn is_reserved(cpu_id: usize, index: u8) -> bool { + let byte_index = index / 32; + let bit = index % 32; + + { &IDTS.read().as_ref().unwrap().get(&cpu_id).unwrap().reservations[usize::from(byte_index)] }.load(Ordering::Acquire) & (1 << bit) != 0 +} + +#[inline] +pub fn set_reserved(cpu_id: usize, index: u8, reserved: bool) { + let byte_index = index / 32; + let bit = index % 32; + + { &IDTS.read().as_ref().unwrap().get(&cpu_id).unwrap().reservations[usize::from(byte_index)] }.fetch_or(u32::from(reserved) << bit, Ordering::AcqRel); +} + +pub fn allocate_interrupt() -> Option { + let cpu_id = crate::cpu_id(); + for number in 50..=254 { + if ! is_reserved(cpu_id, number) { + set_reserved(cpu_id, number, true); + return Some(unsafe { NonZeroU8::new_unchecked(number) }); + } + } + None +} + +pub fn available_irqs_iter(cpu_id: usize) -> impl Iterator + 'static { + (32..=254).filter(move |&index| !is_reserved(cpu_id, index)) +} + +macro_rules! use_irq( + ( $idt: expr, $number:literal, $func:ident ) => {{ + $idt[$number].set_func($func); + }} +); + +macro_rules! use_default_irqs( + ($idt:expr) => {{ + use crate::interrupt::irq::*; + default_irqs!($idt, use_irq); + }} +); + +pub unsafe fn init() { + dtables::lidt(&INIT_IDTR); +} + +const fn new_idt_reservations() -> [AtomicU32; 8] { + [ + AtomicU32::new(0), AtomicU32::new(0), AtomicU32::new(0), AtomicU32::new(0), + AtomicU32::new(0), AtomicU32::new(0), AtomicU32::new(0), AtomicU32::new(0) + ] +} + +/// Initialize the IDT for a +pub unsafe fn init_paging_post_heap(is_bsp: bool, cpu_id: usize) { + let mut idts_guard = IDTS.write(); + let idts_btree = idts_guard.get_or_insert_with(|| BTreeMap::new()); + + if is_bsp { + idts_btree.insert(cpu_id, &mut INIT_BSP_IDT); + } else { + let idt = idts_btree.entry(cpu_id).or_insert_with(|| Box::leak(Box::new(Idt::new()))); + init_generic(is_bsp, idt); + } +} + +/// Initializes a fully functional IDT for use before it be moved into the map. This is ONLY called +/// on the BSP, since the kernel heap is ready for the APs. +pub unsafe fn init_paging_bsp() { + init_generic(true, &mut INIT_BSP_IDT); +} + +/// Initializes an IDT for any type of processor. 
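+/// (Used both for the statically allocated BSP IDT during early boot, via init_paging_bsp(),
+/// and for the heap-allocated, leaked per-AP IDTs created in init_paging_post_heap() above.)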
+pub unsafe fn init_generic(is_bsp: bool, idt: &mut Idt) { + let (current_idt, current_reservations) = (&mut idt.entries, &mut idt.reservations); + + IDTR.limit = (current_idt.len() * mem::size_of::() - 1) as u16; + IDTR.base = current_idt.as_ptr() as *const X86IdtEntry; + + let backup_ist = { + // We give Non-Maskable Interrupts, Double Fault, and Machine Check exceptions separate + // stacks, since these (unless we are going to set up NMI watchdogs like Linux does) are + // considered the most fatal, especially Double Faults which are caused by errors __when + // accessing the system IDT__. If that goes wrong, then kernel memory may be partially + // corrupt, and we want a separate stack. + // + // Note that each CPU has its own "backup interrupt stack". + let index = 1_u8; + + // Allocate 64 KiB of stack space for the backup stack. + const BACKUP_STACK_SIZE: usize = 65536; + assert_eq!(BACKUP_STACK_SIZE % crate::memory::PAGE_SIZE, 0); + let page_count = BACKUP_STACK_SIZE / crate::memory::PAGE_SIZE; + let frames = crate::memory::allocate_frames(page_count) + .expect("failed to allocate pages for backup interrupt stack"); + + use crate::paging::{RmmA, RmmArch}; + + // Physical pages are mapped linearly. So is the linearly mapped virtual memory. + let base_address = RmmA::phys_to_virt(frames.start_address()); + + // Stack always grows downwards. + let address = base_address.data() + BACKUP_STACK_SIZE; + + // Put them in the 1st entry of the IST. + crate::gdt::KPCR.tss.0.ist[usize::from(index - 1)] = address as u64; + + index + }; + + // Set up exceptions + current_idt[0].set_func(exception::divide_by_zero); + current_idt[1].set_func(exception::debug); + current_idt[2].set_func(exception::non_maskable); + current_idt[2].set_ist(backup_ist); + current_idt[3].set_func(exception::breakpoint); + current_idt[3].set_flags(IdtFlags::PRESENT | IdtFlags::RING_3 | IdtFlags::INTERRUPT); + current_idt[4].set_func(exception::overflow); + current_idt[5].set_func(exception::bound_range); + current_idt[6].set_func(exception::invalid_opcode); + current_idt[7].set_func(exception::device_not_available); + current_idt[8].set_func(exception::double_fault); + current_idt[8].set_ist(backup_ist); + // 9 no longer available + current_idt[10].set_func(exception::invalid_tss); + current_idt[11].set_func(exception::segment_not_present); + current_idt[12].set_func(exception::stack_segment); + current_idt[13].set_func(exception::protection); + current_idt[14].set_func(exception::page); + // 15 reserved + current_idt[16].set_func(exception::fpu_fault); + current_idt[17].set_func(exception::alignment_check); + current_idt[18].set_func(exception::machine_check); + current_idt[18].set_ist(backup_ist); + current_idt[19].set_func(exception::simd); + current_idt[20].set_func(exception::virtualization); + // 21 through 29 reserved + current_idt[30].set_func(exception::security); + // 31 reserved + + // reserve bits 31:0, i.e. 
the first 32 interrupts, which are reserved for exceptions + *current_reservations[0].get_mut() |= 0x0000_0000_FFFF_FFFF; + + if is_bsp { + // Set up IRQs + current_idt[32].set_func(irq::pit_stack); + current_idt[33].set_func(irq::keyboard); + current_idt[34].set_func(irq::cascade); + current_idt[35].set_func(irq::com2); + current_idt[36].set_func(irq::com1); + current_idt[37].set_func(irq::lpt2); + current_idt[38].set_func(irq::floppy); + current_idt[39].set_func(irq::lpt1); + current_idt[40].set_func(irq::rtc); + current_idt[41].set_func(irq::pci1); + current_idt[42].set_func(irq::pci2); + current_idt[43].set_func(irq::pci3); + current_idt[44].set_func(irq::mouse); + current_idt[45].set_func(irq::fpu); + current_idt[46].set_func(irq::ata1); + current_idt[47].set_func(irq::ata2); + current_idt[48].set_func(irq::lapic_timer); + current_idt[49].set_func(irq::lapic_error); + + + // reserve bits 49:32, which are for the standard IRQs, and for the local apic timer and error. + *current_reservations[1].get_mut() |= 0x0003_FFFF; + } else { + // TODO: use_default_irqs! but also the legacy IRQs that are only needed on one CPU + current_idt[49].set_func(irq::lapic_error); + + // reserve bit 49 + *current_reservations[1].get_mut() |= (1 << 17); + } + + use_default_irqs!(current_idt); + + // Set IPI handlers + current_idt[IpiKind::Wakeup as usize].set_func(ipi::wakeup); + current_idt[IpiKind::Switch as usize].set_func(ipi::switch); + current_idt[IpiKind::Tlb as usize].set_func(ipi::tlb); + current_idt[IpiKind::Pit as usize].set_func(ipi::pit); + idt.set_reserved_mut(IpiKind::Wakeup as u8, true); + idt.set_reserved_mut(IpiKind::Switch as u8, true); + idt.set_reserved_mut(IpiKind::Tlb as u8, true); + idt.set_reserved_mut(IpiKind::Pit as u8, true); + let current_idt = &mut idt.entries; + + // Set syscall function + current_idt[0x80].set_func(syscall::syscall); + current_idt[0x80].set_flags(IdtFlags::PRESENT | IdtFlags::RING_3 | IdtFlags::INTERRUPT); + idt.set_reserved_mut(0x80, true); + + dtables::lidt(&IDTR); +} + +bitflags! 
{
+    pub struct IdtFlags: u8 {
+        const PRESENT = 1 << 7;
+        const RING_0 = 0 << 5;
+        const RING_1 = 1 << 5;
+        const RING_2 = 2 << 5;
+        const RING_3 = 3 << 5;
+        const SS = 1 << 4;
+        const INTERRUPT = 0xE;
+        const TRAP = 0xF;
+    }
+}
+
+#[derive(Copy, Clone, Debug, Default)]
+#[repr(packed)]
+pub struct IdtEntry {
+    offsetl: u16,
+    selector: u16,
+    zero: u8,
+    attribute: u8,
+    offsetm: u16,
+    offseth: u32,
+    zero2: u32
+}
+
+impl IdtEntry {
+    pub const fn new() -> IdtEntry {
+        IdtEntry {
+            offsetl: 0,
+            selector: 0,
+            zero: 0,
+            attribute: 0,
+            offsetm: 0,
+            offseth: 0,
+            zero2: 0
+        }
+    }
+
+    pub fn set_flags(&mut self, flags: IdtFlags) {
+        self.attribute = flags.bits;
+    }
+
+    pub fn set_ist(&mut self, ist: u8) {
+        assert_eq!(ist & 0x07, ist, "interrupt stack table must be within 0..=7");
+        self.zero &= 0xF8;
+        self.zero |= ist;
+    }
+
+    pub fn set_offset(&mut self, selector: u16, base: usize) {
+        self.selector = selector;
+        self.offsetl = base as u16;
+        self.offsetm = (base >> 16) as u16;
+        self.offseth = ((base as u64) >> 32) as u32;
+    }
+
+    // A function to set the offset more easily
+    pub fn set_func(&mut self, func: unsafe extern fn()) {
+        self.set_flags(IdtFlags::PRESENT | IdtFlags::RING_0 | IdtFlags::INTERRUPT);
+        self.set_offset((crate::gdt::GDT_KERNEL_CODE as u16) << 3, func as usize);
+    }
+}
diff --git a/src/arch/x86/interrupt/exception.rs b/src/arch/x86/interrupt/exception.rs
new file mode 100644
index 0000000..c01ae15
--- /dev/null
+++ b/src/arch/x86/interrupt/exception.rs
@@ -0,0 +1,187 @@
+use crate::{
+    interrupt::stack_trace,
+    ptrace,
+    syscall::flag::*,
+
+    interrupt_stack,
+    interrupt_error,
+};
+
+extern {
+    fn ksignal(signal: usize);
+}
+
+interrupt_stack!(divide_by_zero, |stack| {
+    println!("Divide by zero");
+    stack.dump();
+    stack_trace();
+    ksignal(SIGFPE);
+});
+
+interrupt_stack!(debug, @paranoid, |stack| {
+    let mut handled = false;
+
+    // Disable singlestep before handling the breakpoint, since the breakpoint
+    // handler might end up setting it again; unless it does, we want it to
+    // default to false.
+    let had_singlestep = stack.iret.eflags & (1 << 8) == 1 << 8;
+    stack.set_singlestep(false);
+
+    if ptrace::breakpoint_callback(PTRACE_STOP_SINGLESTEP, None).is_some() {
+        handled = true;
+    } else {
+        // There was no breakpoint, restore original value
+        stack.set_singlestep(had_singlestep);
+    }
+
+    if !handled {
+        println!("Debug trap");
+        stack.dump();
+        ksignal(SIGTRAP);
+    }
+});
+
+interrupt_stack!(non_maskable, @paranoid, |stack| {
+    println!("Non-maskable interrupt");
+    stack.dump();
+});
+
+interrupt_stack!(breakpoint, |stack| {
+    // The processor lets EIP point to the instruction *after* int3, so
+    // unhandled breakpoint interrupts don't go into an infinite loop. But we
+    // throw SIGTRAP anyway, so that's not a problem.
+    //
+    // We have the following code to prevent
+    // - EIP from going out of sync with instructions
+    // - The user having to do 2 syscalls to replace the instruction at EIP
+    // - Having more compatibility glue for GDB than necessary
+    //
+    // Let's just follow Linux convention and let EIP be EIP-1, pointing to the
+    // int3 instruction. After all, it's the sanest thing to do.
+ stack.iret.eip -= 1; + + if ptrace::breakpoint_callback(PTRACE_STOP_BREAKPOINT, None).is_none() { + println!("Breakpoint trap"); + stack.dump(); + ksignal(SIGTRAP); + } +}); + +interrupt_stack!(overflow, |stack| { + println!("Overflow trap"); + stack.dump(); + stack_trace(); + ksignal(SIGFPE); +}); + +interrupt_stack!(bound_range, |stack| { + println!("Bound range exceeded fault"); + stack.dump(); + stack_trace(); + ksignal(SIGSEGV); +}); + +interrupt_stack!(invalid_opcode, |stack| { + println!("Invalid opcode fault"); + stack.dump(); + stack_trace(); + ksignal(SIGILL); +}); + +interrupt_stack!(device_not_available, |stack| { + println!("Device not available fault"); + stack.dump(); + stack_trace(); + ksignal(SIGILL); +}); + +interrupt_error!(double_fault, |stack| { + println!("Double fault"); + stack.dump(); + stack_trace(); + ksignal(SIGSEGV); +}); + +interrupt_error!(invalid_tss, |stack| { + println!("Invalid TSS fault"); + stack.dump(); + stack_trace(); + ksignal(SIGSEGV); +}); + +interrupt_error!(segment_not_present, |stack| { + println!("Segment not present fault"); + stack.dump(); + stack_trace(); + ksignal(SIGSEGV); +}); + +interrupt_error!(stack_segment, |stack| { + println!("Stack segment fault"); + stack.dump(); + stack_trace(); + ksignal(SIGSEGV); +}); + +interrupt_error!(protection, |stack| { + println!("Protection fault"); + stack.dump(); + stack_trace(); + ksignal(SIGSEGV); +}); + +interrupt_error!(page, |stack| { + let cr2: usize; + core::arch::asm!("mov {}, cr2", out(reg) cr2); + println!("Page fault: {:>016X}", cr2); + println!(" Present: {}", stack.code & 1 << 0 != 0); + println!(" Write: {}", stack.code & 1 << 1 != 0); + println!(" User: {}", stack.code & 1 << 2 != 0); + println!(" Reserved write: {}", stack.code & 1 << 3 != 0); + println!(" Instruction fetch: {}", stack.code & 1 << 4 != 0); + stack.dump(); + stack_trace(); + ksignal(SIGSEGV); +}); + +interrupt_stack!(fpu_fault, |stack| { + println!("FPU floating point fault"); + stack.dump(); + stack_trace(); + ksignal(SIGFPE); +}); + +interrupt_error!(alignment_check, |stack| { + println!("Alignment check fault"); + stack.dump(); + stack_trace(); + ksignal(SIGBUS); +}); + +interrupt_stack!(machine_check, @paranoid, |stack| { + println!("Machine check fault"); + stack.dump(); + stack_trace(); + ksignal(SIGBUS); +}); + +interrupt_stack!(simd, |stack| { + println!("SIMD floating point fault"); + stack.dump(); + stack_trace(); + ksignal(SIGFPE); +}); + +interrupt_stack!(virtualization, |stack| { + println!("Virtualization fault"); + stack.dump(); + stack_trace(); + ksignal(SIGBUS); +}); + +interrupt_error!(security, |stack| { + println!("Security exception"); + stack.dump(); + stack_trace(); + ksignal(SIGBUS); +}); diff --git a/src/arch/x86/interrupt/handler.rs b/src/arch/x86/interrupt/handler.rs new file mode 100644 index 0000000..a601df1 --- /dev/null +++ b/src/arch/x86/interrupt/handler.rs @@ -0,0 +1,489 @@ +use core::mem; + +use crate::syscall::IntRegisters; + +use super::super::flags::*; + +#[derive(Default)] +#[repr(packed)] +pub struct ScratchRegisters { + pub edx: usize, + pub ecx: usize, + pub eax: usize, +} + +impl ScratchRegisters { + pub fn dump(&self) { + println!("EAX: {:016x}", { self.eax }); + println!("ECX: {:016x}", { self.ecx }); + println!("EDX: {:016x}", { self.edx }); + } +} + +#[derive(Default)] +#[repr(packed)] +pub struct PreservedRegisters { + pub ebp: usize, + pub esi: usize, + pub edi: usize, + pub ebx: usize, +} + +impl PreservedRegisters { + pub fn dump(&self) { + println!("EBX: {:016x}", { 
self.ebx });
+        println!("EDI: {:016x}", { self.edi });
+        println!("ESI: {:016x}", { self.esi });
+        println!("EBP: {:016x}", { self.ebp });
+    }
+}
+
+#[derive(Default)]
+#[repr(packed)]
+pub struct IretRegisters {
+    pub eip: usize,
+    pub cs: usize,
+    pub eflags: usize,
+
+    // ----
+    // The following will only be present if the interrupt is raised from another
+    // privilege ring. Otherwise, they are undefined values.
+    // ----
+
+    pub esp: usize,
+    pub ss: usize
+}
+
+impl IretRegisters {
+    pub fn dump(&self) {
+        println!("EFLAG: {:016x}", { self.eflags });
+        println!("CS: {:016x}", { self.cs });
+        println!("EIP: {:016x}", { self.eip });
+
+        if self.cs & 0b11 != 0b00 {
+            println!("ESP: {:016x}", { self.esp });
+            println!("SS: {:016x}", { self.ss });
+        }
+        unsafe {
+            let fsbase = x86::msr::rdmsr(x86::msr::IA32_FS_BASE);
+            let gsbase = x86::msr::rdmsr(x86::msr::IA32_GS_BASE);
+            let kgsbase = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE);
+            println!("FSBASE {:016x}\nGSBASE {:016x}\nKGSBASE {:016x}", fsbase, gsbase, kgsbase);
+        }
+    }
+}
+
+#[derive(Default)]
+#[repr(packed)]
+pub struct InterruptStack {
+    pub preserved: PreservedRegisters,
+    pub scratch: ScratchRegisters,
+    pub iret: IretRegisters,
+}
+
+impl InterruptStack {
+    pub fn dump(&self) {
+        self.iret.dump();
+        self.scratch.dump();
+        self.preserved.dump();
+    }
+    /// Saves all registers to a struct used by the proc:
+    /// scheme to read/write registers.
+    pub fn save(&self, all: &mut IntRegisters) {
+        all.ebp = self.preserved.ebp;
+        all.esi = self.preserved.esi;
+        all.edi = self.preserved.edi;
+        all.ebx = self.preserved.ebx;
+        all.edx = self.scratch.edx;
+        all.ecx = self.scratch.ecx;
+        all.eax = self.scratch.eax;
+        all.eip = self.iret.eip;
+        all.cs = self.iret.cs;
+        all.eflags = self.iret.eflags;
+
+        // Set esp and ss:
+
+        const CPL_MASK: usize = 0b11;
+
+        let cs: usize;
+        unsafe {
+            core::arch::asm!("mov {}, cs", out(reg) cs);
+        }
+
+        if self.iret.cs & CPL_MASK == cs & CPL_MASK {
+            // Privilege ring didn't change, so neither did the stack
+            all.esp = self as *const Self as usize // esp after Self was pushed to the stack
+                + mem::size_of::<Self>() // disregard Self
+                - mem::size_of::<usize>() * 2; // well, almost: esp and ss need to be excluded as they aren't present
+            unsafe {
+                core::arch::asm!("mov {}, ss", out(reg) all.ss);
+            }
+        } else {
+            all.esp = self.iret.esp;
+            all.ss = self.iret.ss;
+        }
+    }
+    /// Loads all registers from a struct used by the proc:
+    /// scheme to read/write registers.
+    pub fn load(&mut self, all: &IntRegisters) {
+        // TODO: Which of these should be allowed to change?
+
+        self.preserved.ebp = all.ebp;
+        self.preserved.esi = all.esi;
+        self.preserved.edi = all.edi;
+        self.preserved.ebx = all.ebx;
+        self.scratch.edx = all.edx;
+        self.scratch.ecx = all.ecx;
+        self.scratch.eax = all.eax;
+        self.iret.eip = all.eip;
+
+        // These should probably be restricted
+        // self.iret.cs = all.cs;
+        // self.iret.eflags = all.eflags;
+    }
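The singlestep helpers below flip bit 8 (TF, the trap flag) of the saved EFLAGS image; FLAG_SINGLESTEP is defined in arch/x86/mod.rs. The bit arithmetic in isolation, as a sketch:

    const FLAG_SINGLESTEP: usize = 1 << 8; // TF in EFLAGS

    fn with_tf(eflags: usize, enabled: bool) -> usize {
        if enabled {
            eflags | FLAG_SINGLESTEP
        } else {
            eflags & !FLAG_SINGLESTEP
        }
    }

    // with_tf(with_tf(0x202, true), false) == 0x202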
+    pub fn set_singlestep(&mut self, enabled: bool) {
+        if enabled {
+            self.iret.eflags |= FLAG_SINGLESTEP;
+        } else {
+            self.iret.eflags &= !FLAG_SINGLESTEP;
+        }
+    }
+    /// Checks if the trap flag is enabled, see `set_singlestep`
+    pub fn is_singlestep(&self) -> bool {
+        self.iret.eflags & FLAG_SINGLESTEP == FLAG_SINGLESTEP
+    }
+}
+
+#[derive(Default)]
+#[repr(packed)]
+pub struct InterruptErrorStack {
+    pub code: usize,
+    pub inner: InterruptStack,
+}
+
+impl InterruptErrorStack {
+    pub fn dump(&self) {
+        println!("CODE: {:016x}", { self.code });
+        self.inner.dump();
+    }
+}
+
+#[macro_export]
+macro_rules! push_scratch {
+    () => { "
+        // Push scratch registers (minus eax)
+        push ecx
+        push edx
+    " };
+}
+#[macro_export]
+macro_rules! pop_scratch {
+    () => { "
+        // Pop scratch registers
+        pop edx
+        pop ecx
+        pop eax
+    " };
+}
+
+#[macro_export]
+macro_rules! push_preserved {
+    () => { "
+        // Push preserved registers
+        push ebx
+        push edi
+        push esi
+        push ebp
+    " };
+}
+#[macro_export]
+macro_rules! pop_preserved {
+    () => { "
+        // Pop preserved registers
+        pop ebp
+        pop esi
+        pop edi
+        pop ebx
+    " };
+}
+macro_rules! swapgs_iff_ring3_fast {
+    () => { "
+        // Check whether the low two bits of the saved code segment, at [esp + 4], are zero.
+        test DWORD PTR [esp + 4], 0x3
+        // Skip the SWAPGS instruction if CS & 0b11 == 0b00.
+        jz 1f
+        //TODO swapgs
+        1:
+    " };
+}
+macro_rules! swapgs_iff_ring3_fast_errorcode {
+    () => { "
+        // Same check, but the error code moves the saved CS to [esp + 8].
+        test DWORD PTR [esp + 8], 0x3
+        jz 1f
+        //TODO swapgs
+        1:
+    " };
+}
+
+macro_rules! save_gsbase_paranoid {
+    () => { "
+        // rdmsr returns the 64-bit MSR value in EDX:EAX; save both halves.
+        mov ecx, {IA32_GS_BASE}
+        rdmsr
+        push edx
+        push eax
+    " }
+}
+
+macro_rules! restore_gsbase_paranoid {
+    () => { "
+        pop eax
+        pop edx
+
+        mov ecx, {IA32_GS_BASE}
+        wrmsr
+    " }
+}
+
+macro_rules! set_gsbase_paranoid {
+    () => { "
+        // The percpu base is in EDX; move it to the low half and clear the high half.
+        mov ecx, {IA32_GS_BASE}
+        mov eax, edx
+        xor edx, edx
+        wrmsr
+    " }
+}
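The three gsbase macros above move 64-bit MSR values through the EDX:EAX pair, as rdmsr and wrmsr require. The same packing written out in Rust, for reference while reviewing the assembly (illustration only):

    // rdmsr returns the high half in EDX and the low half in EAX;
    // wrmsr consumes the same pair.
    fn split_msr(value: u64) -> (u32, u32) {
        ((value >> 32) as u32, value as u32) // (edx, eax)
    }

    fn join_msr(edx: u32, eax: u32) -> u64 {
        (u64::from(edx) << 32) | u64::from(eax)
    }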
+macro_rules! save_and_set_gsbase_paranoid {
+    // For paranoid interrupt entries, we have to be extremely careful with how we use IA32_GS_BASE
+    // and IA32_KERNEL_GS_BASE. If FSGSBASE is enabled, then we have no way to differentiate these
+    // two, as paranoid interrupts (e.g. NMIs) can occur even in kernel mode. In fact, they can
+    // even occur within another IRQ, so we cannot check the privilege level via the stack.
+    //
+    // What we do instead is use a special entry in the GDT, since we know that the GDT will
+    // always be thread local, as it contains the TSS. This gives us more than 32 bits to work
+    // with, which is already enough for the largest x2APIC ID that an x86 CPU can handle. Luckily
+    // we can also use the stack, even though there might be interrupts in between.
+    //
+    // TODO: Linux uses the Interrupt Stack Table to figure out which NMIs were nested. Perhaps
+    // this could be done here, because if nested (sp > initial_sp), that means the NMI could not
+    // have come from userspace. But then, knowing the initial sp would somehow have to involve
+    // percpu, which brings us back to square one. But it might be useful if we would allow faults
+    // in NMIs. If we do detect a nested interrupt, then we can perform the iretd procedure
+    // ourselves, so that the newly nested NMI still blocks additional interrupts while still
+    // returning to the previously (faulting) NMI. See https://lwn.net/Articles/484932/, although I
+    // think the solution becomes a bit simpler when we can no longer rely on GSBASE.
+
+    () => { concat!(
+        save_gsbase_paranoid!(),
+
+        // Allocate scratch space for the GDT descriptor: the 2-byte limit lands at [esp + 6]
+        // (ignored) and the 4-byte base at [esp + 8].
+        "sub esp, 16\n",
+        // Store the GDT descriptor there.
+        "sgdt [esp + 6]\n",
+        // Get the base pointer
+        "
+        mov eax, [esp + 8]
+        add esp, 16
+        ",
+        // Load the lower 32 bits of the CPU id GDT entry.
+        "mov edx, [eax + {gdt_cpu_id_offset}]\n",
+        // Calculate the percpu offset.
+        "
+        mov ebx, {KERNEL_PERCPU_OFFSET}
+        shl edx, {KERNEL_PERCPU_SHIFT}
+        add edx, ebx
+        ",
+        // Set GSBASE accordingly (the percpu base is in EDX).
+        set_gsbase_paranoid!(),
+    ) }
+}
+macro_rules! nop {
+    () => { "
+        // Unused: {IA32_GS_BASE} {KERNEL_PERCPU_OFFSET} {KERNEL_PERCPU_SHIFT} {gdt_cpu_id_offset}
+    " }
+}
+
+#[macro_export]
+macro_rules! interrupt_stack {
+    // XXX: Apparently we cannot use $expr and check for bool exhaustiveness, so we will have to
+    // use idents directly instead.
+    ($name:ident, $save1:ident!, $save2:ident!, $rstor2:ident!, $rstor1:ident!, is_paranoid: $is_paranoid:expr, |$stack:ident| $code:block) => {
+        #[naked]
+        pub unsafe extern "C" fn $name() {
+            unsafe extern "C" fn inner($stack: &mut $crate::arch::x86::interrupt::InterruptStack) {
+                let _guard;
+
+                if !$is_paranoid {
+                    // Deadlock safety: (non-paranoid) interrupts are not normally enabled in the
+                    // kernel, except in kmain. However, no locks for the context list, nor even
+                    // individual context locks, are ever meant to be acquired there.
+                    _guard = $crate::ptrace::set_process_regs($stack);
+                }
+
+                // TODO: Force the declarations to specify unsafe?
+
+                #[allow(unused_unsafe)]
+                unsafe {
+                    $code
+                }
+            }
+            core::arch::asm!(concat!(
+                // Backup all userspace registers to stack
+                $save1!(),
+                "push eax\n",
+                push_scratch!(),
+                push_preserved!(),
+
+                $save2!(),
+
+                // TODO: Map PTI
+                // $crate::arch::x86::pti::map();
+
+                // Call inner function with pointer to stack (cdecl passes arguments on the stack)
+                "
+                push esp
+                call {inner}
+                add esp, 4
+                ",
+
+                // TODO: Unmap PTI
+                // $crate::arch::x86::pti::unmap();
+
+                $rstor2!(),
+
+                // Restore all userspace registers
+                pop_preserved!(),
+                pop_scratch!(),
+
+                $rstor1!(),
+                "iretd\n",
+            ),
+
+            inner = sym inner,
+            IA32_GS_BASE = const(x86::msr::IA32_GS_BASE),
+            KERNEL_PERCPU_SHIFT = const(crate::KERNEL_PERCPU_SHIFT),
+            KERNEL_PERCPU_OFFSET = const(crate::KERNEL_PERCPU_OFFSET),
+
+            gdt_cpu_id_offset = const(crate::gdt::GDT_CPU_ID_CONTAINER * core::mem::size_of::<usize>()),
+
+            options(noreturn),
+
+            );
+        }
+    };
+    ($name:ident, |$stack:ident| $code:block) => { interrupt_stack!($name, swapgs_iff_ring3_fast!, nop!, nop!, swapgs_iff_ring3_fast!, is_paranoid: false, |$stack| $code); };
+    ($name:ident, @paranoid, |$stack:ident| $code:block) => { interrupt_stack!($name, nop!, save_and_set_gsbase_paranoid!, restore_gsbase_paranoid!, nop!, is_paranoid: true, |$stack| $code); }
+}
+
+#[macro_export]
+macro_rules! interrupt {
+    ($name:ident, || $code:block) => {
+        #[naked]
+        pub unsafe extern "C" fn $name() {
+            unsafe extern "C" fn inner() {
+                $code
+            }
+
+            core::arch::asm!(concat!(
+                // Backup all userspace registers to stack
+                swapgs_iff_ring3_fast!(),
+                "push eax\n",
+                push_scratch!(),
+
+                // TODO: Map PTI
+                // $crate::arch::x86::pti::map();
+
+                // Call inner function (it takes no arguments)
+                "call {inner}\n",
+
+                // TODO: Unmap PTI
+                // $crate::arch::x86::pti::unmap();
+
+                // Restore all userspace registers
+                pop_scratch!(),
+
+                swapgs_iff_ring3_fast!(),
+                "iretd\n",
+            ),
+
+            inner = sym inner,
+
+            options(noreturn),
+            );
+        }
+    };
+}
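For context, the two macros are consumed like this elsewhere in the patch; the handler names here are invented for illustration:

    // A handler that needs no access to the interrupted context, as in irq.rs:
    interrupt!(example_tick, || {
        println!("tick");
    });

    // A handler that inspects and may modify the saved registers, as in exception.rs:
    interrupt_stack!(example_trap, |stack| {
        stack.dump();
    });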
+#[macro_export]
+macro_rules! interrupt_error {
+    ($name:ident, |$stack:ident| $code:block) => {
+        #[naked]
+        pub unsafe extern "C" fn $name() {
+            unsafe extern "C" fn inner($stack: &mut $crate::arch::x86::interrupt::handler::InterruptErrorStack) {
+                let _guard;
+
+                // Only call set_process_regs if this error occurred from userspace. If the fault
+                // originated from kernel mode, we have no idea what locks it might hold (and
+                // kernel mode faults are never meant to occur unless something is already wrong,
+                // and will not context switch anyway, so saving the registers would be useless
+                // there).
+                //
+                // Check the privilege level of CS against ring 3.
+                if $stack.inner.iret.cs & 0b11 == 0b11 {
+                    _guard = $crate::ptrace::set_process_regs(&mut $stack.inner);
+                }
+
+                #[allow(unused_unsafe)]
+                unsafe {
+                    $code
+                }
+            }
+
+            core::arch::asm!(concat!(
+                swapgs_iff_ring3_fast_errorcode!(),
+                // Move eax into code's place, put code in last instead (to be
+                // compatible with InterruptStack)
+                "xchg [esp], eax\n",
+
+                // Push all userspace registers
+                push_scratch!(),
+                push_preserved!(),
+
+                // Put code in, it's now in eax
+                "push eax\n",
+
+                // TODO: Map PTI
+                // $crate::arch::x86::pti::map();
+
+                // Call inner function with pointer to stack (cdecl passes arguments on the stack)
+                "
+                push esp
+                call {inner}
+                add esp, 4
+                ",
+
+                // TODO: Unmap PTI
+                // $crate::arch::x86::pti::unmap();
+
+                // Pop code (a single 4-byte stack slot on x86)
+                "add esp, 4\n",
+
+                // Restore all userspace registers
+                pop_preserved!(),
+                pop_scratch!(),
+
+                // The error code has already been popped, so use the regular macro.
+                swapgs_iff_ring3_fast!(),
+                "iretd\n",
+            ),
+
+            inner = sym inner,
+
+            options(noreturn));
+        }
+    };
+}
diff --git a/src/arch/x86/interrupt/ipi.rs b/src/arch/x86/interrupt/ipi.rs
new file mode 100644
index 0000000..52aad2a
--- /dev/null
+++ b/src/arch/x86/interrupt/ipi.rs
@@ -0,0 +1,30 @@
+use core::sync::atomic::Ordering;
+use x86::tlb;
+
+use crate::context;
+use crate::device::local_apic::LOCAL_APIC;
+use super::irq::PIT_TICKS;
+
+interrupt!(wakeup, || {
+    LOCAL_APIC.eoi();
+});
+
+interrupt!(tlb, || {
+    LOCAL_APIC.eoi();
+
+    tlb::flush_all();
+});
+
+interrupt!(switch, || {
+    LOCAL_APIC.eoi();
+
+    let _ = context::switch();
+});
+
+interrupt!(pit, || {
+    LOCAL_APIC.eoi();
+
+    if PIT_TICKS.fetch_add(1, Ordering::SeqCst) >= 10 {
+        let _ = context::switch();
+    }
+});
diff --git a/src/arch/x86/interrupt/irq.rs b/src/arch/x86/interrupt/irq.rs
new file mode 100644
index 0000000..c36844a
--- /dev/null
+++ b/src/arch/x86/interrupt/irq.rs
@@ -0,0 +1,325 @@
+use core::sync::atomic::{AtomicUsize, Ordering};
+
+use alloc::vec::Vec;
+
+use crate::{interrupt, interrupt_stack};
+use crate::context::timeout;
+use crate::device::{local_apic, ioapic, pic};
+use crate::device::serial::{COM1, COM2};
+use crate::ipi::{ipi, IpiKind, IpiTarget};
+use crate::scheme::debug::{debug_input, debug_notify};
+use crate::{context, time};
+
+// Resets to 0 in context::switch()
+#[thread_local]
+pub static PIT_TICKS: AtomicUsize = AtomicUsize::new(0);
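PIT_TICKS feeds the time-slice policy shared with ipi.rs: both the PIT handler further down and the Pit IPI attempt a context switch once ten ticks have accumulated since the last switch. The policy in isolation (sketch; the threshold comes from the handlers in this file):

    use core::sync::atomic::{AtomicUsize, Ordering};

    // Returns true when a reschedule should be attempted. fetch_add returns
    // the previous value, and the counter is reset in context::switch().
    fn should_preempt(ticks: &AtomicUsize) -> bool {
        ticks.fetch_add(1, Ordering::SeqCst) >= 10
    }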
+// The only way to read PS/2 data without race conditions is to allow a keyboard interrupt to
+// happen and then read the status register together with the data register, since keyboard data
+// overrides mouse data and reading the status register is not done atomically with reading the
+// data. This is not possible from userspace, so we do this minimal part of the PS/2 driver in the
+// kernel.
+#[inline(always)]
+unsafe fn ps2_interrupt(_index: usize) {
+    use crate::scheme::serio::serio_input;
+
+    let data: u8;
+    let status: u8;
+    core::arch::asm!("
+        sti
+        nop
+        cli
+        in al, 0x64
+        mov ah, al
+        in al, 0x60
+        mov {}, al
+        mov {}, ah
+        ",
+        out(reg_byte) data,
+        out(reg_byte) status,
+        // The template uses AL/AH internally, so keep the register allocator
+        // from handing out a piece of EAX for the outputs above.
+        out("eax") _,
+    );
+
+    if status & 1 != 0 {
+        let status_index = if status & (1 << 5) == 0 {
+            // Keyboard, according to status
+            0
+        } else {
+            // Mouse, according to status
+            1
+        };
+        serio_input(status_index, data);
+    }
+}
+
+#[repr(u8)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum IrqMethod {
+    Pic = 0,
+    Apic = 1,
+}
+
+static SPURIOUS_COUNT_IRQ7: AtomicUsize = AtomicUsize::new(0);
+static SPURIOUS_COUNT_IRQ15: AtomicUsize = AtomicUsize::new(0);
+
+pub fn spurious_count_irq7() -> usize {
+    SPURIOUS_COUNT_IRQ7.load(Ordering::Relaxed)
+}
+pub fn spurious_count_irq15() -> usize {
+    SPURIOUS_COUNT_IRQ15.load(Ordering::Relaxed)
+}
+pub fn spurious_count() -> usize {
+    spurious_count_irq7() + spurious_count_irq15()
+}
+pub fn spurious_irq_resource() -> syscall::Result<Vec<u8>> {
+    match irq_method() {
+        IrqMethod::Apic => Ok(Vec::from(&b"(not implemented for APIC yet)"[..])),
+        IrqMethod::Pic => {
+            Ok(format!("{}\tIRQ7\n{}\tIRQ15\n{}\ttotal\n", spurious_count_irq7(), spurious_count_irq15(), spurious_count()).into_bytes())
+        }
+    }
+}
+
+static IRQ_METHOD: AtomicUsize = AtomicUsize::new(IrqMethod::Pic as usize);
+
+pub fn set_irq_method(method: IrqMethod) {
+    IRQ_METHOD.store(method as usize, core::sync::atomic::Ordering::Release);
+}
+
+fn irq_method() -> IrqMethod {
+    let raw = IRQ_METHOD.load(core::sync::atomic::Ordering::Acquire);
+
+    match raw {
+        0 => IrqMethod::Pic,
+        1 => IrqMethod::Apic,
+        _ => unreachable!(),
+    }
+}
+
+extern {
+    // Triggers the irq scheme
+    fn irq_trigger(irq: u8);
+}
+
+/// Notify the IRQ scheme that an IRQ has been registered. This should mask the IRQ until the
+/// scheme user unmasks it ("acknowledges" it).
+unsafe fn trigger(irq: u8) {
+    match irq_method() {
+        IrqMethod::Pic => if irq < 16 { pic_mask(irq) },
+        IrqMethod::Apic => ioapic_mask(irq),
+    }
+    irq_trigger(irq);
+}
+
+/// Unmask the IRQ. This is called from the IRQ scheme, which does this when a user process has
+/// processed the IRQ.
+pub unsafe fn acknowledge(irq: usize) {
+    match irq_method() {
+        IrqMethod::Pic => if irq < 16 { pic_unmask(irq) },
+        IrqMethod::Apic => ioapic_unmask(irq),
+    }
+}
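The masking, unmasking and EOI helpers below all make the same routing decision for the two cascaded 8259 PICs. Distilled into one function (illustration only; the real helpers operate on pic::MASTER and pic::SLAVE):

    // Legacy IRQs 0..=7 are lines on the master PIC; 8..=15 live on the
    // slave, which is cascaded through master line 2, and are numbered
    // relative to IRQ 8.
    fn pic_route(irq: u8) -> (&'static str, u8) {
        assert!(irq < 16);
        if irq >= 8 { ("slave", irq - 8) } else { ("master", irq) }
    }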
+/// Sends an end-of-interrupt, so that the interrupt controller can go on to the next one.
+pub unsafe fn eoi(irq: u8) {
+    match irq_method() {
+        IrqMethod::Pic => if irq < 16 { pic_eoi(irq) },
+        IrqMethod::Apic => lapic_eoi(),
+    }
+}
+
+unsafe fn pic_mask(irq: u8) {
+    debug_assert!(irq < 16);
+
+    if irq >= 8 {
+        pic::SLAVE.mask_set(irq - 8);
+    } else {
+        pic::MASTER.mask_set(irq);
+    }
+}
+
+unsafe fn ioapic_mask(irq: u8) {
+    ioapic::mask(irq);
+}
+
+unsafe fn pic_eoi(irq: u8) {
+    debug_assert!(irq < 16);
+
+    if irq >= 8 {
+        pic::MASTER.ack();
+        pic::SLAVE.ack();
+    } else {
+        pic::MASTER.ack();
+    }
+}
+
+unsafe fn lapic_eoi() {
+    local_apic::LOCAL_APIC.eoi()
+}
+
+unsafe fn pic_unmask(irq: usize) {
+    debug_assert!(irq < 16);
+
+    if irq >= 8 {
+        pic::SLAVE.mask_clear(irq as u8 - 8);
+    } else {
+        pic::MASTER.mask_clear(irq as u8);
+    }
+}
+
+unsafe fn ioapic_unmask(irq: usize) {
+    ioapic::unmask(irq as u8);
+}
+
+interrupt_stack!(pit_stack, |_stack| {
+    // Saves CPU time by not sending the IRQ event for every timer tick:
+    // irq_trigger(0);
+
+    const PIT_RATE: u64 = 2_250_286;
+
+    {
+        let mut offset = time::OFFSET.lock();
+        let sum = offset.1 + PIT_RATE;
+        offset.1 = sum % 1_000_000_000;
+        offset.0 += sum / 1_000_000_000;
+    }
+
+    eoi(0);
+
+    // Wake up other CPUs
+    ipi(IpiKind::Pit, IpiTarget::Other);
+
+    // Any better way of doing this?
+    timeout::trigger();
+
+    if PIT_TICKS.fetch_add(1, Ordering::SeqCst) >= 10 {
+        let _ = context::switch();
+    }
+});
+
+interrupt!(keyboard, || {
+    ps2_interrupt(0);
+    eoi(1);
+});
+
+interrupt!(cascade, || {
+    // No need to do any operations on cascade
+    eoi(2);
+});
+
+interrupt!(com2, || {
+    while let Some(c) = COM2.lock().receive() {
+        debug_input(c);
+    }
+    debug_notify();
+    eoi(3);
+});
+
+interrupt!(com1, || {
+    while let Some(c) = COM1.lock().receive() {
+        debug_input(c);
+    }
+    debug_notify();
+    eoi(4);
+});
+
+interrupt!(lpt2, || {
+    trigger(5);
+    eoi(5);
+});
+
+interrupt!(floppy, || {
+    trigger(6);
+    eoi(6);
+});
+
+interrupt!(lpt1, || {
+    if irq_method() == IrqMethod::Pic && pic::MASTER.isr() & (1 << 7) == 0 {
+        // The IRQ was spurious: ignore it, but increment a counter.
+        SPURIOUS_COUNT_IRQ7.fetch_add(1, Ordering::Relaxed);
+        return;
+    }
+    trigger(7);
+    eoi(7);
+});
+
+interrupt!(rtc, || {
+    trigger(8);
+    eoi(8);
+});
+
+interrupt!(pci1, || {
+    trigger(9);
+    eoi(9);
+});
+
+interrupt!(pci2, || {
+    trigger(10);
+    eoi(10);
+});
+
+interrupt!(pci3, || {
+    trigger(11);
+    eoi(11);
+});
+
+interrupt!(mouse, || {
+    ps2_interrupt(1);
+    eoi(12);
+});
+
+interrupt!(fpu, || {
+    trigger(13);
+    eoi(13);
+});
+
+interrupt!(ata1, || {
+    trigger(14);
+    eoi(14);
+});
+
+interrupt!(ata2, || {
+    if irq_method() == IrqMethod::Pic && pic::SLAVE.isr() & (1 << 7) == 0 {
+        SPURIOUS_COUNT_IRQ15.fetch_add(1, Ordering::Relaxed);
+        pic::MASTER.ack();
+        return
+    }
+    trigger(15);
+    eoi(15);
+});
+
+interrupt!(lapic_timer, || {
+    println!("Local apic timer interrupt");
+    lapic_eoi();
+});
+
+interrupt!(lapic_error, || {
+    println!("Local apic internal error: ESR={:#x}", local_apic::LOCAL_APIC.esr());
+    lapic_eoi();
+});
+
+interrupt!(calib_pit, || {
+    const PIT_RATE: u64 = 2_250_286;
+
+    {
+        let mut offset = time::OFFSET.lock();
+        let sum = offset.1 + PIT_RATE;
+        offset.1 = sum % 1_000_000_000;
+        offset.0 += sum / 1_000_000_000;
+    }
+
+    eoi(0);
+});
+// XXX: This would look way prettier using const generics.
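pit_stack and calib_pit above share the same wall-clock bookkeeping: time is kept as a (seconds, nanoseconds) pair and each tick adds PIT_RATE nanoseconds with a manual carry. The carry step on its own (sketch):

    const PIT_RATE: u64 = 2_250_286; // nanoseconds per PIT tick, ~2.25 ms

    // offset.0 = whole seconds, offset.1 = nanoseconds below one second
    fn accumulate_tick(offset: &mut (u64, u64)) {
        let sum = offset.1 + PIT_RATE;
        offset.1 = sum % 1_000_000_000;
        offset.0 += sum / 1_000_000_000;
    }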
+macro_rules! allocatable_irq(
+    ( $idt:expr, $number:literal, $name:ident ) => {
+        interrupt!($name, || {
+            allocatable_irq_generic($number);
+        });
+    }
+);
+
+pub unsafe fn allocatable_irq_generic(number: u8) {
+    irq_trigger(number - 32);
+    lapic_eoi();
+}
+
+define_default_irqs!();
diff --git a/src/arch/x86/interrupt/mod.rs b/src/arch/x86/interrupt/mod.rs
new file mode 100644
index 0000000..ebd013f
--- /dev/null
+++ b/src/arch/x86/interrupt/mod.rs
@@ -0,0 +1,57 @@
+//! Interrupt instructions
+
+#[macro_use]
+pub mod handler;
+
+pub mod exception;
+pub mod ipi;
+pub mod irq;
+pub mod syscall;
+pub mod trace;
+
+pub use self::handler::InterruptStack;
+pub use self::trace::stack_trace;
+
+pub use super::idt::{available_irqs_iter, is_reserved, set_reserved};
+pub use super::device::local_apic::bsp_apic_id;
+
+/// Clear interrupts
+#[inline(always)]
+pub unsafe fn disable() {
+    core::arch::asm!("cli", options(nomem, nostack));
+}
+
+/// Set interrupts
+#[inline(always)]
+pub unsafe fn enable() {
+    core::arch::asm!("sti", options(nomem, nostack));
+}
+
+/// Set interrupts and halt
+/// This will atomically wait for the next interrupt
+/// Performing enable followed by halt is not guaranteed to be atomic, use this instead!
+#[inline(always)]
+pub unsafe fn enable_and_halt() {
+    core::arch::asm!("sti; hlt", options(nomem, nostack));
+}
+
+/// Set interrupts and nop
+/// This will enable interrupts and execute a nop, giving pending interrupts a chance to trigger
+/// Simply enabling interrupts does not guarantee that they will trigger, use this instead!
+#[inline(always)]
+pub unsafe fn enable_and_nop() {
+    core::arch::asm!("sti; nop", options(nomem, nostack));
+}
+
+/// Halt instruction
+#[inline(always)]
+pub unsafe fn halt() {
+    core::arch::asm!("hlt", options(nomem, nostack));
+}
+
+/// Pause instruction
+/// Safe because it is similar to a NOP, and has no memory effects
+#[inline(always)]
+pub fn pause() {
+    unsafe { core::arch::asm!("pause", options(nomem, nostack)); }
+}
diff --git a/src/arch/x86/interrupt/syscall.rs b/src/arch/x86/interrupt/syscall.rs
new file mode 100644
index 0000000..6b48d2c
--- /dev/null
+++ b/src/arch/x86/interrupt/syscall.rs
@@ -0,0 +1,57 @@
+use crate::{
+    ptrace,
+    syscall,
+    syscall::flag::{PTRACE_FLAG_IGNORE, PTRACE_STOP_PRE_SYSCALL, PTRACE_STOP_POST_SYSCALL},
+};
+
+pub unsafe fn init() {}
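The handler below unpacks its arguments according to this port's int 0x80 convention: eax carries the syscall number in and the return value out, while ebx, ecx, edx, esi and edi carry up to five arguments. A hypothetical caller-side sketch, inferred from the handler (real userspace goes through the redox_syscall crate; using ebx as an operand assumes a non-PIC build, as used for the kernel):

    unsafe fn raw_syscall(mut num: usize, b: usize, c: usize, d: usize, e: usize, f: usize) -> usize {
        core::arch::asm!(
            "int 0x80",
            inout("eax") num, // syscall number in, result out
            in("ebx") b,
            in("ecx") c,
            in("edx") d,
            in("esi") e,
            in("edi") f,
        );
        num
    }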
+macro_rules! with_interrupt_stack {
+    (|$stack:ident| $code:block) => {{
+        let allowed = ptrace::breakpoint_callback(PTRACE_STOP_PRE_SYSCALL, None)
+            .and_then(|_| ptrace::next_breakpoint().map(|f| !f.contains(PTRACE_FLAG_IGNORE)));
+
+        if allowed.unwrap_or(true) {
+            // If the syscall is `clone`, the clone won't return here. Instead,
+            // it'll return early and leave any undropped values. This is
+            // actually GOOD, because any references are at that point UB
+            // anyway, because they are based on the wrong stack.
+            let $stack = &mut *$stack;
+            (*$stack).scratch.eax = $code;
+        }
+
+        ptrace::breakpoint_callback(PTRACE_STOP_POST_SYSCALL, None);
+    }}
+}
+
+interrupt_stack!(syscall, |stack| {
+    with_interrupt_stack!(|stack| {
+        let scratch = &stack.scratch;
+        let preserved = &stack.preserved;
+        syscall::syscall(scratch.eax, preserved.ebx, scratch.ecx, scratch.edx, preserved.esi, preserved.edi, stack)
+    })
+});
+
+#[naked]
+pub unsafe extern "C" fn clone_ret() {
+    core::arch::asm!(concat!(
+        // The address of this instruction is injected by `clone` in process.rs, on
+        // top of the stack syscall->inner in this file, which is done using the ebp
+        // register we save there.
+        //
+        // The top of our stack here is the address pointed to by ebp, which is:
+        //
+        // - the previous ebp
+        // - the return location
+        //
+        // Our goal is to return from the parent function, inner, so we restore
+        // ebp...
+        "pop ebp\n",
+        // ...and we return to the address at the top of the stack
+        "ret\n",
+    ), options(noreturn));
+}
diff --git a/src/arch/x86/interrupt/trace.rs b/src/arch/x86/interrupt/trace.rs
new file mode 100644
index 0000000..0acfcad
--- /dev/null
+++ b/src/arch/x86/interrupt/trace.rs
@@ -0,0 +1,95 @@
+use core::{mem, str};
+
+use goblin::elf::sym;
+use rustc_demangle::demangle;
+
+use crate::{context, paging::{KernelMapper, VirtualAddress}};
+
+/// Get a stack trace
+//TODO: Check for stack being mapped before dereferencing
+#[inline(never)]
+pub unsafe fn stack_trace() {
+    let mut ebp: usize;
+    core::arch::asm!("mov {}, ebp", out(reg) ebp);
+
+    println!("TRACE: {:>016X}", ebp);
+    //Maximum 64 frames
+
+    let mapper = KernelMapper::lock();
+
+    for _frame in 0..64 {
+        if let Some(eip_ebp) = ebp.checked_add(mem::size_of::<usize>()) {
+            let ebp_virt = VirtualAddress::new(ebp);
+            let eip_ebp_virt = VirtualAddress::new(eip_ebp);
+            if mapper.translate(ebp_virt).is_some() && mapper.translate(eip_ebp_virt).is_some() {
+                let eip = *(eip_ebp as *const usize);
+                if eip == 0 {
+                    println!(" {:>016X}: EMPTY RETURN", ebp);
+                    break;
+                }
+                println!(" {:>016X}: {:>016X}", ebp, eip);
+                ebp = *(ebp as *const usize);
+                symbol_trace(eip);
+            } else {
+                println!(" {:>016X}: GUARD PAGE", ebp);
+                break;
+            }
+        } else {
+            println!(" {:>016X}: EBP OVERFLOW", ebp);
+            break;
+        }
+    }
+}
+
+/// Get a symbol
+//TODO: Do not create Elf object for every symbol lookup
+#[inline(never)]
+pub unsafe fn symbol_trace(addr: usize) {
+    use core::slice;
+    use core::sync::atomic::Ordering;
+
+    use crate::elf::Elf;
+    use crate::start::{KERNEL_BASE, KERNEL_SIZE};
+
+    let kernel_ptr = (KERNEL_BASE.load(Ordering::SeqCst) + crate::PHYS_OFFSET) as *const u8;
+    let kernel_slice = slice::from_raw_parts(kernel_ptr, KERNEL_SIZE.load(Ordering::SeqCst));
+    if let Ok(elf) = Elf::from(kernel_slice) {
+        let mut strtab_opt = None;
+        for section in elf.sections() {
+            if section.sh_type == ::goblin::elf::section_header::SHT_STRTAB {
+                strtab_opt = Some(section);
+                break;
+            }
+        }
+
+        if let Some(symbols) = elf.symbols() {
+            for sym in symbols {
+                if sym::st_type(sym.st_info) == sym::STT_FUNC
+                    && addr >= sym.st_value as usize
+                    && addr < (sym.st_value + sym.st_size) as usize
+                {
+                    println!(" {:>016X}+{:>04X}", sym.st_value, addr - sym.st_value as usize);
+
+                    if let Some(strtab) = strtab_opt {
+                        let start = strtab.sh_offset as usize + sym.st_name as usize;
+                        let mut end = start;
+                        while end < elf.data.len() {
+                            let b = elf.data[end];
+                            end += 1;
+                            if b == 0 {
+                                break;
+                            }
+                        }
+
+                        if end > start {
+                            let sym_slice = &elf.data[start .. end - 1];
+                            if let Ok(sym_name) = str::from_utf8(sym_slice) {
+                                println!(" {:#}", demangle(sym_name));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/src/arch/x86/ipi.rs b/src/arch/x86/ipi.rs
new file mode 100644
index 0000000..28d21cd
--- /dev/null
+++ b/src/arch/x86/ipi.rs
@@ -0,0 +1,29 @@
+#[derive(Clone, Copy, Debug)]
+#[repr(u8)]
+pub enum IpiKind {
+    Wakeup = 0x40,
+    Tlb = 0x41,
+    Switch = 0x42,
+    Pit = 0x43,
+}
+
+#[derive(Clone, Copy, Debug)]
+#[repr(u8)]
+pub enum IpiTarget {
+    Current = 1,
+    All = 2,
+    Other = 3,
+}
+
+#[cfg(not(feature = "multi_core"))]
+#[inline(always)]
+pub fn ipi(_kind: IpiKind, _target: IpiTarget) {}
+
+#[cfg(feature = "multi_core")]
+#[inline(always)]
+pub fn ipi(kind: IpiKind, target: IpiTarget) {
+    use crate::device::local_apic::LOCAL_APIC;
+
+    let icr = (target as u64) << 18 | 1 << 14 | (kind as u64);
+    unsafe { LOCAL_APIC.set_icr(icr) };
+}
diff --git a/src/arch/x86/macros.rs b/src/arch/x86/macros.rs
new file mode 100644
index 0000000..5f888cb
--- /dev/null
+++ b/src/arch/x86/macros.rs
@@ -0,0 +1,69 @@
+/// Print to console
+#[macro_export]
+macro_rules! print {
+    ($($arg:tt)*) => ({
+        use core::fmt::Write;
+        let _ = write!($crate::arch::debug::Writer::new(), $($arg)*);
+    });
+}
+
+/// Print with new line to console
+#[macro_export]
+macro_rules! println {
+    () => (print!("\n"));
+    ($fmt:expr) => (print!(concat!($fmt, "\n")));
+    ($fmt:expr, $($arg:tt)*) => (print!(concat!($fmt, "\n"), $($arg)*));
+}
+
+#[macro_export]
+macro_rules! irqs(
+    ( [ $( ($idt:expr, $number:literal, $name:ident) ,)* ], $submac:ident ) => {
+        $(
+            $submac!($idt, $number, $name);
+        )*
+    }
+);
+
+// Define the IRQ numbers listed below as handler functions named irq_NUM,
+// via the given submacro (e.g. allocatable_irq).
+#[macro_export]
+macro_rules! default_irqs(
+    ($idt:expr, $submac:ident) => {
+        irqs!([
+            // Interrupt vectors below 32 are exceptions.
+            // Vectors 32..=47 are used for standard 8259 PIC IRQs.
+            // 48 and 49 are used for the local APIC timer and error register, respectively.
+ ($idt, 50, irq_50), ($idt, 51, irq_51), ($idt, 52, irq_52), ($idt, 53, irq_53), ($idt, 54, irq_54), ($idt, 55, irq_55), ($idt, 56, irq_56), ($idt, 57, irq_57), ($idt, 58, irq_58), ($idt, 59, irq_59), + ($idt, 60, irq_60), ($idt, 61, irq_61), ($idt, 62, irq_62), ($idt, 63, irq_63), + // 64..=67 used for IPI + ($idt, 68, irq_68), ($idt, 69, irq_69), + ($idt, 70, irq_70), ($idt, 71, irq_71), ($idt, 72, irq_72), ($idt, 73, irq_73), ($idt, 74, irq_74), ($idt, 75, irq_75), ($idt, 76, irq_76), ($idt, 77, irq_77), ($idt, 78, irq_78), ($idt, 79, irq_79), + ($idt, 80, irq_80), ($idt, 81, irq_81), ($idt, 82, irq_82), ($idt, 83, irq_83), ($idt, 84, irq_84), ($idt, 85, irq_85), ($idt, 86, irq_86), ($idt, 87, irq_87), ($idt, 88, irq_88), ($idt, 89, irq_89), + ($idt, 90, irq_90), ($idt, 91, irq_91), ($idt, 92, irq_92), ($idt, 93, irq_93), ($idt, 94, irq_94), ($idt, 95, irq_95), ($idt, 96, irq_96), ($idt, 97, irq_97), ($idt, 98, irq_98), ($idt, 99, irq_99), + ($idt, 100, irq_100), ($idt, 101, irq_101), ($idt, 102, irq_102), ($idt, 103, irq_103), ($idt, 104, irq_104), ($idt, 105, irq_105), ($idt, 106, irq_106), ($idt, 107, irq_107), ($idt, 108, irq_108), ($idt, 109, irq_109), + ($idt, 110, irq_110), ($idt, 111, irq_111), ($idt, 112, irq_112), ($idt, 113, irq_113), ($idt, 114, irq_114), ($idt, 115, irq_115), ($idt, 116, irq_116), ($idt, 117, irq_117), ($idt, 118, irq_118), ($idt, 119, irq_119), + ($idt, 120, irq_120), ($idt, 121, irq_121), ($idt, 122, irq_122), ($idt, 123, irq_123), ($idt, 124, irq_124), ($idt, 125, irq_125), ($idt, 126, irq_126), ($idt, 127, irq_127), + // 128 is used for software interrupts + ($idt, 129, irq_129), + ($idt, 130, irq_130), ($idt, 131, irq_131), ($idt, 132, irq_132), ($idt, 133, irq_133), ($idt, 134, irq_134), ($idt, 135, irq_135), ($idt, 136, irq_136), ($idt, 137, irq_137), ($idt, 138, irq_138), ($idt, 139, irq_139), + ($idt, 140, irq_140), ($idt, 141, irq_141), ($idt, 142, irq_142), ($idt, 143, irq_143), ($idt, 144, irq_144), ($idt, 145, irq_145), ($idt, 146, irq_146), ($idt, 147, irq_147), ($idt, 148, irq_148), ($idt, 149, irq_149), + ($idt, 150, irq_150), ($idt, 151, irq_151), ($idt, 152, irq_152), ($idt, 153, irq_153), ($idt, 154, irq_154), ($idt, 155, irq_155), ($idt, 156, irq_156), ($idt, 157, irq_157), ($idt, 158, irq_158), ($idt, 159, irq_159), + ($idt, 160, irq_160), ($idt, 161, irq_161), ($idt, 162, irq_162), ($idt, 163, irq_163), ($idt, 164, irq_164), ($idt, 165, irq_165), ($idt, 166, irq_166), ($idt, 167, irq_167), ($idt, 168, irq_168), ($idt, 169, irq_169), + ($idt, 170, irq_170), ($idt, 171, irq_171), ($idt, 172, irq_172), ($idt, 173, irq_173), ($idt, 174, irq_174), ($idt, 175, irq_175), ($idt, 176, irq_176), ($idt, 177, irq_177), ($idt, 178, irq_178), ($idt, 179, irq_179), + ($idt, 180, irq_180), ($idt, 181, irq_181), ($idt, 182, irq_182), ($idt, 183, irq_183), ($idt, 184, irq_184), ($idt, 185, irq_185), ($idt, 186, irq_186), ($idt, 187, irq_187), ($idt, 188, irq_188), ($idt, 189, irq_189), + ($idt, 190, irq_190), ($idt, 191, irq_191), ($idt, 192, irq_192), ($idt, 193, irq_193), ($idt, 194, irq_194), ($idt, 195, irq_195), ($idt, 196, irq_196), ($idt, 197, irq_197), ($idt, 198, irq_198), ($idt, 199, irq_199), + ($idt, 200, irq_200), ($idt, 201, irq_201), ($idt, 202, irq_202), ($idt, 203, irq_203), ($idt, 204, irq_204), ($idt, 205, irq_205), ($idt, 206, irq_206), ($idt, 207, irq_207), ($idt, 208, irq_208), ($idt, 209, irq_209), + ($idt, 210, irq_210), ($idt, 211, irq_211), ($idt, 212, irq_212), ($idt, 213, irq_213), ($idt, 214, irq_214), ($idt, 215, irq_215), 
($idt, 216, irq_216), ($idt, 217, irq_217), ($idt, 218, irq_218), ($idt, 219, irq_219), + ($idt, 220, irq_220), ($idt, 221, irq_221), ($idt, 222, irq_222), ($idt, 223, irq_223), ($idt, 224, irq_224), ($idt, 225, irq_225), ($idt, 226, irq_226), ($idt, 227, irq_227), ($idt, 228, irq_228), ($idt, 229, irq_229), + ($idt, 230, irq_230), ($idt, 231, irq_231), ($idt, 232, irq_232), ($idt, 233, irq_233), ($idt, 234, irq_234), ($idt, 235, irq_235), ($idt, 236, irq_236), ($idt, 237, irq_237), ($idt, 238, irq_238), ($idt, 239, irq_239), + ($idt, 240, irq_240), ($idt, 241, irq_241), ($idt, 242, irq_242), ($idt, 243, irq_243), ($idt, 244, irq_244), ($idt, 245, irq_245), ($idt, 246, irq_246), ($idt, 247, irq_247), ($idt, 248, irq_248), ($idt, 249, irq_249), + ($idt, 250, irq_250), ($idt, 251, irq_251), ($idt, 252, irq_252), ($idt, 253, irq_253), ($idt, 254, irq_254), ($idt, 255, irq_255), + ], $submac); + } +); + +macro_rules! define_default_irqs( + () => { + default_irqs!((), allocatable_irq); + } +); diff --git a/src/arch/x86/mod.rs b/src/arch/x86/mod.rs new file mode 100644 index 0000000..358d03d --- /dev/null +++ b/src/arch/x86/mod.rs @@ -0,0 +1,54 @@ +#[macro_use] +pub mod macros; + +/// Constants like memory locations +pub mod consts; + +/// CPUID wrapper +pub mod cpuid; + +/// Debugging support +pub mod debug; + +/// Devices +pub mod device; + +/// Global descriptor table +pub mod gdt; + +/// Graphical debug +#[cfg(feature = "graphical_debug")] +mod graphical_debug; + +/// Interrupt instructions +#[macro_use] +pub mod interrupt; + +/// Interrupt descriptor table +pub mod idt; + +/// Inter-processor interrupts +pub mod ipi; + +/// Paging +pub mod paging; + +/// Page table isolation +pub mod pti; + +pub mod rmm; + +/// Initialization and start function +pub mod start; + +/// Stop function +pub mod stop; + +pub use ::rmm::X86Arch as CurrentRmmArch; + +// Flags +pub mod flags { + pub const SHIFT_SINGLESTEP: usize = 8; + pub const FLAG_SINGLESTEP: usize = 1 << SHIFT_SINGLESTEP; + pub const FLAG_INTERRUPTS: usize = 1 << 9; +} diff --git a/src/arch/x86/paging/entry.rs b/src/arch/x86/paging/entry.rs new file mode 100644 index 0000000..4092ab1 --- /dev/null +++ b/src/arch/x86/paging/entry.rs @@ -0,0 +1,80 @@ +//! # Page table entry +//! Some code borrowed from [Phil Opp's Blog](http://os.phil-opp.com/modifying-page-tables.html) + +use crate::memory::Frame; + +use super::{PageFlags, PhysicalAddress, RmmA, RmmArch}; + +/// A page table entry +#[repr(packed(8))] +pub struct Entry(u64); + +bitflags! { + pub struct EntryFlags: usize { + const NO_CACHE = 1 << 4; + const HUGE_PAGE = 1 << 7; + const GLOBAL = 1 << 8; + } +} + +pub const COUNTER_MASK: u64 = 0x3ff0_0000_0000_0000; + +impl Entry { + /// Clear entry + pub fn set_zero(&mut self) { + self.0 = 0; + } + + /// Is the entry unused? 
+    pub fn is_unused(&self) -> bool {
+        self.0 == (self.0 & COUNTER_MASK)
+    }
+
+    /// Make the entry unused
+    pub fn set_unused(&mut self) {
+        self.0 &= COUNTER_MASK;
+    }
+
+    /// Get the address this page references
+    pub fn address(&self) -> PhysicalAddress {
+        PhysicalAddress::new(self.0 as usize & RmmA::PAGE_ADDRESS_MASK)
+    }
+
+    /// Get the current entry flags
+    pub fn flags(&self) -> PageFlags<RmmA> {
+        unsafe { PageFlags::from_data((self.0 as usize & RmmA::ENTRY_FLAGS_MASK) & !(COUNTER_MASK as usize)) }
+    }
+
+    /// Get the associated frame, if available
+    pub fn pointed_frame(&self) -> Option<Frame> {
+        if self.flags().has_present() {
+            Some(Frame::containing_address(self.address()))
+        } else {
+            None
+        }
+    }
+
+    pub fn set(&mut self, frame: Frame, flags: PageFlags<RmmA>) {
+        debug_assert!(frame.start_address().data() & !RmmA::PAGE_ADDRESS_MASK == 0);
+        self.0 = (frame.start_address().data() as u64) | (flags.data() as u64) | (self.0 & COUNTER_MASK);
+    }
+
+    /// Get bits 52-61 in entry, used as counter for page table
+    pub fn counter_bits(&self) -> u64 {
+        (self.0 & COUNTER_MASK) >> 52
+    }
+
+    /// Set bits 52-61 in entry, used as counter for page table
+    pub fn set_counter_bits(&mut self, count: u64) {
+        self.0 = (self.0 & !COUNTER_MASK) | (count << 52);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn entry_has_required_arch_alignment() {
+        use super::Entry;
+        assert!(
+            core::mem::align_of::<Entry>() >= core::mem::align_of::<u64>(),
+            "alignment of Entry is less than the required alignment of u64 ({} < {})",
+            core::mem::align_of::<Entry>(),
+            core::mem::align_of::<u64>()
+        );
+    }
+}
diff --git a/src/arch/x86/paging/mapper.rs b/src/arch/x86/paging/mapper.rs
new file mode 100644
index 0000000..9f7659b
--- /dev/null
+++ b/src/arch/x86/paging/mapper.rs
@@ -0,0 +1,23 @@
+use crate::ipi::{ipi, IpiKind, IpiTarget};
+
+use super::RmmA;
+
+pub use rmm::{Flusher, PageFlush, PageFlushAll};
+
+pub struct InactiveFlusher { _inner: () }
+impl InactiveFlusher {
+    // TODO: cpu id
+    pub fn new() -> Self { Self { _inner: () } }
+}
+
+impl Flusher<RmmA> for InactiveFlusher {
+    fn consume(&mut self, flush: PageFlush<RmmA>) {
+        // TODO: Push to TLB "mailbox" or tell it to reload CR3 if there are too many entries.
+        unsafe { flush.ignore(); }
+    }
+}
+impl Drop for InactiveFlusher {
+    fn drop(&mut self) {
+        ipi(IpiKind::Tlb, IpiTarget::Other);
+    }
+}
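InactiveFlusher above turns page-level flushes on a non-current table into a single deferred shootdown: every consumed flush is ignored, and one Tlb IPI goes out when the flusher is dropped. A usage sketch under that assumption (hypothetical helper; real callers sit in the address-space management code):

    // Batch edits to an inactive address space; other CPUs reload their
    // TLBs exactly once, when the flusher is dropped at the end of scope.
    fn edit_inactive_space(f: impl FnOnce(&mut InactiveFlusher)) {
        let mut flusher = InactiveFlusher::new();
        f(&mut flusher);
        // drop(flusher) here sends IpiKind::Tlb to the other CPUs
    }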
diff --git a/src/arch/x86/paging/mod.rs b/src/arch/x86/paging/mod.rs
new file mode 100644
index 0000000..73db998
--- /dev/null
+++ b/src/arch/x86/paging/mod.rs
@@ -0,0 +1,262 @@
+//! # Paging
+//! Some code was borrowed from [Phil Opp's Blog](http://os.phil-opp.com/modifying-page-tables.html)
+
+use core::{mem, ptr};
+use x86::msr;
+
+use self::entry::EntryFlags;
+use self::mapper::PageFlushAll;
+
+pub use rmm::{
+    Arch as RmmArch,
+    Flusher,
+    PageFlags,
+    PhysicalAddress,
+    TableKind,
+    VirtualAddress,
+    X86Arch as RmmA,
+};
+
+pub type PageMapper = rmm::PageMapper<RmmA, crate::rmm::LockedAllocator>;
+pub use crate::rmm::KernelMapper;
+
+pub mod entry;
+pub mod mapper;
+
+/// Number of entries per page table
+pub const ENTRY_COUNT: usize = RmmA::PAGE_ENTRIES;
+
+/// Size of pages
+pub const PAGE_SIZE: usize = RmmA::PAGE_SIZE;
+
+/// Setup page attribute table
+unsafe fn init_pat() {
+    let uncacheable = 0;
+    let write_combining = 1;
+    let write_through = 4;
+    //let write_protected = 5;
+    let write_back = 6;
+    let uncached = 7;
+
+    let pat0 = write_back;
+    let pat1 = write_through;
+    let pat2 = uncached;
+    let pat3 = uncacheable;
+
+    let pat4 = write_combining;
+    let pat5 = pat1;
+    let pat6 = pat2;
+    let pat7 = pat3;
+
+    msr::wrmsr(
+        msr::IA32_PAT,
+        pat7 << 56
+            | pat6 << 48
+            | pat5 << 40
+            | pat4 << 32
+            | pat3 << 24
+            | pat2 << 16
+            | pat1 << 8
+            | pat0,
+    );
+}
+
+/// Map percpu
+unsafe fn map_percpu(cpu_id: usize, mapper: &mut PageMapper) -> PageFlushAll<RmmA> {
+    extern "C" {
+        /// The starting byte of the thread data segment
+        static mut __tdata_start: u8;
+        /// The ending byte of the thread data segment
+        static mut __tdata_end: u8;
+        /// The starting byte of the thread BSS segment
+        static mut __tbss_start: u8;
+        /// The ending byte of the thread BSS segment
+        static mut __tbss_end: u8;
+    }
+
+    let size = &__tbss_end as *const _ as usize - &__tdata_start as *const _ as usize;
+    let start = crate::KERNEL_PERCPU_OFFSET + crate::KERNEL_PERCPU_SIZE * cpu_id;
+    let end = start + size;
+
+    let mut flush_all = PageFlushAll::new();
+    let start_page = Page::containing_address(VirtualAddress::new(start));
+    let end_page = Page::containing_address(VirtualAddress::new(end - 1));
+    for page in Page::range_inclusive(start_page, end_page) {
+        let result = mapper.map(
+            page.start_address(),
+            PageFlags::new().write(true).custom_flag(EntryFlags::GLOBAL.bits(), cfg!(not(feature = "pti"))),
+        )
+        .expect("failed to allocate page table frames while mapping percpu");
+        flush_all.consume(result);
+    }
+    flush_all
+}
+
+/// Copy tdata, clear tbss, set TCB self pointer
+unsafe fn init_tcb(cpu_id: usize) -> usize {
+    extern "C" {
+        /// The starting byte of the thread data segment
+        static mut __tdata_start: u8;
+        /// The ending byte of the thread data segment
+        static mut __tdata_end: u8;
+        /// The starting byte of the thread BSS segment
+        static mut __tbss_start: u8;
+        /// The ending byte of the thread BSS segment
+        static mut __tbss_end: u8;
+    }
+
+    let tcb_offset;
+    {
+        let size = &__tbss_end as *const _ as usize - &__tdata_start as *const _ as usize;
+        let tbss_offset = &__tbss_start as *const _ as usize - &__tdata_start as *const _ as usize;
+
+        let start = crate::KERNEL_PERCPU_OFFSET + crate::KERNEL_PERCPU_SIZE * cpu_id;
+        let end = start + size;
+        tcb_offset = end - mem::size_of::<usize>();
+
+        ptr::copy(&__tdata_start as *const u8, start as *mut u8, tbss_offset);
+        ptr::write_bytes((start + tbss_offset) as *mut u8, 0, size - tbss_offset);
+
+        *(tcb_offset as *mut usize) = end;
+    }
+    tcb_offset
+}
+
+/// Initialize paging
+///
+/// Returns page table and thread control block offset
+pub unsafe fn init(
+    cpu_id: usize,
+) -> usize {
+    extern "C" {
+        /// The starting byte of the text (code) data segment.
+ static mut __text_start: u8; + /// The ending byte of the text (code) data segment. + static mut __text_end: u8; + /// The starting byte of the _.rodata_ (read-only data) segment. + static mut __rodata_start: u8; + /// The ending byte of the _.rodata_ (read-only data) segment. + static mut __rodata_end: u8; + /// The starting byte of the _.data_ segment. + static mut __data_start: u8; + /// The ending byte of the _.data_ segment. + static mut __data_end: u8; + /// The starting byte of the thread data segment + static mut __tdata_start: u8; + /// The ending byte of the thread data segment + static mut __tdata_end: u8; + /// The starting byte of the thread BSS segment + static mut __tbss_start: u8; + /// The ending byte of the thread BSS segment + static mut __tbss_end: u8; + /// The starting byte of the _.bss_ (uninitialized data) segment. + static mut __bss_start: u8; + /// The ending byte of the _.bss_ (uninitialized data) segment. + static mut __bss_end: u8; + } + + init_pat(); + + let flush_all = map_percpu(cpu_id, KernelMapper::lock_manually(cpu_id).get_mut().expect("expected KernelMapper not to be locked re-entrant in paging::init")); + flush_all.flush(); + + return init_tcb(cpu_id); +} + +pub unsafe fn init_ap( + cpu_id: usize, + bsp_table: &mut KernelMapper, +) -> usize { + init_pat(); + + { + let flush_all = map_percpu(cpu_id, bsp_table.get_mut().expect("KernelMapper locked re-entrant for AP")); + + // The flush can be ignored as this is not the active table. See later make_current(). + flush_all.ignore(); + }; + + bsp_table.make_current(); + + init_tcb(cpu_id) +} + +/// Page +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct Page { + number: usize, +} + +impl Page { + pub fn start_address(self) -> VirtualAddress { + VirtualAddress::new(self.number * PAGE_SIZE) + } + + pub fn p4_index(self) -> usize { + (self.number >> 27) & 0o777 + } + + pub fn p3_index(self) -> usize { + (self.number >> 18) & 0o777 + } + + pub fn p2_index(self) -> usize { + (self.number >> 9) & 0o777 + } + + pub fn p1_index(self) -> usize { + self.number & 0o777 + } + + pub fn containing_address(address: VirtualAddress) -> Page { + //TODO assert!(address.data() < 0x0000_8000_0000_0000 || address.data() >= 0xffff_8000_0000_0000, + // "invalid address: 0x{:x}", address.data()); + Page { + number: address.data() / PAGE_SIZE, + } + } + + pub fn range_inclusive(start: Page, r#final: Page) -> PageIter { + PageIter { start, end: r#final.next() } + } + pub fn range_exclusive(start: Page, end: Page) -> PageIter { + PageIter { start, end } + } + + pub fn next(self) -> Page { + self.next_by(1) + } + pub fn next_by(self, n: usize) -> Page { + Self { + number: self.number + n, + } + } +} + +pub struct PageIter { + start: Page, + end: Page, +} + +impl Iterator for PageIter { + type Item = Page; + + fn next(&mut self) -> Option { + if self.start < self.end { + let page = self.start; + self.start = self.start.next(); + Some(page) + } else { + None + } + } +} + +/// Round down to the nearest multiple of page size +pub fn round_down_pages(number: usize) -> usize { + number - number % PAGE_SIZE +} +/// Round up to the nearest multiple of page size +pub fn round_up_pages(number: usize) -> usize { + round_down_pages(number + PAGE_SIZE - 1) +} diff --git a/src/arch/x86/pti.rs b/src/arch/x86/pti.rs new file mode 100644 index 0000000..0f5cc07 --- /dev/null +++ b/src/arch/x86/pti.rs @@ -0,0 +1,86 @@ +#[cfg(feature = "pti")] +use core::ptr; + +#[cfg(feature = "pti")] +use crate::memory::Frame; +#[cfg(feature = 
"pti")] +use crate::paging::ActivePageTable; +#[cfg(feature = "pti")] +use crate::paging::entry::EntryFlags; + +#[cfg(feature = "pti")] +#[thread_local] +pub static mut PTI_CPU_STACK: [u8; 256] = [0; 256]; + +#[cfg(feature = "pti")] +#[thread_local] +pub static mut PTI_CONTEXT_STACK: usize = 0; + +#[cfg(feature = "pti")] +#[inline(always)] +unsafe fn switch_stack(old: usize, new: usize) { + let old_rsp: usize; + asm!("", out("rsp") old_rsp); + + let offset_rsp = old - old_rsp; + + let new_rsp = new - offset_rsp; + + ptr::copy_nonoverlapping( + old_rsp as *const u8, + new_rsp as *mut u8, + offset_rsp + ); + + asm!("", out("rsp") new_rsp); +} + +#[cfg(feature = "pti")] +#[inline(always)] +pub unsafe fn map() { + // { + // let mut active_table = unsafe { ActivePageTable::new() }; + // + // // Map kernel heap + // let address = active_table.p4()[::KERNEL_HEAP_PML4].address(); + // let frame = Frame::containing_address(address); + // let mut flags = active_table.p4()[::KERNEL_HEAP_PML4].flags(); + // flags.remove(EntryFlags::PRESENT); + // active_table.p4_mut()[::KERNEL_HEAP_PML4].set(frame, flags); + // + // // Reload page tables + // active_table.flush_all(); + // } + + // Switch to per-context stack + switch_stack(PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len(), PTI_CONTEXT_STACK); +} + +#[cfg(feature = "pti")] +#[inline(always)] +pub unsafe extern "C" fn unmap() { + // Switch to per-CPU stack + switch_stack(PTI_CONTEXT_STACK, PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len()); + + // { + // let mut active_table = unsafe { ActivePageTable::new() }; + // + // // Unmap kernel heap + // let address = active_table.p4()[::KERNEL_HEAP_PML4].address(); + // let frame = Frame::containing_address(address); + // let mut flags = active_table.p4()[::KERNEL_HEAP_PML4].flags(); + // flags.insert(EntryFlags::PRESENT); + // active_table.p4_mut()[::KERNEL_HEAP_PML4].set(frame, flags); + // + // // Reload page tables + // active_table.flush_all(); + // } +} + +#[cfg(not(feature = "pti"))] +#[inline(always)] +pub unsafe fn map() {} + +#[cfg(not(feature = "pti"))] +#[inline(always)] +pub unsafe extern "C" fn unmap() {} diff --git a/src/arch/x86/rmm.rs b/src/arch/x86/rmm.rs new file mode 100644 index 0000000..cc8faeb --- /dev/null +++ b/src/arch/x86/rmm.rs @@ -0,0 +1,455 @@ +use core::{ + cmp, + mem, + slice, + sync::atomic::{self, AtomicUsize, Ordering}, +}; +use rmm::{ + KILOBYTE, + MEGABYTE, + Arch, + BuddyAllocator, + BumpAllocator, + FrameAllocator, + FrameCount, + FrameUsage, + MemoryArea, + PageFlags, + PageMapper, + PhysicalAddress, + VirtualAddress, + X86Arch as RmmA, +}; + +use spin::{Mutex, MutexGuard}; + +extern "C" { + /// The starting byte of the text (code) data segment. + static mut __text_start: u8; + /// The ending byte of the text (code) data segment. + static mut __text_end: u8; + /// The starting byte of the _.rodata_ (read-only data) segment. + static mut __rodata_start: u8; + /// The ending byte of the _.rodata_ (read-only data) segment. 
diff --git a/src/arch/x86/rmm.rs b/src/arch/x86/rmm.rs
new file mode 100644
index 0000000..cc8faeb
--- /dev/null
+++ b/src/arch/x86/rmm.rs
@@ -0,0 +1,455 @@
+use core::{
+    cmp,
+    mem,
+    slice,
+    sync::atomic::{self, AtomicUsize, Ordering},
+};
+use rmm::{
+    KILOBYTE,
+    MEGABYTE,
+    Arch,
+    BuddyAllocator,
+    BumpAllocator,
+    FrameAllocator,
+    FrameCount,
+    FrameUsage,
+    MemoryArea,
+    PageFlags,
+    PageMapper,
+    PhysicalAddress,
+    VirtualAddress,
+    X86Arch as RmmA,
+};
+
+use spin::{Mutex, MutexGuard};
+
+extern "C" {
+    /// The starting byte of the text (code) data segment.
+    static mut __text_start: u8;
+    /// The ending byte of the text (code) data segment.
+    static mut __text_end: u8;
+    /// The starting byte of the _.rodata_ (read-only data) segment.
+    static mut __rodata_start: u8;
+    /// The ending byte of the _.rodata_ (read-only data) segment.
+    static mut __rodata_end: u8;
+}
+
+// Keep synced with OsMemoryKind in bootloader
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[repr(u64)]
+pub enum BootloaderMemoryKind {
+    Null = 0,
+    Free = 1,
+    Reclaim = 2,
+    Reserved = 3,
+}
+
+// Keep synced with OsMemoryEntry in bootloader
+#[derive(Clone, Copy, Debug)]
+#[repr(packed)]
+pub struct BootloaderMemoryEntry {
+    pub base: u64,
+    pub size: u64,
+    pub kind: BootloaderMemoryKind,
+}
+
+unsafe fn page_flags<A: Arch>(virt: VirtualAddress) -> PageFlags<A> {
+    let virt_addr = virt.data();
+
+    // Test for being inside a region
+    macro_rules! in_section {
+        ($n: ident) => {
+            virt_addr >= &concat_idents!(__, $n, _start) as *const u8 as usize
+                && virt_addr < &concat_idents!(__, $n, _end) as *const u8 as usize
+        };
+    }
+
+    if in_section!(text) {
+        // Remap text read-only, execute
+        PageFlags::new().execute(true)
+    } else if in_section!(rodata) {
+        // Remap rodata read-only, no execute
+        PageFlags::new()
+    } else {
+        // Remap everything else read-write, no execute
+        PageFlags::new().write(true)
+    }
+}
+
+//TODO: problems if RAM > 1GiB
+unsafe fn inner<A: Arch>(
+    areas: &'static [MemoryArea],
+    kernel_base: usize, kernel_size_aligned: usize,
+    stack_base: usize, stack_size_aligned: usize,
+    env_base: usize, env_size_aligned: usize,
+    acpi_base: usize, acpi_size_aligned: usize,
+    initfs_base: usize, initfs_size_aligned: usize,
+) -> BuddyAllocator<A> {
+    // First, calculate how much memory we have
+    let mut size = 0;
+    for area in areas.iter() {
+        if area.size > 0 {
+            log::debug!("{:X?}", area);
+            size += area.size;
+        }
+    }
+
+    log::info!("Memory: {} MB", (size + (MEGABYTE - 1)) / MEGABYTE);
+
+    // Create a basic allocator for the first pages
+    let mut bump_allocator = BumpAllocator::<A>::new(areas, 0);
+
+    {
+        let mut mapper = PageMapper::<A, _>::create(
+            &mut bump_allocator
+        ).expect("failed to create Mapper");
+
+        // Map all physical areas at PHYS_OFFSET
+        for area in areas.iter() {
+            for i in 0..area.size / A::PAGE_SIZE {
+                let phys = area.base.add(i * A::PAGE_SIZE);
+                let virt = A::phys_to_virt(phys);
+                let flags = page_flags::<A>(virt);
+                let flush = mapper.map_phys(
+                    virt,
+                    phys,
+                    flags
+                ).expect("failed to map frame");
+                flush.ignore(); // Not the active table
+            }
+        }
+
+        // Map kernel at KERNEL_OFFSET and identity map too
+        for i in 0..kernel_size_aligned / A::PAGE_SIZE {
+            let phys = PhysicalAddress::new(kernel_base + i * A::PAGE_SIZE);
+            let virt = VirtualAddress::new(crate::KERNEL_OFFSET + i * A::PAGE_SIZE);
+            let flags = page_flags::<A>(virt);
+            let flush = mapper.map_phys(
+                virt,
+                phys,
+                flags
+            ).expect("failed to map frame");
+            flush.ignore(); // Not the active table
+
+            let virt = A::phys_to_virt(phys);
+            let flush = mapper.map_phys(
+                virt,
+                phys,
+                flags
+            ).expect("failed to map frame");
+            flush.ignore(); // Not the active table
+        }
+
+        let mut identity_map = |base, size_aligned| {
+            // Identity map a region at its linear physmap address
+            for i in 0..size_aligned / A::PAGE_SIZE {
+                let phys = PhysicalAddress::new(base + i * A::PAGE_SIZE);
+                let virt = A::phys_to_virt(phys);
+                let flags = page_flags::<A>(virt);
+                let flush = mapper.map_phys(
+                    virt,
+                    phys,
+                    flags
+                ).expect("failed to map frame");
+                flush.ignore(); // Not the active table
+            }
+        };
+
+        identity_map(stack_base, stack_size_aligned);
+        identity_map(env_base, env_size_aligned);
+        identity_map(acpi_base, acpi_size_aligned);
+        identity_map(initfs_base, initfs_size_aligned);
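All of the mappings above follow the same linear convention: physical address P appears at virtual address PHYS_OFFSET + P (what A::phys_to_virt computes), so translation in either direction is plain arithmetic. Sketched, with the constant mirroring consts.rs:

    const PHYS_OFFSET: usize = 0x8000_0000; // matches consts.rs and RMM

    fn phys_to_virt(phys: usize) -> usize {
        PHYS_OFFSET + phys
    }

    fn virt_to_phys(virt: usize) -> usize {
        debug_assert!(virt >= PHYS_OFFSET);
        virt - PHYS_OFFSET
    }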
+            use super::paging::entry::EntryFlags;
+
+            let (base, size) = if let Some(debug_display) = &*DEBUG_DISPLAY.lock() {
+                let data = &debug_display.display.onscreen;
+                (
+                    data.as_ptr() as usize - crate::PHYS_OFFSET,
+                    data.len() * 4
+                )
+            } else {
+                (0, 0)
+            };
+
+            let pages = (size + A::PAGE_SIZE - 1) / A::PAGE_SIZE;
+            for i in 0..pages {
+                let phys = PhysicalAddress::new(base + i * A::PAGE_SIZE);
+                let virt = A::phys_to_virt(phys);
+                let flags = PageFlags::new().write(true)
+                    .custom_flag(EntryFlags::HUGE_PAGE.bits(), true);
+                let flush = mapper.map_phys(
+                    virt,
+                    phys,
+                    flags
+                ).expect("failed to map frame");
+                flush.ignore(); // Not the active table
+            }
+        }
+
+        log::debug!("Table: {:X}", mapper.table().phys().data());
+        for i in 0..A::PAGE_ENTRIES {
+            if let Some(entry) = mapper.table().entry(i) {
+                if entry.present() {
+                    log::debug!("{}: {:X}", i, entry.data());
+                }
+            }
+        }
+
+        // Use the new table
+        mapper.make_current();
+    }
+
+    // Create the physical memory map
+    let offset = bump_allocator.offset();
+    log::info!("Permanently used: {} KB", (offset + (KILOBYTE - 1)) / KILOBYTE);
+
+    BuddyAllocator::<A>::new(bump_allocator).expect("failed to create BuddyAllocator")
+}
+
+// There can only be one allocator (at the moment), so making this a ZST is great!
+#[derive(Clone, Copy)]
+pub struct LockedAllocator;
+
+static INNER_ALLOCATOR: Mutex<Option<BuddyAllocator<RmmA>>> = Mutex::new(None);
+
+impl FrameAllocator for LockedAllocator {
+    unsafe fn allocate(&mut self, count: FrameCount) -> Option<PhysicalAddress> {
+        if let Some(ref mut allocator) = *INNER_ALLOCATOR.lock() {
+            allocator.allocate(count)
+        } else {
+            None
+        }
+    }
+
+    unsafe fn free(&mut self, address: PhysicalAddress, count: FrameCount) {
+        if let Some(ref mut allocator) = *INNER_ALLOCATOR.lock() {
+            allocator.free(address, count)
+        }
+    }
+
+    unsafe fn usage(&self) -> FrameUsage {
+        if let Some(ref allocator) = *INNER_ALLOCATOR.lock() {
+            allocator.usage()
+        } else {
+            FrameUsage::new(FrameCount::new(0), FrameCount::new(0))
+        }
+    }
+}
+impl core::fmt::Debug for LockedAllocator {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match INNER_ALLOCATOR.try_lock().as_deref() {
+            Some(Some(alloc)) => write!(f, "[locked allocator: {:?}]", unsafe { alloc.usage() }),
+            Some(None) => write!(f, "[uninitialized lock allocator]"),
+            None => write!(f, "[failed to lock]"),
+        }
+    }
+}
+
+static mut AREAS: [MemoryArea; 512] = [MemoryArea {
+    base: PhysicalAddress::new(0),
+    size: 0,
+}; 512];
+
+pub static FRAME_ALLOCATOR: LockedAllocator = LockedAllocator;
+
+const NO_PROCESSOR: usize = !0;
+static LOCK_OWNER: AtomicUsize = AtomicUsize::new(NO_PROCESSOR);
+static LOCK_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+// TODO: Support, perhaps via const generics, embedding address checking in PageMapper, thereby
+// statically enforcing that the kernel mapper can only map things in the kernel half, and vice
+// versa.
+/// A guard to the global lock protecting the kernel half of the address space.
+///
+/// NOTE: Use this with great care! Since heap allocations may also require this lock when the heap
+/// needs to be expanded, it must not be held while memory allocations are done!
+// TODO: Make the lock finer-grained so that e.g. the heap part can be independent from e.g.
+// PHYS_PML4?
+pub struct KernelMapper {
+    mapper: crate::paging::PageMapper,
+    ro: bool,
+}
+impl KernelMapper {
+    fn lock_inner(current_processor: usize) -> bool {
+        loop {
+            match LOCK_OWNER.compare_exchange_weak(NO_PROCESSOR, current_processor, Ordering::Acquire, Ordering::Relaxed) {
+                Ok(_) => break,
+                // already owned by this hardware thread
+                Err(id) if id == current_processor => break,
+                // either CAS failed, or some other hardware thread holds the lock
+                Err(_) => core::hint::spin_loop(),
+            }
+        }
+
+        let prev_count = LOCK_COUNT.fetch_add(1, Ordering::Relaxed);
+        atomic::compiler_fence(Ordering::Acquire);
+
+        prev_count > 0
+    }
+    pub unsafe fn lock_for_manual_mapper(current_processor: usize, mapper: crate::paging::PageMapper) -> Self {
+        let ro = Self::lock_inner(current_processor);
+        Self {
+            mapper,
+            ro,
+        }
+    }
+    pub fn lock_manually(current_processor: usize) -> Self {
+        unsafe { Self::lock_for_manual_mapper(current_processor, PageMapper::new(RmmA::table(), FRAME_ALLOCATOR)) }
+    }
+    pub fn lock() -> Self {
+        Self::lock_manually(crate::cpu_id())
+    }
+    pub fn get_mut(&mut self) -> Option<&mut crate::paging::PageMapper> {
+        if self.ro {
+            None
+        } else {
+            Some(&mut self.mapper)
+        }
+    }
+}
+impl core::ops::Deref for KernelMapper {
+    type Target = crate::paging::PageMapper;
+
+    fn deref(&self) -> &Self::Target {
+        &self.mapper
+    }
+}
+impl Drop for KernelMapper {
+    fn drop(&mut self) {
+        if LOCK_COUNT.fetch_sub(1, Ordering::Relaxed) == 1 {
+            LOCK_OWNER.store(NO_PROCESSOR, Ordering::Release);
+        }
+        atomic::compiler_fence(Ordering::Release);
+    }
+}
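The `lock_inner`/`Drop` pair above implements a reentrant spinlock: ownership is a CPU id in `LOCK_OWNER`, and `LOCK_COUNT` tracks nesting depth so that only the outermost guard gets mutable access. A self-contained sketch of the same owner-plus-depth pattern (hypothetical names, not the kernel's types):

    use core::sync::atomic::{AtomicUsize, Ordering};

    const NO_OWNER: usize = !0;

    struct ReentrantSpin {
        owner: AtomicUsize, // CPU id of the holder, or NO_OWNER
        depth: AtomicUsize, // nesting count for the holder
    }

    impl ReentrantSpin {
        // Returns true if the lock was already held by this CPU, i.e. the new
        // guard should be read-only (the `ro` flag above).
        fn lock(&self, cpu: usize) -> bool {
            loop {
                match self.owner.compare_exchange_weak(NO_OWNER, cpu, Ordering::Acquire, Ordering::Relaxed) {
                    Ok(_) => break,
                    Err(id) if id == cpu => break, // re-entry on the same CPU
                    Err(_) => core::hint::spin_loop(),
                }
            }
            self.depth.fetch_add(1, Ordering::Relaxed) > 0
        }

        fn unlock(&self) {
            if self.depth.fetch_sub(1, Ordering::Relaxed) == 1 {
                self.owner.store(NO_OWNER, Ordering::Release);
            }
        }
    }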
+
+pub unsafe fn init(
+    kernel_base: usize, kernel_size: usize,
+    stack_base: usize, stack_size: usize,
+    env_base: usize, env_size: usize,
+    acpi_base: usize, acpi_size: usize,
+    areas_base: usize, areas_size: usize,
+    initfs_base: usize, initfs_size: usize,
+) {
+    type A = RmmA;
+
+    let real_base = 0;
+    let real_size = 0x100000;
+    let real_end = real_base + real_size;
+
+    let kernel_size_aligned = ((kernel_size + (A::PAGE_SIZE - 1))/A::PAGE_SIZE) * A::PAGE_SIZE;
+    let kernel_end = kernel_base + kernel_size_aligned;
+
+    let stack_size_aligned = ((stack_size + (A::PAGE_SIZE - 1))/A::PAGE_SIZE) * A::PAGE_SIZE;
+    let stack_end = stack_base + stack_size_aligned;
+
+    let env_size_aligned = ((env_size + (A::PAGE_SIZE - 1))/A::PAGE_SIZE) * A::PAGE_SIZE;
+    let env_end = env_base + env_size_aligned;
+
+    let acpi_size_aligned = ((acpi_size + (A::PAGE_SIZE - 1))/A::PAGE_SIZE) * A::PAGE_SIZE;
+    let acpi_end = acpi_base + acpi_size_aligned;
+
+    let initfs_size_aligned = ((initfs_size + (A::PAGE_SIZE - 1))/A::PAGE_SIZE) * A::PAGE_SIZE;
+    let initfs_end = initfs_base + initfs_size_aligned;
+
+    let bootloader_areas = slice::from_raw_parts(
+        areas_base as *const BootloaderMemoryEntry,
+        areas_size / mem::size_of::<BootloaderMemoryEntry>()
+    );
+
+    // Copy memory map from bootloader location, and page align it
+    let mut area_i = 0;
+    for bootloader_area in bootloader_areas.iter() {
+        if bootloader_area.kind != BootloaderMemoryKind::Free {
+            // Not a free area
+            continue;
+        }
+
+        let mut base = bootloader_area.base as usize;
+        let mut size = bootloader_area.size as usize;
+
+        log::debug!("{:X}:{:X}", base, size);
+
+        // Page align base
+        let base_offset = (A::PAGE_SIZE - (base & A::PAGE_OFFSET_MASK)) & A::PAGE_OFFSET_MASK;
+        if base_offset > size {
+            // Area is too small to page align base
+            continue;
+        }
+        base += base_offset;
+        size -= base_offset;
+
+        // Page align size
+        size &= !A::PAGE_OFFSET_MASK;
+        log::debug!(" => {:X}:{:X}", base, size);
+
+        let mut new_base = base;
+
+        // Ensure real-mode areas are not used
+        if base < real_end && base + size > real_base {
+            log::warn!("{:X}:{:X} overlaps with real mode {:X}:{:X}", base, size, real_base, real_size);
+            new_base = cmp::max(new_base, real_end);
+        }
+
+        // Ensure kernel areas are not used
+        if base < kernel_end && base + size > kernel_base {
+            log::warn!("{:X}:{:X} overlaps with kernel {:X}:{:X}", base, size, kernel_base, kernel_size);
+            new_base = cmp::max(new_base, kernel_end);
+        }
+
+        // Ensure stack areas are not used
+        if base < stack_end && base + size > stack_base {
+            log::warn!("{:X}:{:X} overlaps with stack {:X}:{:X}", base, size, stack_base, stack_size);
+            new_base = cmp::max(new_base, stack_end);
+        }
+
+        // Ensure env areas are not used
+        if base < env_end && base + size > env_base {
+            log::warn!("{:X}:{:X} overlaps with env {:X}:{:X}", base, size, env_base, env_size);
+            new_base = cmp::max(new_base, env_end);
+        }
+
+        // Ensure acpi areas are not used
+        if base < acpi_end && base + size > acpi_base {
+            log::warn!("{:X}:{:X} overlaps with acpi {:X}:{:X}", base, size, acpi_base, acpi_size);
+            new_base = cmp::max(new_base, acpi_end);
+        }
+
+        // Ensure initfs areas are not used
+        if base < initfs_end && base + size > initfs_base {
+            log::warn!("{:X}:{:X} overlaps with initfs {:X}:{:X}", base, size, initfs_base, initfs_size);
+            new_base = cmp::max(new_base, initfs_end);
+        }
+
+        if new_base != base {
+            let end = base + size;
+            let new_size = end.checked_sub(new_base).unwrap_or(0);
+            log::info!("{:X}:{:X} moved to {:X}:{:X}", base, size, new_base, new_size);
+            base = new_base;
+            size = new_size;
+        }
+
+        if size == 0 {
+            // Area is zero sized
+            continue;
+        }
+
+        AREAS[area_i].base = PhysicalAddress::new(base);
+        AREAS[area_i].size = size;
+        area_i += 1;
+    }
+
+    let allocator = inner::<A>(
+        &AREAS,
+        kernel_base, kernel_size_aligned,
+        stack_base, stack_size_aligned,
+        env_base, env_size_aligned,
+        acpi_base, acpi_size_aligned,
+        initfs_base, initfs_size_aligned,
+    );
+    *INNER_ALLOCATOR.lock() = Some(allocator);
+}
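The overlap handling in `init` boils down to one interval rule: if a free area `[base, base + size)` intersects a reserved region, its base is bumped past the region's end. A tiny host-runnable sketch of that rule, with a worked example (the function name is illustrative only):

    // If [base, base+size) overlaps [r_base, r_end), bump base past r_end.
    fn trim(base: usize, size: usize, r_base: usize, r_end: usize) -> (usize, usize) {
        let end = base + size;
        if base < r_end && end > r_base {
            let new_base = base.max(r_end);
            (new_base, end.saturating_sub(new_base))
        } else {
            (base, size)
        }
    }

    fn main() {
        // A free area 0x0..0x200000 overlapping the real-mode megabyte 0x0..0x100000
        assert_eq!(trim(0, 0x20_0000, 0, 0x10_0000), (0x10_0000, 0x10_0000));
        // An area fully inside the reserved region collapses to zero size and is skipped
        assert_eq!(trim(0x1000, 0x1000, 0, 0x10_0000), (0x10_0000, 0));
    }

Note that the kernel applies this rule once per reserved region by taking the maximum `new_base` and recomputing the size at the end; the sketch shows a single region for clarity.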
diff --git a/src/arch/x86/start.rs b/src/arch/x86/start.rs
new file mode 100644
index 0000000..fe2692f
--- /dev/null
+++ b/src/arch/x86/start.rs
@@ -0,0 +1,448 @@
+/// This function is where the kernel sets up IRQ handlers
+/// It is incredibly unsafe, and should be minimal in nature
+/// It must create the IDT with the correct entries, those entries are
+/// defined in other files inside of the `arch` module
+
+use core::slice;
+use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+
+use crate::allocator;
+#[cfg(feature = "acpi")]
+use crate::acpi;
+use crate::device;
+use crate::gdt;
+use crate::idt;
+use crate::interrupt;
+use crate::log::{self, info};
+use crate::paging::{self, KernelMapper};
+
+#[cfg(feature = "graphical_debug")]
+use super::graphical_debug;
+use super::pti;
+use super::flags::*;
+
+/// Test of zero values in BSS.
+static BSS_TEST_ZERO: usize = 0;
+/// Test of non-zero values in data.
+static DATA_TEST_NONZERO: usize = usize::max_value();
+/// Test of zero values in thread BSS
+#[thread_local]
+static mut TBSS_TEST_ZERO: usize = 0;
+/// Test of non-zero values in thread data.
+#[thread_local]
+static mut TDATA_TEST_NONZERO: usize = usize::max_value();
+
+pub static KERNEL_BASE: AtomicUsize = AtomicUsize::new(0);
+pub static KERNEL_SIZE: AtomicUsize = AtomicUsize::new(0);
+pub static CPU_COUNT: AtomicUsize = AtomicUsize::new(0);
+pub static AP_READY: AtomicBool = AtomicBool::new(false);
+static BSP_READY: AtomicBool = AtomicBool::new(false);
+
+#[repr(packed)]
+pub struct KernelArgs {
+    kernel_base: u64,
+    kernel_size: u64,
+    stack_base: u64,
+    stack_size: u64,
+    env_base: u64,
+    env_size: u64,
+
+    /// The base 64-bit pointer to an array of saved RSDPs. It's up to the kernel (and possibly
+    /// userspace) to decide which RSDP to use. The buffer will be a linked list containing a
+    /// 32-bit next pointer, relative to this field, followed by the actual struct.
+    ///
+    /// This field can be NULL, and if so, the system has not booted with UEFI or in some other way
+    /// retrieved the RSDPs. The kernel or a userspace driver will thus try searching the BIOS
+    /// memory instead. On UEFI systems, BIOS-like searching is not guaranteed to actually work though.
+    acpi_rsdps_base: u64,
+    /// The size of the RSDPs region.
+    acpi_rsdps_size: u64,
+
+    areas_base: u64,
+    areas_size: u64,
+
+    /// The physical base 64-bit pointer to the contiguous bootstrap/initfs.
+    bootstrap_base: u64,
+    /// Size of contiguous bootstrap/initfs physical region, not necessarily page aligned.
+    bootstrap_size: u64,
+    /// Entry point the kernel will jump to.
+    bootstrap_entry: u64,
+}
+
+/// The entry to Rust; all things must be initialized
+#[no_mangle]
+pub unsafe extern fn kstart(args_ptr: *const KernelArgs) -> ! {
+    let bootstrap = {
+        let args = args_ptr.read();
+
+        // BSS should already be zero
+        {
+            assert_eq!(BSS_TEST_ZERO, 0);
+            assert_eq!(DATA_TEST_NONZERO, usize::max_value());
+        }
+
+        KERNEL_BASE.store(args.kernel_base as usize, Ordering::SeqCst);
+        KERNEL_SIZE.store(args.kernel_size as usize, Ordering::SeqCst);
+
+        // Convert env to slice
+        let env = slice::from_raw_parts((args.env_base as usize + crate::PHYS_OFFSET) as *const u8, args.env_size as usize);
+
+        // Set up graphical debug
+        #[cfg(feature = "graphical_debug")]
+        graphical_debug::init(env);
+
+        #[cfg(feature = "system76_ec_debug")]
+        device::system76_ec::init();
+
+        // Initialize logger
+        log::init_logger(|r| {
+            use core::fmt::Write;
+            let _ = write!(
+                super::debug::Writer::new(),
+                "{}:{} -- {}\n",
+                r.target(),
+                r.level(),
+                r.args()
+            );
+        });
+
+        info!("Redox OS starting...");
+        info!("Kernel: {:X}:{:X}", args.kernel_base, args.kernel_base + args.kernel_size);
+        info!("Stack: {:X}:{:X}", args.stack_base, args.stack_base + args.stack_size);
+        info!("Env: {:X}:{:X}", args.env_base, args.env_base + args.env_size);
+        info!("RSDPs: {:X}:{:X}", args.acpi_rsdps_base, args.acpi_rsdps_base + args.acpi_rsdps_size);
+        info!("Areas: {:X}:{:X}", args.areas_base, args.areas_base + args.areas_size);
+        info!("Bootstrap: {:X}:{:X}", args.bootstrap_base, args.bootstrap_base + args.bootstrap_size);
+        info!("Bootstrap entry point: {:X}", args.bootstrap_entry);
+
+        // Set up GDT before paging
+        gdt::init();
+
+        // Set up IDT before paging
+        idt::init();
+
+        // Initialize RMM
+        crate::arch::rmm::init(
+            args.kernel_base as usize, args.kernel_size as usize,
+            args.stack_base as usize, args.stack_size as usize,
+            args.env_base as usize, args.env_size as usize,
+            args.acpi_rsdps_base as usize, args.acpi_rsdps_size as usize,
+            args.areas_base as usize, args.areas_size as usize,
+            args.bootstrap_base as usize, args.bootstrap_size as usize,
+        );
+
+        // Initialize paging
+        let tcb_offset = paging::init(0);
+
+        // Set up GDT after paging with TLS
+        gdt::init_paging(0, tcb_offset, args.stack_base as usize + args.stack_size as usize);
+
+        // Set up IDT
+        idt::init_paging_bsp();
+
+        // Set up syscall instruction
+        interrupt::syscall::init();
+
+        // Test tdata and tbss
+        {
+            assert_eq!(TBSS_TEST_ZERO, 0);
+            TBSS_TEST_ZERO += 1;
+            assert_eq!(TBSS_TEST_ZERO, 1);
+            assert_eq!(TDATA_TEST_NONZERO, usize::max_value());
+            TDATA_TEST_NONZERO -= 1;
+            assert_eq!(TDATA_TEST_NONZERO, usize::max_value() - 1);
+        }
+
+        // Reset AP variables
+        CPU_COUNT.store(1, Ordering::SeqCst);
+        AP_READY.store(false, Ordering::SeqCst);
+        BSP_READY.store(false, Ordering::SeqCst);
+
+        // Set up kernel heap
+        allocator::init();
+
+        // Set up double buffer for graphical debug now that heap is available
+        #[cfg(feature = "graphical_debug")]
+        graphical_debug::init_heap();
+
+        idt::init_paging_post_heap(true, 0);
+
+        // Activate memory logging
+        log::init();
+
+        // Initialize devices
+        device::init();
+
+        // Read ACPI tables, starts APs
+        #[cfg(feature = "acpi")]
+        {
+            acpi::init(if args.acpi_rsdps_base != 0 && args.acpi_rsdps_size > 0 {
+                Some(((args.acpi_rsdps_base as usize + crate::PHYS_OFFSET) as u64, args.acpi_rsdps_size as u64))
+            } else {
+                None
+            });
+            device::init_after_acpi();
+        }
+
+        // Initialize all of the non-core devices not otherwise needed to complete initialization
+        device::init_noncore();
+
+        // Stop graphical debug
+        #[cfg(feature = "graphical_debug")]
+        graphical_debug::fini();
+
+        BSP_READY.store(true, Ordering::SeqCst);
+
+        crate::Bootstrap {
+            base: crate::memory::Frame::containing_address(crate::paging::PhysicalAddress::new(args.bootstrap_base as usize)),
+            page_count: (args.bootstrap_size as usize) / crate::memory::PAGE_SIZE,
+            entry: args.bootstrap_entry as usize,
+            env,
+        }
+    };
+
+    crate::kmain(CPU_COUNT.load(Ordering::SeqCst), bootstrap);
+}
+
+#[repr(packed)]
+pub struct KernelArgsAp {
+    cpu_id: u64,
+    page_table: u64,
+    stack_start: u64,
+    stack_end: u64,
+}
+
+/// Entry to Rust for an AP
+pub unsafe extern fn kstart_ap(args_ptr: *const KernelArgsAp) -> ! {
+    let cpu_id = {
+        let args = &*args_ptr;
+        let cpu_id = args.cpu_id as usize;
+        let bsp_table = args.page_table as usize;
+        let _stack_start = args.stack_start as usize;
+        let stack_end = args.stack_end as usize;
+
+        assert_eq!(BSS_TEST_ZERO, 0);
+        assert_eq!(DATA_TEST_NONZERO, usize::max_value());
+
+        // Set up GDT before paging
+        gdt::init();
+
+        // Set up IDT before paging
+        idt::init();
+
+        // Initialize paging
+        let tcb_offset = {
+            use crate::paging::{PageMapper, PhysicalAddress};
+            use crate::rmm::FRAME_ALLOCATOR;
+
+            let mut mapper = KernelMapper::lock_for_manual_mapper(cpu_id, PageMapper::new(PhysicalAddress::new(bsp_table), FRAME_ALLOCATOR));
+            paging::init_ap(cpu_id, &mut mapper)
+        };
+
+        // Set up GDT with TLS
+        gdt::init_paging(cpu_id as u32, tcb_offset, stack_end);
+
+        // Set up IDT for AP
+        idt::init_paging_post_heap(false, cpu_id);
+
+        // Set up syscall instruction
+        interrupt::syscall::init();
+
+        // Test tdata and tbss
+        {
+            assert_eq!(TBSS_TEST_ZERO, 0);
+            TBSS_TEST_ZERO += 1;
+            assert_eq!(TBSS_TEST_ZERO, 1);
+            assert_eq!(TDATA_TEST_NONZERO, usize::max_value());
+            TDATA_TEST_NONZERO -= 1;
+            assert_eq!(TDATA_TEST_NONZERO, usize::max_value() - 1);
+        }
+
+        // Initialize devices (for AP)
+        device::init_ap();
+
+        AP_READY.store(true, Ordering::SeqCst);
+
+        cpu_id
+    };
+
+    while !BSP_READY.load(Ordering::SeqCst) {
+        interrupt::pause();
+    }
+
+    crate::kmain_ap(cpu_id);
+}
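The BSP/AP handshake above is a plain publish flag: each AP spins on `BSP_READY` (pausing between loads) until the BSP stores `true` after finishing global initialization. A host-runnable sketch of the same pattern, with threads standing in for CPUs and `spin_loop` standing in for `interrupt::pause()`:

    use core::sync::atomic::{AtomicBool, Ordering};
    use std::{thread, time::Duration};

    static READY: AtomicBool = AtomicBool::new(false);

    fn main() {
        let ap = thread::spawn(|| {
            // AP side: spin until the BSP publishes readiness
            while !READY.load(Ordering::SeqCst) {
                core::hint::spin_loop();
            }
            println!("AP proceeding");
        });

        // BSP side: finish initialization, then release the APs
        thread::sleep(Duration::from_millis(10));
        READY.store(true, Ordering::SeqCst);
        ap.join().unwrap();
    }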
+
+#[cfg(not(feature = "pti"))]
+macro_rules! inner_pti_unmap(
+    () => {
+        "
+        // unused: {pti_unmap}
+        "
+    }
+);
+#[cfg(feature = "pti")]
+macro_rules! inner_pti_unmap(
+    () => {
+        "
+        push rdi
+        push rsi
+        push rdx
+        push rcx
+        sub rsp, 8
+
+        call {pti_unmap}
+
+        add rsp, 8
+        pop rcx
+        pop rdx
+        pop rsi
+        pop rdi
+        "
+    }
+);
+
+#[cfg(not(feature = "x86_fsgsbase"))]
+macro_rules! save_fsgsbase(
+    () => {
+        "
+        mov ecx, {MSR_FSBASE}
+        rdmsr
+        shl rdx, 32
+        or rdx, rax
+        mov r14, rdx
+
+        mov ecx, {MSR_GSBASE}
+        rdmsr
+        shl rdx, 32
+        or rdx, rax
+        mov r13, rdx
+        "
+    }
+);
+#[cfg(feature = "x86_fsgsbase")]
+macro_rules! save_fsgsbase(
+    () => {
+        "
+        // placeholder: {MSR_FSBASE} {MSR_GSBASE}
+        rdfsbase r14
+        rdgsbase r13
+        "
+    }
+);
+
+#[cfg(feature = "x86_fsgsbase")]
+macro_rules! restore_fsgsbase(
+    () => {
+        "
+        wrfsbase r14
+        wrgsbase r13
+        "
+    }
+);
+
+#[cfg(not(feature = "x86_fsgsbase"))]
+macro_rules! restore_fsgsbase(
+    () => {
+        "
+        mov ecx, {MSR_FSBASE}
+        mov rdx, r14
+        mov eax, edx
+        shr rdx, 32
+        wrmsr
+
+        mov ecx, {MSR_GSBASE}
+        mov rdx, r13
+        mov eax, edx
+        shr rdx, 32
+        wrmsr
+        "
+    }
+);
+
+#[naked]
+// TODO: AbiCompatBool
+pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _is_singlestep: usize) -> ! {
+    core::arch::asm!("
+    1:
+        hlt
+        jmp 1b
+    ", options(noreturn));
+    /*TODO
+    // rdi, rsi, rdx, rcx
+    core::arch::asm!(
+        concat!("
+        shl rcx, {shift_singlestep}
+        or rcx, {flag_interrupts}
+
+        ", inner_pti_unmap!(), "
+
+        // Save rdx for later
+        mov r12, rdx
+
+        // Target RFLAGS
+        mov r11, rcx
+
+        // Go to usermode
+        swapgs
+
+        ", save_fsgsbase!(), "
+
+        mov r15, {user_data_seg_selector}
+        mov ds, r15d
+        mov es, r15d
+        mov fs, r15d
+        mov gs, r15d
+        ",
+
+        // SS and CS will later be set via sysretq.
+
+        restore_fsgsbase!(), "
+
+        // Target instruction pointer
+        mov rcx, rdi
+        // Target stack pointer
+        mov rsp, rsi
+        // Target argument
+        mov rdi, r12
+
+        xor rax, rax
+        xor rbx, rbx
+        // Don't zero rcx; it's used for `ip`.
+        xor rdx, rdx
+        // Don't zero rdi; it's used for `arg`.
+        xor rsi, rsi
+        xor rbp, rbp
+        // Don't zero rsp, obviously.
+        xor r8, r8
+        xor r9, r9
+        xor r10, r10
+        // Don't zero r11; it's used for `rflags`.
+        xor r12, r12
+        xor r13, r13
+        xor r14, r14
+        xor r15, r15
+
+        fninit
+        ",
+        // NOTE: Regarding the sysretq vulnerability, this is safe as we cannot modify RCX,
+        // even though the caller can give us the wrong address. But, it's marked unsafe, so
+        // the caller is responsible for this! (And, the likelihood of rcx being changed in the
+        // middle here, is minimal, unless the attacker already has partial control of kernel
+        // memory.)
+        "
+        sysretq
+        "),
+
+        flag_interrupts = const(FLAG_INTERRUPTS),
+        shift_singlestep = const(SHIFT_SINGLESTEP),
+        pti_unmap = sym pti::unmap,
+        user_data_seg_selector = const(gdt::GDT_USER_DATA << 3 | 3),
+
+        MSR_FSBASE = const(x86::msr::IA32_FS_BASE),
+        MSR_GSBASE = const(x86::msr::IA32_GS_BASE),
+
+        options(noreturn),
+    );
+    */
+}
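The non-fsgsbase paths above shuttle 64-bit MSR values through the edx:eax register pair, which is the rdmsr/wrmsr convention: low half in eax, high half in edx. The split/join arithmetic in isolation (plain Rust, illustrative names):

    fn split(value: u64) -> (u32, u32) {
        (value as u32, (value >> 32) as u32) // (eax, edx)
    }

    fn join(eax: u32, edx: u32) -> u64 {
        ((edx as u64) << 32) | eax as u64
    }

    fn main() {
        let (eax, edx) = split(0x0000_7FFF_DEAD_BEEF);
        assert_eq!((eax, edx), (0xDEAD_BEEF, 0x0000_7FFF));
        assert_eq!(join(eax, edx), 0x0000_7FFF_DEAD_BEEF);
    }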
diff --git a/src/arch/x86/stop.rs b/src/arch/x86/stop.rs
new file mode 100644
index 0000000..0b12b8b
--- /dev/null
+++ b/src/arch/x86/stop.rs
@@ -0,0 +1,94 @@
+#[cfg(feature = "acpi")]
+use crate::{
+    context,
+    scheme::acpi,
+    time,
+};
+
+use crate::syscall::io::{Io, Pio};
+
+#[no_mangle]
+pub unsafe extern fn kreset() -> ! {
+    println!("kreset");
+
+    // 8042 reset
+    {
+        println!("Reset with 8042");
+        let mut port = Pio::<u8>::new(0x64);
+        while port.readf(2) {}
+        port.write(0xFE);
+    }
+
+    // Use triple fault to guarantee reset
+    core::arch::asm!("
+    cli
+    lidt cs:0
+    int 3
+    ");
+
+    unreachable!();
+}
+
+#[cfg(feature = "acpi")]
+fn userspace_acpi_shutdown() {
+    log::info!("Notifying any potential ACPI driver");
+    // Tell whatever driver handles ACPI that it should enter the S5 state (i.e.
+    // shutdown).
+    if !acpi::register_kstop() {
+        // There was no context to switch to.
+        log::info!("No ACPI driver was alive to handle shutdown.");
+        return;
+    }
+    log::info!("Waiting one second for ACPI driver to run the shutdown sequence.");
+    let (initial_s, initial_ns) = time::monotonic();
+
+    // Since this driver is a userspace process, and we do not use any magic like directly
+    // context switching, we have to wait for the userspace driver to complete, with a timeout.
+    //
+    // We switch context, and wait for one second.
+    loop {
+        // TODO: Switch directly to whichever process is handling the kstop pipe. We would add an
+        // event flag like EVENT_DIRECT, which has already been suggested for IRQs.
+        // TODO: Waitpid with timeout? Because, what if the ACPI driver would crash?
+        let _ = unsafe { context::switch() };
+        let (current_s, current_ns) = time::monotonic();
+
+        // Compare whole nanosecond counts, so that current_ns < initial_ns cannot underflow
+        let initial_total_ns = initial_s * 1_000_000_000 + initial_ns;
+        let current_total_ns = current_s * 1_000_000_000 + current_ns;
+
+        if current_total_ns - initial_total_ns > 1_000_000_000 {
+            log::info!("Timeout reached, thus falling back to other shutdown methods.");
+            return;
+        }
+    }
+}
+
+#[no_mangle]
+pub unsafe extern fn kstop() -> ! {
+    log::info!("Running kstop()");
+
+    #[cfg(feature = "acpi")]
+    userspace_acpi_shutdown();
+
+    // Magic shutdown code for bochs and qemu (older versions).
+    for c in "Shutdown".bytes() {
+        let port = 0x8900;
+        println!("Shutdown with outb(0x{:X}, '{}')", port, c as char);
+        Pio::<u8>::new(port).write(c);
+    }
+
+    // Magic shutdown using qemu default ACPI method
+    {
+        let port = 0x604;
+        let data = 0x2000;
+        println!("Shutdown with outb(0x{:X}, 0x{:X})", port, data);
+        Pio::<u16>::new(port).write(data);
+    }
+
+    // Magic code for VMWare. Also a hard lock.
+    println!("Shutdown with cli hlt");
+    loop {
+        core::arch::asm!("cli; hlt");
+    }
+}
diff --git a/src/context/arch/x86.rs b/src/context/arch/x86.rs
new file mode 100644
index 0000000..6b4cfde
--- /dev/null
+++ b/src/context/arch/x86.rs
@@ -0,0 +1,276 @@
+use core::mem;
+use core::sync::atomic::AtomicBool;
+
+use alloc::sync::Arc;
+
+use crate::paging::{RmmA, RmmArch};
+use crate::syscall::FloatRegisters;
+
+use memoffset::offset_of;
+use spin::Once;
+
+/// This must be used by the kernel to ensure that context switches are done atomically
+/// Compare and exchange this to true when beginning a context switch on any CPU
+/// The `Context::switch_to` function will set it back to false, allowing other CPUs to switch
+/// This must be done, as no locks can be held on the stack during switch
+pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
+
+const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000;
+
+pub const KFX_SIZE: usize = 512;
+pub const KFX_ALIGN: usize = 16;
+
+#[derive(Clone, Debug)]
+#[repr(C)]
+pub struct Context {
+    /// EFLAGS register
+    eflags: usize,
+    /// EBX register
+    ebx: usize,
+    /// EDI register
+    edi: usize,
+    /// ESI register
+    esi: usize,
+    /// Base pointer
+    ebp: usize,
+    /// Stack pointer
+    pub(crate) esp: usize,
+    /// FSBASE.
+    ///
+    /// NOTE: Same fsgsbase behavior as with gsbase.
+    pub(crate) fsbase: usize,
+    /// GSBASE.
+    ///
+    /// NOTE: Without fsgsbase, this register will strictly be equal to the register value when
+    /// running. With fsgsbase, this is neither saved nor restored upon every syscall (there is no
+    /// need to!), and thus it must be re-read from the register before copying this struct.
+    pub(crate) gsbase: usize,
+}
+
+impl Context {
+    pub fn new() -> Context {
+        Context {
+            eflags: 0,
+            ebx: 0,
+            edi: 0,
+            esi: 0,
+            ebp: 0,
+            esp: 0,
+            fsbase: 0,
+            gsbase: 0,
+        }
+    }
+
+    pub fn set_stack(&mut self, address: usize) {
+        self.esp = address;
+    }
+
+    pub unsafe fn signal_stack(&mut self, handler: extern fn(usize), sig: u8) {
+        self.push_stack(sig as usize);
+        self.push_stack(handler as usize);
+        self.push_stack(signal_handler_wrapper as usize);
+    }
+
+    pub unsafe fn push_stack(&mut self, value: usize) {
+        self.esp -= mem::size_of::<usize>();
+        *(self.esp as *mut usize) = value;
+    }
+
+    pub unsafe fn pop_stack(&mut self) -> usize {
+        let value = *(self.esp as *const usize);
+        self.esp += mem::size_of::<usize>();
+        value
+    }
+}
+impl super::Context {
+    pub fn get_fx_regs(&self) -> FloatRegisters {
+        let mut regs = unsafe { self.kfx.as_ptr().cast::<FloatRegisters>().read() };
+        regs._reserved = 0;
+        let mut new_st = regs.st_space;
+        for st in &mut new_st {
+            // Only allow access to the 80 lowest bits
+            *st &= !ST_RESERVED;
+        }
+        regs.st_space = new_st;
+        regs
+    }
+
+    pub fn set_fx_regs(&mut self, mut new: FloatRegisters) {
+        {
+            let old = unsafe { &*(self.kfx.as_ptr().cast::<FloatRegisters>()) };
+            new._reserved = old._reserved;
+            let old_st = new.st_space;
+            let mut new_st = new.st_space;
+            for (new_st, old_st) in new_st.iter_mut().zip(&old_st) {
+                *new_st &= !ST_RESERVED;
+                *new_st |= old_st & ST_RESERVED;
+            }
+            new.st_space = new_st;
+
+            // Make sure we don't use `old` from now on
+        }
+
+        unsafe {
+            self.kfx.as_mut_ptr().cast::<FloatRegisters>().write(new);
+        }
+    }
+}
+
+pub static EMPTY_CR3: Once<rmm::PhysicalAddress> = Once::new();
+
+// SAFETY: EMPTY_CR3 must be initialized.
+pub unsafe fn empty_cr3() -> rmm::PhysicalAddress {
+    debug_assert!(EMPTY_CR3.poll().is_some());
+    *EMPTY_CR3.get_unchecked()
+}
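`get_fx_regs`/`set_fx_regs` above preserve the reserved upper bits of each 128-bit x87 slot while letting callers set the architectural low 80 bits. The masking in isolation (host-runnable sketch):

    // Only the low 80 bits of each 128-bit st slot are architectural; the
    // reserved upper bits must be carried over from the old state.
    const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000;

    fn merge(old: u128, new: u128) -> u128 {
        (new & !ST_RESERVED) | (old & ST_RESERVED)
    }

    fn main() {
        // Value bits come from `new`; reserved bits stay whatever `old` had
        assert_eq!(merge(ST_RESERVED, !0), !0);
        assert_eq!(merge(0, !0), !ST_RESERVED);
    }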
+
+/// Switch to the next context by restoring its stack and registers
+pub unsafe fn switch_to(prev: &mut super::Context, next: &mut super::Context) {
+    core::arch::asm!("
+        fxsave [{prev_fx}]
+        fxrstor [{next_fx}]
+        ",
+        prev_fx = in(reg) prev.kfx.as_mut_ptr(),
+        next_fx = in(reg) next.kfx.as_ptr(),
+    );
+
+    {
+        use x86::{bits64::segmentation::*, msr};
+
+        prev.arch.fsbase = msr::rdmsr(msr::IA32_FS_BASE) as usize;
+        msr::wrmsr(msr::IA32_FS_BASE, next.arch.fsbase as u64);
+        prev.arch.gsbase = msr::rdmsr(msr::IA32_KERNEL_GSBASE) as usize;
+        msr::wrmsr(msr::IA32_KERNEL_GSBASE, next.arch.gsbase as u64);
+    }
+
+    match next.addr_space {
+        // Since Arc essentially just wraps a pointer, in this case a regular pointer (as
+        // opposed to dyn or slice fat pointers), and NonNull optimization exists, map_or will
+        // hopefully be optimized down to checking prev and next pointers, as next cannot be null.
+        Some(ref next_space) => if prev.addr_space.as_ref().map_or(true, |prev_space| !Arc::ptr_eq(&prev_space, &next_space)) {
+            // Suppose we have two sibling threads A and B. A runs on CPU 0 and B on CPU 1. A
+            // recently called yield and is now here about to switch back. Meanwhile, B is
+            // currently creating a new mapping in their shared address space, for example a
+            // message on a channel.
+            //
+            // Unless we acquire this lock, it may be possible that the TLB will not contain new
+            // entries. While this can be caught and corrected in a page fault handler, this is not
+            // true when entries are removed from a page table!
+            next_space.read().table.utable.make_current();
+        }
+        None => {
+            RmmA::set_table(empty_cr3());
+        }
+    }
+    switch_to_inner(&mut prev.arch, &mut next.arch)
+}
+
+// Check disassembly!
+#[naked]
+unsafe extern "cdecl" fn switch_to_inner(_prev: &mut Context, _next: &mut Context) {
+    use Context as Cx;
+
+    core::arch::asm!(
+        // As a quick reminder for those who are unfamiliar with the i386 System V ABI (cdecl):
+        //
+        // - the current parameters are passed on the stack, which is why they are popped below,
+        // - we can modify scratch registers, e.g. eax
+        // - we cannot change callee-preserved registers arbitrarily, e.g. ebx, which is why we
+        //   store them here in the first place.
+        concat!("
+        pop eax   // Pop return address
+        pop ecx   // Pop prev
+        pop edx   // Pop next
+        push eax  // Push return address
+
+        // Save old registers, and load new ones
+        mov [ecx + {off_ebx}], ebx
+        mov ebx, [edx + {off_ebx}]
+
+        mov [ecx + {off_edi}], edi
+        mov edi, [edx + {off_edi}]
+
+        mov [ecx + {off_esi}], esi
+        mov esi, [edx + {off_esi}]
+
+        mov [ecx + {off_ebp}], ebp
+        mov ebp, [edx + {off_ebp}]
+
+        mov [ecx + {off_esp}], esp
+        mov esp, [edx + {off_esp}]
+
+        // push EFLAGS (can only be modified via stack)
+        pushfd
+        // pop EFLAGS into `self.eflags`
+        pop DWORD PTR [ecx + {off_eflags}]
+
+        // push `next.eflags`
+        push DWORD PTR [edx + {off_eflags}]
+        // pop into EFLAGS
+        popfd
+
+        // When we return, we cannot even guarantee that the return address on the stack, points to
+        // the calling function, `context::switch`. Thus, we have to execute this Rust hook by
+        // ourselves, which will unlock the contexts before the later switch.
+
+        // Note that switch_finish_hook will be responsible for executing `ret`.
+        jmp {switch_hook}
+
+        "),
+
+        off_eflags = const(offset_of!(Cx, eflags)),
+
+        off_ebx = const(offset_of!(Cx, ebx)),
+        off_edi = const(offset_of!(Cx, edi)),
+        off_esi = const(offset_of!(Cx, esi)),
+        off_ebp = const(offset_of!(Cx, ebp)),
+        off_esp = const(offset_of!(Cx, esp)),
+
+        switch_hook = sym crate::context::switch_finish_hook,
+        options(noreturn),
+    );
+}
+#[allow(dead_code)]
+#[repr(packed)]
+pub struct SignalHandlerStack {
+    esi: usize,
+    edi: usize,
+    edx: usize,
+    ecx: usize,
+    eax: usize,
+    handler: extern fn(usize),
+    sig: usize,
+    eip: usize,
+}
+
+#[naked]
+unsafe extern fn signal_handler_wrapper() {
+    #[inline(never)]
+    unsafe extern "C" fn inner(stack: &SignalHandlerStack) {
+        (stack.handler)(stack.sig);
+    }
+
+    // Push scratch registers
+    core::arch::asm!(
+        "
+        push eax
+        push ecx
+        push edx
+        push edi
+        push esi
+
+        // Pass a pointer to the saved registers on the stack (cdecl argument)
+        push esp
+        call {inner}
+        // Caller cleans up the argument
+        add esp, 4
+
+        pop esi
+        pop edi
+        pop edx
+        pop ecx
+        pop eax
+        // Skip the handler and sig fields, leaving the saved eip on top
+        add esp, 8
+        ret
+        ",
+
+        inner = sym inner,
+        options(noreturn),
+    );
+}
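On the `CONTEXT_SWITCH_LOCK` doc comment near the top of this file: the intended protocol is that the outgoing path claims the flag with a compare-and-exchange before touching either `Context`, and the incoming context clears it once it is running on its own stack (which is what `switch_finish_hook` exists for). A minimal sketch of that protocol, as an illustration rather than the kernel's exact call sites:

    use core::sync::atomic::{AtomicBool, Ordering};

    static SWITCH_LOCK: AtomicBool = AtomicBool::new(false);

    // Claim the global switch lock; returns false if another CPU is mid-switch.
    fn try_begin_switch() -> bool {
        SWITCH_LOCK
            .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst)
            .is_ok()
    }

    // Run by the *incoming* context once nothing on the old stack is needed.
    fn finish_switch() {
        SWITCH_LOCK.store(false, Ordering::SeqCst);
    }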
diff --git a/src/context/memory.rs b/src/context/memory.rs
index 7c8fafe..1d3f964 100644
--- a/src/context/memory.rs
+++ b/src/context/memory.rs
@@ -898,11 +898,37 @@ impl Drop for Table {
     }
 }
 
+/// Allocates a new identically mapped ktable and empty utable (same memory on x86)
+#[cfg(target_arch = "x86")]
+pub fn setup_new_utable() -> Result<Table> {
+    let mut utable = unsafe { PageMapper::create(crate::rmm::FRAME_ALLOCATOR).ok_or(Error::new(ENOMEM))? };
+
+    {
+        let active_ktable = KernelMapper::lock();
+
+        let mut copy_mapping = |p4_no| unsafe {
+            let entry = active_ktable.table().entry(p4_no)
+                .unwrap_or_else(|| panic!("expected kernel PML {} to be mapped", p4_no));
+
+            utable.table().set_entry(p4_no, entry)
+        };
+
+        // Copy higher half (kernel) mappings
+        for i in 512..1024 {
+            copy_mapping(i);
+        }
+    }
+
+    Ok(Table {
+        utable,
+    })
+}
+
 /// Allocates a new identically mapped ktable and empty utable (same memory on x86_64).
+#[cfg(target_arch = "x86_64")]
 pub fn setup_new_utable() -> Result<Table>
 {
     let utable = unsafe { PageMapper::create(crate::rmm::FRAME_ALLOCATOR).ok_or(Error::new(ENOMEM))? };
 
-    #[cfg(target_arch = "x86_64")]
     {
         let active_ktable = KernelMapper::lock();
@@ -934,7 +960,6 @@ pub fn setup_new_utable() -> Result<Table>
 {
     })
 }
 
-
 #[cfg(tests)]
 mod tests {
     // TODO: Get these tests working
diff --git a/src/context/mod.rs b/src/context/mod.rs
index dea6e23..5a2d90c 100644
--- a/src/context/mod.rs
+++ b/src/context/mod.rs
@@ -18,6 +18,10 @@ pub use self::switch::switch;
 #[path = "arch/aarch64.rs"]
 mod arch;
 
+#[cfg(target_arch = "x86")]
+#[path = "arch/x86.rs"]
+mod arch;
+
 #[cfg(target_arch = "x86_64")]
 #[path = "arch/x86_64.rs"]
 mod arch;
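For the `setup_new_utable` copy of entries 512..1024 earlier in this patch: on 32-bit x86 without PAE, the top-level page directory has 1024 entries of 4 MiB each, so entry 512 is exactly the 2 GiB boundary where the kernel half begins. The index arithmetic (host-runnable sketch):

    // With 4 KiB pages and no PAE, each of the 1024 page-directory entries
    // maps 4 MiB (1 << 22 bytes), so entry = vaddr >> 22.
    fn pd_entry(vaddr: usize) -> usize {
        vaddr >> 22
    }

    fn main() {
        assert_eq!(pd_entry(0x8000_0000), 512);  // start of the kernel half
        assert_eq!(pd_entry(0xFFFF_FFFF), 1023); // last entry
    }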