WIP: Let userspace manage fsbase/gsbase and TLS.

4lDO2
2021-08-01 12:09:22 +02:00
parent 0968e4f87e
commit 3eedbeb14d
13 changed files with 283 additions and 214 deletions

View File

@@ -39,13 +39,6 @@
pub const USER_OFFSET: usize = 0;
pub const USER_PML4: usize = (USER_OFFSET & PML4_MASK)/PML4_SIZE;
/// Offset to user TCB
/// Each process has 4096 bytes, at an offset of 4096 * PID
// TODO: Get a real 64-bit offset, and allow loading ELF sections higher up than the current
// limit, iff the processor supports fsgsbase (in which case it is cheap to use 64-bit FS
// offsets).
pub const USER_TCB_OFFSET: usize = 0xB000_0000;
/// Offset to user arguments
pub const USER_ARG_OFFSET: usize = USER_OFFSET + PML4_SIZE/2;
@@ -69,14 +62,8 @@
/// Size of user sigstack
pub const USER_SIGSTACK_SIZE: usize = 256 * 1024; // 256 KB
/// Offset to user TLS
pub const USER_TLS_OFFSET: usize = USER_SIGSTACK_OFFSET + PML4_SIZE;
pub const USER_TLS_PML4: usize = (USER_TLS_OFFSET & PML4_MASK)/PML4_SIZE;
// Maximum TLS allocated to each PID, should be approximately 8 MB
pub const USER_TLS_SIZE: usize = PML4_SIZE / 65536;
/// Offset to user temporary image (used when cloning)
pub const USER_TMP_OFFSET: usize = USER_TLS_OFFSET + PML4_SIZE;
pub const USER_TMP_OFFSET: usize = USER_SIGSTACK_OFFSET + PML4_SIZE;
pub const USER_TMP_PML4: usize = (USER_TMP_OFFSET & PML4_MASK)/PML4_SIZE;
/// Offset to user temporary heap (used when cloning)
@@ -95,10 +82,6 @@
pub const USER_TMP_SIGSTACK_OFFSET: usize = USER_TMP_STACK_OFFSET + PML4_SIZE;
pub const USER_TMP_SIGSTACK_PML4: usize = (USER_TMP_SIGSTACK_OFFSET & PML4_MASK)/PML4_SIZE;
/// Offset to user temporary tls (used when cloning)
pub const USER_TMP_TLS_OFFSET: usize = USER_TMP_SIGSTACK_OFFSET + PML4_SIZE;
pub const USER_TMP_TLS_PML4: usize = (USER_TMP_TLS_OFFSET & PML4_MASK)/PML4_SIZE;
/// Offset for usage in other temporary pages
pub const USER_TMP_MISC_OFFSET: usize = USER_TMP_TLS_OFFSET + PML4_SIZE;
pub const USER_TMP_MISC_OFFSET: usize = USER_TMP_SIGSTACK_OFFSET + PML4_SIZE;
pub const USER_TMP_MISC_PML4: usize = (USER_TMP_MISC_OFFSET & PML4_MASK)/PML4_SIZE;

View File

@@ -80,6 +80,11 @@ impl IretRegisters {
println!("RSP: {:>016X}", { self.rsp });
println!("SS: {:>016X}", { self.ss });
}
unsafe {
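// Note: after swapgs on kernel entry, the user GSBASE is held in IA32_KERNEL_GSBASE.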
let fsbase = x86::msr::rdmsr(x86::msr::IA32_FS_BASE);
let gsbase = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE);
println!("FSBASE {:>016X}\nGSBASE {:016X}", fsbase, gsbase);
}
}
}

View File

@@ -275,13 +275,13 @@ macro_rules! save_fsgsbase(
mov ecx, {MSR_FSBASE}
rdmsr
shl rdx, 32
mov edx, eax
or rdx, rax
mov r14, rdx
mov ecx, {MSR_GSBASE}
rdmsr
shl rdx, 32
mov edx, eax
or rdx, rax
mov r13, rdx
"
}
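This hunk fixes a real bug: `mov edx, eax` writes a 32-bit register, which zero-extends into the full RDX and wipes the high half that `shl rdx, 32` just put there. `or rdx, rax` preserves it, and is exact because rdmsr clears the upper 32 bits of both RAX and RDX. A minimal sketch of the composition in Rust:

    /// What the corrected `shl`/`or` sequence computes: fold the EDX:EAX
    /// pair returned by rdmsr into one 64-bit MSR value.
    fn compose_msr(eax: u32, edx: u32) -> u64 {
        ((edx as u64) << 32) | (eax as u64)
    }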
@@ -354,8 +354,11 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _is_singl
mov es, r15d
mov fs, r15d
mov gs, r15d
",
", restore_fsgsbase!(), "
// SS and CS will later be set via sysretq.
restore_fsgsbase!(), "
// Target instruction pointer
mov rcx, rdi
@@ -382,14 +385,15 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _is_singl
xor r15, r15
fninit
",
// NOTE: Regarding the sysretq vulnerability, this is safe: we cannot modify RCX here,
// even though the caller can give us a wrong address. The function is marked unsafe, so
// the caller is responsible for that. (And the likelihood of RCX being changed in the
// middle here is minimal, unless the attacker already has partial control of kernel
// memory.)
"
sysretq
"),
"),
flag_interrupts = const(FLAG_INTERRUPTS),
shift_singlestep = const(SHIFT_SINGLESTEP),

View File

@@ -36,10 +36,16 @@ pub struct Context {
rbp: usize,
/// Stack pointer
rsp: usize,
/// FSBASE
pub fsbase: usize,
/// GSBASE
gsbase: usize,
/// FSBASE.
///
/// NOTE: Same fsgsbase behavior as with gsbase.
pub(crate) fsbase: usize,
/// GSBASE.
///
/// NOTE: Without fsgsbase, this field always matches the actual register value while the
/// context is running. With fsgsbase, it is neither saved nor restored on every syscall
/// (there is no need to!), and must therefore be re-read from the register before copying
/// this struct.
pub(crate) gsbase: usize,
/// FX valid?
loadable: AbiCompatBool,
}
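To make the NOTE above concrete, here is a hedged sketch (helper name hypothetical) of the re-read required before `gsbase` can be trusted with fsgsbase enabled, mirroring the proc-scheme code later in this commit:

    #[cfg(feature = "x86_fsgsbase")]
    unsafe fn current_user_gsbase() -> u64 {
        use x86::bits64::segmentation::{rdgsbase, swapgs};
        // In kernel mode the user GSBASE is swapped out; swap it in,
        // read it, then swap back.
        swapgs();
        let gsbase = rdgsbase();
        swapgs();
        gsbase
    }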
@@ -52,7 +58,7 @@ enum AbiCompatBool {
}
impl Context {
pub fn new(pid: usize) -> Context {
pub fn new() -> Context {
Context {
loadable: AbiCompatBool::False,
fx: 0,
@@ -65,13 +71,10 @@ impl Context {
r15: 0,
rbp: 0,
rsp: 0,
fsbase: crate::USER_TCB_OFFSET + pid * crate::memory::PAGE_SIZE,
fsbase: 0,
gsbase: 0,
}
}
pub fn update_tcb(&mut self, pid: usize) {
self.fsbase = crate::USER_TCB_OFFSET + pid * crate::memory::PAGE_SIZE;
}
pub fn get_page_utable(&mut self) -> usize {
self.cr3
@@ -147,19 +150,10 @@ impl Context {
}
}
macro_rules! switch_msr(
macro_rules! load_msr(
($name:literal, $offset:literal) => {
concat!("
// EDX:EAX <= MSR
mov ecx, {", $name, "}
rdmsr
shl rdx, 32
mov edx, eax
// Save old, load new.
mov [rdi + {", $offset, "}], rdx
mov rdx, [rsi + {", $offset, "}]
mov eax, edx
shr rdx, 32
@@ -198,10 +192,9 @@ macro_rules! switch_fsgsbase(
#[cfg(not(feature = "x86_fsgsbase"))]
macro_rules! switch_fsgsbase(
() => {
// TODO: Is it faster to perform two 32-bit memory accesses, rather than shifting?
concat!(
switch_msr!("MSR_FSBASE", "off_fsbase"),
switch_msr!("MSR_KERNELGSBASE", "off_gsbase"),
load_msr!("MSR_FSBASE", "off_fsbase"),
load_msr!("MSR_KERNELGSBASE", "off_gsbase"),
)
}
);
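`load_msr!` performs the inverse split, since wrmsr takes the value in EDX:EAX; a sketch:

    /// Split a saved 64-bit base into (EAX, EDX) halves for wrmsr,
    /// matching the `mov eax, edx` / `shr rdx, 32` sequence above.
    fn split_msr(value: u64) -> (u32, u32) {
        (value as u32, (value >> 32) as u32)
    }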

View File

@@ -9,6 +9,7 @@ use core::{
alloc::{GlobalAlloc, Layout},
cmp::Ordering,
mem,
ptr::NonNull,
};
use spin::RwLock;
@@ -20,7 +21,9 @@ use crate::context::memory::{UserGrants, Memory, SharedMemory, Tls};
use crate::ipi::{ipi, IpiKind, IpiTarget};
use crate::scheme::{SchemeNamespace, FileHandle};
use crate::sync::WaitMap;
use crate::syscall::data::SigAction;
use crate::syscall::error::{Result, Error, ENOMEM};
use crate::syscall::flag::{SIG_DFL, SigActionFlags};
/// Unique identifier for a context (i.e. `pid`).
@@ -203,9 +206,9 @@ pub struct Context {
/// Current system call
pub syscall: Option<(usize, usize, usize, usize, usize, usize)>,
/// Head buffer to use when system call buffers are not page aligned
pub syscall_head: Box<[u8]>,
pub syscall_head: AlignedBox<[u8; PAGE_SIZE], PAGE_SIZE>,
/// Tail buffer to use when system call buffers are not page aligned
pub syscall_tail: Box<[u8]>,
pub syscall_tail: AlignedBox<[u8; PAGE_SIZE], PAGE_SIZE>,
/// Context is halting parent
pub vfork: bool,
/// Context is being waited on
@@ -230,8 +233,6 @@ pub struct Context {
pub stack: Option<SharedMemory>,
/// User signal stack
pub sigstack: Option<Memory>,
/// User Thread local storage
pub tls: Option<Tls>,
/// User grants
pub grants: Arc<RwLock<UserGrants>>,
/// The name of the context
@@ -253,12 +254,63 @@ pub struct Context {
pub ptrace_stop: bool
}
impl Context {
pub fn new(id: ContextId) -> Context {
let syscall_head = unsafe { Box::from_raw(crate::ALLOCATOR.alloc(Layout::from_size_align_unchecked(PAGE_SIZE, PAGE_SIZE)) as *mut [u8; PAGE_SIZE]) };
let syscall_tail = unsafe { Box::from_raw(crate::ALLOCATOR.alloc(Layout::from_size_align_unchecked(PAGE_SIZE, PAGE_SIZE)) as *mut [u8; PAGE_SIZE]) };
// Necessary because GlobalAlloc::dealloc requires the layout to be the same, and therefore Box
// cannot be used for increased alignment directly.
// TODO: move to common?
pub struct AlignedBox<T, const ALIGN: usize> {
inner: NonNull<T>,
}
pub unsafe trait ValidForZero {}
unsafe impl<const N: usize> ValidForZero for [u8; N] {}
Context {
impl<T, const ALIGN: usize> AlignedBox<T, ALIGN> {
const LAYOUT: core::alloc::Layout = {
const fn max(a: usize, b: usize) -> usize {
if a > b { a } else { b }
}
match core::alloc::Layout::from_size_align(mem::size_of::<T>(), max(mem::align_of::<T>(), ALIGN)) {
Ok(l) => l,
Err(_) => panic!("layout validation failed at compile time"),
}
};
#[inline(always)]
pub fn try_zeroed() -> Result<Self>
where
T: ValidForZero,
{
Ok(unsafe {
let ptr = crate::ALLOCATOR.alloc_zeroed(Self::LAYOUT);
if ptr.is_null() {
return Err(Error::new(ENOMEM));
}
Self {
inner: NonNull::new_unchecked(ptr.cast()),
}
})
}
}
impl<T, const ALIGN: usize> core::fmt::Debug for AlignedBox<T, ALIGN> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(f, "[aligned box at {:p}, size {} alignment {}]", self.inner.as_ptr(), Self::LAYOUT.size(), Self::LAYOUT.align())
}
}
impl<T, const ALIGN: usize> Drop for AlignedBox<T, ALIGN> {
fn drop(&mut self) {
unsafe {
core::ptr::drop_in_place(self.inner.as_ptr());
crate::ALLOCATOR.dealloc(self.inner.as_ptr().cast(), Self::LAYOUT);
}
}
}
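A minimal usage sketch of `AlignedBox`, as the rewritten constructor below uses it: the buffer is zero-initialized, page-sized, page-aligned, and freed by `Drop` with the same layout it was allocated with.

    let syscall_head: AlignedBox<[u8; PAGE_SIZE], PAGE_SIZE> = AlignedBox::try_zeroed()?;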
impl Context {
pub fn new(id: ContextId) -> Result<Context> {
let syscall_head = AlignedBox::try_zeroed()?;
let syscall_tail = AlignedBox::try_zeroed()?;
Ok(Context {
id,
pgid: id,
ppid: ContextId::from(0),
@@ -282,7 +334,7 @@ impl Context {
waitpid: Arc::new(WaitMap::new()),
pending: VecDeque::new(),
wake: None,
arch: arch::Context::new(id.into()),
arch: arch::Context::new(),
kfx: None,
kstack: None,
ksig: None,
@@ -290,7 +342,6 @@ impl Context {
image: Vec::new(),
stack: None,
sigstack: None,
tls: None,
grants: Arc::new(RwLock::new(UserGrants::default())),
name: Arc::new(RwLock::new(String::new().into_boxed_str())),
cwd: Arc::new(RwLock::new(String::new())),
@@ -305,7 +356,7 @@ impl Context {
); 128])),
regs: None,
ptrace_stop: false
}
})
}
/// Make a relative path absolute

View File

@@ -69,7 +69,7 @@ impl ContextList {
let id = ContextId::from(self.next_id);
self.next_id += 1;
assert!(self.map.insert(id, Arc::new(RwLock::new(Context::new(id)))).is_none());
assert!(self.map.insert(id, Arc::new(RwLock::new(Context::new(id)?))).is_none());
Ok(self.map.get(&id).expect("Failed to insert new context. ID is out of bounds."))
}

View File

@@ -82,6 +82,12 @@ impl<'a> Elf<'a> {
pub fn program_headers(&self) -> usize {
self.header.e_phoff as usize
}
pub fn program_header_count(&self) -> usize {
self.header.e_phnum as usize
}
pub fn program_headers_size(&self) -> usize {
self.header.e_phentsize as usize
}
}
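Taken together, the three accessors let the exec path compute the raw program-header byte range of the image, as `fexec_kernel` does later in this commit:

    // e_phoff .. e_phoff + e_phentsize * e_phnum, as file offsets into the image.
    let phdr_range = elf.program_headers()
        .. elf.program_headers() + elf.program_headers_size() * elf.program_header_count();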
pub struct ElfSections<'a> {

View File

@@ -48,6 +48,7 @@
#![feature(concat_idents)]
#![feature(const_btree_new)]
#![feature(const_maybe_uninit_as_ptr)]
#![feature(const_panic)]
#![feature(const_ptr_offset_from)]
#![feature(const_raw_ptr_deref)]
#![feature(core_intrinsics)]

View File

@@ -137,6 +137,7 @@ impl SchemeList {
//TODO: Only memory: is in the null namespace right now. It should be removed when
//anonymous mmaps are implemented
self.insert(ns, "memory", |_| Arc::new(MemoryScheme::new())).unwrap();
self.insert(ns, "thisproc", |_| Arc::new(ProcScheme::restricted())).unwrap();
}
/// Initialize a new namespace
@@ -168,6 +169,7 @@ impl SchemeList {
self.insert(ns, "initfs", |_| Arc::new(InitFsScheme::new())).unwrap();
self.insert(ns, "irq", |scheme_id| Arc::new(IrqScheme::new(scheme_id))).unwrap();
self.insert(ns, "proc", |scheme_id| Arc::new(ProcScheme::new(scheme_id))).unwrap();
self.insert(ns, "thisproc", |_| Arc::new(ProcScheme::restricted())).unwrap();
self.insert(ns, "serio", |scheme_id| Arc::new(SerioScheme::new(scheme_id))).unwrap();
#[cfg(feature = "live")] {

View File

@@ -6,6 +6,7 @@ use crate::{
syscall::{
FloatRegisters,
IntRegisters,
EnvRegisters,
data::{PtraceEvent, Stat},
error::*,
flag::*,
@@ -57,6 +58,9 @@ fn try_stop_context<F, T>(pid: ContextId, mut callback: F) -> Result<T>
where
F: FnMut(&mut Context) -> Result<T>,
{
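// Stopping the caller's own context would deadlock: it is the one
// currently running this code. Callers handle the "current" case
// separately, without stopping anything.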
if pid == context::context_id() {
return Err(Error::new(EBADF));
}
// Stop process
let (was_stopped, mut running) = with_context_mut(pid, |context| {
let was_stopped = context.ptrace_stop;
@@ -88,7 +92,8 @@ where
#[derive(Clone, Copy, PartialEq, Eq)]
enum RegsKind {
Float,
Int
Int,
Env,
}
#[derive(Clone, Copy, PartialEq, Eq)]
enum Operation {
@@ -195,6 +200,12 @@ pub static PROC_SCHEME_ID: AtomicSchemeId = AtomicSchemeId::default();
pub struct ProcScheme {
next_id: AtomicUsize,
handles: RwLock<BTreeMap<usize, Handle>>,
access: Access,
}
#[derive(PartialEq)]
pub enum Access {
OtherProcesses,
Restricted,
}
impl ProcScheme {
@@ -204,6 +215,14 @@ impl ProcScheme {
Self {
next_id: AtomicUsize::new(0),
handles: RwLock::new(BTreeMap::new()),
access: Access::OtherProcesses,
}
}
pub fn restricted() -> Self {
Self {
next_id: AtomicUsize::new(0),
handles: RwLock::new(BTreeMap::new()),
access: Access::Restricted,
}
}
}
@@ -211,15 +230,22 @@ impl ProcScheme {
impl Scheme for ProcScheme {
fn open(&self, path: &str, flags: usize, uid: u32, gid: u32) -> Result<usize> {
let mut parts = path.splitn(2, '/');
let pid = parts.next()
.and_then(|s| s.parse().ok())
.map(ContextId::from)
.ok_or(Error::new(EINVAL))?;
let pid_str = parts.next()
.ok_or(Error::new(ENOENT))?;
let pid = if pid_str == "current" {
context::context_id()
} else if self.access == Access::Restricted {
return Err(Error::new(EACCES));
} else {
ContextId::from(pid_str.parse().map_err(|_| Error::new(ENOENT))?)
};
let operation = match parts.next() {
Some("mem") => Operation::Memory,
Some("regs/float") => Operation::Regs(RegsKind::Float),
Some("regs/int") => Operation::Regs(RegsKind::Int),
Some("regs/env") => Operation::Regs(RegsKind::Env),
Some("trace") => Operation::Trace,
Some("exe") => Operation::Static("exe"),
_ => return Err(Error::new(EINVAL))
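From userspace the new interface looks like this (a hedged sketch; `open` and `O_RDWR` are from the redox `syscall` crate and assumed here):

    // Works on `proc:` with an explicit pid, and on the restricted
    // `thisproc:` scheme, which only accepts "current".
    let fd = syscall::open("thisproc:current/regs/env", syscall::O_RDWR)?;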
@@ -382,7 +408,8 @@ impl Scheme for ProcScheme {
Operation::Regs(kind) => {
union Output {
float: FloatRegisters,
int: IntRegisters
int: IntRegisters,
env: EnvRegisters,
}
let (output, size) = match kind {
@@ -406,7 +433,37 @@ impl Scheme for ProcScheme {
stack.save(&mut regs);
Ok((Output { int: regs }, mem::size_of::<IntRegisters>()))
}
})?
})?,
RegsKind::Env => {
let (fsbase, gsbase) = if info.pid == context::context_id() {
#[cfg(not(feature = "x86_fsgsbase"))]
unsafe {
(
x86::msr::rdmsr(x86::msr::IA32_FS_BASE),
x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE),
)
}
#[cfg(feature = "x86_fsgsbase")]
unsafe {
use x86::bits64::segmentation::*;
(
rdfsbase(),
{
swapgs();
let gsbase = rdgsbase();
swapgs();
gsbase
}
)
}
} else {
try_stop_context(info.pid, |context| {
Ok((context.arch.fsbase as u64, context.arch.gsbase as u64))
})?
};
(Output { env: EnvRegisters { fsbase, gsbase }}, mem::size_of::<EnvRegisters>())
}
};
let bytes = unsafe {
@@ -503,6 +560,9 @@ impl Scheme for ProcScheme {
if buf.len() < mem::size_of::<FloatRegisters>() {
return Ok(0);
}
if (buf.as_ptr() as usize) % mem::align_of::<FloatRegisters>() != 0 {
return Err(Error::new(EINVAL));
}
let regs = unsafe {
*(buf as *const _ as *const FloatRegisters)
};
@@ -521,6 +581,9 @@ impl Scheme for ProcScheme {
if buf.len() < mem::size_of::<IntRegisters>() {
return Ok(0);
}
if (buf.as_ptr() as usize) % mem::align_of::<IntRegisters>() != 0 {
return Err(Error::new(EINVAL));
}
let regs = unsafe {
*(buf as *const _ as *const IntRegisters)
};
@@ -537,6 +600,57 @@ impl Scheme for ProcScheme {
}
})
}
RegsKind::Env => {
if buf.len() < mem::size_of::<EnvRegisters>() {
return Ok(0);
}
if (buf.as_ptr() as usize) % mem::align_of::<EnvRegisters>() != 0 {
return Err(Error::new(EINVAL));
}
let regs = unsafe {
*(buf as *const _ as *const EnvRegisters)
};
use rmm::{Arch as _, X8664Arch};
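// Reject non-canonical addresses up front; wrmsr/wrfsbase would otherwise
// raise #GP in kernel mode.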
if !(X8664Arch::virt_is_valid(VirtualAddress::new(regs.fsbase as usize)) && X8664Arch::virt_is_valid(VirtualAddress::new(regs.gsbase as usize))) {
return Err(Error::new(EINVAL));
}
if info.pid == context::context_id() {
#[cfg(not(feature = "x86_fsgsbase"))]
unsafe {
x86::msr::wrmsr(x86::msr::IA32_FS_BASE, regs.fsbase);
// We have to write to KERNEL_GSBASE, because when the kernel returns to
// userspace, it will have executed SWAPGS first.
x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, regs.gsbase);
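// The `match` keeps the write guard (a temporary in the scrutinee) alive
// for the whole arm, so `arch` can be mutated in place.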
match context::contexts().current().ok_or(Error::new(ESRCH))?.write().arch {
ref mut arch => {
arch.fsbase = regs.fsbase as usize;
arch.gsbase = regs.gsbase as usize;
}
}
}
#[cfg(feature = "x86_fsgsbase")]
unsafe {
use x86::bits64::segmentation::*;
wrfsbase(regs.fsbase);
swapgs();
wrgsbase(regs.gsbase);
swapgs();
// No need to update the current context; with fsgsbase enabled, these
// registers are automatically saved and restored.
}
} else {
try_stop_context(info.pid, |context| {
context.arch.fsbase = regs.fsbase as usize;
context.arch.gsbase = regs.gsbase as usize;
Ok(())
})?;
}
Ok(mem::size_of::<EnvRegisters>())
}
},
Operation::Trace => {
if buf.len() < mem::size_of::<u64>() {
@@ -621,6 +735,7 @@ impl Scheme for ProcScheme {
Operation::Memory => "mem",
Operation::Regs(RegsKind::Float) => "regs/float",
Operation::Regs(RegsKind::Int) => "regs/int",
Operation::Regs(RegsKind::Env) => "regs/env",
Operation::Trace => "trace",
Operation::Static(path) => path,
});

View File

@@ -7,6 +7,7 @@ extern crate syscall;
pub use self::syscall::{
FloatRegisters,
IntRegisters,
EnvRegisters,
data,
error,
flag,

View File

@@ -27,7 +27,7 @@ use crate::scheme::FileHandle;
use crate::start::usermode;
use crate::syscall::data::{SigAction, Stat};
use crate::syscall::error::*;
use crate::syscall::flag::{wifcontinued, wifstopped, AT_ENTRY, AT_NULL, AT_PHDR, CloneFlags,
use crate::syscall::flag::{wifcontinued, wifstopped, AT_ENTRY, AT_NULL, AT_PHDR, AT_PHENT, AT_PHNUM, CloneFlags,
CLONE_FILES, CLONE_FS, CLONE_SIGHAND, CLONE_STACK, CLONE_VFORK, CLONE_VM,
MapFlags, PROT_EXEC, PROT_READ, PROT_WRITE, PTRACE_EVENT_CLONE,
PTRACE_STOP_EXIT, SigActionFlags, SIG_BLOCK, SIG_DFL, SIG_SETMASK, SIG_UNBLOCK,
@@ -57,7 +57,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
let mut image = vec![];
let mut stack_opt = None;
let mut sigstack_opt = None;
let mut tls_opt = None;
let grants;
let name;
let cwd;
@@ -202,36 +201,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
sigstack_opt = Some(new_sigstack);
}
if let Some(ref tls) = context.tls {
let mut new_tls = context::memory::Tls {
master: tls.master,
file_size: tls.file_size,
mem: context::memory::Memory::new(
VirtualAddress::new(crate::USER_TMP_TLS_OFFSET),
tls.mem.size(),
PageFlags::new().write(true),
true
),
offset: tls.offset,
};
if flags.contains(CLONE_VM) {
unsafe {
new_tls.load();
}
} else {
unsafe {
intrinsics::copy(tls.mem.start_address().data() as *const u8,
new_tls.mem.start_address().data() as *mut u8,
tls.mem.size());
}
}
new_tls.mem.remap(tls.mem.flags());
tls_opt = Some(new_tls);
}
if flags.contains(CLONE_VM) {
grants = Arc::clone(&context.grants);
} else {
@@ -352,6 +321,14 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
context.arch = arch;
// This is needed because these registers may have changed after this context was
// switched to, but before this was called.
#[cfg(all(target_arch = "x86_64", feature = "x86_fsgsbase"))]
unsafe {
context.arch.fsbase = x86::bits64::segmentation::rdfsbase() as usize;
context.arch.gsbase = x86::bits64::segmentation::rdgsbase() as usize;
}
let mut active_utable = unsafe { ActivePageTable::new(TableKind::User) };
let mut active_ktable = unsafe { ActivePageTable::new(TableKind::Kernel) };
@@ -378,10 +355,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
let mut new_ktable = unsafe {
InactivePageTable::from_address(new_utable.address())
};
#[cfg(target_arch = "x86_64")]
{
context.arch.update_tcb(pid.into());
}
// Copy kernel image mapping
{
@@ -502,15 +475,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
context.sigstack = Some(sigstack);
}
// Set up TCB
let tcb_addr = crate::USER_TCB_OFFSET + context.id.into() * PAGE_SIZE;
let mut tcb = context::memory::Memory::new(
VirtualAddress::new(tcb_addr),
PAGE_SIZE,
PageFlags::new().write(true).user(true),
true
);
#[cfg(target_arch = "aarch64")]
{
if let Some(stack) = &mut context.kstack {
@@ -534,38 +498,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
}
}
// Setup user TLS
if let Some(mut tls) = tls_opt {
// Copy TLS mapping
{
let frame = active_utable.p4()[crate::USER_TLS_PML4].pointed_frame().expect("user tls not mapped");
let flags = active_utable.p4()[crate::USER_TLS_PML4].flags();
active_utable.with(&mut new_utable, &mut temporary_upage, |mapper| {
mapper.p4_mut()[crate::USER_TLS_PML4].set(frame, flags);
});
}
// TODO: Make sure size is not greater than USER_TLS_SIZE
let tls_addr = crate::USER_TLS_OFFSET + context.id.into() * crate::USER_TLS_SIZE;
//println!("{}: Copy TLS: address 0x{:x}, size 0x{:x}", context.id.into(), tls_addr, tls.mem.size());
tls.mem.move_to(VirtualAddress::new(tls_addr), &mut new_utable, &mut temporary_upage);
unsafe {
*(tcb_addr as *mut usize) = tls.mem.start_address().data() + tls.mem.size();
}
context.tls = Some(tls);
} else {
//println!("{}: Copy TCB", context.id.into());
let parent_tcb_addr = crate::USER_TCB_OFFSET + ppid.into() * PAGE_SIZE;
unsafe {
intrinsics::copy(parent_tcb_addr as *const u8,
tcb_addr as *mut u8,
tcb.size());
}
}
tcb.move_to(VirtualAddress::new(tcb_addr), &mut new_utable, &mut temporary_upage);
context.image.push(tcb.to_shared());
context.name = name;
context.cwd = cwd;
@@ -599,13 +531,11 @@ fn empty(context: &mut context::Context, reaping: bool) {
assert!(context.image.is_empty());
assert!(context.stack.is_none());
assert!(context.sigstack.is_none());
assert!(context.tls.is_none());
} else {
// Unmap previous image, heap, grants, stack, and tls
// Unmap previous image, heap, grants, stack
context.image.clear();
drop(context.stack.take());
drop(context.sigstack.take());
drop(context.tls.take());
}
// NOTE: If we do not replace the grants `Arc`, then a strange situation can appear where the
@@ -651,10 +581,12 @@ impl Drop for ExecFile {
}
}
#[allow(clippy::too_many_arguments)]
fn fexec_noreturn(
setuid: Option<u32>,
setgid: Option<u32>,
name: Box<str>,
phdrs_region: core::ops::Range<usize>,
data: Box<[u8]>,
args: Box<[Box<[u8]>]>,
vars: Box<[Box<[u8]>]>,
@@ -664,6 +596,11 @@ fn fexec_noreturn(
let singlestep;
let mut sp = crate::USER_STACK_OFFSET + crate::USER_STACK_SIZE - 256;
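// Reserve room below the stack top for a copy of the ELF program headers;
// the AT_PHDR auxv entry pushed in fexec_kernel points at this address.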
let phdrs_len = 4096;
let phdrs_base_addr = sp - phdrs_len;
sp -= phdrs_len;
{
let (vfork, ppid, files) = {
let contexts = context::contexts();
@@ -678,6 +615,25 @@ fn fexec_noreturn(
empty(&mut context, false);
#[cfg(all(target_arch = "x86_64"))]
{
context.arch.fsbase = 0;
context.arch.gsbase = 0;
#[cfg(feature = "x86_fsgsbase")]
unsafe {
x86::bits64::segmentation::wrfsbase(0);
x86::bits64::segmentation::swapgs();
x86::bits64::segmentation::wrgsbase(0);
x86::bits64::segmentation::swapgs();
}
#[cfg(not(feature = "x86_fsgsbase"))]
unsafe {
x86::msr::wrmsr(x86::msr::IA32_FS_BASE, 0);
x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0);
}
}
if let Some(uid) = setuid {
context.euid = uid;
}
@@ -687,20 +643,10 @@ fn fexec_noreturn(
}
// Map and copy new segments
let mut tls_opt = None;
{
let elf = elf::Elf::from(&data).unwrap();
entry = elf.entry();
// Always map TCB
let tcb_addr = crate::USER_TCB_OFFSET + context.id.into() * PAGE_SIZE;
let tcb_mem = context::memory::Memory::new(
VirtualAddress::new(tcb_addr),
PAGE_SIZE,
PageFlags::new().write(true).user(true),
true
);
for segment in elf.segments() {
match segment.p_type {
program_header::PT_LOAD => {
@@ -734,45 +680,11 @@ fn fexec_noreturn(
context.image.push(memory.to_shared());
},
program_header::PT_TLS => {
let aligned_size = if segment.p_align > 0 {
((segment.p_memsz + (segment.p_align - 1))/segment.p_align) * segment.p_align
} else {
segment.p_memsz
} as usize;
let rounded_size = ((aligned_size + PAGE_SIZE - 1)/PAGE_SIZE) * PAGE_SIZE;
let rounded_offset = rounded_size - aligned_size;
// TODO: Make sure size is not greater than USER_TLS_SIZE
let tls_addr = crate::USER_TLS_OFFSET + context.id.into() * crate::USER_TLS_SIZE;
let tls = context::memory::Tls {
master: VirtualAddress::new(segment.p_vaddr as usize),
file_size: segment.p_filesz as usize,
mem: context::memory::Memory::new(
VirtualAddress::new(tls_addr),
rounded_size as usize,
PageFlags::new().write(true).user(true),
true
),
offset: rounded_offset as usize,
};
unsafe {
*(tcb_addr as *mut usize) = tls.mem.start_address().data() + tls.mem.size();
}
tls_opt = Some(tls);
},
_ => (),
}
}
context.image.push(tcb_mem.to_shared());
}
// Data no longer required, can deallocate
drop(data);
// Map stack
context.stack = Some(context::memory::Memory::new(
VirtualAddress::new(crate::USER_STACK_OFFSET),
@@ -789,20 +701,19 @@ fn fexec_noreturn(
true
));
// Map TLS
if let Some(mut tls) = tls_opt {
unsafe {
tls.load();
}
context.tls = Some(tls);
}
let mut push = |arg| {
sp -= mem::size_of::<usize>();
unsafe { *(sp as *mut usize) = arg; }
};
unsafe {
let target = core::slice::from_raw_parts_mut(phdrs_base_addr as *mut u8, phdrs_len);
target[..phdrs_region.len()].copy_from_slice(&data[phdrs_region.clone()]);
}
// Data no longer required, can deallocate
drop(data);
// Push auxiliary vector
push(AT_NULL);
for &arg in auxv.iter().rev() {
@@ -1019,7 +930,11 @@ pub fn fexec_kernel(fd: FileHandle, args: Box<[Box<[u8]>]>, vars: Box<[Box<[u8]>
auxv.push(AT_ENTRY);
auxv.push(elf.entry());
auxv.push(AT_PHDR);
auxv.push(elf.program_headers());
auxv.push(crate::USER_STACK_OFFSET + crate::USER_STACK_SIZE - 256 - 4096);
auxv.push(AT_PHENT);
auxv.push(elf.program_headers_size());
auxv.push(AT_PHNUM);
auxv.push(elf.program_header_count());
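// NOTE: the AT_PHDR value above must match phdrs_base_addr in
// fexec_noreturn (stack top - 256 - phdrs_len).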
auxv
};
@@ -1068,26 +983,19 @@ pub fn fexec_kernel(fd: FileHandle, args: Box<[Box<[u8]>]>, vars: Box<[Box<[u8]>
Some(auxv),
);
},
program_header::PT_LOAD => {
let voff = segment.p_vaddr as usize % PAGE_SIZE;
let vaddr = segment.p_vaddr as usize - voff;
// Due to the userspace and kernel TLS bases being located right above 2GB,
// limit any loadable sections to lower than that. Eventually we will need
// to replace this with a more intelligent TLS address
if vaddr >= 0x8000_0000 {
println!("exec: invalid section address {:X}", segment.p_vaddr);
return Err(Error::new(ENOEXEC));
}
},
_ => (),
}
}
let phdr_range = elf.program_headers()..elf.program_headers() + elf.program_headers_size() * elf.program_header_count();
if phdr_range.len() > 4096 {
return Err(Error::new(ENOMEM));
}
// This is the point of no return, quite literally. Any checks for validity need
// to be done before, and appropriate errors returned. Otherwise, we have nothing
// to return to.
fexec_noreturn(setuid, setgid, name.into_boxed_str(), data.into_boxed_slice(), args, vars, auxv.into_boxed_slice());
fexec_noreturn(setuid, setgid, name.into_boxed_str(), phdr_range, data.into_boxed_slice(), args, vars, auxv.into_boxed_slice());
}
pub fn fexec(fd: FileHandle, arg_ptrs: &[[usize; 2]], var_ptrs: &[[usize; 2]]) -> Result<usize> {

Submodule syscall updated: 841b5f4221...519a09e964