From fec8f4aa0c2f0eb1de14603c38646b80dde7d3bc Mon Sep 17 00:00:00 2001
From: 4lDO2 <4lDO2@protonmail.com>
Date: Wed, 3 Feb 2021 17:37:09 +0100
Subject: [PATCH 1/3] Use physical addresses internally for futexes.

This fixes a bug that allowed a futex wakeup call in one process to
target a waiting context in a completely different address space:
futexes were keyed by the virtual address of the futex word alone, so
unrelated processes that happened to use the same virtual address could
wake (or requeue) each other's waiters.
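To illustrate why the futex key has to be physical rather than virtual,
here is a minimal, self-contained sketch. The types are hypothetical
stand-ins, not kernel code: `AddressSpace::translate` plays the role of
the kernel's `ActivePageTable::translate`.

    use std::collections::HashMap;

    /// One process's page table, modeled as virtual page -> physical frame.
    struct AddressSpace {
        mappings: HashMap<usize, usize>,
    }

    impl AddressSpace {
        /// Stand-in for the kernel's ActivePageTable::translate.
        fn translate(&self, vaddr: usize) -> Option<usize> {
            let (page, offset) = (vaddr & !0xFFF, vaddr & 0xFFF);
            self.mappings.get(&page).map(|frame| frame + offset)
        }
    }

    fn main() {
        // Two processes both put their futex word at virtual 0x1000, but
        // the pages are backed by *different* physical frames:
        let a = AddressSpace { mappings: HashMap::from([(0x1000, 0x8000)]) };
        let b = AddressSpace { mappings: HashMap::from([(0x1000, 0x9000)]) };

        let (vaddr_in_a, vaddr_in_b) = (0x1004, 0x1004);

        // Keyed by virtual address, a FUTEX_WAKE in A matches a waiter in B:
        assert_eq!(vaddr_in_a, vaddr_in_b);

        // Keyed by physical address, there is no false match:
        assert_ne!(a.translate(vaddr_in_a), b.translate(vaddr_in_b));
    }

Note that two address spaces which genuinely alias the same frame (e.g.
shared memory) still compare equal under physical keys, and waking
across address spaces is exactly what we want in that case.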
---
 src/syscall/futex.rs | 49 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/src/syscall/futex.rs b/src/syscall/futex.rs
index ee50c43..9e04feb 100644
--- a/src/syscall/futex.rs
+++ b/src/syscall/futex.rs
@@ -9,12 +9,19 @@ use spin::{Once, RwLock, RwLockReadGuard, RwLockWriteGuard};
 use crate::context::{self, Context};
 use crate::time;
+use crate::memory::PhysicalAddress;
+use crate::paging::{ActivePageTable, VirtualAddress};
 use crate::syscall::data::TimeSpec;
-use crate::syscall::error::{Error, Result, ESRCH, EAGAIN, EINVAL};
+use crate::syscall::error::{Error, Result, ESRCH, EAGAIN, EFAULT, EINVAL};
 use crate::syscall::flag::{FUTEX_WAIT, FUTEX_WAKE, FUTEX_REQUEUE};
 use crate::syscall::validate::{validate_slice, validate_slice_mut};
 
-type FutexList = VecDeque<(usize, Arc<RwLock<Context>>)>;
+type FutexList = VecDeque<FutexEntry>;
+
+pub struct FutexEntry {
+    target_physaddr: PhysicalAddress,
+    context_lock: Arc<RwLock<Context>>,
+}
 
 /// Fast userspace mutex list
 static FUTEXES: Once<RwLock<FutexList>> = Once::new();
@@ -34,7 +41,17 @@ pub fn futexes_mut() -> RwLockWriteGuard<'static, FutexList> {
     FUTEXES.call_once(init_futexes).write()
 }
 
+// FIXME: Don't take a mutable reference to the addr, since rustc can make assumptions that the
+// pointee cannot be changed by another thread, which could make atomic ops useless.
 pub fn futex(addr: &mut i32, op: usize, val: i32, val2: usize, addr2: *mut i32) -> Result<usize> {
+    let target_physaddr = unsafe {
+        let mut active_table = ActivePageTable::new();
+        let virtual_address = VirtualAddress::new(addr as *mut i32 as usize);
+
+        // FIXME: Already validated in syscall/mod.rs
+        active_table.translate(virtual_address).ok_or(Error::new(EFAULT))?
+    };
+
     match op {
         FUTEX_WAIT => {
             let timeout_opt = if val2 != 0 {
@@ -69,7 +86,10 @@ pub fn futex(addr: &mut i32, op: usize, val: i32, val2: usize, addr2: *mut i32)
                     context.block("futex");
                 }
 
-                futexes.push_back((addr as *mut i32 as usize, context_lock));
+                futexes.push_back(FutexEntry {
+                    target_physaddr,
+                    context_lock,
+                });
             }
 
             unsafe { context::switch(); }
@@ -97,9 +117,10 @@ pub fn futex(addr: &mut i32, op: usize, val: i32, val2: usize, addr2: *mut i32)
             let mut i = 0;
             while i < futexes.len() && (woken as i32) < val {
-                if futexes[i].0 == addr as *mut i32 as usize {
+                if futexes[i].target_physaddr == target_physaddr {
                     if let Some(futex) = futexes.swap_remove_back(i) {
-                        futex.1.write().unblock();
+                        let mut context_guard = futex.context_lock.write();
+                        context_guard.unblock();
                         woken += 1;
                     }
                 } else {
@@ -111,7 +132,15 @@ pub fn futex(addr: &mut i32, op: usize, val: i32, val2: usize, addr2: *mut i32)
             Ok(woken)
         },
         FUTEX_REQUEUE => {
-            let addr2_safe = validate_slice_mut(addr2, 1).map(|addr2_safe| &mut addr2_safe[0])?;
+            let addr2_physaddr = unsafe {
+                let mut active_table = ActivePageTable::new();
+
+                let addr2_safe = validate_slice_mut(addr2, 1).map(|addr2_safe| &mut addr2_safe[0])?;
+                let addr2_virt = VirtualAddress::new(addr2_safe as *mut i32 as usize);
+
+                // FIXME: Already validated.
+                active_table.translate(addr2_virt).ok_or(Error::new(EFAULT))?
+            };
 
             let mut woken = 0;
             let mut requeued = 0;
@@ -121,9 +150,9 @@ pub fn futex(addr: &mut i32, op: usize, val: i32, val2: usize, addr2: *mut i32)
             let mut i = 0;
             while i < futexes.len() && (woken as i32) < val {
-                if futexes[i].0 == addr as *mut i32 as usize {
+                if futexes[i].target_physaddr == target_physaddr {
                     if let Some(futex) = futexes.swap_remove_back(i) {
-                        futex.1.write().unblock();
+                        futex.context_lock.write().unblock();
                         woken += 1;
                     }
                 } else {
@@ -131,8 +160,8 @@ pub fn futex(addr: &mut i32, op: usize, val: i32, val2: usize, addr2: *mut i32)
                 }
             }
             while i < futexes.len() && requeued < val2 {
-                if futexes[i].0 == addr as *mut i32 as usize {
-                    futexes[i].0 = addr2_safe as *mut i32 as usize;
+                if futexes[i].target_physaddr == target_physaddr {
+                    futexes[i].target_physaddr = addr2_physaddr;
                     requeued += 1;
                 }
                 i += 1;

From 44527a8340f17a6ac023e8e7b28b4a2a8cda0a88 Mon Sep 17 00:00:00 2001
From: 4lDO2 <4lDO2@protonmail.com>
Date: Mon, 11 Jan 2021 10:58:20 +0100
Subject: [PATCH 2/3] Fix a very annoying multi_core data race*.

So, when I first introduced io_uring, it was not compiled with the
`multi_core` kernel feature, mainly to make development easier (or so I
thought). However, since io_uring allows multiple simultaneous system
calls, we can no longer make the in-kernel contexts block, for example
when receiving a message from a pipe, if there can be multiple such
requests in flight simultaneously.

This required changing WaitCondition to allow multiple simultaneous
waiters, although it introduces a potential race condition: since a
future can only return Pending, and cannot block directly before
releasing the lock (condvar logic), we need some way to make sure that
nothing happens between the moment the context finds out that it has to
wait, and the actual waiting. If a message is pushed in between, and
the waker (Context::unblock) is called just before the context was
going to block itself, then we miss the message and potentially cause a
deadlock.

Fortunately, blocking and unblocking a context both require exclusively
locking that context. So, to ensure that waking a context while it is
running is no longer a no-op, we introduce a "wake flag", which is set
only if the context is currently running and Runnable.

But this still caused all kinds of weird, hard-to-debug problems, with
arbitrary CPU exceptions and possibly memory corruption. The reason is
that the context-switching logic uses really unsafe operations, which
is why context switching (at the moment) requires an exclusive lock.
Before this commit, the `running` field was modified after the lock had
been released, which obviously can cause a data race when the regular
context-waker code, running within a system call, locks the context but
not the global switching lock.

The solution was to make sure the locks are held all the way until the
actual switch, which is done in assembly. There can still be a race
condition here, since memory containing registers is modified after the
lock has been released, even though that memory may be behind &mut on
another context, which can be UB; but it has not contributed to any
actual bugs... yet.

* I have not yet done rigorous testing, but it appears to work well
enough, and I have not encountered the bug after about 10 tries.
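To make the lost-wakeup window concrete, here is a minimal sketch of
the wake-flag idea described above. The types are hypothetical
simplifications: plain std atomics stand in for the kernel's Context
fields, which in the real kernel are protected by the context's
exclusive RwLock rather than by the atomics themselves.

    use std::sync::atomic::{AtomicBool, Ordering};

    // Hypothetical stand-in for the kernel's Context state.
    struct Context {
        running: AtomicBool, // currently executing on some CPU
        blocked: AtomicBool, // roughly Status::Blocked
        wake: AtomicBool,    // a wakeup arrived while still running
    }

    impl Context {
        // Waker side (e.g. after pushing a message to a pipe).
        fn unblock(&self) {
            if self.running.load(Ordering::SeqCst) {
                // The target has decided to sleep but has not switched away
                // yet; record the wakeup instead of dropping it as a no-op.
                self.wake.store(true, Ordering::SeqCst);
            } else {
                self.blocked.store(false, Ordering::SeqCst);
            }
        }

        // Sleeper side, just before context::switch(). Returns false if a
        // wakeup raced in and the context must not go to sleep.
        fn block(&self) -> bool {
            if self.wake.swap(false, Ordering::SeqCst) {
                return false;
            }
            self.blocked.store(true, Ordering::SeqCst);
            true
        }
    }

    fn main() {
        let ctx = Context {
            running: AtomicBool::new(true),
            blocked: AtomicBool::new(false),
            wake: AtomicBool::new(false),
        };
        // A message arrives between "must wait" and the actual switch...
        ctx.unblock();
        // ...so the subsequent block() refuses to sleep instead of
        // deadlocking.
        assert!(!ctx.block());
    }

In the kernel the check-and-set is made atomic by holding the context's
write lock across both transitions; the atomics here only keep the
example self-contained.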
---
 src/context/list.rs    |   3 +
 src/context/switch.rs  | 138 +++++++++++++++++++++++------------------
 src/syscall/process.rs |   2 +-
 3 files changed, 81 insertions(+), 62 deletions(-)

diff --git a/src/context/list.rs b/src/context/list.rs
index d56b0e2..a1a2895 100644
--- a/src/context/list.rs
+++ b/src/context/list.rs
@@ -47,6 +47,9 @@ impl ContextList {
     pub fn iter(&self) -> ::alloc::collections::btree_map::Iter<ContextId, Arc<RwLock<Context>>> {
         self.map.iter()
     }
+    pub fn range(&self, range: impl core::ops::RangeBounds<ContextId>) -> ::alloc::collections::btree_map::Range<'_, ContextId, Arc<RwLock<Context>>> {
+        self.map.range(range)
+    }
 
     /// Create a new context.
     pub fn new_context(&mut self) -> Result<&Arc<RwLock<Context>>> {
diff --git a/src/context/switch.rs b/src/context/switch.rs
index 877033d..75d97b6 100644
--- a/src/context/switch.rs
+++ b/src/context/switch.rs
@@ -1,5 +1,8 @@
+use core::ops::Bound;
 use core::sync::atomic::Ordering;
 
+use alloc::sync::Arc;
+
 use crate::context::signal::signal_handler;
 use crate::context::{arch, contexts, Context, Status, CONTEXT_ID};
 use crate::gdt;
@@ -72,8 +75,6 @@ unsafe fn runnable(context: &Context, cpu_id: usize) -> bool {
 ///
 /// Do not call this while holding locks!
 pub unsafe fn switch() -> bool {
-    use core::ops::DerefMut;
-
     //set PIT Interrupt counter to 0, giving each process same amount of PIT ticks
     let ticks = PIT_TICKS.swap(0, Ordering::SeqCst);
@@ -84,93 +85,108 @@ pub unsafe fn switch() -> bool {
 
     let cpu_id = crate::cpu_id();
 
-    let from_ptr;
-    let mut to_ptr = 0 as *mut Context;
+    let from_context_lock;
+    let mut from_context_guard;
+    let mut to_context_lock: Option<(Arc<RwLock<Context>>, *mut Context)> = None;
     let mut to_sig = None;
 
     {
         let contexts = contexts();
         {
-            let context_lock = contexts
+            from_context_lock = Arc::clone(contexts
                 .current()
-                .expect("context::switch: not inside of context");
-            let mut context = context_lock.write();
-            context.ticks += ticks as u64 + 1; // Always round ticks up
-            from_ptr = context.deref_mut() as *mut Context;
-        }
-
-        macro_rules! to {
-            ($context:expr) => {{
-                let context: &mut Context = $context;
-                if runnable(context, cpu_id) {
-                    to_ptr = context as *mut Context;
-                    if context.ksig.is_none() {
-                        to_sig = context.pending.pop_front();
-                    }
-                    true
-                } else {
-                    false
-                }
-            }};
-        };
-
-        for (_pid, context_lock) in contexts.iter() {
-            let mut context = context_lock.write();
-            update(&mut context, cpu_id);
+                .expect("context::switch: not inside of context"));
+            from_context_guard = from_context_lock.write();
+            from_context_guard.ticks += ticks as u64 + 1; // Always round ticks up
         }
 
         for (pid, context_lock) in contexts.iter() {
-            if *pid > (*from_ptr).id {
-                let mut context = context_lock.write();
-                if to!(&mut context) {
-                    break;
-                }
-            }
+            let mut context;
+            let context_ref = if *pid == from_context_guard.id {
+                &mut *from_context_guard
+            } else {
+                context = context_lock.write();
+                &mut *context
+            };
+            update(context_ref, cpu_id);
         }
 
-        if to_ptr as usize == 0 {
-            for (pid, context_lock) in contexts.iter() {
-                if *pid < (*from_ptr).id {
-                    let mut context = context_lock.write();
-                    if to!(&mut context) {
-                        break;
-                    }
+        for (_pid, context_lock) in contexts
+            // Include all contexts with IDs greater than the current...
+            .range(
+                (Bound::Excluded(from_context_guard.id), Bound::Unbounded)
+            )
+            .chain(contexts
+                // ... and all contexts with IDs less than the current...
+                .range((Bound::Unbounded, Bound::Excluded(from_context_guard.id)))
+            )
+            // ... but not the current context, which is already locked
+        {
+            let context_lock = Arc::clone(context_lock);
+            let mut to_context_guard = context_lock.write();
+
+            if runnable(&*to_context_guard, cpu_id) {
+                if to_context_guard.ksig.is_none() {
+                    to_sig = to_context_guard.pending.pop_front();
                 }
+                let ptr: *mut Context = &mut *to_context_guard;
+                core::mem::forget(to_context_guard);
+                to_context_lock = Some((context_lock, ptr));
+                break;
+            } else {
+                continue;
             }
         }
     };
 
     // Switch process states, TSS stack pointer, and store new context ID
-    if to_ptr as usize != 0 {
-        (*from_ptr).running = false;
-        (*to_ptr).running = true;
-        if let Some(ref stack) = (*to_ptr).kstack {
+    if let Some((to_context_lock, to_ptr)) = to_context_lock {
+        let to_context: &mut Context = &mut *to_ptr;
+
+        from_context_guard.running = false;
+        to_context.running = true;
+        if let Some(ref stack) = to_context.kstack {
             gdt::set_tss_stack(stack.as_ptr() as usize + stack.len());
         }
-        gdt::set_tcb((*to_ptr).id.into());
-        CONTEXT_ID.store((*to_ptr).id, Ordering::SeqCst);
-    }
+        gdt::set_tcb(to_context.id.into());
+        CONTEXT_ID.store(to_context.id, Ordering::SeqCst);
 
-    if to_ptr as usize == 0 {
-        // No target was found, unset global lock and return
-        arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
-
-        false
-    } else {
         if let Some(sig) = to_sig {
             // Signal was found, run signal handler
             //TODO: Allow nested signals
-            assert!((*to_ptr).ksig.is_none());
+            assert!(to_context.ksig.is_none());
 
-            let arch = (*to_ptr).arch.clone();
-            let kfx = (*to_ptr).kfx.clone();
-            let kstack = (*to_ptr).kstack.clone();
-            (*to_ptr).ksig = Some((arch, kfx, kstack, sig));
-            (*to_ptr).arch.signal_stack(signal_handler, sig);
+            let arch = to_context.arch.clone();
+            let kfx = to_context.kfx.clone();
+            let kstack = to_context.kstack.clone();
+            to_context.ksig = Some((arch, kfx, kstack, sig));
+            to_context.arch.signal_stack(signal_handler, sig);
         }
 
+        let from_ptr: *mut Context = &mut *from_context_guard;
+        let to_ptr: *mut Context = &mut *to_context;
+
+        // FIXME: Ensure that this critical section is somehow still protected by the lock, and not
+        // just for other processes' context switching, but for other operations which do not
+        // require CONTEXT_SWITCH_LOCK.
+        //
+        // What I suggest, is that we wrap the Context struct (typically stored as
+        // `Arc<RwLock<Context>>`), into a wrapper with interior locking for the inner context type
+        // (which could be something like `RwLock<ContextInner>`). The wrapper would also contain
+        // a field of type `UnsafeCell<arch::Context>`, which would be accessible if and only if
+        // the `running` field is set to false, making that field function as a lock.
+        drop(from_context_guard);
+        drop(from_context_lock);
+        to_context_lock.force_write_unlock();
+        drop(to_context_lock);
+
         (*from_ptr).arch.switch_to(&mut (*to_ptr).arch);
 
         true
+    } else {
+        // No target was found, unset global lock and return
+        arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
+
+        false
     }
 }
diff --git a/src/syscall/process.rs b/src/syscall/process.rs
index dfed1b1..b10e00b 100644
--- a/src/syscall/process.rs
+++ b/src/syscall/process.rs
@@ -46,7 +46,7 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
     let ens;
     let umask;
     let sigmask;
-    let cpu_id_opt = None;
+    let mut cpu_id_opt = None;
     let arch;
     let vfork;
     let mut kfx_opt = None;
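A note on the guard handoff in switch() above: the core::mem::forget /
force_write_unlock pair is what keeps the target context write-locked
across the scope boundary until just before the assembly switch. Here
is a minimal sketch of that pattern in isolation (assumes the `spin`
crate, whose RwLock provides the unsafe force_write_unlock used by this
patch; `Context` here is a hypothetical stand-in):

    use spin::RwLock;

    struct Context {
        running: bool,
    }

    fn main() {
        let lock = RwLock::new(Context { running: false });

        // Take the write lock, then smuggle it past the end of the guard's
        // scope: keep a raw pointer and forget the guard, so the lock
        // stays held without any live borrow of `lock`.
        let mut guard = lock.write();
        guard.running = true;
        let ptr: *mut Context = &mut *guard;
        std::mem::forget(guard);

        // ... the critical section continues; no one else can lock it ...

        unsafe {
            // Finish the unsafe accesses through `ptr`, then release the
            // lock that the forgotten guard was still holding.
            (*ptr).running = false;
            lock.force_write_unlock();
        }
    }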
From 6f3fc3a4f40c7120ee34fc147976f1b00b7ffed1 Mon Sep 17 00:00:00 2001
From: 4lDO2 <4lDO2@protonmail.com>
Date: Wed, 3 Feb 2021 18:10:39 +0100
Subject: [PATCH 3/3] Make cpu_id_opt non-mutable.

---
 src/syscall/process.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/syscall/process.rs b/src/syscall/process.rs
index b10e00b..dfed1b1 100644
--- a/src/syscall/process.rs
+++ b/src/syscall/process.rs
@@ -46,7 +46,7 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
     let ens;
     let umask;
     let sigmask;
-    let mut cpu_id_opt = None;
+    let cpu_id_opt = None;
     let arch;
     let vfork;
     let mut kfx_opt = None;