2 * Copyright (c) 2015 Nuxi, https://nuxi.nl/
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
29 #include <sys/param.h>
30 #include <sys/kernel.h>
31 #include <sys/limits.h>
33 #include <sys/malloc.h>
34 #include <sys/mutex.h>
37 #include <sys/systm.h>
40 #include <vm/vm_param.h>
42 #include <vm/vm_extern.h>
43 #include <vm/vm_map.h>
44 #include <vm/vm_object.h>
46 #include <compat/cloudabi/cloudabi_proto.h>
47 #include <compat/cloudabi/cloudabi_syscalldefs.h>
48 #include <compat/cloudabi/cloudabi_util.h>
51 * Futexes for CloudABI.
53 * On most systems, futexes are implemented as objects of a single type
54 * on which a set of operations can be performed. CloudABI makes a clear
55 * distinction between locks and condition variables. A lock may have
56 * zero or more associated condition variables. A condition variable is
57 * always associated with exactly one lock. There is a strict topology.
58 * This approach has two advantages:
60 * - This topology is guaranteed to be acyclic. Requeueing of threads
61 * only happens in one direction (from condition variables to locks).
63 * - It means that a futex object for a lock exists when it is unlocked,
64 * but has threads waiting on associated condition variables. Threads
65 * can be requeued to a lock even if the thread performing the wakeup
66 * does not have the lock mapped in its address space.
68 * This futex implementation only implements a single lock type, namely
69 * a read-write lock. A regular mutex type would not be necessary, as
70 * the read-write lock is as efficient as a mutex if used as such.
71 * Userspace futex locks are 32 bits in size:
73 * - 1 bit: has threads waiting in kernel-space.
74 * - 1 bit: is write-locked.
76 * - if write-locked: thread ID of owner.
77 * - if not write-locked: number of read locks held.
79 * Condition variables are also 32 bits in size. Its value is modified
80 * by kernel-space exclusively. Zero indicates that it has no waiting
81 * threads. Non-zero indicates the opposite.
83 * This implementation is optimal, in the sense that it only wakes up
84 * threads if they can actually continue execution. It does not suffer
85 * from the thundering herd problem. If multiple threads waiting on a
86 * condition variable need to be woken up, only a single thread is
87 * scheduled. All other threads are 'donated' to this thread. After the
88 * thread manages to reacquire the lock, it requeues its donated threads
91 * TODO(ed): Integrate this functionality into kern_umtx.c instead.
92 * TODO(ed): Store futex objects in a hash table.
93 * TODO(ed): Add actual priority inheritance.
94 * TODO(ed): Let futex_queue also take priorities into account.
95 * TODO(ed): Make locking fine-grained.
96 * TODO(ed): Perform sleeps until an actual absolute point in time,
97 * instead of converting the timestamp to a relative value.
/*
 * Data structures.
 *
 * All objects below are identified by a futex_address and are kept on
 * global linked lists. Those lists, and all object state, are
 * serialized by the single futex_global_lock sx lock (see the lookup
 * functions below, which take sx_xlock(&futex_global_lock)).
 */
100 struct futex_address;
101 struct futex_condvar;
106 /* Identifier of a location in memory. */
107 struct futex_address {
108 /* For process-private objects: address space of the process. */
109 struct vmspace * fa_vmspace;
110 /* For process-shared objects: VM object containing the object. */
111 struct vm_object * fa_vmobject;
113 /* Memory address within address space or offset within VM object. */
117 /* A set of waiting threads. */
119 STAILQ_HEAD(, futex_waiter) fq_list;
/* Number of entries on fq_list; kept in sync by the queue helpers. */
120 unsigned int fq_count;
123 /* Condition variables. */
124 struct futex_condvar {
125 /* Address of the condition variable. */
126 struct futex_address fc_address;
128 /* The lock the waiters should be moved to when signalled. */
129 struct futex_lock * fc_lock;
131 /* Threads waiting on the condition variable. */
132 struct futex_queue fc_waiters;
134 * Number of threads blocked on this condition variable, or
135 * being blocked on the lock after being requeued.
137 unsigned int fc_waitcount;
139 /* Global list pointers. */
140 LIST_ENTRY(futex_condvar) fc_next;
143 /* Read-write locks. */
145 /* Address of the lock. */
146 struct futex_address fl_address;
149 * Current owner of the lock. LOCK_UNMANAGED if the lock is
150 * currently not owned by the kernel. LOCK_OWNER_UNKNOWN in case
151 * the owner is not known (e.g., when the lock is read-locked).
153 cloudabi_tid_t fl_owner;
154 #define LOCK_UNMANAGED 0x0
155 #define LOCK_OWNER_UNKNOWN 0x1
157 /* Writers blocked on the lock. */
158 struct futex_queue fl_writers;
159 /* Readers blocked on the lock. */
160 struct futex_queue fl_readers;
161 /* Number of threads blocked on this lock + condition variables. */
162 unsigned int fl_waitcount;
164 /* Global list pointers. */
165 LIST_ENTRY(futex_lock) fl_next;
168 /* Information associated with a thread blocked on an object. */
169 struct futex_waiter {
/* Thread ID of the waiter, used for lock ownership hand-off. */
171 cloudabi_tid_t fw_tid;
172 /* Condition variable used for waiting. */
175 /* Queue this waiter is currently placed in. */
176 struct futex_queue * fw_queue;
177 /* List pointers of fw_queue. */
178 STAILQ_ENTRY(futex_waiter) fw_next;
180 /* Lock has been acquired. */
182 /* If not locked, threads that should block after acquiring. */
183 struct futex_queue fw_donated;
186 /* Global data structures. */
187 static MALLOC_DEFINE(M_FUTEX, "futex", "CloudABI futex");
/* Single big lock serializing all futex state (see TODO about fine-grained locking). */
189 static struct sx futex_global_lock;
190 SX_SYSINIT(futex_global_lock, &futex_global_lock, "CloudABI futex global lock");
192 static LIST_HEAD(, futex_lock) futex_lock_list =
193 LIST_HEAD_INITIALIZER(&futex_lock_list);
194 static LIST_HEAD(, futex_condvar) futex_condvar_list =
195 LIST_HEAD_INITIALIZER(&futex_condvar_list);
/* Forward declarations for the static helpers defined below. */
197 /* Utility functions. */
198 static void futex_lock_assert(const struct futex_lock *);
199 static struct futex_lock *futex_lock_lookup_locked(struct futex_address *);
200 static void futex_lock_release(struct futex_lock *);
201 static int futex_lock_tryrdlock(struct futex_lock *, cloudabi_lock_t *);
202 static int futex_lock_unmanage(struct futex_lock *, cloudabi_lock_t *);
203 static int futex_lock_update_owner(struct futex_lock *, cloudabi_lock_t *);
204 static int futex_lock_wake_up_next(struct futex_lock *, cloudabi_lock_t *);
205 static unsigned int futex_queue_count(const struct futex_queue *);
206 static void futex_queue_init(struct futex_queue *);
207 static void futex_queue_requeue(struct futex_queue *, struct futex_queue *,
209 static int futex_queue_sleep(struct futex_queue *, struct futex_lock *,
210 struct futex_waiter *, struct thread *, cloudabi_clockid_t,
211 cloudabi_timestamp_t, cloudabi_timestamp_t);
212 static cloudabi_tid_t futex_queue_tid_best(const struct futex_queue *);
213 static void futex_queue_wake_up_all(struct futex_queue *);
214 static void futex_queue_wake_up_best(struct futex_queue *);
215 static void futex_queue_wake_up_donate(struct futex_queue *, unsigned int);
216 static int futex_user_load(uint32_t *, uint32_t *);
217 static int futex_user_store(uint32_t *, uint32_t);
218 static int futex_user_cmpxchg(uint32_t *, uint32_t, uint32_t *, uint32_t);
221 * futex_address operations.
/*
 * Translates a userspace address into a canonical futex_address
 * identity. For shared mappings the address is identified by its
 * backing VM object (plus offset), so that all processes mapping the
 * object resolve to the same futex; for private mappings it is
 * identified by the process's vmspace plus virtual address. The scope
 * argument is a userspace hint selecting which strategy to try first.
 * On the VM-object path a reference is taken on the object; release it
 * with futex_address_free(). Returns 0 or an errno-style error
 * (presumably when vm_map_lookup() fails — the failure line is not
 * visible here).
 */
225 futex_address_create(struct futex_address *fa, struct thread *td,
226 const void *object, cloudabi_mflags_t scope)
229 struct vm_object *vo;
231 vm_map_entry_t entry;
237 * Most of the time objects are stored in privately mapped
238 * anonymous memory. For these objects we wouldn't need to look
239 * up the corresponding VM object. The scope hint provided by
240 * userspace allows us to skip the VM map lookup for the common
243 * POSIX does permit enabling PTHREAD_PROCESS_SHARED on a lock
244 * stored in a private mapping, at the cost of additional
245 * performance overhead. Fall back to identifying the object by
246 * virtual memory address if the mapping isn't shared.
248 vs = td->td_proc->p_vmspace;
250 case CLOUDABI_MAP_SHARED:
252 if (vm_map_lookup(&map, (vm_offset_t)object,
253 VM_PROT_COPY | VM_PROT_WRITE, &entry, &vo, &pindex, &prot,
254 &wired) != KERN_SUCCESS)
257 if (entry->inheritance == VM_INHERIT_SHARE) {
259 * Address corresponds to a shared mapping.
260 * Identify the address by its VM object.
262 fa->fa_vmspace = NULL;
263 fa->fa_vmobject = vo;
/* Hold the object so the identity stays valid after lookup_done. */
264 vm_object_reference(vo);
265 fa->fa_offset = entry->offset - entry->start +
267 vm_map_lookup_done(map, entry);
/* Not a shared mapping after all: drop the map lock and fall through
 * to the private-mapping identification below. */
270 vm_map_lookup_done(map, entry);
272 case CLOUDABI_MAP_PRIVATE:
274 * Address corresponds to a private mapping. Never
275 * identify the address by its VM object, as shadow
276 * objects may get inserted if another thread forks.
277 * Simply use the VM space instead.
280 fa->fa_vmobject = NULL;
281 fa->fa_offset = (uintptr_t)object;
/*
 * Releases the resources held by a futex_address: drops the VM object
 * reference taken in futex_address_create(), if any.
 */
289 futex_address_free(struct futex_address *fa)
292 if (fa->fa_vmobject != NULL)
293 vm_object_deallocate(fa->fa_vmobject);
/*
 * Returns whether two futex_address identities refer to the same
 * memory location.
 */
297 futex_address_match(const struct futex_address *fa1,
298 const struct futex_address *fa2)
301 /* Either fa_vmspace or fa_vmobject is NULL. */
302 return (fa1->fa_vmspace == fa2->fa_vmspace &&
303 fa1->fa_vmobject == fa2->fa_vmobject &&
304 fa1->fa_offset == fa2->fa_offset);
308 * futex_condvar operations.
/* Sanity-checks condvar invariants; fc_waitcount also counts threads
 * already requeued to the lock, so it may exceed the queue length. */
312 futex_condvar_assert(const struct futex_condvar *fc)
315 KASSERT(fc->fc_waitcount >= futex_queue_count(&fc->fc_waiters),
316 ("Total number of waiters cannot be smaller than the wait queue"));
317 futex_lock_assert(fc->fc_lock);
/*
 * Looks up an existing condition variable object by userspace address.
 * On success *fcret is set and the global lock is left held (released
 * later via futex_condvar_release()). If no object exists, the global
 * lock is dropped and an error (presumably ENOENT — see the caller in
 * cloudabi_sys_condvar_signal) is returned.
 */
321 futex_condvar_lookup(struct thread *td, const cloudabi_condvar_t *address,
322 cloudabi_mflags_t scope, struct futex_condvar **fcret)
324 struct futex_address fa_condvar;
325 struct futex_condvar *fc;
328 error = futex_address_create(&fa_condvar, td, address, scope);
332 sx_xlock(&futex_global_lock);
333 LIST_FOREACH(fc, &futex_condvar_list, fc_next) {
334 if (futex_address_match(&fc->fc_address, &fa_condvar)) {
335 /* Found matching lock object. */
336 futex_address_free(&fa_condvar);
337 futex_condvar_assert(fc);
342 sx_xunlock(&futex_global_lock);
343 futex_address_free(&fa_condvar);
/*
 * Like futex_condvar_lookup(), but creates the condition variable
 * object (and its associated lock object) when none exists yet. Fails
 * when the condvar is already associated with a different lock
 * address.
 */
348 futex_condvar_lookup_or_create(struct thread *td,
349 const cloudabi_condvar_t *condvar, cloudabi_mflags_t condvar_scope,
350 const cloudabi_lock_t *lock, cloudabi_mflags_t lock_scope,
351 struct futex_condvar **fcret)
353 struct futex_address fa_condvar, fa_lock;
354 struct futex_condvar *fc;
355 struct futex_lock *fl;
358 error = futex_address_create(&fa_condvar, td, condvar, condvar_scope);
361 error = futex_address_create(&fa_lock, td, lock, lock_scope);
363 futex_address_free(&fa_condvar);
367 sx_xlock(&futex_global_lock);
368 LIST_FOREACH(fc, &futex_condvar_list, fc_next) {
369 if (!futex_address_match(&fc->fc_address, &fa_condvar))
372 if (!futex_address_match(&fl->fl_address, &fa_lock)) {
373 /* Condition variable is owned by a different lock. */
374 futex_address_free(&fa_condvar);
375 futex_address_free(&fa_lock);
376 sx_xunlock(&futex_global_lock);
380 /* Found fully matching condition variable. */
381 futex_address_free(&fa_condvar);
382 futex_address_free(&fa_lock);
383 futex_condvar_assert(fc);
388 /* None found. Create new condition variable object. */
389 fc = malloc(sizeof(*fc), M_FUTEX, M_WAITOK);
/* Ownership of fa_condvar moves into the object; fa_lock is consumed
 * by futex_lock_lookup_locked() below. */
390 fc->fc_address = fa_condvar;
391 fc->fc_lock = futex_lock_lookup_locked(&fa_lock);
392 futex_queue_init(&fc->fc_waiters);
393 fc->fc_waitcount = 0;
394 LIST_INSERT_HEAD(&futex_condvar_list, fc, fc_next);
/*
 * Drops the reference obtained by the lookup functions above:
 * deallocates the condvar object when it has no waiters left, then
 * releases the associated lock object (which also unlocks the global
 * lock — see futex_lock_release()).
 */
400 futex_condvar_release(struct futex_condvar *fc)
402 struct futex_lock *fl;
404 futex_condvar_assert(fc);
406 if (fc->fc_waitcount == 0) {
407 /* Condition variable has no waiters. Deallocate it. */
408 futex_address_free(&fc->fc_address);
409 LIST_REMOVE(fc, fc_next);
412 futex_lock_release(fl);
/*
 * If the condvar's wait queue is empty, writes
 * CLOUDABI_CONDVAR_HAS_NO_WAITERS back to userspace so future wakeups
 * can be done without entering the kernel.
 */
416 futex_condvar_unmanage(struct futex_condvar *fc,
417 cloudabi_condvar_t *condvar)
420 if (futex_queue_count(&fc->fc_waiters) != 0)
422 return (futex_user_store(condvar, CLOUDABI_CONDVAR_HAS_NO_WAITERS));
426 * futex_lock operations.
/* Sanity-checks lock invariants relating the managed state to the
 * presence of waiting threads. */
430 futex_lock_assert(const struct futex_lock *fl)
434 * A futex lock can only be kernel-managed if it has waiters.
435 * Vice versa: if a futex lock has waiters, it must be
438 KASSERT((fl->fl_owner == LOCK_UNMANAGED) ==
439 (futex_queue_count(&fl->fl_readers) == 0 &&
440 futex_queue_count(&fl->fl_writers) == 0),
441 ("Managed locks must have waiting threads"));
442 KASSERT(fl->fl_waitcount != 0 || fl->fl_owner == LOCK_UNMANAGED,
443 ("Lock with no waiters must be unmanaged"));
/*
 * Looks up (or creates) the lock object for a userspace lock address.
 * On success the global futex lock is left held; the caller must pair
 * this with futex_lock_release().
 */
447 futex_lock_lookup(struct thread *td, const cloudabi_lock_t *address,
448 cloudabi_mflags_t scope, struct futex_lock **flret)
450 struct futex_address fa;
453 error = futex_address_create(&fa, td, address, scope);
457 sx_xlock(&futex_global_lock);
458 *flret = futex_lock_lookup_locked(&fa);
/*
 * Lookup helper that requires futex_global_lock to be held. Consumes
 * the futex_address: it is either freed (existing object found) or its
 * ownership moves into the newly created lock object.
 */
462 static struct futex_lock *
463 futex_lock_lookup_locked(struct futex_address *fa)
465 struct futex_lock *fl;
467 LIST_FOREACH(fl, &futex_lock_list, fl_next) {
468 if (futex_address_match(&fl->fl_address, fa)) {
469 /* Found matching lock object. */
470 futex_address_free(fa);
471 futex_lock_assert(fl);
476 /* None found. Create new lock object. */
477 fl = malloc(sizeof(*fl), M_FUTEX, M_WAITOK);
478 fl->fl_address = *fa;
479 fl->fl_owner = LOCK_UNMANAGED;
480 futex_queue_init(&fl->fl_readers);
481 futex_queue_init(&fl->fl_writers);
482 fl->fl_waitcount = 0;
483 LIST_INSERT_HEAD(&futex_lock_list, fl, fl_next);
/*
 * Acquires a read lock on behalf of the calling thread, sleeping on
 * the lock's reader queue (with optional timeout) when the lock cannot
 * be acquired immediately. Returns 0 on success or an errno value
 * (e.g. ETIMEDOUT from futex_queue_sleep()).
 */
488 futex_lock_rdlock(struct futex_lock *fl, struct thread *td,
489 cloudabi_lock_t *lock, cloudabi_clockid_t clock_id,
490 cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
492 struct futex_waiter fw;
495 error = futex_lock_tryrdlock(fl, lock);
496 if (error == EBUSY) {
497 /* Suspend execution. */
498 KASSERT(fl->fl_owner != LOCK_UNMANAGED,
499 ("Attempted to sleep on an unmanaged lock"));
500 error = futex_queue_sleep(&fl->fl_readers, fl, &fw, td,
501 clock_id, timeout, precision);
502 KASSERT((error == 0) == fw.fw_locked,
503 ("Should have locked write lock on success"));
504 KASSERT(futex_queue_count(&fw.fw_donated) == 0,
505 ("Lock functions cannot receive threads"));
/* On failure, try to hand the lock back to userspace management. */
508 futex_lock_unmanage(fl, lock);
/*
 * Drops the reference on a lock object obtained through the lookup
 * functions; frees the object when no thread is waiting on it, and
 * releases the global futex lock taken by the lookup.
 */
513 futex_lock_release(struct futex_lock *fl)
516 futex_lock_assert(fl);
517 if (fl->fl_waitcount == 0) {
518 /* Lock object is unreferenced. Deallocate it. */
519 KASSERT(fl->fl_owner == LOCK_UNMANAGED,
520 ("Attempted to free a managed lock"));
521 futex_address_free(&fl->fl_address);
522 LIST_REMOVE(fl, fl_next);
525 sx_xunlock(&futex_global_lock);
/*
 * If the lock no longer has any waiting readers or writers, returns it
 * to userspace management: marks it LOCK_UNMANAGED in the kernel and
 * clears the kernel-managed bit in the userspace lock word (via a
 * compare-and-exchange, since userspace may race with us).
 */
529 futex_lock_unmanage(struct futex_lock *fl, cloudabi_lock_t *lock)
531 cloudabi_lock_t cmp, old;
534 if (futex_queue_count(&fl->fl_readers) == 0 &&
535 futex_queue_count(&fl->fl_writers) == 0) {
536 /* Lock should be unmanaged. */
537 fl->fl_owner = LOCK_UNMANAGED;
539 /* Clear kernel-managed bit. */
540 error = futex_user_load(lock, &old);
/* NOTE(review): elided lines presumably retry the cmpxchg until the
 * value matches — confirm against the full source. */
545 error = futex_user_cmpxchg(lock, cmp, &old,
546 cmp & ~CLOUDABI_LOCK_KERNEL_MANAGED);
556 /* Sets an owner of a lock, based on a userspace lock value. */
558 futex_lock_set_owner(struct futex_lock *fl, cloudabi_lock_t lock)
561 /* Lock has no explicit owner. */
562 if ((lock & ~CLOUDABI_LOCK_WRLOCKED) == 0) {
563 fl->fl_owner = LOCK_OWNER_UNKNOWN;
/* Strip flag bits; what remains is the owner TID (write-locked) or
 * the reader count (read-locked). */
566 lock &= ~(CLOUDABI_LOCK_WRLOCKED | CLOUDABI_LOCK_KERNEL_MANAGED);
568 /* Don't allow userspace to silently unlock. */
569 if (lock == LOCK_UNMANAGED) {
570 fl->fl_owner = LOCK_OWNER_UNKNOWN;
/*
 * Unlocks a lock held by the calling thread and transfers ownership
 * to the next waiter(s), if any. Fails (error path not fully visible
 * here) when the calling thread is not the owner.
 */
577 futex_lock_unlock(struct futex_lock *fl, struct thread *td,
578 cloudabi_lock_t *lock)
582 /* Validate that this thread is allowed to unlock. */
583 error = futex_lock_update_owner(fl, lock);
586 if (fl->fl_owner != LOCK_UNMANAGED && fl->fl_owner != td->td_tid)
588 return (futex_lock_wake_up_next(fl, lock));
591 /* Syncs in the owner of the lock from userspace if needed. */
593 futex_lock_update_owner(struct futex_lock *fl, cloudabi_lock_t *address)
595 cloudabi_lock_t lock;
598 if (fl->fl_owner == LOCK_OWNER_UNKNOWN) {
599 error = futex_user_load(address, &lock);
602 futex_lock_set_owner(fl, lock);
/*
 * Attempts to read-lock the userspace lock word without blocking.
 * Returns 0 on success; EBUSY tells the caller (futex_lock_rdlock) to
 * sleep. The visible body is a cmpxchg loop: 'old' starts at UNLOCKED
 * and is refreshed by each failed futex_user_cmpxchg().
 */
608 futex_lock_tryrdlock(struct futex_lock *fl, cloudabi_lock_t *address)
610 cloudabi_lock_t old, cmp;
613 if (fl->fl_owner != LOCK_UNMANAGED) {
614 /* Lock is already acquired. */
618 old = CLOUDABI_LOCK_UNLOCKED;
620 if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
622 * Userspace lock is kernel-managed, even though
623 * the kernel disagrees.
628 if ((old & CLOUDABI_LOCK_WRLOCKED) == 0) {
630 * Lock is not write-locked. Attempt to acquire
631 * it by increasing the read count.
634 error = futex_user_cmpxchg(address, cmp, &old, cmp + 1);
642 /* Lock is write-locked. Make it kernel-managed. */
644 error = futex_user_cmpxchg(address, cmp, &old,
645 cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
/* Record the write owner taken from the userspace value. */
650 futex_lock_set_owner(fl, cmp);
/*
 * Attempts to write-lock the userspace lock word for thread 'tid'
 * without blocking. force_kernel_managed makes the lock
 * kernel-managed even on immediate acquisition (used when donated
 * threads must be queued on it). Detects recursive acquisition (both
 * from kernel state and the userspace word) and returns an error for
 * it; EBUSY means the caller should sleep.
 */
658 futex_lock_trywrlock(struct futex_lock *fl, cloudabi_lock_t *address,
659 cloudabi_tid_t tid, bool force_kernel_managed)
661 cloudabi_lock_t old, new, cmp;
664 if (fl->fl_owner == tid) {
665 /* Attempted to acquire lock recursively. */
668 if (fl->fl_owner != LOCK_UNMANAGED) {
669 /* Lock is already acquired. */
673 old = CLOUDABI_LOCK_UNLOCKED;
675 if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
677 * Userspace lock is kernel-managed, even though
678 * the kernel disagrees.
682 if (old == (tid | CLOUDABI_LOCK_WRLOCKED)) {
683 /* Attempted to acquire lock recursively. */
687 if (old == CLOUDABI_LOCK_UNLOCKED) {
688 /* Lock is unlocked. Attempt to acquire it. */
689 new = tid | CLOUDABI_LOCK_WRLOCKED;
690 if (force_kernel_managed)
691 new |= CLOUDABI_LOCK_KERNEL_MANAGED;
692 error = futex_user_cmpxchg(address,
693 CLOUDABI_LOCK_UNLOCKED, &old, new);
696 if (old == CLOUDABI_LOCK_UNLOCKED) {
/* Acquired: if forced, record this thread as kernel-side owner. */
698 if (force_kernel_managed)
703 /* Lock is still locked. Make it kernel-managed. */
705 error = futex_user_cmpxchg(address, cmp, &old,
706 cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
711 futex_lock_set_owner(fl, cmp);
/*
 * Called on unlock: picks the next thread(s) to receive the lock and
 * writes the new ownership into the userspace lock word before waking
 * anyone, so a woken thread always observes a consistent value.
 * Writers are preferred over readers to avoid writer starvation.
 */
719 futex_lock_wake_up_next(struct futex_lock *fl, cloudabi_lock_t *lock)
725 * Determine which thread(s) to wake up. Prefer waking up
726 * writers over readers to prevent write starvation.
728 if (futex_queue_count(&fl->fl_writers) > 0) {
729 /* Transfer ownership to a single write-locker. */
730 if (futex_queue_count(&fl->fl_writers) > 1 ||
731 futex_queue_count(&fl->fl_readers) > 0) {
732 /* Lock should remain managed afterwards. */
733 tid = futex_queue_tid_best(&fl->fl_writers);
734 error = futex_user_store(lock,
735 tid | CLOUDABI_LOCK_WRLOCKED |
736 CLOUDABI_LOCK_KERNEL_MANAGED);
740 futex_queue_wake_up_best(&fl->fl_writers);
743 /* Lock can become unmanaged afterwards. */
744 error = futex_user_store(lock,
745 futex_queue_tid_best(&fl->fl_writers) |
746 CLOUDABI_LOCK_WRLOCKED);
750 futex_queue_wake_up_best(&fl->fl_writers);
751 fl->fl_owner = LOCK_UNMANAGED;
754 /* Transfer ownership to all read-lockers (if any). */
755 error = futex_user_store(lock,
756 futex_queue_count(&fl->fl_readers));
760 /* Wake up all threads. */
761 futex_queue_wake_up_all(&fl->fl_readers);
762 fl->fl_owner = LOCK_UNMANAGED;
/*
 * Acquires a write lock for the calling thread, optionally carrying a
 * queue of donated waiters (threads woken on a condvar that must be
 * requeued onto this lock). Sleeps on the writer queue when the lock
 * is busy. Returns 0 on success or an errno value.
 */
768 futex_lock_wrlock(struct futex_lock *fl, struct thread *td,
769 cloudabi_lock_t *lock, cloudabi_clockid_t clock_id,
770 cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision,
771 struct futex_queue *donated)
773 struct futex_waiter fw;
776 error = futex_lock_trywrlock(fl, lock, td->td_tid,
777 futex_queue_count(donated) > 0);
779 if (error == 0 || error == EBUSY) {
780 /* Put donated threads in queue before suspending. */
781 KASSERT(futex_queue_count(donated) == 0 ||
782 fl->fl_owner != LOCK_UNMANAGED,
783 ("Lock should be managed if we are going to donate"));
784 futex_queue_requeue(donated, &fl->fl_writers, UINT_MAX);
787 * This thread cannot deal with the donated threads.
788 * Wake up the next thread and let it try it by itself.
790 futex_queue_wake_up_donate(donated, UINT_MAX);
793 if (error == EBUSY) {
794 /* Suspend execution if the lock was busy. */
795 KASSERT(fl->fl_owner != LOCK_UNMANAGED,
796 ("Attempted to sleep on an unmanaged lock"));
797 error = futex_queue_sleep(&fl->fl_writers, fl, &fw, td,
798 clock_id, timeout, precision);
799 KASSERT((error == 0) == fw.fw_locked,
800 ("Should have locked write lock on success"));
801 KASSERT(futex_queue_count(&fw.fw_donated) == 0,
802 ("Lock functions cannot receive threads"));
/* On failure, try to hand the lock back to userspace management. */
805 futex_lock_unmanage(fl, lock);
810 * futex_queue operations.
/* Returns the TID of the first waiter in the queue. "Best" is
 * currently simply the head of the FIFO (see the TODO at the top of
 * the file about taking priorities into account). */
813 static cloudabi_tid_t
814 futex_queue_tid_best(const struct futex_queue *fq)
817 return (STAILQ_FIRST(&fq->fq_list)->fw_tid);
/* Returns the number of waiters in the queue. */
821 futex_queue_count(const struct futex_queue *fq)
824 return (fq->fq_count);
/* Initializes a queue to the empty state. */
828 futex_queue_init(struct futex_queue *fq)
831 STAILQ_INIT(&fq->fq_list);
835 /* Converts a relative timestamp to an sbintime. */
837 futex_queue_convert_timestamp_relative(cloudabi_timestamp_t ts)
839 cloudabi_timestamp_t s, ns;
/* Split the nanosecond timestamp into whole seconds and remainder,
 * then build the 32.32 fixed-point sbintime value. */
842 ns = ts % 1000000000;
845 return ((s << 32) + (ns << 32) / 1000000000);
/*
 * Enqueues the calling thread on queue 'fq' and blocks it on a
 * per-waiter condition variable until it is woken (dequeued by one of
 * the wake-up functions), interrupted by a signal, or times out.
 * timeout == UINT64_MAX means "no timeout". Returns 0 when the thread
 * was properly dequeued, ETIMEDOUT on timeout, or another errno from
 * cv_wait_sig(). Called with futex_global_lock held; the cv functions
 * drop and reacquire it while sleeping.
 */
869 futex_queue_sleep(struct futex_queue *fq, struct futex_lock *fl,
870 struct futex_waiter *fw, struct thread *td, cloudabi_clockid_t clock_id,
871 cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
873 sbintime_t sbttimeout, sbtprecision;
876 /* Initialize futex_waiter object. */
877 fw->fw_tid = td->td_tid;
878 fw->fw_locked = false;
879 futex_queue_init(&fw->fw_donated);
881 if (timeout != UINT64_MAX) {
882 /* Convert timeout duration. */
883 error = futex_queue_convert_timestamp(td, clock_id, timeout,
884 precision, &sbttimeout, &sbtprecision);
889 /* Place object in the queue. */
891 STAILQ_INSERT_TAIL(&fq->fq_list, fw, fw_next);
894 cv_init(&fw->fw_wait, "futex");
897 futex_lock_assert(fl);
898 if (timeout == UINT64_MAX) {
899 /* Wait without a timeout. */
900 error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
902 /* Wait respecting the timeout. */
903 error = cv_timedwait_sig_sbt(&fw->fw_wait, &futex_global_lock,
904 sbttimeout, sbtprecision, 0);
905 futex_lock_assert(fl);
906 if (error == EWOULDBLOCK &&
907 fw->fw_queue != NULL && fw->fw_queue != fq) {
909 * We got signalled on a condition variable, but
910 * observed a timeout while waiting to reacquire
911 * the lock. In other words, we didn't actually
912 * time out. Go back to sleep and wait for the
913 * lock to be reacquired.
915 error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
918 futex_lock_assert(fl);
921 cv_destroy(&fw->fw_wait);
925 /* Thread got dequeued, so we've slept successfully. */
929 /* Thread is still enqueued. Remove it. */
930 KASSERT(error != 0, ("Woken up thread is still enqueued"));
931 STAILQ_REMOVE(&fq->fq_list, fw, futex_waiter, fw_next);
/* Translate the cv layer's EWOULDBLOCK into the API's ETIMEDOUT. */
933 return (error == EWOULDBLOCK ? ETIMEDOUT : error);
936 /* Moves up to nwaiters waiters from one queue to another. */
938 futex_queue_requeue(struct futex_queue *fqfrom, struct futex_queue *fqto,
939 unsigned int nwaiters)
941 struct futex_waiter *fw;
943 /* Move waiters to the target queue. */
944 while (nwaiters-- > 0 && !STAILQ_EMPTY(&fqfrom->fq_list)) {
945 fw = STAILQ_FIRST(&fqfrom->fq_list);
946 STAILQ_REMOVE_HEAD(&fqfrom->fq_list, fw_next);
950 STAILQ_INSERT_TAIL(&fqto->fq_list, fw, fw_next);
955 /* Wakes up all waiters in a queue. */
957 futex_queue_wake_up_all(struct futex_queue *fq)
959 struct futex_waiter *fw;
961 STAILQ_FOREACH(fw, &fq->fq_list, fw_next) {
/* fw_locked = true tells the sleeper it owns the lock on wakeup. */
962 fw->fw_locked = true;
964 cv_signal(&fw->fw_wait);
967 STAILQ_INIT(&fq->fq_list);
972 * Wakes up the best waiter (i.e., the waiter having the highest
973 * priority) in a queue.
976 futex_queue_wake_up_best(struct futex_queue *fq)
978 struct futex_waiter *fw;
980 fw = STAILQ_FIRST(&fq->fq_list);
981 fw->fw_locked = true;
983 cv_signal(&fw->fw_wait);
985 STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next);
/*
 * Wakes up the first waiter without granting it the lock
 * (fw_locked = false) and donates up to nwaiters further waiters to
 * it, so it can requeue them once it acquires the lock itself.
 */
990 futex_queue_wake_up_donate(struct futex_queue *fq, unsigned int nwaiters)
992 struct futex_waiter *fw;
994 fw = STAILQ_FIRST(&fq->fq_list);
997 fw->fw_locked = false;
999 cv_signal(&fw->fw_wait);
1001 STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next);
1003 futex_queue_requeue(fq, &fw->fw_donated, nwaiters);
1007 * futex_user operations. Used to adjust values in userspace.
/* Fetches a 32-bit word from userspace; EFAULT on bad address. */
1011 futex_user_load(uint32_t *obj, uint32_t *val)
1014 return (fueword32(obj, val) != 0 ? EFAULT : 0);
/* Stores a 32-bit word to userspace; EFAULT on bad address. */
1018 futex_user_store(uint32_t *obj, uint32_t val)
1021 return (suword32(obj, val) != 0 ? EFAULT : 0);
/* Compare-and-exchange on a userspace 32-bit word; the previous value
 * is returned in *old. EFAULT on bad address. */
1025 futex_user_cmpxchg(uint32_t *obj, uint32_t cmp, uint32_t *old, uint32_t new)
1028 return (casueword32(obj, cmp, old, new) != 0 ? EFAULT : 0);
1032 * Blocking calls: acquiring locks, waiting on condition variables.
/*
 * Implements condvar wait: marks the userspace condvar as having
 * waiters, drops the associated write lock, sleeps on the condvar
 * queue, and — when woken without the lock handed to us, or on
 * timeout — reacquires the write lock (carrying any donated waiters)
 * before returning. Returns 0 or an errno value (ETIMEDOUT, EFAULT,
 * ...).
 */
1036 cloudabi_futex_condvar_wait(struct thread *td, cloudabi_condvar_t *condvar,
1037 cloudabi_mflags_t condvar_scope, cloudabi_lock_t *lock,
1038 cloudabi_mflags_t lock_scope, cloudabi_clockid_t clock_id,
1039 cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
1041 struct futex_condvar *fc;
1042 struct futex_lock *fl;
1043 struct futex_waiter fw;
1046 /* Lookup condition variable object. */
1047 error = futex_condvar_lookup_or_create(td, condvar, condvar_scope, lock,
1054 * Set the condition variable to something other than
1055 * CLOUDABI_CONDVAR_HAS_NO_WAITERS to make userspace threads
1056 * call into the kernel to perform wakeups.
1058 error = futex_user_store(condvar, ~CLOUDABI_CONDVAR_HAS_NO_WAITERS);
1060 futex_condvar_release(fc);
1064 /* Drop the lock. */
1065 error = futex_lock_unlock(fl, td, lock);
/* Unlock failed: undo the waiter marking and bail out. */
1067 futex_condvar_unmanage(fc, condvar);
1068 futex_condvar_release(fc);
1074 error = futex_queue_sleep(&fc->fc_waiters, fc->fc_lock, &fw, td,
1075 clock_id, timeout, precision);
1077 /* Waited and got the lock assigned to us. */
1078 KASSERT(futex_queue_count(&fw.fw_donated) == 0,
1079 ("Received threads while being locked"));
1080 } else if (error == 0 || error == ETIMEDOUT) {
/* Not dequeued with the lock: clear the condvar waiter flag if the
 * queue drained, then reacquire the lock ourselves. */
1082 futex_condvar_unmanage(fc, condvar);
1084 * Got woken up without having the lock assigned to us.
1085 * This can happen in two cases:
1087 * 1. We observed a timeout on a condition variable.
1088 * 2. We got signalled on a condition variable while the
1089 * associated lock is unlocked. We are the first
1090 * thread that gets woken up. This thread is
1091 * responsible for reacquiring the userspace lock.
1093 error2 = futex_lock_wrlock(fl, td, lock,
/* Reacquisition never times out; the monotonic clock with UINT64_MAX
 * timeout means "wait forever". */
1094 CLOUDABI_CLOCK_MONOTONIC, UINT64_MAX, 0, &fw.fw_donated);
1098 KASSERT(futex_queue_count(&fw.fw_donated) == 0,
1099 ("Received threads on error"));
1100 futex_condvar_unmanage(fc, condvar);
1101 futex_lock_unmanage(fl, lock);
1104 futex_condvar_release(fc);
/*
 * Entry point: read-lock a userspace lock, blocking with the given
 * timeout/precision if necessary.
 */
1109 cloudabi_futex_lock_rdlock(struct thread *td, cloudabi_lock_t *lock,
1110 cloudabi_mflags_t scope, cloudabi_clockid_t clock_id,
1111 cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
1113 struct futex_lock *fl;
1116 /* Look up lock object. */
1117 error = futex_lock_lookup(td, lock, scope, &fl);
1121 error = futex_lock_rdlock(fl, td, lock, clock_id, timeout,
1123 futex_lock_release(fl);
/*
 * Entry point: write-lock a userspace lock, blocking with the given
 * timeout/precision if necessary. Starts with an empty donated-thread
 * queue, unlike the condvar-wait path.
 */
1128 cloudabi_futex_lock_wrlock(struct thread *td, cloudabi_lock_t *lock,
1129 cloudabi_mflags_t scope, cloudabi_clockid_t clock_id,
1130 cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
1132 struct futex_lock *fl;
1133 struct futex_queue fq;
1136 /* Look up lock object. */
1137 error = futex_lock_lookup(td, lock, scope, &fl);
1141 futex_queue_init(&fq);
1142 error = futex_lock_wrlock(fl, td, lock, clock_id, timeout,
1144 futex_lock_release(fl);
1149 * Non-blocking calls: releasing locks, signalling condition variables.
/*
 * Syscall: wake up to uap->nwaiters threads blocked on a condition
 * variable. Waiters are moved to the associated lock's writer queue
 * when the lock is kernel-managed; otherwise one waiter is woken to
 * reacquire the lock with the remaining waiters donated to it.
 */
1153 cloudabi_sys_condvar_signal(struct thread *td,
1154 struct cloudabi_sys_condvar_signal_args *uap)
1156 struct futex_condvar *fc;
1157 struct futex_lock *fl;
1158 cloudabi_nthreads_t nwaiters;
1161 nwaiters = uap->nwaiters;
1162 if (nwaiters == 0) {
1163 /* No threads to wake up. */
1167 /* Look up futex object. */
1168 error = futex_condvar_lookup(td, uap->condvar, uap->scope, &fc);
1170 /* Race condition: condition variable with no waiters. */
1171 return (error == ENOENT ? 0 : error);
1175 if (fl->fl_owner == LOCK_UNMANAGED) {
1177 * The lock is currently not managed by the kernel,
1178 * meaning we must attempt to acquire the userspace lock
1179 * first. We cannot requeue threads to an unmanaged lock,
1180 * as these threads will then never be scheduled.
1182 * Unfortunately, the memory address of the lock is
1183 * unknown from this context, meaning that we cannot
1184 * acquire the lock on behalf of the first thread to be
1185 * scheduled. The lock may even not be mapped within the
1186 * address space of the current thread.
1188 * To solve this, wake up a single waiter that will
1189 * attempt to acquire the lock. Donate all of the other
1190 * waiters that need to be woken up to this waiter, so
1191 * it can requeue them after acquiring the lock.
1193 futex_queue_wake_up_donate(&fc->fc_waiters, nwaiters - 1);
1196 * Lock is already managed by the kernel. This makes it
1197 * easy, as we can requeue the threads from the
1198 * condition variable directly to the associated lock.
1200 futex_queue_requeue(&fc->fc_waiters, &fl->fl_writers, nwaiters);
1203 /* Clear userspace condition variable if all waiters are gone. */
1204 error = futex_condvar_unmanage(fc, uap->condvar);
1205 futex_condvar_release(fc);
1210 cloudabi_sys_lock_unlock(struct thread *td,
1211 struct cloudabi_sys_lock_unlock_args *uap)
1213 struct futex_lock *fl;
1216 error = futex_lock_lookup(td, uap->lock, uap->scope, &fl);
1219 error = futex_lock_unlock(fl, td, uap->lock);
1220 futex_lock_release(fl);