/*-
 * Copyright (c) 2015 Nuxi, https://nuxi.nl/
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sx.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <compat/cloudabi/cloudabi_proto.h>
#include <compat/cloudabi/cloudabi_syscalldefs.h>
#include <compat/cloudabi/cloudabi_util.h>

/*
 * Futexes for CloudABI.
 *
 * On most systems, futexes are implemented as objects of a single type
 * on which a set of operations can be performed. CloudABI makes a clear
 * distinction between locks and condition variables. A lock may have
 * zero or more associated condition variables. A condition variable is
 * always associated with exactly one lock. There is a strict topology.
 * This approach has two advantages:
 *
 * - This topology is guaranteed to be acyclic. Requeueing of threads
 *   only happens in one direction (from condition variables to locks).
 *   This eases locking.
 * - A futex object for a lock can exist even while the lock itself is
 *   unlocked, as long as threads are waiting on its associated
 *   condition variables. Threads can therefore be requeued to a lock
 *   even if the thread performing the wakeup does not have the lock
 *   mapped in its address space.
 *
 * This futex implementation provides only a single lock type, namely
 * a read-write lock. A separate mutex type is not necessary, as the
 * read-write lock is as efficient as a mutex when used as one.
 * Userspace futex locks are 32 bits in size:
 *
 * - 1 bit: has threads waiting in kernel-space.
 * - 1 bit: is write-locked.
 * - 30 bits:
 *   - if write-locked: thread ID of owner.
 *   - if not write-locked: number of read locks held.
 *
 * Condition variables are also 32 bits in size. Their value is
 * modified exclusively by kernel-space. Zero indicates that the
 * condition variable has no waiting threads; non-zero indicates the
 * opposite.
 *
 * This implementation is optimal, in the sense that it only wakes up
 * threads if they can actually continue execution. It does not suffer
 * from the thundering herd problem. If multiple threads waiting on a
 * condition variable need to be woken up, only a single thread is
 * scheduled. All other threads are 'donated' to this thread. After the
 * thread manages to reacquire the lock, it requeues its donated threads
 * to the lock.
 *
 * TODO(ed): Integrate this functionality into kern_umtx.c instead.
 * TODO(ed): Store futex objects in a hash table.
 * TODO(ed): Add actual priority inheritance.
 * TODO(ed): Let futex_queue also take priorities into account.
 * TODO(ed): Make locking fine-grained.
 * TODO(ed): Perform sleeps until an actual absolute point in time,
 *           instead of converting the timestamp to a relative value.
 */
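
/*
 * Illustrative sketch (not compiled in): with the lock word layout
 * described above, the userspace fast path for acquiring a read lock
 * needs only a single compare-and-swap; the kernel is entered once the
 * word is write-locked or kernel-managed. The CLOUDABI_LOCK_* values
 * are assumed to be the ones from cloudabi_syscalldefs.h, with
 * CLOUDABI_LOCK_WRLOCKED and CLOUDABI_LOCK_KERNEL_MANAGED occupying
 * the two top bits.
 */
#if 0
static bool
example_userspace_tryrdlock(volatile uint32_t *lock)
{
        uint32_t old = CLOUDABI_LOCK_UNLOCKED;

        while ((old & (CLOUDABI_LOCK_WRLOCKED |
            CLOUDABI_LOCK_KERNEL_MANAGED)) == 0) {
                /* Not write-locked, not kernel-managed: bump read count. */
                if (atomic_fcmpset_32(lock, &old, old + 1))
                        return (true);
        }
        /* Contended: fall back to the cloudabi_sys_lock_rdlock() path. */
        return (false);
}
#endif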

struct futex_address;
struct futex_condvar;
struct futex_lock;
struct futex_queue;
struct futex_waiter;

/* Identifier of a location in memory. */
struct futex_address {
        /* For process-private objects: address space of the process. */
        struct vmspace *                fa_vmspace;
        /* For process-shared objects: VM object containing the object. */
        struct vm_object *              fa_vmobject;

        /* Memory address within address space or offset within VM object. */
        uintptr_t                       fa_offset;
};

/* A set of waiting threads. */
struct futex_queue {
        STAILQ_HEAD(, futex_waiter)     fq_list;
        unsigned int                    fq_count;
};

/* Condition variables. */
struct futex_condvar {
        /* Address of the condition variable. */
        struct futex_address            fc_address;

        /* The lock the waiters should be moved to when signalled. */
        struct futex_lock *             fc_lock;

        /* Threads waiting on the condition variable. */
        struct futex_queue              fc_waiters;
        /*
         * Number of threads blocked on this condition variable, or
         * still blocked on the lock after being requeued.
         */
        unsigned int                    fc_waitcount;

        /* Global list pointers. */
        LIST_ENTRY(futex_condvar)       fc_next;
};

/* Read-write locks. */
struct futex_lock {
        /* Address of the lock. */
        struct futex_address            fl_address;

        /*
         * Current owner of the lock. LOCK_UNMANAGED if the lock is
         * currently not owned by the kernel. LOCK_OWNER_UNKNOWN in case
         * the owner is not known (e.g., when the lock is read-locked).
         */
        cloudabi_tid_t                  fl_owner;
#define LOCK_UNMANAGED 0x0
#define LOCK_OWNER_UNKNOWN 0x1

        /* Writers blocked on the lock. */
        struct futex_queue              fl_writers;
        /* Readers blocked on the lock. */
        struct futex_queue              fl_readers;
        /* Number of threads blocked on this lock + condition variables. */
        unsigned int                    fl_waitcount;

        /* Global list pointers. */
        LIST_ENTRY(futex_lock)          fl_next;
};

/* Information associated with a thread blocked on an object. */
struct futex_waiter {
        /* Thread ID. */
        cloudabi_tid_t                  fw_tid;
        /* Condition variable used for waiting. */
        struct cv                       fw_wait;

        /* Queue this waiter is currently placed in. */
        struct futex_queue *            fw_queue;
        /* List pointers of fw_queue. */
        STAILQ_ENTRY(futex_waiter)      fw_next;

        /* Lock has been acquired. */
        bool                            fw_locked;
        /* If not locked, threads that should block after acquiring. */
        struct futex_queue              fw_donated;
};

/* Global data structures. */
static MALLOC_DEFINE(M_FUTEX, "futex", "CloudABI futex");

static struct sx futex_global_lock;
SX_SYSINIT(futex_global_lock, &futex_global_lock, "CloudABI futex global lock");

static LIST_HEAD(, futex_lock) futex_lock_list =
    LIST_HEAD_INITIALIZER(&futex_lock_list);
static LIST_HEAD(, futex_condvar) futex_condvar_list =
    LIST_HEAD_INITIALIZER(&futex_condvar_list);

/* Utility functions. */
static void futex_lock_assert(const struct futex_lock *);
static struct futex_lock *futex_lock_lookup_locked(struct futex_address *);
static void futex_lock_release(struct futex_lock *);
static int futex_lock_tryrdlock(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_unmanage(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_update_owner(struct futex_lock *, cloudabi_lock_t *);
static int futex_lock_wake_up_next(struct futex_lock *, cloudabi_lock_t *);
static unsigned int futex_queue_count(const struct futex_queue *);
static void futex_queue_init(struct futex_queue *);
static void futex_queue_requeue(struct futex_queue *, struct futex_queue *,
    unsigned int);
static int futex_queue_sleep(struct futex_queue *, struct futex_lock *,
    struct futex_waiter *, struct thread *, cloudabi_clockid_t,
    cloudabi_timestamp_t, cloudabi_timestamp_t);
static cloudabi_tid_t futex_queue_tid_best(const struct futex_queue *);
static void futex_queue_wake_up_all(struct futex_queue *);
static void futex_queue_wake_up_best(struct futex_queue *);
static void futex_queue_wake_up_donate(struct futex_queue *, unsigned int);
static int futex_user_load(uint32_t *, uint32_t *);
static int futex_user_store(uint32_t *, uint32_t);
static int futex_user_cmpxchg(uint32_t *, uint32_t, uint32_t *, uint32_t);

/*
 * futex_address operations.
 */

static int
futex_address_create(struct futex_address *fa, struct thread *td,
    const void *object, cloudabi_mflags_t scope)
{
        struct vmspace *vs;
        struct vm_object *vo;
        vm_map_t map;
        vm_map_entry_t entry;
        vm_pindex_t pindex;
        vm_prot_t prot;
        boolean_t wired;

        /*
         * Most of the time objects are stored in privately mapped
         * anonymous memory. For these objects we wouldn't need to look
         * up the corresponding VM object. The scope hint provided by
         * userspace allows us to skip the VM map lookup for the common
         * case.
         *
         * POSIX does permit enabling PTHREAD_PROCESS_SHARED on a lock
         * stored in a private mapping, at the cost of additional
         * performance overhead. Fall back to identifying the object by
         * virtual memory address if the mapping isn't shared.
         */
        vs = td->td_proc->p_vmspace;
        switch (scope) {
        case CLOUDABI_MAP_SHARED:
                map = &vs->vm_map;
                if (vm_map_lookup(&map, (vm_offset_t)object,
                    VM_PROT_COPY | VM_PROT_WRITE, &entry, &vo, &pindex, &prot,
                    &wired) != KERN_SUCCESS)
                        return (EFAULT);

                if (entry->inheritance == VM_INHERIT_SHARE) {
                        /*
                         * Address corresponds to a shared mapping.
                         * Identify the address by its VM object.
                         */
                        fa->fa_vmspace = NULL;
                        fa->fa_vmobject = vo;
                        vm_object_reference(vo);
                        fa->fa_offset = entry->offset - entry->start +
                            (vm_offset_t)object;
                        vm_map_lookup_done(map, entry);
                        return (0);
                }
                vm_map_lookup_done(map, entry);
                /* FALLTHROUGH */
        case CLOUDABI_MAP_PRIVATE:
                /*
                 * Address corresponds to a private mapping. Never
                 * identify the address by its VM object, as shadow
                 * objects may get inserted if another thread forks.
                 * Simply use the VM space instead.
                 */
                fa->fa_vmspace = vs;
                fa->fa_vmobject = NULL;
                fa->fa_offset = (uintptr_t)object;
                return (0);
        default:
                return (EINVAL);
        }
}

static void
futex_address_free(struct futex_address *fa)
{

        if (fa->fa_vmobject != NULL)
                vm_object_deallocate(fa->fa_vmobject);
}

static bool
futex_address_match(const struct futex_address *fa1,
    const struct futex_address *fa2)
{

        /* Either fa_vmspace or fa_vmobject is NULL. */
        return (fa1->fa_vmspace == fa2->fa_vmspace &&
            fa1->fa_vmobject == fa2->fa_vmobject &&
            fa1->fa_offset == fa2->fa_offset);
}

/*
 * futex_condvar operations.
 */

static void
futex_condvar_assert(const struct futex_condvar *fc)
{

        KASSERT(fc->fc_waitcount >= futex_queue_count(&fc->fc_waiters),
            ("Total number of waiters cannot be smaller than the wait queue"));
        futex_lock_assert(fc->fc_lock);
}

static int
futex_condvar_lookup(struct thread *td, const cloudabi_condvar_t *address,
    cloudabi_mflags_t scope, struct futex_condvar **fcret)
{
        struct futex_address fa_condvar;
        struct futex_condvar *fc;
        int error;

        error = futex_address_create(&fa_condvar, td, address, scope);
        if (error != 0)
                return (error);

        sx_xlock(&futex_global_lock);
        LIST_FOREACH(fc, &futex_condvar_list, fc_next) {
                if (futex_address_match(&fc->fc_address, &fa_condvar)) {
                        /* Found matching condition variable object. */
                        futex_address_free(&fa_condvar);
                        futex_condvar_assert(fc);
                        *fcret = fc;
                        return (0);
                }
        }
        sx_xunlock(&futex_global_lock);
        futex_address_free(&fa_condvar);
        return (ENOENT);
}

static int
futex_condvar_lookup_or_create(struct thread *td,
    const cloudabi_condvar_t *condvar, cloudabi_mflags_t condvar_scope,
    const cloudabi_lock_t *lock, cloudabi_mflags_t lock_scope,
    struct futex_condvar **fcret)
{
        struct futex_address fa_condvar, fa_lock;
        struct futex_condvar *fc;
        struct futex_lock *fl;
        int error;

        error = futex_address_create(&fa_condvar, td, condvar, condvar_scope);
        if (error != 0)
                return (error);
        error = futex_address_create(&fa_lock, td, lock, lock_scope);
        if (error != 0) {
                futex_address_free(&fa_condvar);
                return (error);
        }

        sx_xlock(&futex_global_lock);
        LIST_FOREACH(fc, &futex_condvar_list, fc_next) {
                if (!futex_address_match(&fc->fc_address, &fa_condvar))
                        continue;
                fl = fc->fc_lock;
                if (!futex_address_match(&fl->fl_address, &fa_lock)) {
                        /* Condition variable is owned by a different lock. */
                        futex_address_free(&fa_condvar);
                        futex_address_free(&fa_lock);
                        sx_xunlock(&futex_global_lock);
                        return (EINVAL);
                }

                /* Found fully matching condition variable. */
                futex_address_free(&fa_condvar);
                futex_address_free(&fa_lock);
                futex_condvar_assert(fc);
                *fcret = fc;
                return (0);
        }

        /* None found. Create new condition variable object. */
        fc = malloc(sizeof(*fc), M_FUTEX, M_WAITOK);
        fc->fc_address = fa_condvar;
        fc->fc_lock = futex_lock_lookup_locked(&fa_lock);
        futex_queue_init(&fc->fc_waiters);
        fc->fc_waitcount = 0;
        LIST_INSERT_HEAD(&futex_condvar_list, fc, fc_next);
        *fcret = fc;
        return (0);
}

static void
futex_condvar_release(struct futex_condvar *fc)
{
        struct futex_lock *fl;

        futex_condvar_assert(fc);
        fl = fc->fc_lock;
        if (fc->fc_waitcount == 0) {
                /* Condition variable has no waiters. Deallocate it. */
                futex_address_free(&fc->fc_address);
                LIST_REMOVE(fc, fc_next);
                free(fc, M_FUTEX);
        }
        futex_lock_release(fl);
}

static int
futex_condvar_unmanage(struct futex_condvar *fc,
    cloudabi_condvar_t *condvar)
{

        if (futex_queue_count(&fc->fc_waiters) != 0)
                return (0);
        return (futex_user_store(condvar, CLOUDABI_CONDVAR_HAS_NO_WAITERS));
}

/*
 * futex_lock operations.
 */

static void
futex_lock_assert(const struct futex_lock *fl)
{

        /*
         * A futex lock can only be kernel-managed if it has waiters.
         * Vice versa: if a futex lock has waiters, it must be
         * kernel-managed.
         */
        KASSERT((fl->fl_owner == LOCK_UNMANAGED) ==
            (futex_queue_count(&fl->fl_readers) == 0 &&
            futex_queue_count(&fl->fl_writers) == 0),
            ("Managed locks must have waiting threads"));
        KASSERT(fl->fl_waitcount != 0 || fl->fl_owner == LOCK_UNMANAGED,
            ("Lock with no waiters must be unmanaged"));
}

static int
futex_lock_lookup(struct thread *td, const cloudabi_lock_t *address,
    cloudabi_mflags_t scope, struct futex_lock **flret)
{
        struct futex_address fa;
        int error;

        error = futex_address_create(&fa, td, address, scope);
        if (error != 0)
                return (error);

        sx_xlock(&futex_global_lock);
        *flret = futex_lock_lookup_locked(&fa);
        return (0);
}

static struct futex_lock *
futex_lock_lookup_locked(struct futex_address *fa)
{
        struct futex_lock *fl;

        LIST_FOREACH(fl, &futex_lock_list, fl_next) {
                if (futex_address_match(&fl->fl_address, fa)) {
                        /* Found matching lock object. */
                        futex_address_free(fa);
                        futex_lock_assert(fl);
                        return (fl);
                }
        }

        /* None found. Create new lock object. */
        fl = malloc(sizeof(*fl), M_FUTEX, M_WAITOK);
        fl->fl_address = *fa;
        fl->fl_owner = LOCK_UNMANAGED;
        futex_queue_init(&fl->fl_readers);
        futex_queue_init(&fl->fl_writers);
        fl->fl_waitcount = 0;
        LIST_INSERT_HEAD(&futex_lock_list, fl, fl_next);
        return (fl);
}

static int
futex_lock_rdlock(struct futex_lock *fl, struct thread *td,
    cloudabi_lock_t *lock, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
        struct futex_waiter fw;
        int error;

        error = futex_lock_tryrdlock(fl, lock);
        if (error == EBUSY) {
                /* Suspend execution. */
                KASSERT(fl->fl_owner != LOCK_UNMANAGED,
                    ("Attempted to sleep on an unmanaged lock"));
                error = futex_queue_sleep(&fl->fl_readers, fl, &fw, td,
                    clock_id, timeout, precision);
                KASSERT((error == 0) == fw.fw_locked,
                    ("Should have locked read lock on success"));
                KASSERT(futex_queue_count(&fw.fw_donated) == 0,
                    ("Lock functions cannot receive threads"));
        }
        if (error != 0)
                futex_lock_unmanage(fl, lock);
        return (error);
}

static void
futex_lock_release(struct futex_lock *fl)
{

        futex_lock_assert(fl);
        if (fl->fl_waitcount == 0) {
                /* Lock object is unreferenced. Deallocate it. */
                KASSERT(fl->fl_owner == LOCK_UNMANAGED,
                    ("Attempted to free a managed lock"));
                futex_address_free(&fl->fl_address);
                LIST_REMOVE(fl, fl_next);
                free(fl, M_FUTEX);
        }
        sx_xunlock(&futex_global_lock);
}

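/*
 * Hands a lock back to userspace control: if no threads are blocked on
 * the lock anymore, the kernel-managed bit in the userspace lock word
 * is cleared.
 */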
static int
futex_lock_unmanage(struct futex_lock *fl, cloudabi_lock_t *lock)
{
        cloudabi_lock_t cmp, old;
        int error;

        if (futex_queue_count(&fl->fl_readers) == 0 &&
            futex_queue_count(&fl->fl_writers) == 0) {
                /* Lock should be unmanaged. */
                fl->fl_owner = LOCK_UNMANAGED;

                /* Clear kernel-managed bit. */
                error = futex_user_load(lock, &old);
                if (error != 0)
                        return (error);
                for (;;) {
                        cmp = old;
                        error = futex_user_cmpxchg(lock, cmp, &old,
                            cmp & ~CLOUDABI_LOCK_KERNEL_MANAGED);
                        if (error != 0)
                                return (error);
                        if (old == cmp)
                                break;
                }
        }
        return (0);
}

/* Sets the owner of a lock, based on a userspace lock value. */
static void
futex_lock_set_owner(struct futex_lock *fl, cloudabi_lock_t lock)
{

        /* Lock has no explicit owner. */
        if ((lock & ~CLOUDABI_LOCK_WRLOCKED) == 0) {
                fl->fl_owner = LOCK_OWNER_UNKNOWN;
                return;
        }
        lock &= ~(CLOUDABI_LOCK_WRLOCKED | CLOUDABI_LOCK_KERNEL_MANAGED);

        /* Don't allow userspace to silently unlock. */
        if (lock == LOCK_UNMANAGED) {
                fl->fl_owner = LOCK_OWNER_UNKNOWN;
                return;
        }
        fl->fl_owner = lock;
}

static int
futex_lock_unlock(struct futex_lock *fl, struct thread *td,
    cloudabi_lock_t *lock)
{
        int error;

        /* Validate that this thread is allowed to unlock. */
        error = futex_lock_update_owner(fl, lock);
        if (error != 0)
                return (error);
        if (fl->fl_owner != LOCK_UNMANAGED && fl->fl_owner != td->td_tid)
                return (EPERM);
        return (futex_lock_wake_up_next(fl, lock));
}

/* Syncs in the owner of the lock from userspace if needed. */
static int
futex_lock_update_owner(struct futex_lock *fl, cloudabi_lock_t *address)
{
        cloudabi_lock_t lock;
        int error;

        if (fl->fl_owner == LOCK_OWNER_UNKNOWN) {
                error = futex_user_load(address, &lock);
                if (error != 0)
                        return (error);
                futex_lock_set_owner(fl, lock);
        }
        return (0);
}

static int
futex_lock_tryrdlock(struct futex_lock *fl, cloudabi_lock_t *address)
{
        cloudabi_lock_t old, cmp;
        int error;

        if (fl->fl_owner != LOCK_UNMANAGED) {
                /* Lock is already acquired. */
                return (EBUSY);
        }

        old = CLOUDABI_LOCK_UNLOCKED;
        for (;;) {
                if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
                        /*
                         * Userspace lock is kernel-managed, even though
                         * the kernel disagrees.
                         */
                        return (EINVAL);
                }

                if ((old & CLOUDABI_LOCK_WRLOCKED) == 0) {
                        /*
                         * Lock is not write-locked. Attempt to acquire
                         * it by increasing the read count.
                         */
                        cmp = old;
                        error = futex_user_cmpxchg(address, cmp, &old, cmp + 1);
                        if (error != 0)
                                return (error);
                        if (old == cmp) {
                                /* Success. */
                                return (0);
                        }
                } else {
                        /* Lock is write-locked. Make it kernel-managed. */
                        cmp = old;
                        error = futex_user_cmpxchg(address, cmp, &old,
                            cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
                        if (error != 0)
                                return (error);
                        if (old == cmp) {
                                /* Success. */
                                futex_lock_set_owner(fl, cmp);
                                return (EBUSY);
                        }
                }
        }
}

static int
futex_lock_trywrlock(struct futex_lock *fl, cloudabi_lock_t *address,
    cloudabi_tid_t tid, bool force_kernel_managed)
{
        cloudabi_lock_t old, new, cmp;
        int error;

        if (fl->fl_owner == tid) {
                /* Attempted to acquire lock recursively. */
                return (EDEADLK);
        }
        if (fl->fl_owner != LOCK_UNMANAGED) {
                /* Lock is already acquired. */
                return (EBUSY);
        }

        old = CLOUDABI_LOCK_UNLOCKED;
        for (;;) {
                if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
                        /*
                         * Userspace lock is kernel-managed, even though
                         * the kernel disagrees.
                         */
                        return (EINVAL);
                }
                if (old == (tid | CLOUDABI_LOCK_WRLOCKED)) {
                        /* Attempted to acquire lock recursively. */
                        return (EDEADLK);
                }

                if (old == CLOUDABI_LOCK_UNLOCKED) {
                        /* Lock is unlocked. Attempt to acquire it. */
                        new = tid | CLOUDABI_LOCK_WRLOCKED;
                        if (force_kernel_managed)
                                new |= CLOUDABI_LOCK_KERNEL_MANAGED;
                        error = futex_user_cmpxchg(address,
                            CLOUDABI_LOCK_UNLOCKED, &old, new);
                        if (error != 0)
                                return (error);
                        if (old == CLOUDABI_LOCK_UNLOCKED) {
                                /* Success. */
                                if (force_kernel_managed)
                                        fl->fl_owner = tid;
                                return (0);
                        }
                } else {
                        /* Lock is still locked. Make it kernel-managed. */
                        cmp = old;
                        error = futex_user_cmpxchg(address, cmp, &old,
                            cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
                        if (error != 0)
                                return (error);
                        if (old == cmp) {
                                /* Success. */
                                futex_lock_set_owner(fl, cmp);
                                return (EBUSY);
                        }
                }
        }
}

static int
futex_lock_wake_up_next(struct futex_lock *fl, cloudabi_lock_t *lock)
{
        cloudabi_tid_t tid;
        int error;

        /*
         * Determine which thread(s) to wake up. Prefer waking up
         * writers over readers to prevent write starvation.
         */
        if (futex_queue_count(&fl->fl_writers) > 0) {
                /* Transfer ownership to a single write-locker. */
                if (futex_queue_count(&fl->fl_writers) > 1 ||
                    futex_queue_count(&fl->fl_readers) > 0) {
                        /* Lock should remain managed afterwards. */
                        tid = futex_queue_tid_best(&fl->fl_writers);
                        error = futex_user_store(lock,
                            tid | CLOUDABI_LOCK_WRLOCKED |
                            CLOUDABI_LOCK_KERNEL_MANAGED);
                        if (error != 0)
                                return (error);

                        futex_queue_wake_up_best(&fl->fl_writers);
                        fl->fl_owner = tid;
                } else {
                        /* Lock can become unmanaged afterwards. */
                        error = futex_user_store(lock,
                            futex_queue_tid_best(&fl->fl_writers) |
                            CLOUDABI_LOCK_WRLOCKED);
                        if (error != 0)
                                return (error);

                        futex_queue_wake_up_best(&fl->fl_writers);
                        fl->fl_owner = LOCK_UNMANAGED;
                }
        } else {
                /* Transfer ownership to all read-lockers (if any). */
                error = futex_user_store(lock,
                    futex_queue_count(&fl->fl_readers));
                if (error != 0)
                        return (error);

                /* Wake up all threads. */
                futex_queue_wake_up_all(&fl->fl_readers);
                fl->fl_owner = LOCK_UNMANAGED;
        }
        return (0);
}

static int
futex_lock_wrlock(struct futex_lock *fl, struct thread *td,
    cloudabi_lock_t *lock, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision,
    struct futex_queue *donated)
{
        struct futex_waiter fw;
        int error;

        error = futex_lock_trywrlock(fl, lock, td->td_tid,
            futex_queue_count(donated) > 0);

        if (error == 0 || error == EBUSY) {
                /* Put donated threads in queue before suspending. */
                KASSERT(futex_queue_count(donated) == 0 ||
                    fl->fl_owner != LOCK_UNMANAGED,
                    ("Lock should be managed if we are going to donate"));
                futex_queue_requeue(donated, &fl->fl_writers, UINT_MAX);
        } else {
                /*
                 * This thread cannot deal with the donated threads.
                 * Wake up the next thread and let it try it by itself.
                 */
                futex_queue_wake_up_donate(donated, UINT_MAX);
        }

        if (error == EBUSY) {
                /* Suspend execution if the lock was busy. */
                KASSERT(fl->fl_owner != LOCK_UNMANAGED,
                    ("Attempted to sleep on an unmanaged lock"));
                error = futex_queue_sleep(&fl->fl_writers, fl, &fw, td,
                    clock_id, timeout, precision);
                KASSERT((error == 0) == fw.fw_locked,
                    ("Should have locked write lock on success"));
                KASSERT(futex_queue_count(&fw.fw_donated) == 0,
                    ("Lock functions cannot receive threads"));
        }
        if (error != 0)
                futex_lock_unmanage(fl, lock);
        return (error);
}

/*
 * futex_queue operations.
 */

static cloudabi_tid_t
futex_queue_tid_best(const struct futex_queue *fq)
{

        return (STAILQ_FIRST(&fq->fq_list)->fw_tid);
}

static unsigned int
futex_queue_count(const struct futex_queue *fq)
{

        return (fq->fq_count);
}

static void
futex_queue_init(struct futex_queue *fq)
{

        STAILQ_INIT(&fq->fq_list);
        fq->fq_count = 0;
}

/* Converts a relative timestamp to an sbintime. */
static sbintime_t
futex_queue_convert_timestamp_relative(cloudabi_timestamp_t ts)
{
        cloudabi_timestamp_t s, ns;

        s = ts / 1000000000;
        ns = ts % 1000000000;
        if (s > INT32_MAX)
                return (INT64_MAX);
        return ((s << 32) + (ns << 32) / 1000000000);
}
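
/*
 * Worked example: sbintime_t is a 32.32 fixed-point count of seconds,
 * so ts = 1500000000 (1.5 s expressed in nanoseconds) splits into
 * s = 1 and ns = 500000000, yielding
 * (1 << 32) + ((500000000 << 32) / 1000000000) = 0x180000000,
 * i.e., 1.5 in 32.32 fixed-point notation.
 */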

/* Converts an absolute timestamp and precision to a pair of sbintime values. */
static int
futex_queue_convert_timestamp(struct thread *td, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision,
    sbintime_t *sbttimeout, sbintime_t *sbtprecision)
{
        cloudabi_timestamp_t now;
        int error;

        /* Make the time relative. */
        error = cloudabi_clock_time_get(td, clock_id, &now);
        if (error != 0)
                return (error);
        timeout = timeout < now ? 0 : timeout - now;

        *sbttimeout = futex_queue_convert_timestamp_relative(timeout);
        *sbtprecision = futex_queue_convert_timestamp_relative(precision);
        return (0);
}

static int
futex_queue_sleep(struct futex_queue *fq, struct futex_lock *fl,
    struct futex_waiter *fw, struct thread *td, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
        sbintime_t sbttimeout, sbtprecision;
        int error;

        /* Initialize futex_waiter object. */
        fw->fw_tid = td->td_tid;
        fw->fw_locked = false;
        futex_queue_init(&fw->fw_donated);

        if (timeout != UINT64_MAX) {
                /* Convert timeout duration. */
                error = futex_queue_convert_timestamp(td, clock_id, timeout,
                    precision, &sbttimeout, &sbtprecision);
                if (error != 0)
                        return (error);
        }

        /* Place object in the queue. */
        fw->fw_queue = fq;
        STAILQ_INSERT_TAIL(&fq->fq_list, fw, fw_next);
        ++fq->fq_count;

        cv_init(&fw->fw_wait, "futex");
        ++fl->fl_waitcount;

        futex_lock_assert(fl);
        if (timeout == UINT64_MAX) {
                /* Wait without a timeout. */
                error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
        } else {
                /* Wait respecting the timeout. */
                error = cv_timedwait_sig_sbt(&fw->fw_wait, &futex_global_lock,
                    sbttimeout, sbtprecision, 0);
                futex_lock_assert(fl);
                if (error == EWOULDBLOCK &&
                    fw->fw_queue != NULL && fw->fw_queue != fq) {
                        /*
                         * We got signalled on a condition variable, but
                         * observed a timeout while waiting to reacquire
                         * the lock. In other words, we didn't actually
                         * time out. Go back to sleep and wait for the
                         * lock to be reacquired.
                         */
                        error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
                }
        }
        futex_lock_assert(fl);

        --fl->fl_waitcount;
        cv_destroy(&fw->fw_wait);

        fq = fw->fw_queue;
        if (fq == NULL) {
                /* Thread got dequeued, so we've slept successfully. */
                return (0);
        }

        /* Thread is still enqueued. Remove it. */
        KASSERT(error != 0, ("Woken up thread is still enqueued"));
        STAILQ_REMOVE(&fq->fq_list, fw, futex_waiter, fw_next);
        --fq->fq_count;
        return (error == EWOULDBLOCK ? ETIMEDOUT : error);
}

/* Moves up to nwaiters waiters from one queue to another. */
static void
futex_queue_requeue(struct futex_queue *fqfrom, struct futex_queue *fqto,
    unsigned int nwaiters)
{
        struct futex_waiter *fw;

        /* Move waiters to the target queue. */
        while (nwaiters-- > 0 && !STAILQ_EMPTY(&fqfrom->fq_list)) {
                fw = STAILQ_FIRST(&fqfrom->fq_list);
                STAILQ_REMOVE_HEAD(&fqfrom->fq_list, fw_next);
                --fqfrom->fq_count;

                fw->fw_queue = fqto;
                STAILQ_INSERT_TAIL(&fqto->fq_list, fw, fw_next);
                ++fqto->fq_count;
        }
}

/* Wakes up all waiters in a queue. */
static void
futex_queue_wake_up_all(struct futex_queue *fq)
{
        struct futex_waiter *fw;

        STAILQ_FOREACH(fw, &fq->fq_list, fw_next) {
                fw->fw_locked = true;
                fw->fw_queue = NULL;
                cv_signal(&fw->fw_wait);
        }

        STAILQ_INIT(&fq->fq_list);
        fq->fq_count = 0;
}

/*
 * Wakes up the best waiter (i.e., the waiter having the highest
 * priority) in a queue.
 */
static void
futex_queue_wake_up_best(struct futex_queue *fq)
{
        struct futex_waiter *fw;

        fw = STAILQ_FIRST(&fq->fq_list);
        fw->fw_locked = true;
        fw->fw_queue = NULL;
        cv_signal(&fw->fw_wait);

        STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next);
        --fq->fq_count;
}

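/*
 * Wakes up the first waiter in the queue, if any, donating up to
 * nwaiters of the remaining waiters to it, so that the woken thread
 * can requeue them once it has acquired the lock.
 */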
static void
futex_queue_wake_up_donate(struct futex_queue *fq, unsigned int nwaiters)
{
        struct futex_waiter *fw;

        fw = STAILQ_FIRST(&fq->fq_list);
        if (fw == NULL)
                return;
        fw->fw_locked = false;
        fw->fw_queue = NULL;
        cv_signal(&fw->fw_wait);

        STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next);
        --fq->fq_count;
        futex_queue_requeue(fq, &fw->fw_donated, nwaiters);
}

/*
 * futex_user operations. Used to adjust values in userspace.
 */

static int
futex_user_load(uint32_t *obj, uint32_t *val)
{

        return (fueword32(obj, val) != 0 ? EFAULT : 0);
}

static int
futex_user_store(uint32_t *obj, uint32_t val)
{

        return (suword32(obj, val) != 0 ? EFAULT : 0);
}

static int
futex_user_cmpxchg(uint32_t *obj, uint32_t cmp, uint32_t *old, uint32_t new)
{

        return (casueword32(obj, cmp, old, new) != 0 ? EFAULT : 0);
}
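
/*
 * A sketch of how these helpers compose (mirroring the retry loop in
 * futex_lock_unmanage()): atomically set bits in a userspace word,
 * restarting the compare-and-exchange for as long as other threads
 * race with the update. The helper below is illustrative only and not
 * compiled in.
 */
#if 0
static int
example_user_setbits(uint32_t *obj, uint32_t bits)
{
        uint32_t cmp, old;
        int error;

        error = futex_user_load(obj, &old);
        if (error != 0)
                return (error);
        for (;;) {
                cmp = old;
                error = futex_user_cmpxchg(obj, cmp, &old, cmp | bits);
                if (error != 0)
                        return (error);
                if (old == cmp)
                        return (0);     /* Update applied. */
        }
}
#endif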

/*
 * Blocking calls: acquiring locks, waiting on condition variables.
 */

int
cloudabi_futex_condvar_wait(struct thread *td, cloudabi_condvar_t *condvar,
    cloudabi_mflags_t condvar_scope, cloudabi_lock_t *lock,
    cloudabi_mflags_t lock_scope, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
        struct futex_condvar *fc;
        struct futex_lock *fl;
        struct futex_waiter fw;
        int error, error2;

        /* Look up condition variable object. */
        error = futex_condvar_lookup_or_create(td, condvar, condvar_scope, lock,
            lock_scope, &fc);
        if (error != 0)
                return (error);
        fl = fc->fc_lock;

        /*
         * Set the condition variable to something other than
         * CLOUDABI_CONDVAR_HAS_NO_WAITERS to make userspace threads
         * call into the kernel to perform wakeups.
         */
        error = futex_user_store(condvar, ~CLOUDABI_CONDVAR_HAS_NO_WAITERS);
        if (error != 0) {
                futex_condvar_release(fc);
                return (error);
        }

        /* Drop the lock. */
        error = futex_lock_unlock(fl, td, lock);
        if (error != 0) {
                futex_condvar_unmanage(fc, condvar);
                futex_condvar_release(fc);
                return (error);
        }

        /* Go to sleep. */
        ++fc->fc_waitcount;
        error = futex_queue_sleep(&fc->fc_waiters, fc->fc_lock, &fw, td,
            clock_id, timeout, precision);
        if (fw.fw_locked) {
                /* Waited and got the lock assigned to us. */
                KASSERT(futex_queue_count(&fw.fw_donated) == 0,
                    ("Received threads while being locked"));
        } else if (error == 0 || error == ETIMEDOUT) {
                if (error != 0)
                        futex_condvar_unmanage(fc, condvar);
                /*
                 * Got woken up without having the lock assigned to us.
                 * This can happen in two cases:
                 *
                 * 1. We observed a timeout on a condition variable.
                 * 2. We got signalled on a condition variable while the
                 *    associated lock is unlocked. We are the first
                 *    thread that gets woken up. This thread is
                 *    responsible for reacquiring the userspace lock.
                 */
                error2 = futex_lock_wrlock(fl, td, lock,
                    CLOUDABI_CLOCK_MONOTONIC, UINT64_MAX, 0, &fw.fw_donated);
                if (error2 != 0)
                        error = error2;
        } else {
                KASSERT(futex_queue_count(&fw.fw_donated) == 0,
                    ("Received threads on error"));
                futex_condvar_unmanage(fc, condvar);
                futex_lock_unmanage(fl, lock);
        }
        --fc->fc_waitcount;
        futex_condvar_release(fc);
        return (error);
}

int
cloudabi_futex_lock_rdlock(struct thread *td, cloudabi_lock_t *lock,
    cloudabi_mflags_t scope, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
        struct futex_lock *fl;
        int error;

        /* Look up lock object. */
        error = futex_lock_lookup(td, lock, scope, &fl);
        if (error != 0)
                return (error);

        error = futex_lock_rdlock(fl, td, lock, clock_id, timeout,
            precision);
        futex_lock_release(fl);
        return (error);
}

int
cloudabi_futex_lock_wrlock(struct thread *td, cloudabi_lock_t *lock,
    cloudabi_mflags_t scope, cloudabi_clockid_t clock_id,
    cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision)
{
        struct futex_lock *fl;
        struct futex_queue fq;
        int error;

        /* Look up lock object. */
        error = futex_lock_lookup(td, lock, scope, &fl);
        if (error != 0)
                return (error);

        futex_queue_init(&fq);
        error = futex_lock_wrlock(fl, td, lock, clock_id, timeout,
            precision, &fq);
        futex_lock_release(fl);
        return (error);
}

/*
 * Non-blocking calls: releasing locks, signalling condition variables.
 */

int
cloudabi_sys_condvar_signal(struct thread *td,
    struct cloudabi_sys_condvar_signal_args *uap)
{
        struct futex_condvar *fc;
        struct futex_lock *fl;
        cloudabi_nthreads_t nwaiters;
        int error;

        nwaiters = uap->nwaiters;
        if (nwaiters == 0) {
                /* No threads to wake up. */
                return (0);
        }

        /* Look up futex object. */
        error = futex_condvar_lookup(td, uap->condvar, uap->scope, &fc);
        if (error != 0) {
                /* Race condition: condition variable with no waiters. */
                return (error == ENOENT ? 0 : error);
        }
        fl = fc->fc_lock;

        if (fl->fl_owner == LOCK_UNMANAGED) {
                /*
                 * The lock is currently not managed by the kernel,
                 * meaning we must attempt to acquire the userspace lock
                 * first. We cannot requeue threads to an unmanaged lock,
                 * as these threads will then never be scheduled.
                 *
                 * Unfortunately, the memory address of the lock is
                 * unknown from this context, meaning that we cannot
                 * acquire the lock on behalf of the first thread to be
                 * scheduled. The lock may not even be mapped within the
                 * address space of the current thread.
                 *
                 * To solve this, wake up a single waiter that will
                 * attempt to acquire the lock. Donate all of the other
                 * waiters that need to be woken up to this waiter, so
                 * it can requeue them after acquiring the lock.
                 */
                futex_queue_wake_up_donate(&fc->fc_waiters, nwaiters - 1);
        } else {
                /*
                 * Lock is already managed by the kernel. This makes it
                 * easy, as we can requeue the threads from the
                 * condition variable directly to the associated lock.
                 */
                futex_queue_requeue(&fc->fc_waiters, &fl->fl_writers, nwaiters);
        }

        /* Clear userspace condition variable if all waiters are gone. */
        error = futex_condvar_unmanage(fc, uap->condvar);
        futex_condvar_release(fc);
        return (error);
}

int
cloudabi_sys_lock_unlock(struct thread *td,
    struct cloudabi_sys_lock_unlock_args *uap)
{
        struct futex_lock *fl;
        int error;

        error = futex_lock_lookup(td, uap->lock, uap->scope, &fl);
        if (error != 0)
                return (error);
        error = futex_lock_unlock(fl, td, uap->lock);
        futex_lock_release(fl);
        return (error);
}