]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/compat/cloudabi/cloudabi_futex.c
Update ncurses to 20200118
[FreeBSD/FreeBSD.git] / sys / compat / cloudabi / cloudabi_futex.c
1 /*-
2  * Copyright (c) 2015 Nuxi, https://nuxi.nl/
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28
29 #include <sys/param.h>
30 #include <sys/kernel.h>
31 #include <sys/limits.h>
32 #include <sys/lock.h>
33 #include <sys/malloc.h>
34 #include <sys/mutex.h>
35 #include <sys/proc.h>
36 #include <sys/sx.h>
37 #include <sys/systm.h>
38 #include <sys/umtx.h>
39
40 #include <contrib/cloudabi/cloudabi_types_common.h>
41
42 #include <compat/cloudabi/cloudabi_proto.h>
43 #include <compat/cloudabi/cloudabi_util.h>
44
45 /*
46  * Futexes for CloudABI.
47  *
48  * On most systems, futexes are implemented as objects of a single type
49  * on which a set of operations can be performed. CloudABI makes a clear
50  * distinction between locks and condition variables. A lock may have
51  * zero or more associated condition variables. A condition variable is
52  * always associated with exactly one lock. There is a strict topology.
53  * This approach has two advantages:
54  *
55  * - This topology is guaranteed to be acyclic. Requeueing of threads
56  *   only happens in one direction (from condition variables to locks).
57  *   This eases locking.
58  * - It means that a futex object for a lock exists when it is unlocked,
59  *   but has threads waiting on associated condition variables. Threads
60  *   can be requeued to a lock even if the thread performing the wakeup
61  *   does not have the lock mapped in its address space.
62  *
63  * This futex implementation only implements a single lock type, namely
64  * a read-write lock. A regular mutex type would not be necessary, as
65  * the read-write lock is as efficient as a mutex if used as such.
66  * Userspace futex locks are 32 bits in size:
67  *
68  * - 1 bit: has threads waiting in kernel-space.
69  * - 1 bit: is write-locked.
70  * - 30 bits:
71  *   - if write-locked: thread ID of owner.
72  *   - if not write-locked: number of read locks held.
73  *
74  * Condition variables are also 32 bits in size. Its value is modified
75  * by kernel-space exclusively. Zero indicates that it has no waiting
76  * threads. Non-zero indicates the opposite.
77  *
78  * This implementation is optimal, in the sense that it only wakes up
79  * threads if they can actually continue execution. It does not suffer
80  * from the thundering herd problem. If multiple threads waiting on a
81  * condition variable need to be woken up, only a single thread is
82  * scheduled. All other threads are 'donated' to this thread. After the
83  * thread manages to reacquire the lock, it requeues its donated threads
84  * to the lock.
85  *
86  * TODO(ed): Integrate this functionality into kern_umtx.c instead.
87  * TODO(ed): Store futex objects in a hash table.
88  * TODO(ed): Add actual priority inheritance.
89  * TODO(ed): Let futex_queue also take priorities into account.
90  * TODO(ed): Make locking fine-grained.
91  * TODO(ed): Perform sleeps until an actual absolute point in time,
92  *           instead of converting the timestamp to a relative value.
93  */
94
95 struct futex_address;
96 struct futex_condvar;
97 struct futex_lock;
98 struct futex_queue;
99 struct futex_waiter;
100
101 /* Identifier of a location in memory. */
102 struct futex_address {
103         struct umtx_key                 fa_key;
104 };
105
106 /* A set of waiting threads. */
107 struct futex_queue {
108         STAILQ_HEAD(, futex_waiter)     fq_list;
109         unsigned int                    fq_count;
110 };
111
112 /* Condition variables. */
113 struct futex_condvar {
114         /* Address of the condition variable. */
115         struct futex_address            fc_address;
116
117         /* The lock the waiters should be moved to when signalled. */
118         struct futex_lock *             fc_lock;
119
120         /* Threads waiting on the condition variable. */
121         struct futex_queue              fc_waiters;
122         /*
123          * Number of threads blocked on this condition variable, or
124          * being blocked on the lock after being requeued.
125          */
126         unsigned int                    fc_waitcount;
127
128         /* Global list pointers. */
129         LIST_ENTRY(futex_condvar)       fc_next;
130 };
131
132 /* Read-write locks. */
133 struct futex_lock {
134         /* Address of the lock. */
135         struct futex_address            fl_address;
136
137         /*
138          * Current owner of the lock. LOCK_UNMANAGED if the lock is
139          * currently not owned by the kernel. LOCK_OWNER_UNKNOWN in case
140          * the owner is not known (e.g., when the lock is read-locked).
141          */
142         cloudabi_tid_t                  fl_owner;
143 #define LOCK_UNMANAGED 0x0
144 #define LOCK_OWNER_UNKNOWN 0x1
145
146         /* Writers blocked on the lock. */
147         struct futex_queue              fl_writers;
148         /* Readers blocked on the lock. */
149         struct futex_queue              fl_readers;
150         /* Number of threads blocked on this lock + condition variables. */
151         unsigned int                    fl_waitcount;
152
153         /* Global list pointers. */
154         LIST_ENTRY(futex_lock)          fl_next;
155 };
156
157 /* Information associated with a thread blocked on an object. */
158 struct futex_waiter {
159         /* Thread ID. */
160         cloudabi_tid_t                  fw_tid;
161         /* Condition variable used for waiting. */
162         struct cv                       fw_wait;
163
164         /* Queue this waiter is currently placed in. */
165         struct futex_queue *            fw_queue;
166         /* List pointers of fw_queue. */
167         STAILQ_ENTRY(futex_waiter)      fw_next;
168
169         /* Lock has been acquired. */
170         bool                            fw_locked;
171         /* If not locked, threads that should block after acquiring. */
172         struct futex_queue              fw_donated;
173 };
174
175 /* Global data structures. */
176 static MALLOC_DEFINE(M_FUTEX, "futex", "CloudABI futex");
177
178 static struct sx futex_global_lock;
179 SX_SYSINIT(futex_global_lock, &futex_global_lock, "CloudABI futex global lock");
180
181 static LIST_HEAD(, futex_lock) futex_lock_list =
182     LIST_HEAD_INITIALIZER(&futex_lock_list);
183 static LIST_HEAD(, futex_condvar) futex_condvar_list =
184     LIST_HEAD_INITIALIZER(&futex_condvar_list);
185
186 /* Utility functions. */
187 static void futex_lock_assert(const struct futex_lock *);
188 static struct futex_lock *futex_lock_lookup_locked(struct futex_address *);
189 static void futex_lock_release(struct futex_lock *);
190 static int futex_lock_tryrdlock(struct futex_lock *, cloudabi_lock_t *);
191 static int futex_lock_unmanage(struct futex_lock *, cloudabi_lock_t *);
192 static int futex_lock_update_owner(struct futex_lock *, cloudabi_lock_t *);
193 static int futex_lock_wake_up_next(struct futex_lock *, cloudabi_lock_t *);
194 static unsigned int futex_queue_count(const struct futex_queue *);
195 static void futex_queue_init(struct futex_queue *);
196 static void futex_queue_requeue(struct futex_queue *, struct futex_queue *,
197     unsigned int);
198 static int futex_queue_sleep(struct futex_queue *, struct futex_lock *,
199     struct futex_waiter *, struct thread *, cloudabi_clockid_t,
200     cloudabi_timestamp_t, cloudabi_timestamp_t, bool);
201 static cloudabi_tid_t futex_queue_tid_best(const struct futex_queue *);
202 static void futex_queue_wake_up_all(struct futex_queue *);
203 static void futex_queue_wake_up_best(struct futex_queue *);
204 static void futex_queue_wake_up_donate(struct futex_queue *, unsigned int);
205 static int futex_user_load(uint32_t *, uint32_t *);
206 static int futex_user_store(uint32_t *, uint32_t);
207 static int futex_user_cmpxchg(uint32_t *, uint32_t, uint32_t *, uint32_t);
208
209 /*
210  * futex_address operations.
211  */
212
213 static int
214 futex_address_create(struct futex_address *fa, struct thread *td,
215     const void *object, cloudabi_scope_t scope)
216 {
217
218         KASSERT(td == curthread,
219             ("Can only create umtx keys for the current thread"));
220         switch (scope) {
221         case CLOUDABI_SCOPE_PRIVATE:
222                 return (umtx_key_get(object, TYPE_FUTEX, THREAD_SHARE,
223                     &fa->fa_key));
224         case CLOUDABI_SCOPE_SHARED:
225                 return (umtx_key_get(object, TYPE_FUTEX, AUTO_SHARE,
226                     &fa->fa_key));
227         default:
228                 return (EINVAL);
229         }
230 }
231
232 static void
233 futex_address_free(struct futex_address *fa)
234 {
235
236         umtx_key_release(&fa->fa_key);
237 }
238
239 static bool
240 futex_address_match(const struct futex_address *fa1,
241     const struct futex_address *fa2)
242 {
243
244         return (umtx_key_match(&fa1->fa_key, &fa2->fa_key));
245 }
246
247 /*
248  * futex_condvar operations.
249  */
250
251 static void
252 futex_condvar_assert(const struct futex_condvar *fc)
253 {
254
255         KASSERT(fc->fc_waitcount >= futex_queue_count(&fc->fc_waiters),
256             ("Total number of waiters cannot be smaller than the wait queue"));
257         futex_lock_assert(fc->fc_lock);
258 }
259
260 static int
261 futex_condvar_lookup(struct thread *td, const cloudabi_condvar_t *address,
262     cloudabi_scope_t scope, struct futex_condvar **fcret)
263 {
264         struct futex_address fa_condvar;
265         struct futex_condvar *fc;
266         int error;
267
268         error = futex_address_create(&fa_condvar, td, address, scope);
269         if (error != 0)
270                 return (error);
271
272         sx_xlock(&futex_global_lock);
273         LIST_FOREACH(fc, &futex_condvar_list, fc_next) {
274                 if (futex_address_match(&fc->fc_address, &fa_condvar)) {
275                         /* Found matching lock object. */
276                         futex_address_free(&fa_condvar);
277                         futex_condvar_assert(fc);
278                         *fcret = fc;
279                         return (0);
280                 }
281         }
282         sx_xunlock(&futex_global_lock);
283         futex_address_free(&fa_condvar);
284         return (ENOENT);
285 }
286
287 static int
288 futex_condvar_lookup_or_create(struct thread *td,
289     const cloudabi_condvar_t *condvar, cloudabi_scope_t condvar_scope,
290     const cloudabi_lock_t *lock, cloudabi_scope_t lock_scope,
291     struct futex_condvar **fcret)
292 {
293         struct futex_address fa_condvar, fa_lock;
294         struct futex_condvar *fc;
295         struct futex_lock *fl;
296         int error;
297
298         error = futex_address_create(&fa_condvar, td, condvar, condvar_scope);
299         if (error != 0)
300                 return (error);
301         error = futex_address_create(&fa_lock, td, lock, lock_scope);
302         if (error != 0) {
303                 futex_address_free(&fa_condvar);
304                 return (error);
305         }
306
307         sx_xlock(&futex_global_lock);
308         LIST_FOREACH(fc, &futex_condvar_list, fc_next) {
309                 if (!futex_address_match(&fc->fc_address, &fa_condvar))
310                         continue;
311                 fl = fc->fc_lock;
312                 if (!futex_address_match(&fl->fl_address, &fa_lock)) {
313                         /* Condition variable is owned by a different lock. */
314                         futex_address_free(&fa_condvar);
315                         futex_address_free(&fa_lock);
316                         sx_xunlock(&futex_global_lock);
317                         return (EINVAL);
318                 }
319
320                 /* Found fully matching condition variable. */
321                 futex_address_free(&fa_condvar);
322                 futex_address_free(&fa_lock);
323                 futex_condvar_assert(fc);
324                 *fcret = fc;
325                 return (0);
326         }
327
328         /* None found. Create new condition variable object. */
329         fc = malloc(sizeof(*fc), M_FUTEX, M_WAITOK);
330         fc->fc_address = fa_condvar;
331         fc->fc_lock = futex_lock_lookup_locked(&fa_lock);
332         futex_queue_init(&fc->fc_waiters);
333         fc->fc_waitcount = 0;
334         LIST_INSERT_HEAD(&futex_condvar_list, fc, fc_next);
335         *fcret = fc;
336         return (0);
337 }
338
339 static void
340 futex_condvar_release(struct futex_condvar *fc)
341 {
342         struct futex_lock *fl;
343
344         futex_condvar_assert(fc);
345         fl = fc->fc_lock;
346         if (fc->fc_waitcount == 0) {
347                 /* Condition variable has no waiters. Deallocate it. */
348                 futex_address_free(&fc->fc_address);
349                 LIST_REMOVE(fc, fc_next);
350                 free(fc, M_FUTEX);
351         }
352         futex_lock_release(fl);
353 }
354
355 static int
356 futex_condvar_unmanage(struct futex_condvar *fc,
357     cloudabi_condvar_t *condvar)
358 {
359
360         if (futex_queue_count(&fc->fc_waiters) != 0)
361                 return (0);
362         return (futex_user_store(condvar, CLOUDABI_CONDVAR_HAS_NO_WAITERS));
363 }
364
365 /*
366  * futex_lock operations.
367  */
368
369 static void
370 futex_lock_assert(const struct futex_lock *fl)
371 {
372
373         /*
374          * A futex lock can only be kernel-managed if it has waiters.
375          * Vice versa: if a futex lock has waiters, it must be
376          * kernel-managed.
377          */
378         KASSERT((fl->fl_owner == LOCK_UNMANAGED) ==
379             (futex_queue_count(&fl->fl_readers) == 0 &&
380             futex_queue_count(&fl->fl_writers) == 0),
381             ("Managed locks must have waiting threads"));
382         KASSERT(fl->fl_waitcount != 0 || fl->fl_owner == LOCK_UNMANAGED,
383             ("Lock with no waiters must be unmanaged"));
384 }
385
386 static int
387 futex_lock_lookup(struct thread *td, const cloudabi_lock_t *address,
388     cloudabi_scope_t scope, struct futex_lock **flret)
389 {
390         struct futex_address fa;
391         int error;
392
393         error = futex_address_create(&fa, td, address, scope);
394         if (error != 0)
395                 return (error);
396
397         sx_xlock(&futex_global_lock);
398         *flret = futex_lock_lookup_locked(&fa);
399         return (0);
400 }
401
402 static struct futex_lock *
403 futex_lock_lookup_locked(struct futex_address *fa)
404 {
405         struct futex_lock *fl;
406
407         LIST_FOREACH(fl, &futex_lock_list, fl_next) {
408                 if (futex_address_match(&fl->fl_address, fa)) {
409                         /* Found matching lock object. */
410                         futex_address_free(fa);
411                         futex_lock_assert(fl);
412                         return (fl);
413                 }
414         }
415
416         /* None found. Create new lock object. */
417         fl = malloc(sizeof(*fl), M_FUTEX, M_WAITOK);
418         fl->fl_address = *fa;
419         fl->fl_owner = LOCK_UNMANAGED;
420         futex_queue_init(&fl->fl_readers);
421         futex_queue_init(&fl->fl_writers);
422         fl->fl_waitcount = 0;
423         LIST_INSERT_HEAD(&futex_lock_list, fl, fl_next);
424         return (fl);
425 }
426
427 static int
428 futex_lock_rdlock(struct futex_lock *fl, struct thread *td,
429     cloudabi_lock_t *lock, cloudabi_clockid_t clock_id,
430     cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime)
431 {
432         struct futex_waiter fw;
433         int error;
434
435         error = futex_lock_tryrdlock(fl, lock);
436         if (error == EBUSY) {
437                 /* Suspend execution. */
438                 KASSERT(fl->fl_owner != LOCK_UNMANAGED,
439                     ("Attempted to sleep on an unmanaged lock"));
440                 error = futex_queue_sleep(&fl->fl_readers, fl, &fw, td,
441                     clock_id, timeout, precision, abstime);
442                 KASSERT((error == 0) == fw.fw_locked,
443                     ("Should have locked write lock on success"));
444                 KASSERT(futex_queue_count(&fw.fw_donated) == 0,
445                     ("Lock functions cannot receive threads"));
446         }
447         if (error != 0)
448                 futex_lock_unmanage(fl, lock);
449         return (error);
450 }
451
452 static void
453 futex_lock_release(struct futex_lock *fl)
454 {
455
456         futex_lock_assert(fl);
457         if (fl->fl_waitcount == 0) {
458                 /* Lock object is unreferenced. Deallocate it. */
459                 KASSERT(fl->fl_owner == LOCK_UNMANAGED,
460                     ("Attempted to free a managed lock"));
461                 futex_address_free(&fl->fl_address);
462                 LIST_REMOVE(fl, fl_next);
463                 free(fl, M_FUTEX);
464         }
465         sx_xunlock(&futex_global_lock);
466 }
467
468 static int
469 futex_lock_unmanage(struct futex_lock *fl, cloudabi_lock_t *lock)
470 {
471         cloudabi_lock_t cmp, old;
472         int error;
473
474         if (futex_queue_count(&fl->fl_readers) == 0 &&
475             futex_queue_count(&fl->fl_writers) == 0) {
476                 /* Lock should be unmanaged. */
477                 fl->fl_owner = LOCK_UNMANAGED;
478
479                 /* Clear kernel-managed bit. */
480                 error = futex_user_load(lock, &old);
481                 if (error != 0)
482                         return (error);
483                 for (;;) {
484                         cmp = old;
485                         error = futex_user_cmpxchg(lock, cmp, &old,
486                             cmp & ~CLOUDABI_LOCK_KERNEL_MANAGED);
487                         if (error != 0)
488                                 return (error);
489                         if (old == cmp)
490                                 break;
491                 }
492         }
493         return (0);
494 }
495
496 /* Sets an owner of a lock, based on a userspace lock value. */
497 static void
498 futex_lock_set_owner(struct futex_lock *fl, cloudabi_lock_t lock)
499 {
500
501         /* Lock has no explicit owner. */
502         if ((lock & ~CLOUDABI_LOCK_WRLOCKED) == 0) {
503                 fl->fl_owner = LOCK_OWNER_UNKNOWN;
504                 return;
505         }
506         lock &= ~(CLOUDABI_LOCK_WRLOCKED | CLOUDABI_LOCK_KERNEL_MANAGED);
507
508         /* Don't allow userspace to silently unlock. */
509         if (lock == LOCK_UNMANAGED) {
510                 fl->fl_owner = LOCK_OWNER_UNKNOWN;
511                 return;
512         }
513         fl->fl_owner = lock;
514 }
515
516 static int
517 futex_lock_unlock(struct futex_lock *fl, struct thread *td,
518     cloudabi_lock_t *lock)
519 {
520         int error;
521
522         /* Validate that this thread is allowed to unlock. */
523         error = futex_lock_update_owner(fl, lock);
524         if (error != 0)
525                 return (error);
526         if (fl->fl_owner != LOCK_UNMANAGED && fl->fl_owner != td->td_tid)
527                 return (EPERM);
528         return (futex_lock_wake_up_next(fl, lock));
529 }
530
531 /* Syncs in the owner of the lock from userspace if needed. */
532 static int
533 futex_lock_update_owner(struct futex_lock *fl, cloudabi_lock_t *address)
534 {
535         cloudabi_lock_t lock;
536         int error;
537
538         if (fl->fl_owner == LOCK_OWNER_UNKNOWN) {
539                 error = futex_user_load(address, &lock);
540                 if (error != 0)
541                         return (error);
542                 futex_lock_set_owner(fl, lock);
543         }
544         return (0);
545 }
546
547 static int
548 futex_lock_tryrdlock(struct futex_lock *fl, cloudabi_lock_t *address)
549 {
550         cloudabi_lock_t old, cmp;
551         int error;
552
553         if (fl->fl_owner != LOCK_UNMANAGED) {
554                 /* Lock is already acquired. */
555                 return (EBUSY);
556         }
557
558         old = CLOUDABI_LOCK_UNLOCKED;
559         for (;;) {
560                 if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
561                         /*
562                          * Userspace lock is kernel-managed, even though
563                          * the kernel disagrees.
564                          */
565                         return (EINVAL);
566                 }
567
568                 if ((old & CLOUDABI_LOCK_WRLOCKED) == 0) {
569                         /*
570                          * Lock is not write-locked. Attempt to acquire
571                          * it by increasing the read count.
572                          */
573                         cmp = old;
574                         error = futex_user_cmpxchg(address, cmp, &old, cmp + 1);
575                         if (error != 0)
576                                 return (error);
577                         if (old == cmp) {
578                                 /* Success. */
579                                 return (0);
580                         }
581                 } else {
582                         /* Lock is write-locked. Make it kernel-managed. */
583                         cmp = old;
584                         error = futex_user_cmpxchg(address, cmp, &old,
585                             cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
586                         if (error != 0)
587                                 return (error);
588                         if (old == cmp) {
589                                 /* Success. */
590                                 futex_lock_set_owner(fl, cmp);
591                                 return (EBUSY);
592                         }
593                 }
594         }
595 }
596
597 static int
598 futex_lock_trywrlock(struct futex_lock *fl, cloudabi_lock_t *address,
599     cloudabi_tid_t tid, bool force_kernel_managed)
600 {
601         cloudabi_lock_t old, new, cmp;
602         int error;
603
604         if (fl->fl_owner == tid) {
605                 /* Attempted to acquire lock recursively. */
606                 return (EDEADLK);
607         }
608         if (fl->fl_owner != LOCK_UNMANAGED) {
609                 /* Lock is already acquired. */
610                 return (EBUSY);
611         }
612
613         old = CLOUDABI_LOCK_UNLOCKED;
614         for (;;) {
615                 if ((old & CLOUDABI_LOCK_KERNEL_MANAGED) != 0) {
616                         /*
617                          * Userspace lock is kernel-managed, even though
618                          * the kernel disagrees.
619                          */
620                         return (EINVAL);
621                 }
622                 if (old == (tid | CLOUDABI_LOCK_WRLOCKED)) {
623                         /* Attempted to acquire lock recursively. */
624                         return (EDEADLK);
625                 }
626
627                 if (old == CLOUDABI_LOCK_UNLOCKED) {
628                         /* Lock is unlocked. Attempt to acquire it. */
629                         new = tid | CLOUDABI_LOCK_WRLOCKED;
630                         if (force_kernel_managed)
631                                 new |= CLOUDABI_LOCK_KERNEL_MANAGED;
632                         error = futex_user_cmpxchg(address,
633                             CLOUDABI_LOCK_UNLOCKED, &old, new);
634                         if (error != 0)
635                                 return (error);
636                         if (old == CLOUDABI_LOCK_UNLOCKED) {
637                                 /* Success. */
638                                 if (force_kernel_managed)
639                                         fl->fl_owner = tid;
640                                 return (0);
641                         }
642                 } else {
643                         /* Lock is still locked. Make it kernel-managed. */
644                         cmp = old;
645                         error = futex_user_cmpxchg(address, cmp, &old,
646                             cmp | CLOUDABI_LOCK_KERNEL_MANAGED);
647                         if (error != 0)
648                                 return (error);
649                         if (old == cmp) {
650                                 /* Success. */
651                                 futex_lock_set_owner(fl, cmp);
652                                 return (EBUSY);
653                         }
654                 }
655         }
656 }
657
658 static int
659 futex_lock_wake_up_next(struct futex_lock *fl, cloudabi_lock_t *lock)
660 {
661         cloudabi_tid_t tid;
662         int error;
663
664         /*
665          * Determine which thread(s) to wake up. Prefer waking up
666          * writers over readers to prevent write starvation.
667          */
668         if (futex_queue_count(&fl->fl_writers) > 0) {
669                 /* Transfer ownership to a single write-locker. */
670                 if (futex_queue_count(&fl->fl_writers) > 1 ||
671                     futex_queue_count(&fl->fl_readers) > 0) {
672                         /* Lock should remain managed afterwards. */
673                         tid = futex_queue_tid_best(&fl->fl_writers);
674                         error = futex_user_store(lock,
675                             tid | CLOUDABI_LOCK_WRLOCKED |
676                             CLOUDABI_LOCK_KERNEL_MANAGED);
677                         if (error != 0)
678                                 return (error);
679
680                         futex_queue_wake_up_best(&fl->fl_writers);
681                         fl->fl_owner = tid;
682                 } else {
683                         /* Lock can become unmanaged afterwards. */
684                         error = futex_user_store(lock,
685                             futex_queue_tid_best(&fl->fl_writers) |
686                             CLOUDABI_LOCK_WRLOCKED);
687                         if (error != 0)
688                                 return (error);
689
690                         futex_queue_wake_up_best(&fl->fl_writers);
691                         fl->fl_owner = LOCK_UNMANAGED;
692                 }
693         } else {
694                 /* Transfer ownership to all read-lockers (if any). */
695                 error = futex_user_store(lock,
696                     futex_queue_count(&fl->fl_readers));
697                 if (error != 0)
698                         return (error);
699
700                 /* Wake up all threads. */
701                 futex_queue_wake_up_all(&fl->fl_readers);
702                 fl->fl_owner = LOCK_UNMANAGED;
703         }
704         return (0);
705 }
706
707 static int
708 futex_lock_wrlock(struct futex_lock *fl, struct thread *td,
709     cloudabi_lock_t *lock, cloudabi_clockid_t clock_id,
710     cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime,
711     struct futex_queue *donated)
712 {
713         struct futex_waiter fw;
714         int error;
715
716         error = futex_lock_trywrlock(fl, lock, td->td_tid,
717             futex_queue_count(donated) > 0);
718
719         if (error == 0 || error == EBUSY) {
720                 /* Put donated threads in queue before suspending. */
721                 KASSERT(futex_queue_count(donated) == 0 ||
722                     fl->fl_owner != LOCK_UNMANAGED,
723                     ("Lock should be managed if we are going to donate"));
724                 futex_queue_requeue(donated, &fl->fl_writers, UINT_MAX);
725         } else {
726                 /*
727                  * This thread cannot deal with the donated threads.
728                  * Wake up the next thread and let it try it by itself.
729                  */
730                 futex_queue_wake_up_donate(donated, UINT_MAX);
731         }
732
733         if (error == EBUSY) {
734                 /* Suspend execution if the lock was busy. */
735                 KASSERT(fl->fl_owner != LOCK_UNMANAGED,
736                     ("Attempted to sleep on an unmanaged lock"));
737                 error = futex_queue_sleep(&fl->fl_writers, fl, &fw, td,
738                     clock_id, timeout, precision, abstime);
739                 KASSERT((error == 0) == fw.fw_locked,
740                     ("Should have locked write lock on success"));
741                 KASSERT(futex_queue_count(&fw.fw_donated) == 0,
742                     ("Lock functions cannot receive threads"));
743         }
744         if (error != 0)
745                 futex_lock_unmanage(fl, lock);
746         return (error);
747 }
748
749 /*
750  * futex_queue operations.
751  */
752
753 static cloudabi_tid_t
754 futex_queue_tid_best(const struct futex_queue *fq)
755 {
756
757         return (STAILQ_FIRST(&fq->fq_list)->fw_tid);
758 }
759
760 static unsigned int
761 futex_queue_count(const struct futex_queue *fq)
762 {
763
764         return (fq->fq_count);
765 }
766
767 static void
768 futex_queue_init(struct futex_queue *fq)
769 {
770
771         STAILQ_INIT(&fq->fq_list);
772         fq->fq_count = 0;
773 }
774
775 /* Converts a relative timestamp to an sbintime. */
776 static sbintime_t
777 futex_queue_convert_timestamp_relative(cloudabi_timestamp_t ts)
778 {
779         cloudabi_timestamp_t s, ns;
780
781         s = ts / 1000000000;
782         ns = ts % 1000000000;
783         if (s > INT32_MAX)
784                 return (INT64_MAX);
785         return ((s << 32) + (ns << 32) / 1000000000);
786 }
787
788 /* Converts an absolute timestamp and precision to a pair of sbintime values. */
789 static int
790 futex_queue_convert_timestamp(struct thread *td, cloudabi_clockid_t clock_id,
791     cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision,
792     sbintime_t *sbttimeout, sbintime_t *sbtprecision, bool abstime)
793 {
794         cloudabi_timestamp_t now;
795         int error;
796
797         if (abstime) {
798                 /* Make the time relative. */
799                 error = cloudabi_clock_time_get(td, clock_id, &now);
800                 if (error != 0)
801                         return (error);
802                 timeout = timeout < now ? 0 : timeout - now;
803         }
804
805         *sbttimeout = futex_queue_convert_timestamp_relative(timeout);
806         *sbtprecision = futex_queue_convert_timestamp_relative(precision);
807         return (0);
808 }
809
810 static int
811 futex_queue_sleep(struct futex_queue *fq, struct futex_lock *fl,
812     struct futex_waiter *fw, struct thread *td, cloudabi_clockid_t clock_id,
813     cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime)
814 {
815         sbintime_t sbttimeout, sbtprecision;
816         int error;
817
818         /* Initialize futex_waiter object. */
819         fw->fw_tid = td->td_tid;
820         fw->fw_locked = false;
821         futex_queue_init(&fw->fw_donated);
822
823         if (timeout != UINT64_MAX) {
824                 /* Convert timeout duration. */
825                 error = futex_queue_convert_timestamp(td, clock_id, timeout,
826                     precision, &sbttimeout, &sbtprecision, abstime);
827                 if (error != 0)
828                         return (error);
829         }
830
831         /* Place object in the queue. */
832         fw->fw_queue = fq;
833         STAILQ_INSERT_TAIL(&fq->fq_list, fw, fw_next);
834         ++fq->fq_count;
835
836         cv_init(&fw->fw_wait, "futex");
837         ++fl->fl_waitcount;
838
839         futex_lock_assert(fl);
840         if (timeout == UINT64_MAX) {
841                 /* Wait without a timeout. */
842                 error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
843         } else {
844                 /* Wait respecting the timeout. */
845                 error = cv_timedwait_sig_sbt(&fw->fw_wait, &futex_global_lock,
846                     sbttimeout, sbtprecision, 0);
847                 futex_lock_assert(fl);
848                 if (error == EWOULDBLOCK &&
849                     fw->fw_queue != NULL && fw->fw_queue != fq) {
850                         /*
851                          * We got signalled on a condition variable, but
852                          * observed a timeout while waiting to reacquire
853                          * the lock. In other words, we didn't actually
854                          * time out. Go back to sleep and wait for the
855                          * lock to be reacquired.
856                          */
857                         error = cv_wait_sig(&fw->fw_wait, &futex_global_lock);
858                 }
859         }
860         futex_lock_assert(fl);
861
862         --fl->fl_waitcount;
863         cv_destroy(&fw->fw_wait);
864
865         fq = fw->fw_queue;
866         if (fq == NULL) {
867                 /* Thread got dequeued, so we've slept successfully. */
868                 return (0);
869         }
870
871         /* Thread is still enqueued. Remove it. */
872         KASSERT(error != 0, ("Woken up thread is still enqueued"));
873         STAILQ_REMOVE(&fq->fq_list, fw, futex_waiter, fw_next);
874         --fq->fq_count;
875         return (error == EWOULDBLOCK ? ETIMEDOUT : error);
876 }
877
878 /* Moves up to nwaiters waiters from one queue to another. */
879 static void
880 futex_queue_requeue(struct futex_queue *fqfrom, struct futex_queue *fqto,
881     unsigned int nwaiters)
882 {
883         struct futex_waiter *fw;
884
885         /* Move waiters to the target queue. */
886         while (nwaiters-- > 0 && !STAILQ_EMPTY(&fqfrom->fq_list)) {
887                 fw = STAILQ_FIRST(&fqfrom->fq_list);
888                 STAILQ_REMOVE_HEAD(&fqfrom->fq_list, fw_next);
889                 --fqfrom->fq_count;
890
891                 fw->fw_queue = fqto;
892                 STAILQ_INSERT_TAIL(&fqto->fq_list, fw, fw_next);
893                 ++fqto->fq_count;
894         }
895 }
896
897 /* Wakes up all waiters in a queue. */
898 static void
899 futex_queue_wake_up_all(struct futex_queue *fq)
900 {
901         struct futex_waiter *fw;
902
903         STAILQ_FOREACH(fw, &fq->fq_list, fw_next) {
904                 fw->fw_locked = true;
905                 fw->fw_queue = NULL;
906                 cv_signal(&fw->fw_wait);
907         }
908
909         STAILQ_INIT(&fq->fq_list);
910         fq->fq_count = 0;
911 }
912
913 /*
914  * Wakes up the best waiter (i.e., the waiter having the highest
915  * priority) in a queue.
916  */
917 static void
918 futex_queue_wake_up_best(struct futex_queue *fq)
919 {
920         struct futex_waiter *fw;
921
922         fw = STAILQ_FIRST(&fq->fq_list);
923         fw->fw_locked = true;
924         fw->fw_queue = NULL;
925         cv_signal(&fw->fw_wait);
926
927         STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next);
928         --fq->fq_count;
929 }
930
931 static void
932 futex_queue_wake_up_donate(struct futex_queue *fq, unsigned int nwaiters)
933 {
934         struct futex_waiter *fw;
935
936         fw = STAILQ_FIRST(&fq->fq_list);
937         if (fw == NULL)
938                 return;
939         fw->fw_locked = false;
940         fw->fw_queue = NULL;
941         cv_signal(&fw->fw_wait);
942
943         STAILQ_REMOVE_HEAD(&fq->fq_list, fw_next);
944         --fq->fq_count;
945         futex_queue_requeue(fq, &fw->fw_donated, nwaiters);
946 }
947
948 /*
949  * futex_user operations. Used to adjust values in userspace.
950  */
951
952 static int
953 futex_user_load(uint32_t *obj, uint32_t *val)
954 {
955
956         return (fueword32(obj, val) != 0 ? EFAULT : 0);
957 }
958
959 static int
960 futex_user_store(uint32_t *obj, uint32_t val)
961 {
962
963         return (suword32(obj, val) != 0 ? EFAULT : 0);
964 }
965
966 static int
967 futex_user_cmpxchg(uint32_t *obj, uint32_t cmp, uint32_t *old, uint32_t new)
968 {
969
970         return (casueword32(obj, cmp, old, new) != 0 ? EFAULT : 0);
971 }
972
973 /*
974  * Blocking calls: acquiring locks, waiting on condition variables.
975  */
976
977 int
978 cloudabi_futex_condvar_wait(struct thread *td, cloudabi_condvar_t *condvar,
979     cloudabi_scope_t condvar_scope, cloudabi_lock_t *lock,
980     cloudabi_scope_t lock_scope, cloudabi_clockid_t clock_id,
981     cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime)
982 {
983         struct futex_condvar *fc;
984         struct futex_lock *fl;
985         struct futex_waiter fw;
986         int error, error2;
987
988         /* Lookup condition variable object. */
989         error = futex_condvar_lookup_or_create(td, condvar, condvar_scope, lock,
990             lock_scope, &fc);
991         if (error != 0)
992                 return (error);
993         fl = fc->fc_lock;
994
995         /*
996          * Set the condition variable to something other than
997          * CLOUDABI_CONDVAR_HAS_NO_WAITERS to make userspace threads
998          * call into the kernel to perform wakeups.
999          */
1000         error = futex_user_store(condvar, ~CLOUDABI_CONDVAR_HAS_NO_WAITERS);
1001         if (error != 0) {
1002                 futex_condvar_release(fc);
1003                 return (error);
1004         }
1005
1006         /* Drop the lock. */
1007         error = futex_lock_unlock(fl, td, lock);
1008         if (error != 0) {
1009                 futex_condvar_unmanage(fc, condvar);
1010                 futex_condvar_release(fc);
1011                 return (error);
1012         }
1013
1014         /* Go to sleep. */
1015         ++fc->fc_waitcount;
1016         error = futex_queue_sleep(&fc->fc_waiters, fc->fc_lock, &fw, td,
1017             clock_id, timeout, precision, abstime);
1018         if (fw.fw_locked) {
1019                 /* Waited and got the lock assigned to us. */
1020                 KASSERT(futex_queue_count(&fw.fw_donated) == 0,
1021                     ("Received threads while being locked"));
1022         } else if (error == 0 || error == ETIMEDOUT) {
1023                 if (error != 0)
1024                         futex_condvar_unmanage(fc, condvar);
1025                 /*
1026                  * Got woken up without having the lock assigned to us.
1027                  * This can happen in two cases:
1028                  *
1029                  * 1. We observed a timeout on a condition variable.
1030                  * 2. We got signalled on a condition variable while the
1031                  *    associated lock is unlocked. We are the first
1032                  *    thread that gets woken up. This thread is
1033                  *    responsible for reacquiring the userspace lock.
1034                  */
1035                 error2 = futex_lock_wrlock(fl, td, lock,
1036                     CLOUDABI_CLOCK_MONOTONIC, UINT64_MAX, 0, abstime,
1037                     &fw.fw_donated);
1038                 if (error2 != 0)
1039                         error = error2;
1040         } else {
1041                 KASSERT(futex_queue_count(&fw.fw_donated) == 0,
1042                     ("Received threads on error"));
1043                 futex_condvar_unmanage(fc, condvar);
1044                 futex_lock_unmanage(fl, lock);
1045         }
1046         --fc->fc_waitcount;
1047         futex_condvar_release(fc);
1048         return (error);
1049 }
1050
1051 int
1052 cloudabi_futex_lock_rdlock(struct thread *td, cloudabi_lock_t *lock,
1053     cloudabi_scope_t scope, cloudabi_clockid_t clock_id,
1054     cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime)
1055 {
1056         struct futex_lock *fl;
1057         int error;
1058
1059         /* Look up lock object. */
1060         error = futex_lock_lookup(td, lock, scope, &fl);
1061         if (error != 0)
1062                 return (error);
1063
1064         error = futex_lock_rdlock(fl, td, lock, clock_id, timeout,
1065             precision, abstime);
1066         futex_lock_release(fl);
1067         return (error);
1068 }
1069
1070 int
1071 cloudabi_futex_lock_wrlock(struct thread *td, cloudabi_lock_t *lock,
1072     cloudabi_scope_t scope, cloudabi_clockid_t clock_id,
1073     cloudabi_timestamp_t timeout, cloudabi_timestamp_t precision, bool abstime)
1074 {
1075         struct futex_lock *fl;
1076         struct futex_queue fq;
1077         int error;
1078
1079         /* Look up lock object. */
1080         error = futex_lock_lookup(td, lock, scope, &fl);
1081         if (error != 0)
1082                 return (error);
1083
1084         futex_queue_init(&fq);
1085         error = futex_lock_wrlock(fl, td, lock, clock_id, timeout,
1086             precision, abstime, &fq);
1087         futex_lock_release(fl);
1088         return (error);
1089 }
1090
1091 /*
1092  * Non-blocking calls: releasing locks, signalling condition variables.
1093  */
1094
1095 int
1096 cloudabi_sys_condvar_signal(struct thread *td,
1097     struct cloudabi_sys_condvar_signal_args *uap)
1098 {
1099         struct futex_condvar *fc;
1100         struct futex_lock *fl;
1101         cloudabi_nthreads_t nwaiters;
1102         int error;
1103
1104         nwaiters = uap->nwaiters;
1105         if (nwaiters == 0) {
1106                 /* No threads to wake up. */
1107                 return (0);
1108         }
1109
1110         /* Look up futex object. */
1111         error = futex_condvar_lookup(td, uap->condvar, uap->scope, &fc);
1112         if (error != 0) {
1113                 /* Race condition: condition variable with no waiters. */
1114                 return (error == ENOENT ? 0 : error);
1115         }
1116         fl = fc->fc_lock;
1117
1118         if (fl->fl_owner == LOCK_UNMANAGED) {
1119                 /*
1120                  * The lock is currently not managed by the kernel,
1121                  * meaning we must attempt to acquire the userspace lock
1122                  * first. We cannot requeue threads to an unmanaged lock,
1123                  * as these threads will then never be scheduled.
1124                  *
1125                  * Unfortunately, the memory address of the lock is
1126                  * unknown from this context, meaning that we cannot
1127                  * acquire the lock on behalf of the first thread to be
1128                  * scheduled. The lock may even not be mapped within the
1129                  * address space of the current thread.
1130                  *
1131                  * To solve this, wake up a single waiter that will
1132                  * attempt to acquire the lock. Donate all of the other
1133                  * waiters that need to be woken up to this waiter, so
1134                  * it can requeue them after acquiring the lock.
1135                  */
1136                 futex_queue_wake_up_donate(&fc->fc_waiters, nwaiters - 1);
1137         } else {
1138                 /*
1139                  * Lock is already managed by the kernel. This makes it
1140                  * easy, as we can requeue the threads from the
1141                  * condition variable directly to the associated lock.
1142                  */
1143                 futex_queue_requeue(&fc->fc_waiters, &fl->fl_writers, nwaiters);
1144         }
1145
1146         /* Clear userspace condition variable if all waiters are gone. */
1147         error = futex_condvar_unmanage(fc, uap->condvar);
1148         futex_condvar_release(fc);
1149         return (error);
1150 }
1151
1152 int
1153 cloudabi_sys_lock_unlock(struct thread *td,
1154     struct cloudabi_sys_lock_unlock_args *uap)
1155 {
1156         struct futex_lock *fl;
1157         int error;
1158
1159         error = futex_lock_lookup(td, uap->lock, uap->scope, &fl);
1160         if (error != 0)
1161                 return (error);
1162         error = futex_lock_unlock(fl, td, uap->lock);
1163         futex_lock_release(fl);
1164         return (error);
1165 }