1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2007 Stephan Uphoff <ups@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the author nor the names of any co-contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31
32 /*
33  * Machine independent bits of the read-mostly (rm) lock implementation.
34  */
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38
39 #include "opt_ddb.h"
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43
44 #include <sys/kernel.h>
45 #include <sys/kdb.h>
46 #include <sys/ktr.h>
47 #include <sys/lock.h>
48 #include <sys/mutex.h>
49 #include <sys/proc.h>
50 #include <sys/rmlock.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/turnstile.h>
54 #include <sys/lock_profile.h>
55 #include <machine/cpu.h>
56 #include <vm/uma.h>
57
58 #ifdef DDB
59 #include <ddb/ddb.h>
60 #endif
61
62 /*
63  * A cookie to mark destroyed rmlocks.  This is stored in the head of
64  * rm_activeReaders.
65  */
66 #define RM_DESTROYED    ((void *)0xdead)
67
68 #define rm_destroyed(rm)                                                \
69         (LIST_FIRST(&(rm)->rm_activeReaders) == RM_DESTROYED)
70
71 #define RMPF_ONQUEUE    1
72 #define RMPF_SIGNAL     2
73
74 #ifndef INVARIANTS
75 #define _rm_assert(c, what, file, line)
76 #endif
77
78 static void     assert_rm(const struct lock_object *lock, int what);
79 #ifdef DDB
80 static void     db_show_rm(const struct lock_object *lock);
81 #endif
82 static void     lock_rm(struct lock_object *lock, uintptr_t how);
83 #ifdef KDTRACE_HOOKS
84 static int      owner_rm(const struct lock_object *lock, struct thread **owner);
85 #endif
86 static uintptr_t unlock_rm(struct lock_object *lock);
87
88 struct lock_class lock_class_rm = {
89         .lc_name = "rm",
90         .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
91         .lc_assert = assert_rm,
92 #ifdef DDB
93         .lc_ddb_show = db_show_rm,
94 #endif
95         .lc_lock = lock_rm,
96         .lc_unlock = unlock_rm,
97 #ifdef KDTRACE_HOOKS
98         .lc_owner = owner_rm,
99 #endif
100 };
101
102 struct lock_class lock_class_rm_sleepable = {
103         .lc_name = "sleepable rm",
104         .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE,
105         .lc_assert = assert_rm,
106 #ifdef DDB
107         .lc_ddb_show = db_show_rm,
108 #endif
109         .lc_lock = lock_rm,
110         .lc_unlock = unlock_rm,
111 #ifdef KDTRACE_HOOKS
112         .lc_owner = owner_rm,
113 #endif
114 };
115
116 static void
117 assert_rm(const struct lock_object *lock, int what)
118 {
119
120         rm_assert((const struct rmlock *)lock, what);
121 }
122
123 static void
124 lock_rm(struct lock_object *lock, uintptr_t how)
125 {
126         struct rmlock *rm;
127         struct rm_priotracker *tracker;
128
129         rm = (struct rmlock *)lock;
130         if (how == 0)
131                 rm_wlock(rm);
132         else {
133                 tracker = (struct rm_priotracker *)how;
134                 rm_rlock(rm, tracker);
135         }
136 }
137
138 static uintptr_t
139 unlock_rm(struct lock_object *lock)
140 {
141         struct thread *td;
142         struct pcpu *pc;
143         struct rmlock *rm;
144         struct rm_queue *queue;
145         struct rm_priotracker *tracker;
146         uintptr_t how;
147
148         rm = (struct rmlock *)lock;
149         tracker = NULL;
150         how = 0;
151         rm_assert(rm, RA_LOCKED | RA_NOTRECURSED);
152         if (rm_wowned(rm))
153                 rm_wunlock(rm);
154         else {
155                 /*
156                  * Find the right rm_priotracker structure for curthread.
157                  * Its uniqueness is guaranteed by the fact that we already
158                  * asserted the lock was not recursively acquired.
159                  */
160                 critical_enter();
161                 td = curthread;
162                 pc = get_pcpu();
163                 for (queue = pc->pc_rm_queue.rmq_next;
164                     queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
165                         tracker = (struct rm_priotracker *)queue;
166                         if ((tracker->rmp_rmlock == rm) &&
167                             (tracker->rmp_thread == td)) {
168                                 how = (uintptr_t)tracker;
169                                 break;
170                         }
171                 }
172                 KASSERT(tracker != NULL,
173                     ("no rm_priotracker found while lock held in read mode"));
174                 critical_exit();
175                 rm_runlock(rm, tracker);
176         }
177         return (how);
178 }
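
/*
 * The lock class methods above encode the lock state in the opaque 'how'
 * value: 0 means the rmlock was write-locked, any other value is the
 * address of the owning reader's rm_priotracker.  A rough sketch of the
 * round trip a generic lock_class consumer performs ('lo' is a
 * hypothetical struct lock_object pointer, not something defined here):
 *
 *	struct lock_class *class = LOCK_CLASS(lo);
 *	uintptr_t how;
 *
 *	how = class->lc_unlock(lo);
 *	(block, sleep, ...)
 *	class->lc_lock(lo, how);
 */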
179
180 #ifdef KDTRACE_HOOKS
181 static int
182 owner_rm(const struct lock_object *lock, struct thread **owner)
183 {
184         const struct rmlock *rm;
185         struct lock_class *lc;
186
187         rm = (const struct rmlock *)lock;
188         lc = LOCK_CLASS(&rm->rm_wlock_object);
189         return (lc->lc_owner(&rm->rm_wlock_object, owner));
190 }
191 #endif
192
193 static struct mtx rm_spinlock;
194
195 MTX_SYSINIT(rm_spinlock, &rm_spinlock, "rm_spinlock", MTX_SPIN);
196
197 /*
198  * Add or remove tracker from per-cpu list.
199  *
200  * The per-cpu list can be traversed at any time in the forward direction
201  * from an interrupt on the *local* cpu.
202  */
203 static inline void
204 rm_tracker_add(struct pcpu *pc, struct rm_priotracker *tracker)
205 {
206         struct rm_queue *next;
207
208         /* Initialize all tracker pointers. */
209         tracker->rmp_cpuQueue.rmq_prev = &pc->pc_rm_queue;
210         next = pc->pc_rm_queue.rmq_next;
211         tracker->rmp_cpuQueue.rmq_next = next;
212
213         /* rmq_prev is not used during forward traversal. */
214         next->rmq_prev = &tracker->rmp_cpuQueue;
215
216         /* Update pointer to first element. */
217         pc->pc_rm_queue.rmq_next = &tracker->rmp_cpuQueue;
218 }
219
220 /*
221  * Return the number of trackers that the thread 'td' already has on
222  * this CPU for the lock 'rm'.
223  */
224 static int
225 rm_trackers_present(const struct pcpu *pc, const struct rmlock *rm,
226     const struct thread *td)
227 {
228         struct rm_queue *queue;
229         struct rm_priotracker *tracker;
230         int count;
231
232         count = 0;
233         for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
234             queue = queue->rmq_next) {
235                 tracker = (struct rm_priotracker *)queue;
236                 if ((tracker->rmp_rmlock == rm) && (tracker->rmp_thread == td))
237                         count++;
238         }
239         return (count);
240 }
241
242 static inline void
243 rm_tracker_remove(struct pcpu *pc, struct rm_priotracker *tracker)
244 {
245         struct rm_queue *next, *prev;
246
247         next = tracker->rmp_cpuQueue.rmq_next;
248         prev = tracker->rmp_cpuQueue.rmq_prev;
249
250         /* Not used during forward traversal. */
251         next->rmq_prev = prev;
252
253         /* Remove from list. */
254         prev->rmq_next = next;
255 }
256
257 static void
258 rm_cleanIPI(void *arg)
259 {
260         struct pcpu *pc;
261         struct rmlock *rm = arg;
262         struct rm_priotracker *tracker;
263         struct rm_queue *queue;
264         pc = get_pcpu();
265
266         for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
267             queue = queue->rmq_next) {
268                 tracker = (struct rm_priotracker *)queue;
269                 if (tracker->rmp_rmlock == rm && tracker->rmp_flags == 0) {
270                         tracker->rmp_flags = RMPF_ONQUEUE;
271                         mtx_lock_spin(&rm_spinlock);
272                         LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
273                             rmp_qentry);
274                         mtx_unlock_spin(&rm_spinlock);
275                 }
276         }
277 }
278
279 void
280 rm_init_flags(struct rmlock *rm, const char *name, int opts)
281 {
282         struct lock_class *lc;
283         int liflags, xflags;
284
285         liflags = 0;
286         if (!(opts & RM_NOWITNESS))
287                 liflags |= LO_WITNESS;
288         if (opts & RM_RECURSE)
289                 liflags |= LO_RECURSABLE;
290         if (opts & RM_NEW)
291                 liflags |= LO_NEW;
292         if (opts & RM_DUPOK)
293                 liflags |= LO_DUPOK;
294         rm->rm_writecpus = all_cpus;
295         LIST_INIT(&rm->rm_activeReaders);
296         if (opts & RM_SLEEPABLE) {
297                 liflags |= LO_SLEEPABLE;
298                 lc = &lock_class_rm_sleepable;
299                 xflags = (opts & RM_NEW ? SX_NEW : 0);
300                 sx_init_flags(&rm->rm_lock_sx, "rmlock_sx",
301                     xflags | SX_NOWITNESS);
302         } else {
303                 lc = &lock_class_rm;
304                 xflags = (opts & RM_NEW ? MTX_NEW : 0);
305                 mtx_init(&rm->rm_lock_mtx, name, "rmlock_mtx",
306                     xflags | MTX_NOWITNESS);
307         }
308         lock_init(&rm->lock_object, lc, name, NULL, liflags);
309 }
310
311 void
312 rm_init(struct rmlock *rm, const char *name)
313 {
314
315         rm_init_flags(rm, name, 0);
316 }
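
/*
 * Illustrative sketch of the read-mostly lock KPI implemented in this
 * file (the "foo" names are hypothetical, not part of the kernel).
 * A reader keeps an rm_priotracker on its stack for the duration of the
 * read section; the rare writer uses the plain wlock/wunlock pair:
 *
 *	static struct rmlock foo_lock;
 *
 *	rm_init(&foo_lock, "foo");
 *
 *	reader:
 *		struct rm_priotracker tracker;
 *
 *		rm_rlock(&foo_lock, &tracker);
 *		(read the protected state)
 *		rm_runlock(&foo_lock, &tracker);
 *
 *	writer:
 *		rm_wlock(&foo_lock);
 *		(modify the protected state)
 *		rm_wunlock(&foo_lock);
 */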
317
318 void
319 rm_destroy(struct rmlock *rm)
320 {
321
322         rm_assert(rm, RA_UNLOCKED);
323         LIST_FIRST(&rm->rm_activeReaders) = RM_DESTROYED;
324         if (rm->lock_object.lo_flags & LO_SLEEPABLE)
325                 sx_destroy(&rm->rm_lock_sx);
326         else
327                 mtx_destroy(&rm->rm_lock_mtx);
328         lock_destroy(&rm->lock_object);
329 }
330
331 int
332 rm_wowned(const struct rmlock *rm)
333 {
334
335         if (rm->lock_object.lo_flags & LO_SLEEPABLE)
336                 return (sx_xlocked(&rm->rm_lock_sx));
337         else
338                 return (mtx_owned(&rm->rm_lock_mtx));
339 }
340
341 void
342 rm_sysinit(void *arg)
343 {
344         struct rm_args *args;
345
346         args = arg;
347         rm_init_flags(args->ra_rm, args->ra_desc, args->ra_flags);
348 }
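
/*
 * rm_sysinit() is the startup hook used by the RM_SYSINIT() and
 * RM_SYSINIT_FLAGS() convenience macros in sys/rmlock.h.  Roughly, with a
 * hypothetical "foo" lock:
 *
 *	static struct rmlock foo_lock;
 *	RM_SYSINIT(foo_lock_init, &foo_lock, "foo lock");
 */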
349
350 static __noinline int
351 _rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
352 {
353         struct pcpu *pc;
354
355         critical_enter();
356         pc = get_pcpu();
357
358         /* Check if we just need to do a proper critical_exit. */
359         if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) {
360                 critical_exit();
361                 return (1);
362         }
363
364         /* Remove our tracker from the per-cpu list. */
365         rm_tracker_remove(pc, tracker);
366
367         /*
368          * Check to see if the IPI granted us the lock after all.  The load of
369          * rmp_flags must happen after the tracker is removed from the list.
370          */
371         atomic_interrupt_fence();
372         if (tracker->rmp_flags) {
373                 /* Just add back the tracker; we hold the lock. */
374                 rm_tracker_add(pc, tracker);
375                 critical_exit();
376                 return (1);
377         }
378
379         /*
380          * We allow readers to acquire a lock even if a writer is blocked,
381          * provided the lock is recursive and the reader already holds the lock.
382          */
383         if ((rm->lock_object.lo_flags & LO_RECURSABLE) != 0) {
384                 /*
385                  * Just grant the lock if this thread already has a tracker
386                  * for this lock on the per-cpu queue.
387                  */
388                 if (rm_trackers_present(pc, rm, curthread) != 0) {
389                         mtx_lock_spin(&rm_spinlock);
390                         LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
391                             rmp_qentry);
392                         tracker->rmp_flags = RMPF_ONQUEUE;
393                         mtx_unlock_spin(&rm_spinlock);
394                         rm_tracker_add(pc, tracker);
395                         critical_exit();
396                         return (1);
397                 }
398         }
399
400         sched_unpin();
401         critical_exit();
402
403         if (trylock) {
404                 if (rm->lock_object.lo_flags & LO_SLEEPABLE) {
405                         if (!sx_try_xlock(&rm->rm_lock_sx))
406                                 return (0);
407                 } else {
408                         if (!mtx_trylock(&rm->rm_lock_mtx))
409                                 return (0);
410                 }
411         } else {
412                 if (rm->lock_object.lo_flags & LO_SLEEPABLE) {
413                         THREAD_SLEEPING_OK();
414                         sx_xlock(&rm->rm_lock_sx);
415                         THREAD_NO_SLEEPING();
416                 } else
417                         mtx_lock(&rm->rm_lock_mtx);
418         }
419
420         critical_enter();
421         pc = get_pcpu();
422         CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus);
423         rm_tracker_add(pc, tracker);
424         sched_pin();
425         critical_exit();
426
427         if (rm->lock_object.lo_flags & LO_SLEEPABLE)
428                 sx_xunlock(&rm->rm_lock_sx);
429         else
430                 mtx_unlock(&rm->rm_lock_mtx);
431
432         return (1);
433 }
434
435 int
436 _rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
437 {
438         struct thread *td = curthread;
439         struct pcpu *pc;
440
441         if (SCHEDULER_STOPPED())
442                 return (1);
443
444         tracker->rmp_flags  = 0;
445         tracker->rmp_thread = td;
446         tracker->rmp_rmlock = rm;
447
448         if (rm->lock_object.lo_flags & LO_SLEEPABLE)
449                 THREAD_NO_SLEEPING();
450
451         td->td_critnest++;      /* critical_enter(); */
452         atomic_interrupt_fence();
453
454         pc = cpuid_to_pcpu[td->td_oncpu];
455         rm_tracker_add(pc, tracker);
456         sched_pin();
457
458         atomic_interrupt_fence();
459         td->td_critnest--;
460
461         /*
462          * Fast path to combine two common conditions into a single
463          * conditional jump.
464          */
465         if (__predict_true(0 == (td->td_owepreempt |
466             CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus))))
467                 return (1);
468
469         /* We do not have a read token and need to acquire one. */
470         return (_rm_rlock_hard(rm, tracker, trylock));
471 }
472
473 static __noinline void
474 _rm_unlock_hard(struct thread *td, struct rm_priotracker *tracker)
475 {
476
477         if (td->td_owepreempt) {
478                 td->td_critnest++;
479                 critical_exit();
480         }
481
482         if (!tracker->rmp_flags)
483                 return;
484
485         mtx_lock_spin(&rm_spinlock);
486         LIST_REMOVE(tracker, rmp_qentry);
487
488         if (tracker->rmp_flags & RMPF_SIGNAL) {
489                 struct rmlock *rm;
490                 struct turnstile *ts;
491
492                 rm = tracker->rmp_rmlock;
493
494                 turnstile_chain_lock(&rm->lock_object);
495                 mtx_unlock_spin(&rm_spinlock);
496
497                 ts = turnstile_lookup(&rm->lock_object);
498
499                 turnstile_signal(ts, TS_EXCLUSIVE_QUEUE);
500                 turnstile_unpend(ts);
501                 turnstile_chain_unlock(&rm->lock_object);
502         } else
503                 mtx_unlock_spin(&rm_spinlock);
504 }
505
506 void
507 _rm_runlock(struct rmlock *rm, struct rm_priotracker *tracker)
508 {
509         struct pcpu *pc;
510         struct thread *td = tracker->rmp_thread;
511
512         if (SCHEDULER_STOPPED())
513                 return;
514
515         td->td_critnest++;      /* critical_enter(); */
516         atomic_interrupt_fence();
517
518         pc = cpuid_to_pcpu[td->td_oncpu];
519         rm_tracker_remove(pc, tracker);
520
521         atomic_interrupt_fence();
522         td->td_critnest--;
523         sched_unpin();
524
525         if (rm->lock_object.lo_flags & LO_SLEEPABLE)
526                 THREAD_SLEEPING_OK();
527
528         if (__predict_true(0 == (td->td_owepreempt | tracker->rmp_flags)))
529                 return;
530
531         _rm_unlock_hard(td, tracker);
532 }
533
534 void
535 _rm_wlock(struct rmlock *rm)
536 {
537         struct rm_priotracker *prio;
538         struct turnstile *ts;
539         cpuset_t readcpus;
540
541         if (SCHEDULER_STOPPED())
542                 return;
543
544         if (rm->lock_object.lo_flags & LO_SLEEPABLE)
545                 sx_xlock(&rm->rm_lock_sx);
546         else
547                 mtx_lock(&rm->rm_lock_mtx);
548
549         if (CPU_CMP(&rm->rm_writecpus, &all_cpus)) {
550                 /* Get all read tokens back. */
551                 readcpus = all_cpus;
552                 CPU_ANDNOT(&readcpus, &readcpus, &rm->rm_writecpus);
553                 rm->rm_writecpus = all_cpus;
554
555                 /*
556                  * Assumes rm->rm_writecpus update is visible on other CPUs
557                  * before rm_cleanIPI is called.
558                  */
559 #ifdef SMP
560                 smp_rendezvous_cpus(readcpus,
561                     smp_no_rendezvous_barrier,
562                     rm_cleanIPI,
563                     smp_no_rendezvous_barrier,
564                     rm);
565
566 #else
567                 rm_cleanIPI(rm);
568 #endif
569
570                 mtx_lock_spin(&rm_spinlock);
571                 while ((prio = LIST_FIRST(&rm->rm_activeReaders)) != NULL) {
572                         ts = turnstile_trywait(&rm->lock_object);
573                         prio->rmp_flags = RMPF_ONQUEUE | RMPF_SIGNAL;
574                         mtx_unlock_spin(&rm_spinlock);
575                         turnstile_wait(ts, prio->rmp_thread,
576                             TS_EXCLUSIVE_QUEUE);
577                         mtx_lock_spin(&rm_spinlock);
578                 }
579                 mtx_unlock_spin(&rm_spinlock);
580         }
581 }
582
583 void
584 _rm_wunlock(struct rmlock *rm)
585 {
586
587         if (rm->lock_object.lo_flags & LO_SLEEPABLE)
588                 sx_xunlock(&rm->rm_lock_sx);
589         else
590                 mtx_unlock(&rm->rm_lock_mtx);
591 }
592
593 #if LOCK_DEBUG > 0
594
595 void
596 _rm_wlock_debug(struct rmlock *rm, const char *file, int line)
597 {
598
599         if (SCHEDULER_STOPPED())
600                 return;
601
602         KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
603             ("rm_wlock() by idle thread %p on rmlock %s @ %s:%d",
604             curthread, rm->lock_object.lo_name, file, line));
605         KASSERT(!rm_destroyed(rm),
606             ("rm_wlock() of destroyed rmlock @ %s:%d", file, line));
607         _rm_assert(rm, RA_UNLOCKED, file, line);
608
609         WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE,
610             file, line, NULL);
611
612         _rm_wlock(rm);
613
614         LOCK_LOG_LOCK("RMWLOCK", &rm->lock_object, 0, 0, file, line);
615         WITNESS_LOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
616         TD_LOCKS_INC(curthread);
617 }
618
619 void
620 _rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
621 {
622
623         if (SCHEDULER_STOPPED())
624                 return;
625
626         KASSERT(!rm_destroyed(rm),
627             ("rm_wunlock() of destroyed rmlock @ %s:%d", file, line));
628         _rm_assert(rm, RA_WLOCKED, file, line);
629         WITNESS_UNLOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
630         LOCK_LOG_LOCK("RMWUNLOCK", &rm->lock_object, 0, 0, file, line);
631         _rm_wunlock(rm);
632         TD_LOCKS_DEC(curthread);
633 }
634
635 int
636 _rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
637     int trylock, const char *file, int line)
638 {
639
640         if (SCHEDULER_STOPPED())
641                 return (1);
642
643 #ifdef INVARIANTS
644         if (!(rm->lock_object.lo_flags & LO_RECURSABLE) && !trylock) {
645                 critical_enter();
646                 KASSERT(rm_trackers_present(get_pcpu(), rm,
647                     curthread) == 0,
648                     ("rm_rlock: recursed on non-recursive rmlock %s @ %s:%d\n",
649                     rm->lock_object.lo_name, file, line));
650                 critical_exit();
651         }
652 #endif
653         KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
654             ("rm_rlock() by idle thread %p on rmlock %s @ %s:%d",
655             curthread, rm->lock_object.lo_name, file, line));
656         KASSERT(!rm_destroyed(rm),
657             ("rm_rlock() of destroyed rmlock @ %s:%d", file, line));
658         if (!trylock) {
659                 KASSERT(!rm_wowned(rm),
660                     ("rm_rlock: wlock already held for %s @ %s:%d",
661                     rm->lock_object.lo_name, file, line));
662                 WITNESS_CHECKORDER(&rm->lock_object,
663                     LOP_NEWORDER | LOP_NOSLEEP, file, line, NULL);
664         }
665
666         if (_rm_rlock(rm, tracker, trylock)) {
667                 if (trylock)
668                         LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 1, file,
669                             line);
670                 else
671                         LOCK_LOG_LOCK("RMRLOCK", &rm->lock_object, 0, 0, file,
672                             line);
673                 WITNESS_LOCK(&rm->lock_object, LOP_NOSLEEP, file, line);
674                 TD_LOCKS_INC(curthread);
675                 return (1);
676         } else if (trylock)
677                 LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 0, file, line);
678
679         return (0);
680 }
681
682 void
683 _rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
684     const char *file, int line)
685 {
686
687         if (SCHEDULER_STOPPED())
688                 return;
689
690         KASSERT(!rm_destroyed(rm),
691             ("rm_runlock() of destroyed rmlock @ %s:%d", file, line));
692         _rm_assert(rm, RA_RLOCKED, file, line);
693         WITNESS_UNLOCK(&rm->lock_object, 0, file, line);
694         LOCK_LOG_LOCK("RMRUNLOCK", &rm->lock_object, 0, 0, file, line);
695         _rm_runlock(rm, tracker);
696         TD_LOCKS_DEC(curthread);
697 }
698
699 #else
700
701 /*
702  * Just strip out the file and line arguments if no lock debugging is enabled
703  * in the kernel; we are called from a kernel module.
704  */
705 void
706 _rm_wlock_debug(struct rmlock *rm, const char *file, int line)
707 {
708
709         _rm_wlock(rm);
710 }
711
712 void
713 _rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
714 {
715
716         _rm_wunlock(rm);
717 }
718
719 int
720 _rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
721     int trylock, const char *file, int line)
722 {
723
724         return (_rm_rlock(rm, tracker, trylock));
725 }
726
727 void
728 _rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
729     const char *file, int line)
730 {
731
732         _rm_runlock(rm, tracker);
733 }
734
735 #endif
736
737 #ifdef INVARIANT_SUPPORT
738 #ifndef INVARIANTS
739 #undef _rm_assert
740 #endif
741
742 /*
743  * Note that this does not need to use witness_assert() for read lock
744  * assertions since an exact count of read locks held by this thread
745  * is computable.
746  */
747 void
748 _rm_assert(const struct rmlock *rm, int what, const char *file, int line)
749 {
750         int count;
751
752         if (SCHEDULER_STOPPED())
753                 return;
754         switch (what) {
755         case RA_LOCKED:
756         case RA_LOCKED | RA_RECURSED:
757         case RA_LOCKED | RA_NOTRECURSED:
758         case RA_RLOCKED:
759         case RA_RLOCKED | RA_RECURSED:
760         case RA_RLOCKED | RA_NOTRECURSED:
761                 /*
762                  * Handle the write-locked case.  Unlike other
763                  * primitives, writers can never recurse.
764                  */
765                 if (rm_wowned(rm)) {
766                         if (what & RA_RLOCKED)
767                                 panic("Lock %s exclusively locked @ %s:%d\n",
768                                     rm->lock_object.lo_name, file, line);
769                         if (what & RA_RECURSED)
770                                 panic("Lock %s not recursed @ %s:%d\n",
771                                     rm->lock_object.lo_name, file, line);
772                         break;
773                 }
774
775                 critical_enter();
776                 count = rm_trackers_present(get_pcpu(), rm, curthread);
777                 critical_exit();
778
779                 if (count == 0)
780                         panic("Lock %s not %slocked @ %s:%d\n",
781                             rm->lock_object.lo_name, (what & RA_RLOCKED) ?
782                             "read " : "", file, line);
783                 if (count > 1) {
784                         if (what & RA_NOTRECURSED)
785                                 panic("Lock %s recursed @ %s:%d\n",
786                                     rm->lock_object.lo_name, file, line);
787                 } else if (what & RA_RECURSED)
788                         panic("Lock %s not recursed @ %s:%d\n",
789                             rm->lock_object.lo_name, file, line);
790                 break;
791         case RA_WLOCKED:
792                 if (!rm_wowned(rm))
793                         panic("Lock %s not exclusively locked @ %s:%d\n",
794                             rm->lock_object.lo_name, file, line);
795                 break;
796         case RA_UNLOCKED:
797                 if (rm_wowned(rm))
798                         panic("Lock %s exclusively locked @ %s:%d\n",
799                             rm->lock_object.lo_name, file, line);
800
801                 critical_enter();
802                 count = rm_trackers_present(get_pcpu(), rm, curthread);
803                 critical_exit();
804
805                 if (count != 0)
806                         panic("Lock %s read locked @ %s:%d\n",
807                             rm->lock_object.lo_name, file, line);
808                 break;
809         default:
810                 panic("Unknown rm lock assertion: %d @ %s:%d", what, file,
811                     line);
812         }
813 }
814 #endif /* INVARIANT_SUPPORT */
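
/*
 * An illustrative use of the assertions above in a consumer (the "foo"
 * names are hypothetical):
 *
 *	static void
 *	foo_modify(struct foo_softc *sc)
 *	{
 *
 *		rm_assert(&foo_lock, RA_WLOCKED);
 *		sc->foo_gen++;
 *	}
 */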
815
816 #ifdef DDB
817 static void
818 print_tracker(struct rm_priotracker *tr)
819 {
820         struct thread *td;
821
822         td = tr->rmp_thread;
823         db_printf("   thread %p (tid %d, pid %d, \"%s\") {", td, td->td_tid,
824             td->td_proc->p_pid, td->td_name);
825         if (tr->rmp_flags & RMPF_ONQUEUE) {
826                 db_printf("ONQUEUE");
827                 if (tr->rmp_flags & RMPF_SIGNAL)
828                         db_printf(",SIGNAL");
829         } else
830                 db_printf("0");
831         db_printf("}\n");
832 }
833
834 static void
835 db_show_rm(const struct lock_object *lock)
836 {
837         struct rm_priotracker *tr;
838         struct rm_queue *queue;
839         const struct rmlock *rm;
840         struct lock_class *lc;
841         struct pcpu *pc;
842
843         rm = (const struct rmlock *)lock;
844         db_printf(" writecpus: ");
845         ddb_display_cpuset(__DEQUALIFY(const cpuset_t *, &rm->rm_writecpus));
846         db_printf("\n");
847         db_printf(" per-CPU readers:\n");
848         STAILQ_FOREACH(pc, &cpuhead, pc_allcpu)
849                 for (queue = pc->pc_rm_queue.rmq_next;
850                     queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
851                         tr = (struct rm_priotracker *)queue;
852                         if (tr->rmp_rmlock == rm)
853                                 print_tracker(tr);
854                 }
855         db_printf(" active readers:\n");
856         LIST_FOREACH(tr, &rm->rm_activeReaders, rmp_qentry)
857                 print_tracker(tr);
858         lc = LOCK_CLASS(&rm->rm_wlock_object);
859         db_printf("Backing write-lock (%s):\n", lc->lc_name);
860         lc->lc_ddb_show(&rm->rm_wlock_object);
861 }
862 #endif
863
864 /*
865  * Read-mostly sleepable locks.
866  *
867  * These primitives allow both readers and writers to sleep. However, neither
868  * readers nor writers are tracked and consequently there is no priority
869  * propagation.
870  *
871  * They are intended to be used only when write-locking is almost never needed
872  * (e.g., they can guard against unloading a kernel module) while read-locking
873  * happens all the time.
874  *
875  * Concurrent writers take turns taking the lock while going off CPU. If this
876  * is a concern for your use case, this is not the right primitive.
877  *
878  * Neither rms_rlock nor rms_runlock uses thread fences. Instead, interrupt
879  * fences are inserted to ensure ordering with the code executed in the IPI
880  * handler.
881  *
882  * No attempt is made to track which CPUs read-locked at least once;
883  * consequently, write-locking sends IPIs to all of them. This will become a
884  * problem at some point. The easiest way to lessen it is to provide a bitmap.
885  */
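
/*
 * Illustrative sketch of the interface defined below (the "foo" names are
 * hypothetical):
 *
 *	static struct rmslock foo_rms;
 *
 *	rms_init(&foo_rms, "foo rms");
 *
 *	reader (taken all the time, may sleep while held):
 *		rms_rlock(&foo_rms);
 *		(use the protected state, possibly sleeping)
 *		rms_runlock(&foo_rms);
 *
 *	writer (expected to be very rare):
 *		rms_wlock(&foo_rms);
 *		(modify the protected state)
 *		rms_wunlock(&foo_rms);
 */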
886
887 #define RMS_NOOWNER     ((void *)0x1)
888 #define RMS_TRANSIENT   ((void *)0x2)
889 #define RMS_FLAGMASK    0xf
890
891 struct rmslock_pcpu {
892         int influx;
893         int readers;
894 };
895
896 _Static_assert(sizeof(struct rmslock_pcpu) == 8, "bad size");
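
/*
 * Rough accounting maintained by the code below: the number of read locks
 * held equals rms->readers plus the sum of pcpu->readers over all CPUs
 * (tracked redundantly in rms->debug_readers under INVARIANTS).  A read
 * section normally touches only its per-CPU counter; a pending writer
 * folds the per-CPU counts into rms->readers with an IPI rendezvous (see
 * rms_wlock_switch()).  pcpu->influx is set while a reader manipulates its
 * per-CPU state, telling the rendezvous handler to skip that CPU and
 * making the writer spin in rms_wait_func() until the window closes.
 */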
897
898 /*
899  * Internal routines
900  */
901 static struct rmslock_pcpu *
902 rms_int_pcpu(struct rmslock *rms)
903 {
904
905         CRITICAL_ASSERT(curthread);
906         return (zpcpu_get(rms->pcpu));
907 }
908
909 static struct rmslock_pcpu *
910 rms_int_remote_pcpu(struct rmslock *rms, int cpu)
911 {
912
913         return (zpcpu_get_cpu(rms->pcpu, cpu));
914 }
915
916 static void
917 rms_int_influx_enter(struct rmslock *rms, struct rmslock_pcpu *pcpu)
918 {
919
920         CRITICAL_ASSERT(curthread);
921         MPASS(pcpu->influx == 0);
922         pcpu->influx = 1;
923 }
924
925 static void
926 rms_int_influx_exit(struct rmslock *rms, struct rmslock_pcpu *pcpu)
927 {
928
929         CRITICAL_ASSERT(curthread);
930         MPASS(pcpu->influx == 1);
931         pcpu->influx = 0;
932 }
933
934 #ifdef INVARIANTS
935 static void
936 rms_int_debug_readers_inc(struct rmslock *rms)
937 {
938         int old;
939         old = atomic_fetchadd_int(&rms->debug_readers, 1);
940         KASSERT(old >= 0, ("%s: bad readers count %d\n", __func__, old));
941 }
942
943 static void
944 rms_int_debug_readers_dec(struct rmslock *rms)
945 {
946         int old;
947
948         old = atomic_fetchadd_int(&rms->debug_readers, -1);
949         KASSERT(old > 0, ("%s: bad readers count %d\n", __func__, old));
950 }
951 #else
952 static void
953 rms_int_debug_readers_inc(struct rmslock *rms)
954 {
955 }
956
957 static void
958 rms_int_debug_readers_dec(struct rmslock *rms)
959 {
960 }
961 #endif
962
963 static void
964 rms_int_readers_inc(struct rmslock *rms, struct rmslock_pcpu *pcpu)
965 {
966
967         CRITICAL_ASSERT(curthread);
968         rms_int_debug_readers_inc(rms);
969         pcpu->readers++;
970 }
971
972 static void
973 rms_int_readers_dec(struct rmslock *rms, struct rmslock_pcpu *pcpu)
974 {
975
976         CRITICAL_ASSERT(curthread);
977         rms_int_debug_readers_dec(rms);
978         pcpu->readers--;
979 }
980
981 /*
982  * Public API
983  */
984 void
985 rms_init(struct rmslock *rms, const char *name)
986 {
987
988         rms->owner = RMS_NOOWNER;
989         rms->writers = 0;
990         rms->readers = 0;
991         rms->debug_readers = 0;
992         mtx_init(&rms->mtx, name, NULL, MTX_DEF | MTX_NEW);
993         rms->pcpu = uma_zalloc_pcpu(pcpu_zone_8, M_WAITOK | M_ZERO);
994 }
995
996 void
997 rms_destroy(struct rmslock *rms)
998 {
999
1000         MPASS(rms->writers == 0);
1001         MPASS(rms->readers == 0);
1002         mtx_destroy(&rms->mtx);
1003         uma_zfree_pcpu(pcpu_zone_8, rms->pcpu);
1004 }
1005
1006 static void __noinline
1007 rms_rlock_fallback(struct rmslock *rms)
1008 {
1009
1010         rms_int_influx_exit(rms, rms_int_pcpu(rms));
1011         critical_exit();
1012
1013         mtx_lock(&rms->mtx);
1014         while (rms->writers > 0)
1015                 msleep(&rms->readers, &rms->mtx, PUSER - 1, mtx_name(&rms->mtx), 0);
1016         critical_enter();
1017         rms_int_readers_inc(rms, rms_int_pcpu(rms));
1018         mtx_unlock(&rms->mtx);
1019         critical_exit();
1020         TD_LOCKS_INC(curthread);
1021 }
1022
1023 void
1024 rms_rlock(struct rmslock *rms)
1025 {
1026         struct rmslock_pcpu *pcpu;
1027
1028         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
1029         MPASS(atomic_load_ptr(&rms->owner) != curthread);
1030
1031         critical_enter();
1032         pcpu = rms_int_pcpu(rms);
1033         rms_int_influx_enter(rms, pcpu);
1034         atomic_interrupt_fence();
1035         if (__predict_false(rms->writers > 0)) {
1036                 rms_rlock_fallback(rms);
1037                 return;
1038         }
1039         atomic_interrupt_fence();
1040         rms_int_readers_inc(rms, pcpu);
1041         atomic_interrupt_fence();
1042         rms_int_influx_exit(rms, pcpu);
1043         critical_exit();
1044         TD_LOCKS_INC(curthread);
1045 }
1046
1047 int
1048 rms_try_rlock(struct rmslock *rms)
1049 {
1050         struct rmslock_pcpu *pcpu;
1051
1052         MPASS(atomic_load_ptr(&rms->owner) != curthread);
1053
1054         critical_enter();
1055         pcpu = rms_int_pcpu(rms);
1056         rms_int_influx_enter(rms, pcpu);
1057         atomic_interrupt_fence();
1058         if (__predict_false(rms->writers > 0)) {
1059                 rms_int_influx_exit(rms, pcpu);
1060                 critical_exit();
1061                 return (0);
1062         }
1063         atomic_interrupt_fence();
1064         rms_int_readers_inc(rms, pcpu);
1065         atomic_interrupt_fence();
1066         rms_int_influx_exit(rms, pcpu);
1067         critical_exit();
1068         TD_LOCKS_INC(curthread);
1069         return (1);
1070 }
1071
1072 static void __noinline
1073 rms_runlock_fallback(struct rmslock *rms)
1074 {
1075
1076         rms_int_influx_exit(rms, rms_int_pcpu(rms));
1077         critical_exit();
1078
1079         mtx_lock(&rms->mtx);
1080         MPASS(rms->writers > 0);
1081         MPASS(rms->readers > 0);
1082         MPASS(rms->debug_readers == rms->readers);
1083         rms_int_debug_readers_dec(rms);
1084         rms->readers--;
1085         if (rms->readers == 0)
1086                 wakeup_one(&rms->writers);
1087         mtx_unlock(&rms->mtx);
1088         TD_LOCKS_DEC(curthread);
1089 }
1090
1091 void
1092 rms_runlock(struct rmslock *rms)
1093 {
1094         struct rmslock_pcpu *pcpu;
1095
1096         critical_enter();
1097         pcpu = rms_int_pcpu(rms);
1098         rms_int_influx_enter(rms, pcpu);
1099         atomic_interrupt_fence();
1100         if (__predict_false(rms->writers > 0)) {
1101                 rms_runlock_fallback(rms);
1102                 return;
1103         }
1104         atomic_interrupt_fence();
1105         rms_int_readers_dec(rms, pcpu);
1106         atomic_interrupt_fence();
1107         rms_int_influx_exit(rms, pcpu);
1108         critical_exit();
1109         TD_LOCKS_DEC(curthread);
1110 }
1111
1112 struct rmslock_ipi {
1113         struct rmslock *rms;
1114         struct smp_rendezvous_cpus_retry_arg srcra;
1115 };
1116
1117 static void
1118 rms_action_func(void *arg)
1119 {
1120         struct rmslock_ipi *rmsipi;
1121         struct rmslock_pcpu *pcpu;
1122         struct rmslock *rms;
1123
1124         rmsipi = __containerof(arg, struct rmslock_ipi, srcra);
1125         rms = rmsipi->rms;
1126         pcpu = rms_int_pcpu(rms);
1127
1128         if (pcpu->influx)
1129                 return;
1130         if (pcpu->readers != 0) {
1131                 atomic_add_int(&rms->readers, pcpu->readers);
1132                 pcpu->readers = 0;
1133         }
1134         smp_rendezvous_cpus_done(arg);
1135 }
1136
1137 static void
1138 rms_wait_func(void *arg, int cpu)
1139 {
1140         struct rmslock_ipi *rmsipi;
1141         struct rmslock_pcpu *pcpu;
1142         struct rmslock *rms;
1143
1144         rmsipi = __containerof(arg, struct rmslock_ipi, srcra);
1145         rms = rmsipi->rms;
1146         pcpu = rms_int_remote_pcpu(rms, cpu);
1147
1148         while (atomic_load_int(&pcpu->influx))
1149                 cpu_spinwait();
1150 }
1151
1152 #ifdef INVARIANTS
1153 static void
1154 rms_assert_no_pcpu_readers(struct rmslock *rms)
1155 {
1156         struct rmslock_pcpu *pcpu;
1157         int cpu;
1158
1159         CPU_FOREACH(cpu) {
1160                 pcpu = rms_int_remote_pcpu(rms, cpu);
1161                 if (pcpu->readers != 0) {
1162                         panic("%s: got %d readers on cpu %d\n", __func__,
1163                             pcpu->readers, cpu);
1164                 }
1165         }
1166 }
1167 #else
1168 static void
1169 rms_assert_no_pcpu_readers(struct rmslock *rms)
1170 {
1171 }
1172 #endif
1173
1174 static void
1175 rms_wlock_switch(struct rmslock *rms)
1176 {
1177         struct rmslock_ipi rmsipi;
1178
1179         MPASS(rms->readers == 0);
1180         MPASS(rms->writers == 1);
1181
1182         rmsipi.rms = rms;
1183
1184         smp_rendezvous_cpus_retry(all_cpus,
1185             smp_no_rendezvous_barrier,
1186             rms_action_func,
1187             smp_no_rendezvous_barrier,
1188             rms_wait_func,
1189             &rmsipi.srcra);
1190 }
1191
1192 void
1193 rms_wlock(struct rmslock *rms)
1194 {
1195
1196         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
1197         MPASS(atomic_load_ptr(&rms->owner) != curthread);
1198
1199         mtx_lock(&rms->mtx);
1200         rms->writers++;
1201         if (rms->writers > 1) {
1202                 msleep(&rms->owner, &rms->mtx, (PUSER - 1),
1203                     mtx_name(&rms->mtx), 0);
1204                 MPASS(rms->readers == 0);
1205                 KASSERT(rms->owner == RMS_TRANSIENT,
1206                     ("%s: unexpected owner value %p\n", __func__,
1207                     rms->owner));
1208                 goto out_grab;
1209         }
1210
1211         KASSERT(rms->owner == RMS_NOOWNER,
1212             ("%s: unexpected owner value %p\n", __func__, rms->owner));
1213
1214         rms_wlock_switch(rms);
1215         rms_assert_no_pcpu_readers(rms);
1216
1217         if (rms->readers > 0) {
1218                 msleep(&rms->writers, &rms->mtx, (PUSER - 1),
1219                     mtx_name(&rms->mtx), 0);
1220         }
1221
1222 out_grab:
1223         rms->owner = curthread;
1224         rms_assert_no_pcpu_readers(rms);
1225         mtx_unlock(&rms->mtx);
1226         MPASS(rms->readers == 0);
1227         TD_LOCKS_INC(curthread);
1228 }
1229
1230 void
1231 rms_wunlock(struct rmslock *rms)
1232 {
1233
1234         mtx_lock(&rms->mtx);
1235         KASSERT(rms->owner == curthread,
1236             ("%s: unexpected owner value %p\n", __func__, rms->owner));
1237         MPASS(rms->writers >= 1);
1238         MPASS(rms->readers == 0);
1239         rms->writers--;
1240         if (rms->writers > 0) {
1241                 wakeup_one(&rms->owner);
1242                 rms->owner = RMS_TRANSIENT;
1243         } else {
1244                 wakeup(&rms->readers);
1245                 rms->owner = RMS_NOOWNER;
1246         }
1247         mtx_unlock(&rms->mtx);
1248         TD_LOCKS_DEC(curthread);
1249 }
1250
1251 void
1252 rms_unlock(struct rmslock *rms)
1253 {
1254
1255         if (rms_wowned(rms))
1256                 rms_wunlock(rms);
1257         else
1258                 rms_runlock(rms);
1259 }