1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
5  *  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice(s), this list of conditions and the following disclaimer as
12  *    the first lines of this file unmodified other than the possible
13  *    addition of one or more copyright notices.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice(s), this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
19  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
22  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
28  * DAMAGE.
29  */
30
31 #include "opt_witness.h"
32 #include "opt_hwpmc_hooks.h"
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/lock.h>
41 #include <sys/mutex.h>
42 #include <sys/proc.h>
43 #include <sys/bitstring.h>
44 #include <sys/epoch.h>
45 #include <sys/rangelock.h>
46 #include <sys/resourcevar.h>
47 #include <sys/sdt.h>
48 #include <sys/smp.h>
49 #include <sys/sched.h>
50 #include <sys/sleepqueue.h>
51 #include <sys/selinfo.h>
52 #include <sys/syscallsubr.h>
53 #include <sys/sysent.h>
54 #include <sys/turnstile.h>
55 #include <sys/taskqueue.h>
56 #include <sys/ktr.h>
57 #include <sys/rwlock.h>
58 #include <sys/umtx.h>
59 #include <sys/vmmeter.h>
60 #include <sys/cpuset.h>
61 #ifdef  HWPMC_HOOKS
62 #include <sys/pmckern.h>
63 #endif
64 #include <sys/priv.h>
65
66 #include <security/audit/audit.h>
67
68 #include <vm/pmap.h>
69 #include <vm/vm.h>
70 #include <vm/vm_extern.h>
71 #include <vm/uma.h>
72 #include <vm/vm_phys.h>
73 #include <sys/eventhandler.h>
74
75 /*
76  * Asserts below verify the stability of struct thread and struct proc
77  * layout, as exposed by KBI to modules.  On head, the KBI is allowed
78  * to drift; changes to the structures must be accompanied by the
79  * corresponding assert updates.
80  *
81  * On the stable branches after KBI freeze, conditions must not be
82  * violated.  Typically new fields are moved to the end of the
83  * structures.
84  */
85 #ifdef __amd64__
86 _Static_assert(offsetof(struct thread, td_flags) == 0xfc,
87     "struct thread KBI td_flags");
88 _Static_assert(offsetof(struct thread, td_pflags) == 0x104,
89     "struct thread KBI td_pflags");
90 _Static_assert(offsetof(struct thread, td_frame) == 0x4a0,
91     "struct thread KBI td_frame");
92 _Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0,
93     "struct thread KBI td_emuldata");
94 _Static_assert(offsetof(struct proc, p_flag) == 0xb8,
95     "struct proc KBI p_flag");
96 _Static_assert(offsetof(struct proc, p_pid) == 0xc4,
97     "struct proc KBI p_pid");
98 _Static_assert(offsetof(struct proc, p_filemon) == 0x3c0,
99     "struct proc KBI p_filemon");
100 _Static_assert(offsetof(struct proc, p_comm) == 0x3d8,
101     "struct proc KBI p_comm");
102 _Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8,
103     "struct proc KBI p_emuldata");
104 #endif
105 #ifdef __i386__
106 _Static_assert(offsetof(struct thread, td_flags) == 0x98,
107     "struct thread KBI td_flags");
108 _Static_assert(offsetof(struct thread, td_pflags) == 0xa0,
109     "struct thread KBI td_pflags");
110 _Static_assert(offsetof(struct thread, td_frame) == 0x300,
111     "struct thread KBI td_frame");
112 _Static_assert(offsetof(struct thread, td_emuldata) == 0x344,
113     "struct thread KBI td_emuldata");
114 _Static_assert(offsetof(struct proc, p_flag) == 0x6c,
115     "struct proc KBI p_flag");
116 _Static_assert(offsetof(struct proc, p_pid) == 0x78,
117     "struct proc KBI p_pid");
118 _Static_assert(offsetof(struct proc, p_filemon) == 0x26c,
119     "struct proc KBI p_filemon");
120 _Static_assert(offsetof(struct proc, p_comm) == 0x280,
121     "struct proc KBI p_comm");
122 _Static_assert(offsetof(struct proc, p_emuldata) == 0x30c,
123     "struct proc KBI p_emuldata");
124 #endif
125
126 SDT_PROVIDER_DECLARE(proc);
127 SDT_PROBE_DEFINE(proc, , , lwp__exit);
128
129 /*
130  * thread related storage.
131  */
132 static uma_zone_t thread_zone;
133
134 struct thread_domain_data {
135         struct thread   *tdd_zombies;
136         int             tdd_reapticks;
137 } __aligned(CACHE_LINE_SIZE);
138
139 static struct thread_domain_data thread_domain_data[MAXMEMDOM];
140
141 static struct task      thread_reap_task;
142 static struct callout   thread_reap_callout;
143
144 static void thread_zombie(struct thread *);
145 static void thread_reap_all(void);
146 static void thread_reap_task_cb(void *, int);
147 static void thread_reap_callout_cb(void *);
148 static int thread_unsuspend_one(struct thread *td, struct proc *p,
149     bool boundary);
150 static void thread_free_batched(struct thread *td);
151
152 static __exclusive_cache_line struct mtx tid_lock;
153 static bitstr_t *tid_bitmap;
154
155 static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash");
156
157 static int maxthread;
158 SYSCTL_INT(_kern, OID_AUTO, maxthread, CTLFLAG_RDTUN,
159     &maxthread, 0, "Maximum number of threads");
160
161 static __exclusive_cache_line int nthreads;
162
163 static LIST_HEAD(tidhashhead, thread) *tidhashtbl;
164 static u_long   tidhash;
165 static u_long   tidhashlock;
166 static struct   rwlock *tidhashtbl_lock;
167 #define TIDHASH(tid)            (&tidhashtbl[(tid) & tidhash])
168 #define TIDHASHLOCK(tid)        (&tidhashtbl_lock[(tid) & tidhashlock])
169
170 EVENTHANDLER_LIST_DEFINE(thread_ctor);
171 EVENTHANDLER_LIST_DEFINE(thread_dtor);
172 EVENTHANDLER_LIST_DEFINE(thread_init);
173 EVENTHANDLER_LIST_DEFINE(thread_fini);
174
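/*
 * Thread accounting: nthreads counts allocated threads and is bounded by
 * maxthread.  The last 100 slots below the limit are reserved for callers
 * with PRIV_MAXPROC privilege.
 */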
175 static bool
176 thread_count_inc_try(void)
177 {
178         int nthreads_new;
179
180         nthreads_new = atomic_fetchadd_int(&nthreads, 1) + 1;
181         if (nthreads_new >= maxthread - 100) {
182                 if (priv_check_cred(curthread->td_ucred, PRIV_MAXPROC) != 0 ||
183                     nthreads_new >= maxthread) {
184                         atomic_subtract_int(&nthreads, 1);
185                         return (false);
186                 }
187         }
188         return (true);
189 }
190
191 static bool
192 thread_count_inc(void)
193 {
194         static struct timeval lastfail;
195         static int curfail;
196
197         thread_reap();
198         if (thread_count_inc_try()) {
199                 return (true);
200         }
201
202         thread_reap_all();
203         if (thread_count_inc_try()) {
204                 return (true);
205         }
206
207         if (ppsratecheck(&lastfail, &curfail, 1)) {
208                 printf("maxthread limit exceeded by uid %u "
209                     "(pid %d); consider increasing kern.maxthread\n",
210                     curthread->td_ucred->cr_ruid, curproc->p_pid);
211         }
212         return (false);
213 }
214
215 static void
216 thread_count_sub(int n)
217 {
218
219         atomic_subtract_int(&nthreads, n);
220 }
221
222 static void
223 thread_count_dec(void)
224 {
225
226         thread_count_sub(1);
227 }
228
229 static lwpid_t
230 tid_alloc(void)
231 {
232         static lwpid_t trytid;
233         lwpid_t tid;
234
235         mtx_lock(&tid_lock);
236         /*
237          * It is an invariant that the bitmap is big enough to hold maxthread
238          * IDs.  If we got to this point, there has to be at least one free ID.
239          */
240         if (trytid >= maxthread)
241                 trytid = 0;
242         bit_ffc_at(tid_bitmap, trytid, maxthread, &tid);
243         if (tid == -1) {
244                 KASSERT(trytid != 0, ("unexpectedly ran out of IDs"));
245                 trytid = 0;
246                 bit_ffc_at(tid_bitmap, trytid, maxthread, &tid);
247                 KASSERT(tid != -1, ("unexpectedly ran out of IDs"));
248         }
249         bit_set(tid_bitmap, tid);
250         trytid = tid + 1;
251         mtx_unlock(&tid_lock);
252         return (tid + NO_PID);
253 }
254
255 static void
256 tid_free_locked(lwpid_t rtid)
257 {
258         lwpid_t tid;
259
260         mtx_assert(&tid_lock, MA_OWNED);
261         KASSERT(rtid >= NO_PID,
262             ("%s: invalid tid %d\n", __func__, rtid));
263         tid = rtid - NO_PID;
264         KASSERT(bit_test(tid_bitmap, tid) != 0,
265             ("thread ID %d not allocated\n", rtid));
266         bit_clear(tid_bitmap, tid);
267 }
268
269 static void
270 tid_free(lwpid_t rtid)
271 {
272
273         mtx_lock(&tid_lock);
274         tid_free_locked(rtid);
275         mtx_unlock(&tid_lock);
276 }
277
278 static void
279 tid_free_batch(lwpid_t *batch, int n)
280 {
281         int i;
282
283         mtx_lock(&tid_lock);
284         for (i = 0; i < n; i++) {
285                 tid_free_locked(batch[i]);
286         }
287         mtx_unlock(&tid_lock);
288 }
289
290 /*
291  * Batching for thread reaping.
292  */
293 struct tidbatch {
294         lwpid_t tab[16];
295         int n;
296 };
297
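/*
 * A tidbatch collects up to nitems(tab) thread IDs so the reaper can
 * return them with a single tid_lock acquisition.  Usage: tidbatch_prep(),
 * then tidbatch_add() and tidbatch_process() per reaped thread (the batch
 * is flushed whenever it fills up), and tidbatch_final() for the remainder.
 */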
298 static void
299 tidbatch_prep(struct tidbatch *tb)
300 {
301
302         tb->n = 0;
303 }
304
305 static void
306 tidbatch_add(struct tidbatch *tb, struct thread *td)
307 {
308
309         KASSERT(tb->n < nitems(tb->tab),
310             ("%s: count too high %d", __func__, tb->n));
311         tb->tab[tb->n] = td->td_tid;
312         tb->n++;
313 }
314
315 static void
316 tidbatch_process(struct tidbatch *tb)
317 {
318
319         KASSERT(tb->n <= nitems(tb->tab),
320             ("%s: count too high %d", __func__, tb->n));
321         if (tb->n == nitems(tb->tab)) {
322                 tid_free_batch(tb->tab, tb->n);
323                 tb->n = 0;
324         }
325 }
326
327 static void
328 tidbatch_final(struct tidbatch *tb)
329 {
330
331         KASSERT(tb->n <= nitems(tb->tab),
332             ("%s: count too high %d", __func__, tb->n));
333         if (tb->n != 0) {
334                 tid_free_batch(tb->tab, tb->n);
335         }
336 }
337
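/*
 * UMA callbacks for thread_zone.  The ctor/dtor pair runs on every
 * allocation and free, while init/fini only run when an item is created
 * or destroyed by the zone itself, which keeps the resources allocated in
 * thread_init() type-stable across uma_zalloc()/uma_zfree() cycles.
 */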
338 /*
339  * Prepare a thread for use.
340  */
341 static int
342 thread_ctor(void *mem, int size, void *arg, int flags)
343 {
344         struct thread   *td;
345
346         td = (struct thread *)mem;
347         td->td_state = TDS_INACTIVE;
348         td->td_lastcpu = td->td_oncpu = NOCPU;
349
350         /*
351          * Note that td_critnest begins life as 1 because the thread is not
352          * running and is thereby implicitly waiting to be on the receiving
353          * end of a context switch.
354          */
355         td->td_critnest = 1;
356         td->td_lend_user_pri = PRI_MAX;
357 #ifdef AUDIT
358         audit_thread_alloc(td);
359 #endif
360         umtx_thread_alloc(td);
361         MPASS(td->td_sel == NULL);
362         return (0);
363 }
364
365 /*
366  * Reclaim a thread after use.
367  */
368 static void
369 thread_dtor(void *mem, int size, void *arg)
370 {
371         struct thread *td;
372
373         td = (struct thread *)mem;
374
375 #ifdef INVARIANTS
376         /* Verify that this thread is in a safe state to free. */
377         switch (td->td_state) {
378         case TDS_INHIBITED:
379         case TDS_RUNNING:
380         case TDS_CAN_RUN:
381         case TDS_RUNQ:
382                 /*
383                  * We must never unlink a thread that is in one of
384                  * these states, because it is currently active.
385                  */
386                 panic("bad state for thread unlinking");
387                 /* NOTREACHED */
388         case TDS_INACTIVE:
389                 break;
390         default:
391                 panic("bad thread state");
392                 /* NOTREACHED */
393         }
394 #endif
395 #ifdef AUDIT
396         audit_thread_free(td);
397 #endif
398         /* Free all OSD associated to this thread. */
399         osd_thread_exit(td);
400         td_softdep_cleanup(td);
401         MPASS(td->td_su == NULL);
402         seltdfini(td);
403 }
404
405 /*
406  * Initialize type-stable parts of a thread (when newly created).
407  */
408 static int
409 thread_init(void *mem, int size, int flags)
410 {
411         struct thread *td;
412
413         td = (struct thread *)mem;
414
415         td->td_sleepqueue = sleepq_alloc();
416         td->td_turnstile = turnstile_alloc();
417         td->td_rlqe = NULL;
418         EVENTHANDLER_DIRECT_INVOKE(thread_init, td);
419         umtx_thread_init(td);
420         td->td_kstack = 0;
421         td->td_sel = NULL;
422         return (0);
423 }
424
425 /*
426  * Tear down type-stable parts of a thread (just before being discarded).
427  */
428 static void
429 thread_fini(void *mem, int size)
430 {
431         struct thread *td;
432
433         td = (struct thread *)mem;
434         EVENTHANDLER_DIRECT_INVOKE(thread_fini, td);
435         rlqentry_free(td->td_rlqe);
436         turnstile_free(td->td_turnstile);
437         sleepq_free(td->td_sleepqueue);
438         umtx_thread_fini(td);
439         MPASS(td->td_sel == NULL);
440 }
441
442 /*
443  * For a newly created process, link up all the structures and its
444  * initial thread, etc.
445  * Called from:
446  * {arch}/{arch}/machdep.c   {arch}_init(), init386() etc.
447  * proc_dtor() (should go away)
448  * proc_init()
449  */
450 void
451 proc_linkup0(struct proc *p, struct thread *td)
452 {
453         TAILQ_INIT(&p->p_threads);           /* all threads in proc */
454         proc_linkup(p, td);
455 }
456
457 void
458 proc_linkup(struct proc *p, struct thread *td)
459 {
460
461         sigqueue_init(&p->p_sigqueue, p);
462         p->p_ksi = ksiginfo_alloc(1);
463         if (p->p_ksi != NULL) {
464                 /* XXX p_ksi may be null if ksiginfo zone is not ready */
465                 p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
466         }
467         LIST_INIT(&p->p_mqnotifier);
468         p->p_numthreads = 0;
469         thread_link(td, p);
470 }
471
472 extern int max_threads_per_proc;
473
474 /*
475  * Initialize global thread allocation resources.
476  */
477 void
478 threadinit(void)
479 {
480         u_long i;
481         lwpid_t tid0;
482         uint32_t flags;
483
484         /*
485          * Place an upper limit on threads which can be allocated.
486          *
487          * Note that other factors may make the de facto limit much lower.
488          *
489          * Platform limits are somewhat arbitrary but deemed "more than good
490  * enough" for the foreseeable future.
491          */
492         if (maxthread == 0) {
493 #ifdef _LP64
494                 maxthread = MIN(maxproc * max_threads_per_proc, 1000000);
495 #else
496                 maxthread = MIN(maxproc * max_threads_per_proc, 100000);
497 #endif
498         }
499
500         mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF);
501         tid_bitmap = bit_alloc(maxthread, M_TIDHASH, M_WAITOK);
502         /*
503          * Handle thread0.
504          */
505         thread_count_inc();
506         tid0 = tid_alloc();
507         if (tid0 != THREAD0_TID)
508                 panic("tid0 %d != %d\n", tid0, THREAD0_TID);
509
510         flags = UMA_ZONE_NOFREE;
511 #ifdef __aarch64__
512         /*
513          * Force thread structures to be allocated from the direct map.
514          * Otherwise, superpage promotions and demotions may temporarily
515          * invalidate thread structure mappings.  For most dynamically allocated
516          * structures this is not a problem, but translation faults cannot be
517          * handled without accessing curthread.
518          */
519         flags |= UMA_ZONE_CONTIG;
520 #endif
521         thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
522             thread_ctor, thread_dtor, thread_init, thread_fini,
523             32 - 1, flags);
524         tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash);
525         tidhashlock = (tidhash + 1) / 64;
526         if (tidhashlock > 0)
527                 tidhashlock--;
528         tidhashtbl_lock = malloc(sizeof(*tidhashtbl_lock) * (tidhashlock + 1),
529             M_TIDHASH, M_WAITOK | M_ZERO);
530         for (i = 0; i < tidhashlock + 1; i++)
531                 rw_init(&tidhashtbl_lock[i], "tidhash");
532
533         TASK_INIT(&thread_reap_task, 0, thread_reap_task_cb, NULL);
534         callout_init(&thread_reap_callout, 1);
535         callout_reset(&thread_reap_callout, 5 * hz, thread_reap_callout_cb, NULL);
536 }
537
538 /*
539  * Place an unused thread on the zombie list.
540  */
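/*
 * The zombie lists are per-domain lock-free LIFOs: thread_zombie() pushes
 * with a release-ordered compare-and-swap and thread_reap_domain() later
 * detaches the whole list with a single atomic swap.
 */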
541 void
542 thread_zombie(struct thread *td)
543 {
544         struct thread_domain_data *tdd;
545         struct thread *ztd;
546
547         tdd = &thread_domain_data[vm_phys_domain(vtophys(td))];
548         ztd = atomic_load_ptr(&tdd->tdd_zombies);
549         for (;;) {
550                 td->td_zombie = ztd;
551                 if (atomic_fcmpset_rel_ptr((uintptr_t *)&tdd->tdd_zombies,
552                     (uintptr_t *)&ztd, (uintptr_t)td))
553                         break;
554                 continue;
555         }
556 }
557
558 /*
559  * Release a thread that has exited after cpu_throw().
560  */
561 void
562 thread_stash(struct thread *td)
563 {
564         atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
565         thread_zombie(td);
566 }
567
568 /*
569  * Reap zombies from passed domain.
570  */
571 static void
572 thread_reap_domain(struct thread_domain_data *tdd)
573 {
574         struct thread *itd, *ntd;
575         struct tidbatch tidbatch;
576         struct credbatch credbatch;
577         int tdcount;
578         struct plimit *lim;
579         int limcount;
580
581         /*
582          * Reading upfront is pessimal if followed by concurrent atomic_swap,
583          * but most of the time the list is empty.
584          */
585         if (tdd->tdd_zombies == NULL)
586                 return;
587
588         itd = (struct thread *)atomic_swap_ptr((uintptr_t *)&tdd->tdd_zombies,
589             (uintptr_t)NULL);
590         if (itd == NULL)
591                 return;
592
593         /*
594  * Multiple CPUs can get here; the race is fine as ticks is only
595          * advisory.
596          */
597         tdd->tdd_reapticks = ticks;
598
599         tidbatch_prep(&tidbatch);
600         credbatch_prep(&credbatch);
601         tdcount = 0;
602         lim = NULL;
603         limcount = 0;
604
605         while (itd != NULL) {
606                 ntd = itd->td_zombie;
607                 EVENTHANDLER_DIRECT_INVOKE(thread_dtor, itd);
608                 tidbatch_add(&tidbatch, itd);
609                 credbatch_add(&credbatch, itd);
610                 MPASS(itd->td_limit != NULL);
611                 if (lim != itd->td_limit) {
612                         if (limcount != 0) {
613                                 lim_freen(lim, limcount);
614                                 limcount = 0;
615                         }
616                 }
617                 lim = itd->td_limit;
618                 limcount++;
619                 thread_free_batched(itd);
620                 tidbatch_process(&tidbatch);
621                 credbatch_process(&credbatch);
622                 tdcount++;
623                 if (tdcount == 32) {
624                         thread_count_sub(tdcount);
625                         tdcount = 0;
626                 }
627                 itd = ntd;
628         }
629
630         tidbatch_final(&tidbatch);
631         credbatch_final(&credbatch);
632         if (tdcount != 0) {
633                 thread_count_sub(tdcount);
634         }
635         MPASS(limcount != 0);
636         lim_freen(lim, limcount);
637 }
638
639 /*
640  * Reap zombies from all domains.
641  */
642 static void
643 thread_reap_all(void)
644 {
645         struct thread_domain_data *tdd;
646         int i, domain;
647
648         domain = PCPU_GET(domain);
649         for (i = 0; i < vm_ndomains; i++) {
650                 tdd = &thread_domain_data[(i + domain) % vm_ndomains];
651                 thread_reap_domain(tdd);
652         }
653 }
654
655 /*
656  * Reap zombies from local domain.
657  */
658 void
659 thread_reap(void)
660 {
661         struct thread_domain_data *tdd;
662         int domain;
663
664         domain = PCPU_GET(domain);
665         tdd = &thread_domain_data[domain];
666
667         thread_reap_domain(tdd);
668 }
669
670 static void
671 thread_reap_task_cb(void *arg __unused, int pending __unused)
672 {
673
674         thread_reap_all();
675 }
676
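/*
 * Periodic check: if any domain still has zombies and has not been reaped
 * for more than 5 seconds, queue thread_reap_task to reap all domains,
 * then re-arm the callout.
 */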
677 static void
678 thread_reap_callout_cb(void *arg __unused)
679 {
680         struct thread_domain_data *tdd;
681         int i, cticks, lticks;
682         bool wantreap;
683
684         wantreap = false;
685         cticks = atomic_load_int(&ticks);
686         for (i = 0; i < vm_ndomains; i++) {
687                 tdd = &thread_domain_data[i];
688                 lticks = tdd->tdd_reapticks;
689                 if (tdd->tdd_zombies != NULL &&
690                     (u_int)(cticks - lticks) > 5 * hz) {
691                         wantreap = true;
692                         break;
693                 }
694         }
695
696         if (wantreap)
697                 taskqueue_enqueue(taskqueue_thread, &thread_reap_task);
698         callout_reset(&thread_reap_callout, 5 * hz, thread_reap_callout_cb, NULL);
699 }
700
701 /*
702  * Allocate a thread.
703  */
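/*
 * Order of operations: reserve a slot in the global thread count (reaping
 * zombies if needed), allocate a TID, take an item from thread_zone and
 * attach a kernel stack.  A failure at any step undoes the earlier ones
 * and returns NULL.
 */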
704 struct thread *
705 thread_alloc(int pages)
706 {
707         struct thread *td;
708         lwpid_t tid;
709
710         if (!thread_count_inc()) {
711                 return (NULL);
712         }
713
714         tid = tid_alloc();
715         td = uma_zalloc(thread_zone, M_WAITOK);
716         KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack"));
717         if (!vm_thread_new(td, pages)) {
718                 uma_zfree(thread_zone, td);
719                 tid_free(tid);
720                 thread_count_dec();
721                 return (NULL);
722         }
723         td->td_tid = tid;
724         cpu_thread_alloc(td);
725         EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td);
726         return (td);
727 }
728
729 int
730 thread_alloc_stack(struct thread *td, int pages)
731 {
732
733         KASSERT(td->td_kstack == 0,
734             ("thread_alloc_stack called on a thread with kstack"));
735         if (!vm_thread_new(td, pages))
736                 return (0);
737         cpu_thread_alloc(td);
738         return (1);
739 }
740
741 /*
742  * Deallocate a thread.
743  */
744 static void
745 thread_free_batched(struct thread *td)
746 {
747
748         lock_profile_thread_exit(td);
749         if (td->td_cpuset)
750                 cpuset_rel(td->td_cpuset);
751         td->td_cpuset = NULL;
752         cpu_thread_free(td);
753         if (td->td_kstack != 0)
754                 vm_thread_dispose(td);
755         callout_drain(&td->td_slpcallout);
756         /*
757          * Freeing of the tid is handled by the caller.
758          */
759         td->td_tid = -1;
760         uma_zfree(thread_zone, td);
761 }
762
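/*
 * Free a single thread.  The zombie reaper bypasses this function and
 * calls thread_free_batched() directly so that TID and thread count
 * updates can be batched.
 */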
763 void
764 thread_free(struct thread *td)
765 {
766         lwpid_t tid;
767
768         EVENTHANDLER_DIRECT_INVOKE(thread_dtor, td);
769         tid = td->td_tid;
770         thread_free_batched(td);
771         tid_free(tid);
772         thread_count_dec();
773 }
774
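/*
 * Copy-on-write snapshot of per-process data cached in the thread:
 * td_realucred/td_ucred and td_limit hold references taken here, and
 * td_cowgen records p_cowgen at the time of the copy so that a stale
 * thread can later be refreshed by thread_cow_update().
 */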
775 void
776 thread_cow_get_proc(struct thread *newtd, struct proc *p)
777 {
778
779         PROC_LOCK_ASSERT(p, MA_OWNED);
780         newtd->td_realucred = crcowget(p->p_ucred);
781         newtd->td_ucred = newtd->td_realucred;
782         newtd->td_limit = lim_hold(p->p_limit);
783         newtd->td_cowgen = p->p_cowgen;
784 }
785
786 void
787 thread_cow_get(struct thread *newtd, struct thread *td)
788 {
789
790         MPASS(td->td_realucred == td->td_ucred);
791         newtd->td_realucred = crcowget(td->td_realucred);
792         newtd->td_ucred = newtd->td_realucred;
793         newtd->td_limit = lim_hold(td->td_limit);
794         newtd->td_cowgen = td->td_cowgen;
795 }
796
797 void
798 thread_cow_free(struct thread *td)
799 {
800
801         if (td->td_realucred != NULL)
802                 crcowfree(td);
803         if (td->td_limit != NULL)
804                 lim_free(td->td_limit);
805 }
806
807 void
808 thread_cow_update(struct thread *td)
809 {
810         struct proc *p;
811         struct ucred *oldcred;
812         struct plimit *oldlimit;
813
814         p = td->td_proc;
815         oldlimit = NULL;
816         PROC_LOCK(p);
817         oldcred = crcowsync();
818         if (td->td_limit != p->p_limit) {
819                 oldlimit = td->td_limit;
820                 td->td_limit = lim_hold(p->p_limit);
821         }
822         td->td_cowgen = p->p_cowgen;
823         PROC_UNLOCK(p);
824         if (oldcred != NULL)
825                 crfree(oldcred);
826         if (oldlimit != NULL)
827                 lim_free(oldlimit);
828 }
829
830 /*
831  * Discard the current thread and exit from its context.
832  * Always called with scheduler locked.
833  *
834  * Because we can't free a thread while we're operating under its context,
835  * push the current thread into our CPU's deadthread holder. This means
836  * we needn't worry about someone else grabbing our context before we
837  * do a cpu_throw().
838  */
839 void
840 thread_exit(void)
841 {
842         uint64_t runtime, new_switchtime;
843         struct thread *td;
844         struct thread *td2;
845         struct proc *p;
846         int wakeup_swapper;
847
848         td = curthread;
849         p = td->td_proc;
850
851         PROC_SLOCK_ASSERT(p, MA_OWNED);
852         mtx_assert(&Giant, MA_NOTOWNED);
853
854         PROC_LOCK_ASSERT(p, MA_OWNED);
855         KASSERT(p != NULL, ("thread exiting without a process"));
856         CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
857             (long)p->p_pid, td->td_name);
858         SDT_PROBE0(proc, , , lwp__exit);
859         KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));
860         MPASS(td->td_realucred == td->td_ucred);
861
862         /*
863          * Drop FPU & debug register state storage, or any other
864          * architecture-specific resources that would not be present
865          * in a new, untouched process.
866          */
867         cpu_thread_exit(td);
868
869         /*
870          * The last thread is left attached to the process so that the
871          * whole bundle gets recycled.  Skip all this stuff if we never
872          * had threads.  exit1() clears all signs of other threads when
873          * it goes to single threading, so the last thread always takes
874          * the short path.
876          */
877         if (p->p_flag & P_HADTHREADS) {
878                 if (p->p_numthreads > 1) {
879                         atomic_add_int(&td->td_proc->p_exitthreads, 1);
880                         thread_unlink(td);
881                         td2 = FIRST_THREAD_IN_PROC(p);
882                         sched_exit_thread(td2, td);
883
884                         /*
885                          * The test below is NOT true if we are the
886                          * sole exiting thread. P_STOPPED_SINGLE is unset
887                          * in exit1() after it is the only survivor.
888                          */
889                         if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
890                                 if (p->p_numthreads == p->p_suspcount) {
891                                         thread_lock(p->p_singlethread);
892                                         wakeup_swapper = thread_unsuspend_one(
893                                                 p->p_singlethread, p, false);
894                                         if (wakeup_swapper)
895                                                 kick_proc0();
896                                 }
897                         }
898
899                         PCPU_SET(deadthread, td);
900                 } else {
901                         /*
902                          * The last thread is exiting, but not through exit().
903                          */
904                         panic("thread_exit: Last thread exiting on its own");
905                 }
906         } 
907 #ifdef  HWPMC_HOOKS
908         /*
909          * If this thread is part of a process that is being tracked by hwpmc(4),
910          * inform the module of the thread's impending exit.
911          */
912         if (PMC_PROC_IS_USING_PMCS(td->td_proc)) {
913                 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
914                 PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT, NULL);
915         } else if (PMC_SYSTEM_SAMPLING_ACTIVE())
916                 PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT_LOG, NULL);
917 #endif
918         PROC_UNLOCK(p);
919         PROC_STATLOCK(p);
920         thread_lock(td);
921         PROC_SUNLOCK(p);
922
923         /* Do the same timestamp bookkeeping that mi_switch() would do. */
924         new_switchtime = cpu_ticks();
925         runtime = new_switchtime - PCPU_GET(switchtime);
926         td->td_runtime += runtime;
927         td->td_incruntime += runtime;
928         PCPU_SET(switchtime, new_switchtime);
929         PCPU_SET(switchticks, ticks);
930         VM_CNT_INC(v_swtch);
931
932         /* Save our resource usage in our process. */
933         td->td_ru.ru_nvcsw++;
934         ruxagg_locked(p, td);
935         rucollect(&p->p_ru, &td->td_ru);
936         PROC_STATUNLOCK(p);
937
938         td->td_state = TDS_INACTIVE;
939 #ifdef WITNESS
940         witness_thread_exit(td);
941 #endif
942         CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
943         sched_throw(td);
944         panic("I'm a teapot!");
945         /* NOTREACHED */
946 }
947
948 /*
949  * Do any thread-specific cleanups that may be needed in wait().
950  * Called with Giant, proc and schedlock not held.
951  */
952 void
953 thread_wait(struct proc *p)
954 {
955         struct thread *td;
956
957         mtx_assert(&Giant, MA_NOTOWNED);
958         KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()"));
959         KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking"));
960         td = FIRST_THREAD_IN_PROC(p);
961         /* Lock the last thread so we spin until it exits cpu_throw(). */
962         thread_lock(td);
963         thread_unlock(td);
964         lock_profile_thread_exit(td);
965         cpuset_rel(td->td_cpuset);
966         td->td_cpuset = NULL;
967         cpu_thread_clean(td);
968         thread_cow_free(td);
969         callout_drain(&td->td_slpcallout);
970         thread_reap();  /* check for zombie threads etc. */
971 }
972
973 /*
974  * Link a thread to a process.
975  * Set up anything that needs to be initialized for it to
976  * be used by the process.
977  */
978 void
979 thread_link(struct thread *td, struct proc *p)
980 {
981
982         /*
983          * XXX This can't be enabled because it's called for proc0 before
984          * its lock has been created.
985          * PROC_LOCK_ASSERT(p, MA_OWNED);
986          */
987         td->td_state    = TDS_INACTIVE;
988         td->td_proc     = p;
989         td->td_flags    = TDF_INMEM;
990
991         LIST_INIT(&td->td_contested);
992         LIST_INIT(&td->td_lprof[0]);
993         LIST_INIT(&td->td_lprof[1]);
994 #ifdef EPOCH_TRACE
995         SLIST_INIT(&td->td_epochs);
996 #endif
997         sigqueue_init(&td->td_sigqueue, p);
998         callout_init(&td->td_slpcallout, 1);
999         TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist);
1000         p->p_numthreads++;
1001 }
1002
1003 /*
1004  * Called from:
1005  *  thread_exit()
1006  */
1007 void
1008 thread_unlink(struct thread *td)
1009 {
1010         struct proc *p = td->td_proc;
1011
1012         PROC_LOCK_ASSERT(p, MA_OWNED);
1013 #ifdef EPOCH_TRACE
1014         MPASS(SLIST_EMPTY(&td->td_epochs));
1015 #endif
1016
1017         TAILQ_REMOVE(&p->p_threads, td, td_plist);
1018         p->p_numthreads--;
1019         /* could clear a few other things here */
1020         /* Must NOT clear links to proc! */
1021 }
1022
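/*
 * Helpers for thread_single(): calc_remaining() counts the threads that
 * still stand in the way of the requested mode, and remain_for_mode()
 * gives the target count (0 for SINGLE_ALLPROC, where the caller belongs
 * to another process, otherwise 1 for the caller itself).
 */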
1023 static int
1024 calc_remaining(struct proc *p, int mode)
1025 {
1026         int remaining;
1027
1028         PROC_LOCK_ASSERT(p, MA_OWNED);
1029         PROC_SLOCK_ASSERT(p, MA_OWNED);
1030         if (mode == SINGLE_EXIT)
1031                 remaining = p->p_numthreads;
1032         else if (mode == SINGLE_BOUNDARY)
1033                 remaining = p->p_numthreads - p->p_boundary_count;
1034         else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC)
1035                 remaining = p->p_numthreads - p->p_suspcount;
1036         else
1037                 panic("calc_remaining: wrong mode %d", mode);
1038         return (remaining);
1039 }
1040
1041 static int
1042 remain_for_mode(int mode)
1043 {
1044
1045         return (mode == SINGLE_ALLPROC ? 0 : 1);
1046 }
1047
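/*
 * Nudge one inhibited thread towards the state required by the
 * single-threading mode: unsuspend it, abort its interruptible sleep, or
 * (for SINGLE_ALLPROC) suspend a sleeping thread in place.  Returns
 * non-zero if the swapper needs to be kicked.
 */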
1048 static int
1049 weed_inhib(int mode, struct thread *td2, struct proc *p)
1050 {
1051         int wakeup_swapper;
1052
1053         PROC_LOCK_ASSERT(p, MA_OWNED);
1054         PROC_SLOCK_ASSERT(p, MA_OWNED);
1055         THREAD_LOCK_ASSERT(td2, MA_OWNED);
1056
1057         wakeup_swapper = 0;
1058
1059         /*
1060          * Since the thread lock is dropped by the scheduler we have
1061          * to retry to check for races.
1062          */
1063 restart:
1064         switch (mode) {
1065         case SINGLE_EXIT:
1066                 if (TD_IS_SUSPENDED(td2)) {
1067                         wakeup_swapper |= thread_unsuspend_one(td2, p, true);
1068                         thread_lock(td2);
1069                         goto restart;
1070                 }
1071                 if (TD_CAN_ABORT(td2)) {
1072                         wakeup_swapper |= sleepq_abort(td2, EINTR);
1073                         return (wakeup_swapper);
1074                 }
1075                 break;
1076         case SINGLE_BOUNDARY:
1077         case SINGLE_NO_EXIT:
1078                 if (TD_IS_SUSPENDED(td2) &&
1079                     (td2->td_flags & TDF_BOUNDARY) == 0) {
1080                         wakeup_swapper |= thread_unsuspend_one(td2, p, false);
1081                         thread_lock(td2);
1082                         goto restart;
1083                 }
1084                 if (TD_CAN_ABORT(td2)) {
1085                         wakeup_swapper |= sleepq_abort(td2, ERESTART);
1086                         return (wakeup_swapper);
1087                 }
1088                 break;
1089         case SINGLE_ALLPROC:
1090                 /*
1091                  * ALLPROC suspend tries to avoid spurious EINTR for
1092                  * threads sleeping interruptibly, by suspending the
1093                  * thread directly, similarly to sig_suspend_threads().
1094                  * Since such a sleep is not performed at the user
1095                  * boundary, the TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP
1096                  * is used to avoid immediate un-suspend.
1097                  */
1098                 if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY |
1099                     TDF_ALLPROCSUSP)) == 0) {
1100                         wakeup_swapper |= thread_unsuspend_one(td2, p, false);
1101                         thread_lock(td2);
1102                         goto restart;
1103                 }
1104                 if (TD_CAN_ABORT(td2)) {
1105                         if ((td2->td_flags & TDF_SBDRY) == 0) {
1106                                 thread_suspend_one(td2);
1107                                 td2->td_flags |= TDF_ALLPROCSUSP;
1108                         } else {
1109                                 wakeup_swapper |= sleepq_abort(td2, ERESTART);
1110                                 return (wakeup_swapper);
1111                         }
1112                 }
1113                 break;
1114         default:
1115                 break;
1116         }
1117         thread_unlock(td2);
1118         return (wakeup_swapper);
1119 }
1120
1121 /*
1122  * Enforce single-threading.
1123  *
1124  * Returns 1 if the caller must abort (another thread is waiting to
1125  * exit the process or similar). Process is locked!
1126  * Returns 0 when you are successfully the only thread running.
1127  * A process has successfully single-threaded in the suspend mode when
1128  * there are no threads in user mode.  Threads in the kernel must be
1129  * allowed to continue until they get to the user boundary.  They may even
1130  * copy out their return values and data before suspending.  They may,
1131  * however, be accelerated in reaching the user boundary as we will wake
1132  * up any sleeping threads that are interruptible (PCATCH).
1133  */
1134 int
1135 thread_single(struct proc *p, int mode)
1136 {
1137         struct thread *td;
1138         struct thread *td2;
1139         int remaining, wakeup_swapper;
1140
1141         td = curthread;
1142         KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
1143             mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
1144             ("invalid mode %d", mode));
1145         /*
1146          * If allowing non-ALLPROC singlethreading for non-curproc
1147          * callers, calc_remaining() and remain_for_mode() should be
1148          * adjusted to also account for td->td_proc != p.  For now
1149          * this is not implemented because it is not used.
1150          */
1151         KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) ||
1152             (mode != SINGLE_ALLPROC && td->td_proc == p),
1153             ("mode %d proc %p curproc %p", mode, p, td->td_proc));
1154         mtx_assert(&Giant, MA_NOTOWNED);
1155         PROC_LOCK_ASSERT(p, MA_OWNED);
1156
1157         if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC)
1158                 return (0);
1159
1160         /* Is someone already single threading? */
1161         if (p->p_singlethread != NULL && p->p_singlethread != td)
1162                 return (1);
1163
1164         if (mode == SINGLE_EXIT) {
1165                 p->p_flag |= P_SINGLE_EXIT;
1166                 p->p_flag &= ~P_SINGLE_BOUNDARY;
1167         } else {
1168                 p->p_flag &= ~P_SINGLE_EXIT;
1169                 if (mode == SINGLE_BOUNDARY)
1170                         p->p_flag |= P_SINGLE_BOUNDARY;
1171                 else
1172                         p->p_flag &= ~P_SINGLE_BOUNDARY;
1173         }
1174         if (mode == SINGLE_ALLPROC)
1175                 p->p_flag |= P_TOTAL_STOP;
1176         p->p_flag |= P_STOPPED_SINGLE;
1177         PROC_SLOCK(p);
1178         p->p_singlethread = td;
1179         remaining = calc_remaining(p, mode);
1180         while (remaining != remain_for_mode(mode)) {
1181                 if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
1182                         goto stopme;
1183                 wakeup_swapper = 0;
1184                 FOREACH_THREAD_IN_PROC(p, td2) {
1185                         if (td2 == td)
1186                                 continue;
1187                         thread_lock(td2);
1188                         td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
1189                         if (TD_IS_INHIBITED(td2)) {
1190                                 wakeup_swapper |= weed_inhib(mode, td2, p);
1191 #ifdef SMP
1192                         } else if (TD_IS_RUNNING(td2) && td != td2) {
1193                                 forward_signal(td2);
1194                                 thread_unlock(td2);
1195 #endif
1196                         } else
1197                                 thread_unlock(td2);
1198                 }
1199                 if (wakeup_swapper)
1200                         kick_proc0();
1201                 remaining = calc_remaining(p, mode);
1202
1203                 /*
1204                  * Maybe we suspended some threads; was it enough?
1205                  */
1206                 if (remaining == remain_for_mode(mode))
1207                         break;
1208
1209 stopme:
1210                 /*
1211                  * Wake us up when everyone else has suspended.
1212                  * In the meantime we suspend as well.
1213                  */
1214                 thread_suspend_switch(td, p);
1215                 remaining = calc_remaining(p, mode);
1216         }
1217         if (mode == SINGLE_EXIT) {
1218                 /*
1219                  * Convert the process to an unthreaded process.  The
1220                  * SINGLE_EXIT is called by exit1() or execve(), in
1221                  * both cases other threads must be retired.
1222                  */
1223                 KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads"));
1224                 p->p_singlethread = NULL;
1225                 p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS);
1226
1227                 /*
1228                  * Wait for any remaining threads to exit cpu_throw().
1229                  */
1230                 while (p->p_exitthreads != 0) {
1231                         PROC_SUNLOCK(p);
1232                         PROC_UNLOCK(p);
1233                         sched_relinquish(td);
1234                         PROC_LOCK(p);
1235                         PROC_SLOCK(p);
1236                 }
1237         } else if (mode == SINGLE_BOUNDARY) {
1238                 /*
1239                  * Wait until all suspended threads are removed from
1240                  * the processors.  The thread_suspend_check()
1241                  * increments p_boundary_count while it is still
1242                  * running, which makes it possible for the execve()
1243                  * to destroy vmspace while our other threads are
1244                  * still using the address space.
1245                  *
1246                  * We lock the thread, which is only allowed to
1247                  * succeed after context switch code finished using
1248                  * the address space.
1249                  */
1250                 FOREACH_THREAD_IN_PROC(p, td2) {
1251                         if (td2 == td)
1252                                 continue;
1253                         thread_lock(td2);
1254                         KASSERT((td2->td_flags & TDF_BOUNDARY) != 0,
1255                             ("td %p not on boundary", td2));
1256                         KASSERT(TD_IS_SUSPENDED(td2),
1257                             ("td %p is not suspended", td2));
1258                         thread_unlock(td2);
1259                 }
1260         }
1261         PROC_SUNLOCK(p);
1262         return (0);
1263 }
1264
1265 bool
1266 thread_suspend_check_needed(void)
1267 {
1268         struct proc *p;
1269         struct thread *td;
1270
1271         td = curthread;
1272         p = td->td_proc;
1273         PROC_LOCK_ASSERT(p, MA_OWNED);
1274         return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 &&
1275             (td->td_dbgflags & TDB_SUSPEND) != 0));
1276 }
1277
1278 /*
1279  * Called from locations that can safely check to see
1280  * whether we have to suspend or at least throttle for a
1281  * single-thread event (e.g. fork).
1282  *
1283  * Such locations include userret().
1284  * If the "return_instead" argument is non-zero, the thread must be able to
1285  * accept 0 (caller may continue), or 1 (caller must abort) as a result.
1286  *
1287  * The 'return_instead' argument tells the function if it may do a
1288  * thread_exit() or suspend, or whether the caller must abort and back
1289  * out instead.
1290  *
1291  * If the thread that set the single_threading request has set the
1292  * P_SINGLE_EXIT bit in the process flags then this call will never return
1293  * if 'return_instead' is false, but will exit.
1294  *
1295  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
1296  *---------------+--------------------+---------------------
1297  *       0       | returns 0          |   returns 0 or 1
1298  *               | when ST ends       |   immediately
1299  *---------------+--------------------+---------------------
1300  *       1       | thread exits       |   returns 1
1301  *               |                    |  immediately
1302  * 0 = thread_exit() or suspension ok,
1303  * other = return error instead of stopping the thread.
1304  *
1305  * While a full suspension is under effect, even a single threading
1306  * thread would be suspended if it made this call (but it shouldn't).
1307  * This call should only be made from places where
1308  * thread_exit() would be safe as that may be the outcome unless
1309  * return_instead is set.
1310  */
1311 int
1312 thread_suspend_check(int return_instead)
1313 {
1314         struct thread *td;
1315         struct proc *p;
1316         int wakeup_swapper;
1317
1318         td = curthread;
1319         p = td->td_proc;
1320         mtx_assert(&Giant, MA_NOTOWNED);
1321         PROC_LOCK_ASSERT(p, MA_OWNED);
1322         while (thread_suspend_check_needed()) {
1323                 if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1324                         KASSERT(p->p_singlethread != NULL,
1325                             ("singlethread not set"));
1326                         /*
1327                          * The only suspension in action is a
1328                          * single-threading. Single threader need not stop.
1329                          * It is safe to access p->p_singlethread unlocked
1330                          * because it can only be set to our address by us.
1331                          */
1332                         if (p->p_singlethread == td)
1333                                 return (0);     /* Exempt from stopping. */
1334                 }
1335                 if ((p->p_flag & P_SINGLE_EXIT) && return_instead)
1336                         return (EINTR);
1337
1338                 /* Should we go to the user boundary if we didn't come from there? */
1339                 if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
1340                     (p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
1341                         return (ERESTART);
1342
1343                 /*
1344                  * Ignore suspend requests if they are deferred.
1345                  */
1346                 if ((td->td_flags & TDF_SBDRY) != 0) {
1347                         KASSERT(return_instead,
1348                             ("TDF_SBDRY set for unsafe thread_suspend_check"));
1349                         KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) !=
1350                             (TDF_SEINTR | TDF_SERESTART),
1351                             ("both TDF_SEINTR and TDF_SERESTART"));
1352                         return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0);
1353                 }
1354
1355                 /*
1356                  * If the process is waiting for us to exit,
1357                  * this thread should just suicide.
1358                  * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
1359                  */
1360                 if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1361                         PROC_UNLOCK(p);
1362
1363                         /*
1364                          * Allow Linux emulation layer to do some work
1365                          * before thread suicide.
1366                          */
1367                         if (__predict_false(p->p_sysent->sv_thread_detach != NULL))
1368                                 (p->p_sysent->sv_thread_detach)(td);
1369                         umtx_thread_exit(td);
1370                         kern_thr_exit(td);
1371                         panic("stopped thread did not exit");
1372                 }
1373
1374                 PROC_SLOCK(p);
1375                 thread_stopped(p);
1376                 if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1377                         if (p->p_numthreads == p->p_suspcount + 1) {
1378                                 thread_lock(p->p_singlethread);
1379                                 wakeup_swapper = thread_unsuspend_one(
1380                                     p->p_singlethread, p, false);
1381                                 if (wakeup_swapper)
1382                                         kick_proc0();
1383                         }
1384                 }
1385                 PROC_UNLOCK(p);
1386                 thread_lock(td);
1387                 /*
1388                  * When a thread suspends, it just
1389                  * gets taken off all queues.
1390                  */
1391                 thread_suspend_one(td);
1392                 if (return_instead == 0) {
1393                         p->p_boundary_count++;
1394                         td->td_flags |= TDF_BOUNDARY;
1395                 }
1396                 PROC_SUNLOCK(p);
1397                 mi_switch(SW_INVOL | SWT_SUSPEND);
1398                 PROC_LOCK(p);
1399         }
1400         return (0);
1401 }
1402
1403 /*
1404  * Check for possible stops and suspensions while executing a
1405  * casueword or similar transiently failing operation.
1406  *
1407  * The sleep argument controls whether the function can handle a stop
1408  * request itself or whether it should return ERESTART and have the
1409  * request processed at the kernel/user boundary in ast.
1410  *
1411  * Typically, when retrying due to casueword(9) failure (rv == 1), we
1412  * should handle the stop requests there, with the exception of cases when
1413  * the thread owns a kernel resource, for instance busied the umtx
1414  * key, or when functions return immediately if thread_check_susp()
1415  * returned non-zero.  On the other hand, when retrying the whole lock
1416  * operation, we had better not stop there but delegate the handling to
1417  * ast.
1418  *
1419  * If the request is for thread termination P_SINGLE_EXIT, we cannot
1420  * handle it at all, and simply return EINTR.
1421  */
1422 int
1423 thread_check_susp(struct thread *td, bool sleep)
1424 {
1425         struct proc *p;
1426         int error;
1427
1428         /*
1429          * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
1430          * eventually break the lockstep loop.
1431          */
1432         if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
1433                 return (0);
1434         error = 0;
1435         p = td->td_proc;
1436         PROC_LOCK(p);
1437         if (p->p_flag & P_SINGLE_EXIT)
1438                 error = EINTR;
1439         else if (P_SHOULDSTOP(p) ||
1440             ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND)))
1441                 error = sleep ? thread_suspend_check(0) : ERESTART;
1442         PROC_UNLOCK(p);
1443         return (error);
1444 }
1445
1446 void
1447 thread_suspend_switch(struct thread *td, struct proc *p)
1448 {
1449
1450         KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
1451         PROC_LOCK_ASSERT(p, MA_OWNED);
1452         PROC_SLOCK_ASSERT(p, MA_OWNED);
1453         /*
1454          * We implement thread_suspend_one in stages here to avoid
1455          * dropping the proc lock while the thread lock is owned.
1456          */
1457         if (p == td->td_proc) {
1458                 thread_stopped(p);
1459                 p->p_suspcount++;
1460         }
1461         PROC_UNLOCK(p);
1462         thread_lock(td);
1463         td->td_flags &= ~TDF_NEEDSUSPCHK;
1464         TD_SET_SUSPENDED(td);
1465         sched_sleep(td, 0);
1466         PROC_SUNLOCK(p);
1467         DROP_GIANT();
1468         mi_switch(SW_VOL | SWT_SUSPEND);
1469         PICKUP_GIANT();
1470         PROC_LOCK(p);
1471         PROC_SLOCK(p);
1472 }
1473
1474 void
1475 thread_suspend_one(struct thread *td)
1476 {
1477         struct proc *p;
1478
1479         p = td->td_proc;
1480         PROC_SLOCK_ASSERT(p, MA_OWNED);
1481         THREAD_LOCK_ASSERT(td, MA_OWNED);
1482         KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
1483         p->p_suspcount++;
1484         td->td_flags &= ~TDF_NEEDSUSPCHK;
1485         TD_SET_SUSPENDED(td);
1486         sched_sleep(td, 0);
1487 }
1488
1489 static int
1490 thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary)
1491 {
1492
1493         THREAD_LOCK_ASSERT(td, MA_OWNED);
1494         KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
1495         TD_CLR_SUSPENDED(td);
1496         td->td_flags &= ~TDF_ALLPROCSUSP;
1497         if (td->td_proc == p) {
1498                 PROC_SLOCK_ASSERT(p, MA_OWNED);
1499                 p->p_suspcount--;
1500                 if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) {
1501                         td->td_flags &= ~TDF_BOUNDARY;
1502                         p->p_boundary_count--;
1503                 }
1504         }
1505         return (setrunnable(td, 0));
1506 }
1507
1508 /*
1509  * Allow all threads blocked by single threading to continue running.
1510  */
1511 void
1512 thread_unsuspend(struct proc *p)
1513 {
1514         struct thread *td;
1515         int wakeup_swapper;
1516
1517         PROC_LOCK_ASSERT(p, MA_OWNED);
1518         PROC_SLOCK_ASSERT(p, MA_OWNED);
1519         wakeup_swapper = 0;
1520         if (!P_SHOULDSTOP(p)) {
1521                 FOREACH_THREAD_IN_PROC(p, td) {
1522                         thread_lock(td);
1523                         if (TD_IS_SUSPENDED(td)) {
1524                                 wakeup_swapper |= thread_unsuspend_one(td, p,
1525                                     true);
1526                         } else
1527                                 thread_unlock(td);
1528                 }
1529         } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
1530             p->p_numthreads == p->p_suspcount) {
1531                 /*
1532                  * Stopping everything also did the job for the single
1533                  * threading request.  Now that we've downgraded to
1534                  * single-threaded, let it continue.
1535                  */
1536                 if (p->p_singlethread->td_proc == p) {
1537                         thread_lock(p->p_singlethread);
1538                         wakeup_swapper = thread_unsuspend_one(
1539                             p->p_singlethread, p, false);
1540                 }
1541         }
1542         if (wakeup_swapper)
1543                 kick_proc0();
1544 }
1545
1546 /*
1547  * End the single-threading mode.
1548  */
1549 void
1550 thread_single_end(struct proc *p, int mode)
1551 {
1552         struct thread *td;
1553         int wakeup_swapper;
1554
1555         KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
1556             mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
1557             ("invalid mode %d", mode));
1558         PROC_LOCK_ASSERT(p, MA_OWNED);
1559         KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) ||
1560             (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0),
1561             ("mode %d does not match P_TOTAL_STOP", mode));
1562         KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread,
1563             ("thread_single_end from other thread %p %p",
1564             curthread, p->p_singlethread));
1565         KASSERT(mode != SINGLE_BOUNDARY ||
1566             (p->p_flag & P_SINGLE_BOUNDARY) != 0,
1567             ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag));
1568         p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY |
1569             P_TOTAL_STOP);
1570         PROC_SLOCK(p);
1571         p->p_singlethread = NULL;
1572         wakeup_swapper = 0;
1573         /*
1574          * If there are other threads, they may now run,
1575          * unless of course there is a blanket 'stop order'
1576          * on the process.  The single threader must be allowed
1577          * to continue, however, as this is a bad place to stop.
1578          */
1579         if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) {
1580                 FOREACH_THREAD_IN_PROC(p, td) {
1581                         thread_lock(td);
1582                         if (TD_IS_SUSPENDED(td)) {
1583                                 wakeup_swapper |= thread_unsuspend_one(td, p,
1584                                     mode == SINGLE_BOUNDARY);
1585                         } else
1586                                 thread_unlock(td);
1587                 }
1588         }
1589         KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0,
1590             ("inconsistent boundary count %d", p->p_boundary_count));
1591         PROC_SUNLOCK(p);
1592         if (wakeup_swapper)
1593                 kick_proc0();
1594 }
1595
1596 /*
1597  * Locate a thread by number and return with proc lock held.
1598  *
1599  * Thread exit establishes the proc -> tidhash lock ordering, but the
1600  * lookup takes tidhash first and needs to return a locked proc.
1601  *
1602  * The problem is worked around by relying on type-safety of both
1603  * structures and doing the work in 2 steps:
1604  * - tidhash-locked lookup which saves both thread and proc pointers
1605  * - proc-locked verification that the found thread still matches
1606  */
1607 static bool
1608 tdfind_hash(lwpid_t tid, pid_t pid, struct proc **pp, struct thread **tdp)
1609 {
1610 #define RUN_THRESH      16
1611         struct proc *p;
1612         struct thread *td;
1613         int run;
1614         bool locked;
1615
1616         run = 0;
1617         rw_rlock(TIDHASHLOCK(tid));
1618         locked = true;
1619         LIST_FOREACH(td, TIDHASH(tid), td_hash) {
1620                 if (td->td_tid != tid) {
1621                         run++;
1622                         continue;
1623                 }
1624                 p = td->td_proc;
1625                 if (pid != -1 && p->p_pid != pid) {
1626                         td = NULL;
1627                         break;
1628                 }
1629                 if (run > RUN_THRESH) {
1630                         if (rw_try_upgrade(TIDHASHLOCK(tid))) {
1631                                 LIST_REMOVE(td, td_hash);
1632                                 LIST_INSERT_HEAD(TIDHASH(td->td_tid),
1633                                         td, td_hash);
1634                                 rw_wunlock(TIDHASHLOCK(tid));
1635                                 locked = false;
1636                                 break;
1637                         }
1638                 }
1639                 break;
1640         }
1641         if (locked)
1642                 rw_runlock(TIDHASHLOCK(tid));
1643         if (td == NULL)
1644                 return (false);
1645         *pp = p;
1646         *tdp = td;
1647         return (true);
1648 }
1649
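/*
 * tdfind() fast-paths a lookup of curthread; otherwise it repeats the
 * unlocked hash lookup until the tid and proc revalidate under the proc
 * lock, since the thread may have exited and its memory been reused.
 */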
1650 struct thread *
1651 tdfind(lwpid_t tid, pid_t pid)
1652 {
1653         struct proc *p;
1654         struct thread *td;
1655
1656         td = curthread;
1657         if (td->td_tid == tid) {
1658                 if (pid != -1 && td->td_proc->p_pid != pid)
1659                         return (NULL);
1660                 PROC_LOCK(td->td_proc);
1661                 return (td);
1662         }
1663
1664         for (;;) {
1665                 if (!tdfind_hash(tid, pid, &p, &td))
1666                         return (NULL);
1667                 PROC_LOCK(p);
1668                 if (td->td_tid != tid) {
1669                         PROC_UNLOCK(p);
1670                         continue;
1671                 }
1672                 if (td->td_proc != p) {
1673                         PROC_UNLOCK(p);
1674                         continue;
1675                 }
1676                 if (p->p_state == PRS_NEW) {
1677                         PROC_UNLOCK(p);
1678                         return (NULL);
1679                 }
1680                 return (td);
1681         }
1682 }
1683
1684 void
1685 tidhash_add(struct thread *td)
1686 {
1687         rw_wlock(TIDHASHLOCK(td->td_tid));
1688         LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
1689         rw_wunlock(TIDHASHLOCK(td->td_tid));
1690 }
1691
1692 void
1693 tidhash_remove(struct thread *td)
1694 {
1695
1696         rw_wlock(TIDHASHLOCK(td->td_tid));
1697         LIST_REMOVE(td, td_hash);
1698         rw_wunlock(TIDHASHLOCK(td->td_tid));
1699 }