sys/kern/kern_kse.c

   1 /*
   2  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
   3  *  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice(s), this list of conditions and the following disclaimer as
  10  *    the first lines of this file unmodified other than the possible
  11  *    addition of one or more copyright notices.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice(s), this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
  20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  26  * DAMAGE.
  27  *
  28  * $FreeBSD$
  29  */
  30
  31 #include <sys/param.h>
  32 #include <sys/systm.h>
  33 #include <sys/kernel.h>
  34 #include <sys/lock.h>
  35 #include <sys/malloc.h>
  36 #include <sys/mutex.h>
  37 #include <sys/proc.h>
  38 #include <sys/smp.h>
  39 #include <sys/sysctl.h>
  40 #include <sys/sysproto.h>
  41 #include <sys/filedesc.h>
  42 #include <sys/sched.h>
  43 #include <sys/signalvar.h>
  44 #include <sys/sx.h>
  45 #include <sys/tty.h>
  46 #include <sys/user.h>
  47 #include <sys/jail.h>
  48 #include <sys/kse.h>
  49 #include <sys/ktr.h>
  50 #include <sys/ucontext.h>
  51
  52 #include <vm/vm.h>
  53 #include <vm/vm_object.h>
  54 #include <vm/pmap.h>
  55 #include <vm/uma.h>
  56 #include <vm/vm_map.h>
  57
  58 #include <machine/frame.h>
  59
   60 /*
   61  * UMA zones for ksegrp, kse and thread structures; created in threadinit().
   62  */
   63 static uma_zone_t ksegrp_zone;
   64 static uma_zone_t kse_zone;
   65 static uma_zone_t thread_zone;
   66
   67 /* DEBUG ONLY: the kern.threads sysctl node and its tunables. */
   68 SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
      /* When non-zero, kse_create() also skips its per-group KSE count check. */
   69 static int oiks_debug = 0;      /* 0 disable, 1 printf, 2 enter debugger */
   70 SYSCTL_INT(_kern_threads, OID_AUTO, oiks, CTLFLAG_RW,
   71         &oiks_debug, 0, "OIKS thread debug");
   72
      /* NOTE(review): not referenced in the part of the file reviewed here. */
   73 static int oiks_max_threads_per_proc = 10;
   74 SYSCTL_INT(_kern_threads, OID_AUTO, oiks_max_per_proc, CTLFLAG_RW,
   75         &oiks_max_threads_per_proc, 0, "Debug limit on threads per proc");
   76
      /* NOTE(review): not referenced in the part of the file reviewed here. */
   77 static int max_threads_per_proc = 30;
   78 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
   79         &max_threads_per_proc, 0, "Limit on threads per proc");
   80
      /* kse_create() returns EPROCLIM once p_numksegrps reaches this value. */
   81 static int max_groups_per_proc = 5;
   82 SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
   83         &max_groups_per_proc, 0, "Limit on thread groups per proc");
   84
      /* Byte distance from member 'start' to member 'end' of 'type'. */
   85 #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
   86
      /*
       * Queues of discarded threads, KSEs and ksegrps.  They are filled by
       * thread_stash(), kse_stash() and ksegrp_stash() and emptied by
       * thread_reap().  All three queues are protected by the spin mutex
       * zombie_thread_lock.
       */
   87 struct threadqueue zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
   88 TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
   89 TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
   90 struct mtx zombie_thread_lock;
   91 MTX_SYSINIT(zombie_thread_lock, &zombie_thread_lock,
   92     "zombie_thread_lock", MTX_SPIN);
   93
   94
   95
      /* Prototype only; the definition is not in the part of the file reviewed here. */
   96 void kse_purge(struct proc *p, struct thread *td);
  97 /*
  98  * Pepare a thread for use.
  99  */
 100 static void
 101 thread_ctor(void *mem, int size, void *arg)
 102 {
 103         struct thread   *td;
 104
 105         td = (struct thread *)mem;
 106         td->td_state = TDS_INACTIVE;
 107         td->td_flags |= TDF_UNBOUND;
 108 }
 109
 110 /*
 111  * Reclaim a thread after use.
 112  */
 113 static void
 114 thread_dtor(void *mem, int size, void *arg)
 115 {
 116         struct thread   *td;
 117
 118         mtx_assert(&Giant, MA_OWNED);
 119         td = (struct thread *)mem;
 120
 121 #ifdef INVARIANTS
 122         /* Verify that this thread is in a safe state to free. */
 123         switch (td->td_state) {
 124         case TDS_INHIBITED:
 125         case TDS_RUNNING:
 126         case TDS_CAN_RUN:
 127         case TDS_RUNQ:
 128                 /*
 129                  * We must never unlink a thread that is in one of
 130                  * these states, because it is currently active.
 131                  */
 132                 panic("bad state for thread unlinking");
 133                 /* NOTREACHED */
 134         case TDS_INACTIVE:
 135                 break;
 136         default:
 137                 panic("bad thread state");
 138                 /* NOTREACHED */
 139         }
 140 #endif
 141
 142         cpu_thread_dtor(td);
 143 }
 144
 145 /*
 146  * Initialize type-stable parts of a thread (when newly created).
 147  */
 148 static void
 149 thread_init(void *mem, int size)
 150 {
 151         struct thread   *td;
 152
 153         td = (struct thread *)mem;
 154         mtx_lock(&Giant);
 155         pmap_new_thread(td, 0);
 156         mtx_unlock(&Giant);
 157         cpu_thread_setup(td);
 158         td->td_sched = (struct td_sched *)&td[1];
 159 }
 160
 161 /*
 162  * Tear down type-stable parts of a thread (just before being discarded).
 163  */
 164 static void
 165 thread_fini(void *mem, int size)
 166 {
 167         struct thread   *td;
 168
 169         td = (struct thread *)mem;
 170         pmap_dispose_thread(td);
 171 }
 172 /*
 173  * Initialize type-stable parts of a kse (when newly created).
 174  */
 175 static void
 176 kse_init(void *mem, int size)
 177 {
 178         struct kse      *ke;
 179
 180         ke = (struct kse *)mem;
 181         ke->ke_sched = (struct ke_sched *)&ke[1];
 182 }
 183 /*
 184  * Initialize type-stable parts of a ksegrp (when newly created).
 185  */
 186 static void
 187 ksegrp_init(void *mem, int size)
 188 {
 189         struct ksegrp   *kg;
 190
 191         kg = (struct ksegrp *)mem;
 192         kg->kg_sched = (struct kg_sched *)&kg[1];
 193 }
 194
 195 /*
 196  * KSE is linked onto the idle queue.
 197  */
 198 void
 199 kse_link(struct kse *ke, struct ksegrp *kg)
 200 {
 201         struct proc *p = kg->kg_proc;
 202
 203         TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
 204         kg->kg_kses++;
 205         ke->ke_state = KES_UNQUEUED;
 206         ke->ke_proc     = p;
 207         ke->ke_ksegrp   = kg;
 208         ke->ke_thread   = NULL;
 209         ke->ke_oncpu = NOCPU;
 210 }
 211
 212 void
 213 kse_unlink(struct kse *ke)
 214 {
 215         struct ksegrp *kg;
 216
 217         mtx_assert(&sched_lock, MA_OWNED);
 218         kg = ke->ke_ksegrp;
 219         if (ke->ke_state == KES_IDLE) {
 220                 kg->kg_idle_kses--;
 221                 TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
 222         }
 223
 224         TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
 225         if (--kg->kg_kses == 0) {
 226                         ksegrp_unlink(kg);
 227         }
 228         /*
 229          * Aggregate stats from the KSE
 230          */
 231         kse_stash(ke);
 232 }
 233
 234 void
 235 ksegrp_link(struct ksegrp *kg, struct proc *p)
 236 {
 237
 238         TAILQ_INIT(&kg->kg_threads);
 239         TAILQ_INIT(&kg->kg_runq);       /* links with td_runq */
 240         TAILQ_INIT(&kg->kg_slpq);       /* links with td_runq */
 241         TAILQ_INIT(&kg->kg_kseq);       /* all kses in ksegrp */
 242         TAILQ_INIT(&kg->kg_iq);         /* idle kses in ksegrp */
 243         TAILQ_INIT(&kg->kg_lq);         /* loan kses in ksegrp */
 244         kg->kg_proc     = p;
 245 /* the following counters are in the -zero- section and may not need clearing */
 246         kg->kg_numthreads = 0;
 247         kg->kg_runnable = 0;
 248         kg->kg_kses = 0;
 249         kg->kg_idle_kses = 0;
 250         kg->kg_loan_kses = 0;
 251         kg->kg_runq_kses = 0; /* XXXKSE change name */
 252 /* link it in now that it's consistent */
 253         p->p_numksegrps++;
 254         TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
 255 }
 256
 257 void
 258 ksegrp_unlink(struct ksegrp *kg)
 259 {
 260         struct proc *p;
 261
 262         mtx_assert(&sched_lock, MA_OWNED);
 263         p = kg->kg_proc;
 264         KASSERT(((kg->kg_numthreads == 0) && (kg->kg_kses == 0)),
 265             ("kseg_unlink: residual threads or KSEs"));
 266         TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
 267         p->p_numksegrps--;
 268         /*
 269          * Aggregate stats from the KSE
 270          */
 271         ksegrp_stash(kg);
 272 }
 273
 274 /*
 275  * for a newly created process,
 276  * link up a the structure and its initial threads etc.
 277  */
 278 void
 279 proc_linkup(struct proc *p, struct ksegrp *kg,
 280                         struct kse *ke, struct thread *td)
 281 {
 282
 283         TAILQ_INIT(&p->p_ksegrps);           /* all ksegrps in proc */
 284         TAILQ_INIT(&p->p_threads);           /* all threads in proc */
 285         TAILQ_INIT(&p->p_suspended);         /* Threads suspended */
 286         p->p_numksegrps = 0;
 287         p->p_numthreads = 0;
 288
 289         ksegrp_link(kg, p);
 290         kse_link(ke, kg);
 291         thread_link(td, kg);
 292 }
 293
 294 int
 295 kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
 296 {
 297         struct proc *p;
 298         struct thread *td2;
 299
 300         p = td->td_proc;
 301         /* KSE-enabled processes only, please. */
 302         if (!(p->p_flag & P_KSES))
 303                 return (EINVAL);
 304         if (uap->tmbx == NULL)
 305                 return (EINVAL);
 306         mtx_lock_spin(&sched_lock);
 307         FOREACH_THREAD_IN_PROC(p, td2) {
 308                 if (td2->td_mailbox == uap->tmbx) {
 309                         td2->td_flags |= TDF_INTERRUPT;
 310                         if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) {
 311                                 if (td2->td_flags & TDF_CVWAITQ)
 312                                         cv_abort(td2);
 313                                 else
 314                                         abortsleep(td2);
 315                         }
 316                         mtx_unlock_spin(&sched_lock);
 317                         td->td_retval[0] = 0;
 318                         td->td_retval[1] = 0;
 319                         return (0);
 320                 }
 321         }
 322         mtx_unlock_spin(&sched_lock);
 323         return (ESRCH);
 324 }
 325
 326 int
 327 kse_exit(struct thread *td, struct kse_exit_args *uap)
 328 {
 329         struct proc *p;
 330         struct ksegrp *kg;
 331
 332         p = td->td_proc;
 333         /* KSE-enabled processes only, please. */
 334         if (!(p->p_flag & P_KSES))
 335                 return (EINVAL);
 336         /* must be a bound thread */
 337         if (td->td_flags & TDF_UNBOUND)
 338                 return (EINVAL);
 339         kg = td->td_ksegrp;
 340         /* serialize killing kse */
 341         PROC_LOCK(p);
 342         mtx_lock_spin(&sched_lock);
 343         if ((kg->kg_kses == 1) && (kg->kg_numthreads > 1)) {
 344                 mtx_unlock_spin(&sched_lock);
 345                 PROC_UNLOCK(p);
 346                 return (EDEADLK);
 347         }
 348         if ((p->p_numthreads == 1) && (p->p_numksegrps == 1)) {
 349                 p->p_flag &= ~P_KSES;
 350                 mtx_unlock_spin(&sched_lock);
 351                 PROC_UNLOCK(p);
 352         } else {
 353                 while (mtx_owned(&Giant))
 354                         mtx_unlock(&Giant);
 355                 td->td_kse->ke_flags |= KEF_EXIT;
 356                 thread_exit();
 357                 /* NOTREACHED */
 358         }
 359         return (0);
 360 }
 361
 362 int
 363 kse_release(struct thread *td, struct kse_release_args *uap)
 364 {
 365         struct proc *p;
 366
 367         p = td->td_proc;
 368         /* KSE-enabled processes only */
 369         if (!(p->p_flag & P_KSES))
 370                 return (EINVAL);
 371         /*
 372          * Must be a bound thread. And kse must have a mailbox ready,
 373          * if not, the kse would can not generate an upcall.
 374          */
 375         if (!(td->td_flags & TDF_UNBOUND) && (td->td_kse->ke_mailbox != NULL)) {
 376                 PROC_LOCK(p);
 377                 mtx_lock_spin(&sched_lock);
 378                 /* prevent last thread from exiting */
 379                 if (p->p_numthreads == 1) {
 380                         mtx_unlock_spin(&sched_lock);
 381                         if (td->td_standin == NULL) {
 382                                 PROC_UNLOCK(p);
 383                                 td->td_standin = thread_alloc();
 384                                 PROC_LOCK(p);
 385                         }
 386                         msleep(p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH,
 387                                "pause", 0);
 388                         mtx_lock_spin(&sched_lock);
 389                         td->td_flags |= TDF_UNBOUND;
 390                         thread_schedule_upcall(td, td->td_kse);
 391                 }
 392                 thread_exit();
 393                 /* NOTREACHED */
 394         }
 395         return (EINVAL);
 396 }
 397
 398 /* struct kse_wakeup_args {
 399         struct kse_mailbox *mbx;
 400 }; */
 401 int
 402 kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
 403 {
 404         struct proc *p;
 405         struct kse *ke, *ke2;
 406         struct ksegrp *kg;
 407
 408         p = td->td_proc;
 409         /* KSE-enabled processes only, please. */
 410         if (!(p->p_flag & P_KSES))
 411                 return EINVAL;
 412         if (td->td_standin == NULL)
 413                 td->td_standin = thread_alloc();
 414         ke = NULL;
 415         mtx_lock_spin(&sched_lock);
 416         if (uap->mbx) {
 417                 FOREACH_KSEGRP_IN_PROC(p, kg) {
 418                         FOREACH_KSE_IN_GROUP(kg, ke2) {
 419                                 if (ke2->ke_mailbox != uap->mbx)
 420                                         continue;
 421                                 if (ke2->ke_state == KES_IDLE) {
 422                                         ke = ke2;
 423                                         goto found;
 424                                 } else {
 425                                         mtx_unlock_spin(&sched_lock);
 426                                         td->td_retval[0] = 0;
 427                                         td->td_retval[1] = 0;
 428                                         return (0);
 429                                 }
 430                         }
 431                 }
 432         } else {
 433                 kg = td->td_ksegrp;
 434                 ke = TAILQ_FIRST(&kg->kg_iq);
 435         }
 436         if (ke == NULL) {
 437                 mtx_unlock_spin(&sched_lock);
 438                 return (ESRCH);
 439         }
 440 found:
 441         thread_schedule_upcall(td, ke);
 442         mtx_unlock_spin(&sched_lock);
 443         td->td_retval[0] = 0;
 444         td->td_retval[1] = 0;
 445         return (0);
 446 }
 447
 448 /*
 449  * No new KSEG: first call: use current KSE, don't schedule an upcall
 450  * All other situations, do allocate a new KSE and schedule an upcall on it.
 451  */
 452 /* struct kse_create_args {
 453         struct kse_mailbox *mbx;
 454         int newgroup;
 455 }; */
 456 int
 457 kse_create(struct thread *td, struct kse_create_args *uap)
 458 {
 459         struct kse *newke;
 460         struct kse *ke;
 461         struct ksegrp *newkg;
 462         struct ksegrp *kg;
 463         struct proc *p;
 464         struct kse_mailbox mbx;
 465         int err;
 466
 467         p = td->td_proc;
 468         if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
 469                 return (err);
 470
 471         p->p_flag |= P_KSES; /* easier to just set it than to test and set */
 472         kg = td->td_ksegrp;
 473         if (uap->newgroup) {
 474                 if (p->p_numksegrps >= max_groups_per_proc)
 475                         return (EPROCLIM);
 476                 /*
 477                  * If we want a new KSEGRP it doesn't matter whether
 478                  * we have already fired up KSE mode before or not.
 479                  * We put the process in KSE mode and create a new KSEGRP
 480                  * and KSE. If our KSE has not got a mailbox yet then
 481                  * that doesn't matter, just leave it that way. It will
 482                  * ensure that this thread stay BOUND. It's possible
 483                  * that the call came form a threaded library and the main
 484                  * program knows nothing of threads.
 485                  */
 486                 newkg = ksegrp_alloc();
 487                 bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp,
 488                       kg_startzero, kg_endzero));
 489                 bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
 490                       RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
 491                 newke = kse_alloc();
 492         } else {
 493                 /*
 494                  * Otherwise, if we have already set this KSE
 495                  * to have a mailbox, we want to make another KSE here,
 496                  * but only if there are not already the limit, which
 497                  * is 1 per CPU max.
 498                  *
 499                  * If the current KSE doesn't have a mailbox we just use it
 500                  * and give it one.
 501                  *
 502                  * Because we don't like to access
 503                  * the KSE outside of schedlock if we are UNBOUND,
 504                  * (because it can change if we are preempted by an interrupt)
 505                  * we can deduce it as having a mailbox if we are UNBOUND,
 506                  * and only need to actually look at it if we are BOUND,
 507                  * which is safe.
 508                  */
 509                 if ((td->td_flags & TDF_UNBOUND) || td->td_kse->ke_mailbox) {
 510                         if (oiks_debug == 0) {
 511 #ifdef SMP
 512                         if (kg->kg_kses > mp_ncpus)
 513 #endif
 514                                 return (EPROCLIM);
 515                         }
 516                         newke = kse_alloc();
 517                 } else {
 518                         newke = NULL;
 519                 }
 520                 newkg = NULL;
 521         }
 522         if (newke) {
 523                 bzero(&newke->ke_startzero, RANGEOF(struct kse,
 524                       ke_startzero, ke_endzero));
 525 #if 0
 526                 bcopy(&ke->ke_startcopy, &newke->ke_startcopy,
 527                       RANGEOF(struct kse, ke_startcopy, ke_endcopy));
 528 #endif
 529                 /* For the first call this may not have been set */
 530                 if (td->td_standin == NULL) {
 531                         td->td_standin = thread_alloc();
 532                 }
 533                 mtx_lock_spin(&sched_lock);
 534                 if (newkg) {
 535                         if (p->p_numksegrps >= max_groups_per_proc) {
 536                                 mtx_unlock_spin(&sched_lock);
 537                                 ksegrp_free(newkg);
 538                                 kse_free(newke);
 539                                 return (EPROCLIM);
 540                         }
 541                         ksegrp_link(newkg, p);
 542                 }
 543                 else
 544                         newkg = kg;
 545                 kse_link(newke, newkg);
 546                 if (p->p_sflag & PS_NEEDSIGCHK)
 547                         newke->ke_flags |= KEF_ASTPENDING;
 548                 newke->ke_mailbox = uap->mbx;
 549                 newke->ke_upcall = mbx.km_func;
 550                 bcopy(&mbx.km_stack, &newke->ke_stack, sizeof(stack_t));
 551                 thread_schedule_upcall(td, newke);
 552                 mtx_unlock_spin(&sched_lock);
 553         } else {
 554                 /*
 555                  * If we didn't allocate a new KSE then the we are using
 556                  * the exisiting (BOUND) kse.
 557                  */
 558                 ke = td->td_kse;
 559                 ke->ke_mailbox = uap->mbx;
 560                 ke->ke_upcall = mbx.km_func;
 561                 bcopy(&mbx.km_stack, &ke->ke_stack, sizeof(stack_t));
 562         }
 563         /*
 564          * Fill out the KSE-mode specific fields of the new kse.
 565          */
 566
 567         td->td_retval[0] = 0;
 568         td->td_retval[1] = 0;
 569         return (0);
 570 }
 571
 572 /*
 573  * Fill a ucontext_t with a thread's context information.
 574  *
 575  * This is an analogue to getcontext(3).
 576  */
 577 void
 578 thread_getcontext(struct thread *td, ucontext_t *uc)
 579 {
 580
 581 /*
 582  * XXX this is declared in a MD include file, i386/include/ucontext.h but
 583  * is used in MI code.
 584  */
 585 #ifdef __i386__
 586         get_mcontext(td, &uc->uc_mcontext);
 587 #endif
 588         uc->uc_sigmask = td->td_proc->p_sigmask;
 589 }
 590
 591 /*
 592  * Set a thread's context from a ucontext_t.
 593  *
 594  * This is an analogue to setcontext(3).
 595  */
 596 int
 597 thread_setcontext(struct thread *td, ucontext_t *uc)
 598 {
 599         int ret;
 600
 601 /*
 602  * XXX this is declared in a MD include file, i386/include/ucontext.h but
 603  * is used in MI code.
 604  */
 605 #ifdef __i386__
 606         ret = set_mcontext(td, &uc->uc_mcontext);
 607 #else
 608         ret = ENOSYS;
 609 #endif
 610         if (ret == 0) {
 611                 SIG_CANTMASK(uc->uc_sigmask);
 612                 PROC_LOCK(td->td_proc);
 613                 td->td_proc->p_sigmask = uc->uc_sigmask;
 614                 PROC_UNLOCK(td->td_proc);
 615         }
 616         return (ret);
 617 }
 618
 619 /*
 620  * Initialize global thread allocation resources.
 621  */
 622 void
 623 threadinit(void)
 624 {
 625
 626 #ifndef __ia64__
 627         thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
 628             thread_ctor, thread_dtor, thread_init, thread_fini,
 629             UMA_ALIGN_CACHE, 0);
 630 #else
 631         /*
 632          * XXX the ia64 kstack allocator is really lame and is at the mercy
 633          * of contigmallloc().  This hackery is to pre-construct a whole
 634          * pile of thread structures with associated kernel stacks early
 635          * in the system startup while contigmalloc() still works. Once we
 636          * have them, keep them.  Sigh.
 637          */
 638         thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
 639             thread_ctor, thread_dtor, thread_init, thread_fini,
 640             UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
 641         uma_prealloc(thread_zone, 512);         /* XXX arbitary */
 642 #endif
 643         ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
 644             NULL, NULL, ksegrp_init, NULL,
 645             UMA_ALIGN_CACHE, 0);
 646         kse_zone = uma_zcreate("KSE", sched_sizeof_kse(),
 647             NULL, NULL, kse_init, NULL,
 648             UMA_ALIGN_CACHE, 0);
 649 }
 650
 651 /*
 652  * Stash an embarasingly extra thread into the zombie thread queue.
 653  */
 654 void
 655 thread_stash(struct thread *td)
 656 {
 657         mtx_lock_spin(&zombie_thread_lock);
 658         TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
 659         mtx_unlock_spin(&zombie_thread_lock);
 660 }
 661
 662 /*
 663  * Stash an embarasingly extra kse into the zombie kse queue.
 664  */
 665 void
 666 kse_stash(struct kse *ke)
 667 {
 668         mtx_lock_spin(&zombie_thread_lock);
 669         TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
 670         mtx_unlock_spin(&zombie_thread_lock);
 671 }
 672
 673 /*
 674  * Stash an embarasingly extra ksegrp into the zombie ksegrp queue.
 675  */
 676 void
 677 ksegrp_stash(struct ksegrp *kg)
 678 {
 679         mtx_lock_spin(&zombie_thread_lock);
 680         TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
 681         mtx_unlock_spin(&zombie_thread_lock);
 682 }
 683
 684 /*
 685  * Reap zombie threads.
 686  */
 687 void
 688 thread_reap(void)
 689 {
 690         struct thread *td_first, *td_next;
 691         struct kse *ke_first, *ke_next;
 692         struct ksegrp *kg_first, * kg_next;
 693
 694         /*
 695          * don't even bother to lock if none at this instant
 696          * We really don't care about the next instant..
 697          */
 698         if ((!TAILQ_EMPTY(&zombie_threads))
 699             || (!TAILQ_EMPTY(&zombie_kses))
 700             || (!TAILQ_EMPTY(&zombie_ksegrps))) {
 701                 mtx_lock_spin(&zombie_thread_lock);
 702                 td_first = TAILQ_FIRST(&zombie_threads);
 703                 ke_first = TAILQ_FIRST(&zombie_kses);
 704                 kg_first = TAILQ_FIRST(&zombie_ksegrps);
 705                 if (td_first)
 706                         TAILQ_INIT(&zombie_threads);
 707                 if (ke_first)
 708                         TAILQ_INIT(&zombie_kses);
 709                 if (kg_first)
 710                         TAILQ_INIT(&zombie_ksegrps);
 711                 mtx_unlock_spin(&zombie_thread_lock);
 712                 while (td_first) {
 713                         td_next = TAILQ_NEXT(td_first, td_runq);
 714                         thread_free(td_first);
 715                         td_first = td_next;
 716                 }
 717                 while (ke_first) {
 718                         ke_next = TAILQ_NEXT(ke_first, ke_procq);
 719                         kse_free(ke_first);
 720                         ke_first = ke_next;
 721                 }
 722                 while (kg_first) {
 723                         kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
 724                         ksegrp_free(kg_first);
 725                         kg_first = kg_next;
 726                 }
 727         }
 728 }
 729
 730 /*
 731  * Allocate a ksegrp.
 732  */
 733 struct ksegrp *
 734 ksegrp_alloc(void)
 735 {
 736         return (uma_zalloc(ksegrp_zone, M_WAITOK));
 737 }
 738
 739 /*
 740  * Allocate a kse.
 741  */
 742 struct kse *
 743 kse_alloc(void)
 744 {
 745         return (uma_zalloc(kse_zone, M_WAITOK));
 746 }
 747
 748 /*
 749  * Allocate a thread.
 750  */
 751 struct thread *
 752 thread_alloc(void)
 753 {
 754         thread_reap(); /* check if any zombies to get */
 755         return (uma_zalloc(thread_zone, M_WAITOK));
 756 }
 757
 758 /*
 759  * Deallocate a ksegrp.
 760  */
 761 void
 762 ksegrp_free(struct ksegrp *td)
 763 {
 764         uma_zfree(ksegrp_zone, td);
 765 }
 766
 767 /*
 768  * Deallocate a kse.
 769  */
 770 void
 771 kse_free(struct kse *td)
 772 {
 773         uma_zfree(kse_zone, td);
 774 }
 775
 776 /*
 777  * Deallocate a thread.
 778  */
 779 void
 780 thread_free(struct thread *td)
 781 {
 782         uma_zfree(thread_zone, td);
 783 }
 784
 785 /*
 786  * Store the thread context in the UTS's mailbox.
 787  * then add the mailbox at the head of a list we are building in user space.
 788  * The list is anchored in the ksegrp structure.
 789  */
 790 int
 791 thread_export_context(struct thread *td)
 792 {
 793         struct proc *p;
 794         struct ksegrp *kg;
 795         uintptr_t mbx;
 796         void *addr;
 797         int error;
 798         ucontext_t uc;
 799         uint temp;
 800
 801         p = td->td_proc;
 802         kg = td->td_ksegrp;
 803
 804         /* Export the user/machine context. */
 805 #if 0
 806         addr = (caddr_t)td->td_mailbox +
 807             offsetof(struct kse_thr_mailbox, tm_context);
 808 #else /* if user pointer arithmetic is valid in the kernel */
 809                 addr = (void *)(&td->td_mailbox->tm_context);
 810 #endif
 811         error = copyin(addr, &uc, sizeof(ucontext_t));
 812         if (error == 0) {
 813                 thread_getcontext(td, &uc);
 814                 error = copyout(&uc, addr, sizeof(ucontext_t));
 815
 816         }
 817         if (error) {
 818                 PROC_LOCK(p);
 819                 psignal(p, SIGSEGV);
 820                 PROC_UNLOCK(p);
 821                 return (error);
 822         }
 823         /* get address in latest mbox of list pointer */
 824 #if 0
 825         addr = (caddr_t)td->td_mailbox
 826             + offsetof(struct kse_thr_mailbox , tm_next);
 827 #else /* if user pointer arithmetic is valid in the kernel */
 828         addr = (void *)(&td->td_mailbox->tm_next);
 829 #endif
 830         /*
 831          * Put the saved address of the previous first
 832          * entry into this one
 833          */
 834         for (;;) {
 835                 mbx = (uintptr_t)kg->kg_completed;
 836                 if (suword(addr, mbx)) {
 837                         goto bad;
 838                 }
 839                 PROC_LOCK(p);
 840                 if (mbx == (uintptr_t)kg->kg_completed) {
 841                         kg->kg_completed = td->td_mailbox;
 842                         PROC_UNLOCK(p);
 843                         break;
 844                 }
 845                 PROC_UNLOCK(p);
 846         }
 847         addr = (caddr_t)td->td_mailbox
 848                  + offsetof(struct kse_thr_mailbox, tm_sticks);
 849         temp = fuword(addr) + td->td_usticks;
 850         if (suword(addr, temp))
 851                 goto bad;
 852         return (0);
 853
 854 bad:
 855         PROC_LOCK(p);
 856         psignal(p, SIGSEGV);
 857         PROC_UNLOCK(p);
 858         return (EFAULT);
 859 }
 860
 861 /*
 862  * Take the list of completed mailboxes for this KSEGRP and put them on this
 863  * KSE's mailbox as it's the next one going up.
 864  */
 865 static int
 866 thread_link_mboxes(struct ksegrp *kg, struct kse *ke)
 867 {
 868         struct proc *p = kg->kg_proc;
 869         void *addr;
 870         uintptr_t mbx;
 871
 872 #if 0
 873         addr = (caddr_t)ke->ke_mailbox
 874             + offsetof(struct kse_mailbox, km_completed);
 875 #else /* if user pointer arithmetic is valid in the kernel */
 876                 addr = (void *)(&ke->ke_mailbox->km_completed);
 877 #endif
 878         for (;;) {
 879                 mbx = (uintptr_t)kg->kg_completed;
 880                 if (suword(addr, mbx)) {
 881                         PROC_LOCK(p);
 882                         psignal(p, SIGSEGV);
 883                         PROC_UNLOCK(p);
 884                         return (EFAULT);
 885                 }
 886                 /* XXXKSE could use atomic CMPXCH here */
 887                 PROC_LOCK(p);
 888                 if (mbx == (uintptr_t)kg->kg_completed) {
 889                         kg->kg_completed = NULL;
 890                         PROC_UNLOCK(p);
 891                         break;
 892                 }
 893                 PROC_UNLOCK(p);
 894         }
 895         return (0);
 896 }
 897
 898 /*
 899  * This function should be called at statclock interrupt time
 900  */
 901 int
 902 thread_add_ticks_intr(int user, uint ticks)
 903 {
 904         struct thread *td = curthread;
 905         struct kse *ke = td->td_kse;
 906
 907         if (ke->ke_mailbox == NULL)
 908                 return -1;
 909         if (user) {
 910                 /* Current always do via ast() */
 911                 ke->ke_flags |= KEF_ASTPENDING;
 912                 ke->ke_uuticks += ticks;
 913         } else {
 914                 if (td->td_mailbox != NULL)
 915                         td->td_usticks += ticks;
 916                 else
 917                         ke->ke_usticks += ticks;
 918         }
 919         return 0;
 920 }
 921
 922 static int
 923 thread_update_uticks(void)
 924 {
 925         struct thread *td = curthread;
 926         struct proc *p = td->td_proc;
 927         struct kse *ke = td->td_kse;
 928         struct kse_thr_mailbox *tmbx;
 929         caddr_t addr;
 930         uint uticks, sticks;
 931
 932         KASSERT(!(td->td_flags & TDF_UNBOUND), ("thread not bound."));
 933
 934         if (ke->ke_mailbox == NULL)
 935                 return 0;
 936
 937         uticks = ke->ke_uuticks;
 938         ke->ke_uuticks = 0;
 939         sticks = ke->ke_usticks;
 940         ke->ke_usticks = 0;
 941         tmbx = (void *)fuword((caddr_t)ke->ke_mailbox
 942                         + offsetof(struct kse_mailbox, km_curthread));
 943         if ((tmbx == NULL) || (tmbx == (void *)-1))
 944                 return 0;
 945         if (uticks) {
 946                 addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_uticks);
 947                 uticks += fuword(addr);
 948                 if (suword(addr, uticks))
 949                         goto bad;
 950         }
 951         if (sticks) {
 952                 addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_sticks);
 953                 sticks += fuword(addr);
 954                 if (suword(addr, sticks))
 955                         goto bad;
 956         }
 957         return 0;
 958 bad:
 959         PROC_LOCK(p);
 960         psignal(p, SIGSEGV);
 961         PROC_UNLOCK(p);
 962         return -1;
 963 }
 964
 965 /*
 966  * Discard the current thread and exit from its context.
 967  *
 968  * Because we can't free a thread while we're operating under its context,
 969  * push the current thread into our KSE's ke_tdspare slot, freeing the
 970  * thread that might be there currently. Because we know that only this
 971  * processor will run our KSE, we needn't worry about someone else grabbing
 972  * our context before we do a cpu_throw.
 973  */
 974 void
 975 thread_exit(void)
 976 {
 977         struct thread *td;
 978         struct kse *ke;
 979         struct proc *p;
 980         struct ksegrp   *kg;
 981
 982         td = curthread;
 983         kg = td->td_ksegrp;
 984         p = td->td_proc;
 985         ke = td->td_kse;
 986
 987         mtx_assert(&sched_lock, MA_OWNED);
 988         KASSERT(p != NULL, ("thread exiting without a process"));
 989         KASSERT(ke != NULL, ("thread exiting without a kse"));
 990         KASSERT(kg != NULL, ("thread exiting without a kse group"));
 991         PROC_LOCK_ASSERT(p, MA_OWNED);
 992         CTR1(KTR_PROC, "thread_exit: thread %p", td);
 993         KASSERT(!mtx_owned(&Giant), ("dying thread owns giant"));
 994
 995         if (ke->ke_tdspare != NULL) {
 996                 thread_stash(ke->ke_tdspare);
 997                 ke->ke_tdspare = NULL;
 998         }
 999         if (td->td_standin != NULL) {
1000                 thread_stash(td->td_standin);
1001                 td->td_standin = NULL;
1002         }
1003
1004         cpu_thread_exit(td);    /* XXXSMP */
1005
1006         /*
1007          * The last thread is left attached to the process
1008          * So that the whole bundle gets recycled. Skip
1009          * all this stuff.
1010          */
1011         if (p->p_numthreads > 1) {
1012                 /*
1013                  * Unlink this thread from its proc and the kseg.
1014                  * In keeping with the other structs we probably should
1015                  * have a thread_unlink() that does some of this but it
1016                  * would only be called from here (I think) so it would
1017                  * be a waste. (might be useful for proc_fini() as well.)
1018                  */
1019                 TAILQ_REMOVE(&p->p_threads, td, td_plist);
1020                 p->p_numthreads--;
1021                 TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
1022                 kg->kg_numthreads--;
1023                 /*
1024                  * The test below is NOT true if we are the
1025                  * sole exiting thread. P_STOPPED_SNGL is unset
1026                  * in exit1() after it is the only survivor.
1027                  */
1028                 if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1029                         if (p->p_numthreads == p->p_suspcount) {
1030                                 thread_unsuspend_one(p->p_singlethread);
1031                         }
1032                 }
1033
1034                 /* Reassign this thread's KSE. */
1035                 ke->ke_thread = NULL;
1036                 td->td_kse = NULL;
1037                 ke->ke_state = KES_UNQUEUED;
1038                 KASSERT((ke->ke_bound != td),
1039                     ("thread_exit: entered with ke_bound set"));
1040
1041                 /*
1042                  * The reason for all this hoopla is
1043                  * an attempt to stop our thread stack from being freed
1044                  * until AFTER we have stopped running on it.
1045                  * Since we are under schedlock, almost any method where
1046                  * it is eventually freed by someone else is probably ok.
1047                  * (Especially if they do it under schedlock). We could
1048                  * almost free it here if we could be certain that
1049                  * the uma code wouldn't pull it apart immediatly,
1050                  * but unfortunatly we can not guarantee that.
1051                  *
1052                  * For threads that are exiting and NOT killing their
1053                  * KSEs we can just stash it in the KSE, however
1054                  * in the case where the KSE is also being deallocated,
1055                  * we need to store it somewhere else. It turns out that
1056                  * we will never free the last KSE, so there is always one
1057                  * other KSE available. We might as well just choose one
1058                  * and stash it there. Being under schedlock should make that
1059                  * safe.
1060                  *
1061                  * In borrower threads, we can stash it in the lender
1062                  * Where it won't be needed until this thread is long gone.
1063                  * Borrower threads can't kill their KSE anyhow, so even
1064                  * the KSE would be a safe place for them. It is not
1065                  * necessary to have a KSE (or KSEGRP) at all beyond this
1066                  * point, while we are under the protection of schedlock.
1067                  *
1068                  * Either give the KSE to another thread to use (or make
1069                  * it idle), or free it entirely, possibly along with its
1070                  * ksegrp if it's the last one.
1071                  */
1072                 if (ke->ke_flags & KEF_EXIT) {
1073                         kse_unlink(ke);
1074                         /*
1075                          * Designate another KSE to hold our thread.
1076                          * Safe as long as we abide by whatever lock
1077                          * we control it with.. The other KSE will not
1078                          * be able to run it until we release the schelock,
1079                          * but we need to be careful about it deciding to
1080                          * write to the stack before then. Luckily
1081                          * I believe that while another thread's
1082                          * standin thread can be used in this way, the
1083                          * spare thread for the KSE cannot be used without
1084                          * holding schedlock at least once.
1085                          */
1086                         ke =  FIRST_KSE_IN_PROC(p);
1087                 } else {
1088                         kse_reassign(ke);
1089                 }
1090 #if 0
1091                 if (ke->ke_bound) {
1092                         /*
1093                          * WE are a borrower..
1094                          * stash our thread with the owner.
1095                          */
1096                         if (ke->ke_bound->td_standin) {
1097                                 thread_stash(ke->ke_bound->td_standin);
1098                         }
1099                         ke->ke_bound->td_standin = td;
1100                 } else {
1101 #endif
1102                         if (ke->ke_tdspare != NULL) {
1103                                 thread_stash(ke->ke_tdspare);
1104                                 ke->ke_tdspare = NULL;
1105                         }
1106                         ke->ke_tdspare = td;
1107 #if 0
1108                 }
1109 #endif
1110                 PROC_UNLOCK(p);
1111                 td->td_state    = TDS_INACTIVE;
1112                 td->td_proc     = NULL;
1113                 td->td_ksegrp   = NULL;
1114                 td->td_last_kse = NULL;
1115         } else {
1116                 PROC_UNLOCK(p);
1117         }
1118
1119         cpu_throw();
1120         /* NOTREACHED */
1121 }
1122
1123 /*
1124  * Link a thread to a process.
1125  * set up anything that needs to be initialized for it to
1126  * be used by the process.
1127  *
1128  * Note that we do not link to the proc's ucred here.
1129  * The thread is linked as if running but no KSE assigned.
1130  */
1131 void
1132 thread_link(struct thread *td, struct ksegrp *kg)
1133 {
1134         struct proc *p;
1135
1136         p = kg->kg_proc;
1137         td->td_state = TDS_INACTIVE;
1138         td->td_proc     = p;
1139         td->td_ksegrp   = kg;
1140         td->td_last_kse = NULL;
1141
1142         LIST_INIT(&td->td_contested);
1143         callout_init(&td->td_slpcallout, 1);
1144         TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
1145         TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
1146         p->p_numthreads++;
1147         kg->kg_numthreads++;
1148         if (oiks_debug && (p->p_numthreads > oiks_max_threads_per_proc)) {
1149                 printf("OIKS %d\n", p->p_numthreads);
1150                 if (oiks_debug > 1)
1151                         Debugger("OIKS");
1152         }
1153         td->td_kse      = NULL;
1154 }
1155
1156 void
1157 kse_purge(struct proc *p, struct thread *td)
1158 {
1159         struct kse *ke;
1160         struct ksegrp *kg;
1161
1162         KASSERT(p->p_numthreads == 1, ("bad thread number"));
1163         mtx_lock_spin(&sched_lock);
1164         while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
1165                 while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
1166                         TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
1167                         kg->kg_idle_kses--;
1168                         TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
1169                         kg->kg_kses--;
1170                         if (ke->ke_tdspare)
1171                                 thread_stash(ke->ke_tdspare);
1172                         kse_stash(ke);
1173                 }
1174                 TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
1175                 p->p_numksegrps--;
1176                 KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
1177                     ((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
1178                         ("wrong kg_kses"));
1179                 if (kg != td->td_ksegrp) {
1180                         ksegrp_stash(kg);
1181                 }
1182         }
1183         TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
1184         p->p_numksegrps++;
1185         mtx_unlock_spin(&sched_lock);
1186 }
1187
1188
1189 /*
1190  * Create a thread and schedule it for upcall on the KSE given.
1191  */
1192 struct thread *
1193 thread_schedule_upcall(struct thread *td, struct kse *ke)
1194 {
1195         struct thread *td2;
1196         struct ksegrp *kg;
1197         int newkse;
1198
1199         mtx_assert(&sched_lock, MA_OWNED);
1200         newkse = (ke != td->td_kse);
1201
1202         /*
1203          * If the kse is already owned by another thread then we can't
1204          * schedule an upcall because the other thread must be BOUND
1205          * which means it is not in a position to take an upcall.
1206          * We must be borrowing the KSE to allow us to complete some in-kernel
1207          * work. When we complete, the Bound thread will have teh chance to
1208          * complete. This thread will sleep as planned. Hopefully there will
1209          * eventually be un unbound thread that can be converted to an
1210          * upcall to report the completion of this thread.
1211          */
1212         if (ke->ke_bound && ((ke->ke_bound->td_flags & TDF_UNBOUND) == 0)) {
1213                 return (NULL);
1214         }
1215         KASSERT((ke->ke_bound == NULL), ("kse already bound"));
1216
1217         if (ke->ke_state == KES_IDLE) {
1218                 kg = ke->ke_ksegrp;
1219                 TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
1220                 kg->kg_idle_kses--;
1221                 ke->ke_state = KES_UNQUEUED;
1222         }
1223         if ((td2 = td->td_standin) != NULL) {
1224                 td->td_standin = NULL;
1225         } else {
1226                 if (newkse)
1227                         panic("no reserve thread when called with a new kse");
1228                 /*
1229                  * If called from (e.g.) sleep and we do not have
1230                  * a reserve thread, then we've used it, so do not
1231                  * create an upcall.
1232                  */
1233                 return (NULL);
1234         }
1235         CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)",
1236              td2, td->td_proc->p_pid, td->td_proc->p_comm);
1237         bzero(&td2->td_startzero,
1238             (unsigned)RANGEOF(struct thread, td_startzero, td_endzero));
1239         bcopy(&td->td_startcopy, &td2->td_startcopy,
1240             (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
1241         thread_link(td2, ke->ke_ksegrp);
1242         cpu_set_upcall(td2, td->td_pcb);
1243
1244         /*
1245          * XXXKSE do we really need this? (default values for the
1246          * frame).
1247          */
1248         bcopy(td->td_frame, td2->td_frame, sizeof(struct trapframe));
1249
1250         /*
1251          * Bind the new thread to the KSE,
1252          * and if it's our KSE, lend it back to ourself
1253          * so we can continue running.
1254          */
1255         td2->td_ucred = crhold(td->td_ucred);
1256         td2->td_flags = TDF_UPCALLING; /* note: BOUND */
1257         td2->td_kse = ke;
1258         td2->td_state = TDS_CAN_RUN;
1259         td2->td_inhibitors = 0;
1260         /*
1261          * If called from msleep(), we are working on the current
1262          * KSE so fake that we borrowed it. If called from
1263          * kse_create(), don't, as we have a new kse too.
1264          */
1265         if (!newkse) {
1266                 /*
1267                  * This thread will be scheduled when the current thread
1268                  * blocks, exits or tries to enter userspace, (which ever
1269                  * happens first). When that happens the KSe will "revert"
1270                  * to this thread in a BOUND manner. Since we are called
1271                  * from msleep() this is going to be "very soon" in nearly
1272                  * all cases.
1273                  */
1274                 ke->ke_bound = td2;
1275                 TD_SET_LOAN(td2);
1276         } else {
1277                 ke->ke_bound = NULL;
1278                 ke->ke_thread = td2;
1279                 ke->ke_state = KES_THREAD;
1280                 setrunqueue(td2);
1281         }
1282         return (td2);   /* bogus.. should be a void function */
1283 }
1284
1285 /*
1286  * Schedule an upcall to notify a KSE process recieved signals.
1287  *
1288  * XXX - Modifying a sigset_t like this is totally bogus.
1289  */
1290 struct thread *
1291 signal_upcall(struct proc *p, int sig)
1292 {
1293         struct thread *td, *td2;
1294         struct kse *ke;
1295         sigset_t ss;
1296         int error;
1297
1298         PROC_LOCK_ASSERT(p, MA_OWNED);
1299 return (NULL);
1300
1301         td = FIRST_THREAD_IN_PROC(p);
1302         ke = td->td_kse;
1303         PROC_UNLOCK(p);
1304         error = copyin(&ke->ke_mailbox->km_sigscaught, &ss, sizeof(sigset_t));
1305         PROC_LOCK(p);
1306         if (error)
1307                 return (NULL);
1308         SIGADDSET(ss, sig);
1309         PROC_UNLOCK(p);
1310         error = copyout(&ss, &ke->ke_mailbox->km_sigscaught, sizeof(sigset_t));
1311         PROC_LOCK(p);
1312         if (error)
1313                 return (NULL);
1314         if (td->td_standin == NULL)
1315                 td->td_standin = thread_alloc();
1316         mtx_lock_spin(&sched_lock);
1317         td2 = thread_schedule_upcall(td, ke); /* Bogus JRE */
1318         mtx_unlock_spin(&sched_lock);
1319         return (td2);
1320 }
1321
1322 /*
1323  * setup done on the thread when it enters the kernel.
1324  * XXXKSE Presently only for syscalls but eventually all kernel entries.
1325  */
1326 void
1327 thread_user_enter(struct proc *p, struct thread *td)
1328 {
1329         struct kse *ke;
1330
1331         /*
1332          * First check that we shouldn't just abort.
1333          * But check if we are the single thread first!
1334          * XXX p_singlethread not locked, but should be safe.
1335          */
1336         if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) {
1337                 PROC_LOCK(p);
1338                 mtx_lock_spin(&sched_lock);
1339                 thread_exit();
1340                 /* NOTREACHED */
1341         }
1342
1343         /*
1344          * If we are doing a syscall in a KSE environment,
1345          * note where our mailbox is. There is always the
1346          * possibility that we could do this lazily (in sleep()),
1347          * but for now do it every time.
1348          */
1349         ke = td->td_kse;
1350         if (ke->ke_mailbox != NULL) {
1351 #if 0
1352                 td->td_mailbox = (void *)fuword((caddr_t)ke->ke_mailbox
1353                     + offsetof(struct kse_mailbox, km_curthread));
1354 #else /* if user pointer arithmetic is ok in the kernel */
1355                 td->td_mailbox =
1356                     (void *)fuword( (void *)&ke->ke_mailbox->km_curthread);
1357 #endif
1358                 if ((td->td_mailbox == NULL) ||
1359                     (td->td_mailbox == (void *)-1)) {
1360                         td->td_mailbox = NULL;  /* single thread it.. */
1361                         mtx_lock_spin(&sched_lock);
1362                         td->td_flags &= ~TDF_UNBOUND;
1363                         mtx_unlock_spin(&sched_lock);
1364                 } else {
1365                         /*
1366                          * when thread limit reached, act like that the thread
1367                          * has already done an upcall.
1368                          */
1369                         if (p->p_numthreads > max_threads_per_proc) {
1370                                 if (td->td_standin != NULL)
1371                                         thread_stash(td->td_standin);
1372                                 td->td_standin = NULL;
1373                         } else {
1374                                 if (td->td_standin == NULL)
1375                                         td->td_standin = thread_alloc();
1376                         }
1377                         mtx_lock_spin(&sched_lock);
1378                         td->td_flags |= TDF_UNBOUND;
1379                         mtx_unlock_spin(&sched_lock);
1380                         td->td_usticks = 0;
1381                 }
1382         }
1383 }
1384
1385 /*
1386  * The extra work we go through if we are a threaded process when we
1387  * return to userland.
1388  *
1389  * If we are a KSE process and returning to user mode, check for
1390  * extra work to do before we return (e.g. for more syscalls
1391  * to complete first).  If we were in a critical section, we should
1392  * just return to let it finish. Same if we were in the UTS (in
1393  * which case the mailbox's context's busy indicator will be set).
1394  * The only traps we suport will have set the mailbox.
1395  * We will clear it here.
1396  */
1397 int
1398 thread_userret(struct thread *td, struct trapframe *frame)
1399 {
1400         int error;
1401         int unbound;
1402         struct kse *ke;
1403         struct ksegrp *kg;
1404         struct thread *td2;
1405         struct proc *p;
1406         struct timespec ts;
1407
1408         error = 0;
1409
1410         unbound = td->td_flags & TDF_UNBOUND;
1411
1412         kg = td->td_ksegrp;
1413         p = td->td_proc;
1414
1415         /*
1416          * Originally bound threads never upcall but they may
1417          * loan out their KSE at this point.
1418          * Upcalls imply bound.. They also may want to do some Philantropy.
1419          * Unbound threads on the other hand either yield to other work
1420          * or transform into an upcall.
1421          * (having saved their context to user space in both cases)
1422          */
1423         if (unbound) {
1424                 /*
1425                  * We are an unbound thread, looking to return to
1426                  * user space.
1427                  * THere are several possibilities:
1428                  * 1) we are using a borrowed KSE. save state and exit.
1429                  *    kse_reassign() will recycle the kse as needed,
1430                  * 2) we are not.. save state, and then convert ourself
1431                  *    to be an upcall, bound to the KSE.
1432                  *    if there are others that need the kse,
1433                  *    give them a chance by doing an mi_switch().
1434                  *    Because we are bound, control will eventually return
1435                  *    to us here.
1436                  * ***
1437                  * Save the thread's context, and link it
1438                  * into the KSEGRP's list of completed threads.
1439                  */
1440                 error = thread_export_context(td);
1441                 td->td_mailbox = NULL;
1442                 td->td_usticks = 0;
1443                 if (error) {
1444                         /*
1445                          * If we are not running on a borrowed KSE, then
1446                          * failing to do the KSE operation just defaults
1447                          * back to synchonous operation, so just return from
1448                          * the syscall. If it IS borrowed, there is nothing
1449                          * we can do. We just lose that context. We
1450                          * probably should note this somewhere and send
1451                          * the process a signal.
1452                          */
1453                         PROC_LOCK(td->td_proc);
1454                         psignal(td->td_proc, SIGSEGV);
1455                         mtx_lock_spin(&sched_lock);
1456                         if (td->td_kse->ke_bound == NULL) {
1457                                 td->td_flags &= ~TDF_UNBOUND;
1458                                 PROC_UNLOCK(td->td_proc);
1459                                 mtx_unlock_spin(&sched_lock);
1460                                 thread_update_uticks();
1461                                 return (error); /* go sync */
1462                         }
1463                         thread_exit();
1464                 }
1465
1466                 /*
1467                  * if the KSE is owned and we are borrowing it,
1468                  * don't make an upcall, just exit so that the owner
1469                  * can get its KSE if it wants it.
1470                  * Our context is already safely stored for later
1471                  * use by the UTS.
1472                  */
1473                 PROC_LOCK(p);
1474                 mtx_lock_spin(&sched_lock);
1475                 if (td->td_kse->ke_bound) {
1476                         thread_exit();
1477                 }
1478                 PROC_UNLOCK(p);
1479
1480                 /*
1481                  * Turn ourself into a bound upcall.
1482                  * We will rely on kse_reassign()
1483                  * to make us run at a later time.
1484                  * We should look just like a sheduled upcall
1485                  * from msleep() or cv_wait().
1486                  */
1487                 td->td_flags &= ~TDF_UNBOUND;
1488                 td->td_flags |= TDF_UPCALLING;
1489                 /* Only get here if we have become an upcall */
1490
1491         } else {
1492                 mtx_lock_spin(&sched_lock);
1493         }
1494         /*
1495          * We ARE going back to userland with this KSE.
1496          * Check for threads that need to borrow it.
1497          * Optimisation: don't call mi_switch if no-one wants the KSE.
1498          * Any other thread that comes ready after this missed the boat.
1499          */
1500         ke = td->td_kse;
1501         if ((td2 = kg->kg_last_assigned))
1502                 td2 = TAILQ_NEXT(td2, td_runq);
1503         else
1504                 td2 = TAILQ_FIRST(&kg->kg_runq);
1505         if (td2)  {
1506                 /*
1507                  * force a switch to more urgent 'in kernel'
1508                  * work. Control will return to this thread
1509                  * when there is no more work to do.
1510                  * kse_reassign() will do tha for us.
1511                  */
1512                 TD_SET_LOAN(td);
1513                 ke->ke_bound = td;
1514                 ke->ke_thread = NULL;
1515                 mi_switch(); /* kse_reassign() will (re)find td2 */
1516         }
1517         mtx_unlock_spin(&sched_lock);
1518
1519         /*
1520          * Optimisation:
1521          * Ensure that we have a spare thread available,
1522          * for when we re-enter the kernel.
1523          */
1524         if (td->td_standin == NULL) {
1525                 if (ke->ke_tdspare) {
1526                         td->td_standin = ke->ke_tdspare;
1527                         ke->ke_tdspare = NULL;
1528                 } else {
1529                         td->td_standin = thread_alloc();
1530                 }
1531         }
1532
1533         thread_update_uticks();
1534         /*
1535          * To get here, we know there is no other need for our
1536          * KSE so we can proceed. If not upcalling, go back to
1537          * userspace. If we are, get the upcall set up.
1538          */
1539         if ((td->td_flags & TDF_UPCALLING) == 0)
1540                 return (0);
1541
1542         /*
1543          * We must be an upcall to get this far.
1544          * There is no more work to do and we are going to ride
1545          * this thead/KSE up to userland as an upcall.
1546          * Do the last parts of the setup needed for the upcall.
1547          */
1548         CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)",
1549             td, td->td_proc->p_pid, td->td_proc->p_comm);
1550
1551         /*
1552          * Set user context to the UTS.
1553          */
1554         cpu_set_upcall_kse(td, ke);
1555
1556         /*
1557          * Put any completed mailboxes on this KSE's list.
1558          */
1559         error = thread_link_mboxes(kg, ke);
1560         if (error)
1561                 goto bad;
1562
1563         /*
1564          * Set state and mailbox.
1565          * From now on we are just a bound outgoing process.
1566          * **Problem** userret is often called several times.
1567          * it would be nice if this all happenned only on the first time
1568          * through. (the scan for extra work etc.)
1569          */
1570         mtx_lock_spin(&sched_lock);
1571         td->td_flags &= ~TDF_UPCALLING;
1572         mtx_unlock_spin(&sched_lock);
1573 #if 0
1574         error = suword((caddr_t)ke->ke_mailbox +
1575             offsetof(struct kse_mailbox, km_curthread), 0);
1576 #else   /* if user pointer arithmetic is ok in the kernel */
1577         error = suword((caddr_t)&ke->ke_mailbox->km_curthread, 0);
1578 #endif
1579         ke->ke_uuticks = ke->ke_usticks = 0;
1580         if (!error) {
1581                 nanotime(&ts);
1582                 if (copyout(&ts, (caddr_t)&ke->ke_mailbox->km_timeofday,
1583                     sizeof(ts))) {
1584                         goto bad;
1585                 }
1586         }
1587         return (0);
1588
1589 bad:
1590         /*
1591          * Things are going to be so screwed we should just kill the process.
1592          * how do we do that?
1593          */
1594         PROC_LOCK(td->td_proc);
1595         psignal(td->td_proc, SIGSEGV);
1596         PROC_UNLOCK(td->td_proc);
1597         return (error); /* go sync */
1598 }
1599
1600 /*
1601  * Enforce single-threading.
1602  *
1603  * Returns 1 if the caller must abort (another thread is waiting to
1604  * exit the process or similar). Process is locked!
1605  * Returns 0 when you are successfully the only thread running.
1606  * A process has successfully single threaded in the suspend mode when
1607  * There are no threads in user mode. Threads in the kernel must be
1608  * allowed to continue until they get to the user boundary. They may even
1609  * copy out their return values and data before suspending. They may however be
1610  * accellerated in reaching the user boundary as we will wake up
1611  * any sleeping threads that are interruptable. (PCATCH).
1612  */
1613 int
1614 thread_single(int force_exit)
1615 {
1616         struct thread *td;
1617         struct thread *td2;
1618         struct proc *p;
1619
1620         td = curthread;
1621         p = td->td_proc;
1622         PROC_LOCK_ASSERT(p, MA_OWNED);
1623         KASSERT((td != NULL), ("curthread is NULL"));
1624
1625         if ((p->p_flag & P_KSES) == 0)
1626                 return (0);
1627
1628         /* Is someone already single threading? */
1629         if (p->p_singlethread)
1630                 return (1);
1631
1632         if (force_exit == SINGLE_EXIT)
1633                 p->p_flag |= P_SINGLE_EXIT;
1634         else
1635                 p->p_flag &= ~P_SINGLE_EXIT;
1636         p->p_flag |= P_STOPPED_SINGLE;
1637         p->p_singlethread = td;
1638         /* XXXKSE Which lock protects the below values? */
1639         while ((p->p_numthreads - p->p_suspcount) != 1) {
1640                 mtx_lock_spin(&sched_lock);
1641                 FOREACH_THREAD_IN_PROC(p, td2) {
1642                         if (td2 == td)
1643                                 continue;
1644                         if (TD_IS_INHIBITED(td2)) {
1645                                 if (force_exit == SINGLE_EXIT) {
1646                                         if (TD_IS_SUSPENDED(td2)) {
1647                                                 thread_unsuspend_one(td2);
1648                                         }
1649                                         if (TD_ON_SLEEPQ(td2) &&
1650                                             (td2->td_flags & TDF_SINTR)) {
1651                                                 if (td2->td_flags & TDF_CVWAITQ)
1652                                                         cv_abort(td2);
1653                                                 else
1654                                                         abortsleep(td2);
1655                                         }
1656                                 } else {
1657                                         if (TD_IS_SUSPENDED(td2))
1658                                                 continue;
1659                                         /* maybe other inhibitted states too? */
1660                                         if (TD_IS_SLEEPING(td2))
1661                                                 thread_suspend_one(td2);
1662                                 }
1663                         }
1664                 }
1665                 /*
1666                  * Maybe we suspended some threads.. was it enough?
1667                  */
1668                 if ((p->p_numthreads - p->p_suspcount) == 1) {
1669                         mtx_unlock_spin(&sched_lock);
1670                         break;
1671                 }
1672
1673                 /*
1674                  * Wake us up when everyone else has suspended.
1675                  * In the mean time we suspend as well.
1676                  */
1677                 thread_suspend_one(td);
1678                 mtx_unlock(&Giant);
1679                 PROC_UNLOCK(p);
1680                 mi_switch();
1681                 mtx_unlock_spin(&sched_lock);
1682                 mtx_lock(&Giant);
1683                 PROC_LOCK(p);
1684         }
1685         if (force_exit == SINGLE_EXIT)
1686                 kse_purge(p, td);
1687         return (0);
1688 }
1689
1690 /*
1691  * Called in from locations that can safely check to see
1692  * whether we have to suspend or at least throttle for a
1693  * single-thread event (e.g. fork).
1694  *
1695  * Such locations include userret().
1696  * If the "return_instead" argument is non zero, the thread must be able to
1697  * accept 0 (caller may continue), or 1 (caller must abort) as a result.
1698  *
1699  * The 'return_instead' argument tells the function if it may do a
1700  * thread_exit() or suspend, or whether the caller must abort and back
1701  * out instead.
1702  *
1703  * If the thread that set the single_threading request has set the
1704  * P_SINGLE_EXIT bit in the process flags then this call will never return
1705  * if 'return_instead' is false, but will exit.
1706  *
1707  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
1708  *---------------+--------------------+---------------------
1709  *       0       | returns 0          |   returns 0 or 1
1710  *               | when ST ends       |   immediatly
1711  *---------------+--------------------+---------------------
1712  *       1       | thread exits       |   returns 1
1713  *               |                    |  immediatly
1714  * 0 = thread_exit() or suspension ok,
1715  * other = return error instead of stopping the thread.
1716  *
1717  * While a full suspension is under effect, even a single threading
1718  * thread would be suspended if it made this call (but it shouldn't).
1719  * This call should only be made from places where
1720  * thread_exit() would be safe as that may be the outcome unless
1721  * return_instead is set.
1722  */
1723 int
1724 thread_suspend_check(int return_instead)
1725 {
1726         struct thread *td;
1727         struct proc *p;
1728         struct kse *ke;
1729         struct ksegrp *kg;
1730
1731         td = curthread;
1732         p = td->td_proc;
1733         kg = td->td_ksegrp;
1734         PROC_LOCK_ASSERT(p, MA_OWNED);
1735         while (P_SHOULDSTOP(p)) {
1736                 if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1737                         KASSERT(p->p_singlethread != NULL,
1738                             ("singlethread not set"));
1739                         /*
1740                          * The only suspension in action is a
1741                          * single-threading. Single threader need not stop.
1742                          * XXX Should be safe to access unlocked
1743                          * as it can only be set to be true by us.
1744                          */
1745                         if (p->p_singlethread == td)
1746                                 return (0);     /* Exempt from stopping. */
1747                 }
1748                 if (return_instead)
1749                         return (1);
1750
1751                 /*
1752                  * If the process is waiting for us to exit,
1753                  * this thread should just suicide.
1754                  * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
1755                  */
1756                 if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1757                         mtx_lock_spin(&sched_lock);
1758                         while (mtx_owned(&Giant))
1759                                 mtx_unlock(&Giant);
1760                         /*
1761                          * free extra kses and ksegrps, we needn't worry
1762                          * about if current thread is in same ksegrp as
1763                          * p_singlethread and last kse in the group
1764                          * could be killed, this is protected by kg_numthreads,
1765                          * in this case, we deduce that kg_numthreads must > 1.
1766                          */
1767                         ke = td->td_kse;
1768                         if (ke->ke_bound == NULL &&
1769                             ((kg->kg_kses != 1) || (kg->kg_numthreads == 1)))
1770                                 ke->ke_flags |= KEF_EXIT;
1771                         thread_exit();
1772                 }
1773
1774                 /*
1775                  * When a thread suspends, it just
1776                  * moves to the processes's suspend queue
1777                  * and stays there.
1778                  *
1779                  * XXXKSE if TDF_BOUND is true
1780                  * it will not release it's KSE which might
1781                  * lead to deadlock if there are not enough KSEs
1782                  * to complete all waiting threads.
1783                  * Maybe be able to 'lend' it out again.
1784                  * (lent kse's can not go back to userland?)
1785                  * and can only be lent in STOPPED state.
1786                  */
1787                 mtx_lock_spin(&sched_lock);
1788                 if ((p->p_flag & P_STOPPED_SIG) &&
1789                     (p->p_suspcount+1 == p->p_numthreads)) {
1790                         mtx_unlock_spin(&sched_lock);
1791                         PROC_LOCK(p->p_pptr);
1792                         if ((p->p_pptr->p_procsig->ps_flag &
1793                                 PS_NOCLDSTOP) == 0) {
1794                                 psignal(p->p_pptr, SIGCHLD);
1795                         }
1796                         PROC_UNLOCK(p->p_pptr);
1797                         mtx_lock_spin(&sched_lock);
1798                 }
1799                 mtx_assert(&Giant, MA_NOTOWNED);
1800                 thread_suspend_one(td);
1801                 PROC_UNLOCK(p);
1802                 if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1803                         if (p->p_numthreads == p->p_suspcount) {
1804                                 thread_unsuspend_one(p->p_singlethread);
1805                         }
1806                 }
1807                 p->p_stats->p_ru.ru_nivcsw++;
1808                 mi_switch();
1809                 mtx_unlock_spin(&sched_lock);
1810                 PROC_LOCK(p);
1811         }
1812         return (0);
1813 }
1814
1815 void
1816 thread_suspend_one(struct thread *td)
1817 {
1818         struct proc *p = td->td_proc;
1819
1820         mtx_assert(&sched_lock, MA_OWNED);
1821         p->p_suspcount++;
1822         TD_SET_SUSPENDED(td);
1823         TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
1824         /*
1825          * Hack: If we are suspending but are on the sleep queue
1826          * then we are in msleep or the cv equivalent. We
1827          * want to look like we have two Inhibitors.
1828          * May already be set.. doesn't matter.
1829          */
1830         if (TD_ON_SLEEPQ(td))
1831                 TD_SET_SLEEPING(td);
1832 }
1833
1834 void
1835 thread_unsuspend_one(struct thread *td)
1836 {
1837         struct proc *p = td->td_proc;
1838
1839         mtx_assert(&sched_lock, MA_OWNED);
1840         TAILQ_REMOVE(&p->p_suspended, td, td_runq);
1841         TD_CLR_SUSPENDED(td);
1842         p->p_suspcount--;
1843         setrunnable(td);
1844 }
1845
1846 /*
1847  * Allow all threads blocked by single threading to continue running.
1848  */
1849 void
1850 thread_unsuspend(struct proc *p)
1851 {
1852         struct thread *td;
1853
1854         mtx_assert(&sched_lock, MA_OWNED);
1855         PROC_LOCK_ASSERT(p, MA_OWNED);
1856         if (!P_SHOULDSTOP(p)) {
1857                 while (( td = TAILQ_FIRST(&p->p_suspended))) {
1858                         thread_unsuspend_one(td);
1859                 }
1860         } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
1861             (p->p_numthreads == p->p_suspcount)) {
1862                 /*
1863                  * Stopping everything also did the job for the single
1864                  * threading request. Now we've downgraded to single-threaded,
1865                  * let it continue.
1866                  */
1867                 thread_unsuspend_one(p->p_singlethread);
1868         }
1869 }
1870
1871 void
1872 thread_single_end(void)
1873 {
1874         struct thread *td;
1875         struct proc *p;
1876
1877         td = curthread;
1878         p = td->td_proc;
1879         PROC_LOCK_ASSERT(p, MA_OWNED);
1880         p->p_flag &= ~P_STOPPED_SINGLE;
1881         p->p_singlethread = NULL;
1882         /*
1883          * If there are other threads they mey now run,
1884          * unless of course there is a blanket 'stop order'
1885          * on the process. The single threader must be allowed
1886          * to continue however as this is a bad place to stop.
1887          */
1888         if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
1889                 mtx_lock_spin(&sched_lock);
1890                 while (( td = TAILQ_FIRST(&p->p_suspended))) {
1891                         thread_unsuspend_one(td);
1892                 }
1893                 mtx_unlock_spin(&sched_lock);
1894         }
1895 }
1896
1897