2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
42 #include "opt_ktrace.h"
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/kernel.h>
49 #include <sys/sysctl.h>
51 #include <sys/malloc.h>
52 #include <sys/mutex.h>
54 #include <sys/resourcevar.h>
55 #include <sys/syscall.h>
56 #include <sys/vnode.h>
59 #include <sys/ktrace.h>
60 #include <sys/kthread.h>
61 #include <sys/unistd.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_extern.h>
69 #include <vm/vm_zone.h>
71 #include <sys/vmmeter.h>
/*
 * NOTE(review): this view of the file has lines elided (the embedded
 * original line numbers are discontinuous); in particular the brace
 * lines of "struct forklist" are not visible here.
 */
/* Malloc type tag for fork-callout list entries allocated in at_fork(). */
74 static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");
/* Tunable exported read/write via sysctl as kern.fast_vfork. */
76 static int fast_vfork = 1;
77 SYSCTL_INT(_kern, OID_AUTO, fast_vfork, CTLFLAG_RW, &fast_vfork, 0,
78 "flag to indicate whether we have a fast vfork()");
81 * These are the structures used to create a callout list for things to do
82 * when forking a process
/* TAILQ linkage field of struct forklist (enclosing braces elided in this view). */
86 TAILQ_ENTRY(forklist) next;
/* sx lock protecting fork_list; initialized at boot by init_fork_list(). */
89 static struct sx fork_list_lock;
91 TAILQ_HEAD(forklist_head, forklist);
/* Global list of callbacks invoked for every fork (walked in fork1()). */
92 static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);
94 #ifndef _SYS_SYSPROTO_H_
/*
 * One-time boot-time initializer for fork_list_lock, scheduled by the
 * SYSINIT below at the SI_SUB_INTRINSIC stage.  The "data" argument
 * supplied by SYSINIT is unused.
 */
101 init_fork_list(void *data __unused)
104 sx_init(&fork_list_lock, "fork list");
106 SYSINIT(fork_list, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_fork_list, NULL);
/*
 * fork(2) system call body (surrounding lines elided in this view):
 * create a child with a copied file descriptor table (RFFDG) as a new
 * process (RFPROC).
 */
115 struct fork_args *uap;
121 error = fork1(p, RFFDG | RFPROC, &p2);
/* On success, the parent returns the child's pid. */
123 p->p_retval[0] = p2->p_pid;
/*
 * vfork(2) system call body (surrounding lines elided in this view):
 * like fork(), but the child shares the parent's address space (RFMEM)
 * and the parent waits for the child to exec or exit (RFPPWAIT).
 */
137 struct vfork_args *uap;
143 error = fork1(p, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
/* On success, the parent returns the child's pid. */
145 p->p_retval[0] = p2->p_pid;
/*
 * rfork(2) system call body (surrounding lines elided in this view):
 * the caller supplies the RF* flags directly; kernel-internal flags
 * are stripped before calling fork1().
 */
158 struct rfork_args *uap;
163 /* mask kernel only flags out of the user flags */
165 error = fork1(p, uap->flags & ~RFKERNELONLY, &p2);
/* p2 may be NULL (e.g. no new process was created); return 0 then. */
167 p->p_retval[0] = p2 ? p2->p_pid : 0;
/* Current number of processes in the system (read in fork1()'s limit check). */
175 int nprocs = 1; /* process 0 */
/* Read-only sysctl exporting lastpid; lastpid's declaration is elided here. */
177 SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
181 * Random component to lastpid generation. We mix in a random factor to make
182 * it a little harder to predict. We sanity check the modulus value to avoid
183 * doing it in critical paths. Don't let it be too small or we pointlessly
184 * waste randomness entropy, and don't let it be impossibly large. Using a
185 * modulus that is too big causes a LOT more process table scans and slows
186 * down fork processing as the pidchecked caching is defeated.
/* 0 disables PID randomization; otherwise the modulus set via kern.randompid. */
188 static int randompid = 0;
/*
 * Sysctl handler for kern.randompid: report the current modulus and,
 * on write, clamp the new value into a sane range before storing it.
 * NOTE(review): the branch bodies between the visible lines are elided
 * in this view; the conditions' intent is given by the trailing comments.
 */
191 sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
196 error = sysctl_handle_int(oidp, &pid, 0, req);
/* Read-only access or copyin error: nothing further to do. */
197 if (error || !req->newptr)
199 if (pid < 0 || pid > PID_MAX - 100) /* out of range */
201 else if (pid < 2) /* NOP */
203 else if (pid < 100) /* Make it reasonable */
209 SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
210 0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
/*
 * fork1 -- common backend for fork(), vfork() and rfork().
 *
 * p1    - parent process
 * flags - RF* flags (<sys/unistd.h>) selecting what the child shares
 *         with or copies from the parent
 * procp - on success receives the new child proc pointer
 *
 * NOTE(review): many lines of this function are elided in this view
 * (the embedded original line numbers jump).  The comments below only
 * describe what the visible lines establish; error paths, some lock
 * releases and several assignments are among the missing lines.
 */
213 fork1(p1, flags, procp)
214 struct proc *p1; /* parent proc */
216 struct proc **procp; /* child proc */
218 struct proc *p2, *pptr;
220 struct proc *newproc;
223 static int pidchecked = 0;
229 /* Can't copy and clear */
230 if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
234 * Here we don't create a new process, but we divorce
235 * certain parts of a process from itself.
237 if ((flags & RFPROC) == 0) {
/* No new proc: let vm_fork() apply the divorce flags to p1 itself. */
238 vm_fork(p1, 0, flags);
241 * Close all file descriptors.
243 if (flags & RFCFDG) {
244 struct filedesc *fdtmp;
253 * Unshare file descriptors (from parent.)
256 if (p1->p_fd->fd_refcnt > 1) {
257 struct filedesc *newfd;
270 * Although process entries are dynamically created, we still keep
271 * a global limit on the maximum number we will create. Don't allow
272 * a nonprivileged user to use the last process; don't let root
273 * exceed the limit. The variable nprocs is the current number of
274 * processes, maxproc is the limit.
276 uid = p1->p_ucred->cr_ruid;
277 if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
282 * Increment the nprocs resource before blocking can occur. There
283 * are hard-limits as to the number of processes that can run.
288 * Increment the count of procs running with this uid. Don't allow
289 * a nonprivileged user to exceed their current limit.
291 ok = chgproccnt(p1->p_ucred->cr_ruidinfo, 1,
292 (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
295 * Back out the process count
301 /* Allocate new proc. */
302 newproc = zalloc(proc_zone);
305 * Setup linkage for kernel based threading
307 if((flags & RFTHREAD) != 0) {
/* RFTHREAD: link the child into the parent's peer list, same leader. */
308 newproc->p_peers = p1->p_peers;
309 p1->p_peers = newproc;
310 newproc->p_leader = p1->p_leader;
/* Otherwise the child starts a fresh peer group with itself as leader. */
312 newproc->p_peers = NULL;
313 newproc->p_leader = newproc;
316 newproc->p_vmspace = NULL;
319 * Find an unused process ID. We remember a range of unused IDs
320 * ready to use (from lastpid+1 through pidchecked-1).
322 * If RFHIGHPID is set (used during system boot), do not allocate
325 sx_xlock(&allproc_lock);
326 trypid = lastpid + 1;
327 if (flags & RFHIGHPID) {
/* Optional randomization of the next pid (see kern.randompid). */
333 trypid += arc4random() % randompid;
337 * If the process ID prototype has wrapped around,
338 * restart somewhat above 0, as the low-numbered procs
339 * tend to include daemons that don't exit.
341 if (trypid >= PID_MAX) {
342 trypid = trypid % PID_MAX;
347 if (trypid >= pidchecked) {
350 pidchecked = PID_MAX;
352 * Scan the active and zombie procs to check whether this pid
353 * is in use. Remember the lowest pid that's greater
354 * than trypid, so we can avoid checking for a while.
356 p2 = LIST_FIRST(&allproc);
358 for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
/* trypid collides with a pid, pgrp id, or session id: advance it. */
359 while (p2->p_pid == trypid ||
360 p2->p_pgrp->pg_id == trypid ||
361 p2->p_session->s_sid == trypid) {
363 if (trypid >= pidchecked)
/* Track the smallest in-use id above trypid to bound future scans. */
366 if (p2->p_pid > trypid && pidchecked > p2->p_pid)
367 pidchecked = p2->p_pid;
368 if (p2->p_pgrp->pg_id > trypid &&
369 pidchecked > p2->p_pgrp->pg_id)
370 pidchecked = p2->p_pgrp->pg_id;
371 if (p2->p_session->s_sid > trypid &&
372 pidchecked > p2->p_session->s_sid)
373 pidchecked = p2->p_session->s_sid;
/* Repeat the scan over the zombie list (loop lines elided here). */
377 p2 = LIST_FIRST(&zombproc);
383 * RFHIGHPID does not mess with the lastpid counter during boot.
385 if (flags & RFHIGHPID)
/*
 * NOTE(review): the assignment of newproc to p2 occurs in elided
 * lines; from here on p2 is the child being constructed.
 */
391 p2->p_stat = SIDL; /* protect against others */
393 LIST_INSERT_HEAD(&allproc, p2, p_list);
394 LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
395 sx_xunlock(&allproc_lock);
398 * Make a proc table entry for the new process.
399 * Start by zeroing the section of proc that is zero-initialized,
400 * then copy the section that is copied directly from the parent.
402 bzero(&p2->p_startzero,
403 (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
405 bcopy(&p1->p_startcopy, &p2->p_startcopy,
406 (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
409 mtx_init(&p2->p_mtx, "process lock", MTX_DEF);
413 * Duplicate sub-structures as needed.
414 * Increase reference counts on shared objects.
415 * The p_stats and p_sigacts substructs are set in vm_fork.
418 mtx_lock_spin(&sched_lock);
419 p2->p_sflag = PS_INMEM;
/* Inherit profiling state from the parent (body of the if is elided). */
420 if (p1->p_sflag & PS_PROFIL)
422 mtx_unlock_spin(&sched_lock);
424 * We start off holding one spinlock after fork: sched_lock.
/* Share the parent's credentials (refcount bump presumably elided). */
428 p2->p_ucred = p1->p_ucred;
431 p2->p_args->ar_ref++;
433 if (flags & RFSIGSHARE) {
/* RFSIGSHARE: share the parent's signal state, bumping its refcount. */
434 p2->p_procsig = p1->p_procsig;
435 p2->p_procsig->ps_refcnt++;
436 if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
437 struct sigacts *newsigacts;
441 /* Create the shared sigacts structure */
442 MALLOC(newsigacts, struct sigacts *,
443 sizeof(struct sigacts), M_SUBPROC, M_WAITOK);
447 * Set p_sigacts to the new shared structure.
448 * Note that this is updating p1->p_sigacts at the
449 * same time, since p_sigacts is just a pointer to
450 * the shared p_procsig->ps_sigacts.
452 p2->p_sigacts = newsigacts;
453 *p2->p_sigacts = p1->p_addr->u_sigacts;
/* No RFSIGSHARE: give the child its own copy of the signal state. */
458 MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig),
459 M_SUBPROC, M_WAITOK);
462 bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig));
463 p2->p_procsig->ps_refcnt = 1;
464 p2->p_sigacts = NULL; /* finished in vm_fork() */
/* Linux threads emulation wants SIGUSR1 instead of SIGCHLD on exit. */
466 if (flags & RFLINUXTHPN)
467 p2->p_sigparent = SIGUSR1;
469 p2->p_sigparent = SIGCHLD;
471 /* bump references to the text vnode (for procfs) */
472 p2->p_textvp = p1->p_textvp;
480 else if (flags & RFFDG)
488 * If p_limit is still copy-on-write, bump refcnt,
489 * otherwise get a copy that won't be modified.
490 * (If PL_SHAREMOD is clear, the structure is shared
494 if (p1->p_limit->p_lflags & PL_SHAREMOD)
495 p2->p_limit = limcopy(p1->p_limit);
497 p2->p_limit = p1->p_limit;
498 p2->p_limit->p_refcnt++;
502 * Preserve some more flags in subprocess. PS_PROFIL has already
505 p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK);
506 if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
507 p2->p_flag |= P_CONTROLT;
508 if (flags & RFPPWAIT)
509 p2->p_flag |= P_PPWAIT;
/* Put the child on the parent's process-group member list. */
511 LIST_INSERT_AFTER(p1, p2, p_pglist);
516 * Attach the new process to its parent.
518 * If RFNOWAIT is set, the newly created process becomes a child
519 * of init. This effectively disassociates the child from the
522 if (flags & RFNOWAIT)
526 sx_xlock(&proctree_lock);
/* pptr is either p1 or initproc (RFNOWAIT); assignment lines elided. */
530 LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
531 sx_xunlock(&proctree_lock);
533 LIST_INIT(&p2->p_children);
534 LIST_INIT(&p2->p_contested);
536 callout_init(&p2->p_itcallout, 0);
537 callout_init(&p2->p_slpcallout, 1);
542 * Copy traceflag and tracefile if enabled.
543 * If not inherited, these were zeroed above.
545 if (p1->p_traceflag & KTRFAC_INHERIT) {
546 p2->p_traceflag = p1->p_traceflag;
547 if ((p2->p_tracep = p1->p_tracep) != NULL) {
558 * set priority of child to be that of parent
560 mtx_lock_spin(&sched_lock);
561 p2->p_estcpu = p1->p_estcpu;
562 mtx_unlock_spin(&sched_lock);
565 * This begins the section where we must prevent the parent
566 * from being swapped.
573 * Finish creating the child process. It will return via a different
574 * execution path later. (ie: directly into user mode)
576 vm_fork(p1, p2, flags);
/* vmmeter accounting: classify this fork as fork/vfork/kthread/rfork. */
578 if (flags == (RFFDG | RFPROC)) {
580 cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
581 } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
583 cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
584 } else if (p1 == &proc0) {
586 cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
589 cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
593 * Both processes are set up, now check if any loadable modules want
594 * to adjust anything.
595 * What if they have an error? XXX
597 sx_slock(&fork_list_lock);
598 TAILQ_FOREACH(ep, &fork_list, next) {
599 (*ep->function)(p1, p2, flags);
601 sx_sunlock(&fork_list_lock);
604 * If RFSTOPPED not requested, make child runnable and add to
607 microtime(&(p2->p_stats->p_start));
608 p2->p_acflag = AFORK;
609 if ((flags & RFSTOPPED) == 0) {
610 mtx_lock_spin(&sched_lock);
613 mtx_unlock_spin(&sched_lock);
617 * Now can be swapped.
623 * tell any interested parties about the new process
625 KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);
629 * Preserve synchronization semantics of vfork. If waiting for
630 * child to exec or exit, set P_PPWAIT on child, and sleep on our
631 * proc (in case of exit).
/* Sleep until the child clears P_PPWAIT (on exec or exit). */
634 while (p2->p_flag & P_PPWAIT)
635 msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0);
639 * Return child proc pointer to parent.
646 * The next two functions are general routines to handle adding/deleting
647 * items on the fork callout list.
650 * Take the arguments given and put them onto the fork callout list,
651 * However first make sure that it's not already there.
652 * Returns 0 on success or a standard error number.
/*
 * NOTE(review): the function's own name/opening line for at_fork() is
 * elided in this view; the lines below are its K&R parameter
 * declaration and body fragments.
 */
657 forklist_fn function;
662 /* let the programmer know if he's been stupid */
/* Duplicate registration: remove the old entry first and warn. */
663 if (rm_at_fork(function))
664 printf("WARNING: fork callout entry (%p) already present\n",
/* M_NOWAIT can return NULL; the NULL check appears to be among the elided lines. */
667 ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT);
670 ep->function = function;
/* Append the new entry under the exclusive fork-list lock. */
671 sx_xlock(&fork_list_lock);
672 TAILQ_INSERT_TAIL(&fork_list, ep, next);
673 sx_xunlock(&fork_list_lock);
678 * Scan the fork callout list for the given item and remove it.
679 * Returns the number of items removed (0 or 1)
/*
 * NOTE(review): the function's own name/opening line for rm_at_fork()
 * is elided in this view; the lines below are its K&R parameter
 * declaration and body fragments.
 */
684 forklist_fn function;
688 sx_xlock(&fork_list_lock);
689 TAILQ_FOREACH(ep, &fork_list, next) {
690 if (ep->function == function) {
/* Found: unlink, drop the lock, and (in elided lines) free/return 1. */
691 TAILQ_REMOVE(&fork_list, ep, next);
692 sx_xunlock(&fork_list_lock);
/* Not found: drop the lock (elided lines presumably return 0). */
697 sx_xunlock(&fork_list_lock);
702 * Handle the return of a child process from fork1(). This function
703 * is called from the MD fork_trampoline() entry point.
/*
 * callout - handler to run in the new process's context (fork_return()
 *           for ordinary forks; kernel threads install their own via
 *           cpu_set_fork_handler)
 * arg     - opaque argument passed through to the callout (declaration
 *           line elided in this view)
 * frame   - trap frame to return to user mode with
 */
706 fork_exit(callout, arg, frame)
707 void (*callout)(void *, struct trapframe *);
709 struct trapframe *frame;
716 * Setup the sched_lock state so that we can release it.
/* The child wakes up owning sched_lock; rebuild its state so it can unlock. */
718 sched_lock.mtx_lock = (uintptr_t)p;
719 sched_lock.mtx_recurse = 0;
721 * XXX: We really shouldn't have to do this.
723 mtx_intr_enable(&sched_lock);
724 mtx_unlock_spin(&sched_lock);
/* First time on this CPU: establish a valid switchtime/switchticks. */
727 if (PCPU_GET(switchtime.tv_sec) == 0)
728 microuptime(PCPU_PTR(switchtime));
729 PCPU_SET(switchticks, ticks);
733 * cpu_set_fork_handler intercepts this function call to
734 * have this call a non-return function to stay in kernel mode.
735 * initproc has its own fork handler, but it does return.
737 KASSERT(callout != NULL, ("NULL callout in fork_exit"));
741 * Check if a kernel thread misbehaved and returned from its main
/* A kernel thread's callout must not return; log it if it did. */
745 if (p->p_flag & P_KTHREAD) {
748 printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
749 p->p_comm, p->p_pid);
/* Giant must not be held when heading back to user mode. */
753 mtx_assert(&Giant, MA_NOTOWNED);
757 * Simplified back end of syscall(), used when returning from fork()
758 * directly into user mode. Giant is not held on entry, and must not
759 * be held on return. This function is passed in to fork_exit() as the
760 * first parameter and is called when returning to a new userland process.
/*
 * p     - the new child process
 * frame - trap frame used to return to user mode
 */
763 fork_return(p, frame)
765 struct trapframe *frame;
/* Standard return-to-userland processing (signals, priority, etc.). */
768 userret(p, frame, 0);
/* ktrace: record the syscall return of fork() in the child. */
770 if (KTRPOINT(p, KTR_SYSRET)) {
771 ktrsysret(p->p_tracep, SYS_fork, 0, 0);
774 mtx_assert(&Giant, MA_NOTOWNED);