2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
42 #include "opt_ktrace.h"
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/kernel.h>
49 #include <sys/sysctl.h>
51 #include <sys/malloc.h>
52 #include <sys/mutex.h>
54 #include <sys/resourcevar.h>
55 #include <sys/syscall.h>
56 #include <sys/vnode.h>
59 #include <sys/ktrace.h>
60 #include <sys/kthread.h>
61 #include <sys/unistd.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_extern.h>
69 #include <vm/vm_zone.h>
71 #include <sys/vmmeter.h>
/*
 * NOTE(review): this view of the file has lines elided (the embedded
 * original line numbers are discontinuous); in particular the brace
 * lines of "struct forklist" are not visible here.
 */
/* Malloc type tag for fork-callout list entries allocated in at_fork(). */
74 static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");
/* Tunable exported read/write via sysctl as kern.fast_vfork. */
76 static int fast_vfork = 1;
77 SYSCTL_INT(_kern, OID_AUTO, fast_vfork, CTLFLAG_RW, &fast_vfork, 0,
78 "flag to indicate whether we have a fast vfork()");
81 * These are the structures used to create a callout list for things to do
82 * when forking a process
/* TAILQ linkage field of struct forklist (enclosing braces elided in this view). */
86 TAILQ_ENTRY(forklist) next;
/* sx lock protecting fork_list; initialized at boot by init_fork_list(). */
89 static struct sx fork_list_lock;
91 TAILQ_HEAD(forklist_head, forklist);
/* Global list of callbacks invoked for every fork (walked in fork1()). */
92 static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);
94 #ifndef _SYS_SYSPROTO_H_
/*
 * One-time boot-time initializer for fork_list_lock, scheduled by the
 * SYSINIT below at the SI_SUB_INTRINSIC stage.  The "data" argument
 * supplied by SYSINIT is unused.
 */
101 init_fork_list(void *data __unused)
104 sx_init(&fork_list_lock, "fork list");
106 SYSINIT(fork_list, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_fork_list, NULL);
/*
 * fork(2) system call body (surrounding lines elided in this view):
 * create a child with a copied file descriptor table (RFFDG) as a new
 * process (RFPROC).
 */
115 struct fork_args *uap;
121 error = fork1(p, RFFDG | RFPROC, &p2);
/* On success, the parent returns the child's pid. */
123 p->p_retval[0] = p2->p_pid;
/*
 * vfork(2) system call body (surrounding lines elided in this view):
 * like fork(), but the child shares the parent's address space (RFMEM)
 * and the parent waits for the child to exec or exit (RFPPWAIT).
 */
137 struct vfork_args *uap;
143 error = fork1(p, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
/* On success, the parent returns the child's pid. */
145 p->p_retval[0] = p2->p_pid;
/*
 * rfork(2) system call body (surrounding lines elided in this view):
 * the caller supplies the RF* flags directly; kernel-internal flags
 * are stripped before calling fork1().
 */
158 struct rfork_args *uap;
163 /* mask kernel only flags out of the user flags */
165 error = fork1(p, uap->flags & ~RFKERNELONLY, &p2);
/* p2 may be NULL (e.g. no new process was created); return 0 then. */
167 p->p_retval[0] = p2 ? p2->p_pid : 0;
/* Current number of processes in the system (read in fork1()'s limit check). */
175 int nprocs = 1; /* process 0 */
/* Read-only sysctl exporting lastpid; lastpid's declaration is elided here. */
177 SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
181 * Random component to lastpid generation. We mix in a random factor to make
182 * it a little harder to predict. We sanity check the modulus value to avoid
183 * doing it in critical paths. Don't let it be too small or we pointlessly
184 * waste randomness entropy, and don't let it be impossibly large. Using a
185 * modulus that is too big causes a LOT more process table scans and slows
186 * down fork processing as the pidchecked caching is defeated.
/* 0 disables PID randomization; otherwise the modulus set via kern.randompid. */
188 static int randompid = 0;
/*
 * Sysctl handler for kern.randompid: report the current modulus and,
 * on write, clamp the new value into a sane range before storing it.
 * NOTE(review): the branch bodies between the visible lines are elided
 * in this view; the conditions' intent is given by the trailing comments.
 */
191 sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
196 error = sysctl_handle_int(oidp, &pid, 0, req);
/* Read-only access or copyin error: nothing further to do. */
197 if (error || !req->newptr)
199 if (pid < 0 || pid > PID_MAX - 100) /* out of range */
201 else if (pid < 2) /* NOP */
203 else if (pid < 100) /* Make it reasonable */
209 SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
210 0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
/*
 * fork1 -- common backend for fork(), vfork() and rfork().
 *
 * p1    - parent process
 * flags - RF* flags (<sys/unistd.h>) selecting what the child shares
 *         with or copies from the parent
 * procp - on success receives the new child proc pointer
 *
 * NOTE(review): many lines of this function are elided in this view
 * (the embedded original line numbers jump).  The comments below only
 * describe what the visible lines establish; error paths, some lock
 * releases and several assignments are among the missing lines.
 */
213 fork1(p1, flags, procp)
214 struct proc *p1; /* parent proc */
216 struct proc **procp; /* child proc */
218 struct proc *p2, *pptr;
220 struct proc *newproc;
223 static int pidchecked = 0;
229 /* Can't copy and clear */
230 if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
234 * Here we don't create a new process, but we divorce
235 * certain parts of a process from itself.
237 if ((flags & RFPROC) == 0) {
/* No new proc: let vm_fork() apply the divorce flags to p1 itself. */
238 vm_fork(p1, 0, flags);
241 * Close all file descriptors.
243 if (flags & RFCFDG) {
244 struct filedesc *fdtmp;
253 * Unshare file descriptors (from parent.)
256 if (p1->p_fd->fd_refcnt > 1) {
257 struct filedesc *newfd;
270 * Although process entries are dynamically created, we still keep
271 * a global limit on the maximum number we will create. Don't allow
272 * a nonprivileged user to use the last process; don't let root
273 * exceed the limit. The variable nprocs is the current number of
274 * processes, maxproc is the limit.
276 uid = p1->p_ucred->cr_ruid;
277 if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
282 * Increment the nprocs resource before blocking can occur. There
283 * are hard-limits as to the number of processes that can run.
288 * Increment the count of procs running with this uid. Don't allow
289 * a nonprivileged user to exceed their current limit.
291 ok = chgproccnt(p1->p_ucred->cr_ruidinfo, 1,
292 (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
295 * Back out the process count
301 /* Allocate new proc. */
302 newproc = zalloc(proc_zone);
305 * Setup linkage for kernel based threading
307 if((flags & RFTHREAD) != 0) {
/* RFTHREAD: link the child into the parent's peer list, same leader. */
308 newproc->p_peers = p1->p_peers;
309 p1->p_peers = newproc;
310 newproc->p_leader = p1->p_leader;
/* Otherwise the child starts a fresh peer group with itself as leader. */
312 newproc->p_peers = NULL;
313 newproc->p_leader = newproc;
316 newproc->p_vmspace = NULL;
319 * Find an unused process ID. We remember a range of unused IDs
320 * ready to use (from lastpid+1 through pidchecked-1).
322 * If RFHIGHPID is set (used during system boot), do not allocate
325 sx_xlock(&allproc_lock);
326 trypid = lastpid + 1;
327 if (flags & RFHIGHPID) {
/* Optional randomization of the next pid (see kern.randompid). */
333 trypid += arc4random() % randompid;
337 * If the process ID prototype has wrapped around,
338 * restart somewhat above 0, as the low-numbered procs
339 * tend to include daemons that don't exit.
341 if (trypid >= PID_MAX) {
342 trypid = trypid % PID_MAX;
347 if (trypid >= pidchecked) {
350 pidchecked = PID_MAX;
352 * Scan the active and zombie procs to check whether this pid
353 * is in use. Remember the lowest pid that's greater
354 * than trypid, so we can avoid checking for a while.
356 p2 = LIST_FIRST(&allproc);
358 for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
/* trypid collides with a pid, pgrp id, or session id: advance it. */
359 while (p2->p_pid == trypid ||
360 p2->p_pgrp->pg_id == trypid ||
361 p2->p_session->s_sid == trypid) {
363 if (trypid >= pidchecked)
/* Track the smallest in-use id above trypid to bound future scans. */
366 if (p2->p_pid > trypid && pidchecked > p2->p_pid)
367 pidchecked = p2->p_pid;
368 if (p2->p_pgrp->pg_id > trypid &&
369 pidchecked > p2->p_pgrp->pg_id)
370 pidchecked = p2->p_pgrp->pg_id;
371 if (p2->p_session->s_sid > trypid &&
372 pidchecked > p2->p_session->s_sid)
373 pidchecked = p2->p_session->s_sid;
/* Repeat the scan over the zombie list (loop lines elided here). */
377 p2 = LIST_FIRST(&zombproc);
383 * RFHIGHPID does not mess with the lastpid counter during boot.
385 if (flags & RFHIGHPID)
/*
 * NOTE(review): the assignment of newproc to p2 occurs in elided
 * lines; from here on p2 is the child being constructed.
 */
391 p2->p_stat = SIDL; /* protect against others */
393 LIST_INSERT_HEAD(&allproc, p2, p_list);
394 LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
395 sx_xunlock(&allproc_lock);
398 * Make a proc table entry for the new process.
399 * Start by zeroing the section of proc that is zero-initialized,
400 * then copy the section that is copied directly from the parent.
402 bzero(&p2->p_startzero,
403 (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
405 bcopy(&p1->p_startcopy, &p2->p_startcopy,
406 (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
409 mtx_init(&p2->p_mtx, "process lock", MTX_DEF);
413 * Duplicate sub-structures as needed.
414 * Increase reference counts on shared objects.
415 * The p_stats and p_sigacts substructs are set in vm_fork.
418 mtx_lock_spin(&sched_lock);
419 p2->p_sflag = PS_INMEM;
/* Inherit profiling state from the parent (body of the if is elided). */
420 if (p1->p_sflag & PS_PROFIL)
422 mtx_unlock_spin(&sched_lock);
424 * We start off holding one spinlock after fork: sched_lock.
/* Share the parent's credentials (refcount bump presumably elided). */
428 p2->p_ucred = p1->p_ucred;
431 p2->p_args->ar_ref++;
433 if (flags & RFSIGSHARE) {
/* RFSIGSHARE: share the parent's signal state, bumping its refcount. */
434 p2->p_procsig = p1->p_procsig;
435 p2->p_procsig->ps_refcnt++;
436 if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
437 struct sigacts *newsigacts;
441 /* Create the shared sigacts structure */
442 MALLOC(newsigacts, struct sigacts *,
443 sizeof(struct sigacts), M_SUBPROC, M_WAITOK);
447 * Set p_sigacts to the new shared structure.
448 * Note that this is updating p1->p_sigacts at the
449 * same time, since p_sigacts is just a pointer to
450 * the shared p_procsig->ps_sigacts.
452 p2->p_sigacts = newsigacts;
453 *p2->p_sigacts = p1->p_addr->u_sigacts;
/* No RFSIGSHARE: give the child its own copy of the signal state. */
458 MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig),
459 M_SUBPROC, M_WAITOK);
462 bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig));
463 p2->p_procsig->ps_refcnt = 1;
464 p2->p_sigacts = NULL; /* finished in vm_fork() */
/* Linux threads emulation wants SIGUSR1 instead of SIGCHLD on exit. */
466 if (flags & RFLINUXTHPN)
467 p2->p_sigparent = SIGUSR1;
469 p2->p_sigparent = SIGCHLD;
471 /* bump references to the text vnode (for procfs) */
472 p2->p_textvp = p1->p_textvp;
480 else if (flags & RFFDG)
488 * If p_limit is still copy-on-write, bump refcnt,
489 * otherwise get a copy that won't be modified.
490 * (If PL_SHAREMOD is clear, the structure is shared
494 if (p1->p_limit->p_lflags & PL_SHAREMOD)
495 p2->p_limit = limcopy(p1->p_limit);
497 p2->p_limit = p1->p_limit;
498 p2->p_limit->p_refcnt++;
502 * Preserve some more flags in subprocess. PS_PROFIL has already
505 p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK);
506 if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
507 p2->p_flag |= P_CONTROLT;
508 if (flags & RFPPWAIT)
509 p2->p_flag |= P_PPWAIT;
/* Put the child on the parent's process-group member list. */
511 LIST_INSERT_AFTER(p1, p2, p_pglist);
516 * Attach the new process to its parent.
518 * If RFNOWAIT is set, the newly created process becomes a child
519 * of init. This effectively disassociates the child from the
522 if (flags & RFNOWAIT)
526 sx_xlock(&proctree_lock);
/* pptr is either p1 or initproc (RFNOWAIT); assignment lines elided. */
530 LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
531 sx_xunlock(&proctree_lock);
533 LIST_INIT(&p2->p_children);
534 LIST_INIT(&p2->p_contested);
536 callout_init(&p2->p_itcallout, 0);
537 callout_init(&p2->p_slpcallout, 1);
542 * Copy traceflag and tracefile if enabled.
543 * If not inherited, these were zeroed above.
545 if (p1->p_traceflag & KTRFAC_INHERIT) {
546 p2->p_traceflag = p1->p_traceflag;
547 if ((p2->p_tracep = p1->p_tracep) != NULL) {
558 * set priority of child to be that of parent
560 mtx_lock_spin(&sched_lock);
561 p2->p_estcpu = p1->p_estcpu;
562 mtx_unlock_spin(&sched_lock);
565 * This begins the section where we must prevent the parent
566 * from being swapped.
573 * Finish creating the child process. It will return via a different
574 * execution path later. (ie: directly into user mode)
576 vm_fork(p1, p2, flags);
/* vmmeter accounting: classify this fork as fork/vfork/kthread/rfork. */
578 if (flags == (RFFDG | RFPROC)) {
580 cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
581 } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
583 cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
584 } else if (p1 == &proc0) {
586 cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
589 cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
593 * Both processes are set up, now check if any loadable modules want
594 * to adjust anything.
595 * What if they have an error? XXX
597 sx_slock(&fork_list_lock);
598 TAILQ_FOREACH(ep, &fork_list, next) {
599 (*ep->function)(p1, p2, flags);
601 sx_sunlock(&fork_list_lock);
604 * If RFSTOPPED not requested, make child runnable and add to
607 microtime(&(p2->p_stats->p_start));
608 p2->p_acflag = AFORK;
609 if ((flags & RFSTOPPED) == 0) {
610 mtx_lock_spin(&sched_lock);
613 mtx_unlock_spin(&sched_lock);
617 * Now can be swapped.
623 * tell any interested parties about the new process
625 KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);
629 * Preserve synchronization semantics of vfork. If waiting for
630 * child to exec or exit, set P_PPWAIT on child, and sleep on our
631 * proc (in case of exit).
/* Sleep until the child clears P_PPWAIT (on exec or exit). */
634 while (p2->p_flag & P_PPWAIT)
635 msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0);
639 * Return child proc pointer to parent.
646 * The next two functions are general routines to handle adding/deleting
647 * items on the fork callout list.
650 * Take the arguments given and put them onto the fork callout list,
651 * However first make sure that it's not already there.
652 * Returns 0 on success or a standard error number.
/*
 * NOTE(review): the function's own name/opening line for at_fork() is
 * elided in this view; the lines below are its K&R parameter
 * declaration and body fragments.
 */
657 forklist_fn function;
662 /* let the programmer know if he's been stupid */
/* Duplicate registration: remove the old entry first and warn. */
663 if (rm_at_fork(function))
664 printf("WARNING: fork callout entry (%p) already present\n",
/* M_NOWAIT can return NULL; the NULL check appears to be among the elided lines. */
667 ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT);
670 ep->function = function;
/* Append the new entry under the exclusive fork-list lock. */
671 sx_xlock(&fork_list_lock);
672 TAILQ_INSERT_TAIL(&fork_list, ep, next);
673 sx_xunlock(&fork_list_lock);
678 * Scan the fork callout list for the given item and remove it.
679 * Returns the number of items removed (0 or 1)
/*
 * NOTE(review): the function's own name/opening line for rm_at_fork()
 * is elided in this view; the lines below are its K&R parameter
 * declaration and body fragments.
 */
684 forklist_fn function;
688 sx_xlock(&fork_list_lock);
689 TAILQ_FOREACH(ep, &fork_list, next) {
690 if (ep->function == function) {
/* Found: unlink, drop the lock, and (in elided lines) free/return 1. */
691 TAILQ_REMOVE(&fork_list, ep, next);
692 sx_xunlock(&fork_list_lock);
/* Not found: drop the lock (elided lines presumably return 0). */
697 sx_xunlock(&fork_list_lock);
702 * Handle the return of a child process from fork1(). This function
703 * is called from the MD fork_trampoline() entry point.
/*
 * callout - handler to run in the new process's context (fork_return()
 *           for ordinary forks; kernel threads install their own via
 *           cpu_set_fork_handler)
 * arg     - opaque argument passed through to the callout (declaration
 *           line elided in this view)
 * frame   - trap frame to return to user mode with
 */
706 fork_exit(callout, arg, frame)
707 void (*callout)(void *, struct trapframe *);
709 struct trapframe *frame;
716 * Setup the sched_lock state so that we can release it.
/* The child wakes up owning sched_lock; rebuild its state so it can unlock. */
718 sched_lock.mtx_lock = (uintptr_t)p;
719 sched_lock.mtx_recurse = 0;
721 * XXX: We really shouldn't have to do this.
723 mtx_intr_enable(&sched_lock);
724 mtx_unlock_spin(&sched_lock);
/* First time on this CPU: establish a valid switchtime/switchticks. */
727 if (PCPU_GET(switchtime.tv_sec) == 0)
728 microuptime(PCPU_PTR(switchtime));
729 PCPU_SET(switchticks, ticks);
733 * cpu_set_fork_handler intercepts this function call to
734 * have this call a non-return function to stay in kernel mode.
735 * initproc has its own fork handler, but it does return.
737 KASSERT(callout != NULL, ("NULL callout in fork_exit"));
741 * Check if a kernel thread misbehaved and returned from its main
/* A kernel thread's callout must not return; log it if it did. */
745 if (p->p_flag & P_KTHREAD) {
748 printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
749 p->p_comm, p->p_pid);
/* Giant must not be held when heading back to user mode. */
753 mtx_assert(&Giant, MA_NOTOWNED);
757 * Simplified back end of syscall(), used when returning from fork()
758 * directly into user mode. Giant is not held on entry, and must not
759 * be held on return. This function is passed in to fork_exit() as the
760 * first parameter and is called when returning to a new userland process.
/*
 * p     - the new child process
 * frame - trap frame used to return to user mode
 */
763 fork_return(p, frame)
765 struct trapframe *frame;
/* Standard return-to-userland processing (signals, priority, etc.). */
768 userret(p, frame, 0);
/* ktrace: record the syscall return of fork() in the child. */
770 if (KTRPOINT(p, KTR_SYSRET)) {
771 ktrsysret(p->p_tracep, SYS_fork, 0, 0);
774 mtx_assert(&Giant, MA_NOTOWNED);