sys/kern/kern_resource.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)kern_resource.c     8.5 (Berkeley) 1/21/94
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_compat.h"
  41
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/sysproto.h>
  45 #include <sys/file.h>
  46 #include <sys/kernel.h>
  47 #include <sys/lock.h>
  48 #include <sys/malloc.h>
  49 #include <sys/mutex.h>
  50 #include <sys/priv.h>
  51 #include <sys/proc.h>
  52 #include <sys/refcount.h>
  53 #include <sys/resourcevar.h>
  54 #include <sys/sched.h>
  55 #include <sys/sx.h>
  56 #include <sys/syscallsubr.h>
  57 #include <sys/sysent.h>
  58 #include <sys/time.h>
  59
  60 #include <vm/vm.h>
  61 #include <vm/vm_param.h>
  62 #include <vm/pmap.h>
  63 #include <vm/vm_map.h>
  64
  65
   66 static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
   67 static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
   68 #define UIHASH(uid)     (&uihashtbl[(uid) & uihash])    /* list head that uid hashes to */
   69 static struct mtx uihashtbl_mtx;        /* NOTE(review): presumably guards uihashtbl -- confirm */
   70 static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;       /* table of uidinfo lists, indexed via UIHASH() */
   71 static u_long uihash;           /* size of hash table - 1; the mask applied in UIHASH() */
   72
   73 static void     calcru1(struct proc *p, struct rusage_ext *ruxp,
   74                     struct timeval *up, struct timeval *sp);
   75 static int      donice(struct thread *td, struct proc *chgp, int n);
   76 static struct uidinfo *uilookup(uid_t uid);
  77
  78 /*
  79  * Resource controls and accounting.
  80  */
  81 #ifndef _SYS_SYSPROTO_H_
  82 struct getpriority_args {
  83         int     which;
  84         int     who;
  85 };
  86 #endif
  87 int
  88 getpriority(td, uap)
  89         struct thread *td;
  90         register struct getpriority_args *uap;
  91 {
  92         struct proc *p;
  93         struct pgrp *pg;
  94         int error, low;
  95
  96         error = 0;
  97         low = PRIO_MAX + 1;
  98         switch (uap->which) {
  99
 100         case PRIO_PROCESS:
 101                 if (uap->who == 0)
 102                         low = td->td_proc->p_nice;
 103                 else {
 104                         p = pfind(uap->who);
 105                         if (p == NULL)
 106                                 break;
 107                         if (p_cansee(td, p) == 0)
 108                                 low = p->p_nice;
 109                         PROC_UNLOCK(p);
 110                 }
 111                 break;
 112
 113         case PRIO_PGRP:
 114                 sx_slock(&proctree_lock);
 115                 if (uap->who == 0) {
 116                         pg = td->td_proc->p_pgrp;
 117                         PGRP_LOCK(pg);
 118                 } else {
 119                         pg = pgfind(uap->who);
 120                         if (pg == NULL) {
 121                                 sx_sunlock(&proctree_lock);
 122                                 break;
 123                         }
 124                 }
 125                 sx_sunlock(&proctree_lock);
 126                 LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 127                         PROC_LOCK(p);
 128                         if (!p_cansee(td, p)) {
 129                                 if (p->p_nice < low)
 130                                         low = p->p_nice;
 131                         }
 132                         PROC_UNLOCK(p);
 133                 }
 134                 PGRP_UNLOCK(pg);
 135                 break;
 136
 137         case PRIO_USER:
 138                 if (uap->who == 0)
 139                         uap->who = td->td_ucred->cr_uid;
 140                 sx_slock(&allproc_lock);
 141                 FOREACH_PROC_IN_SYSTEM(p) {
 142                         /* Do not bother to check PRS_NEW processes */
 143                         if (p->p_state == PRS_NEW)
 144                                 continue;
 145                         PROC_LOCK(p);
 146                         if (!p_cansee(td, p) &&
 147                             p->p_ucred->cr_uid == uap->who) {
 148                                 if (p->p_nice < low)
 149                                         low = p->p_nice;
 150                         }
 151                         PROC_UNLOCK(p);
 152                 }
 153                 sx_sunlock(&allproc_lock);
 154                 break;
 155
 156         default:
 157                 error = EINVAL;
 158                 break;
 159         }
 160         if (low == PRIO_MAX + 1 && error == 0)
 161                 error = ESRCH;
 162         td->td_retval[0] = low;
 163         return (error);
 164 }
 165
 166 #ifndef _SYS_SYSPROTO_H_
 167 struct setpriority_args {
 168         int     which;
 169         int     who;
 170         int     prio;
 171 };
 172 #endif
 173 int
 174 setpriority(td, uap)
 175         struct thread *td;
 176         struct setpriority_args *uap;
 177 {
 178         struct proc *curp, *p;
 179         struct pgrp *pg;
 180         int found = 0, error = 0;
 181
 182         curp = td->td_proc;
 183         switch (uap->which) {
 184         case PRIO_PROCESS:
 185                 if (uap->who == 0) {
 186                         PROC_LOCK(curp);
 187                         error = donice(td, curp, uap->prio);
 188                         PROC_UNLOCK(curp);
 189                 } else {
 190                         p = pfind(uap->who);
 191                         if (p == 0)
 192                                 break;
 193                         if (p_cansee(td, p) == 0)
 194                                 error = donice(td, p, uap->prio);
 195                         PROC_UNLOCK(p);
 196                 }
 197                 found++;
 198                 break;
 199
 200         case PRIO_PGRP:
 201                 sx_slock(&proctree_lock);
 202                 if (uap->who == 0) {
 203                         pg = curp->p_pgrp;
 204                         PGRP_LOCK(pg);
 205                 } else {
 206                         pg = pgfind(uap->who);
 207                         if (pg == NULL) {
 208                                 sx_sunlock(&proctree_lock);
 209                                 break;
 210                         }
 211                 }
 212                 sx_sunlock(&proctree_lock);
 213                 LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 214                         PROC_LOCK(p);
 215                         if (!p_cansee(td, p)) {
 216                                 error = donice(td, p, uap->prio);
 217                                 found++;
 218                         }
 219                         PROC_UNLOCK(p);
 220                 }
 221                 PGRP_UNLOCK(pg);
 222                 break;
 223
 224         case PRIO_USER:
 225                 if (uap->who == 0)
 226                         uap->who = td->td_ucred->cr_uid;
 227                 sx_slock(&allproc_lock);
 228                 FOREACH_PROC_IN_SYSTEM(p) {
 229                         PROC_LOCK(p);
 230                         if (p->p_ucred->cr_uid == uap->who &&
 231                             !p_cansee(td, p)) {
 232                                 error = donice(td, p, uap->prio);
 233                                 found++;
 234                         }
 235                         PROC_UNLOCK(p);
 236                 }
 237                 sx_sunlock(&allproc_lock);
 238                 break;
 239
 240         default:
 241                 error = EINVAL;
 242                 break;
 243         }
 244         if (found == 0 && error == 0)
 245                 error = ESRCH;
 246         return (error);
 247 }
 248
 249 /*
 250  * Set "nice" for a (whole) process.
 251  */
 252 static int
 253 donice(struct thread *td, struct proc *p, int n)
 254 {
 255         int error;
 256
 257         PROC_LOCK_ASSERT(p, MA_OWNED);
 258         if ((error = p_cansched(td, p)))
 259                 return (error);
 260         if (n > PRIO_MAX)
 261                 n = PRIO_MAX;
 262         if (n < PRIO_MIN)
 263                 n = PRIO_MIN;
 264         if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0)
 265                 return (EACCES);
 266         PROC_SLOCK(p);
 267         sched_nice(p, n);
 268         PROC_SUNLOCK(p);
 269         return (0);
 270 }
 271
 272 /*
 273  * Set realtime priority for LWP.
 274  */
 275 #ifndef _SYS_SYSPROTO_H_
 276 struct rtprio_thread_args {
 277         int             function;
 278         lwpid_t         lwpid;
 279         struct rtprio   *rtp;
 280 };
 281 #endif
 282 int
 283 rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
 284 {
 285         struct proc *curp;
 286         struct proc *p;
 287         struct rtprio rtp;
 288         struct thread *td1;
 289         int cierror, error;
 290
 291         /* Perform copyin before acquiring locks if needed. */
 292         if (uap->function == RTP_SET)
 293                 cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
 294         else
 295                 cierror = 0;
 296
 297         curp = td->td_proc;
 298         /*
 299          * Though lwpid is unique, only current process is supported
 300          * since there is no efficient way to look up a LWP yet.
 301          */
 302         p = curp;
 303         PROC_LOCK(p);
 304
 305         switch (uap->function) {
 306         case RTP_LOOKUP:
 307                 if ((error = p_cansee(td, p)))
 308                         break;
 309                 PROC_SLOCK(p);
 310                 if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
 311                         td1 = td;
 312                 else
 313                         td1 = thread_find(p, uap->lwpid);
 314                 if (td1 != NULL)
 315                         pri_to_rtp(td1, &rtp);
 316                 else
 317                         error = ESRCH;
 318                 PROC_SUNLOCK(p);
 319                 PROC_UNLOCK(p);
 320                 return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
 321         case RTP_SET:
 322                 if ((error = p_cansched(td, p)) || (error = cierror))
 323                         break;
 324
 325                 /* Disallow setting rtprio in most cases if not superuser. */
 326 /*
 327  * Realtime priority has to be restricted for reasons which should be
 328  * obvious.  However, for idle priority, there is a potential for
 329  * system deadlock if an idleprio process gains a lock on a resource
 330  * that other processes need (and the idleprio process can't run
 331  * due to a CPU-bound normal process).  Fix me!  XXX
 332  */
 333 #if 0
 334                 if (RTP_PRIO_IS_REALTIME(rtp.type)) {
 335 #else
 336                 if (rtp.type != RTP_PRIO_NORMAL) {
 337 #endif
 338                         error = priv_check(td, PRIV_SCHED_RTPRIO);
 339                         if (error)
 340                                 break;
 341                 }
 342
 343                 PROC_SLOCK(p);
 344                 if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
 345                         td1 = td;
 346                 else
 347                         td1 = thread_find(p, uap->lwpid);
 348                 if (td1 != NULL)
 349                         error = rtp_to_pri(&rtp, td1);
 350                 else
 351                         error = ESRCH;
 352                 PROC_SUNLOCK(p);
 353                 break;
 354         default:
 355                 error = EINVAL;
 356                 break;
 357         }
 358         PROC_UNLOCK(p);
 359         return (error);
 360 }
 361
 362 /*
 363  * Set realtime priority.
 364  */
 365 #ifndef _SYS_SYSPROTO_H_
 366 struct rtprio_args {
 367         int             function;
 368         pid_t           pid;
 369         struct rtprio   *rtp;
 370 };
 371 #endif
 372 int
 373 rtprio(td, uap)
 374         struct thread *td;              /* curthread */
 375         register struct rtprio_args *uap;
 376 {
 377         struct proc *curp;
 378         struct proc *p;
 379         struct thread *tdp;
 380         struct rtprio rtp;
 381         int cierror, error;
 382
 383         /* Perform copyin before acquiring locks if needed. */
 384         if (uap->function == RTP_SET)
 385                 cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
 386         else
 387                 cierror = 0;
 388
 389         curp = td->td_proc;
 390         if (uap->pid == 0) {
 391                 p = curp;
 392                 PROC_LOCK(p);
 393         } else {
 394                 p = pfind(uap->pid);
 395                 if (p == NULL)
 396                         return (ESRCH);
 397         }
 398
 399         switch (uap->function) {
 400         case RTP_LOOKUP:
 401                 if ((error = p_cansee(td, p)))
 402                         break;
 403                 PROC_SLOCK(p);
 404                 /*
 405                  * Return OUR priority if no pid specified,
 406                  * or if one is, report the highest priority
 407                  * in the process.  There isn't much more you can do as
 408                  * there is only room to return a single priority.
 409                  * XXXKSE: maybe need a new interface to report
 410                  * priorities of multiple system scope threads.
 411                  * Note: specifying our own pid is not the same
 412                  * as leaving it zero.
 413                  */
 414                 if (uap->pid == 0) {
 415                         pri_to_rtp(td, &rtp);
 416                 } else {
 417                         struct rtprio rtp2;
 418
 419                         rtp.type = RTP_PRIO_IDLE;
 420                         rtp.prio = RTP_PRIO_MAX;
 421                         FOREACH_THREAD_IN_PROC(p, tdp) {
 422                                 pri_to_rtp(tdp, &rtp2);
 423                                 if (rtp2.type <  rtp.type ||
 424                                     (rtp2.type == rtp.type &&
 425                                     rtp2.prio < rtp.prio)) {
 426                                         rtp.type = rtp2.type;
 427                                         rtp.prio = rtp2.prio;
 428                                 }
 429                         }
 430                 }
 431                 PROC_SUNLOCK(p);
 432                 PROC_UNLOCK(p);
 433                 return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
 434         case RTP_SET:
 435                 if ((error = p_cansched(td, p)) || (error = cierror))
 436                         break;
 437
 438                 /* Disallow setting rtprio in most cases if not superuser. */
 439 /*
 440  * Realtime priority has to be restricted for reasons which should be
 441  * obvious.  However, for idle priority, there is a potential for
 442  * system deadlock if an idleprio process gains a lock on a resource
 443  * that other processes need (and the idleprio process can't run
 444  * due to a CPU-bound normal process).  Fix me!  XXX
 445  */
 446 #if 0
 447                 if (RTP_PRIO_IS_REALTIME(rtp.type)) {
 448 #else
 449                 if (rtp.type != RTP_PRIO_NORMAL) {
 450 #endif
 451                         error = priv_check(td, PRIV_SCHED_RTPRIO);
 452                         if (error)
 453                                 break;
 454                 }
 455
 456                 /*
 457                  * If we are setting our own priority, set just our
 458                  * thread but if we are doing another process,
 459                  * do all the threads on that process. If we
 460                  * specify our own pid we do the latter.
 461                  */
 462                 PROC_SLOCK(p);
 463                 if (uap->pid == 0) {
 464                         error = rtp_to_pri(&rtp, td);
 465                 } else {
 466                         FOREACH_THREAD_IN_PROC(p, td) {
 467                                 if ((error = rtp_to_pri(&rtp, td)) != 0)
 468                                         break;
 469                         }
 470                 }
 471                 PROC_SUNLOCK(p);
 472                 break;
 473         default:
 474                 error = EINVAL;
 475                 break;
 476         }
 477         PROC_UNLOCK(p);
 478         return (error);
 479 }
 480
 481 int
 482 rtp_to_pri(struct rtprio *rtp, struct thread *td)
 483 {
 484         u_char  newpri;
 485
 486         if (rtp->prio > RTP_PRIO_MAX)
 487                 return (EINVAL);
 488         thread_lock(td);
 489         switch (RTP_PRIO_BASE(rtp->type)) {
 490         case RTP_PRIO_REALTIME:
 491                 newpri = PRI_MIN_REALTIME + rtp->prio;
 492                 break;
 493         case RTP_PRIO_NORMAL:
 494                 newpri = PRI_MIN_TIMESHARE + rtp->prio;
 495                 break;
 496         case RTP_PRIO_IDLE:
 497                 newpri = PRI_MIN_IDLE + rtp->prio;
 498                 break;
 499         default:
 500                 thread_unlock(td);
 501                 return (EINVAL);
 502         }
 503         sched_class(td, rtp->type);     /* XXX fix */
 504         sched_user_prio(td, newpri);
 505         if (curthread == td)
 506                 sched_prio(curthread, td->td_user_pri); /* XXX dubious */
 507         thread_unlock(td);
 508         return (0);
 509 }
 510
 511 void
 512 pri_to_rtp(struct thread *td, struct rtprio *rtp)
 513 {
 514
 515         thread_lock(td);
 516         switch (PRI_BASE(td->td_pri_class)) {
 517         case PRI_REALTIME:
 518                 rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME;
 519                 break;
 520         case PRI_TIMESHARE:
 521                 rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE;
 522                 break;
 523         case PRI_IDLE:
 524                 rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE;
 525                 break;
 526         default:
 527                 break;
 528         }
 529         rtp->type = td->td_pri_class;
 530         thread_unlock(td);
 531 }
 532
 533 #if defined(COMPAT_43)
 534 #ifndef _SYS_SYSPROTO_H_
 535 struct osetrlimit_args {
 536         u_int   which;
 537         struct  orlimit *rlp;
 538 };
 539 #endif
 540 int
 541 osetrlimit(td, uap)
 542         struct thread *td;
 543         register struct osetrlimit_args *uap;
 544 {
 545         struct orlimit olim;
 546         struct rlimit lim;
 547         int error;
 548
 549         if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
 550                 return (error);
 551         lim.rlim_cur = olim.rlim_cur;
 552         lim.rlim_max = olim.rlim_max;
 553         error = kern_setrlimit(td, uap->which, &lim);
 554         return (error);
 555 }
 556
 557 #ifndef _SYS_SYSPROTO_H_
 558 struct ogetrlimit_args {
 559         u_int   which;
 560         struct  orlimit *rlp;
 561 };
 562 #endif
 563 int
 564 ogetrlimit(td, uap)
 565         struct thread *td;
 566         register struct ogetrlimit_args *uap;
 567 {
 568         struct orlimit olim;
 569         struct rlimit rl;
 570         struct proc *p;
 571         int error;
 572
 573         if (uap->which >= RLIM_NLIMITS)
 574                 return (EINVAL);
 575         p = td->td_proc;
 576         PROC_LOCK(p);
 577         lim_rlimit(p, uap->which, &rl);
 578         PROC_UNLOCK(p);
 579
 580         /*
 581          * XXX would be more correct to convert only RLIM_INFINITY to the
 582          * old RLIM_INFINITY and fail with EOVERFLOW for other larger
 583          * values.  Most 64->32 and 32->16 conversions, including not
 584          * unimportant ones of uids are even more broken than what we
 585          * do here (they blindly truncate).  We don't do this correctly
 586          * here since we have little experience with EOVERFLOW yet.
 587          * Elsewhere, getuid() can't fail...
 588          */
 589         olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur;
 590         olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max;
 591         error = copyout(&olim, uap->rlp, sizeof(olim));
 592         return (error);
 593 }
 594 #endif /* COMPAT_43 */
 595
 596 #ifndef _SYS_SYSPROTO_H_
 597 struct __setrlimit_args {
 598         u_int   which;
 599         struct  rlimit *rlp;
 600 };
 601 #endif
 602 int
 603 setrlimit(td, uap)
 604         struct thread *td;
 605         register struct __setrlimit_args *uap;
 606 {
 607         struct rlimit alim;
 608         int error;
 609
 610         if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit))))
 611                 return (error);
 612         error = kern_setrlimit(td, uap->which, &alim);
 613         return (error);
 614 }
 615
  616 static void
  617 lim_cb(void *arg)
  618 {
  619         struct rlimit rlim;
  620         struct thread *td;
  621         struct proc *p;
  622
  623         p = arg;        /* the callout argument is the process to check */
  624         PROC_LOCK_ASSERT(p, MA_OWNED);
  625         /*
  626          * Callout handler enforcing the CPU time limit of process p.  Past
  627          * p_cpulimit the process is sent SIGXCPU; once the RLIMIT_CPU
  628          * maximum is reached it is killed via killproc().
  629          */
  630         if (p->p_cpulimit == RLIM_INFINITY)
  631                 return;         /* no limit set: leave without rearming the callout */
  632         PROC_SLOCK(p);
  633         FOREACH_THREAD_IN_PROC(p, td) {
  634                 thread_lock(td);
  635                 ruxagg(&p->p_rux, td);  /* NOTE(review): presumably folds td's usage into p_rux -- confirm */
  636                 thread_unlock(td);
  637         }
  638         PROC_SUNLOCK(p);
  639         if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) {    /* limit scaled to cpu ticks */
  640                 lim_rlimit(p, RLIMIT_CPU, &rlim);
  641                 if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) {
  642                         killproc(p, "exceeded maximum CPU limit");
  643                 } else {
  644                         if (p->p_cpulimit < rlim.rlim_max)
  645                                 p->p_cpulimit += 5;     /* push the threshold for the next SIGXCPU out by 5 */
  646                         psignal(p, SIGXCPU);
  647                 }
  648         }
  649         callout_reset(&p->p_limco, hz, lim_cb, p);      /* rearm this handler with a delay of hz */
  650 }
 650
  651 int
  652 kern_setrlimit(td, which, limp) /* install *limp as resource limit "which" of td's process */
  653         struct thread *td;      /* calling thread; its process is the one changed */
  654         u_int which;            /* limit index; must be below RLIM_NLIMITS */
  655         struct rlimit *limp;    /* requested limit; clamped in place by this function */
  656 {
  657         struct plimit *newlim, *oldlim;
  658         struct proc *p;
  659         register struct rlimit *alimp;
  660         rlim_t oldssiz;
  661         int error;
  662
  663         if (which >= RLIM_NLIMITS)
  664                 return (EINVAL);
  665
  666         /*
  667          * Preserve historical bugs by treating negative limits as unsigned.
  668          */
  669         if (limp->rlim_cur < 0)
  670                 limp->rlim_cur = RLIM_INFINITY;
  671         if (limp->rlim_max < 0)
  672                 limp->rlim_max = RLIM_INFINITY;
  673
  674         oldssiz = 0;
  675         p = td->td_proc;
  676         newlim = lim_alloc();   /* obtained before the process lock is taken */
  677         PROC_LOCK(p);
  678         oldlim = p->p_limit;
  679         alimp = &oldlim->pl_rlimit[which];      /* the limit currently in force */
  680         if (limp->rlim_cur > alimp->rlim_max ||
  681             limp->rlim_max > alimp->rlim_max)   /* going above the current maximum needs privilege */
  682                 if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) {
  683                         PROC_UNLOCK(p);
  684                         lim_free(newlim);       /* the unused new plimit is released again */
  685                         return (error);
  686                 }
  687         if (limp->rlim_cur > limp->rlim_max)
  688                 limp->rlim_cur = limp->rlim_max;
  689         lim_copy(newlim, oldlim);
  690         alimp = &newlim->pl_rlimit[which];      /* from here on: the slot in the new copy */
  691
  692         switch (which) {
  693
  694         case RLIMIT_CPU:
  695                 if (limp->rlim_cur != RLIM_INFINITY &&
  696                     p->p_cpulimit == RLIM_INFINITY)     /* first finite limit: start the lim_cb callout */
  697                         callout_reset(&p->p_limco, hz, lim_cb, p);
  698                 PROC_SLOCK(p);
  699                 p->p_cpulimit = limp->rlim_cur;
  700                 PROC_SUNLOCK(p);
  701                 break;
  702         case RLIMIT_DATA:
  703                 if (limp->rlim_cur > maxdsiz)
  704                         limp->rlim_cur = maxdsiz;
  705                 if (limp->rlim_max > maxdsiz)
  706                         limp->rlim_max = maxdsiz;
  707                 break;
  708
  709         case RLIMIT_STACK:
  710                 if (limp->rlim_cur > maxssiz)
  711                         limp->rlim_cur = maxssiz;
  712                 if (limp->rlim_max > maxssiz)
  713                         limp->rlim_max = maxssiz;
  714                 oldssiz = alimp->rlim_cur;      /* previous value, still in the copy; used below */
  715                 break;
  716
  717         case RLIMIT_NOFILE:
  718                 if (limp->rlim_cur > maxfilesperproc)
  719                         limp->rlim_cur = maxfilesperproc;
  720                 if (limp->rlim_max > maxfilesperproc)
  721                         limp->rlim_max = maxfilesperproc;
  722                 break;
  723
  724         case RLIMIT_NPROC:
  725                 if (limp->rlim_cur > maxprocperuid)
  726                         limp->rlim_cur = maxprocperuid;
  727                 if (limp->rlim_max > maxprocperuid)
  728                         limp->rlim_max = maxprocperuid;
  729                 if (limp->rlim_cur < 1)         /* never allow fewer than one process */
  730                         limp->rlim_cur = 1;
  731                 if (limp->rlim_max < 1)
  732                         limp->rlim_max = 1;
  733                 break;
  734         }
  735         if (td->td_proc->p_sysent->sv_fixlimit != NULL) /* optional hook supplied through p_sysent */
  736                 td->td_proc->p_sysent->sv_fixlimit(limp, which);
  737         *alimp = *limp;
  738         p->p_limit = newlim;    /* switch the process over to the new plimit */
  739         PROC_UNLOCK(p);
  740         lim_free(oldlim);       /* release the old plimit only after unlocking */
  741
  742         if (which == RLIMIT_STACK) {
  743                 /*
  744                  * Stack is allocated to the max at exec time with only
  745                  * "rlim_cur" bytes accessible.  If stack limit is going
  746                  * up make more accessible, if going down make inaccessible.
  747                  */
  748                 if (limp->rlim_cur != oldssiz) {
  749                         vm_offset_t addr;
  750                         vm_size_t size;
  751                         vm_prot_t prot;
  752
  753                         if (limp->rlim_cur > oldssiz) {
  754                                 prot = p->p_sysent->sv_stackprot;
  755                                 size = limp->rlim_cur - oldssiz;
  756                                 addr = p->p_sysent->sv_usrstack -
  757                                     limp->rlim_cur;
  758                         } else {
  759                                 prot = VM_PROT_NONE;
  760                                 size = oldssiz - limp->rlim_cur;
  761                                 addr = p->p_sysent->sv_usrstack - oldssiz;
  762                         }
  763                         addr = trunc_page(addr);
  764                         size = round_page(size);
  765                         (void)vm_map_protect(&p->p_vmspace->vm_map,
  766                             addr, addr + size, prot, FALSE);    /* result deliberately ignored */
  767                 }
  768         }
  769
  770         return (0);
  771 }
 772
 773 #ifndef _SYS_SYSPROTO_H_
 774 struct __getrlimit_args {
 775         u_int   which;
 776         struct  rlimit *rlp;
 777 };
 778 #endif
 779 /* ARGSUSED */
 780 int
 781 getrlimit(td, uap)
 782         struct thread *td;
 783         register struct __getrlimit_args *uap;
 784 {
 785         struct rlimit rlim;
 786         struct proc *p;
 787         int error;
 788
 789         if (uap->which >= RLIM_NLIMITS)
 790                 return (EINVAL);
 791         p = td->td_proc;
 792         PROC_LOCK(p);
 793         lim_rlimit(p, uap->which, &rlim);
 794         PROC_UNLOCK(p);
 795         error = copyout(&rlim, uap->rlp, sizeof(struct rlimit));
 796         return (error);
 797 }
 798
 799 /*
 800  * Transform the running time and tick information for children of proc p
 801  * into user and system time usage.
 802  */
 803 void
 804 calccru(p, up, sp)
 805         struct proc *p;
 806         struct timeval *up;
 807         struct timeval *sp;
 808 {
 809
 810         PROC_LOCK_ASSERT(p, MA_OWNED);
 811         calcru1(p, &p->p_crux, up, sp);
 812 }
 813
 814 /*
 815  * Transform the running time and tick information in proc p into user
 816  * and system time usage.  If appropriate, include the current time slice
 817  * on this CPU.
 818  */
 819 void
 820 calcru(struct proc *p, struct timeval *up, struct timeval *sp)
 821 {
 822         struct thread *td;
 823         uint64_t u;
 824
 825         PROC_LOCK_ASSERT(p, MA_OWNED);
 826         PROC_SLOCK_ASSERT(p, MA_OWNED);
 827         /*
 828          * If we are getting stats for the current process, then add in the
 829          * stats that this thread has accumulated in its current time slice.
 830          * We reset the thread and CPU state as if we had performed a context
 831          * switch right here.
 832          */
 833         td = curthread;
 834         if (td->td_proc == p) {
 835                 u = cpu_ticks();
 836                 p->p_rux.rux_runtime += u - PCPU_GET(switchtime);
 837                 PCPU_SET(switchtime, u);
 838         }
 839         calcru1(p, &p->p_rux, up, sp);
 840 }
 841
 842 static void
 843 calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
 844     struct timeval *sp)
 845 {
 846         /* {user, system, interrupt, total} {ticks, usec}: */
 847         u_int64_t ut, uu, st, su, it, tt, tu;
 848
 849         ut = ruxp->rux_uticks;
 850         st = ruxp->rux_sticks;
 851         it = ruxp->rux_iticks;
 852         tt = ut + st + it;
 853         if (tt == 0) {
 854                 /* Avoid divide by zero */
 855                 st = 1;
 856                 tt = 1;
 857         }
 858         tu = cputick2usec(ruxp->rux_runtime);
 859         if ((int64_t)tu < 0) {
 860                 /* XXX: this should be an assert /phk */
 861                 printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
 862                     (intmax_t)tu, p->p_pid, p->p_comm);
 863                 tu = ruxp->rux_tu;
 864         }
 865
 866         if (tu >= ruxp->rux_tu) {
 867                 /*
 868                  * The normal case, time increased.
 869                  * Enforce monotonicity of bucketed numbers.
 870                  */
 871                 uu = (tu * ut) / tt;
 872                 if (uu < ruxp->rux_uu)
 873                         uu = ruxp->rux_uu;
 874                 su = (tu * st) / tt;
 875                 if (su < ruxp->rux_su)
 876                         su = ruxp->rux_su;
 877         } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
 878                 /*
 879                  * When we calibrate the cputicker, it is not uncommon to
 880                  * see the presumably fixed frequency increase slightly over
 881                  * time as a result of thermal stabilization and NTP
 882                  * discipline (of the reference clock).  We therefore ignore
 883                  * a bit of backwards slop because we  expect to catch up
 884                  * shortly.  We use a 3 microsecond limit to catch low
 885                  * counts and a 1% limit for high counts.
 886                  */
 887                 uu = ruxp->rux_uu;
 888                 su = ruxp->rux_su;
 889                 tu = ruxp->rux_tu;
 890         } else { /* tu < ruxp->rux_tu */
 891                 /*
 892                  * What happene here was likely that a laptop, which ran at
 893                  * a reduced clock frequency at boot, kicked into high gear.
 894                  * The wisdom of spamming this message in that case is
 895                  * dubious, but it might also be indicative of something
 896                  * serious, so lets keep it and hope laptops can be made
 897                  * more truthful about their CPU speed via ACPI.
 898                  */
 899                 printf("calcru: runtime went backwards from %ju usec "
 900                     "to %ju usec for pid %d (%s)\n",
 901                     (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
 902                     p->p_pid, p->p_comm);
 903                 uu = (tu * ut) / tt;
 904                 su = (tu * st) / tt;
 905         }
 906
 907         ruxp->rux_uu = uu;
 908         ruxp->rux_su = su;
 909         ruxp->rux_tu = tu;
 910
 911         up->tv_sec = uu / 1000000;
 912         up->tv_usec = uu % 1000000;
 913         sp->tv_sec = su / 1000000;
 914         sp->tv_usec = su % 1000000;
 915 }
 916
 917 #ifndef _SYS_SYSPROTO_H_
 918 struct getrusage_args {
 919         int     who;
 920         struct  rusage *rusage;
 921 };
 922 #endif
 923 int
 924 getrusage(td, uap)
 925         register struct thread *td;
 926         register struct getrusage_args *uap;
 927 {
 928         struct rusage ru;
 929         int error;
 930
 931         error = kern_getrusage(td, uap->who, &ru);
 932         if (error == 0)
 933                 error = copyout(&ru, uap->rusage, sizeof(struct rusage));
 934         return (error);
 935 }
 936
 937 int
 938 kern_getrusage(td, who, rup)
 939         struct thread *td;
 940         int who;
 941         struct rusage *rup;
 942 {
 943         struct proc *p;
 944
 945         p = td->td_proc;
 946         PROC_LOCK(p);
 947         switch (who) {
 948
 949         case RUSAGE_SELF:
 950                 rufetchcalc(p, rup, &rup->ru_utime,
 951                     &rup->ru_stime);
 952                 break;
 953
 954         case RUSAGE_CHILDREN:
 955                 *rup = p->p_stats->p_cru;
 956                 calccru(p, &rup->ru_utime, &rup->ru_stime);
 957                 break;
 958
 959         default:
 960                 PROC_UNLOCK(p);
 961                 return (EINVAL);
 962         }
 963         PROC_UNLOCK(p);
 964         return (0);
 965 }
 966
 967 void
 968 rucollect(struct rusage *ru, struct rusage *ru2)
 969 {
 970         long *ip, *ip2;
 971         int i;
 972
 973         if (ru->ru_maxrss < ru2->ru_maxrss)
 974                 ru->ru_maxrss = ru2->ru_maxrss;
 975         ip = &ru->ru_first;
 976         ip2 = &ru2->ru_first;
 977         for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
 978                 *ip++ += *ip2++;
 979 }
 980
 981 void
 982 ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2,
 983     struct rusage_ext *rux2)
 984 {
 985
 986         rux->rux_runtime += rux2->rux_runtime;
 987         rux->rux_uticks += rux2->rux_uticks;
 988         rux->rux_sticks += rux2->rux_sticks;
 989         rux->rux_iticks += rux2->rux_iticks;
 990         rux->rux_uu += rux2->rux_uu;
 991         rux->rux_su += rux2->rux_su;
 992         rux->rux_tu += rux2->rux_tu;
 993         rucollect(ru, ru2);
 994 }
 995
 996 /*
 997  * Aggregate tick counts into the proc's rusage_ext.
 998  */
 999 void
1000 ruxagg(struct rusage_ext *rux, struct thread *td)
1001 {
1002
1003         THREAD_LOCK_ASSERT(td, MA_OWNED);
1004         PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED);
1005         rux->rux_runtime += td->td_runtime;
1006         rux->rux_uticks += td->td_uticks;
1007         rux->rux_sticks += td->td_sticks;
1008         rux->rux_iticks += td->td_iticks;
1009         td->td_runtime = 0;
1010         td->td_uticks = 0;
1011         td->td_iticks = 0;
1012         td->td_sticks = 0;
1013 }
1014
1015 /*
1016  * Update the rusage_ext structure and fetch a valid aggregate rusage
1017  * for proc p if storage for one is supplied.
1018  */
1019 void
1020 rufetch(struct proc *p, struct rusage *ru)
1021 {
1022         struct thread *td;
1023
1024         PROC_SLOCK_ASSERT(p, MA_OWNED);
1025
1026         *ru = p->p_ru;
1027         if (p->p_numthreads > 0)  {
1028                 FOREACH_THREAD_IN_PROC(p, td) {
1029                         thread_lock(td);
1030                         ruxagg(&p->p_rux, td);
1031                         thread_unlock(td);
1032                         rucollect(ru, &td->td_ru);
1033                 }
1034         }
1035 }
1036
/*
 * Perform a rufetch() followed by a calcru() under a single hold of the
 * process spin lock, so callers can rely on calcru() running only after
 * rufetch() has completed, with no window in between.
 */
void
rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up,
    struct timeval *sp)
{

        PROC_SLOCK(p);

        rufetch(p, ru);
        calcru(p, up, sp);

        PROC_SUNLOCK(p);
}
1052
1053 /*
1054  * Allocate a new resource limits structure and initialize its
1055  * reference count and mutex pointer.
1056  */
1057 struct plimit *
1058 lim_alloc()
1059 {
1060         struct plimit *limp;
1061
1062         limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK);
1063         refcount_init(&limp->pl_refcnt, 1);
1064         return (limp);
1065 }
1066
1067 struct plimit *
1068 lim_hold(limp)
1069         struct plimit *limp;
1070 {
1071
1072         refcount_acquire(&limp->pl_refcnt);
1073         return (limp);
1074 }
1075
1076 void
1077 lim_fork(struct proc *p1, struct proc *p2)
1078 {
1079         p2->p_limit = lim_hold(p1->p_limit);
1080         callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0);
1081         if (p1->p_cpulimit != RLIM_INFINITY)
1082                 callout_reset(&p2->p_limco, hz, lim_cb, p2);
1083 }
1084
1085 void
1086 lim_free(limp)
1087         struct plimit *limp;
1088 {
1089
1090         KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
1091         if (refcount_release(&limp->pl_refcnt))
1092                 free((void *)limp, M_PLIMIT);
1093 }
1094
1095 /*
1096  * Make a copy of the plimit structure.
1097  * We share these structures copy-on-write after fork.
1098  */
1099 void
1100 lim_copy(dst, src)
1101         struct plimit *dst, *src;
1102 {
1103
1104         KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit"));
1105         bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit));
1106 }
1107
/*
 * Return the hard limit (rlim_max) of process p for the resource whose
 * index into the rlimit array is given by 'which'.
 */
rlim_t
lim_max(struct proc *p, int which)
{
        struct rlimit limit;

        lim_rlimit(p, which, &limit);
        return (limit.rlim_max);
}
1120
/*
 * Return the current (soft) limit (rlim_cur) of process p for the
 * resource whose index into the rlimit array is given by 'which'.
 */
rlim_t
lim_cur(struct proc *p, int which)
{
        struct rlimit limit;

        lim_rlimit(p, which, &limit);
        return (limit.rlim_cur);
}
1133
1134 /*
1135  * Return a copy of the entire rlimit structure for the system limit
1136  * specified by 'which' in the rlimit structure pointed to by 'rlp'.
1137  */
1138 void
1139 lim_rlimit(struct proc *p, int which, struct rlimit *rlp)
1140 {
1141
1142         PROC_LOCK_ASSERT(p, MA_OWNED);
1143         KASSERT(which >= 0 && which < RLIM_NLIMITS,
1144             ("request for invalid resource limit"));
1145         *rlp = p->p_limit->pl_rlimit[which];
1146         if (p->p_sysent->sv_fixlimit != NULL)
1147                 p->p_sysent->sv_fixlimit(rlp, which);
1148 }
1149
1150 /*
1151  * Find the uidinfo structure for a uid.  This structure is used to
1152  * track the total resource consumption (process count, socket buffer
1153  * size, etc.) for the uid and impose limits.
1154  */
1155 void
1156 uihashinit()
1157 {
1158
1159         uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
1160         mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF);
1161 }
1162
1163 /*
1164  * Look up a uidinfo struct for the parameter uid.
1165  * uihashtbl_mtx must be locked.
1166  */
1167 static struct uidinfo *
1168 uilookup(uid)
1169         uid_t uid;
1170 {
1171         struct uihashhead *uipp;
1172         struct uidinfo *uip;
1173
1174         mtx_assert(&uihashtbl_mtx, MA_OWNED);
1175         uipp = UIHASH(uid);
1176         LIST_FOREACH(uip, uipp, ui_hash)
1177                 if (uip->ui_uid == uid)
1178                         break;
1179
1180         return (uip);
1181 }
1182
1183 /*
1184  * Find or allocate a struct uidinfo for a particular uid.
1185  * Increase refcount on uidinfo struct returned.
1186  * uifree() should be called on a struct uidinfo when released.
1187  */
1188 struct uidinfo *
1189 uifind(uid)
1190         uid_t uid;
1191 {
1192         struct uidinfo *old_uip, *uip;
1193
1194         mtx_lock(&uihashtbl_mtx);
1195         uip = uilookup(uid);
1196         if (uip == NULL) {
1197                 mtx_unlock(&uihashtbl_mtx);
1198                 uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
1199                 mtx_lock(&uihashtbl_mtx);
1200                 /*
1201                  * There's a chance someone created our uidinfo while we
1202                  * were in malloc and not holding the lock, so we have to
1203                  * make sure we don't insert a duplicate uidinfo.
1204                  */
1205                 if ((old_uip = uilookup(uid)) != NULL) {
1206                         /* Someone else beat us to it. */
1207                         free(uip, M_UIDINFO);
1208                         uip = old_uip;
1209                 } else {
1210                         uip->ui_mtxp = mtx_pool_alloc(mtxpool_sleep);
1211                         uip->ui_uid = uid;
1212                         LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
1213                 }
1214         }
1215         uihold(uip);
1216         mtx_unlock(&uihashtbl_mtx);
1217         return (uip);
1218 }
1219
1220 /*
1221  * Place another refcount on a uidinfo struct.
1222  */
1223 void
1224 uihold(uip)
1225         struct uidinfo *uip;
1226 {
1227
1228         UIDINFO_LOCK(uip);
1229         uip->ui_ref++;
1230         UIDINFO_UNLOCK(uip);
1231 }
1232
1233 /*-
1234  * Since uidinfo structs have a long lifetime, we use an
1235  * opportunistic refcounting scheme to avoid locking the lookup hash
1236  * for each release.
1237  *
1238  * If the refcount hits 0, we need to free the structure,
1239  * which means we need to lock the hash.
1240  * Optimal case:
1241  *   After locking the struct and lowering the refcount, if we find
1242  *   that we don't need to free, simply unlock and return.
1243  * Suboptimal case:
1244  *   If refcount lowering results in need to free, bump the count
1245  *   back up, lose the lock and acquire the locks in the proper
1246  *   order to try again.
1247  */
1248 void
1249 uifree(uip)
1250         struct uidinfo *uip;
1251 {
1252
1253         /* Prepare for optimal case. */
1254         UIDINFO_LOCK(uip);
1255
1256         if (--uip->ui_ref != 0) {
1257                 UIDINFO_UNLOCK(uip);
1258                 return;
1259         }
1260
1261         /* Prepare for suboptimal case. */
1262         uip->ui_ref++;
1263         UIDINFO_UNLOCK(uip);
1264         mtx_lock(&uihashtbl_mtx);
1265         UIDINFO_LOCK(uip);
1266
1267         /*
1268          * We must subtract one from the count again because we backed out
1269          * our initial subtraction before dropping the lock.
1270          * Since another thread may have added a reference after we dropped the
1271          * initial lock we have to test for zero again.
1272          */
1273         if (--uip->ui_ref == 0) {
1274                 LIST_REMOVE(uip, ui_hash);
1275                 mtx_unlock(&uihashtbl_mtx);
1276                 if (uip->ui_sbsize != 0)
1277                         printf("freeing uidinfo: uid = %d, sbsize = %jd\n",
1278                             uip->ui_uid, (intmax_t)uip->ui_sbsize);
1279                 if (uip->ui_proccnt != 0)
1280                         printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
1281                             uip->ui_uid, uip->ui_proccnt);
1282                 UIDINFO_UNLOCK(uip);
1283                 FREE(uip, M_UIDINFO);
1284                 return;
1285         }
1286
1287         mtx_unlock(&uihashtbl_mtx);
1288         UIDINFO_UNLOCK(uip);
1289 }
1290
1291 /*
1292  * Change the count associated with number of processes
1293  * a given user is using.  When 'max' is 0, don't enforce a limit
1294  */
1295 int
1296 chgproccnt(uip, diff, max)
1297         struct  uidinfo *uip;
1298         int     diff;
1299         int     max;
1300 {
1301
1302         UIDINFO_LOCK(uip);
1303         /* Don't allow them to exceed max, but allow subtraction. */
1304         if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) {
1305                 UIDINFO_UNLOCK(uip);
1306                 return (0);
1307         }
1308         uip->ui_proccnt += diff;
1309         if (uip->ui_proccnt < 0)
1310                 printf("negative proccnt for uid = %d\n", uip->ui_uid);
1311         UIDINFO_UNLOCK(uip);
1312         return (1);
1313 }
1314
1315 /*
1316  * Change the total socket buffer size a user has used.
1317  */
1318 int
1319 chgsbsize(uip, hiwat, to, max)
1320         struct  uidinfo *uip;
1321         u_int  *hiwat;
1322         u_int   to;
1323         rlim_t  max;
1324 {
1325         rlim_t new;
1326
1327         UIDINFO_LOCK(uip);
1328         new = uip->ui_sbsize + to - *hiwat;
1329         /* Don't allow them to exceed max, but allow subtraction. */
1330         if (to > *hiwat && new > max) {
1331                 UIDINFO_UNLOCK(uip);
1332                 return (0);
1333         }
1334         uip->ui_sbsize = new;
1335         UIDINFO_UNLOCK(uip);
1336         *hiwat = to;
1337         if (new < 0)
1338                 printf("negative sbsize for uid = %d\n", uip->ui_uid);
1339         return (1);
1340 }