sys/kern/kern_resource.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)kern_resource.c     8.5 (Berkeley) 1/21/94
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_compat.h"
  41
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/sysproto.h>
  45 #include <sys/file.h>
  46 #include <sys/kernel.h>
  47 #include <sys/lock.h>
  48 #include <sys/malloc.h>
  49 #include <sys/mutex.h>
  50 #include <sys/priv.h>
  51 #include <sys/proc.h>
  52 #include <sys/refcount.h>
  53 #include <sys/resourcevar.h>
  54 #include <sys/sched.h>
  55 #include <sys/sx.h>
  56 #include <sys/syscallsubr.h>
  57 #include <sys/sysent.h>
  58 #include <sys/time.h>
  59
  60 #include <vm/vm.h>
  61 #include <vm/vm_param.h>
  62 #include <vm/pmap.h>
  63 #include <vm/vm_map.h>
  64
  65
  66 static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
  67 static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
  68 #define UIHASH(uid)     (&uihashtbl[(uid) & uihash])
  69 static struct mtx uihashtbl_mtx;
  70 static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
  71 static u_long uihash;           /* size of hash table - 1 */
  72
  73 static void     calcru1(struct proc *p, struct rusage_ext *ruxp,
  74                     struct timeval *up, struct timeval *sp);
  75 static int      donice(struct thread *td, struct proc *chgp, int n);
  76 static struct uidinfo *uilookup(uid_t uid);
  77
  78 /*
  79  * Resource controls and accounting.
  80  */
  81 #ifndef _SYS_SYSPROTO_H_
  82 struct getpriority_args {
  83         int     which;
  84         int     who;
  85 };
  86 #endif
  87 int
  88 getpriority(td, uap)
  89         struct thread *td;
  90         register struct getpriority_args *uap;
  91 {
  92         struct proc *p;
  93         struct pgrp *pg;
  94         int error, low;
  95
  96         error = 0;
  97         low = PRIO_MAX + 1;
  98         switch (uap->which) {
  99
 100         case PRIO_PROCESS:
 101                 if (uap->who == 0)
 102                         low = td->td_proc->p_nice;
 103                 else {
 104                         p = pfind(uap->who);
 105                         if (p == NULL)
 106                                 break;
 107                         if (p_cansee(td, p) == 0)
 108                                 low = p->p_nice;
 109                         PROC_UNLOCK(p);
 110                 }
 111                 break;
 112
 113         case PRIO_PGRP:
 114                 sx_slock(&proctree_lock);
 115                 if (uap->who == 0) {
 116                         pg = td->td_proc->p_pgrp;
 117                         PGRP_LOCK(pg);
 118                 } else {
 119                         pg = pgfind(uap->who);
 120                         if (pg == NULL) {
 121                                 sx_sunlock(&proctree_lock);
 122                                 break;
 123                         }
 124                 }
 125                 sx_sunlock(&proctree_lock);
 126                 LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 127                         PROC_LOCK(p);
 128                         if (!p_cansee(td, p)) {
 129                                 if (p->p_nice < low)
 130                                         low = p->p_nice;
 131                         }
 132                         PROC_UNLOCK(p);
 133                 }
 134                 PGRP_UNLOCK(pg);
 135                 break;
 136
 137         case PRIO_USER:
 138                 if (uap->who == 0)
 139                         uap->who = td->td_ucred->cr_uid;
 140                 sx_slock(&allproc_lock);
 141                 FOREACH_PROC_IN_SYSTEM(p) {
 142                         /* Do not bother to check PRS_NEW processes */
 143                         if (p->p_state == PRS_NEW)
 144                                 continue;
 145                         PROC_LOCK(p);
 146                         if (!p_cansee(td, p) &&
 147                             p->p_ucred->cr_uid == uap->who) {
 148                                 if (p->p_nice < low)
 149                                         low = p->p_nice;
 150                         }
 151                         PROC_UNLOCK(p);
 152                 }
 153                 sx_sunlock(&allproc_lock);
 154                 break;
 155
 156         default:
 157                 error = EINVAL;
 158                 break;
 159         }
 160         if (low == PRIO_MAX + 1 && error == 0)
 161                 error = ESRCH;
 162         td->td_retval[0] = low;
 163         return (error);
 164 }
 165
 166 #ifndef _SYS_SYSPROTO_H_
 167 struct setpriority_args {
 168         int     which;
 169         int     who;
 170         int     prio;
 171 };
 172 #endif
 173 int
 174 setpriority(td, uap)
 175         struct thread *td;
 176         struct setpriority_args *uap;
 177 {
 178         struct proc *curp, *p;
 179         struct pgrp *pg;
 180         int found = 0, error = 0;
 181
 182         curp = td->td_proc;
 183         switch (uap->which) {
 184         case PRIO_PROCESS:
 185                 if (uap->who == 0) {
 186                         PROC_LOCK(curp);
 187                         error = donice(td, curp, uap->prio);
 188                         PROC_UNLOCK(curp);
 189                 } else {
 190                         p = pfind(uap->who);
 191                         if (p == 0)
 192                                 break;
 193                         if (p_cansee(td, p) == 0)
 194                                 error = donice(td, p, uap->prio);
 195                         PROC_UNLOCK(p);
 196                 }
 197                 found++;
 198                 break;
 199
 200         case PRIO_PGRP:
 201                 sx_slock(&proctree_lock);
 202                 if (uap->who == 0) {
 203                         pg = curp->p_pgrp;
 204                         PGRP_LOCK(pg);
 205                 } else {
 206                         pg = pgfind(uap->who);
 207                         if (pg == NULL) {
 208                                 sx_sunlock(&proctree_lock);
 209                                 break;
 210                         }
 211                 }
 212                 sx_sunlock(&proctree_lock);
 213                 LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 214                         PROC_LOCK(p);
 215                         if (!p_cansee(td, p)) {
 216                                 error = donice(td, p, uap->prio);
 217                                 found++;
 218                         }
 219                         PROC_UNLOCK(p);
 220                 }
 221                 PGRP_UNLOCK(pg);
 222                 break;
 223
 224         case PRIO_USER:
 225                 if (uap->who == 0)
 226                         uap->who = td->td_ucred->cr_uid;
 227                 sx_slock(&allproc_lock);
 228                 FOREACH_PROC_IN_SYSTEM(p) {
 229                         PROC_LOCK(p);
 230                         if (p->p_ucred->cr_uid == uap->who &&
 231                             !p_cansee(td, p)) {
 232                                 error = donice(td, p, uap->prio);
 233                                 found++;
 234                         }
 235                         PROC_UNLOCK(p);
 236                 }
 237                 sx_sunlock(&allproc_lock);
 238                 break;
 239
 240         default:
 241                 error = EINVAL;
 242                 break;
 243         }
 244         if (found == 0 && error == 0)
 245                 error = ESRCH;
 246         return (error);
 247 }
 248
 249 /*
 250  * Set "nice" for a (whole) process.
 251  */
 252 static int
 253 donice(struct thread *td, struct proc *p, int n)
 254 {
 255         int error;
 256
 257         PROC_LOCK_ASSERT(p, MA_OWNED);
 258         if ((error = p_cansched(td, p)))
 259                 return (error);
 260         if (n > PRIO_MAX)
 261                 n = PRIO_MAX;
 262         if (n < PRIO_MIN)
 263                 n = PRIO_MIN;
 264         if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0)
 265                 return (EACCES);
 266         mtx_lock_spin(&sched_lock);
 267         sched_nice(p, n);
 268         mtx_unlock_spin(&sched_lock);
 269         return (0);
 270 }
 271
 272 /*
 273  * Set realtime priority for LWP.
 274  */
 275 #ifndef _SYS_SYSPROTO_H_
 276 struct rtprio_thread_args {
 277         int             function;
 278         lwpid_t         lwpid;
 279         struct rtprio   *rtp;
 280 };
 281 #endif
 282 int
 283 rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
 284 {
 285         struct proc *curp;
 286         struct proc *p;
 287         struct rtprio rtp;
 288         struct thread *td1;
 289         int cierror, error;
 290
 291         /* Perform copyin before acquiring locks if needed. */
 292         if (uap->function == RTP_SET)
 293                 cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
 294         else
 295                 cierror = 0;
 296
 297         curp = td->td_proc;
 298         /*
 299          * Though lwpid is unique, only current process is supported
 300          * since there is no efficient way to look up a LWP yet.
 301          */
 302         p = curp;
 303         PROC_LOCK(p);
 304
 305         switch (uap->function) {
 306         case RTP_LOOKUP:
 307                 if ((error = p_cansee(td, p)))
 308                         break;
 309                 mtx_lock_spin(&sched_lock);
 310                 if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
 311                         td1 = td;
 312                 else
 313                         td1 = thread_find(p, uap->lwpid);
 314                 if (td1 != NULL)
 315                         pri_to_rtp(td1, &rtp);
 316                 else
 317                         error = ESRCH;
 318                 mtx_unlock_spin(&sched_lock);
 319                 PROC_UNLOCK(p);
 320                 return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
 321         case RTP_SET:
 322                 if ((error = p_cansched(td, p)) || (error = cierror))
 323                         break;
 324
 325                 /* Disallow setting rtprio in most cases if not superuser. */
 326                 if (priv_check(td, PRIV_SCHED_RTPRIO) != 0) {
 327                         /* can't set realtime priority */
 328 /*
 329  * Realtime priority has to be restricted for reasons which should be
 330  * obvious.  However, for idle priority, there is a potential for
 331  * system deadlock if an idleprio process gains a lock on a resource
 332  * that other processes need (and the idleprio process can't run
 333  * due to a CPU-bound normal process).  Fix me!  XXX
 334  */
 335 #if 0
 336                         if (RTP_PRIO_IS_REALTIME(rtp.type)) {
 337 #else
 338                         if (rtp.type != RTP_PRIO_NORMAL) {
 339 #endif
 340                                 error = EPERM;
 341                                 break;
 342                         }
 343                 }
 344
 345                 mtx_lock_spin(&sched_lock);
 346                 if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
 347                         td1 = td;
 348                 else
 349                         td1 = thread_find(p, uap->lwpid);
 350                 if (td1 != NULL)
 351                         error = rtp_to_pri(&rtp, td1);
 352                 else
 353                         error = ESRCH;
 354                 mtx_unlock_spin(&sched_lock);
 355                 break;
 356         default:
 357                 error = EINVAL;
 358                 break;
 359         }
 360         PROC_UNLOCK(p);
 361         return (error);
 362 }
 363
 364 /*
 365  * Set realtime priority.
 366  */
 367 #ifndef _SYS_SYSPROTO_H_
 368 struct rtprio_args {
 369         int             function;
 370         pid_t           pid;
 371         struct rtprio   *rtp;
 372 };
 373 #endif
 374 int
 375 rtprio(td, uap)
 376         struct thread *td;              /* curthread */
 377         register struct rtprio_args *uap;
 378 {
 379         struct proc *curp;
 380         struct proc *p;
 381         struct thread *tdp;
 382         struct rtprio rtp;
 383         int cierror, error;
 384
 385         /* Perform copyin before acquiring locks if needed. */
 386         if (uap->function == RTP_SET)
 387                 cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
 388         else
 389                 cierror = 0;
 390
 391         curp = td->td_proc;
 392         if (uap->pid == 0) {
 393                 p = curp;
 394                 PROC_LOCK(p);
 395         } else {
 396                 p = pfind(uap->pid);
 397                 if (p == NULL)
 398                         return (ESRCH);
 399         }
 400
 401         switch (uap->function) {
 402         case RTP_LOOKUP:
 403                 if ((error = p_cansee(td, p)))
 404                         break;
 405                 mtx_lock_spin(&sched_lock);
 406                 /*
 407                  * Return OUR priority if no pid specified,
 408                  * or if one is, report the highest priority
 409                  * in the process.  There isn't much more you can do as
 410                  * there is only room to return a single priority.
 411                  * XXXKSE: maybe need a new interface to report
 412                  * priorities of multiple system scope threads.
 413                  * Note: specifying our own pid is not the same
 414                  * as leaving it zero.
 415                  */
 416                 if (uap->pid == 0) {
 417                         pri_to_rtp(td, &rtp);
 418                 } else {
 419                         struct rtprio rtp2;
 420
 421                         rtp.type = RTP_PRIO_IDLE;
 422                         rtp.prio = RTP_PRIO_MAX;
 423                         FOREACH_THREAD_IN_PROC(p, tdp) {
 424                                 pri_to_rtp(tdp, &rtp2);
 425                                 if (rtp2.type <  rtp.type ||
 426                                     (rtp2.type == rtp.type &&
 427                                     rtp2.prio < rtp.prio)) {
 428                                         rtp.type = rtp2.type;
 429                                         rtp.prio = rtp2.prio;
 430                                 }
 431                         }
 432                 }
 433                 mtx_unlock_spin(&sched_lock);
 434                 PROC_UNLOCK(p);
 435                 return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
 436         case RTP_SET:
 437                 if ((error = p_cansched(td, p)) || (error = cierror))
 438                         break;
 439
 440                 /* Disallow setting rtprio in most cases if not superuser. */
 441                 if (priv_check(td, PRIV_SCHED_RTPRIO) != 0) {
 442                         /* can't set someone else's */
 443                         if (uap->pid) {
 444                                 error = EPERM;
 445                                 break;
 446                         }
 447                         /* can't set realtime priority */
 448 /*
 449  * Realtime priority has to be restricted for reasons which should be
 450  * obvious.  However, for idle priority, there is a potential for
 451  * system deadlock if an idleprio process gains a lock on a resource
 452  * that other processes need (and the idleprio process can't run
 453  * due to a CPU-bound normal process).  Fix me!  XXX
 454  */
 455 #if 0
 456                         if (RTP_PRIO_IS_REALTIME(rtp.type)) {
 457 #else
 458                         if (rtp.type != RTP_PRIO_NORMAL) {
 459 #endif
 460                                 error = EPERM;
 461                                 break;
 462                         }
 463                 }
 464
 465                 /*
 466                  * If we are setting our own priority, set just our
 467                  * thread but if we are doing another process,
 468                  * do all the threads on that process. If we
 469                  * specify our own pid we do the latter.
 470                  */
 471                 mtx_lock_spin(&sched_lock);
 472                 if (uap->pid == 0) {
 473                         error = rtp_to_pri(&rtp, td);
 474                 } else {
 475                         FOREACH_THREAD_IN_PROC(p, td) {
 476                                 if ((error = rtp_to_pri(&rtp, td)) != 0)
 477                                         break;
 478                         }
 479                 }
 480                 mtx_unlock_spin(&sched_lock);
 481                 break;
 482         default:
 483                 error = EINVAL;
 484                 break;
 485         }
 486         PROC_UNLOCK(p);
 487         return (error);
 488 }
 489
 490 int
 491 rtp_to_pri(struct rtprio *rtp, struct thread *td)
 492 {
 493         u_char  newpri;
 494
 495         mtx_assert(&sched_lock, MA_OWNED);
 496         if (rtp->prio > RTP_PRIO_MAX)
 497                 return (EINVAL);
 498         switch (RTP_PRIO_BASE(rtp->type)) {
 499         case RTP_PRIO_REALTIME:
 500                 newpri = PRI_MIN_REALTIME + rtp->prio;
 501                 break;
 502         case RTP_PRIO_NORMAL:
 503                 newpri = PRI_MIN_TIMESHARE + rtp->prio;
 504                 break;
 505         case RTP_PRIO_IDLE:
 506                 newpri = PRI_MIN_IDLE + rtp->prio;
 507                 break;
 508         default:
 509                 return (EINVAL);
 510         }
 511         sched_class(td, rtp->type);     /* XXX fix */
 512         sched_user_prio(td, newpri);
 513         if (curthread == td)
 514                 sched_prio(curthread, td->td_user_pri); /* XXX dubious */
 515         return (0);
 516 }
 517
 518 void
 519 pri_to_rtp(struct thread *td, struct rtprio *rtp)
 520 {
 521
 522         mtx_assert(&sched_lock, MA_OWNED);
 523         switch (PRI_BASE(td->td_pri_class)) {
 524         case PRI_REALTIME:
 525                 rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME;
 526                 break;
 527         case PRI_TIMESHARE:
 528                 rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE;
 529                 break;
 530         case PRI_IDLE:
 531                 rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE;
 532                 break;
 533         default:
 534                 break;
 535         }
 536         rtp->type = td->td_pri_class;
 537 }
 538
 539 #if defined(COMPAT_43)
 540 #ifndef _SYS_SYSPROTO_H_
 541 struct osetrlimit_args {
 542         u_int   which;
 543         struct  orlimit *rlp;
 544 };
 545 #endif
 546 int
 547 osetrlimit(td, uap)
 548         struct thread *td;
 549         register struct osetrlimit_args *uap;
 550 {
 551         struct orlimit olim;
 552         struct rlimit lim;
 553         int error;
 554
 555         if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
 556                 return (error);
 557         lim.rlim_cur = olim.rlim_cur;
 558         lim.rlim_max = olim.rlim_max;
 559         error = kern_setrlimit(td, uap->which, &lim);
 560         return (error);
 561 }
 562
 563 #ifndef _SYS_SYSPROTO_H_
 564 struct ogetrlimit_args {
 565         u_int   which;
 566         struct  orlimit *rlp;
 567 };
 568 #endif
 569 int
 570 ogetrlimit(td, uap)
 571         struct thread *td;
 572         register struct ogetrlimit_args *uap;
 573 {
 574         struct orlimit olim;
 575         struct rlimit rl;
 576         struct proc *p;
 577         int error;
 578
 579         if (uap->which >= RLIM_NLIMITS)
 580                 return (EINVAL);
 581         p = td->td_proc;
 582         PROC_LOCK(p);
 583         lim_rlimit(p, uap->which, &rl);
 584         PROC_UNLOCK(p);
 585
 586         /*
 587          * XXX would be more correct to convert only RLIM_INFINITY to the
 588          * old RLIM_INFINITY and fail with EOVERFLOW for other larger
 589          * values.  Most 64->32 and 32->16 conversions, including not
 590          * unimportant ones of uids are even more broken than what we
 591          * do here (they blindly truncate).  We don't do this correctly
 592          * here since we have little experience with EOVERFLOW yet.
 593          * Elsewhere, getuid() can't fail...
 594          */
 595         olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur;
 596         olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max;
 597         error = copyout(&olim, uap->rlp, sizeof(olim));
 598         return (error);
 599 }
 600 #endif /* COMPAT_43 */
 601
 602 #ifndef _SYS_SYSPROTO_H_
 603 struct __setrlimit_args {
 604         u_int   which;
 605         struct  rlimit *rlp;
 606 };
 607 #endif
 608 int
 609 setrlimit(td, uap)
 610         struct thread *td;
 611         register struct __setrlimit_args *uap;
 612 {
 613         struct rlimit alim;
 614         int error;
 615
 616         if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit))))
 617                 return (error);
 618         error = kern_setrlimit(td, uap->which, &alim);
 619         return (error);
 620 }
 621
 622 int
 623 kern_setrlimit(td, which, limp)
 624         struct thread *td;
 625         u_int which;
 626         struct rlimit *limp;
 627 {
 628         struct plimit *newlim, *oldlim;
 629         struct proc *p;
 630         register struct rlimit *alimp;
 631         rlim_t oldssiz;
 632         int error;
 633
 634         if (which >= RLIM_NLIMITS)
 635                 return (EINVAL);
 636
 637         /*
 638          * Preserve historical bugs by treating negative limits as unsigned.
 639          */
 640         if (limp->rlim_cur < 0)
 641                 limp->rlim_cur = RLIM_INFINITY;
 642         if (limp->rlim_max < 0)
 643                 limp->rlim_max = RLIM_INFINITY;
 644
 645         oldssiz = 0;
 646         p = td->td_proc;
 647         newlim = lim_alloc();
 648         PROC_LOCK(p);
 649         oldlim = p->p_limit;
 650         alimp = &oldlim->pl_rlimit[which];
 651         if (limp->rlim_cur > alimp->rlim_max ||
 652             limp->rlim_max > alimp->rlim_max)
 653                 if ((error = priv_check_cred(td->td_ucred,
 654                     PRIV_PROC_SETRLIMIT, SUSER_ALLOWJAIL))) {
 655                         PROC_UNLOCK(p);
 656                         lim_free(newlim);
 657                         return (error);
 658                 }
 659         if (limp->rlim_cur > limp->rlim_max)
 660                 limp->rlim_cur = limp->rlim_max;
 661         lim_copy(newlim, oldlim);
 662         alimp = &newlim->pl_rlimit[which];
 663
 664         switch (which) {
 665
 666         case RLIMIT_CPU:
 667                 mtx_lock_spin(&sched_lock);
 668                 p->p_cpulimit = limp->rlim_cur;
 669                 mtx_unlock_spin(&sched_lock);
 670                 break;
 671         case RLIMIT_DATA:
 672                 if (limp->rlim_cur > maxdsiz)
 673                         limp->rlim_cur = maxdsiz;
 674                 if (limp->rlim_max > maxdsiz)
 675                         limp->rlim_max = maxdsiz;
 676                 break;
 677
 678         case RLIMIT_STACK:
 679                 if (limp->rlim_cur > maxssiz)
 680                         limp->rlim_cur = maxssiz;
 681                 if (limp->rlim_max > maxssiz)
 682                         limp->rlim_max = maxssiz;
 683                 oldssiz = alimp->rlim_cur;
 684                 break;
 685
 686         case RLIMIT_NOFILE:
 687                 if (limp->rlim_cur > maxfilesperproc)
 688                         limp->rlim_cur = maxfilesperproc;
 689                 if (limp->rlim_max > maxfilesperproc)
 690                         limp->rlim_max = maxfilesperproc;
 691                 break;
 692
 693         case RLIMIT_NPROC:
 694                 if (limp->rlim_cur > maxprocperuid)
 695                         limp->rlim_cur = maxprocperuid;
 696                 if (limp->rlim_max > maxprocperuid)
 697                         limp->rlim_max = maxprocperuid;
 698                 if (limp->rlim_cur < 1)
 699                         limp->rlim_cur = 1;
 700                 if (limp->rlim_max < 1)
 701                         limp->rlim_max = 1;
 702                 break;
 703         }
 704         if (td->td_proc->p_sysent->sv_fixlimit != NULL)
 705                 td->td_proc->p_sysent->sv_fixlimit(limp, which);
 706         *alimp = *limp;
 707         p->p_limit = newlim;
 708         PROC_UNLOCK(p);
 709         lim_free(oldlim);
 710
 711         if (which == RLIMIT_STACK) {
 712                 /*
 713                  * Stack is allocated to the max at exec time with only
 714                  * "rlim_cur" bytes accessible.  If stack limit is going
 715                  * up make more accessible, if going down make inaccessible.
 716                  */
 717                 if (limp->rlim_cur != oldssiz) {
 718                         vm_offset_t addr;
 719                         vm_size_t size;
 720                         vm_prot_t prot;
 721
 722                         if (limp->rlim_cur > oldssiz) {
 723                                 prot = p->p_sysent->sv_stackprot;
 724                                 size = limp->rlim_cur - oldssiz;
 725                                 addr = p->p_sysent->sv_usrstack -
 726                                     limp->rlim_cur;
 727                         } else {
 728                                 prot = VM_PROT_NONE;
 729                                 size = oldssiz - limp->rlim_cur;
 730                                 addr = p->p_sysent->sv_usrstack - oldssiz;
 731                         }
 732                         addr = trunc_page(addr);
 733                         size = round_page(size);
 734                         (void)vm_map_protect(&p->p_vmspace->vm_map,
 735                             addr, addr + size, prot, FALSE);
 736                 }
 737         }
 738
 739         return (0);
 740 }
 741
 742 #ifndef _SYS_SYSPROTO_H_
 743 struct __getrlimit_args {
 744         u_int   which;
 745         struct  rlimit *rlp;
 746 };
 747 #endif
 748 /* ARGSUSED */
 749 int
 750 getrlimit(td, uap)
 751         struct thread *td;
 752         register struct __getrlimit_args *uap;
 753 {
 754         struct rlimit rlim;
 755         struct proc *p;
 756         int error;
 757
 758         if (uap->which >= RLIM_NLIMITS)
 759                 return (EINVAL);
 760         p = td->td_proc;
 761         PROC_LOCK(p);
 762         lim_rlimit(p, uap->which, &rlim);
 763         PROC_UNLOCK(p);
 764         error = copyout(&rlim, uap->rlp, sizeof(struct rlimit));
 765         return (error);
 766 }
 767
 768 /*
 769  * Transform the running time and tick information for children of proc p
 770  * into user and system time usage.
 771  */
 772 void
 773 calccru(p, up, sp)
 774         struct proc *p;
 775         struct timeval *up;
 776         struct timeval *sp;
 777 {
 778
 779         PROC_LOCK_ASSERT(p, MA_OWNED);
 780         calcru1(p, &p->p_crux, up, sp);
 781 }
 782
 783 /*
 784  * Transform the running time and tick information in proc p into user
 785  * and system time usage.  If appropriate, include the current time slice
 786  * on this CPU.
 787  */
 788 void
 789 calcru(struct proc *p, struct timeval *up, struct timeval *sp)
 790 {
 791         struct rusage_ext rux;
 792         struct thread *td;
 793         uint64_t u;
 794
 795         PROC_LOCK_ASSERT(p, MA_OWNED);
 796         mtx_assert(&sched_lock, MA_NOTOWNED);
 797         mtx_lock_spin(&sched_lock);
 798
 799         /*
 800          * If we are getting stats for the current process, then add in the
 801          * stats that this thread has accumulated in its current time slice.
 802          * We reset the thread and CPU state as if we had performed a context
 803          * switch right here.
 804          */
 805         if (curthread->td_proc == p) {
 806                 td = curthread;
 807                 u = cpu_ticks();
 808                 p->p_rux.rux_runtime += u - PCPU_GET(switchtime);
 809                 PCPU_SET(switchtime, u);
 810                 p->p_rux.rux_uticks += td->td_uticks;
 811                 td->td_uticks = 0;
 812                 p->p_rux.rux_iticks += td->td_iticks;
 813                 td->td_iticks = 0;
 814                 p->p_rux.rux_sticks += td->td_sticks;
 815                 td->td_sticks = 0;
 816         }
 817         /* Work on a copy of p_rux so we can let go of sched_lock */
 818         rux = p->p_rux;
 819         mtx_unlock_spin(&sched_lock);
 820         calcru1(p, &rux, up, sp);
 821         /* Update the result from the p_rux copy */
 822         p->p_rux.rux_uu = rux.rux_uu;
 823         p->p_rux.rux_su = rux.rux_su;
 824         p->p_rux.rux_tu = rux.rux_tu;
 825 }
 826
 827 static void
 828 calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
 829     struct timeval *sp)
 830 {
 831         /* {user, system, interrupt, total} {ticks, usec}: */
 832         u_int64_t ut, uu, st, su, it, tt, tu;
 833
 834         ut = ruxp->rux_uticks;
 835         st = ruxp->rux_sticks;
 836         it = ruxp->rux_iticks;
 837         tt = ut + st + it;
 838         if (tt == 0) {
 839                 /* Avoid divide by zero */
 840                 st = 1;
 841                 tt = 1;
 842         }
 843         tu = cputick2usec(ruxp->rux_runtime);
 844         if ((int64_t)tu < 0) {
 845                 /* XXX: this should be an assert /phk */
 846                 printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
 847                     (intmax_t)tu, p->p_pid, p->p_comm);
 848                 tu = ruxp->rux_tu;
 849         }
 850
 851         if (tu >= ruxp->rux_tu) {
 852                 /*
 853                  * The normal case, time increased.
 854                  * Enforce monotonicity of bucketed numbers.
 855                  */
 856                 uu = (tu * ut) / tt;
 857                 if (uu < ruxp->rux_uu)
 858                         uu = ruxp->rux_uu;
 859                 su = (tu * st) / tt;
 860                 if (su < ruxp->rux_su)
 861                         su = ruxp->rux_su;
 862         } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
 863                 /*
 864                  * When we calibrate the cputicker, it is not uncommon to
 865                  * see the presumably fixed frequency increase slightly over
 866                  * time as a result of thermal stabilization and NTP
 867                  * discipline (of the reference clock).  We therefore ignore
 868                  * a bit of backwards slop because we  expect to catch up
 869                  * shortly.  We use a 3 microsecond limit to catch low
 870                  * counts and a 1% limit for high counts.
 871                  */
 872                 uu = ruxp->rux_uu;
 873                 su = ruxp->rux_su;
 874                 tu = ruxp->rux_tu;
 875         } else { /* tu < ruxp->rux_tu */
 876                 /*
 877                  * What happene here was likely that a laptop, which ran at
 878                  * a reduced clock frequency at boot, kicked into high gear.
 879                  * The wisdom of spamming this message in that case is
 880                  * dubious, but it might also be indicative of something
 881                  * serious, so lets keep it and hope laptops can be made
 882                  * more truthful about their CPU speed via ACPI.
 883                  */
 884                 printf("calcru: runtime went backwards from %ju usec "
 885                     "to %ju usec for pid %d (%s)\n",
 886                     (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
 887                     p->p_pid, p->p_comm);
 888                 uu = (tu * ut) / tt;
 889                 su = (tu * st) / tt;
 890         }
 891
 892         ruxp->rux_uu = uu;
 893         ruxp->rux_su = su;
 894         ruxp->rux_tu = tu;
 895
 896         up->tv_sec = uu / 1000000;
 897         up->tv_usec = uu % 1000000;
 898         sp->tv_sec = su / 1000000;
 899         sp->tv_usec = su % 1000000;
 900 }
 901
 902 #ifndef _SYS_SYSPROTO_H_
 903 struct getrusage_args {
 904         int     who;
 905         struct  rusage *rusage;
 906 };
 907 #endif
 908 int
 909 getrusage(td, uap)
 910         register struct thread *td;
 911         register struct getrusage_args *uap;
 912 {
 913         struct rusage ru;
 914         int error;
 915
 916         error = kern_getrusage(td, uap->who, &ru);
 917         if (error == 0)
 918                 error = copyout(&ru, uap->rusage, sizeof(struct rusage));
 919         return (error);
 920 }
 921
 922 int
 923 kern_getrusage(td, who, rup)
 924         struct thread *td;
 925         int who;
 926         struct rusage *rup;
 927 {
 928         struct proc *p;
 929
 930         p = td->td_proc;
 931         PROC_LOCK(p);
 932         switch (who) {
 933
 934         case RUSAGE_SELF:
 935                 *rup = p->p_stats->p_ru;
 936                 calcru(p, &rup->ru_utime, &rup->ru_stime);
 937                 break;
 938
 939         case RUSAGE_CHILDREN:
 940                 *rup = p->p_stats->p_cru;
 941                 calccru(p, &rup->ru_utime, &rup->ru_stime);
 942                 break;
 943
 944         default:
 945                 PROC_UNLOCK(p);
 946                 return (EINVAL);
 947         }
 948         PROC_UNLOCK(p);
 949         return (0);
 950 }
 951
 952 void
 953 ruadd(ru, rux, ru2, rux2)
 954         struct rusage *ru;
 955         struct rusage_ext *rux;
 956         struct rusage *ru2;
 957         struct rusage_ext *rux2;
 958 {
 959         register long *ip, *ip2;
 960         register int i;
 961
 962         rux->rux_runtime += rux2->rux_runtime;
 963         rux->rux_uticks += rux2->rux_uticks;
 964         rux->rux_sticks += rux2->rux_sticks;
 965         rux->rux_iticks += rux2->rux_iticks;
 966         rux->rux_uu += rux2->rux_uu;
 967         rux->rux_su += rux2->rux_su;
 968         rux->rux_tu += rux2->rux_tu;
 969         if (ru->ru_maxrss < ru2->ru_maxrss)
 970                 ru->ru_maxrss = ru2->ru_maxrss;
 971         ip = &ru->ru_first;
 972         ip2 = &ru2->ru_first;
 973         for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
 974                 *ip++ += *ip2++;
 975 }
 976
 977 /*
 978  * Allocate a new resource limits structure and initialize its
 979  * reference count and mutex pointer.
 980  */
 981 struct plimit *
 982 lim_alloc()
 983 {
 984         struct plimit *limp;
 985
 986         limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK);
 987         refcount_init(&limp->pl_refcnt, 1);
 988         return (limp);
 989 }
 990
 991 struct plimit *
 992 lim_hold(limp)
 993         struct plimit *limp;
 994 {
 995
 996         refcount_acquire(&limp->pl_refcnt);
 997         return (limp);
 998 }
 999
1000 void
1001 lim_free(limp)
1002         struct plimit *limp;
1003 {
1004
1005         KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
1006         if (refcount_release(&limp->pl_refcnt))
1007                 free((void *)limp, M_PLIMIT);
1008 }
1009
1010 /*
1011  * Make a copy of the plimit structure.
1012  * We share these structures copy-on-write after fork.
1013  */
1014 void
1015 lim_copy(dst, src)
1016         struct plimit *dst, *src;
1017 {
1018
1019         KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit"));
1020         bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit));
1021 }
1022
/*
 * Return the hard limit (rlim_max) of process 'p' for the resource whose
 * index into the rlimit array is 'which'.  The value is obtained through
 * lim_rlimit().
 */
rlim_t
lim_max(struct proc *p, int which)
{
        struct rlimit limit;

        lim_rlimit(p, which, &limit);
        return (limit.rlim_max);
}
1035
/*
 * Return the current (soft) limit (rlim_cur) of process 'p' for the
 * resource whose index into the rlimit array is 'which'.  The value is
 * obtained through lim_rlimit().
 */
rlim_t
lim_cur(struct proc *p, int which)
{
        struct rlimit limit;

        lim_rlimit(p, which, &limit);
        return (limit.rlim_cur);
}
1048
1049 /*
1050  * Return a copy of the entire rlimit structure for the system limit
1051  * specified by 'which' in the rlimit structure pointed to by 'rlp'.
1052  */
1053 void
1054 lim_rlimit(struct proc *p, int which, struct rlimit *rlp)
1055 {
1056
1057         PROC_LOCK_ASSERT(p, MA_OWNED);
1058         KASSERT(which >= 0 && which < RLIM_NLIMITS,
1059             ("request for invalid resource limit"));
1060         *rlp = p->p_limit->pl_rlimit[which];
1061         if (p->p_sysent->sv_fixlimit != NULL)
1062                 p->p_sysent->sv_fixlimit(rlp, which);
1063 }
1064
1065 /*
1066  * Find the uidinfo structure for a uid.  This structure is used to
1067  * track the total resource consumption (process count, socket buffer
1068  * size, etc.) for the uid and impose limits.
1069  */
1070 void
1071 uihashinit()
1072 {
1073
1074         uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
1075         mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF);
1076 }
1077
1078 /*
1079  * Look up a uidinfo struct for the parameter uid.
1080  * uihashtbl_mtx must be locked.
1081  */
1082 static struct uidinfo *
1083 uilookup(uid)
1084         uid_t uid;
1085 {
1086         struct uihashhead *uipp;
1087         struct uidinfo *uip;
1088
1089         mtx_assert(&uihashtbl_mtx, MA_OWNED);
1090         uipp = UIHASH(uid);
1091         LIST_FOREACH(uip, uipp, ui_hash)
1092                 if (uip->ui_uid == uid)
1093                         break;
1094
1095         return (uip);
1096 }
1097
1098 /*
1099  * Find or allocate a struct uidinfo for a particular uid.
1100  * Increase refcount on uidinfo struct returned.
1101  * uifree() should be called on a struct uidinfo when released.
1102  */
1103 struct uidinfo *
1104 uifind(uid)
1105         uid_t uid;
1106 {
1107         struct uidinfo *old_uip, *uip;
1108
1109         mtx_lock(&uihashtbl_mtx);
1110         uip = uilookup(uid);
1111         if (uip == NULL) {
1112                 mtx_unlock(&uihashtbl_mtx);
1113                 uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
1114                 mtx_lock(&uihashtbl_mtx);
1115                 /*
1116                  * There's a chance someone created our uidinfo while we
1117                  * were in malloc and not holding the lock, so we have to
1118                  * make sure we don't insert a duplicate uidinfo.
1119                  */
1120                 if ((old_uip = uilookup(uid)) != NULL) {
1121                         /* Someone else beat us to it. */
1122                         free(uip, M_UIDINFO);
1123                         uip = old_uip;
1124                 } else {
1125                         uip->ui_mtxp = mtx_pool_alloc(mtxpool_sleep);
1126                         uip->ui_uid = uid;
1127                         LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
1128                 }
1129         }
1130         uihold(uip);
1131         mtx_unlock(&uihashtbl_mtx);
1132         return (uip);
1133 }
1134
1135 /*
1136  * Place another refcount on a uidinfo struct.
1137  */
1138 void
1139 uihold(uip)
1140         struct uidinfo *uip;
1141 {
1142
1143         UIDINFO_LOCK(uip);
1144         uip->ui_ref++;
1145         UIDINFO_UNLOCK(uip);
1146 }
1147
1148 /*-
1149  * Since uidinfo structs have a long lifetime, we use an
1150  * opportunistic refcounting scheme to avoid locking the lookup hash
1151  * for each release.
1152  *
1153  * If the refcount hits 0, we need to free the structure,
1154  * which means we need to lock the hash.
1155  * Optimal case:
1156  *   After locking the struct and lowering the refcount, if we find
1157  *   that we don't need to free, simply unlock and return.
1158  * Suboptimal case:
1159  *   If refcount lowering results in need to free, bump the count
1160  *   back up, lose the lock and aquire the locks in the proper
1161  *   order to try again.
1162  */
1163 void
1164 uifree(uip)
1165         struct uidinfo *uip;
1166 {
1167
1168         /* Prepare for optimal case. */
1169         UIDINFO_LOCK(uip);
1170
1171         if (--uip->ui_ref != 0) {
1172                 UIDINFO_UNLOCK(uip);
1173                 return;
1174         }
1175
1176         /* Prepare for suboptimal case. */
1177         uip->ui_ref++;
1178         UIDINFO_UNLOCK(uip);
1179         mtx_lock(&uihashtbl_mtx);
1180         UIDINFO_LOCK(uip);
1181
1182         /*
1183          * We must subtract one from the count again because we backed out
1184          * our initial subtraction before dropping the lock.
1185          * Since another thread may have added a reference after we dropped the
1186          * initial lock we have to test for zero again.
1187          */
1188         if (--uip->ui_ref == 0) {
1189                 LIST_REMOVE(uip, ui_hash);
1190                 mtx_unlock(&uihashtbl_mtx);
1191                 if (uip->ui_sbsize != 0)
1192                         printf("freeing uidinfo: uid = %d, sbsize = %jd\n",
1193                             uip->ui_uid, (intmax_t)uip->ui_sbsize);
1194                 if (uip->ui_proccnt != 0)
1195                         printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
1196                             uip->ui_uid, uip->ui_proccnt);
1197                 UIDINFO_UNLOCK(uip);
1198                 FREE(uip, M_UIDINFO);
1199                 return;
1200         }
1201
1202         mtx_unlock(&uihashtbl_mtx);
1203         UIDINFO_UNLOCK(uip);
1204 }
1205
1206 /*
1207  * Change the count associated with number of processes
1208  * a given user is using.  When 'max' is 0, don't enforce a limit
1209  */
1210 int
1211 chgproccnt(uip, diff, max)
1212         struct  uidinfo *uip;
1213         int     diff;
1214         int     max;
1215 {
1216
1217         UIDINFO_LOCK(uip);
1218         /* Don't allow them to exceed max, but allow subtraction. */
1219         if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) {
1220                 UIDINFO_UNLOCK(uip);
1221                 return (0);
1222         }
1223         uip->ui_proccnt += diff;
1224         if (uip->ui_proccnt < 0)
1225                 printf("negative proccnt for uid = %d\n", uip->ui_uid);
1226         UIDINFO_UNLOCK(uip);
1227         return (1);
1228 }
1229
1230 /*
1231  * Change the total socket buffer size a user has used.
1232  */
1233 int
1234 chgsbsize(uip, hiwat, to, max)
1235         struct  uidinfo *uip;
1236         u_int  *hiwat;
1237         u_int   to;
1238         rlim_t  max;
1239 {
1240         rlim_t new;
1241
1242         UIDINFO_LOCK(uip);
1243         new = uip->ui_sbsize + to - *hiwat;
1244         /* Don't allow them to exceed max, but allow subtraction. */
1245         if (to > *hiwat && new > max) {
1246                 UIDINFO_UNLOCK(uip);
1247                 return (0);
1248         }
1249         uip->ui_sbsize = new;
1250         UIDINFO_UNLOCK(uip);
1251         *hiwat = to;
1252         if (new < 0)
1253                 printf("negative sbsize for uid = %d\n", uip->ui_uid);
1254         return (1);
1255 }