sys/kern/kern_resource.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  *      @(#)kern_resource.c     8.5 (Berkeley) 1/21/94
  35  */
  36
  37 #include <sys/cdefs.h>
  38 __FBSDID("$FreeBSD$");
  39
  40 #include "opt_compat.h"
  41
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/sysproto.h>
  45 #include <sys/file.h>
  46 #include <sys/kernel.h>
  47 #include <sys/lock.h>
  48 #include <sys/malloc.h>
  49 #include <sys/mutex.h>
  50 #include <sys/proc.h>
  51 #include <sys/refcount.h>
  52 #include <sys/resourcevar.h>
  53 #include <sys/sched.h>
  54 #include <sys/sx.h>
  55 #include <sys/syscallsubr.h>
  56 #include <sys/sysent.h>
  57 #include <sys/time.h>
  58
  59 #include <vm/vm.h>
  60 #include <vm/vm_param.h>
  61 #include <vm/pmap.h>
  62 #include <vm/vm_map.h>
  63
  64
  65 static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
  66 static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
  67 #define UIHASH(uid)     (&uihashtbl[(uid) & uihash])
  68 static struct mtx uihashtbl_mtx;
  69 static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
  70 static u_long uihash;           /* size of hash table - 1 */
  71
  72 static void     calcru1(struct proc *p, struct rusage_ext *ruxp,
  73                     struct timeval *up, struct timeval *sp);
  74 static int      donice(struct thread *td, struct proc *chgp, int n);
  75 static struct uidinfo *uilookup(uid_t uid);
  76
  77 /*
  78  * Resource controls and accounting.
  79  */
  80
  81 #ifndef _SYS_SYSPROTO_H_
  82 struct getpriority_args {
  83         int     which;
  84         int     who;
  85 };
  86 #endif
  87 /*
  88  * MPSAFE
  89  */
  90 int
  91 getpriority(td, uap)
  92         struct thread *td;
  93         register struct getpriority_args *uap;
  94 {
  95         struct proc *p;
  96         struct pgrp *pg;
  97         int error, low;
  98
  99         error = 0;
 100         low = PRIO_MAX + 1;
 101         switch (uap->which) {
 102
 103         case PRIO_PROCESS:
 104                 if (uap->who == 0)
 105                         low = td->td_proc->p_nice;
 106                 else {
 107                         p = pfind(uap->who);
 108                         if (p == NULL)
 109                                 break;
 110                         if (p_cansee(td, p) == 0)
 111                                 low = p->p_nice;
 112                         PROC_UNLOCK(p);
 113                 }
 114                 break;
 115
 116         case PRIO_PGRP:
 117                 sx_slock(&proctree_lock);
 118                 if (uap->who == 0) {
 119                         pg = td->td_proc->p_pgrp;
 120                         PGRP_LOCK(pg);
 121                 } else {
 122                         pg = pgfind(uap->who);
 123                         if (pg == NULL) {
 124                                 sx_sunlock(&proctree_lock);
 125                                 break;
 126                         }
 127                 }
 128                 sx_sunlock(&proctree_lock);
 129                 LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 130                         PROC_LOCK(p);
 131                         if (!p_cansee(td, p)) {
 132                                 if (p->p_nice < low)
 133                                         low = p->p_nice;
 134                         }
 135                         PROC_UNLOCK(p);
 136                 }
 137                 PGRP_UNLOCK(pg);
 138                 break;
 139
 140         case PRIO_USER:
 141                 if (uap->who == 0)
 142                         uap->who = td->td_ucred->cr_uid;
 143                 sx_slock(&allproc_lock);
 144                 LIST_FOREACH(p, &allproc, p_list) {
 145                         PROC_LOCK(p);
 146                         if (!p_cansee(td, p) &&
 147                             p->p_ucred->cr_uid == uap->who) {
 148                                 if (p->p_nice < low)
 149                                         low = p->p_nice;
 150                         }
 151                         PROC_UNLOCK(p);
 152                 }
 153                 sx_sunlock(&allproc_lock);
 154                 break;
 155
 156         default:
 157                 error = EINVAL;
 158                 break;
 159         }
 160         if (low == PRIO_MAX + 1 && error == 0)
 161                 error = ESRCH;
 162         td->td_retval[0] = low;
 163         return (error);
 164 }
 165
 166 #ifndef _SYS_SYSPROTO_H_
 167 struct setpriority_args {
 168         int     which;
 169         int     who;
 170         int     prio;
 171 };
 172 #endif
 173 /*
 174  * MPSAFE
 175  */
 176 int
 177 setpriority(td, uap)
 178         struct thread *td;
 179         struct setpriority_args *uap;
 180 {
 181         struct proc *curp, *p;
 182         struct pgrp *pg;
 183         int found = 0, error = 0;
 184
 185         curp = td->td_proc;
 186         switch (uap->which) {
 187         case PRIO_PROCESS:
 188                 if (uap->who == 0) {
 189                         PROC_LOCK(curp);
 190                         error = donice(td, curp, uap->prio);
 191                         PROC_UNLOCK(curp);
 192                 } else {
 193                         p = pfind(uap->who);
 194                         if (p == 0)
 195                                 break;
 196                         if (p_cansee(td, p) == 0)
 197                                 error = donice(td, p, uap->prio);
 198                         PROC_UNLOCK(p);
 199                 }
 200                 found++;
 201                 break;
 202
 203         case PRIO_PGRP:
 204                 sx_slock(&proctree_lock);
 205                 if (uap->who == 0) {
 206                         pg = curp->p_pgrp;
 207                         PGRP_LOCK(pg);
 208                 } else {
 209                         pg = pgfind(uap->who);
 210                         if (pg == NULL) {
 211                                 sx_sunlock(&proctree_lock);
 212                                 break;
 213                         }
 214                 }
 215                 sx_sunlock(&proctree_lock);
 216                 LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 217                         PROC_LOCK(p);
 218                         if (!p_cansee(td, p)) {
 219                                 error = donice(td, p, uap->prio);
 220                                 found++;
 221                         }
 222                         PROC_UNLOCK(p);
 223                 }
 224                 PGRP_UNLOCK(pg);
 225                 break;
 226
 227         case PRIO_USER:
 228                 if (uap->who == 0)
 229                         uap->who = td->td_ucred->cr_uid;
 230                 sx_slock(&allproc_lock);
 231                 FOREACH_PROC_IN_SYSTEM(p) {
 232                         PROC_LOCK(p);
 233                         if (p->p_ucred->cr_uid == uap->who &&
 234                             !p_cansee(td, p)) {
 235                                 error = donice(td, p, uap->prio);
 236                                 found++;
 237                         }
 238                         PROC_UNLOCK(p);
 239                 }
 240                 sx_sunlock(&allproc_lock);
 241                 break;
 242
 243         default:
 244                 error = EINVAL;
 245                 break;
 246         }
 247         if (found == 0 && error == 0)
 248                 error = ESRCH;
 249         return (error);
 250 }
 251
 252 /*
 253  * Set "nice" for a (whole) process.
 254  */
 255 static int
 256 donice(struct thread *td, struct proc *p, int n)
 257 {
 258         int error;
 259
 260         PROC_LOCK_ASSERT(p, MA_OWNED);
 261         if ((error = p_cansched(td, p)))
 262                 return (error);
 263         if (n > PRIO_MAX)
 264                 n = PRIO_MAX;
 265         if (n < PRIO_MIN)
 266                 n = PRIO_MIN;
 267         if (n < p->p_nice && suser(td) != 0)
 268                 return (EACCES);
 269         mtx_lock_spin(&sched_lock);
 270         sched_nice(p, n);
 271         mtx_unlock_spin(&sched_lock);
 272         return (0);
 273 }
 274
 275 /*
 276  * Set realtime priority for LWP.
 277  *
 278  * MPSAFE
 279  */
 280 #ifndef _SYS_SYSPROTO_H_
 281 struct rtprio_thread_args {
 282         int             function;
 283         lwpid_t         lwpid;
 284         struct rtprio   *rtp;
 285 };
 286 #endif
 287
 288 int
 289 rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
 290 {
 291         struct proc *curp;
 292         struct proc *p;
 293         struct rtprio rtp;
 294         struct thread *td1;
 295         int cierror, error;
 296
 297         /* Perform copyin before acquiring locks if needed. */
 298         if (uap->function == RTP_SET)
 299                 cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
 300         else
 301                 cierror = 0;
 302
 303         curp = td->td_proc;
 304         /*
 305          * Though lwpid is unique, only current process is supported
 306          * since there is no efficient way to look up a LWP yet.
 307          */
 308         p = curp;
 309         PROC_LOCK(p);
 310
 311         switch (uap->function) {
 312         case RTP_LOOKUP:
 313                 if ((error = p_cansee(td, p)))
 314                         break;
 315                 mtx_lock_spin(&sched_lock);
 316                 if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
 317                         td1 = td;
 318                 else
 319                         td1 = thread_find(p, uap->lwpid);
 320                 if (td1 != NULL)
 321 #ifdef KSE
 322                         pri_to_rtp(td1->td_ksegrp, &rtp);
 323 #else
 324                         pri_to_rtp(td1, &rtp);
 325 #endif
 326                 else
 327                         error = ESRCH;
 328                 mtx_unlock_spin(&sched_lock);
 329                 PROC_UNLOCK(p);
 330                 return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
 331         case RTP_SET:
 332                 if ((error = p_cansched(td, p)) || (error = cierror))
 333                         break;
 334
 335                 /* Disallow setting rtprio in most cases if not superuser. */
 336                 if (suser(td) != 0) {
 337                         /* can't set realtime priority */
 338 /*
 339  * Realtime priority has to be restricted for reasons which should be
 340  * obvious.  However, for idle priority, there is a potential for
 341  * system deadlock if an idleprio process gains a lock on a resource
 342  * that other processes need (and the idleprio process can't run
 343  * due to a CPU-bound normal process).  Fix me!  XXX
 344  */
 345 #if 0
 346                         if (RTP_PRIO_IS_REALTIME(rtp.type)) {
 347 #else
 348                         if (rtp.type != RTP_PRIO_NORMAL) {
 349 #endif
 350                                 error = EPERM;
 351                                 break;
 352                         }
 353                 }
 354
 355                 mtx_lock_spin(&sched_lock);
 356                 if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
 357                         td1 = td;
 358                 else
 359                         td1 = thread_find(p, uap->lwpid);
 360                 if (td1 != NULL)
 361 #ifdef KSE
 362                         error = rtp_to_pri(&rtp, td1->td_ksegrp);
 363 #else
 364                         error = rtp_to_pri(&rtp, td1);
 365 #endif
 366                 else
 367                         error = ESRCH;
 368                 mtx_unlock_spin(&sched_lock);
 369                 break;
 370         default:
 371                 error = EINVAL;
 372                 break;
 373         }
 374         PROC_UNLOCK(p);
 375         return (error);
 376 }
 377
 378 /*
 379  * Set realtime priority.
 380  *
 381  * MPSAFE
 382  */
 383 #ifndef _SYS_SYSPROTO_H_
 384 struct rtprio_args {
 385         int             function;
 386         pid_t           pid;
 387         struct rtprio   *rtp;
 388 };
 389 #endif
 390
 391 int
 392 rtprio(td, uap)
 393         struct thread *td;              /* curthread */
 394         register struct rtprio_args *uap;
 395 {
 396         struct proc *curp;
 397         struct proc *p;
 398 #ifdef KSE
 399         struct ksegrp *kg;
 400 #else
 401         struct thread *tdp;
 402 #endif
 403         struct rtprio rtp;
 404         int cierror, error;
 405
 406         /* Perform copyin before acquiring locks if needed. */
 407         if (uap->function == RTP_SET)
 408                 cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
 409         else
 410                 cierror = 0;
 411
 412         curp = td->td_proc;
 413         if (uap->pid == 0) {
 414                 p = curp;
 415                 PROC_LOCK(p);
 416         } else {
 417                 p = pfind(uap->pid);
 418                 if (p == NULL)
 419                         return (ESRCH);
 420         }
 421
 422         switch (uap->function) {
 423         case RTP_LOOKUP:
 424                 if ((error = p_cansee(td, p)))
 425                         break;
 426                 mtx_lock_spin(&sched_lock);
 427                 /*
 428                  * Return OUR priority if no pid specified,
 429                  * or if one is, report the highest priority
 430                  * in the process.  There isn't much more you can do as
 431                  * there is only room to return a single priority.
 432                  * XXXKSE: maybe need a new interface to report
 433                  * priorities of multiple system scope threads.
 434                  * Note: specifying our own pid is not the same
 435                  * as leaving it zero.
 436                  */
 437                 if (uap->pid == 0) {
 438 #ifdef KSE
 439                         pri_to_rtp(td->td_ksegrp, &rtp);
 440 #else
 441                         pri_to_rtp(td, &rtp);
 442 #endif
 443                 } else {
 444                         struct rtprio rtp2;
 445
 446                         rtp.type = RTP_PRIO_IDLE;
 447                         rtp.prio = RTP_PRIO_MAX;
 448 #ifdef KSE
 449                         FOREACH_KSEGRP_IN_PROC(p, kg) {
 450                                 pri_to_rtp(kg, &rtp2);
 451 #else
 452                         FOREACH_THREAD_IN_PROC(p, tdp) {
 453                                 pri_to_rtp(tdp, &rtp2);
 454 #endif
 455                                 if (rtp2.type <  rtp.type ||
 456                                     (rtp2.type == rtp.type &&
 457                                     rtp2.prio < rtp.prio)) {
 458                                         rtp.type = rtp2.type;
 459                                         rtp.prio = rtp2.prio;
 460                                 }
 461                         }
 462                 }
 463                 mtx_unlock_spin(&sched_lock);
 464                 PROC_UNLOCK(p);
 465                 return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
 466         case RTP_SET:
 467                 if ((error = p_cansched(td, p)) || (error = cierror))
 468                         break;
 469
 470                 /* Disallow setting rtprio in most cases if not superuser. */
 471                 if (suser(td) != 0) {
 472                         /* can't set someone else's */
 473                         if (uap->pid) {
 474                                 error = EPERM;
 475                                 break;
 476                         }
 477                         /* can't set realtime priority */
 478 /*
 479  * Realtime priority has to be restricted for reasons which should be
 480  * obvious.  However, for idle priority, there is a potential for
 481  * system deadlock if an idleprio process gains a lock on a resource
 482  * that other processes need (and the idleprio process can't run
 483  * due to a CPU-bound normal process).  Fix me!  XXX
 484  */
 485 #if 0
 486                         if (RTP_PRIO_IS_REALTIME(rtp.type)) {
 487 #else
 488                         if (rtp.type != RTP_PRIO_NORMAL) {
 489 #endif
 490                                 error = EPERM;
 491                                 break;
 492                         }
 493                 }
 494
 495 #ifdef KSE
 496                 /*
 497                  * If we are setting our own priority, set just our
 498                  * KSEGRP but if we are doing another process,
 499                  * do all the groups on that process. If we
 500                  * specify our own pid we do the latter.
 501                  */
 502 #else
 503                 /*
 504                  * If we are setting our own priority, set just our
 505                  * thread but if we are doing another process,
 506                  * do all the threads on that process. If we
 507                  * specify our own pid we do the latter.
 508                  */
 509 #endif
 510                 mtx_lock_spin(&sched_lock);
 511                 if (uap->pid == 0) {
 512 #ifdef KSE
 513                         error = rtp_to_pri(&rtp, td->td_ksegrp);
 514 #else
 515                         error = rtp_to_pri(&rtp, td);
 516 #endif
 517                 } else {
 518 #ifdef KSE
 519                         FOREACH_KSEGRP_IN_PROC(p, kg) {
 520                                 if ((error = rtp_to_pri(&rtp, kg)) != 0) {
 521                                         break;
 522                                 }
 523 #else
 524                         FOREACH_THREAD_IN_PROC(p, td) {
 525                                 if ((error = rtp_to_pri(&rtp, td)) != 0)
 526                                         break;
 527 #endif
 528                         }
 529                 }
 530                 mtx_unlock_spin(&sched_lock);
 531                 break;
 532         default:
 533                 error = EINVAL;
 534                 break;
 535         }
 536         PROC_UNLOCK(p);
 537         return (error);
 538 }
 539
 540 int
 541 #ifdef KSE
 542 rtp_to_pri(struct rtprio *rtp, struct ksegrp *kg)
 543 #else
 544 rtp_to_pri(struct rtprio *rtp, struct thread *td)
 545 #endif
 546 {
 547
 548         mtx_assert(&sched_lock, MA_OWNED);
 549         if (rtp->prio > RTP_PRIO_MAX)
 550                 return (EINVAL);
 551         switch (RTP_PRIO_BASE(rtp->type)) {
 552         case RTP_PRIO_REALTIME:
 553 #ifdef KSE
 554                 kg->kg_user_pri = PRI_MIN_REALTIME + rtp->prio;
 555 #else
 556                 td->td_user_pri = PRI_MIN_REALTIME + rtp->prio;
 557 #endif
 558                 break;
 559         case RTP_PRIO_NORMAL:
 560 #ifdef KSE
 561                 kg->kg_user_pri = PRI_MIN_TIMESHARE + rtp->prio;
 562 #else
 563                 td->td_user_pri = PRI_MIN_TIMESHARE + rtp->prio;
 564 #endif
 565                 break;
 566         case RTP_PRIO_IDLE:
 567 #ifdef KSE
 568                 kg->kg_user_pri = PRI_MIN_IDLE + rtp->prio;
 569 #else
 570                 td->td_user_pri = PRI_MIN_IDLE + rtp->prio;
 571 #endif
 572                 break;
 573         default:
 574                 return (EINVAL);
 575         }
 576 #ifdef KSE
 577         sched_class(kg, rtp->type);
 578         if (curthread->td_ksegrp == kg) {
 579                 sched_prio(curthread, kg->kg_user_pri); /* XXX dubious */
 580         }
 581 #else
 582         sched_class(td, rtp->type);     /* XXX fix */
 583         if (curthread == td)
 584                 sched_prio(curthread, td->td_user_pri); /* XXX dubious */
 585 #endif
 586         return (0);
 587 }
 588
 589 void
 590 #ifdef KSE
 591 pri_to_rtp(struct ksegrp *kg, struct rtprio *rtp)
 592 #else
 593 pri_to_rtp(struct thread *td, struct rtprio *rtp)
 594 #endif
 595 {
 596
 597         mtx_assert(&sched_lock, MA_OWNED);
 598 #ifdef KSE
 599         switch (PRI_BASE(kg->kg_pri_class)) {
 600 #else
 601         switch (PRI_BASE(td->td_pri_class)) {
 602 #endif
 603         case PRI_REALTIME:
 604 #ifdef KSE
 605                 rtp->prio = kg->kg_user_pri - PRI_MIN_REALTIME;
 606 #else
 607                 rtp->prio = td->td_user_pri - PRI_MIN_REALTIME;
 608 #endif
 609                 break;
 610         case PRI_TIMESHARE:
 611 #ifdef KSE
 612                 rtp->prio = kg->kg_user_pri - PRI_MIN_TIMESHARE;
 613 #else
 614                 rtp->prio = td->td_user_pri - PRI_MIN_TIMESHARE;
 615 #endif
 616                 break;
 617         case PRI_IDLE:
 618 #ifdef KSE
 619                 rtp->prio = kg->kg_user_pri - PRI_MIN_IDLE;
 620 #else
 621                 rtp->prio = td->td_user_pri - PRI_MIN_IDLE;
 622 #endif
 623                 break;
 624         default:
 625                 break;
 626         }
 627 #ifdef KSE
 628         rtp->type = kg->kg_pri_class;
 629 #else
 630         rtp->type = td->td_pri_class;
 631 #endif
 632 }
 633
 634 #if defined(COMPAT_43)
 635 #ifndef _SYS_SYSPROTO_H_
 636 struct osetrlimit_args {
 637         u_int   which;
 638         struct  orlimit *rlp;
 639 };
 640 #endif
 641 /*
 642  * MPSAFE
 643  */
 644 int
 645 osetrlimit(td, uap)
 646         struct thread *td;
 647         register struct osetrlimit_args *uap;
 648 {
 649         struct orlimit olim;
 650         struct rlimit lim;
 651         int error;
 652
 653         if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
 654                 return (error);
 655         lim.rlim_cur = olim.rlim_cur;
 656         lim.rlim_max = olim.rlim_max;
 657         error = kern_setrlimit(td, uap->which, &lim);
 658         return (error);
 659 }
 660
 661 #ifndef _SYS_SYSPROTO_H_
 662 struct ogetrlimit_args {
 663         u_int   which;
 664         struct  orlimit *rlp;
 665 };
 666 #endif
 667 /*
 668  * MPSAFE
 669  */
 670 int
 671 ogetrlimit(td, uap)
 672         struct thread *td;
 673         register struct ogetrlimit_args *uap;
 674 {
 675         struct orlimit olim;
 676         struct rlimit rl;
 677         struct proc *p;
 678         int error;
 679
 680         if (uap->which >= RLIM_NLIMITS)
 681                 return (EINVAL);
 682         p = td->td_proc;
 683         PROC_LOCK(p);
 684         lim_rlimit(p, uap->which, &rl);
 685         PROC_UNLOCK(p);
 686
 687         /*
 688          * XXX would be more correct to convert only RLIM_INFINITY to the
 689          * old RLIM_INFINITY and fail with EOVERFLOW for other larger
 690          * values.  Most 64->32 and 32->16 conversions, including not
 691          * unimportant ones of uids are even more broken than what we
 692          * do here (they blindly truncate).  We don't do this correctly
 693          * here since we have little experience with EOVERFLOW yet.
 694          * Elsewhere, getuid() can't fail...
 695          */
 696         olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur;
 697         olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max;
 698         error = copyout(&olim, uap->rlp, sizeof(olim));
 699         return (error);
 700 }
 701 #endif /* COMPAT_43 */
 702
 703 #ifndef _SYS_SYSPROTO_H_
 704 struct __setrlimit_args {
 705         u_int   which;
 706         struct  rlimit *rlp;
 707 };
 708 #endif
 709 /*
 710  * MPSAFE
 711  */
 712 int
 713 setrlimit(td, uap)
 714         struct thread *td;
 715         register struct __setrlimit_args *uap;
 716 {
 717         struct rlimit alim;
 718         int error;
 719
 720         if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit))))
 721                 return (error);
 722         error = kern_setrlimit(td, uap->which, &alim);
 723         return (error);
 724 }
 725
 726 int
 727 kern_setrlimit(td, which, limp)
 728         struct thread *td;
 729         u_int which;
 730         struct rlimit *limp;
 731 {
 732         struct plimit *newlim, *oldlim;
 733         struct proc *p;
 734         register struct rlimit *alimp;
 735         rlim_t oldssiz;
 736         int error;
 737
 738         if (which >= RLIM_NLIMITS)
 739                 return (EINVAL);
 740
 741         /*
 742          * Preserve historical bugs by treating negative limits as unsigned.
 743          */
 744         if (limp->rlim_cur < 0)
 745                 limp->rlim_cur = RLIM_INFINITY;
 746         if (limp->rlim_max < 0)
 747                 limp->rlim_max = RLIM_INFINITY;
 748
 749         oldssiz = 0;
 750         p = td->td_proc;
 751         newlim = lim_alloc();
 752         PROC_LOCK(p);
 753         oldlim = p->p_limit;
 754         alimp = &oldlim->pl_rlimit[which];
 755         if (limp->rlim_cur > alimp->rlim_max ||
 756             limp->rlim_max > alimp->rlim_max)
 757                 if ((error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL))) {
 758                         PROC_UNLOCK(p);
 759                         lim_free(newlim);
 760                         return (error);
 761                 }
 762         if (limp->rlim_cur > limp->rlim_max)
 763                 limp->rlim_cur = limp->rlim_max;
 764         lim_copy(newlim, oldlim);
 765         alimp = &newlim->pl_rlimit[which];
 766
 767         switch (which) {
 768
 769         case RLIMIT_CPU:
 770                 mtx_lock_spin(&sched_lock);
 771                 p->p_cpulimit = limp->rlim_cur;
 772                 mtx_unlock_spin(&sched_lock);
 773                 break;
 774         case RLIMIT_DATA:
 775                 if (limp->rlim_cur > maxdsiz)
 776                         limp->rlim_cur = maxdsiz;
 777                 if (limp->rlim_max > maxdsiz)
 778                         limp->rlim_max = maxdsiz;
 779                 break;
 780
 781         case RLIMIT_STACK:
 782                 if (limp->rlim_cur > maxssiz)
 783                         limp->rlim_cur = maxssiz;
 784                 if (limp->rlim_max > maxssiz)
 785                         limp->rlim_max = maxssiz;
 786                 oldssiz = alimp->rlim_cur;
 787                 break;
 788
 789         case RLIMIT_NOFILE:
 790                 if (limp->rlim_cur > maxfilesperproc)
 791                         limp->rlim_cur = maxfilesperproc;
 792                 if (limp->rlim_max > maxfilesperproc)
 793                         limp->rlim_max = maxfilesperproc;
 794                 break;
 795
 796         case RLIMIT_NPROC:
 797                 if (limp->rlim_cur > maxprocperuid)
 798                         limp->rlim_cur = maxprocperuid;
 799                 if (limp->rlim_max > maxprocperuid)
 800                         limp->rlim_max = maxprocperuid;
 801                 if (limp->rlim_cur < 1)
 802                         limp->rlim_cur = 1;
 803                 if (limp->rlim_max < 1)
 804                         limp->rlim_max = 1;
 805                 break;
 806         }
 807         *alimp = *limp;
 808         p->p_limit = newlim;
 809         PROC_UNLOCK(p);
 810         lim_free(oldlim);
 811
 812         if (which == RLIMIT_STACK) {
 813                 /*
 814                  * Stack is allocated to the max at exec time with only
 815                  * "rlim_cur" bytes accessible.  If stack limit is going
 816                  * up make more accessible, if going down make inaccessible.
 817                  */
 818                 if (limp->rlim_cur != oldssiz) {
 819                         vm_offset_t addr;
 820                         vm_size_t size;
 821                         vm_prot_t prot;
 822
 823                         if (limp->rlim_cur > oldssiz) {
 824                                 prot = p->p_sysent->sv_stackprot;
 825                                 size = limp->rlim_cur - oldssiz;
 826                                 addr = p->p_sysent->sv_usrstack -
 827                                     limp->rlim_cur;
 828                         } else {
 829                                 prot = VM_PROT_NONE;
 830                                 size = oldssiz - limp->rlim_cur;
 831                                 addr = p->p_sysent->sv_usrstack - oldssiz;
 832                         }
 833                         addr = trunc_page(addr);
 834                         size = round_page(size);
 835                         (void)vm_map_protect(&p->p_vmspace->vm_map,
 836                             addr, addr + size, prot, FALSE);
 837                 }
 838         }
 839
 840         /*
 841          * The data size limit may need to be changed to a value
 842          * that makes sense for the 32 bit binary.
 843          */
 844         if (p->p_sysent->sv_fixlimits != NULL)
 845                 p->p_sysent->sv_fixlimits(p);
 846         return (0);
 847 }
 848
 849 #ifndef _SYS_SYSPROTO_H_
 850 struct __getrlimit_args {
 851         u_int   which;
 852         struct  rlimit *rlp;
 853 };
 854 #endif
 855 /*
 856  * MPSAFE
 857  */
 858 /* ARGSUSED */
 859 int
 860 getrlimit(td, uap)
 861         struct thread *td;
 862         register struct __getrlimit_args *uap;
 863 {
 864         struct rlimit rlim;
 865         struct proc *p;
 866         int error;
 867
 868         if (uap->which >= RLIM_NLIMITS)
 869                 return (EINVAL);
 870         p = td->td_proc;
 871         PROC_LOCK(p);
 872         lim_rlimit(p, uap->which, &rlim);
 873         PROC_UNLOCK(p);
 874         error = copyout(&rlim, uap->rlp, sizeof(struct rlimit));
 875         return (error);
 876 }
 877
 878 /*
 879  * Transform the running time and tick information for children of proc p
 880  * into user and system time usage.
 881  */
 882 void
 883 calccru(p, up, sp)
 884         struct proc *p;
 885         struct timeval *up;
 886         struct timeval *sp;
 887 {
 888
 889         PROC_LOCK_ASSERT(p, MA_OWNED);
 890         calcru1(p, &p->p_crux, up, sp);
 891 }
 892
 893 /*
 894  * Transform the running time and tick information in proc p into user
 895  * and system time usage.  If appropriate, include the current time slice
 896  * on this CPU.
 897  */
 898 void
 899 calcru(struct proc *p, struct timeval *up, struct timeval *sp)
 900 {
 901         struct rusage_ext rux;
 902         struct thread *td;
 903         uint64_t u;
 904
 905         PROC_LOCK_ASSERT(p, MA_OWNED);
 906         mtx_assert(&sched_lock, MA_NOTOWNED);
 907         mtx_lock_spin(&sched_lock);
 908
 909         /*
 910          * If we are getting stats for the current process, then add in the
 911          * stats that this thread has accumulated in its current time slice.
 912          * We reset the thread and CPU state as if we had performed a context
 913          * switch right here.
 914          */
 915         if (curthread->td_proc == p) {
 916                 td = curthread;
 917                 u = cpu_ticks();
 918                 p->p_rux.rux_runtime += u - PCPU_GET(switchtime);
 919                 PCPU_SET(switchtime, u);
 920                 p->p_rux.rux_uticks += td->td_uticks;
 921                 td->td_uticks = 0;
 922                 p->p_rux.rux_iticks += td->td_iticks;
 923                 td->td_iticks = 0;
 924                 p->p_rux.rux_sticks += td->td_sticks;
 925                 td->td_sticks = 0;
 926         }
 927         /* Work on a copy of p_rux so we can let go of sched_lock */
 928         rux = p->p_rux;
 929         mtx_unlock_spin(&sched_lock);
 930         calcru1(p, &rux, up, sp);
 931         /* Update the result from the p_rux copy */
 932         p->p_rux.rux_uu = rux.rux_uu;
 933         p->p_rux.rux_su = rux.rux_su;
 934         p->p_rux.rux_tu = rux.rux_tu;
 935 }
 936
 937 static void
 938 calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
 939     struct timeval *sp)
 940 {
 941         /* {user, system, interrupt, total} {ticks, usec}: */
 942         u_int64_t ut, uu, st, su, it, tt, tu;
 943
 944         ut = ruxp->rux_uticks;
 945         st = ruxp->rux_sticks;
 946         it = ruxp->rux_iticks;
 947         tt = ut + st + it;
 948         if (tt == 0) {
 949                 /* Avoid divide by zero */
 950                 st = 1;
 951                 tt = 1;
 952         }
 953         tu = cputick2usec(ruxp->rux_runtime);
 954         if ((int64_t)tu < 0) {
 955                 /* XXX: this should be an assert /phk */
 956                 printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
 957                     (intmax_t)tu, p->p_pid, p->p_comm);
 958                 tu = ruxp->rux_tu;
 959         }
 960
 961         if (tu >= ruxp->rux_tu) {
 962                 /*
 963                  * The normal case, time increased.
 964                  * Enforce monotonicity of bucketed numbers.
 965                  */
 966                 uu = (tu * ut) / tt;
 967                 if (uu < ruxp->rux_uu)
 968                         uu = ruxp->rux_uu;
 969                 su = (tu * st) / tt;
 970                 if (su < ruxp->rux_su)
 971                         su = ruxp->rux_su;
 972         } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
 973                 /*
 974                  * When we calibrate the cputicker, it is not uncommon to
 975                  * see the presumably fixed frequency increase slightly over
 976                  * time as a result of thermal stabilization and NTP
 977                  * discipline (of the reference clock).  We therefore ignore
 978                  * a bit of backwards slop because we  expect to catch up
 979                  * shortly.  We use a 3 microsecond limit to catch low
 980                  * counts and a 1% limit for high counts.
 981                  */
 982                 uu = ruxp->rux_uu;
 983                 su = ruxp->rux_su;
 984                 tu = ruxp->rux_tu;
 985         } else { /* tu < ruxp->rux_tu */
 986                 /*
 987                  * What happene here was likely that a laptop, which ran at
 988                  * a reduced clock frequency at boot, kicked into high gear.
 989                  * The wisdom of spamming this message in that case is
 990                  * dubious, but it might also be indicative of something
 991                  * serious, so lets keep it and hope laptops can be made
 992                  * more truthful about their CPU speed via ACPI.
 993                  */
 994                 printf("calcru: runtime went backwards from %ju usec "
 995                     "to %ju usec for pid %d (%s)\n",
 996                     (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
 997                     p->p_pid, p->p_comm);
 998                 uu = (tu * ut) / tt;
 999                 su = (tu * st) / tt;
1000         }
1001
1002         ruxp->rux_uu = uu;
1003         ruxp->rux_su = su;
1004         ruxp->rux_tu = tu;
1005
1006         up->tv_sec = uu / 1000000;
1007         up->tv_usec = uu % 1000000;
1008         sp->tv_sec = su / 1000000;
1009         sp->tv_usec = su % 1000000;
1010 }
1011
1012 #ifndef _SYS_SYSPROTO_H_
1013 struct getrusage_args {
1014         int     who;
1015         struct  rusage *rusage;
1016 };
1017 #endif
1018 /*
1019  * MPSAFE
1020  */
1021 int
1022 getrusage(td, uap)
1023         register struct thread *td;
1024         register struct getrusage_args *uap;
1025 {
1026         struct rusage ru;
1027         int error;
1028
1029         error = kern_getrusage(td, uap->who, &ru);
1030         if (error == 0)
1031                 error = copyout(&ru, uap->rusage, sizeof(struct rusage));
1032         return (error);
1033 }
1034
1035 int
1036 kern_getrusage(td, who, rup)
1037         struct thread *td;
1038         int who;
1039         struct rusage *rup;
1040 {
1041         struct proc *p;
1042
1043         p = td->td_proc;
1044         PROC_LOCK(p);
1045         switch (who) {
1046
1047         case RUSAGE_SELF:
1048                 *rup = p->p_stats->p_ru;
1049                 calcru(p, &rup->ru_utime, &rup->ru_stime);
1050                 break;
1051
1052         case RUSAGE_CHILDREN:
1053                 *rup = p->p_stats->p_cru;
1054                 calccru(p, &rup->ru_utime, &rup->ru_stime);
1055                 break;
1056
1057         default:
1058                 PROC_UNLOCK(p);
1059                 return (EINVAL);
1060         }
1061         PROC_UNLOCK(p);
1062         return (0);
1063 }
1064
1065 void
1066 ruadd(ru, rux, ru2, rux2)
1067         struct rusage *ru;
1068         struct rusage_ext *rux;
1069         struct rusage *ru2;
1070         struct rusage_ext *rux2;
1071 {
1072         register long *ip, *ip2;
1073         register int i;
1074
1075         rux->rux_runtime += rux2->rux_runtime;
1076         rux->rux_uticks += rux2->rux_uticks;
1077         rux->rux_sticks += rux2->rux_sticks;
1078         rux->rux_iticks += rux2->rux_iticks;
1079         rux->rux_uu += rux2->rux_uu;
1080         rux->rux_su += rux2->rux_su;
1081         rux->rux_tu += rux2->rux_tu;
1082         if (ru->ru_maxrss < ru2->ru_maxrss)
1083                 ru->ru_maxrss = ru2->ru_maxrss;
1084         ip = &ru->ru_first;
1085         ip2 = &ru2->ru_first;
1086         for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
1087                 *ip++ += *ip2++;
1088 }
1089
1090 /*
1091  * Allocate a new resource limits structure and initialize its
1092  * reference count and mutex pointer.
1093  */
1094 struct plimit *
1095 lim_alloc()
1096 {
1097         struct plimit *limp;
1098
1099         limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK);
1100         refcount_init(&limp->pl_refcnt, 1);
1101         return (limp);
1102 }
1103
1104 struct plimit *
1105 lim_hold(limp)
1106         struct plimit *limp;
1107 {
1108
1109         refcount_acquire(&limp->pl_refcnt);
1110         return (limp);
1111 }
1112
1113 void
1114 lim_free(limp)
1115         struct plimit *limp;
1116 {
1117
1118         KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
1119         if (refcount_release(&limp->pl_refcnt))
1120                 free((void *)limp, M_PLIMIT);
1121 }
1122
1123 /*
1124  * Make a copy of the plimit structure.
1125  * We share these structures copy-on-write after fork.
1126  */
1127 void
1128 lim_copy(dst, src)
1129         struct plimit *dst, *src;
1130 {
1131
1132         KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit"));
1133         bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit));
1134 }
1135
1136 /*
1137  * Return the hard limit for a particular system resource.  The
1138  * which parameter specifies the index into the rlimit array.
1139  */
1140 rlim_t
1141 lim_max(struct proc *p, int which)
1142 {
1143         struct rlimit rl;
1144
1145         lim_rlimit(p, which, &rl);
1146         return (rl.rlim_max);
1147 }
1148
1149 /*
1150  * Return the current (soft) limit for a particular system resource.
1151  * The which parameter which specifies the index into the rlimit array
1152  */
1153 rlim_t
1154 lim_cur(struct proc *p, int which)
1155 {
1156         struct rlimit rl;
1157
1158         lim_rlimit(p, which, &rl);
1159         return (rl.rlim_cur);
1160 }
1161
1162 /*
1163  * Return a copy of the entire rlimit structure for the system limit
1164  * specified by 'which' in the rlimit structure pointed to by 'rlp'.
1165  */
1166 void
1167 lim_rlimit(struct proc *p, int which, struct rlimit *rlp)
1168 {
1169
1170         PROC_LOCK_ASSERT(p, MA_OWNED);
1171         KASSERT(which >= 0 && which < RLIM_NLIMITS,
1172             ("request for invalid resource limit"));
1173         *rlp = p->p_limit->pl_rlimit[which];
1174 }
1175
1176 /*
1177  * Find the uidinfo structure for a uid.  This structure is used to
1178  * track the total resource consumption (process count, socket buffer
1179  * size, etc.) for the uid and impose limits.
1180  */
1181 void
1182 uihashinit()
1183 {
1184
1185         uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
1186         mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF);
1187 }
1188
1189 /*
1190  * Look up a uidinfo struct for the parameter uid.
1191  * uihashtbl_mtx must be locked.
1192  */
1193 static struct uidinfo *
1194 uilookup(uid)
1195         uid_t uid;
1196 {
1197         struct uihashhead *uipp;
1198         struct uidinfo *uip;
1199
1200         mtx_assert(&uihashtbl_mtx, MA_OWNED);
1201         uipp = UIHASH(uid);
1202         LIST_FOREACH(uip, uipp, ui_hash)
1203                 if (uip->ui_uid == uid)
1204                         break;
1205
1206         return (uip);
1207 }
1208
1209 /*
1210  * Find or allocate a struct uidinfo for a particular uid.
1211  * Increase refcount on uidinfo struct returned.
1212  * uifree() should be called on a struct uidinfo when released.
1213  */
1214 struct uidinfo *
1215 uifind(uid)
1216         uid_t uid;
1217 {
1218         struct uidinfo *old_uip, *uip;
1219
1220         mtx_lock(&uihashtbl_mtx);
1221         uip = uilookup(uid);
1222         if (uip == NULL) {
1223                 mtx_unlock(&uihashtbl_mtx);
1224                 uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
1225                 mtx_lock(&uihashtbl_mtx);
1226                 /*
1227                  * There's a chance someone created our uidinfo while we
1228                  * were in malloc and not holding the lock, so we have to
1229                  * make sure we don't insert a duplicate uidinfo.
1230                  */
1231                 if ((old_uip = uilookup(uid)) != NULL) {
1232                         /* Someone else beat us to it. */
1233                         free(uip, M_UIDINFO);
1234                         uip = old_uip;
1235                 } else {
1236                         uip->ui_mtxp = mtx_pool_alloc(mtxpool_sleep);
1237                         uip->ui_uid = uid;
1238                         LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
1239                 }
1240         }
1241         uihold(uip);
1242         mtx_unlock(&uihashtbl_mtx);
1243         return (uip);
1244 }
1245
1246 /*
1247  * Place another refcount on a uidinfo struct.
1248  */
1249 void
1250 uihold(uip)
1251         struct uidinfo *uip;
1252 {
1253
1254         UIDINFO_LOCK(uip);
1255         uip->ui_ref++;
1256         UIDINFO_UNLOCK(uip);
1257 }
1258
1259 /*-
1260  * Since uidinfo structs have a long lifetime, we use an
1261  * opportunistic refcounting scheme to avoid locking the lookup hash
1262  * for each release.
1263  *
1264  * If the refcount hits 0, we need to free the structure,
1265  * which means we need to lock the hash.
1266  * Optimal case:
1267  *   After locking the struct and lowering the refcount, if we find
1268  *   that we don't need to free, simply unlock and return.
1269  * Suboptimal case:
1270  *   If refcount lowering results in need to free, bump the count
1271  *   back up, lose the lock and aquire the locks in the proper
1272  *   order to try again.
1273  */
1274 void
1275 uifree(uip)
1276         struct uidinfo *uip;
1277 {
1278
1279         /* Prepare for optimal case. */
1280         UIDINFO_LOCK(uip);
1281
1282         if (--uip->ui_ref != 0) {
1283                 UIDINFO_UNLOCK(uip);
1284                 return;
1285         }
1286
1287         /* Prepare for suboptimal case. */
1288         uip->ui_ref++;
1289         UIDINFO_UNLOCK(uip);
1290         mtx_lock(&uihashtbl_mtx);
1291         UIDINFO_LOCK(uip);
1292
1293         /*
1294          * We must subtract one from the count again because we backed out
1295          * our initial subtraction before dropping the lock.
1296          * Since another thread may have added a reference after we dropped the
1297          * initial lock we have to test for zero again.
1298          */
1299         if (--uip->ui_ref == 0) {
1300                 LIST_REMOVE(uip, ui_hash);
1301                 mtx_unlock(&uihashtbl_mtx);
1302                 if (uip->ui_sbsize != 0)
1303                         printf("freeing uidinfo: uid = %d, sbsize = %jd\n",
1304                             uip->ui_uid, (intmax_t)uip->ui_sbsize);
1305                 if (uip->ui_proccnt != 0)
1306                         printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
1307                             uip->ui_uid, uip->ui_proccnt);
1308                 UIDINFO_UNLOCK(uip);
1309                 FREE(uip, M_UIDINFO);
1310                 return;
1311         }
1312
1313         mtx_unlock(&uihashtbl_mtx);
1314         UIDINFO_UNLOCK(uip);
1315 }
1316
1317 /*
1318  * Change the count associated with number of processes
1319  * a given user is using.  When 'max' is 0, don't enforce a limit
1320  */
1321 int
1322 chgproccnt(uip, diff, max)
1323         struct  uidinfo *uip;
1324         int     diff;
1325         int     max;
1326 {
1327
1328         UIDINFO_LOCK(uip);
1329         /* Don't allow them to exceed max, but allow subtraction. */
1330         if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) {
1331                 UIDINFO_UNLOCK(uip);
1332                 return (0);
1333         }
1334         uip->ui_proccnt += diff;
1335         if (uip->ui_proccnt < 0)
1336                 printf("negative proccnt for uid = %d\n", uip->ui_uid);
1337         UIDINFO_UNLOCK(uip);
1338         return (1);
1339 }
1340
1341 /*
1342  * Change the total socket buffer size a user has used.
1343  */
1344 int
1345 chgsbsize(uip, hiwat, to, max)
1346         struct  uidinfo *uip;
1347         u_int  *hiwat;
1348         u_int   to;
1349         rlim_t  max;
1350 {
1351         rlim_t new;
1352
1353         UIDINFO_LOCK(uip);
1354         new = uip->ui_sbsize + to - *hiwat;
1355         /* Don't allow them to exceed max, but allow subtraction. */
1356         if (to > *hiwat && new > max) {
1357                 UIDINFO_UNLOCK(uip);
1358                 return (0);
1359         }
1360         uip->ui_sbsize = new;
1361         UIDINFO_UNLOCK(uip);
1362         *hiwat = to;
1363         if (new < 0)
1364                 printf("negative sbsize for uid = %d\n", uip->ui_uid);
1365         return (1);
1366 }