]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/kern/kern_rctl.c
Import DTS files for arm, arm64, riscv from Linux 5.8
[FreeBSD/FreeBSD.git] / sys / kern / kern_rctl.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010 The FreeBSD Foundation
5  * All rights reserved.
6  *
7  * This software was developed by Edward Tomasz Napierala under sponsorship
8  * from the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  * $FreeBSD$
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/param.h>
38 #include <sys/bus.h>
39 #include <sys/malloc.h>
40 #include <sys/queue.h>
41 #include <sys/refcount.h>
42 #include <sys/jail.h>
43 #include <sys/kernel.h>
44 #include <sys/limits.h>
45 #include <sys/loginclass.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/racct.h>
49 #include <sys/rctl.h>
50 #include <sys/resourcevar.h>
51 #include <sys/sx.h>
52 #include <sys/sysent.h>
53 #include <sys/sysproto.h>
54 #include <sys/systm.h>
55 #include <sys/types.h>
56 #include <sys/eventhandler.h>
57 #include <sys/lock.h>
58 #include <sys/mutex.h>
59 #include <sys/rwlock.h>
60 #include <sys/sbuf.h>
61 #include <sys/taskqueue.h>
62 #include <sys/tree.h>
63 #include <vm/uma.h>
64
65 #ifdef RCTL
66 #ifndef RACCT
67 #error "The RCTL option requires the RACCT option"
68 #endif
69
70 FEATURE(rctl, "Resource Limits");
71
#define HRF_DEFAULT             0
#define HRF_DONT_INHERIT        1
#define HRF_DONT_ACCUMULATE     2

/*
 * Buffer sizes.  The replacement lists are parenthesized so that the
 * macros keep their value when expanded inside a larger expression
 * (e.g. "len / RCTL_MAX_INBUFSIZE").
 */
#define RCTL_MAX_INBUFSIZE      (4 * 1024)
#define RCTL_MAX_OUTBUFSIZE     (16 * 1024 * 1024)
#define RCTL_LOG_BUFSIZE        128

/* Amount subtracted from the available %cpu in rctl_pcpu_available(). */
#define RCTL_PCPU_SHIFT         (10 * 1000000)
81
/*
 * Knobs exported under kern.racct.rctl.  The two rate limits are handed to
 * ppsratecheck() in rctl_enforce() to bound how often log and devctl
 * messages are emitted.
 *
 * NOTE(review): rctl_log_rate_limit and rctl_devctl_rate_limit are declared
 * as int but exported with SYSCTL_UINT below - confirm this is intended.
 */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Values below are initialized in rctl_init().
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;

/* Sysctl handlers that validate a new value before storing it. */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
/*
 * NOTE(review): log_rate_limit uses CTLFLAG_RW while devctl_rate_limit uses
 * CTLFLAG_RWTUN - confirm whether the difference is deliberate.
 */
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
/*
 * The throttle_* knobs go through handler functions rather than plain
 * SYSCTL_INT so that out-of-range values can be rejected with EINVAL.
 */
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
127
/*
 * 'rctl_rule_link' connects a rule with every racct it's related to.
 * For example, rule 'user:X:openfiles:deny=N/process' is linked
 * with uidinfo for user X, and to each process of that user.
 */
struct rctl_rule_link {
        /* Linkage on the owning racct's r_rule_links list. */
        LIST_ENTRY(rctl_rule_link)      rrl_next;
        /* The rule this link attaches to the racct. */
        struct rctl_rule                *rrl_rule;
        /*
         * Set to 1 by rctl_enforce() once the rule's log, devctl or signal
         * action has fired, and reset to 0 when the limit is no longer
         * exceeded, so that the action is not repeated on every check.
         */
        int                             rrl_exceeded;
};
138
/*
 * One entry of a name <-> value lookup table.  The tables built from this
 * type end with an entry whose d_name is NULL; the lookup loops stop there.
 */
struct dict {
        const char      *d_name;        /* textual name */
        int             d_value;        /* matching numeric constant */
};
143
/*
 * Subject type names and their RCTL_SUBJECT_TYPE_* values.  Terminated by
 * a NULL d_name entry.
 */
static struct dict subjectnames[] = {
        { "process", RCTL_SUBJECT_TYPE_PROCESS },
        { "user", RCTL_SUBJECT_TYPE_USER },
        { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
        { "jail", RCTL_SUBJECT_TYPE_JAIL },
        { NULL, -1 }};
150
/*
 * Resource names and their RACCT_* values.  Terminated by a NULL d_name
 * entry.
 */
static struct dict resourcenames[] = {
        { "cputime", RACCT_CPU },
        { "datasize", RACCT_DATA },
        { "stacksize", RACCT_STACK },
        { "coredumpsize", RACCT_CORE },
        { "memoryuse", RACCT_RSS },
        { "memorylocked", RACCT_MEMLOCK },
        { "maxproc", RACCT_NPROC },
        { "openfiles", RACCT_NOFILE },
        { "vmemoryuse", RACCT_VMEM },
        { "pseudoterminals", RACCT_NPTS },
        { "swapuse", RACCT_SWAP },
        { "nthr", RACCT_NTHR },
        { "msgqqueued", RACCT_MSGQQUEUED },
        { "msgqsize", RACCT_MSGQSIZE },
        { "nmsgq", RACCT_NMSGQ },
        { "nsem", RACCT_NSEM },
        { "nsemop", RACCT_NSEMOP },
        { "nshm", RACCT_NSHM },
        { "shmsize", RACCT_SHMSIZE },
        { "wallclock", RACCT_WALLCLOCK },
        { "pcpu", RACCT_PCTCPU },
        { "readbps", RACCT_READBPS },
        { "writebps", RACCT_WRITEBPS },
        { "readiops", RACCT_READIOPS },
        { "writeiops", RACCT_WRITEIOPS },
        { NULL, -1 }};
178
/*
 * Action names and their RCTL_ACTION_* values: one entry per signal action
 * followed by the non-signal actions.  Terminated by a NULL d_name entry.
 */
static struct dict actionnames[] = {
        { "sighup", RCTL_ACTION_SIGHUP },
        { "sigint", RCTL_ACTION_SIGINT },
        { "sigquit", RCTL_ACTION_SIGQUIT },
        { "sigill", RCTL_ACTION_SIGILL },
        { "sigtrap", RCTL_ACTION_SIGTRAP },
        { "sigabrt", RCTL_ACTION_SIGABRT },
        { "sigemt", RCTL_ACTION_SIGEMT },
        { "sigfpe", RCTL_ACTION_SIGFPE },
        { "sigkill", RCTL_ACTION_SIGKILL },
        { "sigbus", RCTL_ACTION_SIGBUS },
        { "sigsegv", RCTL_ACTION_SIGSEGV },
        { "sigsys", RCTL_ACTION_SIGSYS },
        { "sigpipe", RCTL_ACTION_SIGPIPE },
        { "sigalrm", RCTL_ACTION_SIGALRM },
        { "sigterm", RCTL_ACTION_SIGTERM },
        { "sigurg", RCTL_ACTION_SIGURG },
        { "sigstop", RCTL_ACTION_SIGSTOP },
        { "sigtstp", RCTL_ACTION_SIGTSTP },
        { "sigchld", RCTL_ACTION_SIGCHLD },
        { "sigttin", RCTL_ACTION_SIGTTIN },
        { "sigttou", RCTL_ACTION_SIGTTOU },
        { "sigio", RCTL_ACTION_SIGIO },
        { "sigxcpu", RCTL_ACTION_SIGXCPU },
        { "sigxfsz", RCTL_ACTION_SIGXFSZ },
        { "sigvtalrm", RCTL_ACTION_SIGVTALRM },
        { "sigprof", RCTL_ACTION_SIGPROF },
        { "sigwinch", RCTL_ACTION_SIGWINCH },
        { "siginfo", RCTL_ACTION_SIGINFO },
        { "sigusr1", RCTL_ACTION_SIGUSR1 },
        { "sigusr2", RCTL_ACTION_SIGUSR2 },
        { "sigthr", RCTL_ACTION_SIGTHR },
        { "deny", RCTL_ACTION_DENY },
        { "log", RCTL_ACTION_LOG },
        { "devctl", RCTL_ACTION_DEVCTL },
        { "throttle", RCTL_ACTION_THROTTLE },
        { NULL, -1 }};
216
/* rctl_init() is registered to run at SI_SUB_RACCT, SI_ORDER_FIRST. */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/*
 * UMA zones; presumably they back struct rctl_rule and
 * struct rctl_rule_link allocations - confirm against rctl_init().
 */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;

/* Forward declarations for helpers defined later in this file. */
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* malloc(9) type used for this file's temporary buffers. */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
227
228 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
229 {
230         int error, val = rctl_throttle_min;
231
232         error = sysctl_handle_int(oidp, &val, 0, req);
233         if (error || !req->newptr)
234                 return (error);
235         if (val < 1 || val > rctl_throttle_max)
236                 return (EINVAL);
237
238         RACCT_LOCK();
239         rctl_throttle_min = val;
240         RACCT_UNLOCK();
241
242         return (0);
243 }
244
245 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
246 {
247         int error, val = rctl_throttle_max;
248
249         error = sysctl_handle_int(oidp, &val, 0, req);
250         if (error || !req->newptr)
251                 return (error);
252         if (val < rctl_throttle_min)
253                 return (EINVAL);
254
255         RACCT_LOCK();
256         rctl_throttle_max = val;
257         RACCT_UNLOCK();
258
259         return (0);
260 }
261
262 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
263 {
264         int error, val = rctl_throttle_pct;
265
266         error = sysctl_handle_int(oidp, &val, 0, req);
267         if (error || !req->newptr)
268                 return (error);
269         if (val < 0)
270                 return (EINVAL);
271
272         RACCT_LOCK();
273         rctl_throttle_pct = val;
274         RACCT_UNLOCK();
275
276         return (0);
277 }
278
279 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
280 {
281         int error, val = rctl_throttle_pct2;
282
283         error = sysctl_handle_int(oidp, &val, 0, req);
284         if (error || !req->newptr)
285                 return (error);
286         if (val < 0)
287                 return (EINVAL);
288
289         RACCT_LOCK();
290         rctl_throttle_pct2 = val;
291         RACCT_UNLOCK();
292
293         return (0);
294 }
295
296 static const char *
297 rctl_subject_type_name(int subject)
298 {
299         int i;
300
301         for (i = 0; subjectnames[i].d_name != NULL; i++) {
302                 if (subjectnames[i].d_value == subject)
303                         return (subjectnames[i].d_name);
304         }
305
306         panic("rctl_subject_type_name: unknown subject type %d", subject);
307 }
308
309 static const char *
310 rctl_action_name(int action)
311 {
312         int i;
313
314         for (i = 0; actionnames[i].d_name != NULL; i++) {
315                 if (actionnames[i].d_value == action)
316                         return (actionnames[i].d_name);
317         }
318
319         panic("rctl_action_name: unknown action %d", action);
320 }
321
322 const char *
323 rctl_resource_name(int resource)
324 {
325         int i;
326
327         for (i = 0; resourcenames[i].d_name != NULL; i++) {
328                 if (resourcenames[i].d_value == resource)
329                         return (resourcenames[i].d_name);
330         }
331
332         panic("rctl_resource_name: unknown resource %d", resource);
333 }
334
335 static struct racct *
336 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
337 {
338         struct ucred *cred = p->p_ucred;
339
340         ASSERT_RACCT_ENABLED();
341         RACCT_LOCK_ASSERT();
342
343         switch (rule->rr_per) {
344         case RCTL_SUBJECT_TYPE_PROCESS:
345                 return (p->p_racct);
346         case RCTL_SUBJECT_TYPE_USER:
347                 return (cred->cr_ruidinfo->ui_racct);
348         case RCTL_SUBJECT_TYPE_LOGINCLASS:
349                 return (cred->cr_loginclass->lc_racct);
350         case RCTL_SUBJECT_TYPE_JAIL:
351                 return (cred->cr_prison->pr_prison_racct->prr_racct);
352         default:
353                 panic("%s: unknown per %d", __func__, rule->rr_per);
354         }
355 }
356
357 /*
358  * Return the amount of resource that can be allocated by 'p' before
359  * hitting 'rule'.
360  */
361 static int64_t
362 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
363 {
364         const struct racct *racct;
365         int64_t available;
366
367         ASSERT_RACCT_ENABLED();
368         RACCT_LOCK_ASSERT();
369
370         racct = rctl_proc_rule_to_racct(p, rule);
371         available = rule->rr_amount - racct->r_resources[rule->rr_resource];
372
373         return (available);
374 }
375
376 /*
377  * Called every second for proc, uidinfo, loginclass, and jail containers.
378  * If the limit isn't exceeded, it decreases the usage amount to zero.
379  * Otherwise, it decreases it by the value of the limit.  This way
380  * resource consumption exceeding the limit "carries over" to the next
381  * period.
382  */
383 void
384 rctl_throttle_decay(struct racct *racct, int resource)
385 {
386         struct rctl_rule *rule;
387         struct rctl_rule_link *link;
388         int64_t minavailable;
389
390         ASSERT_RACCT_ENABLED();
391         RACCT_LOCK_ASSERT();
392
393         minavailable = INT64_MAX;
394
395         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
396                 rule = link->rrl_rule;
397
398                 if (rule->rr_resource != resource)
399                         continue;
400                 if (rule->rr_action != RCTL_ACTION_THROTTLE)
401                         continue;
402
403                 if (rule->rr_amount < minavailable)
404                         minavailable = rule->rr_amount;
405         }
406
407         if (racct->r_resources[resource] < minavailable) {
408                 racct->r_resources[resource] = 0;
409         } else {
410                 /*
411                  * Cap utilization counter at ten times the limit.  Otherwise,
412                  * if we changed the rule lowering the allowed amount, it could
413                  * take unreasonably long time for the accumulated resource
414                  * usage to drop.
415                  */
416                 if (racct->r_resources[resource] > minavailable * 10)
417                         racct->r_resources[resource] = minavailable * 10;
418
419                 racct->r_resources[resource] -= minavailable;
420         }
421 }
422
423 /*
424  * Special version of rctl_get_available() for the %CPU resource.
425  * We slightly cheat here and return less than we normally would.
426  */
427 int64_t
428 rctl_pcpu_available(const struct proc *p) {
429         struct rctl_rule *rule;
430         struct rctl_rule_link *link;
431         int64_t available, minavailable, limit;
432
433         ASSERT_RACCT_ENABLED();
434         RACCT_LOCK_ASSERT();
435
436         minavailable = INT64_MAX;
437         limit = 0;
438
439         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
440                 rule = link->rrl_rule;
441                 if (rule->rr_resource != RACCT_PCTCPU)
442                         continue;
443                 if (rule->rr_action != RCTL_ACTION_DENY)
444                         continue;
445                 available = rctl_available_resource(p, rule);
446                 if (available < minavailable) {
447                         minavailable = available;
448                         limit = rule->rr_amount;
449                 }
450         }
451
452         /*
453          * Return slightly less than actual value of the available
454          * %cpu resource.  This makes %cpu throttling more aggressive
455          * and lets us act sooner than the limits are already exceeded.
456          */
457         if (limit != 0) {
458                 if (limit > 2 * RCTL_PCPU_SHIFT)
459                         minavailable -= RCTL_PCPU_SHIFT;
460                 else
461                         minavailable -= (limit / 2);
462         }
463
464         return (minavailable);
465 }
466
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX if the sum
 * does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{
        const uint64_t sum = a + b;

        /* A wrapped sum is smaller than at least one of the operands. */
        return ((sum >= a && sum >= b) ? sum : UINT64_MAX);
}
482
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX if the
 * product does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

        /* The product fits iff b is zero or a is at most UINT64_MAX / b. */
        if (b == 0 || a <= UINT64_MAX / b)
                return (a * b);

        return (UINT64_MAX);
}
492
/*
 * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
 * to what it keeps allocated now.  Returns non-zero if the allocation should
 * be denied, 0 otherwise.
 *
 * Every rule linked to the process' racct for 'resource' is examined.  For
 * each rule whose remaining headroom is smaller than 'amount', the rule's
 * action is carried out: log, devctl notification, throttle, or signal.
 * A deny rule only sets a flag, so that the remaining rules are still
 * processed before the denial is reported to the caller.
 */
int
rctl_enforce(struct proc *p, int resource, uint64_t amount)
{
        /* State for ppsratecheck(); shared by all callers. */
        static struct timeval log_lasttime, devctl_lasttime;
        static int log_curtime = 0, devctl_curtime = 0;
        struct rctl_rule *rule;
        struct rctl_rule_link *link;
        struct sbuf sb;
        char *buf;
        int64_t available;
        uint64_t sleep_ms, sleep_ratio;
        int should_deny = 0;

        ASSERT_RACCT_ENABLED();
        RACCT_LOCK_ASSERT();

        /*
         * There may be more than one matching rule; go through all of them.
         * Denial should be done last, after logging and sending signals.
         */
        LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
                rule = link->rrl_rule;
                if (rule->rr_resource != resource)
                        continue;

                /*
                 * The request fits under this rule; clear the "already
                 * acted upon" marker so the action can fire again later.
                 */
                available = rctl_available_resource(p, rule);
                if (available >= (int64_t)amount) {
                        link->rrl_exceeded = 0;
                        continue;
                }

                switch (rule->rr_action) {
                case RCTL_ACTION_DENY:
                        should_deny = 1;
                        continue;
                case RCTL_ACTION_LOG:
                        /*
                         * If rrl_exceeded != 0, it means we've already
                         * logged a warning for this process.
                         */
                        if (link->rrl_exceeded != 0)
                                continue;

                        /*
                         * If the process state is not fully initialized yet,
                         * we can't access most of the required fields, e.g.
                         * p->p_comm.  This happens when called from fork1().
                         * Ignore this rule for now; it will be processed just
                         * after fork, when called from racct_proc_fork_done().
                         */
                        if (p->p_state != PRS_NORMAL)
                                continue;

                        /* Rate-limit the log messages. */
                        if (!ppsratecheck(&log_lasttime, &log_curtime,
                            rctl_log_rate_limit))
                                continue;

                        /* Non-sleeping allocation; skip the message on failure. */
                        buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
                        if (buf == NULL) {
                                printf("rctl_enforce: out of memory\n");
                                continue;
                        }
                        sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
                        rctl_rule_to_sbuf(&sb, rule);
                        sbuf_finish(&sb);
                        printf("rctl: rule \"%s\" matched by pid %d "
                            "(%s), uid %d, jail %s\n", sbuf_data(&sb),
                            p->p_pid, p->p_comm, p->p_ucred->cr_uid,
                            p->p_ucred->cr_prison->pr_prison_racct->prr_name);
                        sbuf_delete(&sb);
                        free(buf, M_RCTL);
                        link->rrl_exceeded = 1;
                        continue;
                case RCTL_ACTION_DEVCTL:
                        /* Same gating as for RCTL_ACTION_LOG above. */
                        if (link->rrl_exceeded != 0)
                                continue;

                        if (p->p_state != PRS_NORMAL)
                                continue;

                        if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
                            rctl_devctl_rate_limit))
                                continue;

                        buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
                        if (buf == NULL) {
                                printf("rctl_enforce: out of memory\n");
                                continue;
                        }
                        /* Build "rule=<rule> pid=<pid> ruid=<ruid> jail=<name>". */
                        sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
                        sbuf_printf(&sb, "rule=");
                        rctl_rule_to_sbuf(&sb, rule);
                        sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
                            p->p_pid, p->p_ucred->cr_ruid,
                            p->p_ucred->cr_prison->pr_prison_racct->prr_name);
                        sbuf_finish(&sb);
                        devctl_notify_f("RCTL", "rule", "matched",
                            sbuf_data(&sb), M_NOWAIT);
                        sbuf_delete(&sb);
                        free(buf, M_RCTL);
                        link->rrl_exceeded = 1;
                        continue;
                case RCTL_ACTION_THROTTLE:
                        if (p->p_state != PRS_NORMAL)
                                continue;

                        /*
                         * Make the process sleep for a fraction of second
                         * proportional to the ratio of process' resource
                         * utilization compared to the limit.  The point is
                         * to penalize resource hogs: processes that consume
                         * more of the available resources sleep for longer.
                         *
                         * We're trying to defer division until the very end,
                         * to minimize the rounding effects.  The following
                         * calculation could have been written in a clearer
                         * way like this:
                         *
                         * sleep_ms = hz * p->p_racct->r_resources[resource] /
                         *     rule->rr_amount;
                         * sleep_ms *= rctl_throttle_pct / 100;
                         * if (sleep_ms < rctl_throttle_min)
                         *         sleep_ms = rctl_throttle_min;
                         *
                         * Until the final division, sleep_ms is scaled by
                         * rule->rr_amount, which is why the minimum below is
                         * multiplied by it as well.
                         *
                         * NOTE(review): despite its name, sleep_ms is clamped
                         * against rctl_throttle_min/max, which the sysctl
                         * descriptions give "in hz" - confirm the unit
                         * racct_proc_throttle() expects.
                         */
                        sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
                        sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
                        if (sleep_ms < rctl_throttle_min * rule->rr_amount)
                                sleep_ms = rctl_throttle_min * rule->rr_amount;

                        /*
                         * Multiply that by the ratio of the resource
                         * consumption for the container compared to the limit,
                         * squared.  In other words, a process in a container
                         * that is two times over the limit will be throttled
                         * four times as much for hitting the same rule.  The
                         * point is to penalize processes more if the container
                         * itself (eg certain UID or jail) is above the limit.
                         */
                        if (available < 0)
                                sleep_ratio = -available / rule->rr_amount;
                        else
                                sleep_ratio = 0;
                        sleep_ratio = xmul(sleep_ratio, sleep_ratio);
                        sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
                        sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));

                        /*
                         * Finally the division.
                         *
                         * NOTE(review): this divides by rule->rr_amount;
                         * presumably throttle rules with a zero amount are
                         * rejected when the rule is added - confirm.
                         */
                        sleep_ms /= rule->rr_amount;

                        if (sleep_ms > rctl_throttle_max)
                                sleep_ms = rctl_throttle_max;
#if 0
                        printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
                           __func__, p->p_pid, p->p_comm,
                           p->p_racct->r_resources[resource],
                           rule->rr_amount, (uintmax_t)sleep_ms,
                           (uintmax_t)sleep_ratio, (intmax_t)available);
#endif

                        KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
                            __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
                        racct_proc_throttle(p, sleep_ms);
                        continue;
                default:
                        /* Every remaining action value is a signal number. */
                        if (link->rrl_exceeded != 0)
                                continue;

                        if (p->p_state != PRS_NORMAL)
                                continue;

                        KASSERT(rule->rr_action > 0 &&
                            rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
                            ("rctl_enforce: unknown action %d",
                             rule->rr_action));

                        /*
                         * We're using the fact that RCTL_ACTION_SIG* values
                         * are equal to their counterparts from sys/signal.h.
                         */
                        kern_psignal(p, rule->rr_action);
                        link->rrl_exceeded = 1;
                        continue;
                }
        }

        if (should_deny) {
                /*
                 * Return fake error code; the caller should change it
                 * into one proper for the situation - EFSIZ, ENOMEM etc.
                 */
                return (EDOOFUS);
        }

        return (0);
}
696
697 uint64_t
698 rctl_get_limit(struct proc *p, int resource)
699 {
700         struct rctl_rule *rule;
701         struct rctl_rule_link *link;
702         uint64_t amount = UINT64_MAX;
703
704         ASSERT_RACCT_ENABLED();
705         RACCT_LOCK_ASSERT();
706
707         /*
708          * There may be more than one matching rule; go through all of them.
709          * Denial should be done last, after logging and sending signals.
710          */
711         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
712                 rule = link->rrl_rule;
713                 if (rule->rr_resource != resource)
714                         continue;
715                 if (rule->rr_action != RCTL_ACTION_DENY)
716                         continue;
717                 if (rule->rr_amount < amount)
718                         amount = rule->rr_amount;
719         }
720
721         return (amount);
722 }
723
724 uint64_t
725 rctl_get_available(struct proc *p, int resource)
726 {
727         struct rctl_rule *rule;
728         struct rctl_rule_link *link;
729         int64_t available, minavailable, allocated;
730
731         minavailable = INT64_MAX;
732
733         ASSERT_RACCT_ENABLED();
734         RACCT_LOCK_ASSERT();
735
736         /*
737          * There may be more than one matching rule; go through all of them.
738          * Denial should be done last, after logging and sending signals.
739          */
740         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
741                 rule = link->rrl_rule;
742                 if (rule->rr_resource != resource)
743                         continue;
744                 if (rule->rr_action != RCTL_ACTION_DENY)
745                         continue;
746                 available = rctl_available_resource(p, rule);
747                 if (available < minavailable)
748                         minavailable = available;
749         }
750
751         /*
752          * XXX: Think about this _hard_.
753          */
754         allocated = p->p_racct->r_resources[resource];
755         if (minavailable < INT64_MAX - allocated)
756                 minavailable += allocated;
757         if (minavailable < 0)
758                 minavailable = 0;
759
760         return (minavailable);
761 }
762
763 static int
764 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
765 {
766
767         ASSERT_RACCT_ENABLED();
768
769         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
770                 if (rule->rr_subject_type != filter->rr_subject_type)
771                         return (0);
772
773                 switch (filter->rr_subject_type) {
774                 case RCTL_SUBJECT_TYPE_PROCESS:
775                         if (filter->rr_subject.rs_proc != NULL &&
776                             rule->rr_subject.rs_proc !=
777                             filter->rr_subject.rs_proc)
778                                 return (0);
779                         break;
780                 case RCTL_SUBJECT_TYPE_USER:
781                         if (filter->rr_subject.rs_uip != NULL &&
782                             rule->rr_subject.rs_uip !=
783                             filter->rr_subject.rs_uip)
784                                 return (0);
785                         break;
786                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
787                         if (filter->rr_subject.rs_loginclass != NULL &&
788                             rule->rr_subject.rs_loginclass !=
789                             filter->rr_subject.rs_loginclass)
790                                 return (0);
791                         break;
792                 case RCTL_SUBJECT_TYPE_JAIL:
793                         if (filter->rr_subject.rs_prison_racct != NULL &&
794                             rule->rr_subject.rs_prison_racct !=
795                             filter->rr_subject.rs_prison_racct)
796                                 return (0);
797                         break;
798                 default:
799                         panic("rctl_rule_matches: unknown subject type %d",
800                             filter->rr_subject_type);
801                 }
802         }
803
804         if (filter->rr_resource != RACCT_UNDEFINED) {
805                 if (rule->rr_resource != filter->rr_resource)
806                         return (0);
807         }
808
809         if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
810                 if (rule->rr_action != filter->rr_action)
811                         return (0);
812         }
813
814         if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
815                 if (rule->rr_amount != filter->rr_amount)
816                         return (0);
817         }
818
819         if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
820                 if (rule->rr_per != filter->rr_per)
821                         return (0);
822         }
823
824         return (1);
825 }
826
827 static int
828 str2value(const char *str, int *value, struct dict *table)
829 {
830         int i;
831
832         if (value == NULL)
833                 return (EINVAL);
834
835         for (i = 0; table[i].d_name != NULL; i++) {
836                 if (strcasecmp(table[i].d_name, str) == 0) {
837                         *value =  table[i].d_value;
838                         return (0);
839                 }
840         }
841
842         return (EINVAL);
843 }
844
845 static int
846 str2id(const char *str, id_t *value)
847 {
848         char *end;
849
850         if (str == NULL)
851                 return (EINVAL);
852
853         *value = strtoul(str, &end, 10);
854         if ((size_t)(end - str) != strlen(str))
855                 return (EINVAL);
856
857         return (0);
858 }
859
860 static int
861 str2int64(const char *str, int64_t *value)
862 {
863         char *end;
864
865         if (str == NULL)
866                 return (EINVAL);
867
868         *value = strtoul(str, &end, 10);
869         if ((size_t)(end - str) != strlen(str))
870                 return (EINVAL);
871
872         if (*value < 0)
873                 return (ERANGE);
874
875         return (0);
876 }
877
878 /*
879  * Connect the rule to the racct, increasing refcount for the rule.
880  */
881 static void
882 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
883 {
884         struct rctl_rule_link *link;
885
886         ASSERT_RACCT_ENABLED();
887         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
888
889         rctl_rule_acquire(rule);
890         link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
891         link->rrl_rule = rule;
892         link->rrl_exceeded = 0;
893
894         RACCT_LOCK();
895         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
896         RACCT_UNLOCK();
897 }
898
899 static int
900 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
901 {
902         struct rctl_rule_link *link;
903
904         ASSERT_RACCT_ENABLED();
905         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
906         RACCT_LOCK_ASSERT();
907
908         link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
909         if (link == NULL)
910                 return (ENOMEM);
911         rctl_rule_acquire(rule);
912         link->rrl_rule = rule;
913         link->rrl_exceeded = 0;
914
915         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
916
917         return (0);
918 }
919
920 /*
921  * Remove limits for a rules matching the filter and release
922  * the refcounts for the rules, possibly freeing them.  Returns
923  * the number of limit structures removed.
924  */
925 static int
926 rctl_racct_remove_rules(struct racct *racct,
927     const struct rctl_rule *filter)
928 {
929         struct rctl_rule_link *link, *linktmp;
930         int removed = 0;
931
932         ASSERT_RACCT_ENABLED();
933         RACCT_LOCK_ASSERT();
934
935         LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
936                 if (!rctl_rule_matches(link->rrl_rule, filter))
937                         continue;
938
939                 LIST_REMOVE(link, rrl_next);
940                 rctl_rule_release(link->rrl_rule);
941                 uma_zfree(rctl_rule_link_zone, link);
942                 removed++;
943         }
944         return (removed);
945 }
946
947 static void
948 rctl_rule_acquire_subject(struct rctl_rule *rule)
949 {
950
951         ASSERT_RACCT_ENABLED();
952
953         switch (rule->rr_subject_type) {
954         case RCTL_SUBJECT_TYPE_UNDEFINED:
955         case RCTL_SUBJECT_TYPE_PROCESS:
956                 break;
957         case RCTL_SUBJECT_TYPE_JAIL:
958                 if (rule->rr_subject.rs_prison_racct != NULL)
959                         prison_racct_hold(rule->rr_subject.rs_prison_racct);
960                 break;
961         case RCTL_SUBJECT_TYPE_USER:
962                 if (rule->rr_subject.rs_uip != NULL)
963                         uihold(rule->rr_subject.rs_uip);
964                 break;
965         case RCTL_SUBJECT_TYPE_LOGINCLASS:
966                 if (rule->rr_subject.rs_loginclass != NULL)
967                         loginclass_hold(rule->rr_subject.rs_loginclass);
968                 break;
969         default:
970                 panic("rctl_rule_acquire_subject: unknown subject type %d",
971                     rule->rr_subject_type);
972         }
973 }
974
975 static void
976 rctl_rule_release_subject(struct rctl_rule *rule)
977 {
978
979         ASSERT_RACCT_ENABLED();
980
981         switch (rule->rr_subject_type) {
982         case RCTL_SUBJECT_TYPE_UNDEFINED:
983         case RCTL_SUBJECT_TYPE_PROCESS:
984                 break;
985         case RCTL_SUBJECT_TYPE_JAIL:
986                 if (rule->rr_subject.rs_prison_racct != NULL)
987                         prison_racct_free(rule->rr_subject.rs_prison_racct);
988                 break;
989         case RCTL_SUBJECT_TYPE_USER:
990                 if (rule->rr_subject.rs_uip != NULL)
991                         uifree(rule->rr_subject.rs_uip);
992                 break;
993         case RCTL_SUBJECT_TYPE_LOGINCLASS:
994                 if (rule->rr_subject.rs_loginclass != NULL)
995                         loginclass_free(rule->rr_subject.rs_loginclass);
996                 break;
997         default:
998                 panic("rctl_rule_release_subject: unknown subject type %d",
999                     rule->rr_subject_type);
1000         }
1001 }
1002
1003 struct rctl_rule *
1004 rctl_rule_alloc(int flags)
1005 {
1006         struct rctl_rule *rule;
1007
1008         ASSERT_RACCT_ENABLED();
1009
1010         rule = uma_zalloc(rctl_rule_zone, flags);
1011         if (rule == NULL)
1012                 return (NULL);
1013         rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1014         rule->rr_subject.rs_proc = NULL;
1015         rule->rr_subject.rs_uip = NULL;
1016         rule->rr_subject.rs_loginclass = NULL;
1017         rule->rr_subject.rs_prison_racct = NULL;
1018         rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1019         rule->rr_resource = RACCT_UNDEFINED;
1020         rule->rr_action = RCTL_ACTION_UNDEFINED;
1021         rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1022         refcount_init(&rule->rr_refcount, 1);
1023
1024         return (rule);
1025 }
1026
1027 struct rctl_rule *
1028 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1029 {
1030         struct rctl_rule *copy;
1031
1032         ASSERT_RACCT_ENABLED();
1033
1034         copy = uma_zalloc(rctl_rule_zone, flags);
1035         if (copy == NULL)
1036                 return (NULL);
1037         copy->rr_subject_type = rule->rr_subject_type;
1038         copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1039         copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1040         copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1041         copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1042         copy->rr_per = rule->rr_per;
1043         copy->rr_resource = rule->rr_resource;
1044         copy->rr_action = rule->rr_action;
1045         copy->rr_amount = rule->rr_amount;
1046         refcount_init(&copy->rr_refcount, 1);
1047         rctl_rule_acquire_subject(copy);
1048
1049         return (copy);
1050 }
1051
1052 void
1053 rctl_rule_acquire(struct rctl_rule *rule)
1054 {
1055
1056         ASSERT_RACCT_ENABLED();
1057         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1058
1059         refcount_acquire(&rule->rr_refcount);
1060 }
1061
1062 static void
1063 rctl_rule_free(void *context, int pending)
1064 {
1065         struct rctl_rule *rule;
1066         
1067         rule = (struct rctl_rule *)context;
1068
1069         ASSERT_RACCT_ENABLED();
1070         KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1071         
1072         /*
1073          * We don't need locking here; rule is guaranteed to be inaccessible.
1074          */
1075         
1076         rctl_rule_release_subject(rule);
1077         uma_zfree(rctl_rule_zone, rule);
1078 }
1079
1080 void
1081 rctl_rule_release(struct rctl_rule *rule)
1082 {
1083
1084         ASSERT_RACCT_ENABLED();
1085         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1086
1087         if (refcount_release(&rule->rr_refcount)) {
1088                 /*
1089                  * rctl_rule_release() is often called when iterating
1090                  * over all the uidinfo structures in the system,
1091                  * holding uihashtbl_lock.  Since rctl_rule_free()
1092                  * might end up calling uifree(), this would lead
1093                  * to lock recursion.  Use taskqueue to avoid this.
1094                  */
1095                 TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1096                 taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1097         }
1098 }
1099
1100 static int
1101 rctl_rule_fully_specified(const struct rctl_rule *rule)
1102 {
1103
1104         ASSERT_RACCT_ENABLED();
1105
1106         switch (rule->rr_subject_type) {
1107         case RCTL_SUBJECT_TYPE_UNDEFINED:
1108                 return (0);
1109         case RCTL_SUBJECT_TYPE_PROCESS:
1110                 if (rule->rr_subject.rs_proc == NULL)
1111                         return (0);
1112                 break;
1113         case RCTL_SUBJECT_TYPE_USER:
1114                 if (rule->rr_subject.rs_uip == NULL)
1115                         return (0);
1116                 break;
1117         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1118                 if (rule->rr_subject.rs_loginclass == NULL)
1119                         return (0);
1120                 break;
1121         case RCTL_SUBJECT_TYPE_JAIL:
1122                 if (rule->rr_subject.rs_prison_racct == NULL)
1123                         return (0);
1124                 break;
1125         default:
1126                 panic("rctl_rule_fully_specified: unknown subject type %d",
1127                     rule->rr_subject_type);
1128         }
1129         if (rule->rr_resource == RACCT_UNDEFINED)
1130                 return (0);
1131         if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1132                 return (0);
1133         if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1134                 return (0);
1135         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1136                 return (0);
1137
1138         return (1);
1139 }
1140
1141 static int
1142 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1143 {
1144         struct rctl_rule *rule;
1145         char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1146              *amountstr, *perstr;
1147         id_t id;
1148         int error = 0;
1149
1150         ASSERT_RACCT_ENABLED();
1151
1152         rule = rctl_rule_alloc(M_WAITOK);
1153
1154         subjectstr = strsep(&rulestr, ":");
1155         subject_idstr = strsep(&rulestr, ":");
1156         resourcestr = strsep(&rulestr, ":");
1157         actionstr = strsep(&rulestr, "=/");
1158         amountstr = strsep(&rulestr, "/");
1159         perstr = rulestr;
1160
1161         if (subjectstr == NULL || subjectstr[0] == '\0')
1162                 rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1163         else {
1164                 error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1165                 if (error != 0)
1166                         goto out;
1167         }
1168
1169         if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1170                 rule->rr_subject.rs_proc = NULL;
1171                 rule->rr_subject.rs_uip = NULL;
1172                 rule->rr_subject.rs_loginclass = NULL;
1173                 rule->rr_subject.rs_prison_racct = NULL;
1174         } else {
1175                 switch (rule->rr_subject_type) {
1176                 case RCTL_SUBJECT_TYPE_UNDEFINED:
1177                         error = EINVAL;
1178                         goto out;
1179                 case RCTL_SUBJECT_TYPE_PROCESS:
1180                         error = str2id(subject_idstr, &id);
1181                         if (error != 0)
1182                                 goto out;
1183                         sx_assert(&allproc_lock, SA_LOCKED);
1184                         rule->rr_subject.rs_proc = pfind(id);
1185                         if (rule->rr_subject.rs_proc == NULL) {
1186                                 error = ESRCH;
1187                                 goto out;
1188                         }
1189                         PROC_UNLOCK(rule->rr_subject.rs_proc);
1190                         break;
1191                 case RCTL_SUBJECT_TYPE_USER:
1192                         error = str2id(subject_idstr, &id);
1193                         if (error != 0)
1194                                 goto out;
1195                         rule->rr_subject.rs_uip = uifind(id);
1196                         break;
1197                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1198                         rule->rr_subject.rs_loginclass =
1199                             loginclass_find(subject_idstr);
1200                         if (rule->rr_subject.rs_loginclass == NULL) {
1201                                 error = ENAMETOOLONG;
1202                                 goto out;
1203                         }
1204                         break;
1205                 case RCTL_SUBJECT_TYPE_JAIL:
1206                         rule->rr_subject.rs_prison_racct =
1207                             prison_racct_find(subject_idstr);
1208                         if (rule->rr_subject.rs_prison_racct == NULL) {
1209                                 error = ENAMETOOLONG;
1210                                 goto out;
1211                         }
1212                         break;
1213                default:
1214                        panic("rctl_string_to_rule: unknown subject type %d",
1215                            rule->rr_subject_type);
1216                }
1217         }
1218
1219         if (resourcestr == NULL || resourcestr[0] == '\0')
1220                 rule->rr_resource = RACCT_UNDEFINED;
1221         else {
1222                 error = str2value(resourcestr, &rule->rr_resource,
1223                     resourcenames);
1224                 if (error != 0)
1225                         goto out;
1226         }
1227
1228         if (actionstr == NULL || actionstr[0] == '\0')
1229                 rule->rr_action = RCTL_ACTION_UNDEFINED;
1230         else {
1231                 error = str2value(actionstr, &rule->rr_action, actionnames);
1232                 if (error != 0)
1233                         goto out;
1234         }
1235
1236         if (amountstr == NULL || amountstr[0] == '\0')
1237                 rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1238         else {
1239                 error = str2int64(amountstr, &rule->rr_amount);
1240                 if (error != 0)
1241                         goto out;
1242                 if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1243                         if (rule->rr_amount > INT64_MAX / 1000000) {
1244                                 error = ERANGE;
1245                                 goto out;
1246                         }
1247                         rule->rr_amount *= 1000000;
1248                 }
1249         }
1250
1251         if (perstr == NULL || perstr[0] == '\0')
1252                 rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1253         else {
1254                 error = str2value(perstr, &rule->rr_per, subjectnames);
1255                 if (error != 0)
1256                         goto out;
1257         }
1258
1259 out:
1260         if (error == 0)
1261                 *rulep = rule;
1262         else
1263                 rctl_rule_release(rule);
1264
1265         return (error);
1266 }
1267
1268 /*
1269  * Link a rule with all the subjects it applies to.
1270  */
1271 int
1272 rctl_rule_add(struct rctl_rule *rule)
1273 {
1274         struct proc *p;
1275         struct ucred *cred;
1276         struct uidinfo *uip;
1277         struct prison *pr;
1278         struct prison_racct *prr;
1279         struct loginclass *lc;
1280         struct rctl_rule *rule2;
1281         int match;
1282
1283         ASSERT_RACCT_ENABLED();
1284         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1285
1286         /*
1287          * Some rules just don't make sense, like "deny" rule for an undeniable
1288          * resource.  The exception are the RSS and %CPU resources - they are
1289          * not deniable in the racct sense, but the limit is enforced in
1290          * a different way.
1291          */
1292         if (rule->rr_action == RCTL_ACTION_DENY &&
1293             !RACCT_IS_DENIABLE(rule->rr_resource) &&
1294             rule->rr_resource != RACCT_RSS &&
1295             rule->rr_resource != RACCT_PCTCPU) {
1296                 return (EOPNOTSUPP);
1297         }
1298
1299         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1300             !RACCT_IS_DECAYING(rule->rr_resource)) {
1301                 return (EOPNOTSUPP);
1302         }
1303
1304         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1305             rule->rr_resource == RACCT_PCTCPU) {
1306                 return (EOPNOTSUPP);
1307         }
1308
1309         if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1310             RACCT_IS_SLOPPY(rule->rr_resource)) {
1311                 return (EOPNOTSUPP);
1312         }
1313
1314         /*
1315          * Make sure there are no duplicated rules.  Also, for the "deny"
1316          * rules, remove ones differing only by "amount".
1317          */
1318         if (rule->rr_action == RCTL_ACTION_DENY) {
1319                 rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1320                 rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1321                 rctl_rule_remove(rule2);
1322                 rctl_rule_release(rule2);
1323         } else
1324                 rctl_rule_remove(rule);
1325
1326         switch (rule->rr_subject_type) {
1327         case RCTL_SUBJECT_TYPE_PROCESS:
1328                 p = rule->rr_subject.rs_proc;
1329                 KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1330
1331                 rctl_racct_add_rule(p->p_racct, rule);
1332                 /*
1333                  * In case of per-process rule, we don't have anything more
1334                  * to do.
1335                  */
1336                 return (0);
1337
1338         case RCTL_SUBJECT_TYPE_USER:
1339                 uip = rule->rr_subject.rs_uip;
1340                 KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1341                 rctl_racct_add_rule(uip->ui_racct, rule);
1342                 break;
1343
1344         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1345                 lc = rule->rr_subject.rs_loginclass;
1346                 KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1347                 rctl_racct_add_rule(lc->lc_racct, rule);
1348                 break;
1349
1350         case RCTL_SUBJECT_TYPE_JAIL:
1351                 prr = rule->rr_subject.rs_prison_racct;
1352                 KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1353                 rctl_racct_add_rule(prr->prr_racct, rule);
1354                 break;
1355
1356         default:
1357                 panic("rctl_rule_add: unknown subject type %d",
1358                     rule->rr_subject_type);
1359         }
1360
1361         /*
1362          * Now go through all the processes and add the new rule to the ones
1363          * it applies to.
1364          */
1365         sx_assert(&allproc_lock, SA_LOCKED);
1366         FOREACH_PROC_IN_SYSTEM(p) {
1367                 cred = p->p_ucred;
1368                 switch (rule->rr_subject_type) {
1369                 case RCTL_SUBJECT_TYPE_USER:
1370                         if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1371                             cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1372                                 break;
1373                         continue;
1374                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1375                         if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1376                                 break;
1377                         continue;
1378                 case RCTL_SUBJECT_TYPE_JAIL:
1379                         match = 0;
1380                         for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1381                                 if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1382                                         match = 1;
1383                                         break;
1384                                 }
1385                         }
1386                         if (match)
1387                                 break;
1388                         continue;
1389                 default:
1390                         panic("rctl_rule_add: unknown subject type %d",
1391                             rule->rr_subject_type);
1392                 }
1393
1394                 rctl_racct_add_rule(p->p_racct, rule);
1395         }
1396
1397         return (0);
1398 }
1399
/*
 * "Pre" hook handed to the *_racct_foreach() iterators: takes the RACCT
 * lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RACCT_LOCK();
}
1406
/*
 * "Post" hook handed to the *_racct_foreach() iterators: drops the RACCT
 * lock.
 */
static void
rctl_rule_post_callback(void)
{

	RACCT_UNLOCK();
}
1413
/*
 * Per-racct callback used by rctl_rule_remove(): removes the rules that
 * match the filter (arg2) from the racct and adds the number removed to
 * the int counter that arg3 points at.  Runs with the RACCT lock held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	const struct rctl_rule *filter = arg2;
	int *foundp = arg3;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	*foundp += rctl_racct_remove_rules(racct, filter);
}
1427
1428 /*
1429  * Remove all rules that match the filter.
1430  */
1431 int
1432 rctl_rule_remove(struct rctl_rule *filter)
1433 {
1434         struct proc *p;
1435         int found = 0;
1436
1437         ASSERT_RACCT_ENABLED();
1438
1439         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1440             filter->rr_subject.rs_proc != NULL) {
1441                 p = filter->rr_subject.rs_proc;
1442                 RACCT_LOCK();
1443                 found = rctl_racct_remove_rules(p->p_racct, filter);
1444                 RACCT_UNLOCK();
1445                 if (found)
1446                         return (0);
1447                 return (ESRCH);
1448         }
1449
1450         loginclass_racct_foreach(rctl_rule_remove_callback,
1451             rctl_rule_pre_callback, rctl_rule_post_callback,
1452             filter, (void *)&found);
1453         ui_racct_foreach(rctl_rule_remove_callback,
1454             rctl_rule_pre_callback, rctl_rule_post_callback,
1455             filter, (void *)&found);
1456         prison_racct_foreach(rctl_rule_remove_callback,
1457             rctl_rule_pre_callback, rctl_rule_post_callback,
1458             filter, (void *)&found);
1459
1460         sx_assert(&allproc_lock, SA_LOCKED);
1461         RACCT_LOCK();
1462         FOREACH_PROC_IN_SYSTEM(p) {
1463                 found += rctl_racct_remove_rules(p->p_racct, filter);
1464         }
1465         RACCT_UNLOCK();
1466
1467         if (found)
1468                 return (0);
1469         return (ESRCH);
1470 }
1471
1472 /*
1473  * Appends a rule to the sbuf.
1474  */
1475 static void
1476 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1477 {
1478         int64_t amount;
1479
1480         ASSERT_RACCT_ENABLED();
1481
1482         sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1483
1484         switch (rule->rr_subject_type) {
1485         case RCTL_SUBJECT_TYPE_PROCESS:
1486                 if (rule->rr_subject.rs_proc == NULL)
1487                         sbuf_printf(sb, ":");
1488                 else
1489                         sbuf_printf(sb, "%d:",
1490                             rule->rr_subject.rs_proc->p_pid);
1491                 break;
1492         case RCTL_SUBJECT_TYPE_USER:
1493                 if (rule->rr_subject.rs_uip == NULL)
1494                         sbuf_printf(sb, ":");
1495                 else
1496                         sbuf_printf(sb, "%d:",
1497                             rule->rr_subject.rs_uip->ui_uid);
1498                 break;
1499         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1500                 if (rule->rr_subject.rs_loginclass == NULL)
1501                         sbuf_printf(sb, ":");
1502                 else
1503                         sbuf_printf(sb, "%s:",
1504                             rule->rr_subject.rs_loginclass->lc_name);
1505                 break;
1506         case RCTL_SUBJECT_TYPE_JAIL:
1507                 if (rule->rr_subject.rs_prison_racct == NULL)
1508                         sbuf_printf(sb, ":");
1509                 else
1510                         sbuf_printf(sb, "%s:",
1511                             rule->rr_subject.rs_prison_racct->prr_name);
1512                 break;
1513         default:
1514                 panic("rctl_rule_to_sbuf: unknown subject type %d",
1515                     rule->rr_subject_type);
1516         }
1517
1518         amount = rule->rr_amount;
1519         if (amount != RCTL_AMOUNT_UNDEFINED &&
1520             RACCT_IS_IN_MILLIONS(rule->rr_resource))
1521                 amount /= 1000000;
1522
1523         sbuf_printf(sb, "%s:%s=%jd",
1524             rctl_resource_name(rule->rr_resource),
1525             rctl_action_name(rule->rr_action),
1526             amount);
1527
1528         if (rule->rr_per != rule->rr_subject_type)
1529                 sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1530 }
1531
1532 /*
1533  * Routine used by RCTL syscalls to read in input string.
1534  */
1535 static int
1536 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1537 {
1538         char *str;
1539         int error;
1540
1541         ASSERT_RACCT_ENABLED();
1542
1543         if (inbuflen <= 0)
1544                 return (EINVAL);
1545         if (inbuflen > RCTL_MAX_INBUFSIZE)
1546                 return (E2BIG);
1547
1548         str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1549         error = copyinstr(inbufp, str, inbuflen, NULL);
1550         if (error != 0) {
1551                 free(str, M_RCTL);
1552                 return (error);
1553         }
1554
1555         *inputstr = str;
1556
1557         return (0);
1558 }
1559
1560 /*
1561  * Routine used by RCTL syscalls to write out output string.
1562  */
1563 static int
1564 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1565 {
1566         int error;
1567
1568         ASSERT_RACCT_ENABLED();
1569
1570         if (outputsbuf == NULL)
1571                 return (0);
1572
1573         sbuf_finish(outputsbuf);
1574         if (outbuflen < sbuf_len(outputsbuf) + 1) {
1575                 sbuf_delete(outputsbuf);
1576                 return (ERANGE);
1577         }
1578         error = copyout(sbuf_data(outputsbuf), outbufp,
1579             sbuf_len(outputsbuf) + 1);
1580         sbuf_delete(outputsbuf);
1581         return (error);
1582 }
1583
1584 static struct sbuf *
1585 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1586 {
1587         struct sbuf *sb;
1588         int64_t amount;
1589         int i;
1590
1591         ASSERT_RACCT_ENABLED();
1592
1593         sb = sbuf_new_auto();
1594         for (i = 0; i <= RACCT_MAX; i++) {
1595                 if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1596                         continue;
1597                 RACCT_LOCK();
1598                 amount = racct->r_resources[i];
1599                 RACCT_UNLOCK();
1600                 if (RACCT_IS_IN_MILLIONS(i))
1601                         amount /= 1000000;
1602                 sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1603         }
1604         sbuf_setpos(sb, sbuf_len(sb) - 1);
1605         return (sb);
1606 }
1607
1608 int
1609 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1610 {
1611         struct rctl_rule *filter;
1612         struct sbuf *outputsbuf = NULL;
1613         struct proc *p;
1614         struct uidinfo *uip;
1615         struct loginclass *lc;
1616         struct prison_racct *prr;
1617         char *inputstr;
1618         int error;
1619
1620         if (!racct_enable)
1621                 return (ENOSYS);
1622
1623         error = priv_check(td, PRIV_RCTL_GET_RACCT);
1624         if (error != 0)
1625                 return (error);
1626
1627         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1628         if (error != 0)
1629                 return (error);
1630
1631         sx_slock(&allproc_lock);
1632         error = rctl_string_to_rule(inputstr, &filter);
1633         free(inputstr, M_RCTL);
1634         if (error != 0) {
1635                 sx_sunlock(&allproc_lock);
1636                 return (error);
1637         }
1638
1639         switch (filter->rr_subject_type) {
1640         case RCTL_SUBJECT_TYPE_PROCESS:
1641                 p = filter->rr_subject.rs_proc;
1642                 if (p == NULL) {
1643                         error = EINVAL;
1644                         goto out;
1645                 }
1646                 outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1647                 break;
1648         case RCTL_SUBJECT_TYPE_USER:
1649                 uip = filter->rr_subject.rs_uip;
1650                 if (uip == NULL) {
1651                         error = EINVAL;
1652                         goto out;
1653                 }
1654                 outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1655                 break;
1656         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1657                 lc = filter->rr_subject.rs_loginclass;
1658                 if (lc == NULL) {
1659                         error = EINVAL;
1660                         goto out;
1661                 }
1662                 outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1663                 break;
1664         case RCTL_SUBJECT_TYPE_JAIL:
1665                 prr = filter->rr_subject.rs_prison_racct;
1666                 if (prr == NULL) {
1667                         error = EINVAL;
1668                         goto out;
1669                 }
1670                 outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1671                 break;
1672         default:
1673                 error = EINVAL;
1674         }
1675 out:
1676         rctl_rule_release(filter);
1677         sx_sunlock(&allproc_lock);
1678         if (error != 0)
1679                 return (error);
1680
1681         error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1682
1683         return (error);
1684 }
1685
1686 static void
1687 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1688 {
1689         struct rctl_rule *filter = (struct rctl_rule *)arg2;
1690         struct rctl_rule_link *link;
1691         struct sbuf *sb = (struct sbuf *)arg3;
1692
1693         ASSERT_RACCT_ENABLED();
1694         RACCT_LOCK_ASSERT();
1695
1696         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1697                 if (!rctl_rule_matches(link->rrl_rule, filter))
1698                         continue;
1699                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1700                 sbuf_printf(sb, ",");
1701         }
1702 }
1703
1704 int
1705 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1706 {
1707         struct sbuf *sb;
1708         struct rctl_rule *filter;
1709         struct rctl_rule_link *link;
1710         struct proc *p;
1711         char *inputstr, *buf;
1712         size_t bufsize;
1713         int error;
1714
1715         if (!racct_enable)
1716                 return (ENOSYS);
1717
1718         error = priv_check(td, PRIV_RCTL_GET_RULES);
1719         if (error != 0)
1720                 return (error);
1721
1722         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1723         if (error != 0)
1724                 return (error);
1725
1726         sx_slock(&allproc_lock);
1727         error = rctl_string_to_rule(inputstr, &filter);
1728         free(inputstr, M_RCTL);
1729         if (error != 0) {
1730                 sx_sunlock(&allproc_lock);
1731                 return (error);
1732         }
1733
1734         bufsize = uap->outbuflen;
1735         if (bufsize > rctl_maxbufsize) {
1736                 sx_sunlock(&allproc_lock);
1737                 return (E2BIG);
1738         }
1739
1740         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1741         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1742         KASSERT(sb != NULL, ("sbuf_new failed"));
1743
1744         FOREACH_PROC_IN_SYSTEM(p) {
1745                 RACCT_LOCK();
1746                 LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1747                         /*
1748                          * Non-process rules will be added to the buffer later.
1749                          * Adding them here would result in duplicated output.
1750                          */
1751                         if (link->rrl_rule->rr_subject_type !=
1752                             RCTL_SUBJECT_TYPE_PROCESS)
1753                                 continue;
1754                         if (!rctl_rule_matches(link->rrl_rule, filter))
1755                                 continue;
1756                         rctl_rule_to_sbuf(sb, link->rrl_rule);
1757                         sbuf_printf(sb, ",");
1758                 }
1759                 RACCT_UNLOCK();
1760         }
1761
1762         loginclass_racct_foreach(rctl_get_rules_callback,
1763             rctl_rule_pre_callback, rctl_rule_post_callback,
1764             filter, sb);
1765         ui_racct_foreach(rctl_get_rules_callback,
1766             rctl_rule_pre_callback, rctl_rule_post_callback,
1767             filter, sb);
1768         prison_racct_foreach(rctl_get_rules_callback,
1769             rctl_rule_pre_callback, rctl_rule_post_callback,
1770             filter, sb);
1771         if (sbuf_error(sb) == ENOMEM) {
1772                 error = ERANGE;
1773                 goto out;
1774         }
1775
1776         /*
1777          * Remove trailing ",".
1778          */
1779         if (sbuf_len(sb) > 0)
1780                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1781
1782         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1783 out:
1784         rctl_rule_release(filter);
1785         sx_sunlock(&allproc_lock);
1786         free(buf, M_RCTL);
1787         return (error);
1788 }
1789
1790 int
1791 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1792 {
1793         struct sbuf *sb;
1794         struct rctl_rule *filter;
1795         struct rctl_rule_link *link;
1796         char *inputstr, *buf;
1797         size_t bufsize;
1798         int error;
1799
1800         if (!racct_enable)
1801                 return (ENOSYS);
1802
1803         error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1804         if (error != 0)
1805                 return (error);
1806
1807         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1808         if (error != 0)
1809                 return (error);
1810
1811         sx_slock(&allproc_lock);
1812         error = rctl_string_to_rule(inputstr, &filter);
1813         free(inputstr, M_RCTL);
1814         if (error != 0) {
1815                 sx_sunlock(&allproc_lock);
1816                 return (error);
1817         }
1818
1819         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1820                 rctl_rule_release(filter);
1821                 sx_sunlock(&allproc_lock);
1822                 return (EINVAL);
1823         }
1824         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1825                 rctl_rule_release(filter);
1826                 sx_sunlock(&allproc_lock);
1827                 return (EOPNOTSUPP);
1828         }
1829         if (filter->rr_subject.rs_proc == NULL) {
1830                 rctl_rule_release(filter);
1831                 sx_sunlock(&allproc_lock);
1832                 return (EINVAL);
1833         }
1834
1835         bufsize = uap->outbuflen;
1836         if (bufsize > rctl_maxbufsize) {
1837                 rctl_rule_release(filter);
1838                 sx_sunlock(&allproc_lock);
1839                 return (E2BIG);
1840         }
1841
1842         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1843         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1844         KASSERT(sb != NULL, ("sbuf_new failed"));
1845
1846         RACCT_LOCK();
1847         LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1848             rrl_next) {
1849                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1850                 sbuf_printf(sb, ",");
1851         }
1852         RACCT_UNLOCK();
1853         if (sbuf_error(sb) == ENOMEM) {
1854                 error = ERANGE;
1855                 sbuf_delete(sb);
1856                 goto out;
1857         }
1858
1859         /*
1860          * Remove trailing ",".
1861          */
1862         if (sbuf_len(sb) > 0)
1863                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1864
1865         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1866 out:
1867         rctl_rule_release(filter);
1868         sx_sunlock(&allproc_lock);
1869         free(buf, M_RCTL);
1870         return (error);
1871 }
1872
1873 int
1874 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1875 {
1876         struct rctl_rule *rule;
1877         char *inputstr;
1878         int error;
1879
1880         if (!racct_enable)
1881                 return (ENOSYS);
1882
1883         error = priv_check(td, PRIV_RCTL_ADD_RULE);
1884         if (error != 0)
1885                 return (error);
1886
1887         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1888         if (error != 0)
1889                 return (error);
1890
1891         sx_slock(&allproc_lock);
1892         error = rctl_string_to_rule(inputstr, &rule);
1893         free(inputstr, M_RCTL);
1894         if (error != 0) {
1895                 sx_sunlock(&allproc_lock);
1896                 return (error);
1897         }
1898         /*
1899          * The 'per' part of a rule is optional.
1900          */
1901         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1902             rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1903                 rule->rr_per = rule->rr_subject_type;
1904
1905         if (!rctl_rule_fully_specified(rule)) {
1906                 error = EINVAL;
1907                 goto out;
1908         }
1909
1910         error = rctl_rule_add(rule);
1911
1912 out:
1913         rctl_rule_release(rule);
1914         sx_sunlock(&allproc_lock);
1915         return (error);
1916 }
1917
1918 int
1919 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1920 {
1921         struct rctl_rule *filter;
1922         char *inputstr;
1923         int error;
1924
1925         if (!racct_enable)
1926                 return (ENOSYS);
1927
1928         error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1929         if (error != 0)
1930                 return (error);
1931
1932         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1933         if (error != 0)
1934                 return (error);
1935
1936         sx_slock(&allproc_lock);
1937         error = rctl_string_to_rule(inputstr, &filter);
1938         free(inputstr, M_RCTL);
1939         if (error != 0) {
1940                 sx_sunlock(&allproc_lock);
1941                 return (error);
1942         }
1943
1944         error = rctl_rule_remove(filter);
1945         rctl_rule_release(filter);
1946         sx_sunlock(&allproc_lock);
1947
1948         return (error);
1949 }
1950
1951 /*
1952  * Update RCTL rule list after credential change.
      *
      * Builds a replacement for the rule-link list of p->p_racct and swaps
      * it in.  The replacement holds p's current rules whose subject type is
      * 'process', plus every rule linked to the raccts reached through
      * newcred->cr_ruidinfo, newcred->cr_loginclass and
      * newcred->cr_prison->pr_prison_racct.
      *
      * The link entries are allocated with M_WAITOK while the racct lock is
      * not held, so the count / allocate / fill sequence is repeated until
      * the number of rules found while filling equals the number of entries
      * that were allocated.  Does nothing when racct is disabled.
1953  */
1954 void
1955 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1956 {
1957         LIST_HEAD(, rctl_rule_link) newrules;
1958         struct rctl_rule_link *link, *newlink;
1959         struct uidinfo *newuip;
1960         struct loginclass *newlc;
1961         struct prison_racct *newprr;
1962         int rulecnt, i;
1963
1964         if (!racct_enable)
1965                 return;
1966
             /* The caller must not hold the process lock. */
1967         PROC_LOCK_ASSERT(p, MA_NOTOWNED);
1968
1969         newuip = newcred->cr_ruidinfo;
1970         newlc = newcred->cr_loginclass;
1971         newprr = newcred->cr_prison->pr_prison_racct;
1972
1973         LIST_INIT(&newrules);
1974
1975 again:
1976         /*
1977          * First, count the rules that apply to the process with new
1978          * credentials.
1979          */
1980         rulecnt = 0;
1981         RACCT_LOCK();
1982         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1983                 if (link->rrl_rule->rr_subject_type ==
1984                     RCTL_SUBJECT_TYPE_PROCESS)
1985                         rulecnt++;
1986         }
1987         LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1988                 rulecnt++;
1989         LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1990                 rulecnt++;
1991         LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
1992                 rulecnt++;
1993         RACCT_UNLOCK();
1994
1995         /*
1996          * Create temporary list.  We've dropped the rctl_lock in order
1997          * to use M_WAITOK.
1998          */
1999         for (i = 0; i < rulecnt; i++) {
2000                 newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
2001                 newlink->rrl_rule = NULL;
2002                 newlink->rrl_exceeded = 0;
2003                 LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
2004         }
2005
2006         newlink = LIST_FIRST(&newrules);
2007
2008         /*
2009          * Assign rules to the newly allocated list entries.
              *
              * Each filled entry takes its own reference on the rule and
              * copies the rrl_exceeded value of the link it was taken from.
              * Running out of entries (newlink == NULL) means there are now
              * more rules than were counted above, so start over.
2010          */
2011         RACCT_LOCK();
2012         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2013                 if (link->rrl_rule->rr_subject_type ==
2014                     RCTL_SUBJECT_TYPE_PROCESS) {
2015                         if (newlink == NULL)
2016                                 goto goaround;
2017                         rctl_rule_acquire(link->rrl_rule);
2018                         newlink->rrl_rule = link->rrl_rule;
2019                         newlink->rrl_exceeded = link->rrl_exceeded;
2020                         newlink = LIST_NEXT(newlink, rrl_next);
2021                         rulecnt--;
2022                 }
2023         }
2024         
2025         LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2026                 if (newlink == NULL)
2027                         goto goaround;
2028                 rctl_rule_acquire(link->rrl_rule);
2029                 newlink->rrl_rule = link->rrl_rule;
2030                 newlink->rrl_exceeded = link->rrl_exceeded;
2031                 newlink = LIST_NEXT(newlink, rrl_next);
2032                 rulecnt--;
2033         }
2034
2035         LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2036                 if (newlink == NULL)
2037                         goto goaround;
2038                 rctl_rule_acquire(link->rrl_rule);
2039                 newlink->rrl_rule = link->rrl_rule;
2040                 newlink->rrl_exceeded = link->rrl_exceeded;
2041                 newlink = LIST_NEXT(newlink, rrl_next);
2042                 rulecnt--;
2043         }
2044
2045         LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2046                 if (newlink == NULL)
2047                         goto goaround;
2048                 rctl_rule_acquire(link->rrl_rule);
2049                 newlink->rrl_rule = link->rrl_rule;
2050                 newlink->rrl_exceeded = link->rrl_exceeded;
2051                 newlink = LIST_NEXT(newlink, rrl_next);
2052                 rulecnt--;
2053         }
2054
             /*
              * rulecnt was decremented once per entry filled above, so zero
              * means every allocated entry received a rule.  A non-zero value
              * means fewer rules were found than entries were allocated, and
              * control falls through to the retry path below.
              */
2055         if (rulecnt == 0) {
2056                 /*
2057                  * Free the old rule list.
2058                  */
2059                 while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2060                         link = LIST_FIRST(&p->p_racct->r_rule_links);
2061                         LIST_REMOVE(link, rrl_next);
2062                         rctl_rule_release(link->rrl_rule);
2063                         uma_zfree(rctl_rule_link_zone, link);
2064                 }
2065
2066                 /*
2067                  * Replace lists and we're done.
2068                  *
2069                  * XXX: Is there any way to switch list heads instead
2070                  *      of iterating here?
2071                  */
2072                 while (!LIST_EMPTY(&newrules)) {
2073                         newlink = LIST_FIRST(&newrules);
2074                         LIST_REMOVE(newlink, rrl_next);
2075                         LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2076                             newlink, rrl_next);
2077                 }
2078
2079                 RACCT_UNLOCK();
2080
2081                 return;
2082         }
2083
2084 goaround:
2085         RACCT_UNLOCK();
2086
2087         /*
2088          * Rule list changed while we were not holding the rctl_lock.
2089          * Free the new list and try again.
              *
              * Entries that were already filled hold a rule reference, which
              * is dropped here; entries that were never filled still have
              * rrl_rule == NULL from their allocation.
2090          */
2091         while (!LIST_EMPTY(&newrules)) {
2092                 newlink = LIST_FIRST(&newrules);
2093                 LIST_REMOVE(newlink, rrl_next);
2094                 if (newlink->rrl_rule != NULL)
2095                         rctl_rule_release(newlink->rrl_rule);
2096                 uma_zfree(rctl_rule_link_zone, newlink);
2097         }
2098
2099         goto again;
2100 }
2101
2102 /*
2103  * Assign RCTL rules to the newly created process.
2104  */
2105 int
2106 rctl_proc_fork(struct proc *parent, struct proc *child)
2107 {
2108         struct rctl_rule *rule;
2109         struct rctl_rule_link *link;
2110         int error;
2111
2112         ASSERT_RACCT_ENABLED();
2113         RACCT_LOCK_ASSERT();
2114         KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2115
2116         LIST_INIT(&child->p_racct->r_rule_links);
2117
2118         /*
2119          * Go through limits applicable to the parent and assign them
2120          * to the child.  Rules with 'process' subject have to be duplicated
2121          * in order to make their rr_subject point to the new process.
2122          */
2123         LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2124                 if (link->rrl_rule->rr_subject_type ==
2125                     RCTL_SUBJECT_TYPE_PROCESS) {
2126                         rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2127                         if (rule == NULL)
2128                                 goto fail;
2129                         KASSERT(rule->rr_subject.rs_proc == parent,
2130                             ("rule->rr_subject.rs_proc != parent"));
2131                         rule->rr_subject.rs_proc = child;
2132                         error = rctl_racct_add_rule_locked(child->p_racct,
2133                             rule);
2134                         rctl_rule_release(rule);
2135                         if (error != 0)
2136                                 goto fail;
2137                 } else {
2138                         error = rctl_racct_add_rule_locked(child->p_racct,
2139                             link->rrl_rule);
2140                         if (error != 0)
2141                                 goto fail;
2142                 }
2143         }
2144
2145         return (0);
2146
2147 fail:
2148         while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2149                 link = LIST_FIRST(&child->p_racct->r_rule_links);
2150                 LIST_REMOVE(link, rrl_next);
2151                 rctl_rule_release(link->rrl_rule);
2152                 uma_zfree(rctl_rule_link_zone, link);
2153         }
2154
2155         return (EAGAIN);
2156 }
2157
2158 /*
2159  * Release rules attached to the racct.
2160  */
2161 void
2162 rctl_racct_release(struct racct *racct)
2163 {
2164         struct rctl_rule_link *link;
2165
2166         ASSERT_RACCT_ENABLED();
2167         RACCT_LOCK_ASSERT();
2168
2169         while (!LIST_EMPTY(&racct->r_rule_links)) {
2170                 link = LIST_FIRST(&racct->r_rule_links);
2171                 LIST_REMOVE(link, rrl_next);
2172                 rctl_rule_release(link->rrl_rule);
2173                 uma_zfree(rctl_rule_link_zone, link);
2174         }
2175 }
2176
2177 static void
2178 rctl_init(void)
2179 {
2180
2181         if (!racct_enable)
2182                 return;
2183
2184         rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2185             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2186         rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2187             sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2188             UMA_ALIGN_PTR, 0);
2189
2190         /*
2191          * Set default values, making sure not to overwrite the ones
2192          * fetched from tunables.  Most of those could be set at the
2193          * declaration, except for the rctl_throttle_max - we cannot
2194          * set it there due to hz not being compile time constant.
2195          */
2196         if (rctl_throttle_min < 1)
2197                 rctl_throttle_min = 1;
2198         if (rctl_throttle_max < rctl_throttle_min)
2199                 rctl_throttle_max = 2 * hz;
2200         if (rctl_throttle_pct < 0)
2201                 rctl_throttle_pct = 100;
2202         if (rctl_throttle_pct2 < 0)
2203                 rctl_throttle_pct2 = 100;
2204 }
2205
2206 #else /* !RCTL */
2207
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

        return (ENOSYS);
}
2214
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

        return (ENOSYS);
}
2221
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

        return (ENOSYS);
}
2228
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

        return (ENOSYS);
}
2235
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

        return (ENOSYS);
}
2242
2243 #endif /* !RCTL */