]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/kern/kern_rctl.c
Drop "All rights reserved" from all my stuff. This includes
[FreeBSD/FreeBSD.git] / sys / kern / kern_rctl.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010 The FreeBSD Foundation
5  *
6  * This software was developed by Edward Tomasz Napierala under sponsorship
7  * from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  * $FreeBSD$
31  */
32
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35
36 #include <sys/param.h>
37 #include <sys/devctl.h>
38 #include <sys/malloc.h>
39 #include <sys/queue.h>
40 #include <sys/refcount.h>
41 #include <sys/jail.h>
42 #include <sys/kernel.h>
43 #include <sys/limits.h>
44 #include <sys/loginclass.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/racct.h>
48 #include <sys/rctl.h>
49 #include <sys/resourcevar.h>
50 #include <sys/sx.h>
51 #include <sys/sysent.h>
52 #include <sys/sysproto.h>
53 #include <sys/systm.h>
54 #include <sys/types.h>
55 #include <sys/eventhandler.h>
56 #include <sys/lock.h>
57 #include <sys/mutex.h>
58 #include <sys/rwlock.h>
59 #include <sys/sbuf.h>
60 #include <sys/taskqueue.h>
61 #include <sys/tree.h>
62 #include <vm/uma.h>
63
64 #ifdef RCTL
65 #ifndef RACCT
66 #error "The RCTL option requires the RACCT option"
67 #endif
68
69 FEATURE(rctl, "Resource Limits");
70
/*
 * HRF_* flag values.
 */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer size limits.  Every expression is wrapped in parentheses so that
 * the macro expands to a single operand no matter which operators surround
 * it at the point of use (e.g. 'x % RCTL_MAX_INBUFSIZE').
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Upper bound on the amount subtracted from the available %CPU by
 * rctl_pcpu_available().
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
80
81 static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
82 static int rctl_log_rate_limit = 10;
83 static int rctl_devctl_rate_limit = 10;
84
85 /*
86  * Values below are initialized in rctl_init().
87  */
88 static int rctl_throttle_min = -1;
89 static int rctl_throttle_max = -1;
90 static int rctl_throttle_pct = -1;
91 static int rctl_throttle_pct2 = -1;
92
93 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
94 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
95 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
96 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);
97
98 SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
99     "Resource Limits");
100 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
101     &rctl_maxbufsize, 0, "Maximum output buffer size");
102 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
103     &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
104 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
105     &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
106 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
107     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
108     &rctl_throttle_min_sysctl, "IU",
109     "Shortest throttling duration, in hz");
110 TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
111 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
112     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
113     &rctl_throttle_max_sysctl, "IU",
114     "Longest throttling duration, in hz");
115 TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
116 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
117     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
118     &rctl_throttle_pct_sysctl, "IU",
119     "Throttling penalty for process consumption, in percent");
120 TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
121 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
122     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
123     &rctl_throttle_pct2_sysctl, "IU",
124     "Throttling penalty for container consumption, in percent");
125 TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
126
127 /*
128  * 'rctl_rule_link' connects a rule with every racct it's related to.
129  * For example, rule 'user:X:openfiles:deny=N/process' is linked
130  * with uidinfo for user X, and to each process of that user.
131  */
132 struct rctl_rule_link {
133         LIST_ENTRY(rctl_rule_link)      rrl_next;
134         struct rctl_rule                *rrl_rule;
135         int                             rrl_exceeded;
136 };
137
/*
 * One row of a name <-> numeric value translation table.  The tables
 * below are terminated by a row whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name */
	int		d_value;	/* matching numeric constant */
};
142
143 static struct dict subjectnames[] = {
144         { "process", RCTL_SUBJECT_TYPE_PROCESS },
145         { "user", RCTL_SUBJECT_TYPE_USER },
146         { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
147         { "jail", RCTL_SUBJECT_TYPE_JAIL },
148         { NULL, -1 }};
149
150 static struct dict resourcenames[] = {
151         { "cputime", RACCT_CPU },
152         { "datasize", RACCT_DATA },
153         { "stacksize", RACCT_STACK },
154         { "coredumpsize", RACCT_CORE },
155         { "memoryuse", RACCT_RSS },
156         { "memorylocked", RACCT_MEMLOCK },
157         { "maxproc", RACCT_NPROC },
158         { "openfiles", RACCT_NOFILE },
159         { "vmemoryuse", RACCT_VMEM },
160         { "pseudoterminals", RACCT_NPTS },
161         { "swapuse", RACCT_SWAP },
162         { "nthr", RACCT_NTHR },
163         { "msgqqueued", RACCT_MSGQQUEUED },
164         { "msgqsize", RACCT_MSGQSIZE },
165         { "nmsgq", RACCT_NMSGQ },
166         { "nsem", RACCT_NSEM },
167         { "nsemop", RACCT_NSEMOP },
168         { "nshm", RACCT_NSHM },
169         { "shmsize", RACCT_SHMSIZE },
170         { "wallclock", RACCT_WALLCLOCK },
171         { "pcpu", RACCT_PCTCPU },
172         { "readbps", RACCT_READBPS },
173         { "writebps", RACCT_WRITEBPS },
174         { "readiops", RACCT_READIOPS },
175         { "writeiops", RACCT_WRITEIOPS },
176         { NULL, -1 }};
177
178 static struct dict actionnames[] = {
179         { "sighup", RCTL_ACTION_SIGHUP },
180         { "sigint", RCTL_ACTION_SIGINT },
181         { "sigquit", RCTL_ACTION_SIGQUIT },
182         { "sigill", RCTL_ACTION_SIGILL },
183         { "sigtrap", RCTL_ACTION_SIGTRAP },
184         { "sigabrt", RCTL_ACTION_SIGABRT },
185         { "sigemt", RCTL_ACTION_SIGEMT },
186         { "sigfpe", RCTL_ACTION_SIGFPE },
187         { "sigkill", RCTL_ACTION_SIGKILL },
188         { "sigbus", RCTL_ACTION_SIGBUS },
189         { "sigsegv", RCTL_ACTION_SIGSEGV },
190         { "sigsys", RCTL_ACTION_SIGSYS },
191         { "sigpipe", RCTL_ACTION_SIGPIPE },
192         { "sigalrm", RCTL_ACTION_SIGALRM },
193         { "sigterm", RCTL_ACTION_SIGTERM },
194         { "sigurg", RCTL_ACTION_SIGURG },
195         { "sigstop", RCTL_ACTION_SIGSTOP },
196         { "sigtstp", RCTL_ACTION_SIGTSTP },
197         { "sigchld", RCTL_ACTION_SIGCHLD },
198         { "sigttin", RCTL_ACTION_SIGTTIN },
199         { "sigttou", RCTL_ACTION_SIGTTOU },
200         { "sigio", RCTL_ACTION_SIGIO },
201         { "sigxcpu", RCTL_ACTION_SIGXCPU },
202         { "sigxfsz", RCTL_ACTION_SIGXFSZ },
203         { "sigvtalrm", RCTL_ACTION_SIGVTALRM },
204         { "sigprof", RCTL_ACTION_SIGPROF },
205         { "sigwinch", RCTL_ACTION_SIGWINCH },
206         { "siginfo", RCTL_ACTION_SIGINFO },
207         { "sigusr1", RCTL_ACTION_SIGUSR1 },
208         { "sigusr2", RCTL_ACTION_SIGUSR2 },
209         { "sigthr", RCTL_ACTION_SIGTHR },
210         { "deny", RCTL_ACTION_DENY },
211         { "log", RCTL_ACTION_LOG },
212         { "devctl", RCTL_ACTION_DEVCTL },
213         { "throttle", RCTL_ACTION_THROTTLE },
214         { NULL, -1 }};
215
216 static void rctl_init(void);
217 SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
218
219 static uma_zone_t rctl_rule_zone;
220 static uma_zone_t rctl_rule_link_zone;
221
222 static int rctl_rule_fully_specified(const struct rctl_rule *rule);
223 static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
224
225 static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
226
227 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
228 {
229         int error, val = rctl_throttle_min;
230
231         error = sysctl_handle_int(oidp, &val, 0, req);
232         if (error || !req->newptr)
233                 return (error);
234         if (val < 1 || val > rctl_throttle_max)
235                 return (EINVAL);
236
237         RACCT_LOCK();
238         rctl_throttle_min = val;
239         RACCT_UNLOCK();
240
241         return (0);
242 }
243
244 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
245 {
246         int error, val = rctl_throttle_max;
247
248         error = sysctl_handle_int(oidp, &val, 0, req);
249         if (error || !req->newptr)
250                 return (error);
251         if (val < rctl_throttle_min)
252                 return (EINVAL);
253
254         RACCT_LOCK();
255         rctl_throttle_max = val;
256         RACCT_UNLOCK();
257
258         return (0);
259 }
260
261 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
262 {
263         int error, val = rctl_throttle_pct;
264
265         error = sysctl_handle_int(oidp, &val, 0, req);
266         if (error || !req->newptr)
267                 return (error);
268         if (val < 0)
269                 return (EINVAL);
270
271         RACCT_LOCK();
272         rctl_throttle_pct = val;
273         RACCT_UNLOCK();
274
275         return (0);
276 }
277
278 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
279 {
280         int error, val = rctl_throttle_pct2;
281
282         error = sysctl_handle_int(oidp, &val, 0, req);
283         if (error || !req->newptr)
284                 return (error);
285         if (val < 0)
286                 return (EINVAL);
287
288         RACCT_LOCK();
289         rctl_throttle_pct2 = val;
290         RACCT_UNLOCK();
291
292         return (0);
293 }
294
295 static const char *
296 rctl_subject_type_name(int subject)
297 {
298         int i;
299
300         for (i = 0; subjectnames[i].d_name != NULL; i++) {
301                 if (subjectnames[i].d_value == subject)
302                         return (subjectnames[i].d_name);
303         }
304
305         panic("rctl_subject_type_name: unknown subject type %d", subject);
306 }
307
308 static const char *
309 rctl_action_name(int action)
310 {
311         int i;
312
313         for (i = 0; actionnames[i].d_name != NULL; i++) {
314                 if (actionnames[i].d_value == action)
315                         return (actionnames[i].d_name);
316         }
317
318         panic("rctl_action_name: unknown action %d", action);
319 }
320
321 const char *
322 rctl_resource_name(int resource)
323 {
324         int i;
325
326         for (i = 0; resourcenames[i].d_name != NULL; i++) {
327                 if (resourcenames[i].d_value == resource)
328                         return (resourcenames[i].d_name);
329         }
330
331         panic("rctl_resource_name: unknown resource %d", resource);
332 }
333
334 static struct racct *
335 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
336 {
337         struct ucred *cred = p->p_ucred;
338
339         ASSERT_RACCT_ENABLED();
340         RACCT_LOCK_ASSERT();
341
342         switch (rule->rr_per) {
343         case RCTL_SUBJECT_TYPE_PROCESS:
344                 return (p->p_racct);
345         case RCTL_SUBJECT_TYPE_USER:
346                 return (cred->cr_ruidinfo->ui_racct);
347         case RCTL_SUBJECT_TYPE_LOGINCLASS:
348                 return (cred->cr_loginclass->lc_racct);
349         case RCTL_SUBJECT_TYPE_JAIL:
350                 return (cred->cr_prison->pr_prison_racct->prr_racct);
351         default:
352                 panic("%s: unknown per %d", __func__, rule->rr_per);
353         }
354 }
355
356 /*
357  * Return the amount of resource that can be allocated by 'p' before
358  * hitting 'rule'.
359  */
360 static int64_t
361 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
362 {
363         const struct racct *racct;
364         int64_t available;
365
366         ASSERT_RACCT_ENABLED();
367         RACCT_LOCK_ASSERT();
368
369         racct = rctl_proc_rule_to_racct(p, rule);
370         available = rule->rr_amount - racct->r_resources[rule->rr_resource];
371
372         return (available);
373 }
374
375 /*
376  * Called every second for proc, uidinfo, loginclass, and jail containers.
377  * If the limit isn't exceeded, it decreases the usage amount to zero.
378  * Otherwise, it decreases it by the value of the limit.  This way
379  * resource consumption exceeding the limit "carries over" to the next
380  * period.
381  */
382 void
383 rctl_throttle_decay(struct racct *racct, int resource)
384 {
385         struct rctl_rule *rule;
386         struct rctl_rule_link *link;
387         int64_t minavailable;
388
389         ASSERT_RACCT_ENABLED();
390         RACCT_LOCK_ASSERT();
391
392         minavailable = INT64_MAX;
393
394         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
395                 rule = link->rrl_rule;
396
397                 if (rule->rr_resource != resource)
398                         continue;
399                 if (rule->rr_action != RCTL_ACTION_THROTTLE)
400                         continue;
401
402                 if (rule->rr_amount < minavailable)
403                         minavailable = rule->rr_amount;
404         }
405
406         if (racct->r_resources[resource] < minavailable) {
407                 racct->r_resources[resource] = 0;
408         } else {
409                 /*
410                  * Cap utilization counter at ten times the limit.  Otherwise,
411                  * if we changed the rule lowering the allowed amount, it could
412                  * take unreasonably long time for the accumulated resource
413                  * usage to drop.
414                  */
415                 if (racct->r_resources[resource] > minavailable * 10)
416                         racct->r_resources[resource] = minavailable * 10;
417
418                 racct->r_resources[resource] -= minavailable;
419         }
420 }
421
422 /*
423  * Special version of rctl_get_available() for the %CPU resource.
424  * We slightly cheat here and return less than we normally would.
425  */
426 int64_t
427 rctl_pcpu_available(const struct proc *p) {
428         struct rctl_rule *rule;
429         struct rctl_rule_link *link;
430         int64_t available, minavailable, limit;
431
432         ASSERT_RACCT_ENABLED();
433         RACCT_LOCK_ASSERT();
434
435         minavailable = INT64_MAX;
436         limit = 0;
437
438         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
439                 rule = link->rrl_rule;
440                 if (rule->rr_resource != RACCT_PCTCPU)
441                         continue;
442                 if (rule->rr_action != RCTL_ACTION_DENY)
443                         continue;
444                 available = rctl_available_resource(p, rule);
445                 if (available < minavailable) {
446                         minavailable = available;
447                         limit = rule->rr_amount;
448                 }
449         }
450
451         /*
452          * Return slightly less than actual value of the available
453          * %cpu resource.  This makes %cpu throttling more aggressive
454          * and lets us act sooner than the limits are already exceeded.
455          */
456         if (limit != 0) {
457                 if (limit > 2 * RCTL_PCPU_SHIFT)
458                         minavailable -= RCTL_PCPU_SHIFT;
459                 else
460                         minavailable -= (limit / 2);
461         }
462
463         return (minavailable);
464 }
465
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX when the
 * sum does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum wraps exactly when 'a' exceeds the room left above 'b'. */
	if (a > UINT64_MAX - b)
		return (UINT64_MAX);

	return (a + b);
}
481
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX when
 * the product does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	/* A zero factor can never overflow (and must not be divided by). */
	if (b == 0)
		return (0);

	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
491
492 /*
493  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
494  * to what it keeps allocated now.  Returns non-zero if the allocation should
495  * be denied, 0 otherwise.
496  */
497 int
498 rctl_enforce(struct proc *p, int resource, uint64_t amount)
499 {
500         static struct timeval log_lasttime, devctl_lasttime;
501         static int log_curtime = 0, devctl_curtime = 0;
502         struct rctl_rule *rule;
503         struct rctl_rule_link *link;
504         struct sbuf sb;
505         char *buf;
506         int64_t available;
507         uint64_t sleep_ms, sleep_ratio;
508         int should_deny = 0;
509
510         ASSERT_RACCT_ENABLED();
511         RACCT_LOCK_ASSERT();
512
513         /*
514          * There may be more than one matching rule; go through all of them.
515          * Denial should be done last, after logging and sending signals.
516          */
517         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
518                 rule = link->rrl_rule;
519                 if (rule->rr_resource != resource)
520                         continue;
521
522                 available = rctl_available_resource(p, rule);
523                 if (available >= (int64_t)amount) {
524                         link->rrl_exceeded = 0;
525                         continue;
526                 }
527
528                 switch (rule->rr_action) {
529                 case RCTL_ACTION_DENY:
530                         should_deny = 1;
531                         continue;
532                 case RCTL_ACTION_LOG:
533                         /*
534                          * If rrl_exceeded != 0, it means we've already
535                          * logged a warning for this process.
536                          */
537                         if (link->rrl_exceeded != 0)
538                                 continue;
539
540                         /*
541                          * If the process state is not fully initialized yet,
542                          * we can't access most of the required fields, e.g.
543                          * p->p_comm.  This happens when called from fork1().
544                          * Ignore this rule for now; it will be processed just
545                          * after fork, when called from racct_proc_fork_done().
546                          */
547                         if (p->p_state != PRS_NORMAL)
548                                 continue;
549
550                         if (!ppsratecheck(&log_lasttime, &log_curtime,
551                             rctl_log_rate_limit))
552                                 continue;
553
554                         buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
555                         if (buf == NULL) {
556                                 printf("rctl_enforce: out of memory\n");
557                                 continue;
558                         }
559                         sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
560                         rctl_rule_to_sbuf(&sb, rule);
561                         sbuf_finish(&sb);
562                         printf("rctl: rule \"%s\" matched by pid %d "
563                             "(%s), uid %d, jail %s\n", sbuf_data(&sb),
564                             p->p_pid, p->p_comm, p->p_ucred->cr_uid,
565                             p->p_ucred->cr_prison->pr_prison_racct->prr_name);
566                         sbuf_delete(&sb);
567                         free(buf, M_RCTL);
568                         link->rrl_exceeded = 1;
569                         continue;
570                 case RCTL_ACTION_DEVCTL:
571                         if (link->rrl_exceeded != 0)
572                                 continue;
573
574                         if (p->p_state != PRS_NORMAL)
575                                 continue;
576
577                         if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
578                             rctl_devctl_rate_limit))
579                                 continue;
580
581                         buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
582                         if (buf == NULL) {
583                                 printf("rctl_enforce: out of memory\n");
584                                 continue;
585                         }
586                         sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
587                         sbuf_printf(&sb, "rule=");
588                         rctl_rule_to_sbuf(&sb, rule);
589                         sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
590                             p->p_pid, p->p_ucred->cr_ruid,
591                             p->p_ucred->cr_prison->pr_prison_racct->prr_name);
592                         sbuf_finish(&sb);
593                         devctl_notify("RCTL", "rule", "matched",
594                             sbuf_data(&sb));
595                         sbuf_delete(&sb);
596                         free(buf, M_RCTL);
597                         link->rrl_exceeded = 1;
598                         continue;
599                 case RCTL_ACTION_THROTTLE:
600                         if (p->p_state != PRS_NORMAL)
601                                 continue;
602
603                         /*
604                          * Make the process sleep for a fraction of second
605                          * proportional to the ratio of process' resource
606                          * utilization compared to the limit.  The point is
607                          * to penalize resource hogs: processes that consume
608                          * more of the available resources sleep for longer.
609                          *
610                          * We're trying to defer division until the very end,
611                          * to minimize the rounding effects.  The following
612                          * calculation could have been written in a clearer
613                          * way like this:
614                          *
615                          * sleep_ms = hz * p->p_racct->r_resources[resource] /
616                          *     rule->rr_amount;
617                          * sleep_ms *= rctl_throttle_pct / 100;
618                          * if (sleep_ms < rctl_throttle_min)
619                          *         sleep_ms = rctl_throttle_min;
620                          *
621                          */
622                         sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
623                         sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
624                         if (sleep_ms < rctl_throttle_min * rule->rr_amount)
625                                 sleep_ms = rctl_throttle_min * rule->rr_amount;
626
627                         /*
628                          * Multiply that by the ratio of the resource
629                          * consumption for the container compared to the limit,
630                          * squared.  In other words, a process in a container
631                          * that is two times over the limit will be throttled
632                          * four times as much for hitting the same rule.  The
633                          * point is to penalize processes more if the container
634                          * itself (eg certain UID or jail) is above the limit.
635                          */
636                         if (available < 0)
637                                 sleep_ratio = -available / rule->rr_amount;
638                         else
639                                 sleep_ratio = 0;
640                         sleep_ratio = xmul(sleep_ratio, sleep_ratio);
641                         sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
642                         sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
643
644                         /*
645                          * Finally the division.
646                          */
647                         sleep_ms /= rule->rr_amount;
648
649                         if (sleep_ms > rctl_throttle_max)
650                                 sleep_ms = rctl_throttle_max;
651 #if 0
652                         printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
653                            __func__, p->p_pid, p->p_comm,
654                            p->p_racct->r_resources[resource],
655                            rule->rr_amount, (uintmax_t)sleep_ms,
656                            (uintmax_t)sleep_ratio, (intmax_t)available);
657 #endif
658
659                         KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
660                             __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
661                         racct_proc_throttle(p, sleep_ms);
662                         continue;
663                 default:
664                         if (link->rrl_exceeded != 0)
665                                 continue;
666
667                         if (p->p_state != PRS_NORMAL)
668                                 continue;
669
670                         KASSERT(rule->rr_action > 0 &&
671                             rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
672                             ("rctl_enforce: unknown action %d",
673                              rule->rr_action));
674
675                         /*
676                          * We're using the fact that RCTL_ACTION_SIG* values
677                          * are equal to their counterparts from sys/signal.h.
678                          */
679                         kern_psignal(p, rule->rr_action);
680                         link->rrl_exceeded = 1;
681                         continue;
682                 }
683         }
684
685         if (should_deny) {
686                 /*
687                  * Return fake error code; the caller should change it
688                  * into one proper for the situation - EFSIZ, ENOMEM etc.
689                  */
690                 return (EDOOFUS);
691         }
692
693         return (0);
694 }
695
696 uint64_t
697 rctl_get_limit(struct proc *p, int resource)
698 {
699         struct rctl_rule *rule;
700         struct rctl_rule_link *link;
701         uint64_t amount = UINT64_MAX;
702
703         ASSERT_RACCT_ENABLED();
704         RACCT_LOCK_ASSERT();
705
706         /*
707          * There may be more than one matching rule; go through all of them.
708          * Denial should be done last, after logging and sending signals.
709          */
710         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
711                 rule = link->rrl_rule;
712                 if (rule->rr_resource != resource)
713                         continue;
714                 if (rule->rr_action != RCTL_ACTION_DENY)
715                         continue;
716                 if (rule->rr_amount < amount)
717                         amount = rule->rr_amount;
718         }
719
720         return (amount);
721 }
722
723 uint64_t
724 rctl_get_available(struct proc *p, int resource)
725 {
726         struct rctl_rule *rule;
727         struct rctl_rule_link *link;
728         int64_t available, minavailable, allocated;
729
730         minavailable = INT64_MAX;
731
732         ASSERT_RACCT_ENABLED();
733         RACCT_LOCK_ASSERT();
734
735         /*
736          * There may be more than one matching rule; go through all of them.
737          * Denial should be done last, after logging and sending signals.
738          */
739         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
740                 rule = link->rrl_rule;
741                 if (rule->rr_resource != resource)
742                         continue;
743                 if (rule->rr_action != RCTL_ACTION_DENY)
744                         continue;
745                 available = rctl_available_resource(p, rule);
746                 if (available < minavailable)
747                         minavailable = available;
748         }
749
750         /*
751          * XXX: Think about this _hard_.
752          */
753         allocated = p->p_racct->r_resources[resource];
754         if (minavailable < INT64_MAX - allocated)
755                 minavailable += allocated;
756         if (minavailable < 0)
757                 minavailable = 0;
758
759         return (minavailable);
760 }
761
762 static int
763 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
764 {
765
766         ASSERT_RACCT_ENABLED();
767
768         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
769                 if (rule->rr_subject_type != filter->rr_subject_type)
770                         return (0);
771
772                 switch (filter->rr_subject_type) {
773                 case RCTL_SUBJECT_TYPE_PROCESS:
774                         if (filter->rr_subject.rs_proc != NULL &&
775                             rule->rr_subject.rs_proc !=
776                             filter->rr_subject.rs_proc)
777                                 return (0);
778                         break;
779                 case RCTL_SUBJECT_TYPE_USER:
780                         if (filter->rr_subject.rs_uip != NULL &&
781                             rule->rr_subject.rs_uip !=
782                             filter->rr_subject.rs_uip)
783                                 return (0);
784                         break;
785                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
786                         if (filter->rr_subject.rs_loginclass != NULL &&
787                             rule->rr_subject.rs_loginclass !=
788                             filter->rr_subject.rs_loginclass)
789                                 return (0);
790                         break;
791                 case RCTL_SUBJECT_TYPE_JAIL:
792                         if (filter->rr_subject.rs_prison_racct != NULL &&
793                             rule->rr_subject.rs_prison_racct !=
794                             filter->rr_subject.rs_prison_racct)
795                                 return (0);
796                         break;
797                 default:
798                         panic("rctl_rule_matches: unknown subject type %d",
799                             filter->rr_subject_type);
800                 }
801         }
802
803         if (filter->rr_resource != RACCT_UNDEFINED) {
804                 if (rule->rr_resource != filter->rr_resource)
805                         return (0);
806         }
807
808         if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
809                 if (rule->rr_action != filter->rr_action)
810                         return (0);
811         }
812
813         if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
814                 if (rule->rr_amount != filter->rr_amount)
815                         return (0);
816         }
817
818         if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
819                 if (rule->rr_per != filter->rr_per)
820                         return (0);
821         }
822
823         return (1);
824 }
825
826 static int
827 str2value(const char *str, int *value, struct dict *table)
828 {
829         int i;
830
831         if (value == NULL)
832                 return (EINVAL);
833
834         for (i = 0; table[i].d_name != NULL; i++) {
835                 if (strcasecmp(table[i].d_name, str) == 0) {
836                         *value =  table[i].d_value;
837                         return (0);
838                 }
839         }
840
841         return (EINVAL);
842 }
843
844 static int
845 str2id(const char *str, id_t *value)
846 {
847         char *end;
848
849         if (str == NULL)
850                 return (EINVAL);
851
852         *value = strtoul(str, &end, 10);
853         if ((size_t)(end - str) != strlen(str))
854                 return (EINVAL);
855
856         return (0);
857 }
858
/*
 * Parse a non-negative decimal amount from 'str' into *value.
 *
 * Returns 0 on success.  Returns EINVAL when 'str' is NULL, when it
 * contains no digits at all (including the empty string), or when
 * anything follows the number.  Returns ERANGE when the strtoul()
 * result is negative once stored in the int64_t.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *end;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &end, 10);
	/*
	 * end == str means no conversion was performed; previously an
	 * empty string slipped through as amount 0 because both the
	 * consumed length and strlen() were zero.
	 */
	if (end == str || *end != '\0')
		return (EINVAL);

	if (*value < 0)
		return (ERANGE);

	return (0);
}
876
877 /*
878  * Connect the rule to the racct, increasing refcount for the rule.
879  */
880 static void
881 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
882 {
883         struct rctl_rule_link *link;
884
885         ASSERT_RACCT_ENABLED();
886         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
887
888         rctl_rule_acquire(rule);
889         link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
890         link->rrl_rule = rule;
891         link->rrl_exceeded = 0;
892
893         RACCT_LOCK();
894         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
895         RACCT_UNLOCK();
896 }
897
898 static int
899 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
900 {
901         struct rctl_rule_link *link;
902
903         ASSERT_RACCT_ENABLED();
904         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
905         RACCT_LOCK_ASSERT();
906
907         link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
908         if (link == NULL)
909                 return (ENOMEM);
910         rctl_rule_acquire(rule);
911         link->rrl_rule = rule;
912         link->rrl_exceeded = 0;
913
914         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
915
916         return (0);
917 }
918
919 /*
920  * Remove limits for a rules matching the filter and release
921  * the refcounts for the rules, possibly freeing them.  Returns
922  * the number of limit structures removed.
923  */
924 static int
925 rctl_racct_remove_rules(struct racct *racct,
926     const struct rctl_rule *filter)
927 {
928         struct rctl_rule_link *link, *linktmp;
929         int removed = 0;
930
931         ASSERT_RACCT_ENABLED();
932         RACCT_LOCK_ASSERT();
933
934         LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
935                 if (!rctl_rule_matches(link->rrl_rule, filter))
936                         continue;
937
938                 LIST_REMOVE(link, rrl_next);
939                 rctl_rule_release(link->rrl_rule);
940                 uma_zfree(rctl_rule_link_zone, link);
941                 removed++;
942         }
943         return (removed);
944 }
945
946 static void
947 rctl_rule_acquire_subject(struct rctl_rule *rule)
948 {
949
950         ASSERT_RACCT_ENABLED();
951
952         switch (rule->rr_subject_type) {
953         case RCTL_SUBJECT_TYPE_UNDEFINED:
954         case RCTL_SUBJECT_TYPE_PROCESS:
955                 break;
956         case RCTL_SUBJECT_TYPE_JAIL:
957                 if (rule->rr_subject.rs_prison_racct != NULL)
958                         prison_racct_hold(rule->rr_subject.rs_prison_racct);
959                 break;
960         case RCTL_SUBJECT_TYPE_USER:
961                 if (rule->rr_subject.rs_uip != NULL)
962                         uihold(rule->rr_subject.rs_uip);
963                 break;
964         case RCTL_SUBJECT_TYPE_LOGINCLASS:
965                 if (rule->rr_subject.rs_loginclass != NULL)
966                         loginclass_hold(rule->rr_subject.rs_loginclass);
967                 break;
968         default:
969                 panic("rctl_rule_acquire_subject: unknown subject type %d",
970                     rule->rr_subject_type);
971         }
972 }
973
974 static void
975 rctl_rule_release_subject(struct rctl_rule *rule)
976 {
977
978         ASSERT_RACCT_ENABLED();
979
980         switch (rule->rr_subject_type) {
981         case RCTL_SUBJECT_TYPE_UNDEFINED:
982         case RCTL_SUBJECT_TYPE_PROCESS:
983                 break;
984         case RCTL_SUBJECT_TYPE_JAIL:
985                 if (rule->rr_subject.rs_prison_racct != NULL)
986                         prison_racct_free(rule->rr_subject.rs_prison_racct);
987                 break;
988         case RCTL_SUBJECT_TYPE_USER:
989                 if (rule->rr_subject.rs_uip != NULL)
990                         uifree(rule->rr_subject.rs_uip);
991                 break;
992         case RCTL_SUBJECT_TYPE_LOGINCLASS:
993                 if (rule->rr_subject.rs_loginclass != NULL)
994                         loginclass_free(rule->rr_subject.rs_loginclass);
995                 break;
996         default:
997                 panic("rctl_rule_release_subject: unknown subject type %d",
998                     rule->rr_subject_type);
999         }
1000 }
1001
1002 struct rctl_rule *
1003 rctl_rule_alloc(int flags)
1004 {
1005         struct rctl_rule *rule;
1006
1007         ASSERT_RACCT_ENABLED();
1008
1009         rule = uma_zalloc(rctl_rule_zone, flags);
1010         if (rule == NULL)
1011                 return (NULL);
1012         rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1013         rule->rr_subject.rs_proc = NULL;
1014         rule->rr_subject.rs_uip = NULL;
1015         rule->rr_subject.rs_loginclass = NULL;
1016         rule->rr_subject.rs_prison_racct = NULL;
1017         rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1018         rule->rr_resource = RACCT_UNDEFINED;
1019         rule->rr_action = RCTL_ACTION_UNDEFINED;
1020         rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1021         refcount_init(&rule->rr_refcount, 1);
1022
1023         return (rule);
1024 }
1025
1026 struct rctl_rule *
1027 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1028 {
1029         struct rctl_rule *copy;
1030
1031         ASSERT_RACCT_ENABLED();
1032
1033         copy = uma_zalloc(rctl_rule_zone, flags);
1034         if (copy == NULL)
1035                 return (NULL);
1036         copy->rr_subject_type = rule->rr_subject_type;
1037         copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1038         copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1039         copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1040         copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1041         copy->rr_per = rule->rr_per;
1042         copy->rr_resource = rule->rr_resource;
1043         copy->rr_action = rule->rr_action;
1044         copy->rr_amount = rule->rr_amount;
1045         refcount_init(&copy->rr_refcount, 1);
1046         rctl_rule_acquire_subject(copy);
1047
1048         return (copy);
1049 }
1050
1051 void
1052 rctl_rule_acquire(struct rctl_rule *rule)
1053 {
1054
1055         ASSERT_RACCT_ENABLED();
1056         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1057
1058         refcount_acquire(&rule->rr_refcount);
1059 }
1060
1061 static void
1062 rctl_rule_free(void *context, int pending)
1063 {
1064         struct rctl_rule *rule;
1065
1066         rule = (struct rctl_rule *)context;
1067
1068         ASSERT_RACCT_ENABLED();
1069         KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1070
1071         /*
1072          * We don't need locking here; rule is guaranteed to be inaccessible.
1073          */
1074
1075         rctl_rule_release_subject(rule);
1076         uma_zfree(rctl_rule_zone, rule);
1077 }
1078
1079 void
1080 rctl_rule_release(struct rctl_rule *rule)
1081 {
1082
1083         ASSERT_RACCT_ENABLED();
1084         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1085
1086         if (refcount_release(&rule->rr_refcount)) {
1087                 /*
1088                  * rctl_rule_release() is often called when iterating
1089                  * over all the uidinfo structures in the system,
1090                  * holding uihashtbl_lock.  Since rctl_rule_free()
1091                  * might end up calling uifree(), this would lead
1092                  * to lock recursion.  Use taskqueue to avoid this.
1093                  */
1094                 TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1095                 taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1096         }
1097 }
1098
1099 static int
1100 rctl_rule_fully_specified(const struct rctl_rule *rule)
1101 {
1102
1103         ASSERT_RACCT_ENABLED();
1104
1105         switch (rule->rr_subject_type) {
1106         case RCTL_SUBJECT_TYPE_UNDEFINED:
1107                 return (0);
1108         case RCTL_SUBJECT_TYPE_PROCESS:
1109                 if (rule->rr_subject.rs_proc == NULL)
1110                         return (0);
1111                 break;
1112         case RCTL_SUBJECT_TYPE_USER:
1113                 if (rule->rr_subject.rs_uip == NULL)
1114                         return (0);
1115                 break;
1116         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1117                 if (rule->rr_subject.rs_loginclass == NULL)
1118                         return (0);
1119                 break;
1120         case RCTL_SUBJECT_TYPE_JAIL:
1121                 if (rule->rr_subject.rs_prison_racct == NULL)
1122                         return (0);
1123                 break;
1124         default:
1125                 panic("rctl_rule_fully_specified: unknown subject type %d",
1126                     rule->rr_subject_type);
1127         }
1128         if (rule->rr_resource == RACCT_UNDEFINED)
1129                 return (0);
1130         if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1131                 return (0);
1132         if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1133                 return (0);
1134         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1135                 return (0);
1136
1137         return (1);
1138 }
1139
1140 static int
1141 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1142 {
1143         struct rctl_rule *rule;
1144         char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1145              *amountstr, *perstr;
1146         id_t id;
1147         int error = 0;
1148
1149         ASSERT_RACCT_ENABLED();
1150
1151         rule = rctl_rule_alloc(M_WAITOK);
1152
1153         subjectstr = strsep(&rulestr, ":");
1154         subject_idstr = strsep(&rulestr, ":");
1155         resourcestr = strsep(&rulestr, ":");
1156         actionstr = strsep(&rulestr, "=/");
1157         amountstr = strsep(&rulestr, "/");
1158         perstr = rulestr;
1159
1160         if (subjectstr == NULL || subjectstr[0] == '\0')
1161                 rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1162         else {
1163                 error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1164                 if (error != 0)
1165                         goto out;
1166         }
1167
1168         if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1169                 rule->rr_subject.rs_proc = NULL;
1170                 rule->rr_subject.rs_uip = NULL;
1171                 rule->rr_subject.rs_loginclass = NULL;
1172                 rule->rr_subject.rs_prison_racct = NULL;
1173         } else {
1174                 switch (rule->rr_subject_type) {
1175                 case RCTL_SUBJECT_TYPE_UNDEFINED:
1176                         error = EINVAL;
1177                         goto out;
1178                 case RCTL_SUBJECT_TYPE_PROCESS:
1179                         error = str2id(subject_idstr, &id);
1180                         if (error != 0)
1181                                 goto out;
1182                         sx_assert(&allproc_lock, SA_LOCKED);
1183                         rule->rr_subject.rs_proc = pfind(id);
1184                         if (rule->rr_subject.rs_proc == NULL) {
1185                                 error = ESRCH;
1186                                 goto out;
1187                         }
1188                         PROC_UNLOCK(rule->rr_subject.rs_proc);
1189                         break;
1190                 case RCTL_SUBJECT_TYPE_USER:
1191                         error = str2id(subject_idstr, &id);
1192                         if (error != 0)
1193                                 goto out;
1194                         rule->rr_subject.rs_uip = uifind(id);
1195                         break;
1196                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1197                         rule->rr_subject.rs_loginclass =
1198                             loginclass_find(subject_idstr);
1199                         if (rule->rr_subject.rs_loginclass == NULL) {
1200                                 error = ENAMETOOLONG;
1201                                 goto out;
1202                         }
1203                         break;
1204                 case RCTL_SUBJECT_TYPE_JAIL:
1205                         rule->rr_subject.rs_prison_racct =
1206                             prison_racct_find(subject_idstr);
1207                         if (rule->rr_subject.rs_prison_racct == NULL) {
1208                                 error = ENAMETOOLONG;
1209                                 goto out;
1210                         }
1211                         break;
1212                default:
1213                        panic("rctl_string_to_rule: unknown subject type %d",
1214                            rule->rr_subject_type);
1215                }
1216         }
1217
1218         if (resourcestr == NULL || resourcestr[0] == '\0')
1219                 rule->rr_resource = RACCT_UNDEFINED;
1220         else {
1221                 error = str2value(resourcestr, &rule->rr_resource,
1222                     resourcenames);
1223                 if (error != 0)
1224                         goto out;
1225         }
1226
1227         if (actionstr == NULL || actionstr[0] == '\0')
1228                 rule->rr_action = RCTL_ACTION_UNDEFINED;
1229         else {
1230                 error = str2value(actionstr, &rule->rr_action, actionnames);
1231                 if (error != 0)
1232                         goto out;
1233         }
1234
1235         if (amountstr == NULL || amountstr[0] == '\0')
1236                 rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1237         else {
1238                 error = str2int64(amountstr, &rule->rr_amount);
1239                 if (error != 0)
1240                         goto out;
1241                 if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1242                         if (rule->rr_amount > INT64_MAX / 1000000) {
1243                                 error = ERANGE;
1244                                 goto out;
1245                         }
1246                         rule->rr_amount *= 1000000;
1247                 }
1248         }
1249
1250         if (perstr == NULL || perstr[0] == '\0')
1251                 rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1252         else {
1253                 error = str2value(perstr, &rule->rr_per, subjectnames);
1254                 if (error != 0)
1255                         goto out;
1256         }
1257
1258 out:
1259         if (error == 0)
1260                 *rulep = rule;
1261         else
1262                 rctl_rule_release(rule);
1263
1264         return (error);
1265 }
1266
1267 /*
1268  * Link a rule with all the subjects it applies to.
1269  */
1270 int
1271 rctl_rule_add(struct rctl_rule *rule)
1272 {
1273         struct proc *p;
1274         struct ucred *cred;
1275         struct uidinfo *uip;
1276         struct prison *pr;
1277         struct prison_racct *prr;
1278         struct loginclass *lc;
1279         struct rctl_rule *rule2;
1280         int match;
1281
1282         ASSERT_RACCT_ENABLED();
1283         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1284
1285         /*
1286          * Some rules just don't make sense, like "deny" rule for an undeniable
1287          * resource.  The exception are the RSS and %CPU resources - they are
1288          * not deniable in the racct sense, but the limit is enforced in
1289          * a different way.
1290          */
1291         if (rule->rr_action == RCTL_ACTION_DENY &&
1292             !RACCT_IS_DENIABLE(rule->rr_resource) &&
1293             rule->rr_resource != RACCT_RSS &&
1294             rule->rr_resource != RACCT_PCTCPU) {
1295                 return (EOPNOTSUPP);
1296         }
1297
1298         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1299             !RACCT_IS_DECAYING(rule->rr_resource)) {
1300                 return (EOPNOTSUPP);
1301         }
1302
1303         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1304             rule->rr_resource == RACCT_PCTCPU) {
1305                 return (EOPNOTSUPP);
1306         }
1307
1308         if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1309             RACCT_IS_SLOPPY(rule->rr_resource)) {
1310                 return (EOPNOTSUPP);
1311         }
1312
1313         /*
1314          * Make sure there are no duplicated rules.  Also, for the "deny"
1315          * rules, remove ones differing only by "amount".
1316          */
1317         if (rule->rr_action == RCTL_ACTION_DENY) {
1318                 rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1319                 rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1320                 rctl_rule_remove(rule2);
1321                 rctl_rule_release(rule2);
1322         } else
1323                 rctl_rule_remove(rule);
1324
1325         switch (rule->rr_subject_type) {
1326         case RCTL_SUBJECT_TYPE_PROCESS:
1327                 p = rule->rr_subject.rs_proc;
1328                 KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1329
1330                 rctl_racct_add_rule(p->p_racct, rule);
1331                 /*
1332                  * In case of per-process rule, we don't have anything more
1333                  * to do.
1334                  */
1335                 return (0);
1336
1337         case RCTL_SUBJECT_TYPE_USER:
1338                 uip = rule->rr_subject.rs_uip;
1339                 KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1340                 rctl_racct_add_rule(uip->ui_racct, rule);
1341                 break;
1342
1343         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1344                 lc = rule->rr_subject.rs_loginclass;
1345                 KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1346                 rctl_racct_add_rule(lc->lc_racct, rule);
1347                 break;
1348
1349         case RCTL_SUBJECT_TYPE_JAIL:
1350                 prr = rule->rr_subject.rs_prison_racct;
1351                 KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1352                 rctl_racct_add_rule(prr->prr_racct, rule);
1353                 break;
1354
1355         default:
1356                 panic("rctl_rule_add: unknown subject type %d",
1357                     rule->rr_subject_type);
1358         }
1359
1360         /*
1361          * Now go through all the processes and add the new rule to the ones
1362          * it applies to.
1363          */
1364         sx_assert(&allproc_lock, SA_LOCKED);
1365         FOREACH_PROC_IN_SYSTEM(p) {
1366                 cred = p->p_ucred;
1367                 switch (rule->rr_subject_type) {
1368                 case RCTL_SUBJECT_TYPE_USER:
1369                         if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1370                             cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1371                                 break;
1372                         continue;
1373                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1374                         if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1375                                 break;
1376                         continue;
1377                 case RCTL_SUBJECT_TYPE_JAIL:
1378                         match = 0;
1379                         for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1380                                 if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1381                                         match = 1;
1382                                         break;
1383                                 }
1384                         }
1385                         if (match)
1386                                 break;
1387                         continue;
1388                 default:
1389                         panic("rctl_rule_add: unknown subject type %d",
1390                             rule->rr_subject_type);
1391                 }
1392
1393                 rctl_racct_add_rule(p->p_racct, rule);
1394         }
1395
1396         return (0);
1397 }
1398
/*
 * "Pre" hook passed to the *_racct_foreach() iterators: takes the
 * racct lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RACCT_LOCK();
}
1405
/*
 * "Post" hook passed to the *_racct_foreach() iterators: drops the
 * racct lock.
 */
static void
rctl_rule_post_callback(void)
{

	RACCT_UNLOCK();
}
1412
/*
 * Per-racct callback used by rctl_rule_remove(): removes the rules
 * matching the filter (arg2) from 'racct' and adds the number removed
 * to the int counter that arg3 points to.  Runs with the racct lock
 * held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *foundp = arg3;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	*foundp += rctl_racct_remove_rules(racct, filter);
}
1426
1427 /*
1428  * Remove all rules that match the filter.
1429  */
1430 int
1431 rctl_rule_remove(struct rctl_rule *filter)
1432 {
1433         struct proc *p;
1434         int found = 0;
1435
1436         ASSERT_RACCT_ENABLED();
1437
1438         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1439             filter->rr_subject.rs_proc != NULL) {
1440                 p = filter->rr_subject.rs_proc;
1441                 RACCT_LOCK();
1442                 found = rctl_racct_remove_rules(p->p_racct, filter);
1443                 RACCT_UNLOCK();
1444                 if (found)
1445                         return (0);
1446                 return (ESRCH);
1447         }
1448
1449         loginclass_racct_foreach(rctl_rule_remove_callback,
1450             rctl_rule_pre_callback, rctl_rule_post_callback,
1451             filter, (void *)&found);
1452         ui_racct_foreach(rctl_rule_remove_callback,
1453             rctl_rule_pre_callback, rctl_rule_post_callback,
1454             filter, (void *)&found);
1455         prison_racct_foreach(rctl_rule_remove_callback,
1456             rctl_rule_pre_callback, rctl_rule_post_callback,
1457             filter, (void *)&found);
1458
1459         sx_assert(&allproc_lock, SA_LOCKED);
1460         RACCT_LOCK();
1461         FOREACH_PROC_IN_SYSTEM(p) {
1462                 found += rctl_racct_remove_rules(p->p_racct, filter);
1463         }
1464         RACCT_UNLOCK();
1465
1466         if (found)
1467                 return (0);
1468         return (ESRCH);
1469 }
1470
1471 /*
1472  * Appends a rule to the sbuf.
1473  */
1474 static void
1475 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1476 {
1477         int64_t amount;
1478
1479         ASSERT_RACCT_ENABLED();
1480
1481         sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1482
1483         switch (rule->rr_subject_type) {
1484         case RCTL_SUBJECT_TYPE_PROCESS:
1485                 if (rule->rr_subject.rs_proc == NULL)
1486                         sbuf_printf(sb, ":");
1487                 else
1488                         sbuf_printf(sb, "%d:",
1489                             rule->rr_subject.rs_proc->p_pid);
1490                 break;
1491         case RCTL_SUBJECT_TYPE_USER:
1492                 if (rule->rr_subject.rs_uip == NULL)
1493                         sbuf_printf(sb, ":");
1494                 else
1495                         sbuf_printf(sb, "%d:",
1496                             rule->rr_subject.rs_uip->ui_uid);
1497                 break;
1498         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1499                 if (rule->rr_subject.rs_loginclass == NULL)
1500                         sbuf_printf(sb, ":");
1501                 else
1502                         sbuf_printf(sb, "%s:",
1503                             rule->rr_subject.rs_loginclass->lc_name);
1504                 break;
1505         case RCTL_SUBJECT_TYPE_JAIL:
1506                 if (rule->rr_subject.rs_prison_racct == NULL)
1507                         sbuf_printf(sb, ":");
1508                 else
1509                         sbuf_printf(sb, "%s:",
1510                             rule->rr_subject.rs_prison_racct->prr_name);
1511                 break;
1512         default:
1513                 panic("rctl_rule_to_sbuf: unknown subject type %d",
1514                     rule->rr_subject_type);
1515         }
1516
1517         amount = rule->rr_amount;
1518         if (amount != RCTL_AMOUNT_UNDEFINED &&
1519             RACCT_IS_IN_MILLIONS(rule->rr_resource))
1520                 amount /= 1000000;
1521
1522         sbuf_printf(sb, "%s:%s=%jd",
1523             rctl_resource_name(rule->rr_resource),
1524             rctl_action_name(rule->rr_action),
1525             amount);
1526
1527         if (rule->rr_per != rule->rr_subject_type)
1528                 sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1529 }
1530
1531 /*
1532  * Routine used by RCTL syscalls to read in input string.
1533  */
1534 static int
1535 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1536 {
1537         char *str;
1538         int error;
1539
1540         ASSERT_RACCT_ENABLED();
1541
1542         if (inbuflen <= 0)
1543                 return (EINVAL);
1544         if (inbuflen > RCTL_MAX_INBUFSIZE)
1545                 return (E2BIG);
1546
1547         str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1548         error = copyinstr(inbufp, str, inbuflen, NULL);
1549         if (error != 0) {
1550                 free(str, M_RCTL);
1551                 return (error);
1552         }
1553
1554         *inputstr = str;
1555
1556         return (0);
1557 }
1558
/*
 * Copy the output of an RCTL syscall out to userspace and dispose of
 * the sbuf.
 *
 * A NULL sbuf is treated as "nothing to write" and yields 0.
 * Otherwise the sbuf is finished and, if the user buffer can hold the
 * data plus its terminating NUL, copied out; ERANGE is returned when
 * it cannot.  The sbuf is deleted on every non-NULL path.
 */
static int
rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
{
	int error;

	ASSERT_RACCT_ENABLED();

	if (outputsbuf == NULL)
		return (0);

	sbuf_finish(outputsbuf);
	if (outbuflen < sbuf_len(outputsbuf) + 1)
		error = ERANGE;
	else
		error = copyout(sbuf_data(outputsbuf), outbufp,
		    sbuf_len(outputsbuf) + 1);
	sbuf_delete(outputsbuf);

	return (error);
}
1582
1583 static struct sbuf *
1584 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1585 {
1586         struct sbuf *sb;
1587         int64_t amount;
1588         int i;
1589
1590         ASSERT_RACCT_ENABLED();
1591
1592         sb = sbuf_new_auto();
1593         for (i = 0; i <= RACCT_MAX; i++) {
1594                 if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1595                         continue;
1596                 RACCT_LOCK();
1597                 amount = racct->r_resources[i];
1598                 RACCT_UNLOCK();
1599                 if (RACCT_IS_IN_MILLIONS(i))
1600                         amount /= 1000000;
1601                 sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1602         }
1603         sbuf_setpos(sb, sbuf_len(sb) - 1);
1604         return (sb);
1605 }
1606
1607 int
1608 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1609 {
1610         struct rctl_rule *filter;
1611         struct sbuf *outputsbuf = NULL;
1612         struct proc *p;
1613         struct uidinfo *uip;
1614         struct loginclass *lc;
1615         struct prison_racct *prr;
1616         char *inputstr;
1617         int error;
1618
1619         if (!racct_enable)
1620                 return (ENOSYS);
1621
1622         error = priv_check(td, PRIV_RCTL_GET_RACCT);
1623         if (error != 0)
1624                 return (error);
1625
1626         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1627         if (error != 0)
1628                 return (error);
1629
1630         sx_slock(&allproc_lock);
1631         error = rctl_string_to_rule(inputstr, &filter);
1632         free(inputstr, M_RCTL);
1633         if (error != 0) {
1634                 sx_sunlock(&allproc_lock);
1635                 return (error);
1636         }
1637
1638         switch (filter->rr_subject_type) {
1639         case RCTL_SUBJECT_TYPE_PROCESS:
1640                 p = filter->rr_subject.rs_proc;
1641                 if (p == NULL) {
1642                         error = EINVAL;
1643                         goto out;
1644                 }
1645                 outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1646                 break;
1647         case RCTL_SUBJECT_TYPE_USER:
1648                 uip = filter->rr_subject.rs_uip;
1649                 if (uip == NULL) {
1650                         error = EINVAL;
1651                         goto out;
1652                 }
1653                 outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1654                 break;
1655         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1656                 lc = filter->rr_subject.rs_loginclass;
1657                 if (lc == NULL) {
1658                         error = EINVAL;
1659                         goto out;
1660                 }
1661                 outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1662                 break;
1663         case RCTL_SUBJECT_TYPE_JAIL:
1664                 prr = filter->rr_subject.rs_prison_racct;
1665                 if (prr == NULL) {
1666                         error = EINVAL;
1667                         goto out;
1668                 }
1669                 outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1670                 break;
1671         default:
1672                 error = EINVAL;
1673         }
1674 out:
1675         rctl_rule_release(filter);
1676         sx_sunlock(&allproc_lock);
1677         if (error != 0)
1678                 return (error);
1679
1680         error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1681
1682         return (error);
1683 }
1684
1685 static void
1686 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1687 {
1688         struct rctl_rule *filter = (struct rctl_rule *)arg2;
1689         struct rctl_rule_link *link;
1690         struct sbuf *sb = (struct sbuf *)arg3;
1691
1692         ASSERT_RACCT_ENABLED();
1693         RACCT_LOCK_ASSERT();
1694
1695         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1696                 if (!rctl_rule_matches(link->rrl_rule, filter))
1697                         continue;
1698                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1699                 sbuf_printf(sb, ",");
1700         }
1701 }
1702
1703 int
1704 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1705 {
1706         struct sbuf *sb;
1707         struct rctl_rule *filter;
1708         struct rctl_rule_link *link;
1709         struct proc *p;
1710         char *inputstr, *buf;
1711         size_t bufsize;
1712         int error;
1713
1714         if (!racct_enable)
1715                 return (ENOSYS);
1716
1717         error = priv_check(td, PRIV_RCTL_GET_RULES);
1718         if (error != 0)
1719                 return (error);
1720
1721         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1722         if (error != 0)
1723                 return (error);
1724
1725         sx_slock(&allproc_lock);
1726         error = rctl_string_to_rule(inputstr, &filter);
1727         free(inputstr, M_RCTL);
1728         if (error != 0) {
1729                 sx_sunlock(&allproc_lock);
1730                 return (error);
1731         }
1732
1733         bufsize = uap->outbuflen;
1734         if (bufsize > rctl_maxbufsize) {
1735                 sx_sunlock(&allproc_lock);
1736                 return (E2BIG);
1737         }
1738
1739         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1740         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1741         KASSERT(sb != NULL, ("sbuf_new failed"));
1742
1743         FOREACH_PROC_IN_SYSTEM(p) {
1744                 RACCT_LOCK();
1745                 LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1746                         /*
1747                          * Non-process rules will be added to the buffer later.
1748                          * Adding them here would result in duplicated output.
1749                          */
1750                         if (link->rrl_rule->rr_subject_type !=
1751                             RCTL_SUBJECT_TYPE_PROCESS)
1752                                 continue;
1753                         if (!rctl_rule_matches(link->rrl_rule, filter))
1754                                 continue;
1755                         rctl_rule_to_sbuf(sb, link->rrl_rule);
1756                         sbuf_printf(sb, ",");
1757                 }
1758                 RACCT_UNLOCK();
1759         }
1760
1761         loginclass_racct_foreach(rctl_get_rules_callback,
1762             rctl_rule_pre_callback, rctl_rule_post_callback,
1763             filter, sb);
1764         ui_racct_foreach(rctl_get_rules_callback,
1765             rctl_rule_pre_callback, rctl_rule_post_callback,
1766             filter, sb);
1767         prison_racct_foreach(rctl_get_rules_callback,
1768             rctl_rule_pre_callback, rctl_rule_post_callback,
1769             filter, sb);
1770         if (sbuf_error(sb) == ENOMEM) {
1771                 error = ERANGE;
1772                 goto out;
1773         }
1774
1775         /*
1776          * Remove trailing ",".
1777          */
1778         if (sbuf_len(sb) > 0)
1779                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1780
1781         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1782 out:
1783         rctl_rule_release(filter);
1784         sx_sunlock(&allproc_lock);
1785         free(buf, M_RCTL);
1786         return (error);
1787 }
1788
1789 int
1790 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1791 {
1792         struct sbuf *sb;
1793         struct rctl_rule *filter;
1794         struct rctl_rule_link *link;
1795         char *inputstr, *buf;
1796         size_t bufsize;
1797         int error;
1798
1799         if (!racct_enable)
1800                 return (ENOSYS);
1801
1802         error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1803         if (error != 0)
1804                 return (error);
1805
1806         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1807         if (error != 0)
1808                 return (error);
1809
1810         sx_slock(&allproc_lock);
1811         error = rctl_string_to_rule(inputstr, &filter);
1812         free(inputstr, M_RCTL);
1813         if (error != 0) {
1814                 sx_sunlock(&allproc_lock);
1815                 return (error);
1816         }
1817
1818         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1819                 rctl_rule_release(filter);
1820                 sx_sunlock(&allproc_lock);
1821                 return (EINVAL);
1822         }
1823         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1824                 rctl_rule_release(filter);
1825                 sx_sunlock(&allproc_lock);
1826                 return (EOPNOTSUPP);
1827         }
1828         if (filter->rr_subject.rs_proc == NULL) {
1829                 rctl_rule_release(filter);
1830                 sx_sunlock(&allproc_lock);
1831                 return (EINVAL);
1832         }
1833
1834         bufsize = uap->outbuflen;
1835         if (bufsize > rctl_maxbufsize) {
1836                 rctl_rule_release(filter);
1837                 sx_sunlock(&allproc_lock);
1838                 return (E2BIG);
1839         }
1840
1841         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1842         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1843         KASSERT(sb != NULL, ("sbuf_new failed"));
1844
1845         RACCT_LOCK();
1846         LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1847             rrl_next) {
1848                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1849                 sbuf_printf(sb, ",");
1850         }
1851         RACCT_UNLOCK();
1852         if (sbuf_error(sb) == ENOMEM) {
1853                 error = ERANGE;
1854                 sbuf_delete(sb);
1855                 goto out;
1856         }
1857
1858         /*
1859          * Remove trailing ",".
1860          */
1861         if (sbuf_len(sb) > 0)
1862                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1863
1864         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1865 out:
1866         rctl_rule_release(filter);
1867         sx_sunlock(&allproc_lock);
1868         free(buf, M_RCTL);
1869         return (error);
1870 }
1871
1872 int
1873 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1874 {
1875         struct rctl_rule *rule;
1876         char *inputstr;
1877         int error;
1878
1879         if (!racct_enable)
1880                 return (ENOSYS);
1881
1882         error = priv_check(td, PRIV_RCTL_ADD_RULE);
1883         if (error != 0)
1884                 return (error);
1885
1886         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1887         if (error != 0)
1888                 return (error);
1889
1890         sx_slock(&allproc_lock);
1891         error = rctl_string_to_rule(inputstr, &rule);
1892         free(inputstr, M_RCTL);
1893         if (error != 0) {
1894                 sx_sunlock(&allproc_lock);
1895                 return (error);
1896         }
1897         /*
1898          * The 'per' part of a rule is optional.
1899          */
1900         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1901             rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1902                 rule->rr_per = rule->rr_subject_type;
1903
1904         if (!rctl_rule_fully_specified(rule)) {
1905                 error = EINVAL;
1906                 goto out;
1907         }
1908
1909         error = rctl_rule_add(rule);
1910
1911 out:
1912         rctl_rule_release(rule);
1913         sx_sunlock(&allproc_lock);
1914         return (error);
1915 }
1916
1917 int
1918 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1919 {
1920         struct rctl_rule *filter;
1921         char *inputstr;
1922         int error;
1923
1924         if (!racct_enable)
1925                 return (ENOSYS);
1926
1927         error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1928         if (error != 0)
1929                 return (error);
1930
1931         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1932         if (error != 0)
1933                 return (error);
1934
1935         sx_slock(&allproc_lock);
1936         error = rctl_string_to_rule(inputstr, &filter);
1937         free(inputstr, M_RCTL);
1938         if (error != 0) {
1939                 sx_sunlock(&allproc_lock);
1940                 return (error);
1941         }
1942
1943         error = rctl_rule_remove(filter);
1944         rctl_rule_release(filter);
1945         sx_sunlock(&allproc_lock);
1946
1947         return (error);
1948 }
1949
1950 /*
1951  * Update RCTL rule list after credential change.
1952  */
1953 void
1954 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1955 {
1956         LIST_HEAD(, rctl_rule_link) newrules;
1957         struct rctl_rule_link *link, *newlink;
1958         struct uidinfo *newuip;
1959         struct loginclass *newlc;
1960         struct prison_racct *newprr;
1961         int rulecnt, i;
1962
1963         if (!racct_enable)
1964                 return;
1965
1966         PROC_LOCK_ASSERT(p, MA_NOTOWNED);
1967
1968         newuip = newcred->cr_ruidinfo;
1969         newlc = newcred->cr_loginclass;
1970         newprr = newcred->cr_prison->pr_prison_racct;
1971
1972         LIST_INIT(&newrules);
1973
1974 again:
1975         /*
1976          * First, count the rules that apply to the process with new
1977          * credentials.
1978          */
1979         rulecnt = 0;
1980         RACCT_LOCK();
1981         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1982                 if (link->rrl_rule->rr_subject_type ==
1983                     RCTL_SUBJECT_TYPE_PROCESS)
1984                         rulecnt++;
1985         }
1986         LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1987                 rulecnt++;
1988         LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1989                 rulecnt++;
1990         LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
1991                 rulecnt++;
1992         RACCT_UNLOCK();
1993
1994         /*
1995          * Create temporary list.  We've dropped the rctl_lock in order
1996          * to use M_WAITOK.
1997          */
1998         for (i = 0; i < rulecnt; i++) {
1999                 newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
2000                 newlink->rrl_rule = NULL;
2001                 newlink->rrl_exceeded = 0;
2002                 LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
2003         }
2004
2005         newlink = LIST_FIRST(&newrules);
2006
2007         /*
2008          * Assign rules to the newly allocated list entries.
2009          */
2010         RACCT_LOCK();
2011         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2012                 if (link->rrl_rule->rr_subject_type ==
2013                     RCTL_SUBJECT_TYPE_PROCESS) {
2014                         if (newlink == NULL)
2015                                 goto goaround;
2016                         rctl_rule_acquire(link->rrl_rule);
2017                         newlink->rrl_rule = link->rrl_rule;
2018                         newlink->rrl_exceeded = link->rrl_exceeded;
2019                         newlink = LIST_NEXT(newlink, rrl_next);
2020                         rulecnt--;
2021                 }
2022         }
2023
2024         LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2025                 if (newlink == NULL)
2026                         goto goaround;
2027                 rctl_rule_acquire(link->rrl_rule);
2028                 newlink->rrl_rule = link->rrl_rule;
2029                 newlink->rrl_exceeded = link->rrl_exceeded;
2030                 newlink = LIST_NEXT(newlink, rrl_next);
2031                 rulecnt--;
2032         }
2033
2034         LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2035                 if (newlink == NULL)
2036                         goto goaround;
2037                 rctl_rule_acquire(link->rrl_rule);
2038                 newlink->rrl_rule = link->rrl_rule;
2039                 newlink->rrl_exceeded = link->rrl_exceeded;
2040                 newlink = LIST_NEXT(newlink, rrl_next);
2041                 rulecnt--;
2042         }
2043
2044         LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2045                 if (newlink == NULL)
2046                         goto goaround;
2047                 rctl_rule_acquire(link->rrl_rule);
2048                 newlink->rrl_rule = link->rrl_rule;
2049                 newlink->rrl_exceeded = link->rrl_exceeded;
2050                 newlink = LIST_NEXT(newlink, rrl_next);
2051                 rulecnt--;
2052         }
2053
2054         if (rulecnt == 0) {
2055                 /*
2056                  * Free the old rule list.
2057                  */
2058                 while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2059                         link = LIST_FIRST(&p->p_racct->r_rule_links);
2060                         LIST_REMOVE(link, rrl_next);
2061                         rctl_rule_release(link->rrl_rule);
2062                         uma_zfree(rctl_rule_link_zone, link);
2063                 }
2064
2065                 /*
2066                  * Replace lists and we're done.
2067                  *
2068                  * XXX: Is there any way to switch list heads instead
2069                  *      of iterating here?
2070                  */
2071                 while (!LIST_EMPTY(&newrules)) {
2072                         newlink = LIST_FIRST(&newrules);
2073                         LIST_REMOVE(newlink, rrl_next);
2074                         LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2075                             newlink, rrl_next);
2076                 }
2077
2078                 RACCT_UNLOCK();
2079
2080                 return;
2081         }
2082
2083 goaround:
2084         RACCT_UNLOCK();
2085
2086         /*
2087          * Rule list changed while we were not holding the rctl_lock.
2088          * Free the new list and try again.
2089          */
2090         while (!LIST_EMPTY(&newrules)) {
2091                 newlink = LIST_FIRST(&newrules);
2092                 LIST_REMOVE(newlink, rrl_next);
2093                 if (newlink->rrl_rule != NULL)
2094                         rctl_rule_release(newlink->rrl_rule);
2095                 uma_zfree(rctl_rule_link_zone, newlink);
2096         }
2097
2098         goto again;
2099 }
2100
2101 /*
2102  * Assign RCTL rules to the newly created process.
2103  */
2104 int
2105 rctl_proc_fork(struct proc *parent, struct proc *child)
2106 {
2107         struct rctl_rule *rule;
2108         struct rctl_rule_link *link;
2109         int error;
2110
2111         ASSERT_RACCT_ENABLED();
2112         RACCT_LOCK_ASSERT();
2113         KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2114
2115         LIST_INIT(&child->p_racct->r_rule_links);
2116
2117         /*
2118          * Go through limits applicable to the parent and assign them
2119          * to the child.  Rules with 'process' subject have to be duplicated
2120          * in order to make their rr_subject point to the new process.
2121          */
2122         LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2123                 if (link->rrl_rule->rr_subject_type ==
2124                     RCTL_SUBJECT_TYPE_PROCESS) {
2125                         rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2126                         if (rule == NULL)
2127                                 goto fail;
2128                         KASSERT(rule->rr_subject.rs_proc == parent,
2129                             ("rule->rr_subject.rs_proc != parent"));
2130                         rule->rr_subject.rs_proc = child;
2131                         error = rctl_racct_add_rule_locked(child->p_racct,
2132                             rule);
2133                         rctl_rule_release(rule);
2134                         if (error != 0)
2135                                 goto fail;
2136                 } else {
2137                         error = rctl_racct_add_rule_locked(child->p_racct,
2138                             link->rrl_rule);
2139                         if (error != 0)
2140                                 goto fail;
2141                 }
2142         }
2143
2144         return (0);
2145
2146 fail:
2147         while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2148                 link = LIST_FIRST(&child->p_racct->r_rule_links);
2149                 LIST_REMOVE(link, rrl_next);
2150                 rctl_rule_release(link->rrl_rule);
2151                 uma_zfree(rctl_rule_link_zone, link);
2152         }
2153
2154         return (EAGAIN);
2155 }
2156
2157 /*
2158  * Release rules attached to the racct.
2159  */
2160 void
2161 rctl_racct_release(struct racct *racct)
2162 {
2163         struct rctl_rule_link *link;
2164
2165         ASSERT_RACCT_ENABLED();
2166         RACCT_LOCK_ASSERT();
2167
2168         while (!LIST_EMPTY(&racct->r_rule_links)) {
2169                 link = LIST_FIRST(&racct->r_rule_links);
2170                 LIST_REMOVE(link, rrl_next);
2171                 rctl_rule_release(link->rrl_rule);
2172                 uma_zfree(rctl_rule_link_zone, link);
2173         }
2174 }
2175
2176 static void
2177 rctl_init(void)
2178 {
2179
2180         if (!racct_enable)
2181                 return;
2182
2183         rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2184             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2185         rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2186             sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2187             UMA_ALIGN_PTR, 0);
2188
2189         /*
2190          * Set default values, making sure not to overwrite the ones
2191          * fetched from tunables.  Most of those could be set at the
2192          * declaration, except for the rctl_throttle_max - we cannot
2193          * set it there due to hz not being compile time constant.
2194          */
2195         if (rctl_throttle_min < 1)
2196                 rctl_throttle_min = 1;
2197         if (rctl_throttle_max < rctl_throttle_min)
2198                 rctl_throttle_max = 2 * hz;
2199         if (rctl_throttle_pct < 0)
2200                 rctl_throttle_pct = 100;
2201         if (rctl_throttle_pct2 < 0)
2202                 rctl_throttle_pct2 = 100;
2203 }
2204
2205 #else /* !RCTL */
2206
/*
 * Stub compiled in the !RCTL branch: the system call is not available, so
 * the arguments are ignored and ENOSYS is returned.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{
        return (ENOSYS);
}
2213
/*
 * Stub compiled in the !RCTL branch: the system call is not available, so
 * the arguments are ignored and ENOSYS is returned.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{
        return (ENOSYS);
}
2220
/*
 * Stub compiled in the !RCTL branch: the system call is not available, so
 * the arguments are ignored and ENOSYS is returned.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{
        return (ENOSYS);
}
2227
/*
 * Stub compiled in the !RCTL branch: the system call is not available, so
 * the arguments are ignored and ENOSYS is returned.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{
        return (ENOSYS);
}
2234
/*
 * Stub compiled in the !RCTL branch: the system call is not available, so
 * the arguments are ignored and ENOSYS is returned.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{
        return (ENOSYS);
}
2241
2242 #endif /* !RCTL */