]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/kern/kern_rctl.c
Fix missing pfctl(8) tunable.
[FreeBSD/FreeBSD.git] / sys / kern / kern_rctl.c
1 /*-
2  * Copyright (c) 2010 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/malloc.h>
38 #include <sys/queue.h>
39 #include <sys/refcount.h>
40 #include <sys/jail.h>
41 #include <sys/kernel.h>
42 #include <sys/limits.h>
43 #include <sys/loginclass.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/racct.h>
47 #include <sys/rctl.h>
48 #include <sys/resourcevar.h>
49 #include <sys/sx.h>
50 #include <sys/sysent.h>
51 #include <sys/sysproto.h>
52 #include <sys/systm.h>
53 #include <sys/types.h>
54 #include <sys/eventhandler.h>
55 #include <sys/lock.h>
56 #include <sys/mutex.h>
57 #include <sys/rwlock.h>
58 #include <sys/sbuf.h>
59 #include <sys/taskqueue.h>
60 #include <sys/tree.h>
61 #include <vm/uma.h>
62
/*
 * The code that follows is compiled only when the kernel is configured with
 * the RCTL option; RCTL cannot be built without RACCT.  (The matching #endif
 * lies outside the part of the file shown here.)
 */
#ifdef RCTL
#ifndef RACCT
#error "The RCTL option requires the RACCT option"
#endif

/* Register the "rctl" kernel feature with its description. */
FEATURE(rctl, "Resource Limits");
69
/*
 * HRF_* flag values.  NOTE(review): they are not referenced in the visible
 * part of this file; confirm their users before changing them.
 */
#define HRF_DEFAULT             0
#define HRF_DONT_INHERIT        1
#define HRF_DONT_ACCUMULATE     2

/*
 * Buffer size limits, in bytes.  The products are parenthesized so that the
 * macros expand safely inside larger expressions (e.g. "x / RCTL_MAX_INBUFSIZE"
 * or "x % RCTL_MAX_OUTBUFSIZE").
 */
#define RCTL_MAX_INBUFSIZE      (4 * 1024)
#define RCTL_MAX_OUTBUFSIZE     (16 * 1024 * 1024)
#define RCTL_LOG_BUFSIZE        128

/* Margin subtracted from the available %CPU by rctl_pcpu_available(). */
#define RCTL_PCPU_SHIFT         (10 * 1000000)
79
/* Exported below as kern.racct.rctl.maxbufsize. */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
/* Per-second limits handed to ppsratecheck() by rctl_enforce(). */
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Throttling parameters, read by rctl_enforce() and changed through the
 * sysctl handlers below.
 * Values below are initialized in rctl_init().
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;
91
/* Handlers validating writes to the throttle_* sysctls; defined below. */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);

/*
 * The kern.racct.rctl sysctl subtree.
 *
 * NOTE(review): rctl_log_rate_limit and rctl_devctl_rate_limit are declared
 * as plain 'int' but exported through SYSCTL_UINT, and log_rate_limit is the
 * only knob here without the TUN flag -- confirm both are intentional.
 */
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
/*
 * The throttle_* knobs go through handler functions so that new values can
 * be range-checked before being stored; each also has a loader tunable of
 * the same name bound directly to the backing variable.
 */
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
120
/*
 * 'rctl_rule_link' connects a rule with every racct it's related to.
 * For example, rule 'user:X:openfiles:deny=N/process' is linked
 * with uidinfo for user X, and to each process of that user.
 *
 * Links hang off a racct's r_rule_links list.
 */
struct rctl_rule_link {
        /* Linkage in the owning racct's r_rule_links list. */
        LIST_ENTRY(rctl_rule_link)      rrl_next;
        /* The rule this link refers to. */
        struct rctl_rule                *rrl_rule;
        /*
         * Set to 1 by rctl_enforce() once the rule's log/devctl/signal
         * action has fired, and cleared when the request fits within the
         * limit again, so the action is not repeated on every check.
         */
        int                             rrl_exceeded;
};
131
/*
 * One entry of a name <-> value lookup table.  Tables built from this type
 * are terminated by an entry whose d_name is NULL.
 */
struct dict {
        const char      *d_name;        /* textual name */
        int             d_value;        /* corresponding numeric constant */
};
136
/* Names of the subject types; searched by rctl_subject_type_name(). */
static struct dict subjectnames[] = {
        { "process", RCTL_SUBJECT_TYPE_PROCESS },
        { "user", RCTL_SUBJECT_TYPE_USER },
        { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
        { "jail", RCTL_SUBJECT_TYPE_JAIL },
        { NULL, -1 }};  /* terminator */
143
/* Names of the RACCT_* resources; searched by rctl_resource_name(). */
static struct dict resourcenames[] = {
        { "cputime", RACCT_CPU },
        { "datasize", RACCT_DATA },
        { "stacksize", RACCT_STACK },
        { "coredumpsize", RACCT_CORE },
        { "memoryuse", RACCT_RSS },
        { "memorylocked", RACCT_MEMLOCK },
        { "maxproc", RACCT_NPROC },
        { "openfiles", RACCT_NOFILE },
        { "vmemoryuse", RACCT_VMEM },
        { "pseudoterminals", RACCT_NPTS },
        { "swapuse", RACCT_SWAP },
        { "nthr", RACCT_NTHR },
        { "msgqqueued", RACCT_MSGQQUEUED },
        { "msgqsize", RACCT_MSGQSIZE },
        { "nmsgq", RACCT_NMSGQ },
        { "nsem", RACCT_NSEM },
        { "nsemop", RACCT_NSEMOP },
        { "nshm", RACCT_NSHM },
        { "shmsize", RACCT_SHMSIZE },
        { "wallclock", RACCT_WALLCLOCK },
        { "pcpu", RACCT_PCTCPU },
        { "readbps", RACCT_READBPS },
        { "writebps", RACCT_WRITEBPS },
        { "readiops", RACCT_READIOPS },
        { "writeiops", RACCT_WRITEIOPS },
        { NULL, -1 }};  /* terminator */
171
/*
 * Names of the rule actions; searched by rctl_action_name().  The "sig*"
 * entries are the signal-delivering actions; the last four are the
 * non-signal actions handled explicitly in rctl_enforce().
 */
static struct dict actionnames[] = {
        { "sighup", RCTL_ACTION_SIGHUP },
        { "sigint", RCTL_ACTION_SIGINT },
        { "sigquit", RCTL_ACTION_SIGQUIT },
        { "sigill", RCTL_ACTION_SIGILL },
        { "sigtrap", RCTL_ACTION_SIGTRAP },
        { "sigabrt", RCTL_ACTION_SIGABRT },
        { "sigemt", RCTL_ACTION_SIGEMT },
        { "sigfpe", RCTL_ACTION_SIGFPE },
        { "sigkill", RCTL_ACTION_SIGKILL },
        { "sigbus", RCTL_ACTION_SIGBUS },
        { "sigsegv", RCTL_ACTION_SIGSEGV },
        { "sigsys", RCTL_ACTION_SIGSYS },
        { "sigpipe", RCTL_ACTION_SIGPIPE },
        { "sigalrm", RCTL_ACTION_SIGALRM },
        { "sigterm", RCTL_ACTION_SIGTERM },
        { "sigurg", RCTL_ACTION_SIGURG },
        { "sigstop", RCTL_ACTION_SIGSTOP },
        { "sigtstp", RCTL_ACTION_SIGTSTP },
        { "sigchld", RCTL_ACTION_SIGCHLD },
        { "sigttin", RCTL_ACTION_SIGTTIN },
        { "sigttou", RCTL_ACTION_SIGTTOU },
        { "sigio", RCTL_ACTION_SIGIO },
        { "sigxcpu", RCTL_ACTION_SIGXCPU },
        { "sigxfsz", RCTL_ACTION_SIGXFSZ },
        { "sigvtalrm", RCTL_ACTION_SIGVTALRM },
        { "sigprof", RCTL_ACTION_SIGPROF },
        { "sigwinch", RCTL_ACTION_SIGWINCH },
        { "siginfo", RCTL_ACTION_SIGINFO },
        { "sigusr1", RCTL_ACTION_SIGUSR1 },
        { "sigusr2", RCTL_ACTION_SIGUSR2 },
        { "sigthr", RCTL_ACTION_SIGTHR },
        { "deny", RCTL_ACTION_DENY },
        { "log", RCTL_ACTION_LOG },
        { "devctl", RCTL_ACTION_DEVCTL },
        { "throttle", RCTL_ACTION_THROTTLE },
        { NULL, -1 }};  /* terminator */
209
/* rctl_init() is registered to run at SI_SUB_RACCT, SI_ORDER_FIRST. */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/* UMA zones; presumably back struct rctl_rule and struct rctl_rule_link. */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;

/* Forward declarations for helpers defined later in the file. */
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* malloc(9) type used for temporary buffers, e.g. in rctl_enforce(). */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
220
221 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
222 {
223         int error, val = rctl_throttle_min;
224
225         error = sysctl_handle_int(oidp, &val, 0, req);
226         if (error || !req->newptr)
227                 return (error);
228         if (val < 1 || val > rctl_throttle_max)
229                 return (EINVAL);
230
231         RACCT_LOCK();
232         rctl_throttle_min = val;
233         RACCT_UNLOCK();
234
235         return (0);
236 }
237
238 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
239 {
240         int error, val = rctl_throttle_max;
241
242         error = sysctl_handle_int(oidp, &val, 0, req);
243         if (error || !req->newptr)
244                 return (error);
245         if (val < rctl_throttle_min)
246                 return (EINVAL);
247
248         RACCT_LOCK();
249         rctl_throttle_max = val;
250         RACCT_UNLOCK();
251
252         return (0);
253 }
254
255 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
256 {
257         int error, val = rctl_throttle_pct;
258
259         error = sysctl_handle_int(oidp, &val, 0, req);
260         if (error || !req->newptr)
261                 return (error);
262         if (val < 0)
263                 return (EINVAL);
264
265         RACCT_LOCK();
266         rctl_throttle_pct = val;
267         RACCT_UNLOCK();
268
269         return (0);
270 }
271
272 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
273 {
274         int error, val = rctl_throttle_pct2;
275
276         error = sysctl_handle_int(oidp, &val, 0, req);
277         if (error || !req->newptr)
278                 return (error);
279         if (val < 0)
280                 return (EINVAL);
281
282         RACCT_LOCK();
283         rctl_throttle_pct2 = val;
284         RACCT_UNLOCK();
285
286         return (0);
287 }
288
289 static const char *
290 rctl_subject_type_name(int subject)
291 {
292         int i;
293
294         for (i = 0; subjectnames[i].d_name != NULL; i++) {
295                 if (subjectnames[i].d_value == subject)
296                         return (subjectnames[i].d_name);
297         }
298
299         panic("rctl_subject_type_name: unknown subject type %d", subject);
300 }
301
302 static const char *
303 rctl_action_name(int action)
304 {
305         int i;
306
307         for (i = 0; actionnames[i].d_name != NULL; i++) {
308                 if (actionnames[i].d_value == action)
309                         return (actionnames[i].d_name);
310         }
311
312         panic("rctl_action_name: unknown action %d", action);
313 }
314
315 const char *
316 rctl_resource_name(int resource)
317 {
318         int i;
319
320         for (i = 0; resourcenames[i].d_name != NULL; i++) {
321                 if (resourcenames[i].d_value == resource)
322                         return (resourcenames[i].d_name);
323         }
324
325         panic("rctl_resource_name: unknown resource %d", resource);
326 }
327
328 static struct racct *
329 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
330 {
331         struct ucred *cred = p->p_ucred;
332
333         ASSERT_RACCT_ENABLED();
334         RACCT_LOCK_ASSERT();
335
336         switch (rule->rr_per) {
337         case RCTL_SUBJECT_TYPE_PROCESS:
338                 return (p->p_racct);
339         case RCTL_SUBJECT_TYPE_USER:
340                 return (cred->cr_ruidinfo->ui_racct);
341         case RCTL_SUBJECT_TYPE_LOGINCLASS:
342                 return (cred->cr_loginclass->lc_racct);
343         case RCTL_SUBJECT_TYPE_JAIL:
344                 return (cred->cr_prison->pr_prison_racct->prr_racct);
345         default:
346                 panic("%s: unknown per %d", __func__, rule->rr_per);
347         }
348 }
349
350 /*
351  * Return the amount of resource that can be allocated by 'p' before
352  * hitting 'rule'.
353  */
354 static int64_t
355 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
356 {
357         const struct racct *racct;
358         int64_t available;
359
360         ASSERT_RACCT_ENABLED();
361         RACCT_LOCK_ASSERT();
362
363         racct = rctl_proc_rule_to_racct(p, rule);
364         available = rule->rr_amount - racct->r_resources[rule->rr_resource];
365
366         return (available);
367 }
368
369 /*
370  * Called every second for proc, uidinfo, loginclass, and jail containers.
371  * If the limit isn't exceeded, it decreases the usage amount to zero.
372  * Otherwise, it decreases it by the value of the limit.  This way
373  * resource consumption exceeding the limit "carries over" to the next
374  * period.
375  */
376 void
377 rctl_throttle_decay(struct racct *racct, int resource)
378 {
379         struct rctl_rule *rule;
380         struct rctl_rule_link *link;
381         int64_t minavailable;
382
383         ASSERT_RACCT_ENABLED();
384         RACCT_LOCK_ASSERT();
385
386         minavailable = INT64_MAX;
387
388         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
389                 rule = link->rrl_rule;
390
391                 if (rule->rr_resource != resource)
392                         continue;
393                 if (rule->rr_action != RCTL_ACTION_THROTTLE)
394                         continue;
395
396                 if (rule->rr_amount < minavailable)
397                         minavailable = rule->rr_amount;
398         }
399
400         if (racct->r_resources[resource] < minavailable) {
401                 racct->r_resources[resource] = 0;
402         } else {
403                 /*
404                  * Cap utilization counter at ten times the limit.  Otherwise,
405                  * if we changed the rule lowering the allowed amount, it could
406                  * take unreasonably long time for the accumulated resource
407                  * usage to drop.
408                  */
409                 if (racct->r_resources[resource] > minavailable * 10)
410                         racct->r_resources[resource] = minavailable * 10;
411
412                 racct->r_resources[resource] -= minavailable;
413         }
414 }
415
416 /*
417  * Special version of rctl_get_available() for the %CPU resource.
418  * We slightly cheat here and return less than we normally would.
419  */
420 int64_t
421 rctl_pcpu_available(const struct proc *p) {
422         struct rctl_rule *rule;
423         struct rctl_rule_link *link;
424         int64_t available, minavailable, limit;
425
426         ASSERT_RACCT_ENABLED();
427         RACCT_LOCK_ASSERT();
428
429         minavailable = INT64_MAX;
430         limit = 0;
431
432         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
433                 rule = link->rrl_rule;
434                 if (rule->rr_resource != RACCT_PCTCPU)
435                         continue;
436                 if (rule->rr_action != RCTL_ACTION_DENY)
437                         continue;
438                 available = rctl_available_resource(p, rule);
439                 if (available < minavailable) {
440                         minavailable = available;
441                         limit = rule->rr_amount;
442                 }
443         }
444
445         /*
446          * Return slightly less than actual value of the available
447          * %cpu resource.  This makes %cpu throttling more aggressive
448          * and lets us act sooner than the limits are already exceeded.
449          */
450         if (limit != 0) {
451                 if (limit > 2 * RCTL_PCPU_SHIFT)
452                         minavailable -= RCTL_PCPU_SHIFT;
453                 else
454                         minavailable -= (limit / 2);
455         }
456
457         return (minavailable);
458 }
459
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX if the sum
 * does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{
        uint64_t sum = a + b;

        /* A wrapped sum is smaller than at least one of the operands. */
        return ((sum >= a && sum >= b) ? sum : UINT64_MAX);
}
475
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX if the
 * product does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

        /* With b == 0 the product is 0 and the division must be avoided. */
        if (b == 0 || a <= UINT64_MAX / b)
                return (a * b);

        return (UINT64_MAX);
}
485
486 /*
487  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
488  * to what it keeps allocated now.  Returns non-zero if the allocation should
489  * be denied, 0 otherwise.
490  */
491 int
492 rctl_enforce(struct proc *p, int resource, uint64_t amount)
493 {
494         static struct timeval log_lasttime, devctl_lasttime;
495         static int log_curtime = 0, devctl_curtime = 0;
496         struct rctl_rule *rule;
497         struct rctl_rule_link *link;
498         struct sbuf sb;
499         char *buf;
500         int64_t available;
501         uint64_t sleep_ms, sleep_ratio;
502         int should_deny = 0;
503
504         ASSERT_RACCT_ENABLED();
505         RACCT_LOCK_ASSERT();
506
507         /*
508          * There may be more than one matching rule; go through all of them.
509          * Denial should be done last, after logging and sending signals.
510          */
511         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
512                 rule = link->rrl_rule;
513                 if (rule->rr_resource != resource)
514                         continue;
515
516                 available = rctl_available_resource(p, rule);
517                 if (available >= (int64_t)amount) {
518                         link->rrl_exceeded = 0;
519                         continue;
520                 }
521
522                 switch (rule->rr_action) {
523                 case RCTL_ACTION_DENY:
524                         should_deny = 1;
525                         continue;
526                 case RCTL_ACTION_LOG:
527                         /*
528                          * If rrl_exceeded != 0, it means we've already
529                          * logged a warning for this process.
530                          */
531                         if (link->rrl_exceeded != 0)
532                                 continue;
533
534                         /*
535                          * If the process state is not fully initialized yet,
536                          * we can't access most of the required fields, e.g.
537                          * p->p_comm.  This happens when called from fork1().
538                          * Ignore this rule for now; it will be processed just
539                          * after fork, when called from racct_proc_fork_done().
540                          */
541                         if (p->p_state != PRS_NORMAL)
542                                 continue;
543
544                         if (!ppsratecheck(&log_lasttime, &log_curtime,
545                             rctl_log_rate_limit))
546                                 continue;
547
548                         buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
549                         if (buf == NULL) {
550                                 printf("rctl_enforce: out of memory\n");
551                                 continue;
552                         }
553                         sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
554                         rctl_rule_to_sbuf(&sb, rule);
555                         sbuf_finish(&sb);
556                         printf("rctl: rule \"%s\" matched by pid %d "
557                             "(%s), uid %d, jail %s\n", sbuf_data(&sb),
558                             p->p_pid, p->p_comm, p->p_ucred->cr_uid,
559                             p->p_ucred->cr_prison->pr_prison_racct->prr_name);
560                         sbuf_delete(&sb);
561                         free(buf, M_RCTL);
562                         link->rrl_exceeded = 1;
563                         continue;
564                 case RCTL_ACTION_DEVCTL:
565                         if (link->rrl_exceeded != 0)
566                                 continue;
567
568                         if (p->p_state != PRS_NORMAL)
569                                 continue;
570
571                         if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
572                             rctl_devctl_rate_limit))
573                                 continue;
574
575                         buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
576                         if (buf == NULL) {
577                                 printf("rctl_enforce: out of memory\n");
578                                 continue;
579                         }
580                         sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
581                         sbuf_printf(&sb, "rule=");
582                         rctl_rule_to_sbuf(&sb, rule);
583                         sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
584                             p->p_pid, p->p_ucred->cr_ruid,
585                             p->p_ucred->cr_prison->pr_prison_racct->prr_name);
586                         sbuf_finish(&sb);
587                         devctl_notify_f("RCTL", "rule", "matched",
588                             sbuf_data(&sb), M_NOWAIT);
589                         sbuf_delete(&sb);
590                         free(buf, M_RCTL);
591                         link->rrl_exceeded = 1;
592                         continue;
593                 case RCTL_ACTION_THROTTLE:
594                         if (p->p_state != PRS_NORMAL)
595                                 continue;
596
597                         /*
598                          * Make the process sleep for a fraction of second
599                          * proportional to the ratio of process' resource
600                          * utilization compared to the limit.  The point is
601                          * to penalize resource hogs: processes that consume
602                          * more of the available resources sleep for longer.
603                          *
604                          * We're trying to defer division until the very end,
605                          * to minimize the rounding effects.  The following
606                          * calculation could have been written in a clearer
607                          * way like this:
608                          *
609                          * sleep_ms = hz * p->p_racct->r_resources[resource] /
610                          *     rule->rr_amount;
611                          * sleep_ms *= rctl_throttle_pct / 100;
612                          * if (sleep_ms < rctl_throttle_min)
613                          *         sleep_ms = rctl_throttle_min;
614                          *
615                          */
616                         sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
617                         sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
618                         if (sleep_ms < rctl_throttle_min * rule->rr_amount)
619                                 sleep_ms = rctl_throttle_min * rule->rr_amount;
620
621                         /*
622                          * Multiply that by the ratio of the resource
623                          * consumption for the container compared to the limit,
624                          * squared.  In other words, a process in a container
625                          * that is two times over the limit will be throttled
626                          * four times as much for hitting the same rule.  The
627                          * point is to penalize processes more if the container
628                          * itself (eg certain UID or jail) is above the limit.
629                          */
630                         if (available < 0)
631                                 sleep_ratio = -available / rule->rr_amount;
632                         else
633                                 sleep_ratio = 0;
634                         sleep_ratio = xmul(sleep_ratio, sleep_ratio);
635                         sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
636                         sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
637
638                         /*
639                          * Finally the division.
640                          */
641                         sleep_ms /= rule->rr_amount;
642
643                         if (sleep_ms > rctl_throttle_max)
644                                 sleep_ms = rctl_throttle_max;
645 #if 0
646                         printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
647                            __func__, p->p_pid, p->p_comm,
648                            p->p_racct->r_resources[resource],
649                            rule->rr_amount, (uintmax_t)sleep_ms,
650                            (uintmax_t)sleep_ratio, (intmax_t)available);
651 #endif
652
653                         KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
654                             __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
655                         racct_proc_throttle(p, sleep_ms);
656                         continue;
657                 default:
658                         if (link->rrl_exceeded != 0)
659                                 continue;
660
661                         if (p->p_state != PRS_NORMAL)
662                                 continue;
663
664                         KASSERT(rule->rr_action > 0 &&
665                             rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
666                             ("rctl_enforce: unknown action %d",
667                              rule->rr_action));
668
669                         /*
670                          * We're using the fact that RCTL_ACTION_SIG* values
671                          * are equal to their counterparts from sys/signal.h.
672                          */
673                         kern_psignal(p, rule->rr_action);
674                         link->rrl_exceeded = 1;
675                         continue;
676                 }
677         }
678
679         if (should_deny) {
680                 /*
681                  * Return fake error code; the caller should change it
682                  * into one proper for the situation - EFSIZ, ENOMEM etc.
683                  */
684                 return (EDOOFUS);
685         }
686
687         return (0);
688 }
689
690 uint64_t
691 rctl_get_limit(struct proc *p, int resource)
692 {
693         struct rctl_rule *rule;
694         struct rctl_rule_link *link;
695         uint64_t amount = UINT64_MAX;
696
697         ASSERT_RACCT_ENABLED();
698         RACCT_LOCK_ASSERT();
699
700         /*
701          * There may be more than one matching rule; go through all of them.
702          * Denial should be done last, after logging and sending signals.
703          */
704         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
705                 rule = link->rrl_rule;
706                 if (rule->rr_resource != resource)
707                         continue;
708                 if (rule->rr_action != RCTL_ACTION_DENY)
709                         continue;
710                 if (rule->rr_amount < amount)
711                         amount = rule->rr_amount;
712         }
713
714         return (amount);
715 }
716
717 uint64_t
718 rctl_get_available(struct proc *p, int resource)
719 {
720         struct rctl_rule *rule;
721         struct rctl_rule_link *link;
722         int64_t available, minavailable, allocated;
723
724         minavailable = INT64_MAX;
725
726         ASSERT_RACCT_ENABLED();
727         RACCT_LOCK_ASSERT();
728
729         /*
730          * There may be more than one matching rule; go through all of them.
731          * Denial should be done last, after logging and sending signals.
732          */
733         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
734                 rule = link->rrl_rule;
735                 if (rule->rr_resource != resource)
736                         continue;
737                 if (rule->rr_action != RCTL_ACTION_DENY)
738                         continue;
739                 available = rctl_available_resource(p, rule);
740                 if (available < minavailable)
741                         minavailable = available;
742         }
743
744         /*
745          * XXX: Think about this _hard_.
746          */
747         allocated = p->p_racct->r_resources[resource];
748         if (minavailable < INT64_MAX - allocated)
749                 minavailable += allocated;
750         if (minavailable < 0)
751                 minavailable = 0;
752
753         return (minavailable);
754 }
755
756 static int
757 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
758 {
759
760         ASSERT_RACCT_ENABLED();
761
762         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
763                 if (rule->rr_subject_type != filter->rr_subject_type)
764                         return (0);
765
766                 switch (filter->rr_subject_type) {
767                 case RCTL_SUBJECT_TYPE_PROCESS:
768                         if (filter->rr_subject.rs_proc != NULL &&
769                             rule->rr_subject.rs_proc !=
770                             filter->rr_subject.rs_proc)
771                                 return (0);
772                         break;
773                 case RCTL_SUBJECT_TYPE_USER:
774                         if (filter->rr_subject.rs_uip != NULL &&
775                             rule->rr_subject.rs_uip !=
776                             filter->rr_subject.rs_uip)
777                                 return (0);
778                         break;
779                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
780                         if (filter->rr_subject.rs_loginclass != NULL &&
781                             rule->rr_subject.rs_loginclass !=
782                             filter->rr_subject.rs_loginclass)
783                                 return (0);
784                         break;
785                 case RCTL_SUBJECT_TYPE_JAIL:
786                         if (filter->rr_subject.rs_prison_racct != NULL &&
787                             rule->rr_subject.rs_prison_racct !=
788                             filter->rr_subject.rs_prison_racct)
789                                 return (0);
790                         break;
791                 default:
792                         panic("rctl_rule_matches: unknown subject type %d",
793                             filter->rr_subject_type);
794                 }
795         }
796
797         if (filter->rr_resource != RACCT_UNDEFINED) {
798                 if (rule->rr_resource != filter->rr_resource)
799                         return (0);
800         }
801
802         if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
803                 if (rule->rr_action != filter->rr_action)
804                         return (0);
805         }
806
807         if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
808                 if (rule->rr_amount != filter->rr_amount)
809                         return (0);
810         }
811
812         if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
813                 if (rule->rr_per != filter->rr_per)
814                         return (0);
815         }
816
817         return (1);
818 }
819
820 static int
821 str2value(const char *str, int *value, struct dict *table)
822 {
823         int i;
824
825         if (value == NULL)
826                 return (EINVAL);
827
828         for (i = 0; table[i].d_name != NULL; i++) {
829                 if (strcasecmp(table[i].d_name, str) == 0) {
830                         *value =  table[i].d_value;
831                         return (0);
832                 }
833         }
834
835         return (EINVAL);
836 }
837
/*
 * Parse a decimal identifier with strtoul() and store it in '*value'.
 *
 * Returns EINVAL when 'str' is NULL or when anything follows the parsed
 * number; returns 0 otherwise.  '*value' is written whenever 'str' is
 * not NULL, even if EINVAL is returned.
 */
static int
str2id(const char *str, id_t *value)
{
	char *endp;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &endp, 10);

	/* Accept only if parsing stopped exactly at the terminating NUL. */
	return (*endp == '\0' ? 0 : EINVAL);
}
852
/*
 * Parse a non-negative decimal number into '*value'.
 *
 * The accepted syntax mirrors what strtoul() accepted here before:
 * optional leading whitespace, an optional '+' or '-' sign, then decimal
 * digits up to the terminating NUL.  The digits are accumulated with an
 * explicit overflow check instead of going through 'unsigned long', so:
 *  - the result does not depend on the width of 'unsigned long';
 *  - a negative input can no longer wrap around into a positive amount
 *    (strtoul() negates the magnitude, so "-18446744073709551615"
 *    used to come back as 1).
 *
 * Returns 0 on success, EINVAL if 'str' is NULL, contains no digits
 * (this includes the empty string) or has trailing characters, and
 * ERANGE if the number is negative or larger than INT64_MAX.  '*value'
 * is written only on success.
 */
static int
str2int64(const char *str, int64_t *value)
{
	const char *p;
	int64_t result;
	int digit, negative, overflow, seen_digit;

	if (str == NULL)
		return (EINVAL);

	p = str;

	/* Skip leading whitespace (space, \t, \n, \v, \f, \r). */
	while (*p == ' ' || (*p >= '\t' && *p <= '\r'))
		p++;

	negative = 0;
	if (*p == '+' || *p == '-') {
		negative = (*p == '-');
		p++;
	}

	result = 0;
	overflow = 0;
	seen_digit = 0;
	for (; *p >= '0' && *p <= '9'; p++) {
		seen_digit = 1;
		if (overflow)
			continue;
		digit = *p - '0';
		/* Would result * 10 + digit exceed INT64_MAX? */
		if (result > (INT64_MAX - digit) / 10)
			overflow = 1;
		else
			result = result * 10 + digit;
	}

	/* Malformed input takes precedence over range errors. */
	if (!seen_digit || *p != '\0')
		return (EINVAL);

	/* "-0" is still zero; any other negative number is out of range. */
	if (overflow || (negative && result != 0))
		return (ERANGE);

	*value = result;
	return (0);
}
870
871 /*
872  * Connect the rule to the racct, increasing refcount for the rule.
873  */
874 static void
875 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
876 {
877         struct rctl_rule_link *link;
878
879         ASSERT_RACCT_ENABLED();
880         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
881
882         rctl_rule_acquire(rule);
883         link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
884         link->rrl_rule = rule;
885         link->rrl_exceeded = 0;
886
887         RACCT_LOCK();
888         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
889         RACCT_UNLOCK();
890 }
891
892 static int
893 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
894 {
895         struct rctl_rule_link *link;
896
897         ASSERT_RACCT_ENABLED();
898         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
899         RACCT_LOCK_ASSERT();
900
901         link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
902         if (link == NULL)
903                 return (ENOMEM);
904         rctl_rule_acquire(rule);
905         link->rrl_rule = rule;
906         link->rrl_exceeded = 0;
907
908         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
909
910         return (0);
911 }
912
913 /*
914  * Remove limits for a rules matching the filter and release
915  * the refcounts for the rules, possibly freeing them.  Returns
916  * the number of limit structures removed.
917  */
918 static int
919 rctl_racct_remove_rules(struct racct *racct,
920     const struct rctl_rule *filter)
921 {
922         struct rctl_rule_link *link, *linktmp;
923         int removed = 0;
924
925         ASSERT_RACCT_ENABLED();
926         RACCT_LOCK_ASSERT();
927
928         LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
929                 if (!rctl_rule_matches(link->rrl_rule, filter))
930                         continue;
931
932                 LIST_REMOVE(link, rrl_next);
933                 rctl_rule_release(link->rrl_rule);
934                 uma_zfree(rctl_rule_link_zone, link);
935                 removed++;
936         }
937         return (removed);
938 }
939
940 static void
941 rctl_rule_acquire_subject(struct rctl_rule *rule)
942 {
943
944         ASSERT_RACCT_ENABLED();
945
946         switch (rule->rr_subject_type) {
947         case RCTL_SUBJECT_TYPE_UNDEFINED:
948         case RCTL_SUBJECT_TYPE_PROCESS:
949                 break;
950         case RCTL_SUBJECT_TYPE_JAIL:
951                 if (rule->rr_subject.rs_prison_racct != NULL)
952                         prison_racct_hold(rule->rr_subject.rs_prison_racct);
953                 break;
954         case RCTL_SUBJECT_TYPE_USER:
955                 if (rule->rr_subject.rs_uip != NULL)
956                         uihold(rule->rr_subject.rs_uip);
957                 break;
958         case RCTL_SUBJECT_TYPE_LOGINCLASS:
959                 if (rule->rr_subject.rs_loginclass != NULL)
960                         loginclass_hold(rule->rr_subject.rs_loginclass);
961                 break;
962         default:
963                 panic("rctl_rule_acquire_subject: unknown subject type %d",
964                     rule->rr_subject_type);
965         }
966 }
967
968 static void
969 rctl_rule_release_subject(struct rctl_rule *rule)
970 {
971
972         ASSERT_RACCT_ENABLED();
973
974         switch (rule->rr_subject_type) {
975         case RCTL_SUBJECT_TYPE_UNDEFINED:
976         case RCTL_SUBJECT_TYPE_PROCESS:
977                 break;
978         case RCTL_SUBJECT_TYPE_JAIL:
979                 if (rule->rr_subject.rs_prison_racct != NULL)
980                         prison_racct_free(rule->rr_subject.rs_prison_racct);
981                 break;
982         case RCTL_SUBJECT_TYPE_USER:
983                 if (rule->rr_subject.rs_uip != NULL)
984                         uifree(rule->rr_subject.rs_uip);
985                 break;
986         case RCTL_SUBJECT_TYPE_LOGINCLASS:
987                 if (rule->rr_subject.rs_loginclass != NULL)
988                         loginclass_free(rule->rr_subject.rs_loginclass);
989                 break;
990         default:
991                 panic("rctl_rule_release_subject: unknown subject type %d",
992                     rule->rr_subject_type);
993         }
994 }
995
996 struct rctl_rule *
997 rctl_rule_alloc(int flags)
998 {
999         struct rctl_rule *rule;
1000
1001         ASSERT_RACCT_ENABLED();
1002
1003         rule = uma_zalloc(rctl_rule_zone, flags);
1004         if (rule == NULL)
1005                 return (NULL);
1006         rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1007         rule->rr_subject.rs_proc = NULL;
1008         rule->rr_subject.rs_uip = NULL;
1009         rule->rr_subject.rs_loginclass = NULL;
1010         rule->rr_subject.rs_prison_racct = NULL;
1011         rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1012         rule->rr_resource = RACCT_UNDEFINED;
1013         rule->rr_action = RCTL_ACTION_UNDEFINED;
1014         rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1015         refcount_init(&rule->rr_refcount, 1);
1016
1017         return (rule);
1018 }
1019
1020 struct rctl_rule *
1021 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1022 {
1023         struct rctl_rule *copy;
1024
1025         ASSERT_RACCT_ENABLED();
1026
1027         copy = uma_zalloc(rctl_rule_zone, flags);
1028         if (copy == NULL)
1029                 return (NULL);
1030         copy->rr_subject_type = rule->rr_subject_type;
1031         copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1032         copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1033         copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1034         copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1035         copy->rr_per = rule->rr_per;
1036         copy->rr_resource = rule->rr_resource;
1037         copy->rr_action = rule->rr_action;
1038         copy->rr_amount = rule->rr_amount;
1039         refcount_init(&copy->rr_refcount, 1);
1040         rctl_rule_acquire_subject(copy);
1041
1042         return (copy);
1043 }
1044
1045 void
1046 rctl_rule_acquire(struct rctl_rule *rule)
1047 {
1048
1049         ASSERT_RACCT_ENABLED();
1050         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1051
1052         refcount_acquire(&rule->rr_refcount);
1053 }
1054
1055 static void
1056 rctl_rule_free(void *context, int pending)
1057 {
1058         struct rctl_rule *rule;
1059         
1060         rule = (struct rctl_rule *)context;
1061
1062         ASSERT_RACCT_ENABLED();
1063         KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1064         
1065         /*
1066          * We don't need locking here; rule is guaranteed to be inaccessible.
1067          */
1068         
1069         rctl_rule_release_subject(rule);
1070         uma_zfree(rctl_rule_zone, rule);
1071 }
1072
1073 void
1074 rctl_rule_release(struct rctl_rule *rule)
1075 {
1076
1077         ASSERT_RACCT_ENABLED();
1078         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1079
1080         if (refcount_release(&rule->rr_refcount)) {
1081                 /*
1082                  * rctl_rule_release() is often called when iterating
1083                  * over all the uidinfo structures in the system,
1084                  * holding uihashtbl_lock.  Since rctl_rule_free()
1085                  * might end up calling uifree(), this would lead
1086                  * to lock recursion.  Use taskqueue to avoid this.
1087                  */
1088                 TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1089                 taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1090         }
1091 }
1092
1093 static int
1094 rctl_rule_fully_specified(const struct rctl_rule *rule)
1095 {
1096
1097         ASSERT_RACCT_ENABLED();
1098
1099         switch (rule->rr_subject_type) {
1100         case RCTL_SUBJECT_TYPE_UNDEFINED:
1101                 return (0);
1102         case RCTL_SUBJECT_TYPE_PROCESS:
1103                 if (rule->rr_subject.rs_proc == NULL)
1104                         return (0);
1105                 break;
1106         case RCTL_SUBJECT_TYPE_USER:
1107                 if (rule->rr_subject.rs_uip == NULL)
1108                         return (0);
1109                 break;
1110         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1111                 if (rule->rr_subject.rs_loginclass == NULL)
1112                         return (0);
1113                 break;
1114         case RCTL_SUBJECT_TYPE_JAIL:
1115                 if (rule->rr_subject.rs_prison_racct == NULL)
1116                         return (0);
1117                 break;
1118         default:
1119                 panic("rctl_rule_fully_specified: unknown subject type %d",
1120                     rule->rr_subject_type);
1121         }
1122         if (rule->rr_resource == RACCT_UNDEFINED)
1123                 return (0);
1124         if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1125                 return (0);
1126         if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1127                 return (0);
1128         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1129                 return (0);
1130
1131         return (1);
1132 }
1133
1134 static int
1135 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1136 {
1137         struct rctl_rule *rule;
1138         char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1139              *amountstr, *perstr;
1140         id_t id;
1141         int error = 0;
1142
1143         ASSERT_RACCT_ENABLED();
1144
1145         rule = rctl_rule_alloc(M_WAITOK);
1146
1147         subjectstr = strsep(&rulestr, ":");
1148         subject_idstr = strsep(&rulestr, ":");
1149         resourcestr = strsep(&rulestr, ":");
1150         actionstr = strsep(&rulestr, "=/");
1151         amountstr = strsep(&rulestr, "/");
1152         perstr = rulestr;
1153
1154         if (subjectstr == NULL || subjectstr[0] == '\0')
1155                 rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1156         else {
1157                 error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1158                 if (error != 0)
1159                         goto out;
1160         }
1161
1162         if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1163                 rule->rr_subject.rs_proc = NULL;
1164                 rule->rr_subject.rs_uip = NULL;
1165                 rule->rr_subject.rs_loginclass = NULL;
1166                 rule->rr_subject.rs_prison_racct = NULL;
1167         } else {
1168                 switch (rule->rr_subject_type) {
1169                 case RCTL_SUBJECT_TYPE_UNDEFINED:
1170                         error = EINVAL;
1171                         goto out;
1172                 case RCTL_SUBJECT_TYPE_PROCESS:
1173                         error = str2id(subject_idstr, &id);
1174                         if (error != 0)
1175                                 goto out;
1176                         sx_assert(&allproc_lock, SA_LOCKED);
1177                         rule->rr_subject.rs_proc = pfind(id);
1178                         if (rule->rr_subject.rs_proc == NULL) {
1179                                 error = ESRCH;
1180                                 goto out;
1181                         }
1182                         PROC_UNLOCK(rule->rr_subject.rs_proc);
1183                         break;
1184                 case RCTL_SUBJECT_TYPE_USER:
1185                         error = str2id(subject_idstr, &id);
1186                         if (error != 0)
1187                                 goto out;
1188                         rule->rr_subject.rs_uip = uifind(id);
1189                         break;
1190                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1191                         rule->rr_subject.rs_loginclass =
1192                             loginclass_find(subject_idstr);
1193                         if (rule->rr_subject.rs_loginclass == NULL) {
1194                                 error = ENAMETOOLONG;
1195                                 goto out;
1196                         }
1197                         break;
1198                 case RCTL_SUBJECT_TYPE_JAIL:
1199                         rule->rr_subject.rs_prison_racct =
1200                             prison_racct_find(subject_idstr);
1201                         if (rule->rr_subject.rs_prison_racct == NULL) {
1202                                 error = ENAMETOOLONG;
1203                                 goto out;
1204                         }
1205                         break;
1206                default:
1207                        panic("rctl_string_to_rule: unknown subject type %d",
1208                            rule->rr_subject_type);
1209                }
1210         }
1211
1212         if (resourcestr == NULL || resourcestr[0] == '\0')
1213                 rule->rr_resource = RACCT_UNDEFINED;
1214         else {
1215                 error = str2value(resourcestr, &rule->rr_resource,
1216                     resourcenames);
1217                 if (error != 0)
1218                         goto out;
1219         }
1220
1221         if (actionstr == NULL || actionstr[0] == '\0')
1222                 rule->rr_action = RCTL_ACTION_UNDEFINED;
1223         else {
1224                 error = str2value(actionstr, &rule->rr_action, actionnames);
1225                 if (error != 0)
1226                         goto out;
1227         }
1228
1229         if (amountstr == NULL || amountstr[0] == '\0')
1230                 rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1231         else {
1232                 error = str2int64(amountstr, &rule->rr_amount);
1233                 if (error != 0)
1234                         goto out;
1235                 if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1236                         if (rule->rr_amount > INT64_MAX / 1000000) {
1237                                 error = ERANGE;
1238                                 goto out;
1239                         }
1240                         rule->rr_amount *= 1000000;
1241                 }
1242         }
1243
1244         if (perstr == NULL || perstr[0] == '\0')
1245                 rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1246         else {
1247                 error = str2value(perstr, &rule->rr_per, subjectnames);
1248                 if (error != 0)
1249                         goto out;
1250         }
1251
1252 out:
1253         if (error == 0)
1254                 *rulep = rule;
1255         else
1256                 rctl_rule_release(rule);
1257
1258         return (error);
1259 }
1260
1261 /*
1262  * Link a rule with all the subjects it applies to.
1263  */
1264 int
1265 rctl_rule_add(struct rctl_rule *rule)
1266 {
1267         struct proc *p;
1268         struct ucred *cred;
1269         struct uidinfo *uip;
1270         struct prison *pr;
1271         struct prison_racct *prr;
1272         struct loginclass *lc;
1273         struct rctl_rule *rule2;
1274         int match;
1275
1276         ASSERT_RACCT_ENABLED();
1277         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1278
1279         /*
1280          * Some rules just don't make sense, like "deny" rule for an undeniable
1281          * resource.  The exception are the RSS and %CPU resources - they are
1282          * not deniable in the racct sense, but the limit is enforced in
1283          * a different way.
1284          */
1285         if (rule->rr_action == RCTL_ACTION_DENY &&
1286             !RACCT_IS_DENIABLE(rule->rr_resource) &&
1287             rule->rr_resource != RACCT_RSS &&
1288             rule->rr_resource != RACCT_PCTCPU) {
1289                 return (EOPNOTSUPP);
1290         }
1291
1292         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1293             !RACCT_IS_DECAYING(rule->rr_resource)) {
1294                 return (EOPNOTSUPP);
1295         }
1296
1297         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1298             rule->rr_resource == RACCT_PCTCPU) {
1299                 return (EOPNOTSUPP);
1300         }
1301
1302         if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1303             RACCT_IS_SLOPPY(rule->rr_resource)) {
1304                 return (EOPNOTSUPP);
1305         }
1306
1307         /*
1308          * Make sure there are no duplicated rules.  Also, for the "deny"
1309          * rules, remove ones differing only by "amount".
1310          */
1311         if (rule->rr_action == RCTL_ACTION_DENY) {
1312                 rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1313                 rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1314                 rctl_rule_remove(rule2);
1315                 rctl_rule_release(rule2);
1316         } else
1317                 rctl_rule_remove(rule);
1318
1319         switch (rule->rr_subject_type) {
1320         case RCTL_SUBJECT_TYPE_PROCESS:
1321                 p = rule->rr_subject.rs_proc;
1322                 KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1323
1324                 rctl_racct_add_rule(p->p_racct, rule);
1325                 /*
1326                  * In case of per-process rule, we don't have anything more
1327                  * to do.
1328                  */
1329                 return (0);
1330
1331         case RCTL_SUBJECT_TYPE_USER:
1332                 uip = rule->rr_subject.rs_uip;
1333                 KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1334                 rctl_racct_add_rule(uip->ui_racct, rule);
1335                 break;
1336
1337         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1338                 lc = rule->rr_subject.rs_loginclass;
1339                 KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1340                 rctl_racct_add_rule(lc->lc_racct, rule);
1341                 break;
1342
1343         case RCTL_SUBJECT_TYPE_JAIL:
1344                 prr = rule->rr_subject.rs_prison_racct;
1345                 KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1346                 rctl_racct_add_rule(prr->prr_racct, rule);
1347                 break;
1348
1349         default:
1350                 panic("rctl_rule_add: unknown subject type %d",
1351                     rule->rr_subject_type);
1352         }
1353
1354         /*
1355          * Now go through all the processes and add the new rule to the ones
1356          * it applies to.
1357          */
1358         sx_assert(&allproc_lock, SA_LOCKED);
1359         FOREACH_PROC_IN_SYSTEM(p) {
1360                 cred = p->p_ucred;
1361                 switch (rule->rr_subject_type) {
1362                 case RCTL_SUBJECT_TYPE_USER:
1363                         if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1364                             cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1365                                 break;
1366                         continue;
1367                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1368                         if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1369                                 break;
1370                         continue;
1371                 case RCTL_SUBJECT_TYPE_JAIL:
1372                         match = 0;
1373                         for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1374                                 if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1375                                         match = 1;
1376                                         break;
1377                                 }
1378                         }
1379                         if (match)
1380                                 break;
1381                         continue;
1382                 default:
1383                         panic("rctl_rule_add: unknown subject type %d",
1384                             rule->rr_subject_type);
1385                 }
1386
1387                 rctl_racct_add_rule(p->p_racct, rule);
1388         }
1389
1390         return (0);
1391 }
1392
/*
 * "Pre" hook handed to the *_racct_foreach() iterators: takes the
 * racct lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RACCT_LOCK();
}
1399
/*
 * "Post" hook handed to the *_racct_foreach() iterators: drops the
 * racct lock taken by rctl_rule_pre_callback().
 */
static void
rctl_rule_post_callback(void)
{

	RACCT_UNLOCK();
}
1406
/*
 * Per-racct callback used by rctl_rule_remove(): removes from 'racct'
 * all rules matching the filter passed as 'arg2' and adds the number of
 * removed rules to the int counter 'arg3' points at.  The racct lock
 * must be held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *totalp = arg3;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	*totalp += rctl_racct_remove_rules(racct, filter);
}
1420
1421 /*
1422  * Remove all rules that match the filter.
1423  */
1424 int
1425 rctl_rule_remove(struct rctl_rule *filter)
1426 {
1427         struct proc *p;
1428         int found = 0;
1429
1430         ASSERT_RACCT_ENABLED();
1431
1432         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1433             filter->rr_subject.rs_proc != NULL) {
1434                 p = filter->rr_subject.rs_proc;
1435                 RACCT_LOCK();
1436                 found = rctl_racct_remove_rules(p->p_racct, filter);
1437                 RACCT_UNLOCK();
1438                 if (found)
1439                         return (0);
1440                 return (ESRCH);
1441         }
1442
1443         loginclass_racct_foreach(rctl_rule_remove_callback,
1444             rctl_rule_pre_callback, rctl_rule_post_callback,
1445             filter, (void *)&found);
1446         ui_racct_foreach(rctl_rule_remove_callback,
1447             rctl_rule_pre_callback, rctl_rule_post_callback,
1448             filter, (void *)&found);
1449         prison_racct_foreach(rctl_rule_remove_callback,
1450             rctl_rule_pre_callback, rctl_rule_post_callback,
1451             filter, (void *)&found);
1452
1453         sx_assert(&allproc_lock, SA_LOCKED);
1454         RACCT_LOCK();
1455         FOREACH_PROC_IN_SYSTEM(p) {
1456                 found += rctl_racct_remove_rules(p->p_racct, filter);
1457         }
1458         RACCT_UNLOCK();
1459
1460         if (found)
1461                 return (0);
1462         return (ESRCH);
1463 }
1464
1465 /*
1466  * Appends a rule to the sbuf.
1467  */
1468 static void
1469 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1470 {
1471         int64_t amount;
1472
1473         ASSERT_RACCT_ENABLED();
1474
1475         sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1476
1477         switch (rule->rr_subject_type) {
1478         case RCTL_SUBJECT_TYPE_PROCESS:
1479                 if (rule->rr_subject.rs_proc == NULL)
1480                         sbuf_printf(sb, ":");
1481                 else
1482                         sbuf_printf(sb, "%d:",
1483                             rule->rr_subject.rs_proc->p_pid);
1484                 break;
1485         case RCTL_SUBJECT_TYPE_USER:
1486                 if (rule->rr_subject.rs_uip == NULL)
1487                         sbuf_printf(sb, ":");
1488                 else
1489                         sbuf_printf(sb, "%d:",
1490                             rule->rr_subject.rs_uip->ui_uid);
1491                 break;
1492         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1493                 if (rule->rr_subject.rs_loginclass == NULL)
1494                         sbuf_printf(sb, ":");
1495                 else
1496                         sbuf_printf(sb, "%s:",
1497                             rule->rr_subject.rs_loginclass->lc_name);
1498                 break;
1499         case RCTL_SUBJECT_TYPE_JAIL:
1500                 if (rule->rr_subject.rs_prison_racct == NULL)
1501                         sbuf_printf(sb, ":");
1502                 else
1503                         sbuf_printf(sb, "%s:",
1504                             rule->rr_subject.rs_prison_racct->prr_name);
1505                 break;
1506         default:
1507                 panic("rctl_rule_to_sbuf: unknown subject type %d",
1508                     rule->rr_subject_type);
1509         }
1510
1511         amount = rule->rr_amount;
1512         if (amount != RCTL_AMOUNT_UNDEFINED &&
1513             RACCT_IS_IN_MILLIONS(rule->rr_resource))
1514                 amount /= 1000000;
1515
1516         sbuf_printf(sb, "%s:%s=%jd",
1517             rctl_resource_name(rule->rr_resource),
1518             rctl_action_name(rule->rr_action),
1519             amount);
1520
1521         if (rule->rr_per != rule->rr_subject_type)
1522                 sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1523 }
1524
1525 /*
1526  * Routine used by RCTL syscalls to read in input string.
1527  */
1528 static int
1529 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1530 {
1531         char *str;
1532         int error;
1533
1534         ASSERT_RACCT_ENABLED();
1535
1536         if (inbuflen <= 0)
1537                 return (EINVAL);
1538         if (inbuflen > RCTL_MAX_INBUFSIZE)
1539                 return (E2BIG);
1540
1541         str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1542         error = copyinstr(inbufp, str, inbuflen, NULL);
1543         if (error != 0) {
1544                 free(str, M_RCTL);
1545                 return (error);
1546         }
1547
1548         *inputstr = str;
1549
1550         return (0);
1551 }
1552
1553 /*
1554  * Routine used by RCTL syscalls to write out output string.
1555  */
1556 static int
1557 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1558 {
1559         int error;
1560
1561         ASSERT_RACCT_ENABLED();
1562
1563         if (outputsbuf == NULL)
1564                 return (0);
1565
1566         sbuf_finish(outputsbuf);
1567         if (outbuflen < sbuf_len(outputsbuf) + 1) {
1568                 sbuf_delete(outputsbuf);
1569                 return (ERANGE);
1570         }
1571         error = copyout(sbuf_data(outputsbuf), outbufp,
1572             sbuf_len(outputsbuf) + 1);
1573         sbuf_delete(outputsbuf);
1574         return (error);
1575 }
1576
1577 static struct sbuf *
1578 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1579 {
1580         struct sbuf *sb;
1581         int64_t amount;
1582         int i;
1583
1584         ASSERT_RACCT_ENABLED();
1585
1586         sb = sbuf_new_auto();
1587         for (i = 0; i <= RACCT_MAX; i++) {
1588                 if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1589                         continue;
1590                 RACCT_LOCK();
1591                 amount = racct->r_resources[i];
1592                 RACCT_UNLOCK();
1593                 if (RACCT_IS_IN_MILLIONS(i))
1594                         amount /= 1000000;
1595                 sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1596         }
1597         sbuf_setpos(sb, sbuf_len(sb) - 1);
1598         return (sb);
1599 }
1600
1601 int
1602 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1603 {
1604         struct rctl_rule *filter;
1605         struct sbuf *outputsbuf = NULL;
1606         struct proc *p;
1607         struct uidinfo *uip;
1608         struct loginclass *lc;
1609         struct prison_racct *prr;
1610         char *inputstr;
1611         int error;
1612
1613         if (!racct_enable)
1614                 return (ENOSYS);
1615
1616         error = priv_check(td, PRIV_RCTL_GET_RACCT);
1617         if (error != 0)
1618                 return (error);
1619
1620         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1621         if (error != 0)
1622                 return (error);
1623
1624         sx_slock(&allproc_lock);
1625         error = rctl_string_to_rule(inputstr, &filter);
1626         free(inputstr, M_RCTL);
1627         if (error != 0) {
1628                 sx_sunlock(&allproc_lock);
1629                 return (error);
1630         }
1631
1632         switch (filter->rr_subject_type) {
1633         case RCTL_SUBJECT_TYPE_PROCESS:
1634                 p = filter->rr_subject.rs_proc;
1635                 if (p == NULL) {
1636                         error = EINVAL;
1637                         goto out;
1638                 }
1639                 outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1640                 break;
1641         case RCTL_SUBJECT_TYPE_USER:
1642                 uip = filter->rr_subject.rs_uip;
1643                 if (uip == NULL) {
1644                         error = EINVAL;
1645                         goto out;
1646                 }
1647                 outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1648                 break;
1649         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1650                 lc = filter->rr_subject.rs_loginclass;
1651                 if (lc == NULL) {
1652                         error = EINVAL;
1653                         goto out;
1654                 }
1655                 outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1656                 break;
1657         case RCTL_SUBJECT_TYPE_JAIL:
1658                 prr = filter->rr_subject.rs_prison_racct;
1659                 if (prr == NULL) {
1660                         error = EINVAL;
1661                         goto out;
1662                 }
1663                 outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1664                 break;
1665         default:
1666                 error = EINVAL;
1667         }
1668 out:
1669         rctl_rule_release(filter);
1670         sx_sunlock(&allproc_lock);
1671         if (error != 0)
1672                 return (error);
1673
1674         error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1675
1676         return (error);
1677 }
1678
1679 static void
1680 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1681 {
1682         struct rctl_rule *filter = (struct rctl_rule *)arg2;
1683         struct rctl_rule_link *link;
1684         struct sbuf *sb = (struct sbuf *)arg3;
1685
1686         ASSERT_RACCT_ENABLED();
1687         RACCT_LOCK_ASSERT();
1688
1689         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1690                 if (!rctl_rule_matches(link->rrl_rule, filter))
1691                         continue;
1692                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1693                 sbuf_printf(sb, ",");
1694         }
1695 }
1696
1697 int
1698 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1699 {
1700         struct sbuf *sb;
1701         struct rctl_rule *filter;
1702         struct rctl_rule_link *link;
1703         struct proc *p;
1704         char *inputstr, *buf;
1705         size_t bufsize;
1706         int error;
1707
1708         if (!racct_enable)
1709                 return (ENOSYS);
1710
1711         error = priv_check(td, PRIV_RCTL_GET_RULES);
1712         if (error != 0)
1713                 return (error);
1714
1715         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1716         if (error != 0)
1717                 return (error);
1718
1719         sx_slock(&allproc_lock);
1720         error = rctl_string_to_rule(inputstr, &filter);
1721         free(inputstr, M_RCTL);
1722         if (error != 0) {
1723                 sx_sunlock(&allproc_lock);
1724                 return (error);
1725         }
1726
1727         bufsize = uap->outbuflen;
1728         if (bufsize > rctl_maxbufsize) {
1729                 sx_sunlock(&allproc_lock);
1730                 return (E2BIG);
1731         }
1732
1733         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1734         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1735         KASSERT(sb != NULL, ("sbuf_new failed"));
1736
1737         FOREACH_PROC_IN_SYSTEM(p) {
1738                 RACCT_LOCK();
1739                 LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1740                         /*
1741                          * Non-process rules will be added to the buffer later.
1742                          * Adding them here would result in duplicated output.
1743                          */
1744                         if (link->rrl_rule->rr_subject_type !=
1745                             RCTL_SUBJECT_TYPE_PROCESS)
1746                                 continue;
1747                         if (!rctl_rule_matches(link->rrl_rule, filter))
1748                                 continue;
1749                         rctl_rule_to_sbuf(sb, link->rrl_rule);
1750                         sbuf_printf(sb, ",");
1751                 }
1752                 RACCT_UNLOCK();
1753         }
1754
1755         loginclass_racct_foreach(rctl_get_rules_callback,
1756             rctl_rule_pre_callback, rctl_rule_post_callback,
1757             filter, sb);
1758         ui_racct_foreach(rctl_get_rules_callback,
1759             rctl_rule_pre_callback, rctl_rule_post_callback,
1760             filter, sb);
1761         prison_racct_foreach(rctl_get_rules_callback,
1762             rctl_rule_pre_callback, rctl_rule_post_callback,
1763             filter, sb);
1764         if (sbuf_error(sb) == ENOMEM) {
1765                 error = ERANGE;
1766                 goto out;
1767         }
1768
1769         /*
1770          * Remove trailing ",".
1771          */
1772         if (sbuf_len(sb) > 0)
1773                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1774
1775         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1776 out:
1777         rctl_rule_release(filter);
1778         sx_sunlock(&allproc_lock);
1779         free(buf, M_RCTL);
1780         return (error);
1781 }
1782
1783 int
1784 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1785 {
1786         struct sbuf *sb;
1787         struct rctl_rule *filter;
1788         struct rctl_rule_link *link;
1789         char *inputstr, *buf;
1790         size_t bufsize;
1791         int error;
1792
1793         if (!racct_enable)
1794                 return (ENOSYS);
1795
1796         error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1797         if (error != 0)
1798                 return (error);
1799
1800         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1801         if (error != 0)
1802                 return (error);
1803
1804         sx_slock(&allproc_lock);
1805         error = rctl_string_to_rule(inputstr, &filter);
1806         free(inputstr, M_RCTL);
1807         if (error != 0) {
1808                 sx_sunlock(&allproc_lock);
1809                 return (error);
1810         }
1811
1812         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1813                 rctl_rule_release(filter);
1814                 sx_sunlock(&allproc_lock);
1815                 return (EINVAL);
1816         }
1817         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1818                 rctl_rule_release(filter);
1819                 sx_sunlock(&allproc_lock);
1820                 return (EOPNOTSUPP);
1821         }
1822         if (filter->rr_subject.rs_proc == NULL) {
1823                 rctl_rule_release(filter);
1824                 sx_sunlock(&allproc_lock);
1825                 return (EINVAL);
1826         }
1827
1828         bufsize = uap->outbuflen;
1829         if (bufsize > rctl_maxbufsize) {
1830                 rctl_rule_release(filter);
1831                 sx_sunlock(&allproc_lock);
1832                 return (E2BIG);
1833         }
1834
1835         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1836         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1837         KASSERT(sb != NULL, ("sbuf_new failed"));
1838
1839         RACCT_LOCK();
1840         LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1841             rrl_next) {
1842                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1843                 sbuf_printf(sb, ",");
1844         }
1845         RACCT_UNLOCK();
1846         if (sbuf_error(sb) == ENOMEM) {
1847                 error = ERANGE;
1848                 sbuf_delete(sb);
1849                 goto out;
1850         }
1851
1852         /*
1853          * Remove trailing ",".
1854          */
1855         if (sbuf_len(sb) > 0)
1856                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1857
1858         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1859 out:
1860         rctl_rule_release(filter);
1861         sx_sunlock(&allproc_lock);
1862         free(buf, M_RCTL);
1863         return (error);
1864 }
1865
1866 int
1867 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1868 {
1869         struct rctl_rule *rule;
1870         char *inputstr;
1871         int error;
1872
1873         if (!racct_enable)
1874                 return (ENOSYS);
1875
1876         error = priv_check(td, PRIV_RCTL_ADD_RULE);
1877         if (error != 0)
1878                 return (error);
1879
1880         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1881         if (error != 0)
1882                 return (error);
1883
1884         sx_slock(&allproc_lock);
1885         error = rctl_string_to_rule(inputstr, &rule);
1886         free(inputstr, M_RCTL);
1887         if (error != 0) {
1888                 sx_sunlock(&allproc_lock);
1889                 return (error);
1890         }
1891         /*
1892          * The 'per' part of a rule is optional.
1893          */
1894         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1895             rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1896                 rule->rr_per = rule->rr_subject_type;
1897
1898         if (!rctl_rule_fully_specified(rule)) {
1899                 error = EINVAL;
1900                 goto out;
1901         }
1902
1903         error = rctl_rule_add(rule);
1904
1905 out:
1906         rctl_rule_release(rule);
1907         sx_sunlock(&allproc_lock);
1908         return (error);
1909 }
1910
1911 int
1912 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1913 {
1914         struct rctl_rule *filter;
1915         char *inputstr;
1916         int error;
1917
1918         if (!racct_enable)
1919                 return (ENOSYS);
1920
1921         error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1922         if (error != 0)
1923                 return (error);
1924
1925         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1926         if (error != 0)
1927                 return (error);
1928
1929         sx_slock(&allproc_lock);
1930         error = rctl_string_to_rule(inputstr, &filter);
1931         free(inputstr, M_RCTL);
1932         if (error != 0) {
1933                 sx_sunlock(&allproc_lock);
1934                 return (error);
1935         }
1936
1937         error = rctl_rule_remove(filter);
1938         rctl_rule_release(filter);
1939         sx_sunlock(&allproc_lock);
1940
1941         return (error);
1942 }
1943
/*
 * Update RCTL rule list after credential change.
 *
 * Rebuilds the rule list of process 'p' so that it consists of the
 * process' own 'process'-subject rules plus every rule currently linked
 * to the uidinfo, login class and prison racct taken from 'newcred'.
 *
 * The work is done in three steps that are repeated until they succeed:
 * count the rules with the racct lock held, allocate that many empty
 * links with the lock dropped, then retake the lock and fill the links
 * in.  If the number of rules found while filling differs from the
 * count, everything allocated so far is freed and the procedure starts
 * over.
 *
 * Does nothing when racct is disabled.  Asserts that the caller does not
 * own the process lock of 'p'.
 */
void
rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
{
        LIST_HEAD(, rctl_rule_link) newrules;
        struct rctl_rule_link *link, *newlink;
        struct uidinfo *newuip;
        struct loginclass *newlc;
        struct prison_racct *newprr;
        int rulecnt, i;

        if (!racct_enable)
                return;

        PROC_LOCK_ASSERT(p, MA_NOTOWNED);

        /* Containers whose rules apply under the new credentials. */
        newuip = newcred->cr_ruidinfo;
        newlc = newcred->cr_loginclass;
        newprr = newcred->cr_prison->pr_prison_racct;

        LIST_INIT(&newrules);

again:
        /*
         * First, count the rules that apply to the process with new
         * credentials.  Of the rules currently linked to the process
         * only those with a 'process' subject are carried over.
         */
        rulecnt = 0;
        RACCT_LOCK();
        LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
                if (link->rrl_rule->rr_subject_type ==
                    RCTL_SUBJECT_TYPE_PROCESS)
                        rulecnt++;
        }
        LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
                rulecnt++;
        LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
                rulecnt++;
        LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
                rulecnt++;
        RACCT_UNLOCK();

        /*
         * Create temporary list.  The racct lock was dropped above in
         * order to use M_WAITOK.  Every link starts out without a rule
         * so that the cleanup code can tell filled entries from empty
         * ones.
         */
        for (i = 0; i < rulecnt; i++) {
                newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
                newlink->rrl_rule = NULL;
                newlink->rrl_exceeded = 0;
                LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
        }

        /* Cursor pointing at the next unused entry of the new list. */
        newlink = LIST_FIRST(&newrules);

        /*
         * Assign rules to the newly allocated list entries.  rulecnt is
         * decremented for every entry filled; running out of entries
         * (newlink == NULL) means that there are now more rules than
         * were counted.
         */
        RACCT_LOCK();
        LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
                if (link->rrl_rule->rr_subject_type ==
                    RCTL_SUBJECT_TYPE_PROCESS) {
                        if (newlink == NULL)
                                goto goaround;
                        rctl_rule_acquire(link->rrl_rule);
                        newlink->rrl_rule = link->rrl_rule;
                        newlink->rrl_exceeded = link->rrl_exceeded;
                        newlink = LIST_NEXT(newlink, rrl_next);
                        rulecnt--;
                }
        }

        LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
                if (newlink == NULL)
                        goto goaround;
                rctl_rule_acquire(link->rrl_rule);
                newlink->rrl_rule = link->rrl_rule;
                newlink->rrl_exceeded = link->rrl_exceeded;
                newlink = LIST_NEXT(newlink, rrl_next);
                rulecnt--;
        }

        LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
                if (newlink == NULL)
                        goto goaround;
                rctl_rule_acquire(link->rrl_rule);
                newlink->rrl_rule = link->rrl_rule;
                newlink->rrl_exceeded = link->rrl_exceeded;
                newlink = LIST_NEXT(newlink, rrl_next);
                rulecnt--;
        }

        LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
                if (newlink == NULL)
                        goto goaround;
                rctl_rule_acquire(link->rrl_rule);
                newlink->rrl_rule = link->rrl_rule;
                newlink->rrl_exceeded = link->rrl_exceeded;
                newlink = LIST_NEXT(newlink, rrl_next);
                rulecnt--;
        }

        /*
         * rulecnt reaching zero means that exactly as many rules were
         * found as had been counted, i.e. every allocated entry is in
         * use.
         */
        if (rulecnt == 0) {
                /*
                 * Free the old rule list.
                 */
                while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
                        link = LIST_FIRST(&p->p_racct->r_rule_links);
                        LIST_REMOVE(link, rrl_next);
                        rctl_rule_release(link->rrl_rule);
                        uma_zfree(rctl_rule_link_zone, link);
                }

                /*
                 * Replace lists and we're done.
                 *
                 * XXX: Is there any way to switch list heads instead
                 *      of iterating here?
                 */
                while (!LIST_EMPTY(&newrules)) {
                        newlink = LIST_FIRST(&newrules);
                        LIST_REMOVE(newlink, rrl_next);
                        LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
                            newlink, rrl_next);
                }

                RACCT_UNLOCK();

                return;
        }

        /*
         * Reached either by the jumps above (more rules than counted) or
         * by falling through (fewer rules than counted).
         */
goaround:
        RACCT_UNLOCK();

        /*
         * Rule list changed while we were not holding the racct lock.
         * Free the new list and try again.  Only entries that were
         * already filled in hold a rule reference that has to be
         * dropped.
         */
        while (!LIST_EMPTY(&newrules)) {
                newlink = LIST_FIRST(&newrules);
                LIST_REMOVE(newlink, rrl_next);
                if (newlink->rrl_rule != NULL)
                        rctl_rule_release(newlink->rrl_rule);
                uma_zfree(rctl_rule_link_zone, newlink);
        }

        goto again;
}
2094
2095 /*
2096  * Assign RCTL rules to the newly created process.
2097  */
2098 int
2099 rctl_proc_fork(struct proc *parent, struct proc *child)
2100 {
2101         struct rctl_rule *rule;
2102         struct rctl_rule_link *link;
2103         int error;
2104
2105         ASSERT_RACCT_ENABLED();
2106         RACCT_LOCK_ASSERT();
2107         KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2108
2109         LIST_INIT(&child->p_racct->r_rule_links);
2110
2111         /*
2112          * Go through limits applicable to the parent and assign them
2113          * to the child.  Rules with 'process' subject have to be duplicated
2114          * in order to make their rr_subject point to the new process.
2115          */
2116         LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2117                 if (link->rrl_rule->rr_subject_type ==
2118                     RCTL_SUBJECT_TYPE_PROCESS) {
2119                         rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2120                         if (rule == NULL)
2121                                 goto fail;
2122                         KASSERT(rule->rr_subject.rs_proc == parent,
2123                             ("rule->rr_subject.rs_proc != parent"));
2124                         rule->rr_subject.rs_proc = child;
2125                         error = rctl_racct_add_rule_locked(child->p_racct,
2126                             rule);
2127                         rctl_rule_release(rule);
2128                         if (error != 0)
2129                                 goto fail;
2130                 } else {
2131                         error = rctl_racct_add_rule_locked(child->p_racct,
2132                             link->rrl_rule);
2133                         if (error != 0)
2134                                 goto fail;
2135                 }
2136         }
2137
2138         return (0);
2139
2140 fail:
2141         while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2142                 link = LIST_FIRST(&child->p_racct->r_rule_links);
2143                 LIST_REMOVE(link, rrl_next);
2144                 rctl_rule_release(link->rrl_rule);
2145                 uma_zfree(rctl_rule_link_zone, link);
2146         }
2147
2148         return (EAGAIN);
2149 }
2150
2151 /*
2152  * Release rules attached to the racct.
2153  */
2154 void
2155 rctl_racct_release(struct racct *racct)
2156 {
2157         struct rctl_rule_link *link;
2158
2159         ASSERT_RACCT_ENABLED();
2160         RACCT_LOCK_ASSERT();
2161
2162         while (!LIST_EMPTY(&racct->r_rule_links)) {
2163                 link = LIST_FIRST(&racct->r_rule_links);
2164                 LIST_REMOVE(link, rrl_next);
2165                 rctl_rule_release(link->rrl_rule);
2166                 uma_zfree(rctl_rule_link_zone, link);
2167         }
2168 }
2169
2170 static void
2171 rctl_init(void)
2172 {
2173
2174         if (!racct_enable)
2175                 return;
2176
2177         rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2178             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2179         rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2180             sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2181             UMA_ALIGN_PTR, 0);
2182
2183         /*
2184          * Set default values, making sure not to overwrite the ones
2185          * fetched from tunables.  Most of those could be set at the
2186          * declaration, except for the rctl_throttle_max - we cannot
2187          * set it there due to hz not being compile time constant.
2188          */
2189         if (rctl_throttle_min < 1)
2190                 rctl_throttle_min = 1;
2191         if (rctl_throttle_max < rctl_throttle_min)
2192                 rctl_throttle_max = 2 * hz;
2193         if (rctl_throttle_pct < 0)
2194                 rctl_throttle_pct = 100;
2195         if (rctl_throttle_pct2 < 0)
2196                 rctl_throttle_pct2 = 100;
2197 }
2198
2199 #else /* !RCTL */
2200
/*
 * Stub compiled in when RCTL is not configured: the system call is
 * reported as not implemented.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

        return (ENOSYS);
}
2207
/*
 * Stub compiled in when RCTL is not configured: the system call is
 * reported as not implemented.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

        return (ENOSYS);
}
2214
/*
 * Stub compiled in when RCTL is not configured: the system call is
 * reported as not implemented.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

        return (ENOSYS);
}
2221
/*
 * Stub compiled in when RCTL is not configured: the system call is
 * reported as not implemented.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

        return (ENOSYS);
}
2228
/*
 * Stub compiled in when RCTL is not configured: the system call is
 * reported as not implemented.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

        return (ENOSYS);
}
2235
2236 #endif /* !RCTL */