]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/kern/kern_rctl.c
Implement pci_enable_msi() and pci_disable_msi() in the LinuxKPI.
[FreeBSD/FreeBSD.git] / sys / kern / kern_rctl.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010 The FreeBSD Foundation
5  * All rights reserved.
6  *
7  * This software was developed by Edward Tomasz Napierala under sponsorship
8  * from the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  * $FreeBSD$
32  */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/param.h>
38 #include <sys/bus.h>
39 #include <sys/malloc.h>
40 #include <sys/queue.h>
41 #include <sys/refcount.h>
42 #include <sys/jail.h>
43 #include <sys/kernel.h>
44 #include <sys/limits.h>
45 #include <sys/loginclass.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/racct.h>
49 #include <sys/rctl.h>
50 #include <sys/resourcevar.h>
51 #include <sys/sx.h>
52 #include <sys/sysent.h>
53 #include <sys/sysproto.h>
54 #include <sys/systm.h>
55 #include <sys/types.h>
56 #include <sys/eventhandler.h>
57 #include <sys/lock.h>
58 #include <sys/mutex.h>
59 #include <sys/rwlock.h>
60 #include <sys/sbuf.h>
61 #include <sys/taskqueue.h>
62 #include <sys/tree.h>
63 #include <vm/uma.h>
64
65 #ifdef RCTL
66 #ifndef RACCT
67 #error "The RCTL option requires the RACCT option"
68 #endif
69
70 FEATURE(rctl, "Resource Limits");
71
/*
 * HRF_* flag values.  They are not referenced in the part of the file
 * visible here; NOTE(review): confirm their meaning against their users.
 */
#define HRF_DEFAULT             0
#define HRF_DONT_INHERIT        1
#define HRF_DONT_ACCUMULATE     2

/*
 * Buffer sizes, in bytes.  The products are parenthesized so that the
 * macros expand correctly inside larger expressions, e.g.
 * "len / RCTL_MAX_INBUFSIZE" or "RCTL_MAX_OUTBUFSIZE % n".
 *
 * RCTL_MAX_OUTBUFSIZE is the initial value of rctl_maxbufsize;
 * RCTL_LOG_BUFSIZE is the size of the buffers rctl_enforce() allocates
 * for its log and devctl messages.
 */
#define RCTL_MAX_INBUFSIZE      (4 * 1024)
#define RCTL_MAX_OUTBUFSIZE     (16 * 1024 * 1024)
#define RCTL_LOG_BUFSIZE        128

/*
 * Margin that rctl_pcpu_available() subtracts from the available %cpu
 * when the limit is more than twice this value.
 */
#define RCTL_PCPU_SHIFT         (10 * 1000000)
81
82 static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
83 static int rctl_log_rate_limit = 10;
84 static int rctl_devctl_rate_limit = 10;
85
86 /*
87  * Values below are initialized in rctl_init().
88  */
89 static int rctl_throttle_min = -1;
90 static int rctl_throttle_max = -1;
91 static int rctl_throttle_pct = -1;
92 static int rctl_throttle_pct2 = -1;
93
94 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
95 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
96 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
97 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);
98
99 SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
100 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
101     &rctl_maxbufsize, 0, "Maximum output buffer size");
102 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
103     &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
104 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
105     &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
106 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
107     CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
108     "Shortest throttling duration, in hz");
109 TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
110 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
111     CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
112     "Longest throttling duration, in hz");
113 TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
114 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
115     CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
116     "Throttling penalty for process consumption, in percent");
117 TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
118 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
119     CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
120     "Throttling penalty for container consumption, in percent");
121 TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
122
123 /*
124  * 'rctl_rule_link' connects a rule with every racct it's related to.
125  * For example, rule 'user:X:openfiles:deny=N/process' is linked
126  * with uidinfo for user X, and to each process of that user.
127  */
128 struct rctl_rule_link {
129         LIST_ENTRY(rctl_rule_link)      rrl_next;
130         struct rctl_rule                *rrl_rule;
131         int                             rrl_exceeded;
132 };
133
/*
 * One entry of a name <-> value translation table.  Tables are arrays of
 * these, terminated by an entry whose d_name is NULL.
 */
struct dict {
        const char *d_name;     /* textual name */
        int d_value;            /* numeric constant the name stands for */
};
138
139 static struct dict subjectnames[] = {
140         { "process", RCTL_SUBJECT_TYPE_PROCESS },
141         { "user", RCTL_SUBJECT_TYPE_USER },
142         { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
143         { "jail", RCTL_SUBJECT_TYPE_JAIL },
144         { NULL, -1 }};
145
146 static struct dict resourcenames[] = {
147         { "cputime", RACCT_CPU },
148         { "datasize", RACCT_DATA },
149         { "stacksize", RACCT_STACK },
150         { "coredumpsize", RACCT_CORE },
151         { "memoryuse", RACCT_RSS },
152         { "memorylocked", RACCT_MEMLOCK },
153         { "maxproc", RACCT_NPROC },
154         { "openfiles", RACCT_NOFILE },
155         { "vmemoryuse", RACCT_VMEM },
156         { "pseudoterminals", RACCT_NPTS },
157         { "swapuse", RACCT_SWAP },
158         { "nthr", RACCT_NTHR },
159         { "msgqqueued", RACCT_MSGQQUEUED },
160         { "msgqsize", RACCT_MSGQSIZE },
161         { "nmsgq", RACCT_NMSGQ },
162         { "nsem", RACCT_NSEM },
163         { "nsemop", RACCT_NSEMOP },
164         { "nshm", RACCT_NSHM },
165         { "shmsize", RACCT_SHMSIZE },
166         { "wallclock", RACCT_WALLCLOCK },
167         { "pcpu", RACCT_PCTCPU },
168         { "readbps", RACCT_READBPS },
169         { "writebps", RACCT_WRITEBPS },
170         { "readiops", RACCT_READIOPS },
171         { "writeiops", RACCT_WRITEIOPS },
172         { NULL, -1 }};
173
174 static struct dict actionnames[] = {
175         { "sighup", RCTL_ACTION_SIGHUP },
176         { "sigint", RCTL_ACTION_SIGINT },
177         { "sigquit", RCTL_ACTION_SIGQUIT },
178         { "sigill", RCTL_ACTION_SIGILL },
179         { "sigtrap", RCTL_ACTION_SIGTRAP },
180         { "sigabrt", RCTL_ACTION_SIGABRT },
181         { "sigemt", RCTL_ACTION_SIGEMT },
182         { "sigfpe", RCTL_ACTION_SIGFPE },
183         { "sigkill", RCTL_ACTION_SIGKILL },
184         { "sigbus", RCTL_ACTION_SIGBUS },
185         { "sigsegv", RCTL_ACTION_SIGSEGV },
186         { "sigsys", RCTL_ACTION_SIGSYS },
187         { "sigpipe", RCTL_ACTION_SIGPIPE },
188         { "sigalrm", RCTL_ACTION_SIGALRM },
189         { "sigterm", RCTL_ACTION_SIGTERM },
190         { "sigurg", RCTL_ACTION_SIGURG },
191         { "sigstop", RCTL_ACTION_SIGSTOP },
192         { "sigtstp", RCTL_ACTION_SIGTSTP },
193         { "sigchld", RCTL_ACTION_SIGCHLD },
194         { "sigttin", RCTL_ACTION_SIGTTIN },
195         { "sigttou", RCTL_ACTION_SIGTTOU },
196         { "sigio", RCTL_ACTION_SIGIO },
197         { "sigxcpu", RCTL_ACTION_SIGXCPU },
198         { "sigxfsz", RCTL_ACTION_SIGXFSZ },
199         { "sigvtalrm", RCTL_ACTION_SIGVTALRM },
200         { "sigprof", RCTL_ACTION_SIGPROF },
201         { "sigwinch", RCTL_ACTION_SIGWINCH },
202         { "siginfo", RCTL_ACTION_SIGINFO },
203         { "sigusr1", RCTL_ACTION_SIGUSR1 },
204         { "sigusr2", RCTL_ACTION_SIGUSR2 },
205         { "sigthr", RCTL_ACTION_SIGTHR },
206         { "deny", RCTL_ACTION_DENY },
207         { "log", RCTL_ACTION_LOG },
208         { "devctl", RCTL_ACTION_DEVCTL },
209         { "throttle", RCTL_ACTION_THROTTLE },
210         { NULL, -1 }};
211
212 static void rctl_init(void);
213 SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
214
215 static uma_zone_t rctl_rule_zone;
216 static uma_zone_t rctl_rule_link_zone;
217
218 static int rctl_rule_fully_specified(const struct rctl_rule *rule);
219 static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
220
221 static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
222
223 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
224 {
225         int error, val = rctl_throttle_min;
226
227         error = sysctl_handle_int(oidp, &val, 0, req);
228         if (error || !req->newptr)
229                 return (error);
230         if (val < 1 || val > rctl_throttle_max)
231                 return (EINVAL);
232
233         RACCT_LOCK();
234         rctl_throttle_min = val;
235         RACCT_UNLOCK();
236
237         return (0);
238 }
239
240 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
241 {
242         int error, val = rctl_throttle_max;
243
244         error = sysctl_handle_int(oidp, &val, 0, req);
245         if (error || !req->newptr)
246                 return (error);
247         if (val < rctl_throttle_min)
248                 return (EINVAL);
249
250         RACCT_LOCK();
251         rctl_throttle_max = val;
252         RACCT_UNLOCK();
253
254         return (0);
255 }
256
257 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
258 {
259         int error, val = rctl_throttle_pct;
260
261         error = sysctl_handle_int(oidp, &val, 0, req);
262         if (error || !req->newptr)
263                 return (error);
264         if (val < 0)
265                 return (EINVAL);
266
267         RACCT_LOCK();
268         rctl_throttle_pct = val;
269         RACCT_UNLOCK();
270
271         return (0);
272 }
273
274 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
275 {
276         int error, val = rctl_throttle_pct2;
277
278         error = sysctl_handle_int(oidp, &val, 0, req);
279         if (error || !req->newptr)
280                 return (error);
281         if (val < 0)
282                 return (EINVAL);
283
284         RACCT_LOCK();
285         rctl_throttle_pct2 = val;
286         RACCT_UNLOCK();
287
288         return (0);
289 }
290
291 static const char *
292 rctl_subject_type_name(int subject)
293 {
294         int i;
295
296         for (i = 0; subjectnames[i].d_name != NULL; i++) {
297                 if (subjectnames[i].d_value == subject)
298                         return (subjectnames[i].d_name);
299         }
300
301         panic("rctl_subject_type_name: unknown subject type %d", subject);
302 }
303
304 static const char *
305 rctl_action_name(int action)
306 {
307         int i;
308
309         for (i = 0; actionnames[i].d_name != NULL; i++) {
310                 if (actionnames[i].d_value == action)
311                         return (actionnames[i].d_name);
312         }
313
314         panic("rctl_action_name: unknown action %d", action);
315 }
316
317 const char *
318 rctl_resource_name(int resource)
319 {
320         int i;
321
322         for (i = 0; resourcenames[i].d_name != NULL; i++) {
323                 if (resourcenames[i].d_value == resource)
324                         return (resourcenames[i].d_name);
325         }
326
327         panic("rctl_resource_name: unknown resource %d", resource);
328 }
329
330 static struct racct *
331 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
332 {
333         struct ucred *cred = p->p_ucred;
334
335         ASSERT_RACCT_ENABLED();
336         RACCT_LOCK_ASSERT();
337
338         switch (rule->rr_per) {
339         case RCTL_SUBJECT_TYPE_PROCESS:
340                 return (p->p_racct);
341         case RCTL_SUBJECT_TYPE_USER:
342                 return (cred->cr_ruidinfo->ui_racct);
343         case RCTL_SUBJECT_TYPE_LOGINCLASS:
344                 return (cred->cr_loginclass->lc_racct);
345         case RCTL_SUBJECT_TYPE_JAIL:
346                 return (cred->cr_prison->pr_prison_racct->prr_racct);
347         default:
348                 panic("%s: unknown per %d", __func__, rule->rr_per);
349         }
350 }
351
352 /*
353  * Return the amount of resource that can be allocated by 'p' before
354  * hitting 'rule'.
355  */
356 static int64_t
357 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
358 {
359         const struct racct *racct;
360         int64_t available;
361
362         ASSERT_RACCT_ENABLED();
363         RACCT_LOCK_ASSERT();
364
365         racct = rctl_proc_rule_to_racct(p, rule);
366         available = rule->rr_amount - racct->r_resources[rule->rr_resource];
367
368         return (available);
369 }
370
371 /*
372  * Called every second for proc, uidinfo, loginclass, and jail containers.
373  * If the limit isn't exceeded, it decreases the usage amount to zero.
374  * Otherwise, it decreases it by the value of the limit.  This way
375  * resource consumption exceeding the limit "carries over" to the next
376  * period.
377  */
378 void
379 rctl_throttle_decay(struct racct *racct, int resource)
380 {
381         struct rctl_rule *rule;
382         struct rctl_rule_link *link;
383         int64_t minavailable;
384
385         ASSERT_RACCT_ENABLED();
386         RACCT_LOCK_ASSERT();
387
388         minavailable = INT64_MAX;
389
390         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
391                 rule = link->rrl_rule;
392
393                 if (rule->rr_resource != resource)
394                         continue;
395                 if (rule->rr_action != RCTL_ACTION_THROTTLE)
396                         continue;
397
398                 if (rule->rr_amount < minavailable)
399                         minavailable = rule->rr_amount;
400         }
401
402         if (racct->r_resources[resource] < minavailable) {
403                 racct->r_resources[resource] = 0;
404         } else {
405                 /*
406                  * Cap utilization counter at ten times the limit.  Otherwise,
407                  * if we changed the rule lowering the allowed amount, it could
408                  * take unreasonably long time for the accumulated resource
409                  * usage to drop.
410                  */
411                 if (racct->r_resources[resource] > minavailable * 10)
412                         racct->r_resources[resource] = minavailable * 10;
413
414                 racct->r_resources[resource] -= minavailable;
415         }
416 }
417
418 /*
419  * Special version of rctl_get_available() for the %CPU resource.
420  * We slightly cheat here and return less than we normally would.
421  */
422 int64_t
423 rctl_pcpu_available(const struct proc *p) {
424         struct rctl_rule *rule;
425         struct rctl_rule_link *link;
426         int64_t available, minavailable, limit;
427
428         ASSERT_RACCT_ENABLED();
429         RACCT_LOCK_ASSERT();
430
431         minavailable = INT64_MAX;
432         limit = 0;
433
434         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
435                 rule = link->rrl_rule;
436                 if (rule->rr_resource != RACCT_PCTCPU)
437                         continue;
438                 if (rule->rr_action != RCTL_ACTION_DENY)
439                         continue;
440                 available = rctl_available_resource(p, rule);
441                 if (available < minavailable) {
442                         minavailable = available;
443                         limit = rule->rr_amount;
444                 }
445         }
446
447         /*
448          * Return slightly less than actual value of the available
449          * %cpu resource.  This makes %cpu throttling more aggressive
450          * and lets us act sooner than the limits are already exceeded.
451          */
452         if (limit != 0) {
453                 if (limit > 2 * RCTL_PCPU_SHIFT)
454                         minavailable -= RCTL_PCPU_SHIFT;
455                 else
456                         minavailable -= (limit / 2);
457         }
458
459         return (minavailable);
460 }
461
/*
 * Saturating addition: returns a + b, or UINT64_MAX if the sum does not
 * fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

        /* The sum overflows exactly when b exceeds the room left above a. */
        if (b > UINT64_MAX - a)
                return (UINT64_MAX);

        return (a + b);
}
477
/*
 * Saturating multiplication: returns a * b, or UINT64_MAX if the product
 * does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

        if (b == 0)
                return (0);

        return (a > UINT64_MAX / b ? UINT64_MAX : a * b);
}
487
488 /*
489  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
490  * to what it keeps allocated now.  Returns non-zero if the allocation should
491  * be denied, 0 otherwise.
492  */
493 int
494 rctl_enforce(struct proc *p, int resource, uint64_t amount)
495 {
496         static struct timeval log_lasttime, devctl_lasttime;
497         static int log_curtime = 0, devctl_curtime = 0;
498         struct rctl_rule *rule;
499         struct rctl_rule_link *link;
500         struct sbuf sb;
501         char *buf;
502         int64_t available;
503         uint64_t sleep_ms, sleep_ratio;
504         int should_deny = 0;
505
506         ASSERT_RACCT_ENABLED();
507         RACCT_LOCK_ASSERT();
508
509         /*
510          * There may be more than one matching rule; go through all of them.
511          * Denial should be done last, after logging and sending signals.
512          */
513         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
514                 rule = link->rrl_rule;
515                 if (rule->rr_resource != resource)
516                         continue;
517
518                 available = rctl_available_resource(p, rule);
519                 if (available >= (int64_t)amount) {
520                         link->rrl_exceeded = 0;
521                         continue;
522                 }
523
524                 switch (rule->rr_action) {
525                 case RCTL_ACTION_DENY:
526                         should_deny = 1;
527                         continue;
528                 case RCTL_ACTION_LOG:
529                         /*
530                          * If rrl_exceeded != 0, it means we've already
531                          * logged a warning for this process.
532                          */
533                         if (link->rrl_exceeded != 0)
534                                 continue;
535
536                         /*
537                          * If the process state is not fully initialized yet,
538                          * we can't access most of the required fields, e.g.
539                          * p->p_comm.  This happens when called from fork1().
540                          * Ignore this rule for now; it will be processed just
541                          * after fork, when called from racct_proc_fork_done().
542                          */
543                         if (p->p_state != PRS_NORMAL)
544                                 continue;
545
546                         if (!ppsratecheck(&log_lasttime, &log_curtime,
547                             rctl_log_rate_limit))
548                                 continue;
549
550                         buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
551                         if (buf == NULL) {
552                                 printf("rctl_enforce: out of memory\n");
553                                 continue;
554                         }
555                         sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
556                         rctl_rule_to_sbuf(&sb, rule);
557                         sbuf_finish(&sb);
558                         printf("rctl: rule \"%s\" matched by pid %d "
559                             "(%s), uid %d, jail %s\n", sbuf_data(&sb),
560                             p->p_pid, p->p_comm, p->p_ucred->cr_uid,
561                             p->p_ucred->cr_prison->pr_prison_racct->prr_name);
562                         sbuf_delete(&sb);
563                         free(buf, M_RCTL);
564                         link->rrl_exceeded = 1;
565                         continue;
566                 case RCTL_ACTION_DEVCTL:
567                         if (link->rrl_exceeded != 0)
568                                 continue;
569
570                         if (p->p_state != PRS_NORMAL)
571                                 continue;
572
573                         if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
574                             rctl_devctl_rate_limit))
575                                 continue;
576
577                         buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
578                         if (buf == NULL) {
579                                 printf("rctl_enforce: out of memory\n");
580                                 continue;
581                         }
582                         sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
583                         sbuf_printf(&sb, "rule=");
584                         rctl_rule_to_sbuf(&sb, rule);
585                         sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
586                             p->p_pid, p->p_ucred->cr_ruid,
587                             p->p_ucred->cr_prison->pr_prison_racct->prr_name);
588                         sbuf_finish(&sb);
589                         devctl_notify_f("RCTL", "rule", "matched",
590                             sbuf_data(&sb), M_NOWAIT);
591                         sbuf_delete(&sb);
592                         free(buf, M_RCTL);
593                         link->rrl_exceeded = 1;
594                         continue;
595                 case RCTL_ACTION_THROTTLE:
596                         if (p->p_state != PRS_NORMAL)
597                                 continue;
598
599                         /*
600                          * Make the process sleep for a fraction of second
601                          * proportional to the ratio of process' resource
602                          * utilization compared to the limit.  The point is
603                          * to penalize resource hogs: processes that consume
604                          * more of the available resources sleep for longer.
605                          *
606                          * We're trying to defer division until the very end,
607                          * to minimize the rounding effects.  The following
608                          * calculation could have been written in a clearer
609                          * way like this:
610                          *
611                          * sleep_ms = hz * p->p_racct->r_resources[resource] /
612                          *     rule->rr_amount;
613                          * sleep_ms *= rctl_throttle_pct / 100;
614                          * if (sleep_ms < rctl_throttle_min)
615                          *         sleep_ms = rctl_throttle_min;
616                          *
617                          */
618                         sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
619                         sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
620                         if (sleep_ms < rctl_throttle_min * rule->rr_amount)
621                                 sleep_ms = rctl_throttle_min * rule->rr_amount;
622
623                         /*
624                          * Multiply that by the ratio of the resource
625                          * consumption for the container compared to the limit,
626                          * squared.  In other words, a process in a container
627                          * that is two times over the limit will be throttled
628                          * four times as much for hitting the same rule.  The
629                          * point is to penalize processes more if the container
630                          * itself (eg certain UID or jail) is above the limit.
631                          */
632                         if (available < 0)
633                                 sleep_ratio = -available / rule->rr_amount;
634                         else
635                                 sleep_ratio = 0;
636                         sleep_ratio = xmul(sleep_ratio, sleep_ratio);
637                         sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
638                         sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
639
640                         /*
641                          * Finally the division.
642                          */
643                         sleep_ms /= rule->rr_amount;
644
645                         if (sleep_ms > rctl_throttle_max)
646                                 sleep_ms = rctl_throttle_max;
647 #if 0
648                         printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
649                            __func__, p->p_pid, p->p_comm,
650                            p->p_racct->r_resources[resource],
651                            rule->rr_amount, (uintmax_t)sleep_ms,
652                            (uintmax_t)sleep_ratio, (intmax_t)available);
653 #endif
654
655                         KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
656                             __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
657                         racct_proc_throttle(p, sleep_ms);
658                         continue;
659                 default:
660                         if (link->rrl_exceeded != 0)
661                                 continue;
662
663                         if (p->p_state != PRS_NORMAL)
664                                 continue;
665
666                         KASSERT(rule->rr_action > 0 &&
667                             rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
668                             ("rctl_enforce: unknown action %d",
669                              rule->rr_action));
670
671                         /*
672                          * We're using the fact that RCTL_ACTION_SIG* values
673                          * are equal to their counterparts from sys/signal.h.
674                          */
675                         kern_psignal(p, rule->rr_action);
676                         link->rrl_exceeded = 1;
677                         continue;
678                 }
679         }
680
681         if (should_deny) {
682                 /*
683                  * Return fake error code; the caller should change it
684                  * into one proper for the situation - EFSIZ, ENOMEM etc.
685                  */
686                 return (EDOOFUS);
687         }
688
689         return (0);
690 }
691
692 uint64_t
693 rctl_get_limit(struct proc *p, int resource)
694 {
695         struct rctl_rule *rule;
696         struct rctl_rule_link *link;
697         uint64_t amount = UINT64_MAX;
698
699         ASSERT_RACCT_ENABLED();
700         RACCT_LOCK_ASSERT();
701
702         /*
703          * There may be more than one matching rule; go through all of them.
704          * Denial should be done last, after logging and sending signals.
705          */
706         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
707                 rule = link->rrl_rule;
708                 if (rule->rr_resource != resource)
709                         continue;
710                 if (rule->rr_action != RCTL_ACTION_DENY)
711                         continue;
712                 if (rule->rr_amount < amount)
713                         amount = rule->rr_amount;
714         }
715
716         return (amount);
717 }
718
719 uint64_t
720 rctl_get_available(struct proc *p, int resource)
721 {
722         struct rctl_rule *rule;
723         struct rctl_rule_link *link;
724         int64_t available, minavailable, allocated;
725
726         minavailable = INT64_MAX;
727
728         ASSERT_RACCT_ENABLED();
729         RACCT_LOCK_ASSERT();
730
731         /*
732          * There may be more than one matching rule; go through all of them.
733          * Denial should be done last, after logging and sending signals.
734          */
735         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
736                 rule = link->rrl_rule;
737                 if (rule->rr_resource != resource)
738                         continue;
739                 if (rule->rr_action != RCTL_ACTION_DENY)
740                         continue;
741                 available = rctl_available_resource(p, rule);
742                 if (available < minavailable)
743                         minavailable = available;
744         }
745
746         /*
747          * XXX: Think about this _hard_.
748          */
749         allocated = p->p_racct->r_resources[resource];
750         if (minavailable < INT64_MAX - allocated)
751                 minavailable += allocated;
752         if (minavailable < 0)
753                 minavailable = 0;
754
755         return (minavailable);
756 }
757
758 static int
759 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
760 {
761
762         ASSERT_RACCT_ENABLED();
763
764         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
765                 if (rule->rr_subject_type != filter->rr_subject_type)
766                         return (0);
767
768                 switch (filter->rr_subject_type) {
769                 case RCTL_SUBJECT_TYPE_PROCESS:
770                         if (filter->rr_subject.rs_proc != NULL &&
771                             rule->rr_subject.rs_proc !=
772                             filter->rr_subject.rs_proc)
773                                 return (0);
774                         break;
775                 case RCTL_SUBJECT_TYPE_USER:
776                         if (filter->rr_subject.rs_uip != NULL &&
777                             rule->rr_subject.rs_uip !=
778                             filter->rr_subject.rs_uip)
779                                 return (0);
780                         break;
781                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
782                         if (filter->rr_subject.rs_loginclass != NULL &&
783                             rule->rr_subject.rs_loginclass !=
784                             filter->rr_subject.rs_loginclass)
785                                 return (0);
786                         break;
787                 case RCTL_SUBJECT_TYPE_JAIL:
788                         if (filter->rr_subject.rs_prison_racct != NULL &&
789                             rule->rr_subject.rs_prison_racct !=
790                             filter->rr_subject.rs_prison_racct)
791                                 return (0);
792                         break;
793                 default:
794                         panic("rctl_rule_matches: unknown subject type %d",
795                             filter->rr_subject_type);
796                 }
797         }
798
799         if (filter->rr_resource != RACCT_UNDEFINED) {
800                 if (rule->rr_resource != filter->rr_resource)
801                         return (0);
802         }
803
804         if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
805                 if (rule->rr_action != filter->rr_action)
806                         return (0);
807         }
808
809         if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
810                 if (rule->rr_amount != filter->rr_amount)
811                         return (0);
812         }
813
814         if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
815                 if (rule->rr_per != filter->rr_per)
816                         return (0);
817         }
818
819         return (1);
820 }
821
822 static int
823 str2value(const char *str, int *value, struct dict *table)
824 {
825         int i;
826
827         if (value == NULL)
828                 return (EINVAL);
829
830         for (i = 0; table[i].d_name != NULL; i++) {
831                 if (strcasecmp(table[i].d_name, str) == 0) {
832                         *value =  table[i].d_value;
833                         return (0);
834                 }
835         }
836
837         return (EINVAL);
838 }
839
/*
 * Parse a decimal identifier.
 *
 * Returns EINVAL when str is NULL or when the conversion stops before
 * the end of the string, and 0 otherwise.  Whenever str is non-NULL,
 * *value receives the result of strtoul(), even on the EINVAL path.
 */
static int
str2id(const char *str, id_t *value)
{
        char *stop;

        if (str == NULL)
                return (EINVAL);

        *value = strtoul(str, &stop, 10);

        /* Anything left over after the digits makes the input invalid. */
        if (*stop != '\0')
                return (EINVAL);

        return (0);
}
854
/*
 * Parse a decimal, non-negative 64-bit quantity.
 *
 * Returns EINVAL when str is NULL or when the conversion stops before
 * the end of the string, ERANGE when the stored result is negative,
 * and 0 otherwise.  The conversion itself is done with strtoul().
 */
static int
str2int64(const char *str, int64_t *value)
{
        char *stop;

        if (str == NULL)
                return (EINVAL);

        *value = strtoul(str, &stop, 10);

        /* Anything left over after the digits makes the input invalid. */
        if (*stop != '\0')
                return (EINVAL);

        return (*value < 0 ? ERANGE : 0);
}
872
873 /*
874  * Connect the rule to the racct, increasing refcount for the rule.
875  */
876 static void
877 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
878 {
879         struct rctl_rule_link *link;
880
881         ASSERT_RACCT_ENABLED();
882         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
883
884         rctl_rule_acquire(rule);
885         link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
886         link->rrl_rule = rule;
887         link->rrl_exceeded = 0;
888
889         RACCT_LOCK();
890         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
891         RACCT_UNLOCK();
892 }
893
894 static int
895 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
896 {
897         struct rctl_rule_link *link;
898
899         ASSERT_RACCT_ENABLED();
900         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
901         RACCT_LOCK_ASSERT();
902
903         link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
904         if (link == NULL)
905                 return (ENOMEM);
906         rctl_rule_acquire(rule);
907         link->rrl_rule = rule;
908         link->rrl_exceeded = 0;
909
910         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
911
912         return (0);
913 }
914
915 /*
916  * Remove limits for a rules matching the filter and release
917  * the refcounts for the rules, possibly freeing them.  Returns
918  * the number of limit structures removed.
919  */
920 static int
921 rctl_racct_remove_rules(struct racct *racct,
922     const struct rctl_rule *filter)
923 {
924         struct rctl_rule_link *link, *linktmp;
925         int removed = 0;
926
927         ASSERT_RACCT_ENABLED();
928         RACCT_LOCK_ASSERT();
929
930         LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
931                 if (!rctl_rule_matches(link->rrl_rule, filter))
932                         continue;
933
934                 LIST_REMOVE(link, rrl_next);
935                 rctl_rule_release(link->rrl_rule);
936                 uma_zfree(rctl_rule_link_zone, link);
937                 removed++;
938         }
939         return (removed);
940 }
941
942 static void
943 rctl_rule_acquire_subject(struct rctl_rule *rule)
944 {
945
946         ASSERT_RACCT_ENABLED();
947
948         switch (rule->rr_subject_type) {
949         case RCTL_SUBJECT_TYPE_UNDEFINED:
950         case RCTL_SUBJECT_TYPE_PROCESS:
951                 break;
952         case RCTL_SUBJECT_TYPE_JAIL:
953                 if (rule->rr_subject.rs_prison_racct != NULL)
954                         prison_racct_hold(rule->rr_subject.rs_prison_racct);
955                 break;
956         case RCTL_SUBJECT_TYPE_USER:
957                 if (rule->rr_subject.rs_uip != NULL)
958                         uihold(rule->rr_subject.rs_uip);
959                 break;
960         case RCTL_SUBJECT_TYPE_LOGINCLASS:
961                 if (rule->rr_subject.rs_loginclass != NULL)
962                         loginclass_hold(rule->rr_subject.rs_loginclass);
963                 break;
964         default:
965                 panic("rctl_rule_acquire_subject: unknown subject type %d",
966                     rule->rr_subject_type);
967         }
968 }
969
970 static void
971 rctl_rule_release_subject(struct rctl_rule *rule)
972 {
973
974         ASSERT_RACCT_ENABLED();
975
976         switch (rule->rr_subject_type) {
977         case RCTL_SUBJECT_TYPE_UNDEFINED:
978         case RCTL_SUBJECT_TYPE_PROCESS:
979                 break;
980         case RCTL_SUBJECT_TYPE_JAIL:
981                 if (rule->rr_subject.rs_prison_racct != NULL)
982                         prison_racct_free(rule->rr_subject.rs_prison_racct);
983                 break;
984         case RCTL_SUBJECT_TYPE_USER:
985                 if (rule->rr_subject.rs_uip != NULL)
986                         uifree(rule->rr_subject.rs_uip);
987                 break;
988         case RCTL_SUBJECT_TYPE_LOGINCLASS:
989                 if (rule->rr_subject.rs_loginclass != NULL)
990                         loginclass_free(rule->rr_subject.rs_loginclass);
991                 break;
992         default:
993                 panic("rctl_rule_release_subject: unknown subject type %d",
994                     rule->rr_subject_type);
995         }
996 }
997
998 struct rctl_rule *
999 rctl_rule_alloc(int flags)
1000 {
1001         struct rctl_rule *rule;
1002
1003         ASSERT_RACCT_ENABLED();
1004
1005         rule = uma_zalloc(rctl_rule_zone, flags);
1006         if (rule == NULL)
1007                 return (NULL);
1008         rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1009         rule->rr_subject.rs_proc = NULL;
1010         rule->rr_subject.rs_uip = NULL;
1011         rule->rr_subject.rs_loginclass = NULL;
1012         rule->rr_subject.rs_prison_racct = NULL;
1013         rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1014         rule->rr_resource = RACCT_UNDEFINED;
1015         rule->rr_action = RCTL_ACTION_UNDEFINED;
1016         rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1017         refcount_init(&rule->rr_refcount, 1);
1018
1019         return (rule);
1020 }
1021
1022 struct rctl_rule *
1023 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1024 {
1025         struct rctl_rule *copy;
1026
1027         ASSERT_RACCT_ENABLED();
1028
1029         copy = uma_zalloc(rctl_rule_zone, flags);
1030         if (copy == NULL)
1031                 return (NULL);
1032         copy->rr_subject_type = rule->rr_subject_type;
1033         copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1034         copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1035         copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1036         copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1037         copy->rr_per = rule->rr_per;
1038         copy->rr_resource = rule->rr_resource;
1039         copy->rr_action = rule->rr_action;
1040         copy->rr_amount = rule->rr_amount;
1041         refcount_init(&copy->rr_refcount, 1);
1042         rctl_rule_acquire_subject(copy);
1043
1044         return (copy);
1045 }
1046
1047 void
1048 rctl_rule_acquire(struct rctl_rule *rule)
1049 {
1050
1051         ASSERT_RACCT_ENABLED();
1052         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1053
1054         refcount_acquire(&rule->rr_refcount);
1055 }
1056
1057 static void
1058 rctl_rule_free(void *context, int pending)
1059 {
1060         struct rctl_rule *rule;
1061         
1062         rule = (struct rctl_rule *)context;
1063
1064         ASSERT_RACCT_ENABLED();
1065         KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1066         
1067         /*
1068          * We don't need locking here; rule is guaranteed to be inaccessible.
1069          */
1070         
1071         rctl_rule_release_subject(rule);
1072         uma_zfree(rctl_rule_zone, rule);
1073 }
1074
1075 void
1076 rctl_rule_release(struct rctl_rule *rule)
1077 {
1078
1079         ASSERT_RACCT_ENABLED();
1080         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1081
1082         if (refcount_release(&rule->rr_refcount)) {
1083                 /*
1084                  * rctl_rule_release() is often called when iterating
1085                  * over all the uidinfo structures in the system,
1086                  * holding uihashtbl_lock.  Since rctl_rule_free()
1087                  * might end up calling uifree(), this would lead
1088                  * to lock recursion.  Use taskqueue to avoid this.
1089                  */
1090                 TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1091                 taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1092         }
1093 }
1094
1095 static int
1096 rctl_rule_fully_specified(const struct rctl_rule *rule)
1097 {
1098
1099         ASSERT_RACCT_ENABLED();
1100
1101         switch (rule->rr_subject_type) {
1102         case RCTL_SUBJECT_TYPE_UNDEFINED:
1103                 return (0);
1104         case RCTL_SUBJECT_TYPE_PROCESS:
1105                 if (rule->rr_subject.rs_proc == NULL)
1106                         return (0);
1107                 break;
1108         case RCTL_SUBJECT_TYPE_USER:
1109                 if (rule->rr_subject.rs_uip == NULL)
1110                         return (0);
1111                 break;
1112         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1113                 if (rule->rr_subject.rs_loginclass == NULL)
1114                         return (0);
1115                 break;
1116         case RCTL_SUBJECT_TYPE_JAIL:
1117                 if (rule->rr_subject.rs_prison_racct == NULL)
1118                         return (0);
1119                 break;
1120         default:
1121                 panic("rctl_rule_fully_specified: unknown subject type %d",
1122                     rule->rr_subject_type);
1123         }
1124         if (rule->rr_resource == RACCT_UNDEFINED)
1125                 return (0);
1126         if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1127                 return (0);
1128         if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1129                 return (0);
1130         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1131                 return (0);
1132
1133         return (1);
1134 }
1135
1136 static int
1137 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1138 {
1139         struct rctl_rule *rule;
1140         char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1141              *amountstr, *perstr;
1142         id_t id;
1143         int error = 0;
1144
1145         ASSERT_RACCT_ENABLED();
1146
1147         rule = rctl_rule_alloc(M_WAITOK);
1148
1149         subjectstr = strsep(&rulestr, ":");
1150         subject_idstr = strsep(&rulestr, ":");
1151         resourcestr = strsep(&rulestr, ":");
1152         actionstr = strsep(&rulestr, "=/");
1153         amountstr = strsep(&rulestr, "/");
1154         perstr = rulestr;
1155
1156         if (subjectstr == NULL || subjectstr[0] == '\0')
1157                 rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1158         else {
1159                 error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1160                 if (error != 0)
1161                         goto out;
1162         }
1163
1164         if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1165                 rule->rr_subject.rs_proc = NULL;
1166                 rule->rr_subject.rs_uip = NULL;
1167                 rule->rr_subject.rs_loginclass = NULL;
1168                 rule->rr_subject.rs_prison_racct = NULL;
1169         } else {
1170                 switch (rule->rr_subject_type) {
1171                 case RCTL_SUBJECT_TYPE_UNDEFINED:
1172                         error = EINVAL;
1173                         goto out;
1174                 case RCTL_SUBJECT_TYPE_PROCESS:
1175                         error = str2id(subject_idstr, &id);
1176                         if (error != 0)
1177                                 goto out;
1178                         sx_assert(&allproc_lock, SA_LOCKED);
1179                         rule->rr_subject.rs_proc = pfind(id);
1180                         if (rule->rr_subject.rs_proc == NULL) {
1181                                 error = ESRCH;
1182                                 goto out;
1183                         }
1184                         PROC_UNLOCK(rule->rr_subject.rs_proc);
1185                         break;
1186                 case RCTL_SUBJECT_TYPE_USER:
1187                         error = str2id(subject_idstr, &id);
1188                         if (error != 0)
1189                                 goto out;
1190                         rule->rr_subject.rs_uip = uifind(id);
1191                         break;
1192                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1193                         rule->rr_subject.rs_loginclass =
1194                             loginclass_find(subject_idstr);
1195                         if (rule->rr_subject.rs_loginclass == NULL) {
1196                                 error = ENAMETOOLONG;
1197                                 goto out;
1198                         }
1199                         break;
1200                 case RCTL_SUBJECT_TYPE_JAIL:
1201                         rule->rr_subject.rs_prison_racct =
1202                             prison_racct_find(subject_idstr);
1203                         if (rule->rr_subject.rs_prison_racct == NULL) {
1204                                 error = ENAMETOOLONG;
1205                                 goto out;
1206                         }
1207                         break;
1208                default:
1209                        panic("rctl_string_to_rule: unknown subject type %d",
1210                            rule->rr_subject_type);
1211                }
1212         }
1213
1214         if (resourcestr == NULL || resourcestr[0] == '\0')
1215                 rule->rr_resource = RACCT_UNDEFINED;
1216         else {
1217                 error = str2value(resourcestr, &rule->rr_resource,
1218                     resourcenames);
1219                 if (error != 0)
1220                         goto out;
1221         }
1222
1223         if (actionstr == NULL || actionstr[0] == '\0')
1224                 rule->rr_action = RCTL_ACTION_UNDEFINED;
1225         else {
1226                 error = str2value(actionstr, &rule->rr_action, actionnames);
1227                 if (error != 0)
1228                         goto out;
1229         }
1230
1231         if (amountstr == NULL || amountstr[0] == '\0')
1232                 rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1233         else {
1234                 error = str2int64(amountstr, &rule->rr_amount);
1235                 if (error != 0)
1236                         goto out;
1237                 if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1238                         if (rule->rr_amount > INT64_MAX / 1000000) {
1239                                 error = ERANGE;
1240                                 goto out;
1241                         }
1242                         rule->rr_amount *= 1000000;
1243                 }
1244         }
1245
1246         if (perstr == NULL || perstr[0] == '\0')
1247                 rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1248         else {
1249                 error = str2value(perstr, &rule->rr_per, subjectnames);
1250                 if (error != 0)
1251                         goto out;
1252         }
1253
1254 out:
1255         if (error == 0)
1256                 *rulep = rule;
1257         else
1258                 rctl_rule_release(rule);
1259
1260         return (error);
1261 }
1262
1263 /*
1264  * Link a rule with all the subjects it applies to.
1265  */
1266 int
1267 rctl_rule_add(struct rctl_rule *rule)
1268 {
1269         struct proc *p;
1270         struct ucred *cred;
1271         struct uidinfo *uip;
1272         struct prison *pr;
1273         struct prison_racct *prr;
1274         struct loginclass *lc;
1275         struct rctl_rule *rule2;
1276         int match;
1277
1278         ASSERT_RACCT_ENABLED();
1279         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1280
1281         /*
1282          * Some rules just don't make sense, like "deny" rule for an undeniable
1283          * resource.  The exception are the RSS and %CPU resources - they are
1284          * not deniable in the racct sense, but the limit is enforced in
1285          * a different way.
1286          */
1287         if (rule->rr_action == RCTL_ACTION_DENY &&
1288             !RACCT_IS_DENIABLE(rule->rr_resource) &&
1289             rule->rr_resource != RACCT_RSS &&
1290             rule->rr_resource != RACCT_PCTCPU) {
1291                 return (EOPNOTSUPP);
1292         }
1293
1294         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1295             !RACCT_IS_DECAYING(rule->rr_resource)) {
1296                 return (EOPNOTSUPP);
1297         }
1298
1299         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1300             rule->rr_resource == RACCT_PCTCPU) {
1301                 return (EOPNOTSUPP);
1302         }
1303
1304         if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1305             RACCT_IS_SLOPPY(rule->rr_resource)) {
1306                 return (EOPNOTSUPP);
1307         }
1308
1309         /*
1310          * Make sure there are no duplicated rules.  Also, for the "deny"
1311          * rules, remove ones differing only by "amount".
1312          */
1313         if (rule->rr_action == RCTL_ACTION_DENY) {
1314                 rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1315                 rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1316                 rctl_rule_remove(rule2);
1317                 rctl_rule_release(rule2);
1318         } else
1319                 rctl_rule_remove(rule);
1320
1321         switch (rule->rr_subject_type) {
1322         case RCTL_SUBJECT_TYPE_PROCESS:
1323                 p = rule->rr_subject.rs_proc;
1324                 KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1325
1326                 rctl_racct_add_rule(p->p_racct, rule);
1327                 /*
1328                  * In case of per-process rule, we don't have anything more
1329                  * to do.
1330                  */
1331                 return (0);
1332
1333         case RCTL_SUBJECT_TYPE_USER:
1334                 uip = rule->rr_subject.rs_uip;
1335                 KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1336                 rctl_racct_add_rule(uip->ui_racct, rule);
1337                 break;
1338
1339         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1340                 lc = rule->rr_subject.rs_loginclass;
1341                 KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1342                 rctl_racct_add_rule(lc->lc_racct, rule);
1343                 break;
1344
1345         case RCTL_SUBJECT_TYPE_JAIL:
1346                 prr = rule->rr_subject.rs_prison_racct;
1347                 KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1348                 rctl_racct_add_rule(prr->prr_racct, rule);
1349                 break;
1350
1351         default:
1352                 panic("rctl_rule_add: unknown subject type %d",
1353                     rule->rr_subject_type);
1354         }
1355
1356         /*
1357          * Now go through all the processes and add the new rule to the ones
1358          * it applies to.
1359          */
1360         sx_assert(&allproc_lock, SA_LOCKED);
1361         FOREACH_PROC_IN_SYSTEM(p) {
1362                 cred = p->p_ucred;
1363                 switch (rule->rr_subject_type) {
1364                 case RCTL_SUBJECT_TYPE_USER:
1365                         if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1366                             cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1367                                 break;
1368                         continue;
1369                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1370                         if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1371                                 break;
1372                         continue;
1373                 case RCTL_SUBJECT_TYPE_JAIL:
1374                         match = 0;
1375                         for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1376                                 if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1377                                         match = 1;
1378                                         break;
1379                                 }
1380                         }
1381                         if (match)
1382                                 break;
1383                         continue;
1384                 default:
1385                         panic("rctl_rule_add: unknown subject type %d",
1386                             rule->rr_subject_type);
1387                 }
1388
1389                 rctl_racct_add_rule(p->p_racct, rule);
1390         }
1391
1392         return (0);
1393 }
1394
/*
 * "Pre" hook passed to the *_racct_foreach() iterators: takes the
 * racct lock.
 */
static void
rctl_rule_pre_callback(void)
{
        RACCT_LOCK();
}
1401
/*
 * "Post" hook passed to the *_racct_foreach() iterators: drops the
 * racct lock.
 */
static void
rctl_rule_post_callback(void)
{
        RACCT_UNLOCK();
}
1408
/*
 * Per-racct callback used by rctl_rule_remove(): removes the rules
 * matching the filter (arg2) from the racct and adds the number
 * removed to the int counter that arg3 points to.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
        struct rctl_rule *filter = arg2;
        int *foundp = arg3;

        ASSERT_RACCT_ENABLED();
        RACCT_LOCK_ASSERT();

        *foundp += rctl_racct_remove_rules(racct, filter);
}
1422
1423 /*
1424  * Remove all rules that match the filter.
1425  */
1426 int
1427 rctl_rule_remove(struct rctl_rule *filter)
1428 {
1429         struct proc *p;
1430         int found = 0;
1431
1432         ASSERT_RACCT_ENABLED();
1433
1434         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1435             filter->rr_subject.rs_proc != NULL) {
1436                 p = filter->rr_subject.rs_proc;
1437                 RACCT_LOCK();
1438                 found = rctl_racct_remove_rules(p->p_racct, filter);
1439                 RACCT_UNLOCK();
1440                 if (found)
1441                         return (0);
1442                 return (ESRCH);
1443         }
1444
1445         loginclass_racct_foreach(rctl_rule_remove_callback,
1446             rctl_rule_pre_callback, rctl_rule_post_callback,
1447             filter, (void *)&found);
1448         ui_racct_foreach(rctl_rule_remove_callback,
1449             rctl_rule_pre_callback, rctl_rule_post_callback,
1450             filter, (void *)&found);
1451         prison_racct_foreach(rctl_rule_remove_callback,
1452             rctl_rule_pre_callback, rctl_rule_post_callback,
1453             filter, (void *)&found);
1454
1455         sx_assert(&allproc_lock, SA_LOCKED);
1456         RACCT_LOCK();
1457         FOREACH_PROC_IN_SYSTEM(p) {
1458                 found += rctl_racct_remove_rules(p->p_racct, filter);
1459         }
1460         RACCT_UNLOCK();
1461
1462         if (found)
1463                 return (0);
1464         return (ESRCH);
1465 }
1466
1467 /*
1468  * Appends a rule to the sbuf.
1469  */
1470 static void
1471 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1472 {
1473         int64_t amount;
1474
1475         ASSERT_RACCT_ENABLED();
1476
1477         sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1478
1479         switch (rule->rr_subject_type) {
1480         case RCTL_SUBJECT_TYPE_PROCESS:
1481                 if (rule->rr_subject.rs_proc == NULL)
1482                         sbuf_printf(sb, ":");
1483                 else
1484                         sbuf_printf(sb, "%d:",
1485                             rule->rr_subject.rs_proc->p_pid);
1486                 break;
1487         case RCTL_SUBJECT_TYPE_USER:
1488                 if (rule->rr_subject.rs_uip == NULL)
1489                         sbuf_printf(sb, ":");
1490                 else
1491                         sbuf_printf(sb, "%d:",
1492                             rule->rr_subject.rs_uip->ui_uid);
1493                 break;
1494         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1495                 if (rule->rr_subject.rs_loginclass == NULL)
1496                         sbuf_printf(sb, ":");
1497                 else
1498                         sbuf_printf(sb, "%s:",
1499                             rule->rr_subject.rs_loginclass->lc_name);
1500                 break;
1501         case RCTL_SUBJECT_TYPE_JAIL:
1502                 if (rule->rr_subject.rs_prison_racct == NULL)
1503                         sbuf_printf(sb, ":");
1504                 else
1505                         sbuf_printf(sb, "%s:",
1506                             rule->rr_subject.rs_prison_racct->prr_name);
1507                 break;
1508         default:
1509                 panic("rctl_rule_to_sbuf: unknown subject type %d",
1510                     rule->rr_subject_type);
1511         }
1512
1513         amount = rule->rr_amount;
1514         if (amount != RCTL_AMOUNT_UNDEFINED &&
1515             RACCT_IS_IN_MILLIONS(rule->rr_resource))
1516                 amount /= 1000000;
1517
1518         sbuf_printf(sb, "%s:%s=%jd",
1519             rctl_resource_name(rule->rr_resource),
1520             rctl_action_name(rule->rr_action),
1521             amount);
1522
1523         if (rule->rr_per != rule->rr_subject_type)
1524                 sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1525 }
1526
1527 /*
1528  * Routine used by RCTL syscalls to read in input string.
1529  */
1530 static int
1531 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1532 {
1533         char *str;
1534         int error;
1535
1536         ASSERT_RACCT_ENABLED();
1537
1538         if (inbuflen <= 0)
1539                 return (EINVAL);
1540         if (inbuflen > RCTL_MAX_INBUFSIZE)
1541                 return (E2BIG);
1542
1543         str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1544         error = copyinstr(inbufp, str, inbuflen, NULL);
1545         if (error != 0) {
1546                 free(str, M_RCTL);
1547                 return (error);
1548         }
1549
1550         *inputstr = str;
1551
1552         return (0);
1553 }
1554
/*
 * Copy the output string of an RCTL syscall out to userspace and
 * dispose of the sbuf that holds it.
 *
 * A NULL sbuf is accepted and yields 0.  Otherwise the sbuf is
 * finished; ERANGE is returned if the user buffer cannot hold the
 * string plus its terminating NUL, else the result of copyout().  The
 * sbuf is deleted on every path where it was non-NULL.
 */
static int
rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
{
        ssize_t needed;
        int error;

        ASSERT_RACCT_ENABLED();

        if (outputsbuf == NULL)
                return (0);

        sbuf_finish(outputsbuf);

        /* Room for the string itself and the trailing NUL. */
        needed = sbuf_len(outputsbuf) + 1;
        if (outbuflen < needed)
                error = ERANGE;
        else
                error = copyout(sbuf_data(outputsbuf), outbufp, needed);

        sbuf_delete(outputsbuf);
        return (error);
}
1578
1579 static struct sbuf *
1580 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1581 {
1582         struct sbuf *sb;
1583         int64_t amount;
1584         int i;
1585
1586         ASSERT_RACCT_ENABLED();
1587
1588         sb = sbuf_new_auto();
1589         for (i = 0; i <= RACCT_MAX; i++) {
1590                 if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1591                         continue;
1592                 RACCT_LOCK();
1593                 amount = racct->r_resources[i];
1594                 RACCT_UNLOCK();
1595                 if (RACCT_IS_IN_MILLIONS(i))
1596                         amount /= 1000000;
1597                 sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1598         }
1599         sbuf_setpos(sb, sbuf_len(sb) - 1);
1600         return (sb);
1601 }
1602
1603 int
1604 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1605 {
1606         struct rctl_rule *filter;
1607         struct sbuf *outputsbuf = NULL;
1608         struct proc *p;
1609         struct uidinfo *uip;
1610         struct loginclass *lc;
1611         struct prison_racct *prr;
1612         char *inputstr;
1613         int error;
1614
1615         if (!racct_enable)
1616                 return (ENOSYS);
1617
1618         error = priv_check(td, PRIV_RCTL_GET_RACCT);
1619         if (error != 0)
1620                 return (error);
1621
1622         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1623         if (error != 0)
1624                 return (error);
1625
1626         sx_slock(&allproc_lock);
1627         error = rctl_string_to_rule(inputstr, &filter);
1628         free(inputstr, M_RCTL);
1629         if (error != 0) {
1630                 sx_sunlock(&allproc_lock);
1631                 return (error);
1632         }
1633
1634         switch (filter->rr_subject_type) {
1635         case RCTL_SUBJECT_TYPE_PROCESS:
1636                 p = filter->rr_subject.rs_proc;
1637                 if (p == NULL) {
1638                         error = EINVAL;
1639                         goto out;
1640                 }
1641                 outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1642                 break;
1643         case RCTL_SUBJECT_TYPE_USER:
1644                 uip = filter->rr_subject.rs_uip;
1645                 if (uip == NULL) {
1646                         error = EINVAL;
1647                         goto out;
1648                 }
1649                 outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1650                 break;
1651         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1652                 lc = filter->rr_subject.rs_loginclass;
1653                 if (lc == NULL) {
1654                         error = EINVAL;
1655                         goto out;
1656                 }
1657                 outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1658                 break;
1659         case RCTL_SUBJECT_TYPE_JAIL:
1660                 prr = filter->rr_subject.rs_prison_racct;
1661                 if (prr == NULL) {
1662                         error = EINVAL;
1663                         goto out;
1664                 }
1665                 outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1666                 break;
1667         default:
1668                 error = EINVAL;
1669         }
1670 out:
1671         rctl_rule_release(filter);
1672         sx_sunlock(&allproc_lock);
1673         if (error != 0)
1674                 return (error);
1675
1676         error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1677
1678         return (error);
1679 }
1680
1681 static void
1682 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1683 {
1684         struct rctl_rule *filter = (struct rctl_rule *)arg2;
1685         struct rctl_rule_link *link;
1686         struct sbuf *sb = (struct sbuf *)arg3;
1687
1688         ASSERT_RACCT_ENABLED();
1689         RACCT_LOCK_ASSERT();
1690
1691         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1692                 if (!rctl_rule_matches(link->rrl_rule, filter))
1693                         continue;
1694                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1695                 sbuf_printf(sb, ",");
1696         }
1697 }
1698
1699 int
1700 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1701 {
1702         struct sbuf *sb;
1703         struct rctl_rule *filter;
1704         struct rctl_rule_link *link;
1705         struct proc *p;
1706         char *inputstr, *buf;
1707         size_t bufsize;
1708         int error;
1709
1710         if (!racct_enable)
1711                 return (ENOSYS);
1712
1713         error = priv_check(td, PRIV_RCTL_GET_RULES);
1714         if (error != 0)
1715                 return (error);
1716
1717         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1718         if (error != 0)
1719                 return (error);
1720
1721         sx_slock(&allproc_lock);
1722         error = rctl_string_to_rule(inputstr, &filter);
1723         free(inputstr, M_RCTL);
1724         if (error != 0) {
1725                 sx_sunlock(&allproc_lock);
1726                 return (error);
1727         }
1728
1729         bufsize = uap->outbuflen;
1730         if (bufsize > rctl_maxbufsize) {
1731                 sx_sunlock(&allproc_lock);
1732                 return (E2BIG);
1733         }
1734
1735         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1736         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1737         KASSERT(sb != NULL, ("sbuf_new failed"));
1738
1739         FOREACH_PROC_IN_SYSTEM(p) {
1740                 RACCT_LOCK();
1741                 LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1742                         /*
1743                          * Non-process rules will be added to the buffer later.
1744                          * Adding them here would result in duplicated output.
1745                          */
1746                         if (link->rrl_rule->rr_subject_type !=
1747                             RCTL_SUBJECT_TYPE_PROCESS)
1748                                 continue;
1749                         if (!rctl_rule_matches(link->rrl_rule, filter))
1750                                 continue;
1751                         rctl_rule_to_sbuf(sb, link->rrl_rule);
1752                         sbuf_printf(sb, ",");
1753                 }
1754                 RACCT_UNLOCK();
1755         }
1756
1757         loginclass_racct_foreach(rctl_get_rules_callback,
1758             rctl_rule_pre_callback, rctl_rule_post_callback,
1759             filter, sb);
1760         ui_racct_foreach(rctl_get_rules_callback,
1761             rctl_rule_pre_callback, rctl_rule_post_callback,
1762             filter, sb);
1763         prison_racct_foreach(rctl_get_rules_callback,
1764             rctl_rule_pre_callback, rctl_rule_post_callback,
1765             filter, sb);
1766         if (sbuf_error(sb) == ENOMEM) {
1767                 error = ERANGE;
1768                 goto out;
1769         }
1770
1771         /*
1772          * Remove trailing ",".
1773          */
1774         if (sbuf_len(sb) > 0)
1775                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1776
1777         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1778 out:
1779         rctl_rule_release(filter);
1780         sx_sunlock(&allproc_lock);
1781         free(buf, M_RCTL);
1782         return (error);
1783 }
1784
1785 int
1786 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1787 {
1788         struct sbuf *sb;
1789         struct rctl_rule *filter;
1790         struct rctl_rule_link *link;
1791         char *inputstr, *buf;
1792         size_t bufsize;
1793         int error;
1794
1795         if (!racct_enable)
1796                 return (ENOSYS);
1797
1798         error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1799         if (error != 0)
1800                 return (error);
1801
1802         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1803         if (error != 0)
1804                 return (error);
1805
1806         sx_slock(&allproc_lock);
1807         error = rctl_string_to_rule(inputstr, &filter);
1808         free(inputstr, M_RCTL);
1809         if (error != 0) {
1810                 sx_sunlock(&allproc_lock);
1811                 return (error);
1812         }
1813
1814         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1815                 rctl_rule_release(filter);
1816                 sx_sunlock(&allproc_lock);
1817                 return (EINVAL);
1818         }
1819         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1820                 rctl_rule_release(filter);
1821                 sx_sunlock(&allproc_lock);
1822                 return (EOPNOTSUPP);
1823         }
1824         if (filter->rr_subject.rs_proc == NULL) {
1825                 rctl_rule_release(filter);
1826                 sx_sunlock(&allproc_lock);
1827                 return (EINVAL);
1828         }
1829
1830         bufsize = uap->outbuflen;
1831         if (bufsize > rctl_maxbufsize) {
1832                 rctl_rule_release(filter);
1833                 sx_sunlock(&allproc_lock);
1834                 return (E2BIG);
1835         }
1836
1837         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1838         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1839         KASSERT(sb != NULL, ("sbuf_new failed"));
1840
1841         RACCT_LOCK();
1842         LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1843             rrl_next) {
1844                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1845                 sbuf_printf(sb, ",");
1846         }
1847         RACCT_UNLOCK();
1848         if (sbuf_error(sb) == ENOMEM) {
1849                 error = ERANGE;
1850                 sbuf_delete(sb);
1851                 goto out;
1852         }
1853
1854         /*
1855          * Remove trailing ",".
1856          */
1857         if (sbuf_len(sb) > 0)
1858                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1859
1860         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1861 out:
1862         rctl_rule_release(filter);
1863         sx_sunlock(&allproc_lock);
1864         free(buf, M_RCTL);
1865         return (error);
1866 }
1867
1868 int
1869 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1870 {
1871         struct rctl_rule *rule;
1872         char *inputstr;
1873         int error;
1874
1875         if (!racct_enable)
1876                 return (ENOSYS);
1877
1878         error = priv_check(td, PRIV_RCTL_ADD_RULE);
1879         if (error != 0)
1880                 return (error);
1881
1882         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1883         if (error != 0)
1884                 return (error);
1885
1886         sx_slock(&allproc_lock);
1887         error = rctl_string_to_rule(inputstr, &rule);
1888         free(inputstr, M_RCTL);
1889         if (error != 0) {
1890                 sx_sunlock(&allproc_lock);
1891                 return (error);
1892         }
1893         /*
1894          * The 'per' part of a rule is optional.
1895          */
1896         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1897             rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1898                 rule->rr_per = rule->rr_subject_type;
1899
1900         if (!rctl_rule_fully_specified(rule)) {
1901                 error = EINVAL;
1902                 goto out;
1903         }
1904
1905         error = rctl_rule_add(rule);
1906
1907 out:
1908         rctl_rule_release(rule);
1909         sx_sunlock(&allproc_lock);
1910         return (error);
1911 }
1912
1913 int
1914 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1915 {
1916         struct rctl_rule *filter;
1917         char *inputstr;
1918         int error;
1919
1920         if (!racct_enable)
1921                 return (ENOSYS);
1922
1923         error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1924         if (error != 0)
1925                 return (error);
1926
1927         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1928         if (error != 0)
1929                 return (error);
1930
1931         sx_slock(&allproc_lock);
1932         error = rctl_string_to_rule(inputstr, &filter);
1933         free(inputstr, M_RCTL);
1934         if (error != 0) {
1935                 sx_sunlock(&allproc_lock);
1936                 return (error);
1937         }
1938
1939         error = rctl_rule_remove(filter);
1940         rctl_rule_release(filter);
1941         sx_sunlock(&allproc_lock);
1942
1943         return (error);
1944 }
1945
1946 /*
1947  * Update RCTL rule list after credential change.
1948  */
1949 void
1950 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1951 {
1952         LIST_HEAD(, rctl_rule_link) newrules;
1953         struct rctl_rule_link *link, *newlink;
1954         struct uidinfo *newuip;
1955         struct loginclass *newlc;
1956         struct prison_racct *newprr;
1957         int rulecnt, i;
1958
1959         if (!racct_enable)
1960                 return;
1961
1962         PROC_LOCK_ASSERT(p, MA_NOTOWNED);
1963
1964         newuip = newcred->cr_ruidinfo;
1965         newlc = newcred->cr_loginclass;
1966         newprr = newcred->cr_prison->pr_prison_racct;
1967
1968         LIST_INIT(&newrules);
1969
1970 again:
1971         /*
1972          * First, count the rules that apply to the process with new
1973          * credentials.
1974          */
1975         rulecnt = 0;
1976         RACCT_LOCK();
1977         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1978                 if (link->rrl_rule->rr_subject_type ==
1979                     RCTL_SUBJECT_TYPE_PROCESS)
1980                         rulecnt++;
1981         }
1982         LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1983                 rulecnt++;
1984         LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1985                 rulecnt++;
1986         LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
1987                 rulecnt++;
1988         RACCT_UNLOCK();
1989
1990         /*
1991          * Create temporary list.  We've dropped the rctl_lock in order
1992          * to use M_WAITOK.
1993          */
1994         for (i = 0; i < rulecnt; i++) {
1995                 newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
1996                 newlink->rrl_rule = NULL;
1997                 newlink->rrl_exceeded = 0;
1998                 LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
1999         }
2000
2001         newlink = LIST_FIRST(&newrules);
2002
2003         /*
2004          * Assign rules to the newly allocated list entries.
2005          */
2006         RACCT_LOCK();
2007         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2008                 if (link->rrl_rule->rr_subject_type ==
2009                     RCTL_SUBJECT_TYPE_PROCESS) {
2010                         if (newlink == NULL)
2011                                 goto goaround;
2012                         rctl_rule_acquire(link->rrl_rule);
2013                         newlink->rrl_rule = link->rrl_rule;
2014                         newlink->rrl_exceeded = link->rrl_exceeded;
2015                         newlink = LIST_NEXT(newlink, rrl_next);
2016                         rulecnt--;
2017                 }
2018         }
2019         
2020         LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2021                 if (newlink == NULL)
2022                         goto goaround;
2023                 rctl_rule_acquire(link->rrl_rule);
2024                 newlink->rrl_rule = link->rrl_rule;
2025                 newlink->rrl_exceeded = link->rrl_exceeded;
2026                 newlink = LIST_NEXT(newlink, rrl_next);
2027                 rulecnt--;
2028         }
2029
2030         LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2031                 if (newlink == NULL)
2032                         goto goaround;
2033                 rctl_rule_acquire(link->rrl_rule);
2034                 newlink->rrl_rule = link->rrl_rule;
2035                 newlink->rrl_exceeded = link->rrl_exceeded;
2036                 newlink = LIST_NEXT(newlink, rrl_next);
2037                 rulecnt--;
2038         }
2039
2040         LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2041                 if (newlink == NULL)
2042                         goto goaround;
2043                 rctl_rule_acquire(link->rrl_rule);
2044                 newlink->rrl_rule = link->rrl_rule;
2045                 newlink->rrl_exceeded = link->rrl_exceeded;
2046                 newlink = LIST_NEXT(newlink, rrl_next);
2047                 rulecnt--;
2048         }
2049
2050         if (rulecnt == 0) {
2051                 /*
2052                  * Free the old rule list.
2053                  */
2054                 while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2055                         link = LIST_FIRST(&p->p_racct->r_rule_links);
2056                         LIST_REMOVE(link, rrl_next);
2057                         rctl_rule_release(link->rrl_rule);
2058                         uma_zfree(rctl_rule_link_zone, link);
2059                 }
2060
2061                 /*
2062                  * Replace lists and we're done.
2063                  *
2064                  * XXX: Is there any way to switch list heads instead
2065                  *      of iterating here?
2066                  */
2067                 while (!LIST_EMPTY(&newrules)) {
2068                         newlink = LIST_FIRST(&newrules);
2069                         LIST_REMOVE(newlink, rrl_next);
2070                         LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2071                             newlink, rrl_next);
2072                 }
2073
2074                 RACCT_UNLOCK();
2075
2076                 return;
2077         }
2078
2079 goaround:
2080         RACCT_UNLOCK();
2081
2082         /*
2083          * Rule list changed while we were not holding the rctl_lock.
2084          * Free the new list and try again.
2085          */
2086         while (!LIST_EMPTY(&newrules)) {
2087                 newlink = LIST_FIRST(&newrules);
2088                 LIST_REMOVE(newlink, rrl_next);
2089                 if (newlink->rrl_rule != NULL)
2090                         rctl_rule_release(newlink->rrl_rule);
2091                 uma_zfree(rctl_rule_link_zone, newlink);
2092         }
2093
2094         goto again;
2095 }
2096
2097 /*
2098  * Assign RCTL rules to the newly created process.
2099  */
2100 int
2101 rctl_proc_fork(struct proc *parent, struct proc *child)
2102 {
2103         struct rctl_rule *rule;
2104         struct rctl_rule_link *link;
2105         int error;
2106
2107         ASSERT_RACCT_ENABLED();
2108         RACCT_LOCK_ASSERT();
2109         KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2110
2111         LIST_INIT(&child->p_racct->r_rule_links);
2112
2113         /*
2114          * Go through limits applicable to the parent and assign them
2115          * to the child.  Rules with 'process' subject have to be duplicated
2116          * in order to make their rr_subject point to the new process.
2117          */
2118         LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2119                 if (link->rrl_rule->rr_subject_type ==
2120                     RCTL_SUBJECT_TYPE_PROCESS) {
2121                         rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2122                         if (rule == NULL)
2123                                 goto fail;
2124                         KASSERT(rule->rr_subject.rs_proc == parent,
2125                             ("rule->rr_subject.rs_proc != parent"));
2126                         rule->rr_subject.rs_proc = child;
2127                         error = rctl_racct_add_rule_locked(child->p_racct,
2128                             rule);
2129                         rctl_rule_release(rule);
2130                         if (error != 0)
2131                                 goto fail;
2132                 } else {
2133                         error = rctl_racct_add_rule_locked(child->p_racct,
2134                             link->rrl_rule);
2135                         if (error != 0)
2136                                 goto fail;
2137                 }
2138         }
2139
2140         return (0);
2141
2142 fail:
2143         while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2144                 link = LIST_FIRST(&child->p_racct->r_rule_links);
2145                 LIST_REMOVE(link, rrl_next);
2146                 rctl_rule_release(link->rrl_rule);
2147                 uma_zfree(rctl_rule_link_zone, link);
2148         }
2149
2150         return (EAGAIN);
2151 }
2152
2153 /*
2154  * Release rules attached to the racct.
2155  */
2156 void
2157 rctl_racct_release(struct racct *racct)
2158 {
2159         struct rctl_rule_link *link;
2160
2161         ASSERT_RACCT_ENABLED();
2162         RACCT_LOCK_ASSERT();
2163
2164         while (!LIST_EMPTY(&racct->r_rule_links)) {
2165                 link = LIST_FIRST(&racct->r_rule_links);
2166                 LIST_REMOVE(link, rrl_next);
2167                 rctl_rule_release(link->rrl_rule);
2168                 uma_zfree(rctl_rule_link_zone, link);
2169         }
2170 }
2171
2172 static void
2173 rctl_init(void)
2174 {
2175
2176         if (!racct_enable)
2177                 return;
2178
2179         rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2180             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2181         rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2182             sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2183             UMA_ALIGN_PTR, 0);
2184
2185         /*
2186          * Set default values, making sure not to overwrite the ones
2187          * fetched from tunables.  Most of those could be set at the
2188          * declaration, except for the rctl_throttle_max - we cannot
2189          * set it there due to hz not being compile time constant.
2190          */
2191         if (rctl_throttle_min < 1)
2192                 rctl_throttle_min = 1;
2193         if (rctl_throttle_max < rctl_throttle_min)
2194                 rctl_throttle_max = 2 * hz;
2195         if (rctl_throttle_pct < 0)
2196                 rctl_throttle_pct = 100;
2197         if (rctl_throttle_pct2 < 0)
2198                 rctl_throttle_pct2 = 100;
2199 }
2200
2201 #else /* !RCTL */
2202
2203 int
2204 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
2205 {
2206         
2207         return (ENOSYS);
2208 }
2209
2210 int
2211 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
2212 {
2213         
2214         return (ENOSYS);
2215 }
2216
2217 int
2218 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
2219 {
2220         
2221         return (ENOSYS);
2222 }
2223
2224 int
2225 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
2226 {
2227         
2228         return (ENOSYS);
2229 }
2230
2231 int
2232 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
2233 {
2234         
2235         return (ENOSYS);
2236 }
2237
2238 #endif /* !RCTL */