]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/net/route/route_helpers.c
zfs: merge openzfs/zfs@7d9f3ef0e (zfs-2.1-release) into stable/13
[FreeBSD/FreeBSD.git] / sys / net / route / route_helpers.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33
34 #include <sys/param.h>
35 #include <sys/jail.h>
36 #include <sys/systm.h>
37 #include <sys/malloc.h>
38 #include <sys/mbuf.h>
39 #include <sys/socket.h>
40 #include <sys/sysctl.h>
41 #include <sys/syslog.h>
42 #include <sys/sysproto.h>
43 #include <sys/proc.h>
44 #include <sys/domain.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/rmlock.h>
48
49 #include <net/if.h>
50 #include <net/if_var.h>
51 #include <net/if_dl.h>
52 #include <net/route.h>
53 #include <net/route/route_ctl.h>
54 #include <net/route/route_var.h>
55 #include <net/route/nhop_utils.h>
56 #include <net/route/nhop.h>
57 #include <net/route/nhop_var.h>
58 #ifdef INET
59 #include <netinet/in_fib.h>
60 #endif
61 #ifdef INET6
62 #include <netinet6/in6_fib.h>
63 #endif
64 #include <net/vnet.h>
65
66 /*
67  * RIB helper functions.
68  */
69
70 void
71 rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
72     rib_walk_hook_f_t *hook_f, void *arg)
73 {
74         if (hook_f != NULL)
75                 hook_f(rnh, RIB_WALK_HOOK_PRE, arg);
76         rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
77         if (hook_f != NULL)
78                 hook_f(rnh, RIB_WALK_HOOK_POST, arg);
79 }
80
81 /*
82  * Calls @wa_f with @arg for each entry in the table specified by
83  * @af and @fibnum.
84  *
85  * @ss_t callback is called before and after the tree traversal
86  *  while holding table lock.
87  *
88  * Table is traversed under read lock unless @wlock is set.
89  */
90 void
91 rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f,
92     rib_walk_hook_f_t *hook_f, void *arg)
93 {
94         RIB_RLOCK_TRACKER;
95
96         if (wlock)
97                 RIB_WLOCK(rnh);
98         else
99                 RIB_RLOCK(rnh);
100         rib_walk_ext_locked(rnh, wa_f, hook_f, arg);
101         if (wlock)
102                 RIB_WUNLOCK(rnh);
103         else
104                 RIB_RUNLOCK(rnh);
105 }
106
107 void
108 rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
109     rib_walk_hook_f_t *hook_f, void *arg)
110 {
111         struct rib_head *rnh;
112
113         if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
114                 rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg);
115 }
116
117 /*
118  * Calls @wa_f with @arg for each entry in the table specified by
119  * @af and @fibnum.
120  *
121  * Table is traversed under read lock unless @wlock is set.
122  */
123 void
124 rib_walk(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
125     void *arg)
126 {
127
128         rib_walk_ext(fibnum, family, wlock, wa_f, NULL, arg);
129 }
130
131 /*
132  * Calls @wa_f with @arg for each entry in the table matching @prefix/@mask.
133  *
134  * The following flags are supported:
135  *  RIB_FLAG_WLOCK: acquire exclusive lock
136  *  RIB_FLAG_LOCKED: Assumes the table is already locked & skip locking
137  *
138  * By default, table is traversed under read lock.
139  */
140 void
141 rib_walk_from(uint32_t fibnum, int family, uint32_t flags, struct sockaddr *prefix,
142     struct sockaddr *mask, rib_walktree_f_t *wa_f, void *arg)
143 {
144         RIB_RLOCK_TRACKER;
145         struct rib_head *rnh = rt_tables_get_rnh(fibnum, family);
146
147         if (rnh == NULL)
148                 return;
149
150         if (flags & RIB_FLAG_WLOCK)
151                 RIB_WLOCK(rnh);
152         else if (!(flags & RIB_FLAG_LOCKED))
153                 RIB_RLOCK(rnh);
154
155         rnh->rnh_walktree_from(&rnh->head, prefix, mask, (walktree_f_t *)wa_f, arg);
156
157         if (flags & RIB_FLAG_WLOCK)
158                 RIB_WUNLOCK(rnh);
159         else if (!(flags & RIB_FLAG_LOCKED))
160                 RIB_RUNLOCK(rnh);
161 }
162
163 /*
164  * Iterates over all existing fibs in system calling
165  *  @hook_f function before/after traversing each fib.
166  *  Calls @wa_f function for each element in current fib.
167  * If af is not AF_UNSPEC, iterates over fibs in particular
168  * address family.
169  */
170 void
171 rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f,
172     rib_walk_hook_f_t *hook_f, void *arg)
173 {
174
175         for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
176                 /* Do we want some specific family? */
177                 if (family != AF_UNSPEC) {
178                         rib_walk_ext(fibnum, family, wlock, wa_f, hook_f, arg); 
179                         continue;
180                 }
181
182                 for (int i = 1; i <= AF_MAX; i++)
183                         rib_walk_ext(fibnum, i, wlock, wa_f, hook_f, arg);
184         }
185 }
186
187 /*
188  * Iterates over all existing fibs in system and deletes each element
189  *  for which @filter_f function returns non-zero value.
190  * If @family is not AF_UNSPEC, iterates over fibs in particular
191  * address family.
192  */
193 void
194 rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg)
195 {
196
197         for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
198                 /* Do we want some specific family? */
199                 if (family != AF_UNSPEC) {
200                         rib_walk_del(fibnum, family, filter_f, arg, 0);
201                         continue;
202                 }
203
204                 for (int i = 1; i <= AF_MAX; i++)
205                         rib_walk_del(fibnum, i, filter_f, arg, 0);
206         }
207 }
208
209
/*
 * Wrapper for the control plane functions for performing af-agnostic
 *  lookups.
 * @fibnum: fib to perform the lookup in.
 * @dst: sockaddr with family and address filled in. IPv6 addresses need
 *  to be in de-embedded form; the scope is taken from sin6_scope_id.
 * @flags: fib(9) flags.
 * @flowid: flow id for path selection in the multipath use case.
 *
 * Dispatches to fib4_lookup() for AF_INET and to fib6_lookup() for
 * AF_INET6, when the respective protocol is compiled in.
 *
 * Returns nhop_object or NULL.  NULL is also returned for any address
 * family that is not handled here.
 *
 * Requires NET_EPOCH.
 */
struct nhop_object *
rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags,
    uint32_t flowid)
{
#ifdef INET
        if (dst->sa_family == AF_INET) {
                const struct sockaddr_in *sin4;

                sin4 = (const struct sockaddr_in *)dst;
                return (fib4_lookup(fibnum, sin4->sin_addr, 0, flags,
                    flowid));
        }
#endif
#ifdef INET6
        if (dst->sa_family == AF_INET6) {
                const struct sockaddr_in6 *sin6;

                sin6 = (const struct sockaddr_in6 *)dst;
                return (fib6_lookup(fibnum, &sin6->sin6_addr,
                    sin6->sin6_scope_id, flags, flowid));
        }
#endif

        /* Unsupported (or not compiled in) address family. */
        return (NULL);
}
254
255 #ifdef ROUTE_MPATH
256 static void
257 decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb,
258     void *cbdata)
259 {
260         uint32_t num_old, num_new;
261         uint32_t nh_idx_old, nh_idx_new;
262         struct weightened_nhop *wn_old, *wn_new;
263         struct weightened_nhop tmp = { NULL, 0 };
264         uint32_t idx_old = 0, idx_new = 0;
265
266         struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt };
267         struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt };
268
269         if (NH_IS_NHGRP(rc->rc_nh_old)) {
270                 wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old);
271         } else {
272                 tmp.nh = rc->rc_nh_old;
273                 tmp.weight = rc->rc_nh_weight;
274                 wn_old = &tmp;
275                 num_old = 1;
276         }
277         if (NH_IS_NHGRP(rc->rc_nh_new)) {
278                 wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new);
279         } else {
280                 tmp.nh = rc->rc_nh_new;
281                 tmp.weight = rc->rc_nh_weight;
282                 wn_new = &tmp;
283                 num_new = 1;
284         }
285
286         /* Use the fact that each @wn array is sorted */
287         /*
288          * Want to convert into set of add and delete operations
289          * [1] -> [1, 2] = A{2}
290          * [2] -> [1, 2] = A{1}
291          * [1, 2, 4]->[1, 3, 4] = A{2}, D{3}
292          * [1, 2, 4]->[1, 4] = D{2}
293          * [1, 2, 4] -> [3, 4] = D{1}, C{2,3} OR C{1,3}, D{2} OR D{1},D{2},A{3}
294          * [1, 2] -> [3, 4] =
295          *
296          */
297         idx_old = 0;
298         while ((idx_old < num_old) && (idx_new < num_new)) {
299                 nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx;
300                 nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx;
301
302                 if (nh_idx_old == nh_idx_new) {
303                         if (wn_old[idx_old].weight != wn_new[idx_new].weight) {
304                                 /* Update weight by providing del/add notifications */
305                                 rc_del.rc_nh_old = wn_old[idx_old].nh;
306                                 rc_del.rc_nh_weight = wn_old[idx_old].weight;
307                                 cb(&rc_del, cbdata);
308
309                                 rc_add.rc_nh_new = wn_new[idx_new].nh;
310                                 rc_add.rc_nh_weight = wn_new[idx_new].weight;
311                                 cb(&rc_add, cbdata);
312                         }
313                         idx_old++;
314                         idx_new++;
315                 } else if (nh_idx_old < nh_idx_new) {
316                         /*
317                          * [1, ~2~, 4], [1, ~3~, 4]
318                          * [1, ~2~, 5], [1, ~3~, 4]
319                          * [1, ~2~], [1, ~3~, 4]
320                          */
321                         if ((idx_old + 1 >= num_old) ||
322                             (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) {
323                                 /* Add new unless the next old item is still <= new */
324                                 rc_add.rc_nh_new = wn_new[idx_new].nh;
325                                 rc_add.rc_nh_weight = wn_new[idx_new].weight;
326                                 cb(&rc_add, cbdata);
327                                 idx_new++;
328                         }
329                         /* In any case, delete current old */
330                         rc_del.rc_nh_old = wn_old[idx_old].nh;
331                         rc_del.rc_nh_weight = wn_old[idx_old].weight;
332                         cb(&rc_del, cbdata);
333                         idx_old++;
334                 } else {
335                         /*
336                          * nh_idx_old > nh_idx_new
337                          *
338                          * [1, ~3~, 4], [1, ~2~, 4]
339                          * [1, ~3~, 5], [1, ~2~, 4]
340                          * [1, ~3~, 4], [1, ~2~]
341                          */
342                         if ((idx_new + 1 >= num_new) ||
343                             (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) {
344                                 /* No next item or next item is > current one */
345                                 rc_add.rc_nh_new = wn_new[idx_new].nh;
346                                 rc_add.rc_nh_weight = wn_new[idx_new].weight;
347                                 cb(&rc_add, cbdata);
348                                 idx_new++;
349                         }
350                         /* In any case, delete current old */
351                         rc_del.rc_nh_old = wn_old[idx_old].nh;
352                         rc_del.rc_nh_weight = wn_old[idx_old].weight;
353                         cb(&rc_del, cbdata);
354                         idx_old++;
355                 }
356         }
357
358         while (idx_old < num_old) {
359                 rc_del.rc_nh_old = wn_old[idx_old].nh;
360                 rc_del.rc_nh_weight = wn_old[idx_old].weight;
361                 cb(&rc_del, cbdata);
362                 idx_old++;
363         }
364
365         while (idx_new < num_new) {
366                 rc_add.rc_nh_new = wn_new[idx_new].nh;
367                 rc_add.rc_nh_weight = wn_new[idx_new].weight;
368                 cb(&rc_add, cbdata);
369                 idx_new++;
370         }
371 }
372
373 /*
374  * Decompose multipath cmd info @rc into a list of add/del/change
375  *  single-path operations, calling @cb callback for each operation.
376  * Assumes at least one of the nexthops in @rc is multipath.
377  */
378 void
379 rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb,
380     void *cbdata)
381 {
382         struct weightened_nhop *wn;
383         uint32_t num_nhops;
384         struct rib_cmd_info rc_new;
385
386         rc_new = *rc;
387         DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p",
388             cb, rc->cmd, rc->nh_old, rc->nh_new);
389         switch (rc->rc_cmd) {
390         case RTM_ADD:
391                 if (!NH_IS_NHGRP(rc->rc_nh_new))
392                         return;
393                 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops);
394                 for (uint32_t i = 0; i < num_nhops; i++) {
395                         rc_new.rc_nh_new = wn[i].nh;
396                         rc_new.rc_nh_weight = wn[i].weight;
397                         cb(&rc_new, cbdata);
398                 }
399                 break;
400         case RTM_DELETE:
401                 if (!NH_IS_NHGRP(rc->rc_nh_old))
402                         return;
403                 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops);
404                 for (uint32_t i = 0; i < num_nhops; i++) {
405                         rc_new.rc_nh_old = wn[i].nh;
406                         rc_new.rc_nh_weight = wn[i].weight;
407                         cb(&rc_new, cbdata);
408                 }
409                 break;
410         case RTM_CHANGE:
411                 if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new))
412                         return;
413                 decompose_change_notification(rc, cb, cbdata);
414                 break;
415         }
416 }
417 #endif