2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2020 Alexander V. Chernikov
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
31 #include "opt_inet6.h"
32 #include "opt_route.h"
34 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/malloc.h>
39 #include <sys/socket.h>
40 #include <sys/sysctl.h>
41 #include <sys/syslog.h>
42 #include <sys/sysproto.h>
44 #include <sys/domain.h>
45 #include <sys/kernel.h>
47 #include <sys/rmlock.h>
50 #include <net/if_var.h>
51 #include <net/if_dl.h>
52 #include <net/route.h>
53 #include <net/route/route_ctl.h>
54 #include <net/route/route_var.h>
55 #include <net/route/nhop_utils.h>
56 #include <net/route/nhop.h>
57 #include <net/route/nhop_var.h>
59 #include <netinet/in_fib.h>
62 #include <netinet6/in6_fib.h>
63 #include <netinet6/in6_var.h>
67 #define DEBUG_MOD_NAME rt_helpers
68 #define DEBUG_MAX_LEVEL LOG_DEBUG2
69 #include <net/route/route_debug.h>
70 _DECLARE_DEBUG(LOG_INFO);
73 * RIB helper functions.
77 rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
78 rib_walk_hook_f_t *hook_f, void *arg)
81 hook_f(rnh, RIB_WALK_HOOK_PRE, arg);
82 rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
84 hook_f(rnh, RIB_WALK_HOOK_POST, arg);
88 * Calls @wa_f with @arg for each entry in the table specified by
91 * @ss_t callback is called before and after the tree traversal
92 * while holding table lock.
94 * Table is traversed under read lock unless @wlock is set.
97 rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f,
98 rib_walk_hook_f_t *hook_f, void *arg)
106 rib_walk_ext_locked(rnh, wa_f, hook_f, arg);
114 rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
115 rib_walk_hook_f_t *hook_f, void *arg)
117 struct rib_head *rnh;
119 if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
120 rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg);
124 * Calls @wa_f with @arg for each entry in the table specified by
127 * Table is traversed under read lock unless @wlock is set.
130 rib_walk(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
134 rib_walk_ext(fibnum, family, wlock, wa_f, NULL, arg);
138 * Calls @wa_f with @arg for each entry in the table matching @prefix/@mask.
140 * The following flags are supported:
141 * RIB_FLAG_WLOCK: acquire exclusive lock
142 * RIB_FLAG_LOCKED: Assumes the table is already locked & skip locking
144 * By default, table is traversed under read lock.
147 rib_walk_from(uint32_t fibnum, int family, uint32_t flags, struct sockaddr *prefix,
148 struct sockaddr *mask, rib_walktree_f_t *wa_f, void *arg)
151 struct rib_head *rnh = rt_tables_get_rnh(fibnum, family);
156 if (flags & RIB_FLAG_WLOCK)
158 else if (!(flags & RIB_FLAG_LOCKED))
161 rnh->rnh_walktree_from(&rnh->head, prefix, mask, (walktree_f_t *)wa_f, arg);
163 if (flags & RIB_FLAG_WLOCK)
165 else if (!(flags & RIB_FLAG_LOCKED))
170 * Iterates over all existing fibs in system calling
171 * @hook_f function before/after traversing each fib.
172 * Calls @wa_f function for each element in current fib.
173 * If af is not AF_UNSPEC, iterates over fibs in particular
177 rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f,
178 rib_walk_hook_f_t *hook_f, void *arg)
181 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
182 /* Do we want some specific family? */
183 if (family != AF_UNSPEC) {
184 rib_walk_ext(fibnum, family, wlock, wa_f, hook_f, arg);
188 for (int i = 1; i <= AF_MAX; i++)
189 rib_walk_ext(fibnum, i, wlock, wa_f, hook_f, arg);
194 * Iterates over all existing fibs in system and deletes each element
195 * for which @filter_f function returns non-zero value.
196 * If @family is not AF_UNSPEC, iterates over fibs in particular
200 rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg)
203 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
204 /* Do we want some specific family? */
205 if (family != AF_UNSPEC) {
206 rib_walk_del(fibnum, family, filter_f, arg, 0);
210 for (int i = 1; i <= AF_MAX; i++)
211 rib_walk_del(fibnum, i, filter_f, arg, 0);
217 * Wrapper for the control plane functions for performing af-agnostic
219 * @fibnum: fib to perform the lookup.
220 * @dst: sockaddr with family and addr filled in. IPv6 addresses needs to be in
222 * @flags: fib(9) flags.
223 * @flowid: flow id for path selection in multipath use case.
225 * Returns nhop_object or NULL.
227 * Requires NET_EPOCH.
231 rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags,
234 struct nhop_object *nh;
238 switch (dst->sa_family) {
242 const struct sockaddr_in *a = (const struct sockaddr_in *)dst;
243 nh = fib4_lookup(fibnum, a->sin_addr, 0, flags, flowid);
250 const struct sockaddr_in6 *a = (const struct sockaddr_in6*)dst;
251 nh = fib6_lookup(fibnum, &a->sin6_addr, a->sin6_scope_id,
263 notify_add(struct rib_cmd_info *rc, const struct weightened_nhop *wn_src,
264 route_notification_t *cb, void *cbdata) {
265 rc->rc_nh_new = wn_src->nh;
266 rc->rc_nh_weight = wn_src->weight;
267 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2
268 char nhbuf[NHOP_PRINT_BUFSIZE];
269 FIB_NH_LOG(LOG_DEBUG2, wn_src->nh, "RTM_ADD for %s @ w=%u",
270 nhop_print_buf(wn_src->nh, nhbuf, sizeof(nhbuf)), wn_src->weight);
276 notify_del(struct rib_cmd_info *rc, const struct weightened_nhop *wn_src,
277 route_notification_t *cb, void *cbdata) {
278 rc->rc_nh_old = wn_src->nh;
279 rc->rc_nh_weight = wn_src->weight;
280 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2
281 char nhbuf[NHOP_PRINT_BUFSIZE];
282 FIB_NH_LOG(LOG_DEBUG2, wn_src->nh, "RTM_DEL for %s @ w=%u",
283 nhop_print_buf(wn_src->nh, nhbuf, sizeof(nhbuf)), wn_src->weight);
289 decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb,
292 uint32_t num_old, num_new;
293 struct weightened_nhop *wn_old, *wn_new;
294 struct weightened_nhop tmp = { NULL, 0 };
295 uint32_t idx_old = 0, idx_new = 0;
297 struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt };
298 struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt };
300 if (NH_IS_NHGRP(rc->rc_nh_old)) {
301 wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old);
303 tmp.nh = rc->rc_nh_old;
304 tmp.weight = rc->rc_nh_weight;
308 if (NH_IS_NHGRP(rc->rc_nh_new)) {
309 wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new);
311 tmp.nh = rc->rc_nh_new;
312 tmp.weight = rc->rc_nh_weight;
316 #if DEBUG_MAX_LEVEL >= LOG_DEBUG
318 char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
319 nhop_print_buf_any(rc->rc_nh_old, buf_old, NHOP_PRINT_BUFSIZE);
320 nhop_print_buf_any(rc->rc_nh_new, buf_new, NHOP_PRINT_BUFSIZE);
321 FIB_NH_LOG(LOG_DEBUG, wn_old[0].nh, "change %s -> %s", buf_old, buf_new);
325 /* Use the fact that each @wn array is sorted */
327 * Here we have one (or two) multipath groups and transition
328 * between them needs to be reported to the caller, using series
329 * of primitive (RTM_DEL, RTM_ADD) operations.
331 * Leverage the fact that each nexthop group has its nexthops sorted
333 * [1] -> [1, 2] = A{2}
334 * [1, 2] -> [1] = D{2}
335 * [1, 2, 4] -> [1, 3, 4] = D{2}, A{3}
336 * [1, 2] -> [3, 4] = D{1}, D{2}, A{3}, A{4]
338 while ((idx_old < num_old) && (idx_new < num_new)) {
339 uint32_t nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx;
340 uint32_t nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx;
342 if (nh_idx_old == nh_idx_new) {
343 if (wn_old[idx_old].weight != wn_new[idx_new].weight) {
344 /* Update weight by providing del/add notifications */
345 notify_del(&rc_del, &wn_old[idx_old], cb, cbdata);
346 notify_add(&rc_add, &wn_new[idx_new], cb, cbdata);
350 } else if (nh_idx_old < nh_idx_new) {
351 /* [1, ~2~, 4], [1, ~3~, 4] */
352 notify_del(&rc_del, &wn_old[idx_old], cb, cbdata);
355 /* nh_idx_old > nh_idx_new. */
356 notify_add(&rc_add, &wn_new[idx_new], cb, cbdata);
361 while (idx_old < num_old) {
362 notify_del(&rc_del, &wn_old[idx_old], cb, cbdata);
366 while (idx_new < num_new) {
367 notify_add(&rc_add, &wn_new[idx_new], cb, cbdata);
373 * Decompose multipath cmd info @rc into a list of add/del/change
374 * single-path operations, calling @cb callback for each operation.
375 * Assumes at least one of the nexthops in @rc is multipath.
378 rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb,
381 struct weightened_nhop *wn;
383 struct rib_cmd_info rc_new;
386 switch (rc->rc_cmd) {
388 if (!NH_IS_NHGRP(rc->rc_nh_new))
390 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops);
391 for (uint32_t i = 0; i < num_nhops; i++) {
392 notify_add(&rc_new, &wn[i], cb, cbdata);
396 if (!NH_IS_NHGRP(rc->rc_nh_old))
398 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops);
399 for (uint32_t i = 0; i < num_nhops; i++) {
400 notify_del(&rc_new, &wn[i], cb, cbdata);
404 if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new))
406 decompose_change_notification(rc, cb, cbdata);
414 * Checks if the found key in the trie contains (<=) a prefix covering
416 * Returns the most specific rtentry matching the condition or NULL.
418 static struct rtentry *
419 get_inet_parent_prefix(uint32_t fibnum, struct in_addr addr, int plen)
421 struct route_nhop_data rnd;
423 struct in_addr addr4;
426 struct radix_node *rn;
428 rt = fib4_lookup_rt(fibnum, addr, 0, NHR_UNLOCKED, &rnd);
432 rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid);
433 if (parent_plen <= plen)
437 * There can be multiple prefixes associated with the found key:
438 * 10.0.0.0 -> 10.0.0.0/24, 10.0.0.0/23, 10.0.0.0/22, etc.
439 * All such prefixes are linked via rn_dupedkey, from most specific
440 * to least specific. Iterate over them to check if any of these
441 * prefixes are wider than desired plen.
443 rn = (struct radix_node *)rt;
444 while ((rn = rn_nextprefix(rn)) != NULL) {
446 rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid);
447 if (parent_plen <= plen)
455 * Returns the most specific prefix containing (>) @paddr/plen.
458 rt_get_inet_parent(uint32_t fibnum, struct in_addr addr, int plen)
460 struct in_addr lookup_addr = { .s_addr = INADDR_BROADCAST };
461 struct in_addr addr4 = addr;
462 struct in_addr mask4;
466 /* Calculate wider mask & new key to lookup */
467 mask4.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0);
468 addr4.s_addr = htonl(ntohl(addr4.s_addr) & ntohl(mask4.s_addr));
469 if (addr4.s_addr == lookup_addr.s_addr) {
470 /* Skip lookup if the key is the same */
475 rt = get_inet_parent_prefix(fibnum, lookup_addr, plen);
486 * Checks if the found key in the trie contains (<=) a prefix covering
488 * Returns the most specific rtentry matching the condition or NULL.
490 static struct rtentry *
491 get_inet6_parent_prefix(uint32_t fibnum, const struct in6_addr *paddr, int plen)
493 struct route_nhop_data rnd;
495 struct in6_addr addr6;
498 struct radix_node *rn;
500 rt = fib6_lookup_rt(fibnum, paddr, 0, NHR_UNLOCKED, &rnd);
504 rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid);
505 if (parent_plen <= plen)
509 * There can be multiple prefixes associated with the found key:
510 * 2001:db8:1::/64 -> 2001:db8:1::/56, 2001:db8:1::/48, etc.
511 * All such prefixes are linked via rn_dupedkey, from most specific
512 * to least specific. Iterate over them to check if any of these
513 * prefixes are wider than desired plen.
515 rn = (struct radix_node *)rt;
516 while ((rn = rn_nextprefix(rn)) != NULL) {
518 rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid);
519 if (parent_plen <= plen)
527 ipv6_writemask(struct in6_addr *addr6, uint8_t mask)
531 for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32)
534 *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0);
538 * Returns the most specific prefix containing (>) @paddr/plen.
541 rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr, int plen)
543 struct in6_addr lookup_addr = in6mask128;
544 struct in6_addr addr6 = *paddr;
545 struct in6_addr mask6;
549 /* Calculate wider mask & new key to lookup */
550 ipv6_writemask(&mask6, plen);
551 IN6_MASK_ADDR(&addr6, &mask6);
552 if (IN6_ARE_ADDR_EQUAL(&addr6, &lookup_addr)) {
553 /* Skip lookup if the key is the same */
558 rt = get_inet6_parent_prefix(fibnum, &lookup_addr, plen);
568 * Prints rtentry @rt data in the provided @buf.
569 * Example: rt/192.168.0.0/24
572 rt_print_buf(const struct rtentry *rt, char *buf, size_t bufsize)
574 char abuf[INET6_ADDRSTRLEN];
578 switch (rt_get_family(rt)) {
582 struct in_addr addr4;
583 rt_get_inet_prefix_plen(rt, &addr4, &plen, &scopeid);
584 inet_ntop(AF_INET, &addr4, abuf, sizeof(abuf));
585 snprintf(buf, bufsize, "rt/%s/%d", abuf, plen);
592 struct in6_addr addr6;
593 rt_get_inet6_prefix_plen(rt, &addr6, &plen, &scopeid);
594 inet_ntop(AF_INET6, &addr6, abuf, sizeof(abuf));
595 snprintf(buf, bufsize, "rt/%s/%d", abuf, plen);
600 snprintf(buf, bufsize, "rt/unknown_af#%d", rt_get_family(rt));
608 rib_print_cmd(int rib_cmd)
614 return ("RTM_CHANGE");
616 return ("RTM_DELETE");