2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2020 Alexander V. Chernikov
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
31 #include "opt_inet6.h"
32 #include "opt_route.h"
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
43 #include <sys/rmlock.h>
46 #include <net/if_var.h>
47 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <net/route/route_ctl.h>
51 #include <net/route/route_var.h>
52 #include <net/route/nhop_utils.h>
53 #include <net/route/nhop.h>
54 #include <net/route/nhop_var.h>
55 #include <netinet/in.h>
56 #include <netinet6/scope6_var.h>
57 #include <netinet6/in6_var.h>
59 #define DEBUG_MOD_NAME route_ctl
60 #define DEBUG_MAX_LEVEL LOG_DEBUG
61 #include <net/route/route_debug.h>
62 _DECLARE_DEBUG(LOG_INFO);
65 * This file contains the control-plane routing table functions.
67 * All functions assume they are called within the network epoch.
/*
 * Storage large enough for either an IPv4 or an IPv6 sockaddr.
 * Used below as on-stack netmask storage, accessed via a ".sa" member
 * that is not visible here — NOTE(review): this listing appears
 * truncated (the generic member and closing brace are missing).
 */
70 union sockaddr_union {
72 struct sockaddr_in sin;
73 struct sockaddr_in6 sin6;
77 static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
78 struct rib_cmd_info *rc);
79 static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
80 struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
81 struct rib_cmd_info *rc);
83 static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
84 struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
86 static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
87 struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
88 int op_flags, struct rib_cmd_info *rc);
91 static int add_route(struct rib_head *rnh, struct rtentry *rt,
92 struct route_nhop_data *rnd, struct rib_cmd_info *rc);
93 static int delete_route(struct rib_head *rnh, struct rtentry *rt,
94 struct rib_cmd_info *rc);
95 static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
96 int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);
98 static int get_prio_from_info(const struct rt_addrinfo *info);
99 static int nhop_get_prio(const struct nhop_object *nh);
102 static bool rib_can_multipath(struct rib_head *rh);
105 /* Per-vnet multipath routing configuration */
106 SYSCTL_DECL(_net_route);
107 #define V_rib_route_multipath VNET(rib_route_multipath)
/*
 * NOTE(review): two conflicting _MP_FLAGS defines follow — the listing
 * appears to have lost an intervening #ifdef/#else (presumably keyed on
 * ROUTE_MPATH, making the knob read-only when multipath is compiled out).
 */
109 #define _MP_FLAGS CTLFLAG_RW
111 #define _MP_FLAGS CTLFLAG_RD
113 VNET_DEFINE(u_int, rib_route_multipath) = 1;
114 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
115 &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
/* Set to 1 once the first multipath route is installed (see add_route_flags_mpath()). */
119 VNET_DEFINE(u_int, fib_hash_outbound) = 0;
120 SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
121 &VNET_NAME(fib_hash_outbound), 0,
122 "Compute flowid for locally-originated packets");
124 /* Default entropy to add to the hash calculation for the outbound connections */
125 uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
126 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
127 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
128 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
129 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
130 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
/* RFC 5549 support: allow IPv4 routes with IPv6 nexthops. */
134 #if defined(INET) && defined(INET6)
135 FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
136 #define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
137 VNET_DEFINE(u_int, rib_route_ipv6_nexthop) = 1;
138 SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
139 &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
143 SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
/*
 * Resolves the routing table head for @fibnum and the address family
 * of info->rti_info[RTAX_DST] via rt_tables_get_rnh().
 * NOTE(review): listing appears truncated — the tail returning @rnh is
 * not visible here.
 */
145 static struct rib_head *
146 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
148 struct rib_head *rnh;
149 struct sockaddr *dst;
151 KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
153 dst = info->rti_info[RTAX_DST];
154 rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
159 #if defined(INET) && defined(INET6)
/* Reads the per-vnet rib_route_ipv6_nexthop sysctl under the rib's vnet. */
161 rib_can_ipv6_nexthop_address(struct rib_head *rh)
165 CURVNET_SET(rh->rib_vnet);
166 result = !!V_rib_route_ipv6_nexthop;
/* Reads the per-vnet rib_route_multipath sysctl under the rib's vnet. */
175 rib_can_multipath(struct rib_head *rh)
179 CURVNET_SET(rh->rib_vnet);
180 result = !!V_rib_route_multipath;
187 * Check if nhop is multipath-eligible.
188 * Avoid nhops without gateways and redirects.
190 * Returns 1 for multipath-eligible nexthop,
194 nhop_can_multipath(const struct nhop_object *nh)
197 if ((nh->nh_flags & NHF_MULTIPATH) != 0)
199 if ((nh->nh_flags & NHF_GATEWAY) == 0)
201 if ((nh->nh_flags & NHF_REDIRECT) != 0)
/*
 * Returns the weight from @info if RTV_WEIGHT is set, otherwise
 * @default_weight. Clamps to RT_MAX_WEIGHT and maps an explicit 0
 * back to the default.
 */
209 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
213 if (info->rti_mflags & RTV_WEIGHT)
214 weight = info->rti_rmx->rmx_weight;
216 weight = default_weight;
217 /* Keep upper 1 byte for adm distance purposes */
218 if (weight > RT_MAX_WEIGHT)
219 weight = RT_MAX_WEIGHT;
220 else if (weight == 0)
221 weight = default_weight;
227 * File-local concept for distinguishing between the normal and
228 * RTF_PINNED routes that can override the "normal" one.
230 #define NH_PRIORITY_HIGH 2
231 #define NH_PRIORITY_NORMAL 1
/* Maps RTF_PINNED in @info to the high route priority. */
233 get_prio_from_info(const struct rt_addrinfo *info)
235 if (info->rti_flags & RTF_PINNED)
236 return (NH_PRIORITY_HIGH);
237 return (NH_PRIORITY_NORMAL);
/* Maps a pinned nexthop to the high route priority. */
241 nhop_get_prio(const struct nhop_object *nh)
243 if (NH_IS_PINNED(nh))
244 return (NH_PRIORITY_HIGH);
245 return (NH_PRIORITY_NORMAL);
249 * Check if specified @gw matches gw data in the nexthop @nh.
251 * Returns true if matches, false otherwise.
254 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
257 if (nh->gw_sa.sa_family != gw->sa_family)
260 switch (gw->sa_family) {
/* AF_INET: compare the 32-bit addresses directly. */
262 return (nh->gw4_sa.sin_addr.s_addr ==
263 ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
266 const struct sockaddr_in6 *gw6;
267 gw6 = (const struct sockaddr_in6 *)gw;
270 * Currently (2020-09) IPv6 gws in kernel have their
271 * scope embedded. Once this becomes false, this code
272 * has to be revisited.
274 if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
/* AF_LINK: the interface index identifies the gateway. */
281 const struct sockaddr_dl *sdl;
282 sdl = (const struct sockaddr_dl *)gw;
283 return (nh->gwl_sa.sdl_index == sdl->sdl_index);
/* Other families: fall back to a raw sockaddr comparison. */
286 return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
294 * Matches all nexthops with given @gw.
295 * Can be used as rib_filter_f callback.
298 rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
300 const struct sockaddr *gw = (const struct sockaddr *)gw_sa;
302 return (match_nhop_gw(nh, gw));
/* State for match_gw_one(): the gateway to match plus a match counter. */
305 struct gw_filter_data {
306 const struct sockaddr *gw;
311 * Matches first occurrence of the gateway provided in @gwd
314 match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
316 struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
318 /* Return only first match to make rtsock happy */
319 if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
325 * Checks if data in @info matches nexthop @nh.
327 * Returns 0 on success,
328 * ESRCH if not matched,
329 * ENOENT if filter function returned false
332 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
333 const struct nhop_object *nh)
335 const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
/* Caller-supplied filter takes precedence over the gateway match. */
337 if (info->rti_filter != NULL) {
338 if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
343 if ((gw != NULL) && !match_nhop_gw(nh, gw))
350 * Runs exact prefix match based on @dst and @netmask.
351 * Returns matched @rtentry if found or NULL.
352 * If rtentry was found, saves nexthop / weight value into @rnd.
354 static struct rtentry *
355 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
356 const struct sockaddr *netmask, struct route_nhop_data *rnd)
360 RIB_LOCK_ASSERT(rnh);
362 rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
364 rnd->rnd_nhop = rt->rt_nhop;
365 rnd->rnd_weight = rt->rt_weight;
367 rnd->rnd_nhop = NULL;
/* Same as lookup_prefix_bysa(), keyed by @rt's own dst/netmask. */
375 lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
376 struct route_nhop_data *rnd)
378 return (lookup_prefix_bysa(rnh, rt_key_const(rt), rt_mask_const(rt), rnd));
382 * Runs exact prefix match based on dst/netmask from @info.
383 * Assumes RIB lock is held.
384 * Returns matched @rtentry if found or NULL.
385 * If rtentry was found, saves nexthop / weight value into @rnd.
388 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
389 struct route_nhop_data *rnd)
393 rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
394 info->rti_info[RTAX_NETMASK], rnd);
/*
 * Writes a @plen-bit netmask for @family into *@pmask and masks @_dst
 * in place so that dst/netmask stay consistent.
 * NOTE(review): listing appears truncated — the branches taken for
 * host routes (plen special value) and the return values are not
 * fully visible here.
 */
400 fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
401 struct sockaddr **pmask)
412 struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
413 struct sockaddr_in *dst= (struct sockaddr_in *)_dst;
415 memset(mask, 0, sizeof(*mask));
416 mask->sin_family = family;
417 mask->sin_len = sizeof(*mask);
420 else if (plen > 32 || plen < 0)
/* Build the contiguous mask in network byte order and apply it. */
423 uint32_t daddr, maddr;
424 maddr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0);
425 mask->sin_addr.s_addr = maddr;
426 daddr = dst->sin_addr.s_addr;
427 daddr = htonl(ntohl(daddr) & ntohl(maddr));
428 dst->sin_addr.s_addr = daddr;
437 struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
438 struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;
440 memset(mask, 0, sizeof(*mask));
441 mask->sin6_family = family;
442 mask->sin6_len = sizeof(*mask);
445 else if (plen > 128 || plen < 0)
448 ip6_writemask(&mask->sin6_addr, plen);
449 IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
460 * Attempts to add @dst/plen prefix with nexthop/nexthop-group data @rnd
461 * to the routing table.
463 * @fibnum: rtable id to insert route to
464 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
465 * @plen: prefix length (or -1 if host route or not applicable for AF)
466 * @op_flags: combination of RTM_F_ flags
467 * @rc: storage to report operation result
469 * Returns 0 on success.
472 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
473 struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
475 union sockaddr_union mask_storage;
476 struct sockaddr *netmask = &mask_storage.sa;
477 struct rtentry *rt = NULL;
481 bzero(rc, sizeof(struct rib_cmd_info));
482 rc->rc_cmd = RTM_ADD;
484 struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
486 return (EAFNOSUPPORT);
/* Derive the netmask from @plen and mask @dst accordingly. */
488 if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
489 FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
/* Pre-allocate the rtentry only when creation is requested. */
493 if (op_flags & RTM_F_CREATE) {
494 if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
495 FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
500 return (add_route_flags(rnh, rt, rnd, op_flags, rc));
504 * Attempts to delete @dst/plen prefix matching gateway @gw from the
507 * @fibnum: rtable id to remove route from
508 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
509 * @plen: prefix length (or -1 if host route or not applicable for AF)
510 * @gw: gateway to match
511 * @op_flags: combination of RTM_F_ flags
512 * @rc: storage to report operation result
514 * Returns 0 on success.
517 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
518 const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
/* Thin wrapper: delegate to rib_del_route_px() with a first-match gw filter. */
520 struct gw_filter_data gwd = { .gw = gw };
522 return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
526 * Attempts to delete @dst/plen prefix matching @filter_func from the
529 * @fibnum: rtable id to remove route from
530 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
531 * @plen: prefix length (or -1 if host route or not applicable for AF)
532 * @filter_func: func to be called for each nexthop of the prefix for matching
533 * @filter_arg: argument to pass to @filter_func
534 * @op_flags: combination of RTM_F_ flags
535 * @rc: storage to report operation result
537 * Returns 0 on success.
540 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
541 rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
542 struct rib_cmd_info *rc)
544 union sockaddr_union mask_storage;
545 struct sockaddr *netmask = &mask_storage.sa;
550 bzero(rc, sizeof(struct rib_cmd_info));
551 rc->rc_cmd = RTM_DELETE;
553 struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
555 return (EAFNOSUPPORT);
557 if (dst->sa_len > sizeof(mask_storage)) {
558 FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
562 if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
563 FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
/* RTM_F_FORCE allows removal of RTF_PINNED (high-priority) paths. */
567 int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
570 struct route_nhop_data rnd;
571 struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
573 error = rt_delete_conditional(rnh, rt, prio, filter_func,
582 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
584 if (rc->rc_cmd == RTM_DELETE)
589 * Deleting 1 path may result in RTM_CHANGE to
590 * a different mpath group/nhop.
591 * Free old mpath group.
593 nhop_free_any(rc->rc_nh_old);
601 * Tries to copy route @rt from one rtable to the rtable specified by @dst_rh.
602 * @rt: route to copy.
603 * @rnd_src: nhop and weight. Multipath routes are not supported
604 * @rh_dst: target rtable.
605 * @rc: operation result storage
607 * Return 0 on success.
610 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
611 struct rib_head *rh_dst, struct rib_cmd_info *rc)
613 struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
616 MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
618 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2
619 char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
620 nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
621 rt_print_buf(rt, rtbuf, sizeof(rtbuf));
622 FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
623 rtbuf, nhbuf, nhop_get_fibnum(nh_src));
/* Clone the source nexthop, re-homed to the destination fib. */
625 struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
627 FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
630 nhop_copy(nh, rnd_src->rnd_nhop);
631 nhop_set_fibnum(nh, rh_dst->rib_fibnum);
632 nh = nhop_get_nhop_internal(rh_dst, nh, &error);
634 FIB_RH_LOG(LOG_INFO, rh_dst,
635 "unable to finalize new nexthop: error %d", error);
639 struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
640 if (rt_new == NULL) {
641 FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
646 struct route_nhop_data rnd = {
648 .rnd_weight = rnd_src->rnd_weight
/* Pinned source routes may override an existing normal-priority route. */
650 int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
651 error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
654 #if DEBUG_MAX_LEVEL >= LOG_DEBUG
655 char buf[NHOP_PRINT_BUFSIZE];
656 rt_print_buf(rt_new, buf, sizeof(buf));
657 FIB_RH_LOG(LOG_DEBUG, rh_dst, "Unable to add route %s: error %d", buf, error);
660 rt_free_immediate(rt_new);
666 * Adds route defined by @info into the kernel table specified by @fibnum and
667 * sa_family in @info->rti_info[RTAX_DST].
669 * Returns 0 on success and fills in operation metadata into @rc.
672 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
673 struct rib_cmd_info *rc)
675 struct rib_head *rnh;
680 rnh = get_rnh(fibnum, info);
682 return (EAFNOSUPPORT);
685 * Check consistency between RTF_HOST flag and netmask
688 if (info->rti_flags & RTF_HOST)
689 info->rti_info[RTAX_NETMASK] = NULL;
690 else if (info->rti_info[RTAX_NETMASK] == NULL) {
691 FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
695 bzero(rc, sizeof(struct rib_cmd_info));
696 rc->rc_cmd = RTM_ADD;
698 error = add_route_byinfo(rnh, info, rc);
/* Delayed (out-of-lock) notification to RIB subscribers. */
700 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
706 * Checks if @dst and @gateway is a valid combination.
708 * Returns true if is valid, false otherwise.
711 check_gateway(struct rib_head *rnh, struct sockaddr *dst,
712 struct sockaddr *gateway)
714 if (dst->sa_family == gateway->sa_family)
716 else if (gateway->sa_family == AF_UNSPEC)
718 else if (gateway->sa_family == AF_LINK)
/* RFC 5549 case: IPv4 destination with IPv6 nexthop, if enabled by sysctl. */
720 #if defined(INET) && defined(INET6)
721 else if (dst->sa_family == AF_INET && gateway->sa_family == AF_INET6 &&
722 rib_can_ipv6_nexthop_address(rnh))
/*
 * Validates @info, allocates an rtentry and a nexthop from it and hands
 * the pair to add_route_flags(). Helper for rib_add_route().
 */
730 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
731 struct rib_cmd_info *rc)
733 struct route_nhop_data rnd_add;
734 struct nhop_object *nh;
736 struct sockaddr *dst, *gateway, *netmask;
739 dst = info->rti_info[RTAX_DST];
740 gateway = info->rti_info[RTAX_GATEWAY];
741 netmask = info->rti_info[RTAX_NETMASK];
743 if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
744 FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
747 if (dst && gateway && !check_gateway(rnh, dst, gateway)) {
748 FIB_RH_LOG(LOG_DEBUG, rnh,
749 "error: invalid dst/gateway family combination (%d, %d)",
750 dst->sa_family, gateway->sa_family);
754 if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
755 FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
/* Resolve the outgoing ifaddr if the caller did not supply one. */
760 if (info->rti_ifa == NULL) {
761 error = rt_getifa_fib(info, rnh->rib_fibnum);
766 if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
769 error = nhop_create_from_info(rnh, info, &nh);
771 rt_free_immediate(rt);
775 rnd_add.rnd_nhop = nh;
776 rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
/* RTF_PINNED routes get the high priority (RTM_F_FORCE). */
778 int op_flags = RTM_F_CREATE;
779 if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
780 op_flags |= RTM_F_FORCE;
782 op_flags |= RTM_F_APPEND;
783 return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
/*
 * Core add/replace/append logic, driven by RTM_F_ @op_flags:
 * no existing prefix -> add (if RTM_F_CREATE); existing prefix ->
 * fail (RTM_F_EXCL), replace (RTM_F_REPLACE, unless the old path has
 * higher priority) or try a multipath append (RTM_F_APPEND).
 * Consumes the @rnd_add nexthop reference and, on failure paths,
 * frees @rt when it was caller-allocated for RTM_F_CREATE.
 */
788 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
789 int op_flags, struct rib_cmd_info *rc)
791 struct route_nhop_data rnd_orig;
792 struct nhop_object *nh;
793 struct rtentry *rt_orig;
796 nh = rnd_add->rnd_nhop;
800 rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
802 if (rt_orig == NULL) {
803 if (op_flags & RTM_F_CREATE)
804 error = add_route(rnh, rt, rnd_add, rc);
806 error = ESRCH; /* no entry but creation was not required */
813 if (op_flags & RTM_F_EXCL) {
814 /* We have existing route in the RIB but not allowed to replace. */
820 /* Now either append or replace */
821 if (op_flags & RTM_F_REPLACE) {
822 if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
823 /* Old path is "better" (e.g. has PINNED flag set) */
827 change_route(rnh, rt_orig, rnd_add, rc);
/* Multipath append: both old and new nexthops must be eligible. */
836 if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
837 nhop_can_multipath(rnd_add->rnd_nhop) &&
838 nhop_can_multipath(rnd_orig.rnd_nhop)) {
/* Retry on EAGAIN: the original nhop data may change under us. */
840 for (int i = 0; i < RIB_MAX_RETRIES; i++) {
841 error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
845 RTSTAT_INC(rts_add_retry);
849 * Original nhop reference is unused in any case.
851 nhop_free_any(rnd_add->rnd_nhop);
852 if (op_flags & RTM_F_CREATE) {
853 if (error != 0 || rc->rc_cmd != RTM_ADD)
854 rt_free_immediate(rt);
859 /* Out of options - free state and return error */
862 if (op_flags & RTM_F_CREATE)
863 rt_free_immediate(rt);
/*
 * Appends @rnd_add to the existing paths in @rnd_orig by building a new
 * nexthop group and conditionally swapping it in. Returns EAGAIN (after
 * refreshing @rnd_orig) when the group build raced with a deletion, so
 * the caller can retry.
 */
871 add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
872 struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
873 int op_flags, struct rib_cmd_info *rc)
876 struct route_nhop_data rnd_new;
879 error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
881 if (error == EAGAIN) {
883 * Group creation failed, most probably because
884 * @rnd_orig data got scheduled for deletion.
885 * Refresh @rnd_orig data and retry.
888 lookup_prefix_rt(rnh, rt, rnd_orig);
890 if (rnd_orig == NULL && !(op_flags & RTM_F_CREATE)) {
891 /* In this iteration route doesn't exist */
897 error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
/* Lazily enable outbound flowid hashing once multipath is in use. */
901 if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
903 * First multipath route got installed. Enable local
904 * outbound connections hashing.
907 printf("FIB: enabled flowid calculation for locally-originated packets\n");
908 V_fib_hash_outbound = 1;
916 * Removes route defined by @info from the kernel table specified by @fibnum and
917 * sa_family in @info->rti_info[RTAX_DST].
919 * Returns 0 on success and fills in operation metadata into @rc.
922 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
924 struct rib_head *rnh;
925 struct sockaddr *dst, *netmask;
926 struct sockaddr_storage mdst;
931 rnh = get_rnh(fibnum, info);
933 return (EAFNOSUPPORT);
935 bzero(rc, sizeof(struct rib_cmd_info));
936 rc->rc_cmd = RTM_DELETE;
938 dst = info->rti_info[RTAX_DST];
939 netmask = info->rti_info[RTAX_NETMASK];
941 if (netmask != NULL) {
942 /* Ensure @dst is always properly masked */
943 if (dst->sa_len > sizeof(mdst)) {
944 FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
947 rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
948 dst = (struct sockaddr *)&mdst;
/* Prefer the caller-supplied filter; else match by gateway if given. */
951 rib_filter_f_t *filter_func = NULL;
952 void *filter_arg = NULL;
953 struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
955 if (info->rti_filter != NULL) {
956 filter_func = info->rti_filter;
957 filter_arg = info->rti_filterdata;
958 } else if (gwd.gw != NULL) {
959 filter_func = match_gw_one;
963 int prio = get_prio_from_info(info);
966 struct route_nhop_data rnd;
967 struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
969 error = rt_delete_conditional(rnh, rt, prio, filter_func,
978 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
980 if (rc->rc_cmd == RTM_DELETE)
985 * Deleting 1 path may result in RTM_CHANGE to
986 * a different mpath group/nhop.
987 * Free old mpath group.
989 nhop_free_any(rc->rc_nh_old);
997 * Conditionally unlinks rtentry paths from @rnh matching @cb.
998 * Returns 0 on success with operation result stored in @rc.
1000 * ESRCH - if prefix was not found or filter function failed to match
1001 * EADDRINUSE - if trying to delete higher priority route.
1004 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
1005 int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
1007 struct nhop_object *nh = rt->rt_nhop;
/*
 * Multipath case: build a group with the matching paths filtered out
 * and swap it in; identical group means nothing matched.
 */
1010 if (NH_IS_NHGRP(nh)) {
1011 struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
1012 struct route_nhop_data rnd;
1017 error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
1019 if (rnd.rnd_nhgrp == nhg) {
1020 /* No match, unreference new group and return. */
1021 nhop_free_any(rnd.rnd_nhop);
1024 error = change_route(rnh, rt, &rnd, rc);
/* Single-nexthop case: filter, check priority, then delete outright. */
1029 if (cb != NULL && !cb(rt, nh, cbdata))
1032 if (prio < nhop_get_prio(nh))
1033 return (EADDRINUSE);
1035 return (delete_route(rnh, rt, rc));
/*
 * Changes the route described by @info in table @fibnum.
 * Snapshot the current nhop/weight, then retry change_route_byinfo()
 * up to RIB_MAX_RETRIES times since the operation drops and reacquires
 * the lock between steps.
 */
1039 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1040 struct rib_cmd_info *rc)
1043 struct route_nhop_data rnd_orig;
1044 struct rib_head *rnh;
1050 rnh = get_rnh(fibnum, info);
1052 return (EAFNOSUPPORT);
1054 bzero(rc, sizeof(struct rib_cmd_info));
1055 rc->rc_cmd = RTM_CHANGE;
1057 /* Check if updated gateway exists */
1058 if ((info->rti_flags & RTF_GATEWAY) &&
1059 (info->rti_info[RTAX_GATEWAY] == NULL)) {
1062 * route(8) adds RTF_GATEWAY flag if -interface is not set.
1063 * Remove RTF_GATEWAY to enforce consistency and maintain
1066 info->rti_flags &= ~RTF_GATEWAY;
1070 * route change is done in multiple steps, with dropping and
1071 * reacquiring lock. In the situations with multiple processes
1072 * changes the same route in can lead to the case when route
1073 * is changed between the steps. Address it by retrying the operation
1074 * multiple times before failing.
1078 rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1079 info->rti_info[RTAX_NETMASK], &rnh->head);
1086 rnd_orig.rnd_nhop = rt->rt_nhop;
1087 rnd_orig.rnd_weight = rt->rt_weight;
1091 for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1092 error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1093 if (error != EAGAIN)
/*
 * Creates a new nexthop @nh_new derived from @nh_orig with the changes
 * requested in @info, re-resolving the ifaddr/ifp when the gateway,
 * interface or address information changed.
 */
1101 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1102 struct nhop_object *nh_orig, struct nhop_object **nh_new)
1107 * New gateway could require new ifaddr, ifp;
1108 * flags may also be different; ifp may be specified
1109 * by ll sockaddr when protocol address is ambiguous
1111 if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1112 info->rti_info[RTAX_GATEWAY] != NULL) ||
1113 info->rti_info[RTAX_IFP] != NULL ||
1114 (info->rti_info[RTAX_IFA] != NULL &&
1115 !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1116 error = rt_getifa_fib(info, rnh->rib_fibnum);
1119 info->rti_ifa = NULL;
1124 error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1125 info->rti_ifa = NULL;
/*
 * Handles RTM_CHANGE for a multipath route: finds the single member
 * nexthop matching @info, derives a changed copy of it, builds a new
 * group with that member substituted and conditionally swaps it in.
 */
1132 change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
1133 struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1134 struct rib_cmd_info *rc)
1136 int error = 0, found_idx = 0;
1137 struct nhop_object *nh_orig = NULL, *nh_new;
1138 struct route_nhop_data rnd_new = {};
1139 const struct weightened_nhop *wn = NULL;
1140 struct weightened_nhop *wn_new;
/* Locate the group member @info refers to. */
1143 wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
1144 for (int i = 0; i < num_nhops; i++) {
1145 if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
1152 if (nh_orig == NULL)
1155 error = change_nhop(rnh, info, nh_orig, &nh_new);
/* Copy the member list and substitute the changed nexthop. */
1159 wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
1160 M_TEMP, M_NOWAIT | M_ZERO);
1161 if (wn_new == NULL) {
1166 memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
1167 wn_new[found_idx].nh = nh_new;
1168 wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);
1170 error = nhgrp_get_group(rnh, wn_new, num_nhops, 0, &rnd_new.rnd_nhgrp);
1172 free(wn_new, M_TEMP);
1177 error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
/*
 * One attempt at changing @rt per @info. Dispatches multipath routes to
 * change_mpath_route(); otherwise derives a new nexthop/weight and
 * conditionally swaps it in (EAGAIN on race, retried by the caller).
 */
1184 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1185 struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1186 struct rib_cmd_info *rc)
1189 struct nhop_object *nh_orig;
1190 struct route_nhop_data rnd_new;
1192 nh_orig = rnd_orig->rnd_nhop;
1193 if (nh_orig == NULL)
1197 if (NH_IS_NHGRP(nh_orig))
1198 return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1201 rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1202 error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1205 error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1211 * Insert @rt with nhop data from @rnd_new to @rnh.
1212 * Returns 0 on success and stores operation results in @rc.
1215 add_route(struct rib_head *rnh, struct rtentry *rt,
1216 struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1218 struct radix_node *rn;
1220 RIB_WLOCK_ASSERT(rnh);
1222 rt->rt_nhop = rnd->rnd_nhop;
1223 rt->rt_weight = rnd->rnd_weight;
1224 rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
/* Track expiring (temporal) routes for garbage collection. */
1227 if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1228 tmproutes_update(rnh, rt, rnd->rnd_nhop);
1230 /* Finalize notification */
1232 rnh->rnh_prefixes++;
1234 rc->rc_cmd = RTM_ADD;
1236 rc->rc_nh_old = NULL;
1237 rc->rc_nh_new = rnd->rnd_nhop;
1238 rc->rc_nh_weight = rnd->rnd_weight;
1240 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1244 /* Existing route or memory allocation failure. */
1249 * Unconditionally deletes @rt from @rnh.
1252 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1254 RIB_WLOCK_ASSERT(rnh);
1256 /* Route deletion requested. */
1257 struct radix_node *rn;
1259 rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
/* Mark the entry down; rc carries the old nexthop for later release. */
1263 rt->rte_flags &= ~RTF_UP;
1266 rnh->rnh_prefixes--;
1268 rc->rc_cmd = RTM_DELETE;
1270 rc->rc_nh_old = rt->rt_nhop;
1271 rc->rc_nh_new = NULL;
1272 rc->rc_nh_weight = rt->rt_weight;
1274 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1280 * Switch @rt nhop/weight to the ones specified in @rnd.
1281 * Returns 0 on success.
1284 change_route(struct rib_head *rnh, struct rtentry *rt,
1285 struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1287 struct nhop_object *nh_orig;
1289 RIB_WLOCK_ASSERT(rnh);
1291 nh_orig = rt->rt_nhop;
/* A NULL new nexthop means the path was filtered out entirely. */
1293 if (rnd->rnd_nhop == NULL)
1294 return (delete_route(rnh, rt, rc));
1296 /* Changing nexthop & weight to a new one */
1297 rt->rt_nhop = rnd->rnd_nhop;
1298 rt->rt_weight = rnd->rnd_weight;
1299 if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1300 tmproutes_update(rnh, rt, rnd->rnd_nhop);
1302 /* Finalize notification */
1304 rc->rc_cmd = RTM_CHANGE;
1306 rc->rc_nh_old = nh_orig;
1307 rc->rc_nh_new = rnd->rnd_nhop;
1308 rc->rc_nh_weight = rnd->rnd_weight;
1310 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1316 * Conditionally update route nhop/weight IFF data in @nhd_orig is
1317 * consistent with the current route data.
1318 * Nexthop in @nhd_new is consumed.
1321 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1322 struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1323 struct rib_cmd_info *rc)
1325 struct rtentry *rt_new;
1328 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2
1330 char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1331 nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1332 nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1333 FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1334 "trying change %s -> %s", buf_old, buf_new);
/* Re-read the current state under the lock before acting on it. */
1339 struct route_nhop_data rnd;
1340 rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1342 if (rt_new == NULL) {
1343 if (rnd_orig->rnd_nhop == NULL)
1344 error = add_route(rnh, rt, rnd_new, rc);
1347 * Prefix does not exist, which was not our assumption.
1348 * Update @rnd_orig with the new data and return
1350 rnd_orig->rnd_nhop = NULL;
1351 rnd_orig->rnd_weight = 0;
1355 /* Prefix exists, try to update */
1356 if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1358 * Nhop/mpath group hasn't changed. Flip
1359 * to the new precalculated one and return
1361 error = change_route(rnh, rt_new, rnd_new, rc);
1363 /* Update and retry */
1364 rnd_orig->rnd_nhop = rt_new->rt_nhop;
1365 rnd_orig->rnd_weight = rt_new->rt_weight;
1373 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
/* Release whichever nexthop reference ended up unused. */
1375 if (rnd_orig->rnd_nhop != NULL)
1376 nhop_free_any(rnd_orig->rnd_nhop);
1379 if (rnd_new->rnd_nhop != NULL)
1380 nhop_free_any(rnd_new->rnd_nhop);
1387 * Performs modification of routing table specified by @action.
1388 * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1389 * Needs to be run in network epoch.
1391 * Returns 0 on success and fills in @rc with action result.
1394 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1395 struct rib_cmd_info *rc)
/* Dispatch on RTM_ADD / RTM_DELETE / RTM_CHANGE. */
1401 error = rib_add_route(fibnum, info, rc);
1404 error = rib_del_route(fibnum, info, rc);
1407 error = rib_change_route(fibnum, info, rc);
/*
 * Walker state for rib_walk_del(): table, filter, chain of unlinked
 * rtentries awaiting GC, and per-step command info.
 * NOTE(review): the struct's opening line and some members (e.g. the
 * filter argument) are not visible in this listing.
 */
1418 struct rib_head *rnh;
1419 struct rtentry *head;
1420 rib_filter_f_t *filter_f;
1423 struct rib_cmd_info rc;
1427 * Conditionally unlinks rtentries or paths from radix tree based
1428 * on the callback data passed in @arg.
1431 rt_checkdelroute(struct radix_node *rn, void *arg)
1433 struct rt_delinfo *di = (struct rt_delinfo *)arg;
1434 struct rtentry *rt = (struct rtentry *)rn;
1436 if (rt_delete_conditional(di->rnh, rt, di->prio,
1437 di->filter_f, di->filter_arg, &di->rc) != 0)
1441 * Add deleted rtentries to the list to GC them
1442 * after dropping the lock.
1444 * XXX: Delayed notifications not implemented
1445 * for nexthop updates.
1447 if (di->rc.rc_cmd == RTM_DELETE) {
1448 /* Add to the list and return */
1449 rt->rt_chain = di->head;
1454 * RTM_CHANGE to a different nexthop or nexthop group.
1455 * Free old multipath group.
1457 nhop_free_any(di->rc.rc_nh_old);
1465 * Iterates over a routing table specified by @fibnum and @family and
1466 * deletes elements marked by @filter_f.
1467 * @fibnum: rtable id
1468 * @family: AF_ address family
1469 * @filter_f: function returning non-zero value for items to delete
1470 * @arg: data to pass to the @filter_f function
1471 * @report: true if rtsock notification is needed.
1474 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1477 struct rib_head *rnh;
1479 struct nhop_object *nh;
1480 struct epoch_tracker et;
1482 rnh = rt_tables_get_rnh(fibnum, family);
1486 struct rt_delinfo di = {
1488 .filter_f = filter_f,
1489 .filter_arg = filter_arg,
1490 .prio = NH_PRIORITY_NORMAL,
/* Phase 1: walk the tree under lock, unlinking matching entries. */
1493 NET_EPOCH_ENTER(et);
1496 rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1499 /* We might have something to reclaim. */
1500 bzero(&di.rc, sizeof(di.rc));
1501 di.rc.rc_cmd = RTM_DELETE;
/* Phase 2: notify and free the unlinked entries outside the lock. */
1502 while (di.head != NULL) {
1504 di.head = rt->rt_chain;
1505 rt->rt_chain = NULL;
1509 di.rc.rc_nh_old = nh;
1510 rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1514 struct nhgrp_object *nhg;
1515 const struct weightened_nhop *wn;
/* For a group, emit one rtsock RTM_DELETE per member nexthop. */
1517 if (NH_IS_NHGRP(nh)) {
1518 nhg = (struct nhgrp_object *)nh;
1519 wn = nhgrp_get_nhops(nhg, &num_nhops);
1520 for (int i = 0; i < num_nhops; i++)
1521 rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1524 rt_routemsg(RTM_DELETE, rt, nh, fibnum);
/*
 * Tree-walker callback: unconditionally removes @rn's rtentry from the
 * radix tree, bypassing filters and notifications.
 */
1533 rt_delete_unconditional(struct radix_node *rn, void *arg)
1535 struct rtentry *rt = RNTORT(rn);
1536 struct rib_head *rnh = (struct rib_head *)arg;
1538 rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1539 if (RNTORT(rn) == rt)
1546 * Removes all routes from the routing table without executing notifications.
1547 * rtentries will be removed after the end of a current epoch.
1550 rib_flush_routes(struct rib_head *rnh)
1553 rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
/* Flushes every fib's table for the given address family. */
1558 rib_flush_routes_family(int family)
1560 struct rib_head *rnh;
1562 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1563 if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1564 rib_flush_routes(rnh);
1569 rib_print_family(int family)