2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2020 Alexander V. Chernikov
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
31 #include "opt_inet6.h"
32 #include "opt_route.h"
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
43 #include <sys/rmlock.h>
46 #include <net/if_var.h>
47 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <net/route/route_ctl.h>
51 #include <net/route/route_var.h>
52 #include <net/route/nhop_utils.h>
53 #include <net/route/nhop.h>
54 #include <net/route/nhop_var.h>
55 #include <netinet/in.h>
56 #include <netinet6/scope6_var.h>
57 #include <netinet6/in6_var.h>
59 #define DEBUG_MOD_NAME route_ctl
60 #define DEBUG_MAX_LEVEL LOG_DEBUG
61 #include <net/route/route_debug.h>
62 _DECLARE_DEBUG(LOG_INFO);
65 * This file contains control plane routing table functions.
67 * All functions assume they are called within the net epoch.
/*
 * Stack-allocatable sockaddr storage big enough for any supported
 * address family (used below for netmask scratch space).
 * NOTE(review): lines are elided here; the original also carries a
 * generic 'struct sockaddr sa' member (referenced as '.sa' below).
 */
70 union sockaddr_union {
72 struct sockaddr_in sin;
73 struct sockaddr_in6 sin6;
/* Forward declarations for the file-local control-plane helpers. */
77 static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
78 struct rib_cmd_info *rc);
79 static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
80 struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
81 struct rib_cmd_info *rc);
83 static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
84 struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
86 static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
87 struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
88 int op_flags, struct rib_cmd_info *rc);
91 static int add_route(struct rib_head *rnh, struct rtentry *rt,
92 struct route_nhop_data *rnd, struct rib_cmd_info *rc);
93 static int delete_route(struct rib_head *rnh, struct rtentry *rt,
94 struct rib_cmd_info *rc);
95 static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
96 int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);
98 static int get_prio_from_info(const struct rt_addrinfo *info);
99 static int nhop_get_prio(const struct nhop_object *nh);
102 static bool rib_can_multipath(struct rib_head *rh);
105 /* Per-vnet multipath routing configuration */
106 SYSCTL_DECL(_net_route);
107 #define V_rib_route_multipath VNET(rib_route_multipath)
/*
 * NOTE(review): two _MP_FLAGS definitions appear back to back; the
 * elided lines are presumably an #ifdef/#else pair selecting RW vs RD
 * depending on ROUTE_MPATH support - confirm against full source.
 */
109 #define _MP_FLAGS CTLFLAG_RW
111 #define _MP_FLAGS CTLFLAG_RD
113 VNET_DEFINE(u_int, rib_route_multipath) = 1;
114 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
115 &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
/* Toggled on lazily when the first multipath route is installed. */
119 VNET_DEFINE(u_int, fib_hash_outbound) = 0;
120 SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
121 &VNET_NAME(fib_hash_outbound), 0,
122 "Compute flowid for locally-originated packets");
124 /* Default entropy to add to the hash calculation for the outbound connections */
125 uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
126 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
127 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
128 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
129 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
130 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
/* RFC 5549: allow IPv4 prefixes with IPv6 nexthops (dual-stack only). */
134 #if defined(INET) && defined(INET6)
135 FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
136 #define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
137 VNET_DEFINE(u_int, rib_route_ipv6_nexthop) = 1;
138 SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
139 &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
143 SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
/*
 * Looks up the routing table head for @fibnum and the address family
 * of info->rti_info[RTAX_DST].  Panics (KASSERT) on invalid fibnum.
 */
145 static struct rib_head *
146 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
148 struct rib_head *rnh;
149 struct sockaddr *dst;
151 KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
153 dst = info->rti_info[RTAX_DST];
154 rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
/*
 * Reads the per-vnet ipv6_nexthop sysctl under the rib's vnet context;
 * true when IPv4-over-IPv6-nexthop (RFC 5549) routes are allowed.
 */
159 #if defined(INET) && defined(INET6)
161 rib_can_ipv6_nexthop_address(struct rib_head *rh)
165 CURVNET_SET(rh->rib_vnet);
166 result = !!V_rib_route_ipv6_nexthop;
/* True if multipath routes are enabled in the rib's vnet (sysctl). */
175 rib_can_multipath(struct rib_head *rh)
179 CURVNET_SET(rh->rib_vnet);
180 result = !!V_rib_route_multipath;
187 * Check if nhop is multipath-eligible.
188 * Avoid nhops without gateways and redirects.
190 * Returns 1 for multipath-eligible nexthop,
194 nhop_can_multipath(const struct nhop_object *nh)
/* Eligible: already-multipath nhops or plain gateway nhops that are not redirects. */
197 if ((nh->nh_flags & NHF_MULTIPATH) != 0)
199 if ((nh->nh_flags & NHF_GATEWAY) == 0)
201 if ((nh->nh_flags & NHF_REDIRECT) != 0)
/*
 * Extracts route weight from @info (RTV_WEIGHT) or falls back to
 * @default_weight; clamps the result to (0, RT_MAX_WEIGHT].
 */
209 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
213 if (info->rti_mflags & RTV_WEIGHT)
214 weight = info->rti_rmx->rmx_weight;
216 weight = default_weight;
217 /* Keep upper 1 byte for adm distance purposes */
218 if (weight > RT_MAX_WEIGHT)
219 weight = RT_MAX_WEIGHT;
220 else if (weight == 0)
221 weight = default_weight;
227 * File-local concept for distinguishing between the normal and
228 * RTF_PINNED routes that can override the "normal" one.
230 #define NH_PRIORITY_HIGH 2
231 #define NH_PRIORITY_NORMAL 1
/* Maps RTF_PINNED in @info to the high file-local priority. */
233 get_prio_from_info(const struct rt_addrinfo *info)
235 if (info->rti_flags & RTF_PINNED)
236 return (NH_PRIORITY_HIGH);
237 return (NH_PRIORITY_NORMAL);
/* Returns the file-local priority of nexthop @nh (pinned => high). */
241 nhop_get_prio(const struct nhop_object *nh)
243 if (NH_IS_PINNED(nh))
244 return (NH_PRIORITY_HIGH);
245 return (NH_PRIORITY_NORMAL);
249 * Check if specified @gw matches gw data in the nexthop @nh.
251 * Returns true if matches, false otherwise.
254 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
257 if (nh->gw_sa.sa_family != gw->sa_family)
/* Per-family comparison; case labels elided by extraction. */
260 switch (gw->sa_family) {
262 return (nh->gw4_sa.sin_addr.s_addr ==
263 ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
266 const struct sockaddr_in6 *gw6;
267 gw6 = (const struct sockaddr_in6 *)gw;
270 * Currently (2020-09) IPv6 gws in kernel have their
271 * scope embedded. Once this becomes false, this code
272 * has to be revisited.
274 if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
/* AF_LINK gateways match on interface index only. */
281 const struct sockaddr_dl *sdl;
282 sdl = (const struct sockaddr_dl *)gw;
283 return (nh->gwl_sa.sdl_index == sdl->sdl_index);
/* Fallback for other families: raw sockaddr byte comparison. */
286 return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
/* Filter state for gateway-based route matching (rtsock semantics). */
293 struct gw_filter_data {
294 const struct sockaddr *gw;
/*
 * rib_filter_f_t callback: matches a path whose nexthop gateway equals
 * gwd->gw, but only the first such path (gwd->count guards repeats).
 * NOTE(review): "fulter" is the spelling used at every call site; do
 * not rename in isolation.
 */
299 gw_fulter_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
301 struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
303 /* Return only first match to make rtsock happy */
304 if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
310 * Checks if data in @info matches nexthop @nh.
312 * Returns 0 on success,
313 * ESRCH if not matched,
314 * ENOENT if filter function returned false
317 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
318 const struct nhop_object *nh)
320 const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
/* User-supplied filter takes precedence over the gateway check. */
322 if (info->rti_filter != NULL) {
323 if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
328 if ((gw != NULL) && !match_nhop_gw(nh, gw))
335 * Runs exact prefix match based on @dst and @netmask.
336 * Returns matched @rtentry if found or NULL.
337 * If rtentry was found, saves nexthop / weight value into @rnd.
339 static struct rtentry *
340 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
341 const struct sockaddr *netmask, struct route_nhop_data *rnd)
345 RIB_LOCK_ASSERT(rnh);
347 rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
/* Snapshot nhop/weight while the lock is held; NULL nhop on miss. */
349 rnd->rnd_nhop = rt->rt_nhop;
350 rnd->rnd_weight = rt->rt_weight;
352 rnd->rnd_nhop = NULL;
/* Exact-prefix lookup keyed by an existing rtentry's dst/mask. */
360 lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
361 struct route_nhop_data *rnd)
363 return (lookup_prefix_bysa(rnh, rt_key_const(rt), rt_mask_const(rt), rnd));
367 * Runs exact prefix match based on dst/netmask from @info.
368 * Assumes RIB lock is held.
369 * Returns matched @rtentry if found or NULL.
370 * If rtentry was found, saves nexthop / weight value into @rnd.
373 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
374 struct route_nhop_data *rnd)
378 rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
379 info->rti_info[RTAX_NETMASK], rnd);
/*
 * Builds a netmask for @family/@plen into *@pmask and masks @_dst in
 * place so dst is canonical for insertion.  Validates 0 <= plen <= 32
 * (IPv4) / 128 (IPv6).  Returns false on invalid plen (per callers).
 * NOTE(review): the per-family switch/return lines are elided here.
 */
385 fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
386 struct sockaddr **pmask)
397 struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
398 struct sockaddr_in *dst= (struct sockaddr_in *)_dst;
400 memset(mask, 0, sizeof(*mask));
401 mask->sin_family = family;
402 mask->sin_len = sizeof(*mask);
405 else if (plen > 32 || plen < 0)
/* IPv4: derive contiguous mask from plen, then mask dst with it. */
408 uint32_t daddr, maddr;
409 maddr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0);
410 mask->sin_addr.s_addr = maddr;
411 daddr = dst->sin_addr.s_addr;
412 daddr = htonl(ntohl(daddr) & ntohl(maddr));
413 dst->sin_addr.s_addr = daddr;
422 struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
423 struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;
425 memset(mask, 0, sizeof(*mask));
426 mask->sin6_family = family;
427 mask->sin6_len = sizeof(*mask);
430 else if (plen > 128 || plen < 0)
/* IPv6: ip6_writemask() builds the mask, IN6_MASK_ADDR applies it. */
433 ip6_writemask(&mask->sin6_addr, plen);
434 IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
445 * Attempts to add @dst/plen prefix with nexthop/nexthop-group data @rnd
446 * to the routing table.
448 * @fibnum: rtable id to insert route to
449 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
450 * @plen: prefix length (or -1 if host route or not applicable for AF)
451 * @op_flags: combination of RTM_F_ flags
452 * @rc: storage to report operation result
454 * Returns 0 on success.
457 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
458 struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
460 union sockaddr_union mask_storage;
461 struct sockaddr *netmask = &mask_storage.sa;
466 bzero(rc, sizeof(struct rib_cmd_info));
467 rc->rc_cmd = RTM_ADD;
469 struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
471 return (EAFNOSUPPORT);
/* Canonicalize dst/netmask for the requested prefix length. */
473 if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
474 FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
/* Pre-allocate the rtentry only when creation is permitted. */
478 if (op_flags & RTM_F_CREATE) {
479 if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
480 FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
484 struct route_nhop_data rnd_tmp;
486 rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd_tmp);
491 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2
493 char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
494 nhop_print_buf_any(rnd->rnd_nhop, nhbuf, sizeof(nhbuf));
495 rt_print_buf(rt, rtbuf, sizeof(rtbuf));
496 FIB_RH_LOG(LOG_DEBUG2, rnh, "request %s -> %s", rtbuf, nhbuf);
/* Delegate the actual insert/append/replace logic. */
499 return (add_route_flags(rnh, rt, rnd, op_flags, rc));
503 * Attempts to delete @dst/plen prefix matching gateway @gw from the
506 * @fibnum: rtable id to remove route from
507 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
508 * @plen: prefix length (or -1 if host route or not applicable for AF)
509 * @gw: gateway to match
510 * @op_flags: combination of RTM_F_ flags
511 * @rc: storage to report operation result
513 * Returns 0 on success.
516 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
517 const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
/* Thin wrapper: delete via the first-match gateway filter. */
519 struct gw_filter_data gwd = { .gw = gw };
521 return (rib_del_route_px(fibnum, dst, plen, gw_fulter_func, &gwd, op_flags, rc));
525 * Attempts to delete @dst/plen prefix matching @filter_func from the
528 * @fibnum: rtable id to remove route from
529 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
530 * @plen: prefix length (or -1 if host route or not applicable for AF)
531 * @filter_func: func to be called for each nexthop of the prefix for matching
532 * @filter_arg: argument to pass to @filter_func
533 * @op_flags: combination of RTM_F_ flags
534 * @rc: storage to report operation result
536 * Returns 0 on success.
539 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
540 rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
541 struct rib_cmd_info *rc)
543 union sockaddr_union mask_storage;
544 struct sockaddr *netmask = &mask_storage.sa;
549 bzero(rc, sizeof(struct rib_cmd_info));
550 rc->rc_cmd = RTM_DELETE;
552 struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
554 return (EAFNOSUPPORT);
/* Guard the on-stack mask storage against oversized sockaddrs. */
556 if (dst->sa_len > sizeof(mask_storage)) {
557 FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
561 if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
562 FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
/* RTM_F_FORCE allows removing pinned (high-priority) routes. */
566 int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
569 struct route_nhop_data rnd;
570 struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
572 error = rt_delete_conditional(rnh, rt, prio, filter_func,
581 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
583 if (rc->rc_cmd == RTM_DELETE)
588 * Deleting 1 path may result in RTM_CHANGE to
589 * a different mpath group/nhop.
590 * Free old mpath group.
592 nhop_free_any(rc->rc_nh_old);
600 * Tries to copy route @rt from one rtable to the rtable specified by @dst_rh.
601 * @rt: route to copy.
602 * @rnd_src: nhop and weight. Multipath routes are not supported
603 * @rh_dst: target rtable.
604 * @rc: operation result storage
606 * Return 0 on success.
609 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
610 struct rib_head *rh_dst, struct rib_cmd_info *rc)
612 struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
615 MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
617 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2
618 char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
619 nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
620 rt_print_buf(rt, rtbuf, sizeof(rtbuf));
621 FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
622 rtbuf, nhbuf, nhop_get_fibnum(nh_src));
/* Clone the nexthop into the destination fib, then finalize it. */
624 struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
626 FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
629 nhop_copy(nh, rnd_src->rnd_nhop);
630 nhop_set_fibnum(nh, rh_dst->rib_fibnum);
631 nh = nhop_get_nhop_internal(rh_dst, nh, &error);
633 FIB_RH_LOG(LOG_INFO, rh_dst,
634 "unable to finalize new nexthop: error %d", error);
/* Allocate a destination rtentry with the same dst/mask. */
638 struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
639 if (rt_new == NULL) {
640 FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
645 struct route_nhop_data rnd = {
647 .rnd_weight = rnd_src->rnd_weight
/* Pinned nexthops may override an existing route in the target fib. */
649 int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
650 error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
653 #if DEBUG_MAX_LEVEL >= LOG_DEBUG
654 char buf[NHOP_PRINT_BUFSIZE];
655 rt_print_buf(rt_new, buf, sizeof(buf));
656 FIB_RH_LOG(LOG_DEBUG, rh_dst, "Unable to add route %s: error %d", buf, error);
/* Failure path: release the unused rtentry immediately. */
659 rt_free_immediate(rt_new);
665 * Adds route defined by @info into the kernel table specified by @fibnum and
666 * sa_family in @info->rti_info[RTAX_DST].
668 * Returns 0 on success and fills in operation metadata into @rc.
671 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
672 struct rib_cmd_info *rc)
674 struct rib_head *rnh;
679 rnh = get_rnh(fibnum, info);
681 return (EAFNOSUPPORT);
684 * Check consistency between RTF_HOST flag and netmask
/* Host routes carry no netmask; non-host routes must have one. */
687 if (info->rti_flags & RTF_HOST)
688 info->rti_info[RTAX_NETMASK] = NULL;
689 else if (info->rti_info[RTAX_NETMASK] == NULL) {
690 FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
694 bzero(rc, sizeof(struct rib_cmd_info));
695 rc->rc_cmd = RTM_ADD;
697 error = add_route_byinfo(rnh, info, rc);
699 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
705 * Checks if @dst and @gateway is valid combination.
707 * Returns true if is valid, false otherwise.
710 check_gateway(struct rib_head *rnh, struct sockaddr *dst,
711 struct sockaddr *gateway)
/* Same family, AF_UNSPEC and AF_LINK gateways are always acceptable. */
713 if (dst->sa_family == gateway->sa_family)
715 else if (gateway->sa_family == AF_UNSPEC)
717 else if (gateway->sa_family == AF_LINK)
/* RFC 5549 case: IPv4 dst with IPv6 gw, gated by the sysctl. */
719 #if defined(INET) && defined(INET6)
720 else if (dst->sa_family == AF_INET && gateway->sa_family == AF_INET6 &&
721 rib_can_ipv6_nexthop_address(rnh))
/*
 * Validates @info, allocates rtentry + nexthop from it and hands off
 * to add_route_flags().  Returns 0 on success, errno otherwise.
 */
729 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
730 struct rib_cmd_info *rc)
732 struct route_nhop_data rnd_add;
733 struct nhop_object *nh;
735 struct sockaddr *dst, *gateway, *netmask;
738 dst = info->rti_info[RTAX_DST];
739 gateway = info->rti_info[RTAX_GATEWAY];
740 netmask = info->rti_info[RTAX_NETMASK];
742 if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
743 FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
746 if (dst && gateway && !check_gateway(rnh, dst, gateway)) {
747 FIB_RH_LOG(LOG_DEBUG, rnh,
748 "error: invalid dst/gateway family combination (%d, %d)",
749 dst->sa_family, gateway->sa_family);
/* dst must fit in the rtentry's embedded destination buffer. */
753 if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
754 FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
/* Resolve the outgoing ifa if the caller did not provide one. */
759 if (info->rti_ifa == NULL) {
760 error = rt_getifa_fib(info, rnh->rib_fibnum);
765 if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
768 error = nhop_create_from_info(rnh, info, &nh);
770 rt_free_immediate(rt);
774 rnd_add.rnd_nhop = nh;
775 rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
/* Pinned routes may override; append enables multipath coexistence. */
777 int op_flags = RTM_F_CREATE;
778 if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
779 op_flags |= RTM_F_FORCE;
781 op_flags |= RTM_F_APPEND;
782 return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
/*
 * Core add/replace/append state machine.  Consumes the nexthop
 * reference in @rnd_add and, on failure paths with RTM_F_CREATE,
 * frees @rt as well.  Returns 0 on success with result in @rc.
 */
787 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
788 int op_flags, struct rib_cmd_info *rc)
790 struct route_nhop_data rnd_orig;
791 struct nhop_object *nh;
792 struct rtentry *rt_orig;
795 nh = rnd_add->rnd_nhop;
799 rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
801 if (rt_orig == NULL) {
802 if (op_flags & RTM_F_CREATE)
803 error = add_route(rnh, rt, rnd_add, rc);
805 error = ENOENT; // no entry but creation was not required
/* Route exists from here on. */
812 if (op_flags & RTM_F_EXCL) {
813 /* We have existing route in the RIB but not allowed to replace. */
819 /* Now either append or replace */
820 if (op_flags & RTM_F_REPLACE) {
821 if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
822 /* Old path is "better" (e.g. has PINNED flag set) */
826 change_route(rnh, rt_orig, rnd_add, rc);
/* Append: only if multipath is enabled and both nhops are eligible. */
835 if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
836 nhop_can_multipath(rnd_add->rnd_nhop) &&
837 nhop_can_multipath(rnd_orig.rnd_nhop)) {
/* mpath insertion can race; retry a bounded number of times. */
839 for (int i = 0; i < RIB_MAX_RETRIES; i++) {
840 error = add_route_flags_mpath(rnh, rt, rnd_add, &rnd_orig,
844 RTSTAT_INC(rts_add_retry);
848 * Original nhop reference is unused in any case.
850 nhop_free_any(rnd_add->rnd_nhop);
851 if (op_flags & RTM_F_CREATE) {
852 if (error != 0 || rc->rc_cmd != RTM_ADD)
853 rt_free_immediate(rt);
858 /* Out of options - free state and return error */
861 if (op_flags & RTM_F_CREATE)
862 rt_free_immediate(rt);
/*
 * Builds a nexthop group combining @rnd_orig and @rnd_add and installs
 * it conditionally.  May return EAGAIN to request a caller-side retry
 * with refreshed @rnd_orig data.
 */
870 add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
871 struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
872 int op_flags, struct rib_cmd_info *rc)
875 struct route_nhop_data rnd_new;
878 error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
880 if (error == EAGAIN) {
882 * Group creation failed, most probably because
883 * @rnd_orig data got scheduled for deletion.
884 * Refresh @rnd_orig data and retry.
887 lookup_prefix_rt(rnh, rt, rnd_orig);
/* NOTE(review): elided lines; condition likely tests rnd_orig->rnd_nhop. */
889 if (rnd_orig == NULL && !(op_flags & RTM_F_CREATE)) {
890 /* In this iteration route doesn't exist */
896 error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
/* Lazily enable outbound flowid hashing on the first mpath route. */
900 if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
902 * First multipath route got installed. Enable local
903 * outbound connections hashing.
906 printf("FIB: enabled flowid calculation for locally-originated packets\n");
907 V_fib_hash_outbound = 1;
915 * Removes route defined by @info from the kernel table specified by @fibnum and
916 * sa_family in @info->rti_info[RTAX_DST].
918 * Returns 0 on success and fills in operation metadata into @rc.
921 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
923 struct rib_head *rnh;
924 struct sockaddr *dst, *netmask;
925 struct sockaddr_storage mdst;
930 rnh = get_rnh(fibnum, info);
932 return (EAFNOSUPPORT);
934 bzero(rc, sizeof(struct rib_cmd_info));
935 rc->rc_cmd = RTM_DELETE;
937 dst = info->rti_info[RTAX_DST];
938 netmask = info->rti_info[RTAX_NETMASK];
940 if (netmask != NULL) {
941 /* Ensure @dst is always properly masked */
942 if (dst->sa_len > sizeof(mdst)) {
943 FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
946 rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
947 dst = (struct sockaddr *)&mdst;
/* Prefer the caller's filter; otherwise match on the gateway (if any). */
950 rib_filter_f_t *filter_func = NULL;
951 void *filter_arg = NULL;
952 struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
954 if (info->rti_filter != NULL) {
955 filter_func = info->rti_filter;
956 filter_arg = info->rti_filterdata;
957 } else if (gwd.gw != NULL) {
958 filter_func = gw_fulter_func;
962 int prio = get_prio_from_info(info);
965 struct route_nhop_data rnd;
966 struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
968 error = rt_delete_conditional(rnh, rt, prio, filter_func,
977 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
979 if (rc->rc_cmd == RTM_DELETE)
984 * Deleting 1 path may result in RTM_CHANGE to
985 * a different mpath group/nhop.
986 * Free old mpath group.
988 nhop_free_any(rc->rc_nh_old);
996 * Conditionally unlinks rtentry paths from @rnh matching @cb.
997 * Returns 0 on success with operation result stored in @rc.
999 * ESRCH - if prefix was not found or filter function failed to match
1000 * EADDRINUSE - if trying to delete higher priority route.
1003 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
1004 int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
1006 struct nhop_object *nh = rt->rt_nhop;
/* Multipath case: filter the group and install the reduced one. */
1009 if (NH_IS_NHGRP(nh)) {
1010 struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
1011 struct route_nhop_data rnd;
1016 error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
1018 if (rnd.rnd_nhgrp == nhg) {
1019 /* No match, unreference new group and return. */
1020 nhop_free_any(rnd.rnd_nhop);
1023 error = change_route(rnh, rt, &rnd, rc);
/* Single-nexthop case: filter + priority check, then delete. */
1028 if (cb != NULL && !cb(rt, nh, cbdata))
1031 if (prio < nhop_get_prio(nh))
1032 return (EADDRINUSE);
1034 return (delete_route(rnh, rt, rc));
/*
 * Changes an existing route identified by @info in table @fibnum.
 * Retries up to RIB_MAX_RETRIES on concurrent-modification (EAGAIN).
 * Returns 0 on success with result metadata in @rc.
 */
1038 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1039 struct rib_cmd_info *rc)
1042 struct route_nhop_data rnd_orig;
1043 struct rib_head *rnh;
1049 rnh = get_rnh(fibnum, info);
1051 return (EAFNOSUPPORT);
1053 bzero(rc, sizeof(struct rib_cmd_info));
1054 rc->rc_cmd = RTM_CHANGE;
1056 /* Check if updated gateway exists */
1057 if ((info->rti_flags & RTF_GATEWAY) &&
1058 (info->rti_info[RTAX_GATEWAY] == NULL)) {
1061 * route(8) adds RTF_GATEWAY flag if -interface is not set.
1062 * Remove RTF_GATEWAY to enforce consistency and maintain
1065 info->rti_flags &= ~RTF_GATEWAY;
1069 * route change is done in multiple steps, with dropping and
1070 * reacquiring lock. In situations where multiple processes
1071 * change the same route, this can lead to the case when the route
1072 * is changed between the steps. Address it by retrying the operation
1073 * multiple times before failing.
1077 rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1078 info->rti_info[RTAX_NETMASK], &rnh->head);
/* Snapshot the current nhop/weight as the expected "before" state. */
1085 rnd_orig.rnd_nhop = rt->rt_nhop;
1086 rnd_orig.rnd_weight = rt->rt_weight;
1090 for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1091 error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1092 if (error != EAGAIN)
/*
 * Creates an updated nexthop in *@nh_new derived from @nh_orig and the
 * (possibly new) gateway/ifp/ifa data in @info.
 */
1100 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1101 struct nhop_object *nh_orig, struct nhop_object **nh_new)
1106 * New gateway could require new ifaddr, ifp;
1107 * flags may also be different; ifp may be specified
1108 * by ll sockaddr when protocol address is ambiguous
1110 if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1111 info->rti_info[RTAX_GATEWAY] != NULL) ||
1112 info->rti_info[RTAX_IFP] != NULL ||
1113 (info->rti_info[RTAX_IFA] != NULL &&
1114 !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1115 error = rt_getifa_fib(info, rnh->rib_fibnum);
/* On ifa-lookup failure, drop any half-resolved ifa reference. */
1118 info->rti_ifa = NULL;
1123 error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1124 info->rti_ifa = NULL;
/*
 * Changes a single path inside a multipath route: finds the member
 * nexthop matching @info, builds its replacement, clones the group
 * with the member swapped, and installs the new group conditionally.
 */
1131 change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
1132 struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1133 struct rib_cmd_info *rc)
1135 int error = 0, found_idx = 0;
1136 struct nhop_object *nh_orig = NULL, *nh_new;
1137 struct route_nhop_data rnd_new = {};
1138 const struct weightened_nhop *wn = NULL;
1139 struct weightened_nhop *wn_new;
1142 wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
1143 for (int i = 0; i < num_nhops; i++) {
1144 if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
1151 if (nh_orig == NULL)
1154 error = change_nhop(rnh, info, nh_orig, &nh_new);
/* Build a mutable copy of the group's member array. */
1158 wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
1159 M_TEMP, M_NOWAIT | M_ZERO);
1160 if (wn_new == NULL) {
1165 memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
1166 wn_new[found_idx].nh = nh_new;
1167 wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);
1169 error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new.rnd_nhgrp);
1171 free(wn_new, M_TEMP);
1176 error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
/*
 * Dispatches a route change: multipath routes go through
 * change_mpath_route(), single-nexthop routes get a new nhop built
 * from @info and installed via change_route_conditional().
 */
1183 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1184 struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1185 struct rib_cmd_info *rc)
1188 struct nhop_object *nh_orig;
1189 struct route_nhop_data rnd_new;
1191 nh_orig = rnd_orig->rnd_nhop;
1192 if (nh_orig == NULL)
1196 if (NH_IS_NHGRP(nh_orig))
1197 return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1200 rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1201 error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1204 error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1210 * Insert @rt with nhop data from @rnd_new to @rnh.
1211 * Returns 0 on success and stores operation results in @rc.
1214 add_route(struct rib_head *rnh, struct rtentry *rt,
1215 struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1217 struct radix_node *rn;
1219 RIB_WLOCK_ASSERT(rnh);
1221 rt->rt_nhop = rnd->rnd_nhop;
1222 rt->rt_weight = rnd->rnd_weight;
1223 rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
/* Register expiring (temporal) nexthops for timed cleanup. */
1226 if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1227 tmproutes_update(rnh, rt, rnd->rnd_nhop);
1229 /* Finalize notification */
1231 rnh->rnh_prefixes++;
1233 rc->rc_cmd = RTM_ADD;
1235 rc->rc_nh_old = NULL;
1236 rc->rc_nh_new = rnd->rnd_nhop;
1237 rc->rc_nh_weight = rnd->rnd_weight;
1239 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1243 /* Existing route or memory allocation failure. */
1248 * Unconditionally deletes @rt from @rnh.
1251 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1253 RIB_WLOCK_ASSERT(rnh);
1255 /* Route deletion requested. */
1256 struct radix_node *rn;
1258 rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
/* Mark the entry down so concurrent readers see it as unusable. */
1262 rt->rte_flags &= ~RTF_UP;
1265 rnh->rnh_prefixes--;
1267 rc->rc_cmd = RTM_DELETE;
1269 rc->rc_nh_old = rt->rt_nhop;
1270 rc->rc_nh_new = NULL;
1271 rc->rc_nh_weight = rt->rt_weight;
1273 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1279 * Switch @rt nhop/weight to the ones specified in @rnd.
1280 * Returns 0 on success.
1283 change_route(struct rib_head *rnh, struct rtentry *rt,
1284 struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1286 struct nhop_object *nh_orig;
1288 RIB_WLOCK_ASSERT(rnh);
1290 nh_orig = rt->rt_nhop;
/* A NULL replacement nexthop means "delete this route". */
1292 if (rnd->rnd_nhop == NULL)
1293 return (delete_route(rnh, rt, rc));
1295 /* Changing nexthop & weight to a new one */
1296 rt->rt_nhop = rnd->rnd_nhop;
1297 rt->rt_weight = rnd->rnd_weight;
1298 if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1299 tmproutes_update(rnh, rt, rnd->rnd_nhop);
1301 /* Finalize notification */
1303 rc->rc_cmd = RTM_CHANGE;
1305 rc->rc_nh_old = nh_orig;
1306 rc->rc_nh_new = rnd->rnd_nhop;
1307 rc->rc_nh_weight = rnd->rnd_weight;
1309 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1315 * Conditionally update route nhop/weight IFF data in @nhd_orig is
1316 * consistent with the current route data.
1317 * Nexthop in @nhd_new is consumed.
1320 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1321 struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1322 struct rib_cmd_info *rc)
1324 struct rtentry *rt_new;
1327 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2
1329 char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1330 nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1331 nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1332 FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1333 "trying change %s -> %s", buf_old, buf_new);
/* Re-check the prefix under the lock; it may have changed. */
1338 struct route_nhop_data rnd;
1339 rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1341 if (rt_new == NULL) {
1342 if (rnd_orig->rnd_nhop == NULL)
1343 error = add_route(rnh, rt, rnd_new, rc);
1346 * Prefix does not exist, which was not our assumption.
1347 * Update @rnd_orig with the new data and return
1349 rnd_orig->rnd_nhop = NULL;
1350 rnd_orig->rnd_weight = 0;
1354 /* Prefix exists, try to update */
1355 if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1357 * Nhop/mpath group hasn't changed. Flip
1358 * to the new precalculated one and return
1360 error = change_route(rnh, rt_new, rnd_new, rc);
1362 /* Update and retry */
1363 rnd_orig->rnd_nhop = rt_new->rt_nhop;
1364 rnd_orig->rnd_weight = rt_new->rt_weight;
1372 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
/* Release the references this call consumed, per the contract above. */
1374 if (rnd_orig->rnd_nhop != NULL)
1375 nhop_free_any(rnd_orig->rnd_nhop);
1378 if (rnd_new->rnd_nhop != NULL)
1379 nhop_free_any(rnd_new->rnd_nhop);
1386 * Performs modification of routing table specified by @action.
1387 * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1388 * Needs to be run in network epoch.
1390 * Returns 0 on success and fills in @rc with action result.
1393 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1394 struct rib_cmd_info *rc)
/* Dispatch on RTM_ADD / RTM_DELETE / RTM_CHANGE (case labels elided). */
1400 error = rib_add_route(fibnum, info, rc);
1403 error = rib_del_route(fibnum, info, rc);
1406 error = rib_change_route(fibnum, info, rc);
/*
 * Walker state for rib_walk_del(): target rib, chain of unlinked
 * rtentries awaiting GC, the match filter, and a reusable rc.
 */
1417 struct rib_head *rnh;
1418 struct rtentry *head;
1419 rib_filter_f_t *filter_f;
1422 struct rib_cmd_info rc;
1426 * Conditionally unlinks rtentries or paths from radix tree based
1427 * on the callback data passed in @arg.
1430 rt_checkdelroute(struct radix_node *rn, void *arg)
1432 struct rt_delinfo *di = (struct rt_delinfo *)arg;
1433 struct rtentry *rt = (struct rtentry *)rn;
1435 if (rt_delete_conditional(di->rnh, rt, di->prio,
1436 di->filter_f, di->filter_arg, &di->rc) != 0)
1440 * Add deleted rtentries to the list to GC them
1441 * after dropping the lock.
1443 * XXX: Delayed notifications not implemented
1444 * for nexthop updates.
1446 if (di->rc.rc_cmd == RTM_DELETE) {
1447 /* Add to the list and return */
1448 rt->rt_chain = di->head;
1453 * RTM_CHANGE to a different nexthop or nexthop group.
1454 * Free old multipath group.
1456 nhop_free_any(di->rc.rc_nh_old);
1464 * Iterates over a routing table specified by @fibnum and @family and
1465 * deletes elements marked by @filter_f.
1466 * @fibnum: rtable id
1467 * @family: AF_ address family
1468 * @filter_f: function returning non-zero value for items to delete
1469 * @arg: data to pass to the @filter_f function
1470 * @report: true if rtsock notification is needed.
1473 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1476 struct rib_head *rnh;
1478 struct nhop_object *nh;
1479 struct epoch_tracker et;
1481 rnh = rt_tables_get_rnh(fibnum, family);
1485 struct rt_delinfo di = {
1487 .filter_f = filter_f,
1488 .filter_arg = filter_arg,
1489 .prio = NH_PRIORITY_NORMAL,
1492 NET_EPOCH_ENTER(et);
/* Pass 1: walk the tree and unlink matches (chained via rt_chain). */
1495 rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1498 /* We might have something to reclaim. */
1499 bzero(&di.rc, sizeof(di.rc));
1500 di.rc.rc_cmd = RTM_DELETE;
/* Pass 2: notify and report each unlinked entry outside the lock. */
1501 while (di.head != NULL) {
1503 di.head = rt->rt_chain;
1504 rt->rt_chain = NULL;
1508 di.rc.rc_nh_old = nh;
1509 rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1513 struct nhgrp_object *nhg;
1514 const struct weightened_nhop *wn;
/* rtsock reporting: one message per group member for mpath routes. */
1516 if (NH_IS_NHGRP(nh)) {
1517 nhg = (struct nhgrp_object *)nh;
1518 wn = nhgrp_get_nhops(nhg, &num_nhops);
1519 for (int i = 0; i < num_nhops; i++)
1520 rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1523 rt_routemsg(RTM_DELETE, rt, nh, fibnum);
/*
 * Radix-walk callback: removes @rn's rtentry from @arg's rib without
 * filtering or notifications (used by rib_flush_routes()).
 */
1532 rt_delete_unconditional(struct radix_node *rn, void *arg)
1534 struct rtentry *rt = RNTORT(rn);
1535 struct rib_head *rnh = (struct rib_head *)arg;
1537 rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1538 if (RNTORT(rn) == rt)
1545 * Removes all routes from the routing table without executing notifications.
1546 * rtentries will be removed after the end of a current epoch.
1549 rib_flush_routes(struct rib_head *rnh)
1552 rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
/* Flushes routes of @family from every fib in the system. */
1557 rib_flush_routes_family(int family)
1559 struct rib_head *rnh;
1561 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1562 if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1563 rib_flush_routes(rnh);
1568 rib_print_family(int family)