2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2020 Alexander V. Chernikov
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
31 #include "opt_inet6.h"
32 #include "opt_route.h"
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
43 #include <sys/rmlock.h>
46 #include <net/if_var.h>
47 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <net/route/route_ctl.h>
51 #include <net/route/route_var.h>
52 #include <net/route/nhop_utils.h>
53 #include <net/route/nhop.h>
54 #include <net/route/nhop_var.h>
55 #include <netinet/in.h>
56 #include <netinet6/scope6_var.h>
57 #include <netinet6/in6_var.h>
59 #define DEBUG_MOD_NAME route_ctl
60 #define DEBUG_MAX_LEVEL LOG_DEBUG
61 #include <net/route/route_debug.h>
62 _DECLARE_DEBUG(LOG_INFO);
/*
 * This file contains control plane routing tables functions.
 *
 * All functions assume they are called in net epoch.
 */
70 union sockaddr_union {
72 struct sockaddr_in sin;
73 struct sockaddr_in6 sin6;
77 static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
78 struct rib_cmd_info *rc);
79 static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
80 struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
81 struct rib_cmd_info *rc);
83 static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
84 struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
86 static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
87 struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
88 int op_flags, struct rib_cmd_info *rc);
91 static int add_route(struct rib_head *rnh, struct rtentry *rt,
92 struct route_nhop_data *rnd, struct rib_cmd_info *rc);
93 static int delete_route(struct rib_head *rnh, struct rtentry *rt,
94 struct rib_cmd_info *rc);
95 static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
96 int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);
98 static int get_prio_from_info(const struct rt_addrinfo *info);
99 static int nhop_get_prio(const struct nhop_object *nh);
102 static bool rib_can_multipath(struct rib_head *rh);
105 /* Per-vnet multipath routing configuration */
106 SYSCTL_DECL(_net_route);
107 #define V_rib_route_multipath VNET(rib_route_multipath)
109 #define _MP_FLAGS CTLFLAG_RW
111 #define _MP_FLAGS CTLFLAG_RD
113 VNET_DEFINE(u_int, rib_route_multipath) = 1;
114 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
115 &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
119 VNET_DEFINE(u_int, fib_hash_outbound) = 0;
120 SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
121 &VNET_NAME(fib_hash_outbound), 0,
122 "Compute flowid for locally-originated packets");
124 /* Default entropy to add to the hash calculation for the outbound connections*/
125 uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
126 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
127 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
128 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
129 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
130 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
134 #if defined(INET) && defined(INET6)
135 FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
136 #define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
137 VNET_DEFINE_STATIC(u_int, rib_route_ipv6_nexthop) = 1;
138 SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
139 &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
143 SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
145 static struct rib_head *
146 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
148 struct rib_head *rnh;
149 struct sockaddr *dst;
151 KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
153 dst = info->rti_info[RTAX_DST];
154 rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
#if defined(INET) && defined(INET6)
/*
 * Returns true if the per-vnet knob allowing IPv4 routes via IPv6
 * nexthops is non-zero.
 */
bool
rib_can_4o6_nhop(void)
{
	return (!!V_rib_route_ipv6_nexthop);
}
#endif
#ifdef ROUTE_MPATH
/*
 * Returns true if multipath routing is enabled in the vnet owning @rh.
 */
static bool
rib_can_multipath(struct rib_head *rh)
{
	int result;

	/* The knob is per-vnet: read it in the context of the table's vnet. */
	CURVNET_SET(rh->rib_vnet);
	result = !!V_rib_route_multipath;
	CURVNET_RESTORE();

	return (result);
}

/*
 * Check if nhop is multipath-eligible.
 * Avoid nhops without gateways and redirects.
 *
 * Returns 1 for multipath-eligible nexthop,
 * 0 otherwise.
 */
bool
nhop_can_multipath(const struct nhop_object *nh)
{

	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
		return (1);
	if ((nh->nh_flags & NHF_GATEWAY) == 0)
		return (0);
	if ((nh->nh_flags & NHF_REDIRECT) != 0)
		return (0);

	return (1);
}
#endif
203 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
207 if (info->rti_mflags & RTV_WEIGHT)
208 weight = info->rti_rmx->rmx_weight;
210 weight = default_weight;
211 /* Keep upper 1 byte for adm distance purposes */
212 if (weight > RT_MAX_WEIGHT)
213 weight = RT_MAX_WEIGHT;
214 else if (weight == 0)
215 weight = default_weight;
/*
 * File-local concept for distinguishing between the normal and
 * RTF_PINNED routes that can override the "normal" one.
 */
#define	NH_PRIORITY_HIGH	2
#define	NH_PRIORITY_NORMAL	1
227 get_prio_from_info(const struct rt_addrinfo *info)
229 if (info->rti_flags & RTF_PINNED)
230 return (NH_PRIORITY_HIGH);
231 return (NH_PRIORITY_NORMAL);
235 nhop_get_prio(const struct nhop_object *nh)
237 if (NH_IS_PINNED(nh))
238 return (NH_PRIORITY_HIGH);
239 return (NH_PRIORITY_NORMAL);
243 * Check if specified @gw matches gw data in the nexthop @nh.
245 * Returns true if matches, false otherwise.
248 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
251 if (nh->gw_sa.sa_family != gw->sa_family)
254 switch (gw->sa_family) {
256 return (nh->gw4_sa.sin_addr.s_addr ==
257 ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
260 const struct sockaddr_in6 *gw6;
261 gw6 = (const struct sockaddr_in6 *)gw;
264 * Currently (2020-09) IPv6 gws in kernel have their
265 * scope embedded. Once this becomes false, this code
266 * has to be revisited.
268 if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
275 const struct sockaddr_dl *sdl;
276 sdl = (const struct sockaddr_dl *)gw;
277 return (nh->gwl_sa.sdl_index == sdl->sdl_index);
280 return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
/*
 * Matches all nexthops with given @gw.
 * Can be used as rib_filter_f callback; @rt is not inspected.
 */
int
rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
{
	const struct sockaddr *gw = (const struct sockaddr *)gw_sa;

	return (match_nhop_gw(nh, gw));
}
/* Argument for match_gw_one(): gateway to look for and matches seen so far. */
struct gw_filter_data {
	const struct sockaddr *gw;
	int count;
};
305 * Matches first occurence of the gateway provided in @gwd
308 match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
310 struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
312 /* Return only first match to make rtsock happy */
313 if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
319 * Checks if data in @info matches nexhop @nh.
321 * Returns 0 on success,
322 * ESRCH if not matched,
323 * ENOENT if filter function returned false
326 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
327 const struct nhop_object *nh)
329 const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
331 if (info->rti_filter != NULL) {
332 if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
337 if ((gw != NULL) && !match_nhop_gw(nh, gw))
344 * Runs exact prefix match based on @dst and @netmask.
345 * Returns matched @rtentry if found or NULL.
346 * If rtentry was found, saves nexthop / weight value into @rnd.
348 static struct rtentry *
349 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
350 const struct sockaddr *netmask, struct route_nhop_data *rnd)
354 RIB_LOCK_ASSERT(rnh);
356 rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
358 rnd->rnd_nhop = rt->rt_nhop;
359 rnd->rnd_weight = rt->rt_weight;
361 rnd->rnd_nhop = NULL;
/*
 * Runs exact prefix match using the key and mask of @rt.
 * Same locking and return conventions as lookup_prefix_bysa().
 */
struct rtentry *
lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
    struct route_nhop_data *rnd)
{
	return (lookup_prefix_bysa(rnh, rt_key_const(rt), rt_mask_const(rt), rnd));
}
376 * Runs exact prefix match based on dst/netmask from @info.
377 * Assumes RIB lock is held.
378 * Returns matched @rtentry if found or NULL.
379 * If rtentry was found, saves nexthop / weight value into @rnd.
382 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
383 struct route_nhop_data *rnd)
387 rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
388 info->rti_info[RTAX_NETMASK], rnd);
394 fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
395 struct sockaddr **pmask)
406 struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
407 struct sockaddr_in *dst= (struct sockaddr_in *)_dst;
409 memset(mask, 0, sizeof(*mask));
410 mask->sin_family = family;
411 mask->sin_len = sizeof(*mask);
414 else if (plen > 32 || plen < 0)
417 uint32_t daddr, maddr;
418 maddr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0);
419 mask->sin_addr.s_addr = maddr;
420 daddr = dst->sin_addr.s_addr;
421 daddr = htonl(ntohl(daddr) & ntohl(maddr));
422 dst->sin_addr.s_addr = daddr;
431 struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
432 struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;
434 memset(mask, 0, sizeof(*mask));
435 mask->sin6_family = family;
436 mask->sin6_len = sizeof(*mask);
439 else if (plen > 128 || plen < 0)
442 ip6_writemask(&mask->sin6_addr, plen);
443 IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
454 * Attempts to add @dst/plen prefix with nexthop/nexhopgroup data @rnd
455 * to the routing table.
457 * @fibnum: rtable id to insert route to
458 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
459 * @plen: prefix length (or -1 if host route or not applicable for AF)
460 * @op_flags: combination of RTM_F_ flags
461 * @rc: storage to report operation result
463 * Returns 0 on success.
466 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
467 struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
469 union sockaddr_union mask_storage;
470 struct sockaddr *netmask = &mask_storage.sa;
471 struct rtentry *rt = NULL;
475 bzero(rc, sizeof(struct rib_cmd_info));
476 rc->rc_cmd = RTM_ADD;
478 struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
480 return (EAFNOSUPPORT);
482 if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
483 FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
487 if (op_flags & RTM_F_CREATE) {
488 if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
489 FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
494 return (add_route_flags(rnh, rt, rnd, op_flags, rc));
498 * Attempts to delete @dst/plen prefix matching gateway @gw from the
501 * @fibnum: rtable id to remove route from
502 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
503 * @plen: prefix length (or -1 if host route or not applicable for AF)
504 * @gw: gateway to match
505 * @op_flags: combination of RTM_F_ flags
506 * @rc: storage to report operation result
508 * Returns 0 on success.
511 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
512 const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
514 struct gw_filter_data gwd = { .gw = gw };
516 return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
520 * Attempts to delete @dst/plen prefix matching @filter_func from the
523 * @fibnum: rtable id to remove route from
524 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
525 * @plen: prefix length (or -1 if host route or not applicable for AF)
526 * @filter_func: func to be called for each nexthop of the prefix for matching
527 * @filter_arg: argument to pass to @filter_func
528 * @op_flags: combination of RTM_F_ flags
529 * @rc: storage to report operation result
531 * Returns 0 on success.
534 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
535 rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
536 struct rib_cmd_info *rc)
538 union sockaddr_union mask_storage;
539 struct sockaddr *netmask = &mask_storage.sa;
544 bzero(rc, sizeof(struct rib_cmd_info));
545 rc->rc_cmd = RTM_DELETE;
547 struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
549 return (EAFNOSUPPORT);
551 if (dst->sa_len > sizeof(mask_storage)) {
552 FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
556 if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
557 FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
561 int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
564 struct route_nhop_data rnd;
565 struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
567 error = rt_delete_conditional(rnh, rt, prio, filter_func,
576 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
578 if (rc->rc_cmd == RTM_DELETE)
583 * Deleting 1 path may result in RTM_CHANGE to
584 * a different mpath group/nhop.
585 * Free old mpath group.
587 nhop_free_any(rc->rc_nh_old);
595 * Tries to copy route @rt from one rtable to the rtable specified by @dst_rh.
596 * @rt: route to copy.
597 * @rnd_src: nhop and weight. Multipath routes are not supported
598 * @rh_dst: target rtable.
599 * @rc: operation result storage
601 * Return 0 on success.
604 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
605 struct rib_head *rh_dst, struct rib_cmd_info *rc)
607 struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
610 MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
612 IF_DEBUG_LEVEL(LOG_DEBUG2) {
613 char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
614 nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
615 rt_print_buf(rt, rtbuf, sizeof(rtbuf));
616 FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
617 rtbuf, nhbuf, nhop_get_fibnum(nh_src));
619 struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
621 FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
624 nhop_copy(nh, rnd_src->rnd_nhop);
625 nhop_set_origin(nh, nhop_get_origin(rnd_src->rnd_nhop));
626 nhop_set_fibnum(nh, rh_dst->rib_fibnum);
627 nh = nhop_get_nhop_internal(rh_dst, nh, &error);
629 FIB_RH_LOG(LOG_INFO, rh_dst,
630 "unable to finalize new nexthop: error %d", error);
634 struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
635 if (rt_new == NULL) {
636 FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
641 struct route_nhop_data rnd = {
643 .rnd_weight = rnd_src->rnd_weight
645 int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
646 error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
649 IF_DEBUG_LEVEL(LOG_DEBUG2) {
650 char buf[NHOP_PRINT_BUFSIZE];
651 rt_print_buf(rt_new, buf, sizeof(buf));
652 FIB_RH_LOG(LOG_DEBUG, rh_dst,
653 "Unable to add route %s: error %d", buf, error);
656 rt_free_immediate(rt_new);
662 * Adds route defined by @info into the kernel table specified by @fibnum and
663 * sa_family in @info->rti_info[RTAX_DST].
665 * Returns 0 on success and fills in operation metadata into @rc.
668 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
669 struct rib_cmd_info *rc)
671 struct rib_head *rnh;
676 rnh = get_rnh(fibnum, info);
678 return (EAFNOSUPPORT);
681 * Check consistency between RTF_HOST flag and netmask
684 if (info->rti_flags & RTF_HOST)
685 info->rti_info[RTAX_NETMASK] = NULL;
686 else if (info->rti_info[RTAX_NETMASK] == NULL) {
687 FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
691 bzero(rc, sizeof(struct rib_cmd_info));
692 rc->rc_cmd = RTM_ADD;
694 error = add_route_byinfo(rnh, info, rc);
696 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
702 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
703 struct rib_cmd_info *rc)
705 struct route_nhop_data rnd_add;
706 struct nhop_object *nh;
708 struct sockaddr *dst, *gateway, *netmask;
711 dst = info->rti_info[RTAX_DST];
712 gateway = info->rti_info[RTAX_GATEWAY];
713 netmask = info->rti_info[RTAX_NETMASK];
715 if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
716 FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
719 if (dst && gateway && !nhop_check_gateway(dst->sa_family, gateway->sa_family)) {
720 FIB_RH_LOG(LOG_DEBUG, rnh,
721 "error: invalid dst/gateway family combination (%d, %d)",
722 dst->sa_family, gateway->sa_family);
726 if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
727 FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
732 if (info->rti_ifa == NULL) {
733 error = rt_getifa_fib(info, rnh->rib_fibnum);
738 if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
741 error = nhop_create_from_info(rnh, info, &nh);
743 rt_free_immediate(rt);
747 rnd_add.rnd_nhop = nh;
748 rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
750 int op_flags = RTM_F_CREATE;
751 if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
752 op_flags |= RTM_F_FORCE;
754 op_flags |= RTM_F_APPEND;
755 return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
760 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
761 int op_flags, struct rib_cmd_info *rc)
763 struct route_nhop_data rnd_orig;
764 struct nhop_object *nh;
765 struct rtentry *rt_orig;
768 nh = rnd_add->rnd_nhop;
772 rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
774 if (rt_orig == NULL) {
775 if (op_flags & RTM_F_CREATE)
776 error = add_route(rnh, rt, rnd_add, rc);
778 error = ESRCH; /* no entry but creation was not required */
785 if (op_flags & RTM_F_EXCL) {
786 /* We have existing route in the RIB but not allowed to replace. */
792 /* Now either append or replace */
793 if (op_flags & RTM_F_REPLACE) {
794 if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
795 /* Old path is "better" (e.g. has PINNED flag set) */
800 change_route(rnh, rt_orig, rnd_add, rc);
809 if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
810 nhop_can_multipath(rnd_add->rnd_nhop) &&
811 nhop_can_multipath(rnd_orig.rnd_nhop)) {
813 for (int i = 0; i < RIB_MAX_RETRIES; i++) {
814 error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
818 RTSTAT_INC(rts_add_retry);
822 * Original nhop reference is unused in any case.
824 nhop_free_any(rnd_add->rnd_nhop);
825 if (op_flags & RTM_F_CREATE) {
826 if (error != 0 || rc->rc_cmd != RTM_ADD)
827 rt_free_immediate(rt);
832 /* Out of options - free state and return error */
835 if (op_flags & RTM_F_CREATE)
836 rt_free_immediate(rt);
#ifdef ROUTE_MPATH
/*
 * Tries to append the path in @rnd_add to the paths in @rnd_orig and
 * switch the prefix of @rt to the resulting nexthop group.
 *
 * On EAGAIN, @rnd_orig has been refreshed from the table and the caller
 * is expected to retry.  Returns 0 on success or an errno value.
 */
static int
add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
    int op_flags, struct rib_cmd_info *rc)
{
	RIB_RLOCK_TRACKER;
	struct route_nhop_data rnd_new;
	int error = 0;

	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
	if (error != 0) {
		if (error == EAGAIN) {
			/*
			 * Group creation failed, most probably because
			 * @rnd_orig data got scheduled for deletion.
			 * Refresh @rnd_orig data and retry.
			 */
			RIB_RLOCK(rnh);
			lookup_prefix_rt(rnh, rt, rnd_orig);
			RIB_RUNLOCK(rnh);
			/*
			 * @rnd_orig itself is never NULL; a prefix that is
			 * gone shows up as a NULL nexthop after the refresh.
			 */
			if (rnd_orig->rnd_nhop == NULL &&
			    !(op_flags & RTM_F_CREATE)) {
				/* In this iteration route doesn't exist */
				error = ENOENT;
			}
		}
		return (error);
	}
	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
	if (error != 0)
		return (error);

	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
		/*
		 * First multipath route got installed. Enable local
		 * outbound connections hashing.
		 */
		if (bootverbose)
			printf("FIB: enabled flowid calculation for locally-originated packets\n");
		V_fib_hash_outbound = 1;
	}

	return (0);
}
#endif
889 * Removes route defined by @info from the kernel table specified by @fibnum and
890 * sa_family in @info->rti_info[RTAX_DST].
892 * Returns 0 on success and fills in operation metadata into @rc.
895 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
897 struct rib_head *rnh;
898 struct sockaddr *dst, *netmask;
899 struct sockaddr_storage mdst;
904 rnh = get_rnh(fibnum, info);
906 return (EAFNOSUPPORT);
908 bzero(rc, sizeof(struct rib_cmd_info));
909 rc->rc_cmd = RTM_DELETE;
911 dst = info->rti_info[RTAX_DST];
912 netmask = info->rti_info[RTAX_NETMASK];
914 if (netmask != NULL) {
915 /* Ensure @dst is always properly masked */
916 if (dst->sa_len > sizeof(mdst)) {
917 FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
920 rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
921 dst = (struct sockaddr *)&mdst;
924 rib_filter_f_t *filter_func = NULL;
925 void *filter_arg = NULL;
926 struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
928 if (info->rti_filter != NULL) {
929 filter_func = info->rti_filter;
930 filter_arg = info->rti_filterdata;
931 } else if (gwd.gw != NULL) {
932 filter_func = match_gw_one;
936 int prio = get_prio_from_info(info);
939 struct route_nhop_data rnd;
940 struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
942 error = rt_delete_conditional(rnh, rt, prio, filter_func,
951 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
953 if (rc->rc_cmd == RTM_DELETE)
958 * Deleting 1 path may result in RTM_CHANGE to
959 * a different mpath group/nhop.
960 * Free old mpath group.
962 nhop_free_any(rc->rc_nh_old);
970 * Conditionally unlinks rtentry paths from @rnh matching @cb.
971 * Returns 0 on success with operation result stored in @rc.
973 * ESRCH - if prefix was not found or filter function failed to match
974 * EADDRINUSE - if trying to delete higher priority route.
977 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
978 int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
980 struct nhop_object *nh = rt->rt_nhop;
983 if (NH_IS_NHGRP(nh)) {
984 struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
985 struct route_nhop_data rnd;
990 error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
992 if (rnd.rnd_nhgrp == nhg) {
993 /* No match, unreference new group and return. */
994 nhop_free_any(rnd.rnd_nhop);
997 error = change_route(rnh, rt, &rnd, rc);
1002 if (cb != NULL && !cb(rt, nh, cbdata))
1005 if (prio < nhop_get_prio(nh))
1006 return (EADDRINUSE);
1008 return (delete_route(rnh, rt, rc));
1012 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1013 struct rib_cmd_info *rc)
1016 struct route_nhop_data rnd_orig;
1017 struct rib_head *rnh;
1023 rnh = get_rnh(fibnum, info);
1025 return (EAFNOSUPPORT);
1027 bzero(rc, sizeof(struct rib_cmd_info));
1028 rc->rc_cmd = RTM_CHANGE;
1030 /* Check if updated gateway exists */
1031 if ((info->rti_flags & RTF_GATEWAY) &&
1032 (info->rti_info[RTAX_GATEWAY] == NULL)) {
1035 * route(8) adds RTF_GATEWAY flag if -interface is not set.
1036 * Remove RTF_GATEWAY to enforce consistency and maintain
1039 info->rti_flags &= ~RTF_GATEWAY;
1043 * route change is done in multiple steps, with dropping and
1044 * reacquiring lock. In the situations with multiple processes
1045 * changes the same route in can lead to the case when route
1046 * is changed between the steps. Address it by retrying the operation
1047 * multiple times before failing.
1051 rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1052 info->rti_info[RTAX_NETMASK], &rnh->head);
1059 rnd_orig.rnd_nhop = rt->rt_nhop;
1060 rnd_orig.rnd_weight = rt->rt_weight;
1064 for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1065 error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1066 if (error != EAGAIN)
1074 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1075 struct nhop_object *nh_orig, struct nhop_object **nh_new)
1080 * New gateway could require new ifaddr, ifp;
1081 * flags may also be different; ifp may be specified
1082 * by ll sockaddr when protocol address is ambiguous
1084 if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1085 info->rti_info[RTAX_GATEWAY] != NULL) ||
1086 info->rti_info[RTAX_IFP] != NULL ||
1087 (info->rti_info[RTAX_IFA] != NULL &&
1088 !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1089 error = rt_getifa_fib(info, rnh->rib_fibnum);
1092 info->rti_ifa = NULL;
1097 error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1098 info->rti_ifa = NULL;
#ifdef ROUTE_MPATH
/*
 * Changes one path of the multipath route @rt: finds the nexthop in the
 * group from @rnd_orig that matches @info, builds its replacement and
 * switches the prefix to a new group containing it.
 *
 * Returns 0 on success, ESRCH if no path matches, or another errno
 * value (EAGAIN asks the caller to retry).
 */
static int
change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
    struct rib_cmd_info *rc)
{
	int error = 0, found_idx = 0;
	struct nhop_object *nh_orig = NULL, *nh_new;
	struct route_nhop_data rnd_new = {};
	const struct weightened_nhop *wn = NULL;
	struct weightened_nhop *wn_new;
	uint32_t num_nhops;

	wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
	for (int i = 0; i < num_nhops; i++) {
		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
			nh_orig = wn[i].nh;
			found_idx = i;
			break;
		}
	}

	if (nh_orig == NULL)
		return (ESRCH);

	error = change_nhop(rnh, info, nh_orig, &nh_new);
	if (error != 0)
		return (error);

	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
	    M_TEMP, M_NOWAIT | M_ZERO);
	if (wn_new == NULL) {
		nhop_free(nh_new);
		return (EAGAIN);
	}

	/* Same paths as before, except for the one being changed. */
	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
	wn_new[found_idx].nh = nh_new;
	wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);

	error = nhgrp_get_group(rnh, wn_new, num_nhops, 0, &rnd_new.rnd_nhgrp);
	nhop_free(nh_new);
	free(wn_new, M_TEMP);

	if (error != 0)
		return (error);

	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);

	return (error);
}
#endif
1157 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1158 struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1159 struct rib_cmd_info *rc)
1162 struct nhop_object *nh_orig;
1163 struct route_nhop_data rnd_new;
1165 nh_orig = rnd_orig->rnd_nhop;
1166 if (nh_orig == NULL)
1170 if (NH_IS_NHGRP(nh_orig))
1171 return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1174 rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1175 error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1178 error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1184 * Insert @rt with nhop data from @rnd_new to @rnh.
1185 * Returns 0 on success and stores operation results in @rc.
1188 add_route(struct rib_head *rnh, struct rtentry *rt,
1189 struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1191 struct radix_node *rn;
1193 RIB_WLOCK_ASSERT(rnh);
1195 rt->rt_nhop = rnd->rnd_nhop;
1196 rt->rt_weight = rnd->rnd_weight;
1197 rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
1200 if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1201 tmproutes_update(rnh, rt, rnd->rnd_nhop);
1203 /* Finalize notification */
1205 rnh->rnh_prefixes++;
1207 rc->rc_cmd = RTM_ADD;
1209 rc->rc_nh_old = NULL;
1210 rc->rc_nh_new = rnd->rnd_nhop;
1211 rc->rc_nh_weight = rnd->rnd_weight;
1213 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1217 /* Existing route or memory allocation failure. */
1222 * Unconditionally deletes @rt from @rnh.
1225 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1227 RIB_WLOCK_ASSERT(rnh);
1229 /* Route deletion requested. */
1230 struct radix_node *rn;
1232 rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
1236 rt->rte_flags &= ~RTF_UP;
1239 rnh->rnh_prefixes--;
1241 rc->rc_cmd = RTM_DELETE;
1243 rc->rc_nh_old = rt->rt_nhop;
1244 rc->rc_nh_new = NULL;
1245 rc->rc_nh_weight = rt->rt_weight;
1247 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1253 * Switch @rt nhop/weigh to the ones specified in @rnd.
1254 * Returns 0 on success.
1257 change_route(struct rib_head *rnh, struct rtentry *rt,
1258 struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1260 struct nhop_object *nh_orig;
1262 RIB_WLOCK_ASSERT(rnh);
1264 nh_orig = rt->rt_nhop;
1266 if (rnd->rnd_nhop == NULL)
1267 return (delete_route(rnh, rt, rc));
1269 /* Changing nexthop & weight to a new one */
1270 rt->rt_nhop = rnd->rnd_nhop;
1271 rt->rt_weight = rnd->rnd_weight;
1272 if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1273 tmproutes_update(rnh, rt, rnd->rnd_nhop);
1275 /* Finalize notification */
1277 rc->rc_cmd = RTM_CHANGE;
1279 rc->rc_nh_old = nh_orig;
1280 rc->rc_nh_new = rnd->rnd_nhop;
1281 rc->rc_nh_weight = rnd->rnd_weight;
1283 rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1289 * Conditionally update route nhop/weight IFF data in @nhd_orig is
1290 * consistent with the current route data.
1291 * Nexthop in @nhd_new is consumed.
1294 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1295 struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1296 struct rib_cmd_info *rc)
1298 struct rtentry *rt_new;
1301 IF_DEBUG_LEVEL(LOG_DEBUG2) {
1302 char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1303 nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1304 nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1305 FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1306 "trying change %s -> %s", buf_old, buf_new);
1310 struct route_nhop_data rnd;
1311 rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1313 if (rt_new == NULL) {
1314 if (rnd_orig->rnd_nhop == NULL)
1315 error = add_route(rnh, rt, rnd_new, rc);
1318 * Prefix does not exist, which was not our assumption.
1319 * Update @rnd_orig with the new data and return
1321 rnd_orig->rnd_nhop = NULL;
1322 rnd_orig->rnd_weight = 0;
1326 /* Prefix exists, try to update */
1327 if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1329 * Nhop/mpath group hasn't changed. Flip
1330 * to the new precalculated one and return
1332 error = change_route(rnh, rt_new, rnd_new, rc);
1334 /* Update and retry */
1335 rnd_orig->rnd_nhop = rt_new->rt_nhop;
1336 rnd_orig->rnd_weight = rt_new->rt_weight;
1344 rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1346 if (rnd_orig->rnd_nhop != NULL)
1347 nhop_free_any(rnd_orig->rnd_nhop);
1350 if (rnd_new->rnd_nhop != NULL)
1351 nhop_free_any(rnd_new->rnd_nhop);
1358 * Performs modification of routing table specificed by @action.
1359 * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1360 * Needs to be run in network epoch.
1362 * Returns 0 on success and fills in @rc with action result.
1365 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1366 struct rib_cmd_info *rc)
1372 error = rib_add_route(fibnum, info, rc);
1375 error = rib_del_route(fibnum, info, rc);
1378 error = rib_change_route(fibnum, info, rc);
1389 struct rib_head *rnh;
1390 struct rtentry *head;
1391 rib_filter_f_t *filter_f;
1394 struct rib_cmd_info rc;
1398 * Conditionally unlinks rtenties or paths from radix tree based
1399 * on the callback data passed in @arg.
1402 rt_checkdelroute(struct radix_node *rn, void *arg)
1404 struct rt_delinfo *di = (struct rt_delinfo *)arg;
1405 struct rtentry *rt = (struct rtentry *)rn;
1407 if (rt_delete_conditional(di->rnh, rt, di->prio,
1408 di->filter_f, di->filter_arg, &di->rc) != 0)
1412 * Add deleted rtentries to the list to GC them
1413 * after dropping the lock.
1415 * XXX: Delayed notifications not implemented
1416 * for nexthop updates.
1418 if (di->rc.rc_cmd == RTM_DELETE) {
1419 /* Add to the list and return */
1420 rt->rt_chain = di->head;
1425 * RTM_CHANGE to a different nexthop or nexthop group.
1426 * Free old multipath group.
1428 nhop_free_any(di->rc.rc_nh_old);
1436 * Iterates over a routing table specified by @fibnum and @family and
1437 * deletes elements marked by @filter_f.
1438 * @fibnum: rtable id
1439 * @family: AF_ address family
1440 * @filter_f: function returning non-zero value for items to delete
1441 * @arg: data to pass to the @filter_f function
1442 * @report: true if rtsock notification is needed.
1445 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1448 struct rib_head *rnh;
1450 struct nhop_object *nh;
1451 struct epoch_tracker et;
1453 rnh = rt_tables_get_rnh(fibnum, family);
1457 struct rt_delinfo di = {
1459 .filter_f = filter_f,
1460 .filter_arg = filter_arg,
1461 .prio = NH_PRIORITY_NORMAL,
1464 NET_EPOCH_ENTER(et);
1467 rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1470 /* We might have something to reclaim. */
1471 bzero(&di.rc, sizeof(di.rc));
1472 di.rc.rc_cmd = RTM_DELETE;
1473 while (di.head != NULL) {
1475 di.head = rt->rt_chain;
1476 rt->rt_chain = NULL;
1480 di.rc.rc_nh_old = nh;
1481 rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1485 struct nhgrp_object *nhg;
1486 const struct weightened_nhop *wn;
1488 if (NH_IS_NHGRP(nh)) {
1489 nhg = (struct nhgrp_object *)nh;
1490 wn = nhgrp_get_nhops(nhg, &num_nhops);
1491 for (int i = 0; i < num_nhops; i++)
1492 rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1495 rt_routemsg(RTM_DELETE, rt, nh, fibnum);
1504 rt_delete_unconditional(struct radix_node *rn, void *arg)
1506 struct rtentry *rt = RNTORT(rn);
1507 struct rib_head *rnh = (struct rib_head *)arg;
1509 rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1510 if (RNTORT(rn) == rt)
1517 * Removes all routes from the routing table without executing notifications.
1518 * rtentres will be removed after the end of a current epoch.
1521 rib_flush_routes(struct rib_head *rnh)
1524 rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
1529 rib_flush_routes_family(int family)
1531 struct rib_head *rnh;
1533 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1534 if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1535 rib_flush_routes(rnh);
1540 rib_print_family(int family)