/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/sbuf.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>

#include <net/if_var.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

#include <machine/stdarg.h>
/*
 * Fib lookup framework.
 *
 * This framework enables accelerated longest-prefix-match lookups for the
 * routing tables by adding the ability to dynamically attach/detach lookup
 * algorithm implementations to/from the datapath.
 *
 * flm - fib lookup modules - implementation of a particular lookup algorithm
 * fd - fib data - instance of an flm bound to a specific routing table
 *
 * This file provides the main framework functionality.
 *
 * The following features are provided by the framework:
 * 1) nexthop abstraction -> provides transparent referencing, indexing
 *    and efficient idx->ptr mappings for nexthops and nexthop groups.
 * 2) routing table synchronisation
 * 3) dataplane attachment points
 * 4) automatic algorithm selection based on the provided preference.
 *
 * For each supported address family, there is an allocated array of fib_dp
 * structures, indexed by fib number. Each array entry contains a callback
 * function and its argument. This function will be called with a
 * family-specific lookup key, scope and the provided argument. This array
 * gets re-created every time a new algo instance gets created. Please take
 * a look at the replace_rtables_family() function for more details.
 */
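
/*
 * A minimal registration sketch (hypothetical "example" module and callback
 * names; the real in-tree users live in netinet/in_fib_algo.c and
 * netinet6/in6_fib_algo.c):
 *
 *	static struct fib_lookup_module flm_example = {
 *		.flm_name = "example",
 *		.flm_family = AF_INET,
 *		.flm_init_cb = example_init,
 *		.flm_destroy_cb = example_destroy,
 *		.flm_dump_rib_item_cb = example_dump_rib_item,
 *		.flm_dump_end_cb = example_dump_end,
 *		.flm_change_rib_item_cb = example_change_rib_item,
 *		.flm_get_pref = example_get_pref,
 *	};
 *
 * followed by a fib_module_register(&flm_example) call at load time.
 */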
SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Fib algorithm lookups");

/* Algorithm sync policy */

/* Time interval to bucket updates */
VNET_DEFINE(unsigned int, bucket_time_ms) = 50;
#define	V_bucket_time_ms	VNET(bucket_time_ms)
SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(bucket_time_ms), 0, "Time interval to calculate update rate");

/* Minimum update rate to delay sync */
VNET_DEFINE(unsigned int, bucket_change_threshold_rate) = 500;
#define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");

/* Max allowed delay to sync */
VNET_DEFINE(unsigned int, fib_max_sync_delay_ms) = 1000;
#define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");

#ifdef INET6
VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv6 longest prefix match lookups");
#endif
#ifdef INET
VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPv4 longest prefix match lookups");
#endif
/* Fib instance counter */
static uint32_t fib_gen = 0;

struct nhop_ref_table {
	uint32_t		count;
	uint32_t		refcnt[0];
};

enum fib_callout_action {
	FDA_NONE,	/* No callout scheduled */
	FDA_REBUILD,	/* Asks to rebuild algo instance */
	FDA_EVAL,	/* Asks to evaluate if the current algo is still the best */
	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
};

struct fib_sync_status {
	struct timeval		diverge_time;	/* ts when diverged */
	uint32_t		num_changes;	/* number of changes since sync */
	uint32_t		bucket_changes;	/* num changes within the current bucket */
	uint64_t		bucket_id;	/* 50ms bucket # */
	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
};
/*
 * Data structure for the fib lookup instance tied to the particular rib.
 */
struct fib_data {
	uint32_t		number_nhops;	/* current # of nhops */
	uint8_t			hit_nhops;	/* true if out of nhop limit */
	uint8_t			init_done;	/* true if init is completed */
	uint32_t		fd_dead:1;	/* Scheduled for deletion */
	uint32_t		fd_linked:1;	/* true if linked */
	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
	uint32_t		fd_batch:1;	/* true if batched notification scheduled */
	uint8_t			fd_family;	/* family */
	uint32_t		fd_fibnum;	/* fibnum */
	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
	uint32_t		fd_gen;		/* instance gen# */
	struct callout		fd_callout;	/* rebuild callout */
	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
	void			*fd_algo_data;	/* algorithm data */
	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
	struct rib_head		*fd_rh;		/* RIB table we're attached to */
	struct rib_subscription	*fd_rs;		/* storing table subscription */
	struct fib_dp		fd_dp;		/* fib datapath data */
	struct vnet		*fd_vnet;	/* vnet fib belongs to */
	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync */
	uint32_t		fd_num_changes;	/* number of changes since last callout */
	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
};
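
/*
 * Note: as the lock assertions in the functions below suggest, the
 * fd_dead/fd_linked/fd_need_rebuild/fd_batch bits are only flipped under
 * the RIB write lock (fd_dead/fd_linked additionally under FIB_MOD_LOCK()),
 * which is why plain bitfields appear to suffice here instead of atomics.
 */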
static bool rebuild_fd(struct fib_data *fd, const char *reason);
static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
static void handle_fd_callout(void *_data);
static void destroy_fd_instance_epoch(epoch_context_t ctx);
static bool is_idx_free(struct fib_data *fd, uint32_t index);
static void set_algo_fixed(struct rib_head *rh);
static bool is_algo_fixed(struct rib_head *rh);

static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);

static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
    struct fib_lookup_module *orig_flm);
static void fib_unref_algo(struct fib_lookup_module *flm);
static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
static struct mtx fib_mtx;
#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)

MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);

/* Algorithm has to be this percent better than the current to switch */
#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
/* Schedule algo re-evaluation X seconds after a change */
#define	ALGO_EVAL_DELAY_MS	30000
/* Force algo re-evaluation after X changes */
#define	ALGO_EVAL_NUM_ROUTES	100
/* Try to setup algorithm X times */
#define	FIB_MAX_TRIES		32
/* Max amount of supported nexthops */
#define	FIB_MAX_NHOPS		262144
#define	FIB_CALLOUT_DELAY_MS	50
static int flm_debug_level = LOG_NOTICE;
SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
    &flm_debug_level, 0, "debuglevel");
#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
#ifndef	LOG_DEBUG2
#define	LOG_DEBUG2	8
#endif

#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
#define	ALGO_PRINTF(_fmt, ...)	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
#define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
    print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
}
#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
#define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {	\
    _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
	_fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);	\
}
#if FLM_MAX_DEBUG_LEVEL >= LOG_DEBUG2
#define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
#endif
#if FLM_MAX_DEBUG_LEVEL >= LOG_DEBUG
#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_DEBUG(_l, _fd, _fmt, ...)
#endif
#if FLM_MAX_DEBUG_LEVEL >= LOG_INFO
#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
#else
#define	FD_PRINTF_LOG_INFO(_l, _fd, _fmt, ...)
#endif
#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF
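
/*
 * Usage note: FD_PRINTF(_l, ...) pastes the level into the macro name, so
 * the level argument must be a literal (LOG_DEBUG2..LOG_WARNING). With the
 * defaults above, FD_PRINTF(LOG_DEBUG2, ...) compiles to nothing (LOG_DEBUG2
 * is above FLM_MAX_DEBUG_LEVEL), while e.g. FD_PRINTF(LOG_INFO, ...) is
 * compiled in and printed once net.route.algo.debug_level is raised to
 * LOG_INFO (6) or higher.
 */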
/* List of all registered lookup algorithms */
static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);

/* List of all fib lookup instances in the vnet */
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
#define	V_fib_data_list	VNET(fib_data_list)

/* Data structure for storing non-transient fib lookup module failures */
struct fib_error {
	int				fe_family;
	uint32_t			fe_fibnum;	/* failed rtable */
	struct fib_lookup_module	*fe_flm;	/* failed module */
	TAILQ_ENTRY(fib_error)		entries;	/* list of all errored entries */
};
VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
#define	V_fib_error_list	VNET(fib_error_list)

/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
struct fib_dp_header {
	struct epoch_context	fdh_epoch_ctx;
	uint32_t		fdh_num_tables;
	struct fib_dp		fdh_idx[0];
};
/*
 * Tries to add new non-transient algorithm error to the list of errors.
 * Returns true on success.
 */
static bool
flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
{
	struct fib_error *fe;

	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
	if (fe == NULL)
		return (false);
	fe->fe_flm = flm;
	fe->fe_family = flm->flm_family;
	fe->fe_fibnum = fibnum;

	FIB_MOD_LOCK();
	/* Avoid duplicates by checking if error already exists first */
	if (flm_error_check(flm, fibnum)) {
		FIB_MOD_UNLOCK();
		free(fe, M_TEMP);
		return (true);
	}
	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
	FIB_MOD_UNLOCK();

	return (true);
}
/*
 * True if non-transient error has been registered for @flm in @fibnum.
 */
static bool
flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
{
	const struct fib_error *fe;

	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
			return (true);
	}

	return (false);
}

/*
 * Clears all errors of algo specified by @flm.
 */
static void
fib_error_clear_flm(struct fib_lookup_module *flm)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		if (fe->fe_flm == flm) {
			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
			free(fe, M_TEMP);
		}
	}
}

/*
 * Clears all errors in the current VNET.
 */
static void
fib_error_clear(void)
{
	struct fib_error *fe, *fe_tmp;

	FIB_MOD_LOCK_ASSERT();

	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
		free(fe, M_TEMP);
	}
}
static const char *
print_op_result(enum flm_op_result result)
{
	switch (result) {
	case FLM_SUCCESS:
		return ("success");
	case FLM_REBUILD:
		return ("rebuild");
	case FLM_BATCH:
		return ("batch");
	case FLM_ERROR:
		return ("error");
	}

	return ("unknown");
}

static const char *
print_family(int family)
{

	if (family == AF_INET)
		return ("inet");
	else if (family == AF_INET6)
		return ("inet6");
	else
		return ("unknown");
}
/*
 * Debug function used by lookup algorithms.
 * Outputs a message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
 */
void
fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
{
	char buf[96];
	va_list ap;

	if (level > flm_debug_level)
		return;

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
	    fd->fd_gen, func, "%s", buf);
}
/*
 * Outputs the list of algorithms supported by the provided address family.
 */
static int
print_algos_sysctl(struct sysctl_req *req, int family)
{
	struct fib_lookup_module *flm;
	struct sbuf sbuf;
	int error, count = 0;

	error = sysctl_wire_old_buffer(req, 0);
	if (error == 0) {
		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
		TAILQ_FOREACH(flm, &all_algo_list, entries) {
			if (flm->flm_family == family) {
				if (count++ > 0)
					sbuf_cat(&sbuf, ", ");
				sbuf_cat(&sbuf, flm->flm_name);
			}
		}
		error = sbuf_finish(&sbuf);
		sbuf_delete(&sbuf);
	}
	return (error);
}

#ifdef INET6
static int
print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET6));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
#endif

#ifdef INET
static int
print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
{

	return (print_algos_sysctl(req, AF_INET));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
#endif
/*
 * Calculates the delay between repeated failures.
 * Returns current delay in milliseconds.
 */
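/*
 * A quick sketch of the resulting schedule with FIB_CALLOUT_DELAY_MS == 50:
 * 0 failed rebuilds -> 50ms, 1 -> 100ms, 5 -> 1.6s, capped at 10 or more
 * failures -> (1 << 10) * 50ms, roughly 51 seconds between retries.
 */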
static uint32_t
callout_calc_delay_ms(struct fib_data *fd)
{
	uint32_t shift;

	if (fd->fd_failed_rebuilds > 10)
		shift = 10;
	else
		shift = fd->fd_failed_rebuilds;

	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
}
static void
schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
{

	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
	fd->fd_callout_action = action;
	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
	    handle_fd_callout, fd, 0);
}
static void
schedule_fd_rebuild(struct fib_data *fd, const char *reason)
{
	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (!fd->fd_need_rebuild) {
		fd->fd_need_rebuild = true;
		/* Stop batch updates */
		fd->fd_batch = false;

		/*
		 * Potentially re-schedules pending callout
		 * initiated by schedule_algo_eval.
		 */
		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
		    reason, fd->fd_failed_rebuilds);
		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
	}
}
static int64_t
get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
{
	int64_t diff;

	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;

	return (diff);
}

static void
add_tv_diff_ms(struct timeval *tv, int ms)
{
	tv->tv_sec += ms / 1000;
	ms = ms % 1000;
	if (ms * 1000 + tv->tv_usec < 1000000)
		tv->tv_usec += ms * 1000;
	else {
		tv->tv_sec += 1;
		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
	}
}
/*
 * Marks the time when the algo state diverges from the rib state.
 */
static void
mark_diverge_time(struct fib_data *fd)
{
	struct fib_sync_status *fd_ss = &fd->fd_ss;

	getmicrouptime(&fd_ss->diverge_time);
	fd_ss->bucket_id = 0;
	fd_ss->bucket_changes = 0;
}
/*
 * Calculates and updates the next algorithm sync time, based on the current activity.
 *
 * The intent is to provide a reasonable balance between the update
 * latency and efficient batching when changing a large amount of routes.
 *
 * The high-level algorithm looks like the following:
 * 1) all changes are bucketed in 50ms intervals
 * 2) If the amount of changes within the bucket is greater than the threshold,
 *    the update gets delayed, up to the maximum delay threshold.
 */
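/*
 * Worked example with the default sysctl values: bucket_time_ms=50 and
 * bucket_change_threshold_rate=500 give a per-bucket threshold of
 * 500 * 50 / 1000 = 25 changes. Once 25 changes land in the same 50ms
 * bucket, the sync is pushed out to (bucket_id + 2) * 50ms after the
 * divergence point, but never beyond fib_max_sync_delay_ms (1s by default).
 */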
static void
update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
{
	uint32_t bucket_id, new_delay = 0;
	struct timeval tv;

	/* Fetch all variables at once to ensure consistent reads */
	uint32_t bucket_time_ms = V_bucket_time_ms;
	uint32_t threshold_rate = V_bucket_change_threshold_rate;
	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;

	if (bucket_time_ms == 0)
		bucket_time_ms = 50;
	/* calculate per-bucket threshold rate */
	threshold_rate = threshold_rate * bucket_time_ms / 1000;

	getmicrouptime(&tv);

	struct fib_sync_status *fd_ss = &fd->fd_ss;

	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;

	if (fd_ss->bucket_id == bucket_id) {
		fd_ss->bucket_changes++;
		if (fd_ss->bucket_changes == threshold_rate) {
			new_delay = (bucket_id + 2) * bucket_time_ms;
			if (new_delay <= max_delay_ms) {
				FD_PRINTF(LOG_DEBUG, fd,
				    "hit threshold of %u routes, delay update, "
				    "bucket: %u, total delay: %u",
				    threshold_rate, bucket_id + 1, new_delay);
			} else {
				new_delay = 0;
				FD_PRINTF(LOG_DEBUG, fd,
				    "maximum sync delay (%u ms) reached", max_delay_ms);
			}
		}
	} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
		new_delay = bucket_time_ms;
	else {
		fd_ss->bucket_id = bucket_id;
		fd_ss->bucket_changes = 1;
	}

	if (new_delay > 0) {
		/* Calculated time has been updated */
		struct timeval new_tv = fd_ss->diverge_time;
		add_tv_diff_ms(&new_tv, new_delay);

		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
		schedule_callout(fd, action, delay_ms);
	}
}
static void
update_algo_state(struct fib_data *fd)
{

	RIB_WLOCK_ASSERT(fd->fd_rh);

	if (fd->fd_batch || fd->fd_need_rebuild) {
		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
		update_rebuild_delay(fd, action);
		return;
	}

	if (fd->fd_num_changes++ == 0) {
		/* Start callout to consider switch */
		if (!callout_pending(&fd->fd_callout))
			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
		/* Reset callout to exec immediately */
		if (fd->fd_callout_action == FDA_EVAL)
			schedule_callout(fd, FDA_EVAL, 1);
	}
}
static bool
need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
{
	struct nhop_object *nh;

	/* Sync addition/removal of interface routes */
	switch (rc->rc_cmd) {
	case RIB_ADD:
		nh = rc->rc_nh_new;
		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
			return (true);
		break;
	case RIB_DEL:
		nh = rc->rc_nh_old;
		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
			return (true);
		break;
	}

	return (false);
}
static bool
apply_rtable_changes(struct fib_data *fd)
{
	enum flm_op_result result;
	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;

	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);

	if (result == FLM_SUCCESS) {
		for (int i = 0; i < q->count; i++)
			if (q->entries[i].nh_old)
				fib_unref_nhop(fd, q->entries[i].nh_old);
		q->count = 0;
	}
	fd->fd_batch = false;

	return (result == FLM_SUCCESS);
}
static bool
fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
{
	int plen = 0;

	switch (fd->fd_family) {
	case AF_INET:
		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
		break;
	case AF_INET6:
		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
		break;
	}

	ce->plen = plen;
	ce->nh_old = rc->rc_nh_old;
	ce->nh_new = rc->rc_nh_new;
	if (ce->nh_new != NULL) {
		if (fib_ref_nhop(fd, ce->nh_new) == 0)
			return (false);
	}

	return (true);
}
static bool
queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
{
	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;

	if (q->count >= q->size) {
		uint32_t q_size;

		if (q->size == 0)
			q_size = 256;	/* ~18k memory */
		else
			q_size = q->size * 2;

		size_t size = q_size * sizeof(struct fib_change_entry);
		void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
		if (a == NULL) {
			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
			    q_size);
			return (false);
		}
		q->entries = a;
		q->size = q_size;
	}

	return (fill_change_entry(fd, &q->entries[q->count++], rc));
}
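
/*
 * Note: a false return here (entry fill failure or failed reallocation)
 * makes the subscription callback below fall back to a full algo rebuild,
 * so a dropped batch entry never leaves the lookup structures silently
 * stale.
 */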
/*
 * Rib subscription handler. Checks if the algorithm is ready to
 * receive updates, handles nexthop refcounting and passes change
 * data to the algorithm callback.
 */
static void
handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	enum flm_op_result result;

	RIB_WLOCK_ASSERT(rnh);

	/*
	 * There is a small gap between subscribing for route changes
	 * and initiating rtable dump. Avoid receiving route changes
	 * prior to finishing rtable dump by checking `init_done`.
	 */
	if (!fd->init_done)
		return;

	bool immediate_sync = need_immediate_sync(fd, rc);

	/* Consider scheduling algorithm re-evaluation */
	update_algo_state(fd);

	/*
	 * If algo requested rebuild, stop sending updates by default.
	 * This simplifies nexthop refcount handling logic.
	 */
	if (fd->fd_need_rebuild) {
		if (immediate_sync)
			rebuild_fd(fd, "rtable change type enforced sync");
		return;
	}

	/*
	 * Algo requested updates to be delivered in batches.
	 * Add the current change to the queue and return.
	 */
	if (fd->fd_batch) {
		if (immediate_sync) {
			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
				rebuild_fd(fd, "batch sync failed");
		} else {
			if (!queue_rtable_change(fd, rc))
				schedule_fd_rebuild(fd, "batch queue failed");
		}
		return;
	}

	/*
	 * Maintain guarantee that every nexthop returned by the dataplane
	 * lookup has > 0 refcount, so it can be safely referenced within the
	 * current epoch.
	 */
	if (rc->rc_nh_new != NULL) {
		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
			/* ran out of indexes */
			schedule_fd_rebuild(fd, "ran out of nhop indexes");
			return;
		}
	}

	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);

	switch (result) {
	case FLM_SUCCESS:
		/* Unref old nexthop on success */
		if (rc->rc_nh_old != NULL)
			fib_unref_nhop(fd, rc->rc_nh_old);
		break;
	case FLM_BATCH:

		/*
		 * Algo asks to batch the changes.
		 */
		if (queue_rtable_change(fd, rc)) {
			if (!immediate_sync) {
				mark_diverge_time(fd);
				update_rebuild_delay(fd, FDA_BATCH);
				break;
			}
			if (apply_rtable_changes(fd))
				break;
		}
		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");
		/* FALLTHROUGH */
	case FLM_REBUILD:

		/*
		 * Algo is not able to apply the update.
		 * Schedule algo rebuild.
		 */
		if (!immediate_sync) {
			mark_diverge_time(fd);
			schedule_fd_rebuild(fd, "algo requested rebuild");
			break;
		}

		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
		rebuild_fd(fd, "rtable change type enforced sync");
		break;
	case FLM_ERROR:

		/*
		 * Algo reported a non-recoverable error.
		 * Record the error and schedule rebuild, which will
		 * trigger best algo selection.
		 */
		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
		break;
	}
}
static void
estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
{

	if (old_fd == NULL) {
		// TODO: read from rtable
		fd->number_nhops = 16;
		return;
	}

	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
		fd->number_nhops = 2 * old_fd->number_nhops;
	else
		fd->number_nhops = old_fd->number_nhops;
}

struct walk_cbdata {
	struct fib_data		*fd;
	flm_dump_t		*func;
	enum flm_op_result	result;
};
/*
 * Handler called after all rtentries have been dumped.
 * Performs post-dump framework checks and calls
 * algo:flm_dump_end_cb().
 *
 * Updates walk_cbdata result.
 */
static void
sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;
	struct fib_data *fd = w->fd;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (rnh->rib_dying) {
		w->result = FLM_ERROR;
		return;
	}

	if (fd->hit_nhops) {
		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
		    fd->nh_ref_table->count);
		if (w->result == FLM_SUCCESS)
			w->result = FLM_REBUILD;
		return;
	}

	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
		return;

	/* Post-dump hook, dump successful */
	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);

	if (w->result == FLM_SUCCESS) {
		/* Mark init as done to allow routing updates */
		fd->init_done = 1;
	}
}
/*
 * Callback for each entry in rib.
 * Calls algo:flm_dump_rib_item_cb func as a part of initial
 * route table synchronisation.
 */
static int
sync_algo_cb(struct rtentry *rt, void *_data)
{
	struct walk_cbdata *w = (struct walk_cbdata *)_data;

	RIB_WLOCK_ASSERT(w->fd->fd_rh);

	if (w->result == FLM_SUCCESS && w->func) {

		/*
		 * Reference nexthops to maintain guarantee that
		 * each nexthop returned by datapath has > 0 references
		 * and can be safely referenced within current epoch.
		 */
		struct nhop_object *nh = rt_get_raw_nhop(rt);
		if (fib_ref_nhop(w->fd, nh) != 0)
			w->result = w->func(rt, w->fd->fd_algo_data);
		else
			w->result = FLM_REBUILD;
	}

	return (0);
}
/*
 * Dumps all routing table state to the algo instance.
 */
static enum flm_op_result
sync_algo(struct fib_data *fd)
{
	struct walk_cbdata w = {
		.fd = fd,
		.func = fd->fd_flm->flm_dump_rib_item_cb,
		.result = FLM_SUCCESS,
	};

	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);

	FD_PRINTF(LOG_INFO, fd,
	    "initial dump completed (rtable version: %d), result: %s",
	    fd->fd_rh->rnh_gen, print_op_result(w.result));

	return (w.result);
}
/*
 * Schedules epoch-backed @fd instance deletion.
 * * Unlinks @fd from the list of active algo instances.
 * * Removes rib subscription.
 * * Stops callout.
 * * Schedules actual deletion.
 *
 * Assume @fd is already unlinked from the datapath.
 */
static void
schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
{
	bool is_dead;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	FIB_MOD_LOCK();
	is_dead = fd->fd_dead;
	if (!is_dead)
		fd->fd_dead = true;
	if (fd->fd_linked) {
		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
		fd->fd_linked = false;
	}
	FIB_MOD_UNLOCK();
	if (is_dead)
		return;

	FD_PRINTF(LOG_INFO, fd, "DETACH");

	if (fd->fd_rs != NULL)
		rib_unsibscribe_locked(fd->fd_rs);

	/*
	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
	 * will be executed, hence no _new_ callout schedules will happen.
	 */
	callout_stop(&fd->fd_callout);

	fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);
}
/*
 * Wipes all fd instances from the list matching rib specified by @rh.
 * If @keep_first is set, remove all but the first record.
 */
static void
fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
{
	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
	struct fib_data *fd, *fd_tmp;
	struct epoch_tracker et;

	FIB_MOD_LOCK();
	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
		if (fd->fd_rh == rh) {
			if (keep_first) {
				keep_first = false;
				continue;
			}
			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
			fd->fd_linked = false;
			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
		}
	}
	FIB_MOD_UNLOCK();

	/* Pass 2: remove each entry */
	NET_EPOCH_ENTER(et);
	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
		RIB_WLOCK(fd->fd_rh);
		schedule_destroy_fd_instance(fd, in_callout);
		RIB_WUNLOCK(fd->fd_rh);
	}
	NET_EPOCH_EXIT(et);
}
void
fib_destroy_rib(struct rib_head *rh)
{

	/*
	 * rnh has the `is_dying` flag set, so setup of new fd's will fail at
	 * the sync_algo() stage, preventing new entries from being added to
	 * the list of active algos. Remove all existing entries for the
	 * particular rib.
	 */
	fib_cleanup_algo(rh, false, false);
}
/*
 * Finalises fd destruction by freeing all fd resources.
 */
static void
destroy_fd_instance(struct fib_data *fd)
{

	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);

	/* Call destroy callback first */
	if (fd->fd_algo_data != NULL)
		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);

	/* Nhop table */
	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
		for (int i = 0; i < fd->number_nhops; i++) {
			if (!is_idx_free(fd, i)) {
				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
				    i, fd->nh_idx[i]);
				nhop_free_any(fd->nh_idx[i]);
			}
		}
		free(fd->nh_idx, M_RTABLE);
	}
	if (fd->nh_ref_table != NULL)
		free(fd->nh_ref_table, M_RTABLE);

	if (fd->fd_ss.fd_change_queue.entries != NULL)
		free(fd->fd_ss.fd_change_queue.entries, M_TEMP);

	fib_unref_algo(fd->fd_flm);

	free(fd, M_RTABLE);
}
/*
 * Epoch callback indicating fd is safe to destroy
 */
static void
destroy_fd_instance_epoch(epoch_context_t ctx)
{
	struct fib_data *fd;

	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);

	destroy_fd_instance(fd);
}
/*
 * Tries to setup fd instance.
 * - Allocates fd/nhop table
 * - Runs algo:flm_init_cb algo init
 * - Subscribes fd to the rib
 * - Runs rtable dump
 * - Adds instance to the list of active instances.
 *
 * Returns: operation result. Fills in @pfd with resulting fd on success.
 */
static enum flm_op_result
try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *old_fd, struct fib_data **pfd)
{
	struct fib_data *fd;
	size_t size;
	enum flm_op_result result;

	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd == NULL) {
		*pfd = NULL;
		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
		return (FLM_REBUILD);
	}
	*pfd = fd;

	estimate_nhop_scale(old_fd, fd);

	fd->fd_rh = rh;
	fd->fd_gen = ++fib_gen;
	fd->fd_family = rh->rib_family;
	fd->fd_fibnum = rh->rib_fibnum;
	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
	fd->fd_vnet = curvnet;
	fd->fd_flm = flm;

	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);

	FIB_MOD_LOCK();
	flm->flm_refcount++;
	FIB_MOD_UNLOCK();

	/* Allocate nhidx -> nhop_ptr table */
	size = fd->number_nhops * sizeof(void *);
	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_idx == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
		return (FLM_REBUILD);
	}

	/* Allocate nhop index refcount table */
	size = sizeof(struct nhop_ref_table);
	size += fd->number_nhops * sizeof(uint32_t);
	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
	if (fd->nh_ref_table == NULL) {
		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
		return (FLM_REBUILD);
	}
	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);

	/* Okay, we're ready for algo init */
	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
		return (result);
	}

	/* Try to subscribe */
	if (flm->flm_change_rib_item_cb != NULL) {
		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
		if (fd->fd_rs == NULL) {
			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
			return (FLM_REBUILD);
		}
	}

	/* Dump */
	result = sync_algo(fd);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
		return (result);
	}
	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");

	FIB_MOD_LOCK();
	/*
	 * Insert fd in the beginning of a list, to maintain invariant
	 * that first matching entry for the AF/fib is always the active
	 * one.
	 */
	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
	fd->fd_linked = true;
	FIB_MOD_UNLOCK();

	return (FLM_SUCCESS);
}
/*
 * Sets up algo @flm for table @rh and links it to the datapath.
 */
static enum flm_op_result
setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
    struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
{
	struct fib_data *prev_fd, *new_fd;
	enum flm_op_result result;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(rh);

	prev_fd = orig_fd;
	new_fd = NULL;
	for (int i = 0; i < FIB_MAX_TRIES; i++) {
		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);

		if ((result == FLM_SUCCESS) && attach) {
			if (!fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
				result = FLM_REBUILD;
		}

		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
			schedule_destroy_fd_instance(prev_fd, false);
			prev_fd = NULL;
		}

		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
		    print_op_result(result));

		if (result == FLM_REBUILD) {
			prev_fd = new_fd;
			new_fd = NULL;
			continue;
		}

		break;
	}

	if (result != FLM_SUCCESS) {
		RH_PRINTF(LOG_WARNING, rh,
		    "%s algo instance setup failed, failures=%d", flm->flm_name,
		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
		/* update failure count */
		FIB_MOD_LOCK();
		if (orig_fd != NULL)
			orig_fd->fd_failed_rebuilds++;
		FIB_MOD_UNLOCK();

		/* Ban algo on non-recoverable error */
		if (result == FLM_ERROR)
			flm_error_add(flm, rh->rib_fibnum);

		if ((prev_fd != NULL) && (prev_fd != orig_fd))
			schedule_destroy_fd_instance(prev_fd, false);
		if (new_fd != NULL) {
			schedule_destroy_fd_instance(new_fd, false);
			new_fd = NULL;
		}
	}

	*pfd = new_fd;
	return (result);
}
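
/*
 * Note: the retry loop above converges because a failed attempt feeds the
 * previous instance into the next try_setup_fd_instance() call, where
 * estimate_nhop_scale() doubles the nexthop table sizing whenever the
 * previous instance ran out of indexes (hit_nhops).
 */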
/*
 * Tries to sync the algo with the current rtable state, either
 * by executing a batch update or rebuilding.
 * Returns true on success.
 */
static bool
execute_callout_action(struct fib_data *fd)
{
	enum fib_callout_action action = fd->fd_callout_action;
	struct fib_lookup_module *flm_new = NULL;
	bool result = true;

	NET_EPOCH_ASSERT();
	RIB_WLOCK_ASSERT(fd->fd_rh);

	fd->fd_need_rebuild = false;
	fd->fd_batch = false;
	fd->fd_num_changes = 0;

	/* First, check if we're still OK to use this algo */
	if (!is_algo_fixed(fd->fd_rh))
		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
	if (flm_new != NULL)
		action = FDA_REBUILD;

	if (action == FDA_BATCH) {
		/* Try to sync */
		if (!apply_rtable_changes(fd))
			action = FDA_REBUILD;
	}

	if (action == FDA_REBUILD)
		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
	if (flm_new != NULL)
		fib_unref_algo(flm_new);

	return (result);
}
/*
 * Callout for all scheduled fd-related work.
 * - Checks if the current algo is still the best algo
 * - Synchronises algo instance to the rtable (batch usecase)
 * - Creates a new instance of an algo for af/fib if desired.
 */
static void
handle_fd_callout(void *_data)
{
	struct fib_data *fd = (struct fib_data *)_data;
	struct epoch_tracker et;

	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);

	NET_EPOCH_ENTER(et);
	CURVNET_SET(fd->fd_vnet);
	execute_callout_action(fd);
	CURVNET_RESTORE();
	NET_EPOCH_EXIT(et);
}
/*
 * Tries to create a new algo instance based on @fd data.
 * Returns true on success.
 */
static bool
rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
{
	struct fib_data *fd_new, *fd_tmp = NULL;
	enum flm_op_result result;

	if (flm_new == fd->fd_flm)
		fd_tmp = fd;
	else
		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);

	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
	if (result != FLM_SUCCESS) {
		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
		return (false);
	}
	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");

	/* Remove old instance */
	schedule_destroy_fd_instance(fd, true);

	return (true);
}
static bool
rebuild_fd(struct fib_data *fd, const char *reason)
{
	struct fib_lookup_module *flm_new = NULL;
	bool result;

	if (!is_algo_fixed(fd->fd_rh))
		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);

	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
	if (flm_new != NULL)
		fib_unref_algo(flm_new);

	if (!result) {
		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
		schedule_fd_rebuild(fd, "sync rebuild failed");
	}

	return (result);
}
/*
 * Finds algo by name/family.
 * Returns referenced algo or NULL.
 */
static struct fib_lookup_module *
fib_find_algo(const char *algo_name, int family)
{
	struct fib_lookup_module *flm;

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if ((strcmp(flm->flm_name, algo_name) == 0) &&
		    (family == flm->flm_family)) {
			flm->flm_refcount++;
			FIB_MOD_UNLOCK();
			return (flm);
		}
	}
	FIB_MOD_UNLOCK();

	return (NULL);
}

static void
fib_unref_algo(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	flm->flm_refcount--;
	FIB_MOD_UNLOCK();
}
static int
set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
{
	struct fib_lookup_module *flm = NULL;
	struct fib_data *fd = NULL;
	char old_algo_name[32], algo_name[32];
	struct rib_head *rh = NULL;
	enum flm_op_result result;
	struct epoch_tracker et;
	int error;

	/* Fetch current algo/rib for af/family */
	FIB_MOD_LOCK();
	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
			break;
	}
	if (fd == NULL) {
		FIB_MOD_UNLOCK();
		return (ENOENT);
	}
	rh = fd->fd_rh;
	strlcpy(old_algo_name, fd->fd_flm->flm_name,
	    sizeof(old_algo_name));
	FIB_MOD_UNLOCK();

	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (strcmp(algo_name, old_algo_name) == 0)
		return (0);

	/* New algorithm name is different */
	flm = fib_find_algo(algo_name, family);
	if (flm == NULL) {
		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
		return (ESRCH);
	}

	fd = NULL;
	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rh);
	result = setup_fd_instance(flm, rh, NULL, &fd, true);
	RIB_WUNLOCK(rh);
	NET_EPOCH_EXIT(et);
	fib_unref_algo(flm);
	if (result != FLM_SUCCESS)
		return (EINVAL);

	/* Disable automated jumping between algos */
	FIB_MOD_LOCK();
	set_algo_fixed(rh);
	FIB_MOD_UNLOCK();
	/* Remove old instance(s) */
	fib_cleanup_algo(rh, true, false);

	/* Drain cb so user can unload the module after userret if so desired */
	epoch_drain_callbacks(net_epoch_preempt);

	return (0);
}
#ifdef INET
static int
set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
#endif

#ifdef INET6
static int
set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
{

	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
}
SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
#endif
static void
destroy_fdh_epoch(epoch_context_t ctx)
{
	struct fib_dp_header *fdh;

	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
	free(fdh, M_RTABLE);
}

static struct fib_dp_header *
alloc_fib_dp_array(uint32_t num_tables, bool waitok)
{
	size_t sz;
	struct fib_dp_header *fdh;

	sz = sizeof(struct fib_dp_header);
	sz += sizeof(struct fib_dp) * num_tables;
	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (fdh != NULL)
		fdh->fdh_num_tables = num_tables;
	return (fdh);
}

static struct fib_dp_header *
get_fib_dp_header(struct fib_dp *dp)
{

	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
}
/*
 * Replaces per-family index pool @pdp with a new one which
 * contains updated callback/algo data from @fd.
 * Returns true on success.
 */
static bool
replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp)
{
	struct fib_dp_header *new_fdh, *old_fdh;

	NET_EPOCH_ASSERT();

	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
	    curvnet, dp->f, dp->arg);

	old_fdh = get_fib_dp_header(*pdp);

	if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) {
		/*
		 * Function is the same, data pointer needs update.
		 * Perform in-line replace without reallocation.
		 */
		old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg;
		FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh);
		return (true);
	}

	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
	if (new_fdh == NULL) {
		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
		return (false);
	}

	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	/* Update relevant data structure for @fd */
	new_fdh->fdh_idx[fd->fd_fibnum] = *dp;

	/* Ensure memcpy() writes have completed */
	atomic_thread_fence_rel();
	/* Set new datapath pointer */
	*pdp = &new_fdh->fdh_idx[0];

	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);

	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);

	return (true);
}
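
/*
 * The swap above follows the usual epoch(9) publish pattern: datapath
 * readers dereference *pdp within the network epoch, so the old array may
 * still be in use right after the pointer update; it is reclaimed via the
 * epoch callback only once all in-flight readers have exited.
 */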
static struct fib_dp **
get_family_dp_ptr(int family)
{
	switch (family) {
	case AF_INET:
		return (&V_inet_dp);
	case AF_INET6:
		return (&V_inet6_dp);
	}
	return (NULL);
}
/*
 * Makes the datapath use fib instance @fd
 */
bool
fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp)
{
	struct fib_dp **pdp;

	pdp = get_family_dp_ptr(fd->fd_family);
	return (replace_rtables_family(pdp, fd, dp));
}
/*
 * Grows the datapath pointers array.
 * Called from the sysctl handler on growing number of routing tables.
 */
static void
grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
{
	struct fib_dp_header *new_fdh, *old_fdh = NULL;

	new_fdh = alloc_fib_dp_array(new_num_tables, true);

	FIB_MOD_LOCK();
	if (*pdp != NULL) {
		old_fdh = get_fib_dp_header(*pdp);
		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
	}

	/* Wait till all writes completed */
	atomic_thread_fence_rel();

	*pdp = &new_fdh->fdh_idx[0];
	FIB_MOD_UNLOCK();

	if (old_fdh != NULL)
		fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
}
/*
 * Grows per-AF arrays of datapath pointers for each supported family.
 * Called from the fibs resize sysctl handler.
 */
void
fib_grow_rtables(uint32_t new_num_tables)
{

#ifdef INET
	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
#endif
#ifdef INET6
	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
#endif
}
void
fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
{

	bzero(rinfo, sizeof(struct rib_rtable_info));
	rinfo->num_prefixes = rh->rnh_prefixes;
	rinfo->num_nhops = nhops_get_count(rh);
#ifdef ROUTE_MPATH
	rinfo->num_nhgrp = nhgrp_get_count(rh);
#endif
}
/*
 * Updates the pointer to the algo data for the @fd.
 */
void
fib_set_algo_ptr(struct fib_data *fd, void *algo_data)
{
	RIB_WLOCK_ASSERT(fd->fd_rh);

	fd->fd_algo_data = algo_data;
}

/*
 * Calls @callback with @ctx after the end of a current epoch.
 */
void
fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx)
{
	epoch_call(net_epoch_preempt, callback, ctx);
}
/*
 * Accessor to get the rib instance @fd is attached to.
 */
struct rib_head *
fib_get_rh(struct fib_data *fd)
{

	return (fd->fd_rh);
}

/*
 * Accessor to export the idx->nhop array
 */
struct nhop_object **
fib_get_nhop_array(struct fib_data *fd)
{

	return (fd->nh_idx);
}
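
/*
 * With ROUTE_MPATH, plain nexthops and nexthop groups share a single index
 * space: nexthops occupy even slots (idx * 2) and groups odd slots
 * (idx * 2 - 1), letting both share the idx->ptr array and the refcount
 * table above.
 */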
static uint32_t
get_nhop_idx(struct nhop_object *nh)
{
#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh))
		return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
	else
		return (nhop_get_idx(nh) * 2);
#else
	return (nhop_get_idx(nh));
#endif
}

uint32_t
fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
{

	return (get_nhop_idx(nh));
}
static bool
is_idx_free(struct fib_data *fd, uint32_t index)
{

	return (fd->nh_ref_table->refcnt[index] == 0);
}

static uint32_t
fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	if (idx >= fd->number_nhops) {
		fd->hit_nhops = 1;
		return (0);
	}

	if (is_idx_free(fd, idx)) {
		nhop_ref_any(nh);
		fd->nh_idx[idx] = nh;
		fd->nh_ref_table->count++;
		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
	}
	fd->nh_ref_table->refcnt[idx]++;

	return (idx);
}
struct nhop_release_data {
	struct nhop_object	*nh;
	struct epoch_context	ctx;
};

static void
release_nhop_epoch(epoch_context_t ctx)
{
	struct nhop_release_data *nrd;

	nrd = __containerof(ctx, struct nhop_release_data, ctx);
	nhop_free_any(nrd->nh);
	free(nrd, M_TEMP);
}
/*
 * Delays nexthop refcount release.
 * Datapath may have the datastructures not updated yet, so the old
 * nexthop may still be returned till the end of current epoch. Delay
 * refcount removal, as we may be removing the last instance, which will
 * trigger nexthop deletion, rendering returned nexthop invalid.
 */
static void
fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	struct nhop_release_data *nrd;

	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
	if (nrd != NULL) {
		nrd->nh = nh;
		fib_epoch_call(release_nhop_epoch, &nrd->ctx);
	} else {
		/*
		 * Unable to allocate memory. Leak nexthop to maintain guarantee
		 * that each nhop can be referenced.
		 */
		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
	}
}
static void
fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
{
	uint32_t idx = get_nhop_idx(nh);

	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));

	fd->nh_ref_table->refcnt[idx]--;
	if (fd->nh_ref_table->refcnt[idx] == 0) {
		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
	}
}
static void
set_algo_fixed(struct rib_head *rh)
{
	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		V_algo_fixed_inet = true;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		V_algo_fixed_inet6 = true;
		break;
#endif
	}
}

static bool
is_algo_fixed(struct rib_head *rh)
{

	switch (rh->rib_family) {
#ifdef INET
	case AF_INET:
		return (V_algo_fixed_inet);
#endif
#ifdef INET6
	case AF_INET6:
		return (V_algo_fixed_inet6);
#endif
	}
	return (false);
}
/*
 * Runs the check on what would be the best algo for rib @rh, assuming
 * that the current algo is the one specified by @orig_flm. Note that
 * it can be NULL for initial selection.
 *
 * Returns referenced new algo or NULL if the current one is the best.
 */
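/*
 * Example: preferences are on a 0..255 scale and BEST_DIFF_PERCENT is
 * 5 * 256 / 100 = 12, so a candidate must score at least 13 points above
 * the current algo before a switch is considered; this hysteresis avoids
 * flapping between algorithms with near-identical preferences.
 */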
static struct fib_lookup_module *
fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
{
	uint8_t preference, curr_preference = 0, best_preference = 0;
	struct fib_lookup_module *flm, *best_flm = NULL;
	struct rib_rtable_info rinfo;
	int candidate_algos = 0;

	fib_get_rtable_info(rh, &rinfo);

	FIB_MOD_LOCK();
	TAILQ_FOREACH(flm, &all_algo_list, entries) {
		if (flm->flm_family != rh->rib_family)
			continue;

		candidate_algos++;
		preference = flm->flm_get_pref(&rinfo);
		if (preference > best_preference) {
			if (!flm_error_check(flm, rh->rib_fibnum)) {
				best_preference = preference;
				best_flm = flm;
			}
		}
		if (flm == orig_flm)
			curr_preference = preference;
	}
	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
		best_flm->flm_refcount++;
	else
		best_flm = NULL;
	FIB_MOD_UNLOCK();

	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
	    best_flm ? best_preference : curr_preference);

	return (best_flm);
}
/*
 * Called when a new route table is created.
 * Selects, allocates and attaches fib algo for the table.
 */
int
fib_select_algo_initial(struct rib_head *rh)
{
	struct fib_lookup_module *flm;
	struct fib_data *fd = NULL;
	enum flm_op_result result;
	struct epoch_tracker et;
	int error = 0;

	flm = fib_check_best_algo(rh, NULL);
	if (flm == NULL) {
		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
		return (ENOENT);
	}
	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);

	NET_EPOCH_ENTER(et);
	RIB_WLOCK(rh);
	result = setup_fd_instance(flm, rh, NULL, &fd, false);
	RIB_WUNLOCK(rh);
	NET_EPOCH_EXIT(et);

	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
	if (result == FLM_SUCCESS) {
		/*
		 * Attach datapath directly to avoid multiple reallocations
		 * during fib growth
		 */
		struct fib_dp_header *fdp;
		struct fib_dp **pdp;

		pdp = get_family_dp_ptr(rh->rib_family);
		if (pdp != NULL) {
			fdp = get_fib_dp_header(*pdp);
			fdp->fdh_idx[fd->fd_fibnum] = fd->fd_dp;
			FD_PRINTF(LOG_INFO, fd, "datapath attached");
		}
	} else {
		error = EINVAL;
		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
	}

	fib_unref_algo(flm);

	return (error);
}
/*
 * Registers fib lookup module within the subsystem.
 */
int
fib_module_register(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	ALGO_PRINTF("attaching %s to %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}

/*
 * Tries to unregister fib lookup module.
 *
 * Returns 0 on success, EBUSY if module is still used
 * by some of the tables.
 */
int
fib_module_unregister(struct fib_lookup_module *flm)
{

	FIB_MOD_LOCK();
	if (flm->flm_refcount > 0) {
		FIB_MOD_UNLOCK();
		return (EBUSY);
	}
	fib_error_clear_flm(flm);
	ALGO_PRINTF("detaching %s from %s", flm->flm_name,
	    print_family(flm->flm_family));
	TAILQ_REMOVE(&all_algo_list, flm, entries);
	FIB_MOD_UNLOCK();

	return (0);
}
static void
vnet_fib_init(void)
{

	TAILQ_INIT(&V_fib_data_list);
	TAILQ_INIT(&V_fib_error_list);
}

static void
vnet_fib_destroy(void)
{

	FIB_MOD_LOCK();
	fib_error_clear();
	FIB_MOD_UNLOCK();
}