 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <netinet/in.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>
 * Binary search lookup algo.
 *
 * Compiles the route table into a sorted array.
 * Used with a small number of routes (< 16).
 * As the array is immutable, it is rebuilt on each rtable change.
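 *
 * Illustrative example (added for clarity; nh1..nh3 are hypothetical
 * nexthops, not taken from this file): given the prefixes
 *   0.0.0.0/0      -> nh1
 *   10.0.0.0/24    -> nh2
 *   10.0.0.128/25  -> nh3
 * the compiled array holds one record per covering interval, keyed by
 * the first address of that interval:
 *   0.0.0.0    -> nh1
 *   10.0.0.0   -> nh2
 *   10.0.0.128 -> nh3
 *   10.0.1.0   -> nh1
 * A lookup binary-searches for the record with the largest address
 * that is <= the key.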
struct bsearch4_record {
	struct nhop_object *nh;

struct bsearch4_data {
	struct bsearch4_record *rr;
	struct bsearch4_record br[0];
 * Main IPv4 address lookup function.
 *
 * Finds the array record with the maximum index that is <= the provided key.
 * Assumes 0.0.0.0/0 always exists (possibly with a NULL nhop).
static struct nhop_object *
bsearch4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
	const struct bsearch4_data *bd = (const struct bsearch4_data *)algo_data;
	const struct bsearch4_record *br;
	uint32_t addr4 = ntohl(key.addr4.s_addr);

	int end = bd->num_items;

	int i = (start + end) / 2;
	while (start + 1 < end) {
		i = (start + end) / 2;
		if (addr4 < br->addr4) {
			/* key < midpoint record, reduce right boundary */
		} else if (addr4 > br->addr4) {
			/* key > midpoint record, increase left boundary */

	/* start + 1 == end */
	return (bd->br[start].nh);
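
/*
 * Illustrative lookup trace (added for clarity): with the sample array
 * from the comment at the top of this file, a query for 10.0.0.200
 * keeps halving the [start, end) range until bd->br[start] is the
 * 10.0.0.128 record, so its nexthop (nh3) is returned.
 */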
 * Preference function.
 * Assume ideal for < 10 prefixes (a typical single-interface setup has 5),
 * then gradually degrade.
 * Assume 30 prefixes map to at least 60 records, so a lookup will require
 * roughly 8 steps, which is even worse than radix.
bsearch4_get_pref(const struct rib_rtable_info *rinfo)

	if (rinfo->num_prefixes < 10)
	else if (rinfo->num_prefixes < 30)
		return (255 - rinfo->num_prefixes * 8);
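
/*
 * Worked example (added for clarity): with 20 prefixes the preference
 * evaluates to 255 - 20 * 8 = 95, so bsearch4 loses ground quickly as
 * the table grows past a handful of routes.
 */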
static enum flm_op_result
bsearch4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
	struct bsearch4_data *bd;
	struct rib_rtable_info rinfo;

	fib_get_rtable_info(fib_get_rh(fd), &rinfo);
	count = rinfo.num_prefixes * 11 / 10 + 64;

	sz = sizeof(struct bsearch4_data) + sizeof(struct bsearch4_record) * count;
	/* add cache line size to ease alignment */
	sz += CACHE_LINE_SIZE;
	mem = malloc(sz, M_RTABLE, M_NOWAIT | M_ZERO);
		return (FLM_REBUILD);
	/* Align the datapath-usable structure to a cache line boundary */
	bd = (struct bsearch4_data *)roundup2((uintptr_t)mem, CACHE_LINE_SIZE);
	bd->alloc_items = count;

	 * Allocate a temporary array to store all rtable data.
	 * This step is needed to provide the required prefix iteration order.
	bd->rr = mallocarray(count, sizeof(struct bsearch4_record), M_TEMP, M_NOWAIT | M_ZERO);
		return (FLM_REBUILD);

	return (FLM_SUCCESS);
bsearch4_destroy(void *_data)
	struct bsearch4_data *bd = (struct bsearch4_data *)_data;

	free(bd->rr, M_TEMP);
	free(bd->mem, M_RTABLE);
 * Callback storing converted rtable prefixes in the temporary array.
 * Addresses are converted to host byte order.
static enum flm_op_result
bsearch4_add_route_cb(struct rtentry *rt, void *_data)
	struct bsearch4_data *bd = (struct bsearch4_data *)_data;
	struct bsearch4_record *rr;
	struct in_addr addr4, mask4;

	if (bd->num_items >= bd->alloc_items)
		return (FLM_REBUILD);

	rr = &bd->rr[bd->num_items++];
	rt_get_inet_prefix_pmask(rt, &addr4, &mask4, &scopeid);
	rr->addr4 = ntohl(addr4.s_addr);
	rr->mask4 = ntohl(mask4.s_addr);
	rr->nh = rt_get_raw_nhop(rt);

	return (FLM_SUCCESS);
 * Prefix comparison function.
 * 10.0.0.0/24 < 10.0.0.0/25 <- less specific wins
 * 10.0.0.0/25 < 10.0.0.1/32 <- bigger base wins
rr_cmp(const void *_rec1, const void *_rec2)
	const struct bsearch4_record *rec1, *rec2;

	if (rec1->addr4 < rec2->addr4)
	else if (rec1->addr4 > rec2->addr4)

	 * In host byte order a wider prefix has the smaller mask value;
	 * we want less-specific prefixes to come first, i.e. to compare as "<".
	if (rec1->mask4 < rec2->mask4)
	else if (rec1->mask4 > rec2->mask4)
struct bsearch4_array {
	uint32_t alloc_items;
	struct bsearch4_record *arr;

add_array_entry(struct bsearch4_array *ba, struct bsearch4_record *br_new)

	if (ba->num_items < ba->alloc_items) {
		ba->arr[ba->num_items++] = *br_new;

static struct bsearch4_record *
get_last_entry(struct bsearch4_array *ba)

	return (&ba->arr[ba->num_items - 1]);
 * Example: stack: 10.0.1.0/24,nh=3 array: 10.0.1.0/25,nh=4 -> ++10.0.1.128/24,nh=3
pop_stack_entry(struct bsearch4_array *dst_array, struct bsearch4_array *stack)
	uint32_t last_stack_addr, last_array_addr;

	struct bsearch4_record *br_prev = get_last_entry(dst_array);
	struct bsearch4_record *pstack = get_last_entry(stack);

	/* Regardless of the result, pop the stack entry */

	/* Last address covered by the last entry in the lookup array */
	last_array_addr = (br_prev->addr4 | ~br_prev->mask4);
	/* Last address covered by the stack record entry */
	last_stack_addr = (pstack->addr4 | ~pstack->mask4);

	if (last_stack_addr > last_array_addr) {
		 * The stack record covers a larger address space than
		 * the last entry in the lookup array.
		 * Add the remaining part of the stack record to the lookup array.
		struct bsearch4_record br_new = {
			.addr4 = last_array_addr + 1,
			.mask4 = pstack->mask4,

		return (add_array_entry(dst_array, &br_new));
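
/*
 * Worked example (added for clarity), matching the example above:
 * with stack entry 10.0.1.0/24,nh=3 and last lookup-array entry
 * 10.0.1.0/25,nh=4, last_array_addr is 10.0.1.127 and last_stack_addr
 * is 10.0.1.255, so a record starting at 10.0.1.128 with nh=3 is
 * appended to cover the remainder of the /24.
 */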
 * Updates the resulting array @dst_array with a rib entry @rib_entry.
bsearch4_process_record(struct bsearch4_array *dst_array,
    struct bsearch4_array *stack, struct bsearch4_record *rib_entry)

	 * Maintain the invariant: the current rib_entry is always contained
	 * in the top stack entry.
	 * Note we always have 0.0.0.0/0.
	while (stack->num_items > 0) {
		struct bsearch4_record *pst = get_last_entry(stack);

		 * Check if we need to pop the stack.
		 * Rely on the ordering: less-specific prefixes come first.
		 * Pop any entry that doesn't contain the current prefix.
		if (pst->addr4 == (rib_entry->addr4 & pst->mask4))

		if (!pop_stack_entry(dst_array, stack))

	if (dst_array->num_items > 0) {

		 * Check if there is a gap between the previous entry and the
		 * current entry. The code above guarantees that both the previous
		 * and the current entry are contained in the top stack entry.
		 *
		 * Example: last: 10.0.0.1(/32,nh=3) cur: 10.0.0.3(/32,nh=4),
		 * stack: 10.0.0.0/24,nh=2.
		 * Cover the gap between the previous and the current entry by
		 * adding an entry derived from the top stack record.
		struct bsearch4_record *br_tmp = get_last_entry(dst_array);
		uint32_t last_declared_addr = br_tmp->addr4 | ~br_tmp->mask4;
		if (last_declared_addr < rib_entry->addr4 - 1) {

			struct bsearch4_record *pst = get_last_entry(stack);
			struct bsearch4_record new_entry = {
				.addr4 = last_declared_addr + 1,

			if (!add_array_entry(dst_array, &new_entry))

		 * Special case: adding a more specific prefix at the start of
		 * the previous interval:
		 * 10.0.0.0(/24,nh=3), 10.0.0.0(/25,nh=4)
		 * Alter the last record, setting the new nexthop and mask.
		if (br_tmp->addr4 == rib_entry->addr4) {
			*br_tmp = *rib_entry;
			add_array_entry(stack, rib_entry);

	if (!add_array_entry(dst_array, rib_entry))
	add_array_entry(stack, rib_entry);
static enum flm_op_result
bsearch4_build_array(struct bsearch4_array *dst_array, struct bsearch4_array *src_array)

	 * During the iteration, we keep track of all prefixes in the rtable
	 * that we currently match by maintaining a stack. As there can be only
	 * 32 prefixes for a single address, pre-allocate a stack of size 32.
	struct bsearch4_array stack = {
		.arr = mallocarray(32, sizeof(struct bsearch4_record), M_TEMP, M_NOWAIT | M_ZERO),

	if (stack.arr == NULL)
		return (FLM_REBUILD);

	for (int i = 0; i < src_array->num_items; i++) {
		struct bsearch4_record *rib_entry = &src_array->arr[i];

		if (!bsearch4_process_record(dst_array, &stack, rib_entry)) {
			free(stack.arr, M_TEMP);
			return (FLM_REBUILD);

	 * We know that the last record is contained in the top stack entry.
	while (stack.num_items > 0) {
		if (!pop_stack_entry(dst_array, &stack))
			return (FLM_REBUILD);

	free(stack.arr, M_TEMP);

	return (FLM_SUCCESS);
static enum flm_op_result
bsearch4_build(struct bsearch4_data *bd)
	enum flm_op_result ret;

	struct bsearch4_array prefixes_array = {
		.alloc_items = bd->alloc_items,
		.num_items = bd->num_items,

	/* Add the default route if it does not exist */
	bool default_found = false;
	for (int i = 0; i < prefixes_array.num_items; i++) {
		if (prefixes_array.arr[i].mask4 == 0) {
			default_found = true;

	if (!default_found) {
		/* Add a default route with a NULL nhop */
		struct bsearch4_record default_entry = {};
		if (!add_array_entry(&prefixes_array, &default_entry))
			return (FLM_REBUILD);

	qsort(prefixes_array.arr, prefixes_array.num_items, sizeof(struct bsearch4_record), rr_cmp);

	struct bsearch4_array dst_array = {
		.alloc_items = bd->alloc_items,

	ret = bsearch4_build_array(&dst_array, &prefixes_array);
	bd->num_items = dst_array.num_items;

	free(bd->rr, M_TEMP);
static enum flm_op_result
bsearch4_end_dump(void *_data, struct fib_dp *dp)
	struct bsearch4_data *bd = (struct bsearch4_data *)_data;
	enum flm_op_result ret;

	ret = bsearch4_build(bd);
	if (ret == FLM_SUCCESS) {
		dp->f = bsearch4_lookup;
static enum flm_op_result
bsearch4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,

	return (FLM_REBUILD);

struct fib_lookup_module flm_bsearch4 = {
	.flm_name = "bsearch4",
	.flm_family = AF_INET,
	.flm_init_cb = bsearch4_init,
	.flm_destroy_cb = bsearch4_destroy,
	.flm_dump_rib_item_cb = bsearch4_add_route_cb,
	.flm_dump_end_cb = bsearch4_end_dump,
	.flm_change_rib_item_cb = bsearch4_change_cb,
	.flm_get_pref = bsearch4_get_pref,
 * Lockless radix lookup algo.
 *
 * Compiles an immutable radix tree from the current routing table.
 * Used with a small number of routes (< 1000).
 * As the data structure is immutable, it gets rebuilt on each rtable change.
 *
 * Lookups are slightly faster as shorter lookup keys are used
 * (4 bytes instead of 8 in the stock radix).
#define KEY_LEN_INET	(offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t))
#define OFF_LEN_INET	(8 * offsetof(struct sockaddr_in, sin_addr))
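
/*
 * Note (added for clarity): offsetof(struct sockaddr_in, sin_addr) is 4
 * (sin_len + sin_family + sin_port), so KEY_LEN_INET evaluates to 8 and the
 * radix key ends right after sin_addr. OFF_LEN_INET (32) is the bit offset
 * at which the address bits start within that key.
 */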
struct radix4_addr_entry {
	struct radix_node rn[2];
	struct sockaddr_in addr;
	struct nhop_object *nhop;

#define LRADIX4_ITEM_SZ	roundup2(sizeof(struct radix4_addr_entry), 64)

struct lradix4_data {
	struct radix_node_head *rnh;
	uint32_t alloc_items;
static struct nhop_object *
lradix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
	struct radix_node_head *rnh = (struct radix_node_head *)algo_data;
	struct radix4_addr_entry *ent;
	struct sockaddr_in addr4 = {
		.sin_len = KEY_LEN_INET,
		.sin_addr = key.addr4,

	ent = (struct radix4_addr_entry *)(rn_match(&addr4, &rnh->rh));
 * Preference function.
 * Assume close-to-ideal performance for < 10 routes (though worse than
 * bsearch), then gradually degrade until 1000 routes are reached.
lradix4_get_pref(const struct rib_rtable_info *rinfo)

	if (rinfo->num_prefixes < 10)
	else if (rinfo->num_prefixes < 1000)
		return (254 - rinfo->num_prefixes / 4);
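
/*
 * Worked example (added for clarity): with 100 prefixes the preference
 * evaluates to 254 - 100 / 4 = 229, decreasing linearly as the table
 * approaches 1000 routes.
 */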
static enum flm_op_result
lradix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
	struct lradix4_data *lr;
	struct rib_rtable_info rinfo;

	lr = malloc(sizeof(struct lradix4_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (lr == NULL || !rn_inithead((void **)&lr->rnh, OFF_LEN_INET))
		return (FLM_REBUILD);
	fib_get_rtable_info(fib_get_rh(fd), &rinfo);

	count = rinfo.num_prefixes * 11 / 10;
	sz = count * LRADIX4_ITEM_SZ + CACHE_LINE_SIZE;
	lr->mem = malloc(sz, M_RTABLE, M_NOWAIT | M_ZERO);
		return (FLM_REBUILD);
	/* Align all rtentries to a cacheline boundary */
	lr->rt_base = (char *)roundup2((uintptr_t)lr->mem, CACHE_LINE_SIZE);
	lr->alloc_items = count;

	return (FLM_SUCCESS);
lradix4_destroy(void *_data)
	struct lradix4_data *lr = (struct lradix4_data *)_data;

	rn_detachhead((void **)&lr->rnh);
	free(lr->mem, M_RTABLE);
static enum flm_op_result
lradix4_add_route_cb(struct rtentry *rt, void *_data)
	struct lradix4_data *lr = (struct lradix4_data *)_data;
	struct radix4_addr_entry *ae;
	struct sockaddr_in mask;
	struct sockaddr *rt_mask;
	struct radix_node *rn;
	struct in_addr addr4, mask4;

	if (lr->num_items >= lr->alloc_items)
		return (FLM_REBUILD);

	ae = (struct radix4_addr_entry *)(lr->rt_base + lr->num_items * LRADIX4_ITEM_SZ);
	ae->nhop = rt_get_raw_nhop(rt);

	rt_get_inet_prefix_pmask(rt, &addr4, &mask4, &scopeid);
	ae->addr.sin_len = KEY_LEN_INET;
	ae->addr.sin_addr = addr4;

	/* Only non-host prefixes need an explicit netmask */
	if (mask4.s_addr != INADDR_BROADCAST) {
		bzero(&mask, sizeof(mask));
		mask.sin_len = KEY_LEN_INET;
		mask.sin_addr = mask4;
		rt_mask = (struct sockaddr *)&mask;

	rn = lr->rnh->rnh_addaddr((struct sockaddr *)&ae->addr, rt_mask,
	    &lr->rnh->rh, ae->rn);
		return (FLM_REBUILD);

	return (FLM_SUCCESS);
static enum flm_op_result
lradix4_end_dump(void *_data, struct fib_dp *dp)
	struct lradix4_data *lr = (struct lradix4_data *)_data;

	dp->f = lradix4_lookup;

	return (FLM_SUCCESS);
static enum flm_op_result
lradix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,

	return (FLM_REBUILD);

struct fib_lookup_module flm_radix4_lockless = {
	.flm_name = "radix4_lockless",
	.flm_family = AF_INET,
	.flm_init_cb = lradix4_init,
	.flm_destroy_cb = lradix4_destroy,
	.flm_dump_rib_item_cb = lradix4_add_route_cb,
	.flm_dump_end_cb = lradix4_end_dump,
	.flm_change_rib_item_cb = lradix4_change_cb,
	.flm_get_pref = lradix4_get_pref,
 * Fallback lookup algorithm.
 * This is a simple wrapper around system radix.

static struct nhop_object *
radix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
	struct rib_head *rh = (struct rib_head *)algo_data;
	struct radix_node *rn;
	struct nhop_object *nh;

	/* Prepare lookup key */
	struct sockaddr_in sin4 = {
		.sin_family = AF_INET,
		.sin_len = sizeof(struct sockaddr_in),
		.sin_addr = key.addr4,

	rn = rn_match((void *)&sin4, &rh->head);
	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
		nh = (RNTORT(rn))->rt_nhop;
radix4_get_pref(const struct rib_rtable_info *rinfo)

static enum flm_op_result
radix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
	struct radix4_data *r4;

	r4 = malloc(sizeof(struct radix4_data), M_RTABLE, M_NOWAIT | M_ZERO);
		return (FLM_REBUILD);

	r4->rh = fib_get_rh(fd);

	return (FLM_SUCCESS);

radix4_destroy(void *_data)

	free(_data, M_RTABLE);

static enum flm_op_result
radix4_add_route_cb(struct rtentry *rt, void *_data)

	return (FLM_SUCCESS);

static enum flm_op_result
radix4_end_dump(void *_data, struct fib_dp *dp)
	struct radix4_data *r4 = (struct radix4_data *)_data;

	dp->f = radix4_lookup;

	return (FLM_SUCCESS);

static enum flm_op_result
radix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,

	return (FLM_SUCCESS);

struct fib_lookup_module flm_radix4 = {
	.flm_name = "radix4",
	.flm_family = AF_INET,
	.flm_init_cb = radix4_init,
	.flm_destroy_cb = radix4_destroy,
	.flm_dump_rib_item_cb = radix4_add_route_cb,
	.flm_dump_end_cb = radix4_end_dump,
	.flm_change_rib_item_cb = radix4_change_cb,
	.flm_get_pref = radix4_get_pref,
	fib_module_register(&flm_bsearch4);
	fib_module_register(&flm_radix4_lockless);
	fib_module_register(&flm_radix4);

SYSINIT(fib4_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, fib4_algo_init, NULL);
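
/*
 * Usage note (added for clarity; the sysctl name is an assumption based on
 * the common fib_algo framework naming, not taken from this file): once these
 * modules are registered, the framework selects the one reporting the highest
 * preference for the current table size, and an administrator can pin a
 * specific algorithm, e.g.:
 *
 *   sysctl net.route.algo.inet.algo=bsearch4
 */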