/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2020 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <netinet/in.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
#include <net/route/fib_algo.h>

/*
 * Binary search lookup algo.
 *
 * Compiles the route table into a sorted array.
 * Used with a small number of routes (< 16).
 * As the array is immutable, it is rebuilt on each rtable change.
 */

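/*
 * For illustration, a table like
 *  0.0.0.0/0 -> nh1, 10.0.0.0/24 -> nh2, 10.0.0.1/32 -> nh3
 * compiles into the interval array
 *  0.0.0.0 -> nh1, 10.0.0.0 -> nh2, 10.0.0.1 -> nh3,
 *  10.0.0.2 -> nh2, 10.0.1.0 -> nh1
 * so a lookup only needs to find the last entry with addr4 <= key.
 */
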
struct bsearch4_record {
	uint32_t		addr4;
	uint32_t		mask4;
	struct nhop_object	*nh;
};

struct bsearch4_data {
	uint32_t		alloc_items;
	uint32_t		num_items;
	void			*mem;
	struct bsearch4_record	*rr;
	struct bsearch4_record	br[0];
};

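/*
 * mem holds the raw allocation: the cache-line-aligned bsearch4_data
 * lives inside it.  rr points to a temporary M_TEMP array filled while
 * the rtable is being dumped; br[] is the compiled lookup array used by
 * the datapath.
 */
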
/*
 * Main IPv4 address lookup function.
 *
 * Finds the array record with the maximum index that is <= provided key.
 * Assumes 0.0.0.0/0 always exists (possibly with a NULL nhop).
 */
static struct nhop_object *
bsearch4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
{
	const struct bsearch4_data *bd = (const struct bsearch4_data *)algo_data;
	const struct bsearch4_record *br;
	uint32_t addr4 = ntohl(key.addr4.s_addr);

	int start = 0;
	int end = bd->num_items;

	int i = (start + end) / 2;
	while (start + 1 < end) {
		i = (start + end) / 2;
		br = &bd->br[i];

		if (addr4 < br->addr4) {
			/* key < average, reduce right boundary */
			end = i;
			continue;
		} else if (addr4 > br->addr4) {
			/* key > average, increase left boundary */
			start = i;
			continue;
		} else {
			/* Direct match */
			return (br->nh);
		}
	}
	/* start + 1 == end */

	return (bd->br[start].nh);
}

/*
 * Preference function.
 * Assume ideal for < 10 prefixes (a typical single-interface setup has 5),
 * then gradually degrade.
 * Assume 30 prefixes is at least 60 records, so it will require 8 lookups,
 * which is even worse than radix.
 */
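/*
 * E.g. with the formula below: 15 prefixes -> 255 - 120 = 135,
 * 29 prefixes -> 255 - 232 = 23.
 */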
static uint8_t
bsearch4_get_pref(const struct rib_rtable_info *rinfo)
{

	if (rinfo->num_prefixes < 10)
		return (253);
	else if (rinfo->num_prefixes < 30)
		return (255 - rinfo->num_prefixes * 8);
	else
		return (1);
}

static enum flm_op_result
bsearch4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
{
	struct bsearch4_data *bd;
	struct rib_rtable_info rinfo;
	uint32_t count;
	size_t sz;
	void *mem;

	fib_get_rtable_info(fib_get_rh(fd), &rinfo);
	count = rinfo.num_prefixes * 11 / 10 + 64;

	sz = sizeof(struct bsearch4_data) + sizeof(struct bsearch4_record) * count;
	/* add cache line sz to ease alignment */
	sz += CACHE_LINE_SIZE;
	mem = malloc(sz, M_RTABLE, M_NOWAIT | M_ZERO);
	if (mem == NULL)
		return (FLM_REBUILD);
	/* Align the datapath-usable structure to a cache line boundary */
	bd = (struct bsearch4_data *)roundup2((uintptr_t)mem, CACHE_LINE_SIZE);
	bd->mem = mem;
	bd->alloc_items = count;

	*_data = bd;

	/*
	 * Allocate a temporary array to store all rtable data.
	 * This step is required to provide the needed prefix iteration order.
	 */
	bd->rr = mallocarray(count, sizeof(struct bsearch4_record), M_TEMP, M_NOWAIT | M_ZERO);
	if (bd->rr == NULL)
		return (FLM_REBUILD);

	return (FLM_SUCCESS);
}

static void
bsearch4_destroy(void *_data)
{
	struct bsearch4_data *bd = (struct bsearch4_data *)_data;

	if (bd->rr != NULL)
		free(bd->rr, M_TEMP);
	free(bd->mem, M_RTABLE);
}

/*
 * Callback storing converted rtable prefixes in the temporary array.
 * Addresses are converted to host byte order.
 */
static enum flm_op_result
bsearch4_add_route_cb(struct rtentry *rt, void *_data)
{
	struct bsearch4_data *bd = (struct bsearch4_data *)_data;
	struct bsearch4_record *rr;
	struct in_addr addr4, mask4;
	uint32_t scopeid;

	if (bd->num_items >= bd->alloc_items)
		return (FLM_REBUILD);

	rr = &bd->rr[bd->num_items++];
	rt_get_inet_prefix_pmask(rt, &addr4, &mask4, &scopeid);
	rr->addr4 = ntohl(addr4.s_addr);
	rr->mask4 = ntohl(mask4.s_addr);
	rr->nh = rt_get_raw_nhop(rt);

	return (FLM_SUCCESS);
}

/*
 * Prefix comparison function.
 * 10.0.0.0/24 < 10.0.0.0/25 <- less specific wins
 * 10.0.0.0/25 < 10.0.0.1/32 <- bigger base wins
 */
static int
rr_cmp(const void *_rec1, const void *_rec2)
{
	const struct bsearch4_record *rec1, *rec2;

	rec1 = _rec1;
	rec2 = _rec2;
	if (rec1->addr4 < rec2->addr4)
		return (-1);
	else if (rec1->addr4 > rec2->addr4)
		return (1);

	/*
	 * A less specific prefix has a numerically smaller mask;
	 * we want it to come first, so it compares as "less".
	 */
	if (rec1->mask4 < rec2->mask4)
		return (-1);
	else if (rec1->mask4 > rec2->mask4)
		return (1);
	return (0);
}

struct bsearch4_array {
	uint32_t		alloc_items;
	uint32_t		num_items;
	struct bsearch4_record	*arr;
};

static bool
add_array_entry(struct bsearch4_array *ba, struct bsearch4_record *br_new)
{

	if (ba->num_items < ba->alloc_items) {
		ba->arr[ba->num_items++] = *br_new;
		return (true);
	}
	return (false);
}

static struct bsearch4_record *
get_last_entry(struct bsearch4_array *ba)
{

	return (&ba->arr[ba->num_items - 1]);
}

/*
 * Pops the top stack entry.  If it covers a wider range than the last
 * record in the lookup array, add a record for the remaining part of
 * its range, e.g.
 *  stack: 10.0.1.0/24,nh=3 array: 10.0.1.0/25,nh=4 -> ++10.0.1.128/24,nh=3
 */
static bool
pop_stack_entry(struct bsearch4_array *dst_array, struct bsearch4_array *stack)
{
	uint32_t last_stack_addr, last_array_addr;

	struct bsearch4_record *br_prev = get_last_entry(dst_array);
	struct bsearch4_record *pstack = get_last_entry(stack);

	/* Regardless of the result, pop the stack entry */
	stack->num_items--;

	/* Prefix last address for the last entry in the lookup array */
	last_array_addr = (br_prev->addr4 | ~br_prev->mask4);
	/* Prefix last address for the stack record entry */
	last_stack_addr = (pstack->addr4 | ~pstack->mask4);

	if (last_stack_addr > last_array_addr) {
		/*
		 * The stack record covers a wider address space than
		 * the last entry in the lookup array.
		 * Add the remaining part of the stack record to
		 * the lookup array.
		 */
		struct bsearch4_record br_new = {
			.addr4 = last_array_addr + 1,
			.mask4 = pstack->mask4,
			.nh = pstack->nh,
		};
		return (add_array_entry(dst_array, &br_new));
	}

	return (true);
}

/*
 * Updates the resulting array @dst_array with a rib entry @rib_entry.
 */
static bool
bsearch4_process_record(struct bsearch4_array *dst_array,
    struct bsearch4_array *stack, struct bsearch4_record *rib_entry)
{

	/*
	 * Maintain the invariant: the current rib_entry is always contained
	 * in the top stack entry.
	 * Note we always have 0.0.0.0/0.
	 */
	while (stack->num_items > 0) {
		struct bsearch4_record *pst = get_last_entry(stack);

		/*
		 * Check if we need to pop the stack.
		 * Rely on the ordering - wider prefixes come first.
		 * Pop any entry that doesn't contain the current prefix.
		 */
		if (pst->addr4 == (rib_entry->addr4 & pst->mask4))
			break;

		if (!pop_stack_entry(dst_array, stack))
			return (false);
	}

	if (dst_array->num_items > 0) {

		/*
		 * Check if there is a gap between the previous entry and the
		 * current entry.  The code above guarantees that both the
		 * previous and the current entry are contained in the top
		 * stack entry.
		 *
		 * Example: last: 10.0.0.1(/32,nh=3) cur: 10.0.0.3(/32,nh=4),
		 *  stack: 10.0.0.0/24,nh=2.
		 * Cover the gap between previous and current by adding the
		 * stack nexthop.
		 */
		struct bsearch4_record *br_tmp = get_last_entry(dst_array);
		uint32_t last_declared_addr = br_tmp->addr4 | ~br_tmp->mask4;
		if (last_declared_addr < rib_entry->addr4 - 1) {
			/* Cover the hole with the stack nexthop */
			struct bsearch4_record *pst = get_last_entry(stack);
			struct bsearch4_record new_entry = {
				.addr4 = last_declared_addr + 1,
				.mask4 = pst->mask4,
				.nh = pst->nh,
			};
			if (!add_array_entry(dst_array, &new_entry))
				return (false);
		}

		/*
		 * Special case: adding a more specific prefix at the start of
		 * the previous interval:
		 * 10.0.0.0(/24,nh=3), 10.0.0.0(/25,nh=4)
		 * Alter the last record, setting the new nexthop and mask.
		 */
		if (br_tmp->addr4 == rib_entry->addr4) {
			*br_tmp = *rib_entry;
			add_array_entry(stack, rib_entry);
			return (true);
		}
	}

	if (!add_array_entry(dst_array, rib_entry))
		return (false);
	add_array_entry(stack, rib_entry);

	return (true);
}

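/*
 * Compiles the sorted prefix array @src_array into the runtime lookup
 * array @dst_array, inserting filler records so that every address maps
 * to its best matching prefix.
 */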
static enum flm_op_result
bsearch4_build_array(struct bsearch4_array *dst_array, struct bsearch4_array *src_array)
{

	/*
	 * During iteration, we keep track of all prefixes in the rtable
	 * we currently match, by maintaining a stack.  As there can be only
	 * 32 prefixes for a single address, pre-allocate a stack of size 32.
	 */
	struct bsearch4_array stack = {
		.alloc_items = 32,
		.arr = mallocarray(32, sizeof(struct bsearch4_record), M_TEMP, M_NOWAIT | M_ZERO),
	};
	if (stack.arr == NULL)
		return (FLM_REBUILD);

	for (int i = 0; i < src_array->num_items; i++) {
		struct bsearch4_record *rib_entry = &src_array->arr[i];

		if (!bsearch4_process_record(dst_array, &stack, rib_entry)) {
			free(stack.arr, M_TEMP);
			return (FLM_REBUILD);
		}
	}

	/*
	 * The last record is contained in the top stack entry.
	 * Flush the remaining stack entries into the lookup array.
	 */
	while (stack.num_items > 0) {
		if (!pop_stack_entry(dst_array, &stack)) {
			free(stack.arr, M_TEMP);
			return (FLM_REBUILD);
		}
	}
	free(stack.arr, M_TEMP);

	return (FLM_SUCCESS);
}

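/*
 * Builds the lookup array from the rtable dump stored in bd->rr:
 * ensures a default route is present, sorts the prefixes and compiles
 * them into bd->br[].
 */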
static enum flm_op_result
bsearch4_build(struct bsearch4_data *bd)
{
	enum flm_op_result ret;

	struct bsearch4_array prefixes_array = {
		.alloc_items = bd->alloc_items,
		.num_items = bd->num_items,
		.arr = bd->rr,
	};

	/* Add the default route if it is not present */
	bool default_found = false;
	for (int i = 0; i < prefixes_array.num_items; i++) {
		if (prefixes_array.arr[i].mask4 == 0) {
			default_found = true;
			break;
		}
	}
	if (!default_found) {
		/* Add a default route with a NULL nhop */
		struct bsearch4_record default_entry = {};
		if (!add_array_entry(&prefixes_array, &default_entry))
			return (FLM_REBUILD);
	}

	/* Sort prefixes */
	qsort(prefixes_array.arr, prefixes_array.num_items, sizeof(struct bsearch4_record), rr_cmp);

	struct bsearch4_array dst_array = {
		.alloc_items = bd->alloc_items,
		.arr = bd->br,
	};

	ret = bsearch4_build_array(&dst_array, &prefixes_array);
	bd->num_items = dst_array.num_items;

	free(bd->rr, M_TEMP);
	bd->rr = NULL;

	return (ret);
}

static enum flm_op_result
bsearch4_end_dump(void *_data, struct fib_dp *dp)
{
	struct bsearch4_data *bd = (struct bsearch4_data *)_data;
	enum flm_op_result ret;

	ret = bsearch4_build(bd);
	if (ret == FLM_SUCCESS) {
		dp->f = bsearch4_lookup;
		dp->arg = bd;
	}

	return (ret);
}

static enum flm_op_result
bsearch4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{

	return (FLM_REBUILD);
}

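/*
 * Module glue.  The fib_algo framework calls flm_init_cb to allocate the
 * state, feeds every rtentry through flm_dump_rib_item_cb, then calls
 * flm_dump_end_cb to compile the array and publish the lookup function.
 * Any rtable change requests a full rebuild via flm_change_rib_item_cb.
 */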
struct fib_lookup_module flm_bsearch4 = {
	.flm_name = "bsearch4",
	.flm_family = AF_INET,
	.flm_init_cb = bsearch4_init,
	.flm_destroy_cb = bsearch4_destroy,
	.flm_dump_rib_item_cb = bsearch4_add_route_cb,
	.flm_dump_end_cb = bsearch4_end_dump,
	.flm_change_rib_item_cb = bsearch4_change_cb,
	.flm_get_pref = bsearch4_get_pref,
};

/*
 * Lockless radix lookup algo.
 *
 * Compiles an immutable radix tree from the current routing table.
 * Used with a small number of routes (< 1000).
 * As the data structure is immutable, it gets rebuilt on each rtable change.
 *
 * Lookups are slightly faster as shorter lookup keys are used
 * (4 bytes instead of 8 in the stock radix).
 */

#define	KEY_LEN_INET	(offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t))
#define	OFF_LEN_INET	(8 * offsetof(struct sockaddr_in, sin_addr))
struct radix4_addr_entry {
	struct radix_node	rn[2];
	struct sockaddr_in	addr;
	struct nhop_object	*nhop;
};
#define	LRADIX4_ITEM_SZ	roundup2(sizeof(struct radix4_addr_entry), 64)

struct lradix4_data {
	struct radix_node_head	*rnh;
	void			*mem;
	char			*rt_base;
	uint32_t		alloc_items;
	uint32_t		num_items;
};

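/*
 * Each radix4_addr_entry occupies a fixed, 64-byte-rounded slot carved
 * out of the rt_base slab, so the immutable tree is packed together and
 * freed as a whole in lradix4_destroy().
 */
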
static struct nhop_object *
lradix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
{
	struct radix_node_head *rnh = (struct radix_node_head *)algo_data;
	struct radix4_addr_entry *ent;
	struct sockaddr_in addr4 = {
		.sin_len = KEY_LEN_INET,
		.sin_addr = key.addr4,
	};

	ent = (struct radix4_addr_entry *)(rn_match(&addr4, &rnh->rh));
	if (ent != NULL)
		return (ent->nhop);
	return (NULL);
}

/*
 * Preference function.
 * Assume close-to-ideal for < 10 routes (though worse than bsearch), then
 * gradually degrade until 1000 routes are reached.
 */
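/*
 * E.g. with the formula below: 100 prefixes -> 254 - 25 = 229,
 * 900 prefixes -> 254 - 225 = 29.
 */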
static uint8_t
lradix4_get_pref(const struct rib_rtable_info *rinfo)
{

	if (rinfo->num_prefixes < 10)
		return (250);
	else if (rinfo->num_prefixes < 1000)
		return (254 - rinfo->num_prefixes / 4);
	else
		return (1);
}

static enum flm_op_result
lradix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
{
	struct lradix4_data *lr;
	struct rib_rtable_info rinfo;
	uint32_t count;
	size_t sz;

	lr = malloc(sizeof(struct lradix4_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (lr == NULL || !rn_inithead((void **)&lr->rnh, OFF_LEN_INET))
		return (FLM_REBUILD);
	fib_get_rtable_info(fib_get_rh(fd), &rinfo);

	count = rinfo.num_prefixes * 11 / 10;
	sz = count * LRADIX4_ITEM_SZ + CACHE_LINE_SIZE;
	lr->mem = malloc(sz, M_RTABLE, M_NOWAIT | M_ZERO);
	if (lr->mem == NULL)
		return (FLM_REBUILD);
	/* Align all rtentries to a cacheline boundary */
	lr->rt_base = (char *)roundup2((uintptr_t)lr->mem, CACHE_LINE_SIZE);
	lr->alloc_items = count;

	*_data = lr;

	return (FLM_SUCCESS);
}

static void
lradix4_destroy(void *_data)
{
	struct lradix4_data *lr = (struct lradix4_data *)_data;

	if (lr->rnh != NULL)
		rn_detachhead((void **)&lr->rnh);
	if (lr->mem != NULL)
		free(lr->mem, M_RTABLE);
	free(lr, M_RTABLE);
}

static enum flm_op_result
lradix4_add_route_cb(struct rtentry *rt, void *_data)
{
	struct lradix4_data *lr = (struct lradix4_data *)_data;
	struct radix4_addr_entry *ae;
	struct sockaddr_in mask;
	struct sockaddr *rt_mask;
	struct radix_node *rn;
	struct in_addr addr4, mask4;
	uint32_t scopeid;

	if (lr->num_items >= lr->alloc_items)
		return (FLM_REBUILD);

	ae = (struct radix4_addr_entry *)(lr->rt_base + lr->num_items * LRADIX4_ITEM_SZ);
	lr->num_items++;

	ae->nhop = rt_get_raw_nhop(rt);

	rt_get_inet_prefix_pmask(rt, &addr4, &mask4, &scopeid);
	ae->addr.sin_len = KEY_LEN_INET;
	ae->addr.sin_addr = addr4;

	if (mask4.s_addr != INADDR_BROADCAST) {
		bzero(&mask, sizeof(mask));
		mask.sin_len = KEY_LEN_INET;
		mask.sin_addr = mask4;
		rt_mask = (struct sockaddr *)&mask;
	} else
		rt_mask = NULL;

	rn = lr->rnh->rnh_addaddr((struct sockaddr *)&ae->addr, rt_mask,
	    &lr->rnh->rh, ae->rn);
	if (rn == NULL)
		return (FLM_REBUILD);

	return (FLM_SUCCESS);
}

static enum flm_op_result
lradix4_end_dump(void *_data, struct fib_dp *dp)
{
	struct lradix4_data *lr = (struct lradix4_data *)_data;

	dp->f = lradix4_lookup;
	dp->arg = lr->rnh;

	return (FLM_SUCCESS);
}

static enum flm_op_result
lradix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{

	return (FLM_REBUILD);
}

struct fib_lookup_module flm_radix4_lockless = {
	.flm_name = "radix4_lockless",
	.flm_family = AF_INET,
	.flm_init_cb = lradix4_init,
	.flm_destroy_cb = lradix4_destroy,
	.flm_dump_rib_item_cb = lradix4_add_route_cb,
	.flm_dump_end_cb = lradix4_end_dump,
	.flm_change_rib_item_cb = lradix4_change_cb,
	.flm_get_pref = lradix4_get_pref,
};

/*
 * Fallback lookup algorithm.
 * This is a simple wrapper around the system radix.
 */

struct radix4_data {
	struct rib_head	*rh;
};

static struct nhop_object *
radix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
{
	RIB_RLOCK_TRACKER;
	struct rib_head *rh = (struct rib_head *)algo_data;
	struct radix_node *rn;
	struct nhop_object *nh;

	/* Prepare lookup key */
	struct sockaddr_in sin4 = {
		.sin_family = AF_INET,
		.sin_len = sizeof(struct sockaddr_in),
		.sin_addr = key.addr4,
	};

	nh = NULL;
	RIB_RLOCK(rh);
	rn = rn_match((void *)&sin4, &rh->head);
	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
		nh = (RNTORT(rn))->rt_nhop;
	RIB_RUNLOCK(rh);

	return (nh);
}

static uint8_t
radix4_get_pref(const struct rib_rtable_info *rinfo)
{

	return (50);
}

static enum flm_op_result
radix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
{
	struct radix4_data *r4;

	r4 = malloc(sizeof(struct radix4_data), M_RTABLE, M_NOWAIT | M_ZERO);
	if (r4 == NULL)
		return (FLM_REBUILD);
	r4->rh = fib_get_rh(fd);

	*_data = r4;

	return (FLM_SUCCESS);
}

static void
radix4_destroy(void *_data)
{

	free(_data, M_RTABLE);
}

static enum flm_op_result
radix4_add_route_cb(struct rtentry *rt, void *_data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
radix4_end_dump(void *_data, struct fib_dp *dp)
{
	struct radix4_data *r4 = (struct radix4_data *)_data;

	dp->f = radix4_lookup;
	dp->arg = r4->rh;

	return (FLM_SUCCESS);
}

static enum flm_op_result
radix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *_data)
{

	return (FLM_SUCCESS);
}

struct fib_lookup_module flm_radix4 = {
	.flm_name = "radix4",
	.flm_family = AF_INET,
	.flm_init_cb = radix4_init,
	.flm_destroy_cb = radix4_destroy,
	.flm_dump_rib_item_cb = radix4_add_route_cb,
	.flm_dump_end_cb = radix4_end_dump,
	.flm_change_rib_item_cb = radix4_change_cb,
	.flm_get_pref = radix4_get_pref,
};

static void
fib4_algo_init(void)
{

	fib_module_register(&flm_bsearch4);
	fib_module_register(&flm_radix4_lockless);
	fib_module_register(&flm_radix4);
}
SYSINIT(fib4_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, fib4_algo_init, NULL);