/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012-2021 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure.  More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */
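
/*
 * Illustration (assuming the default DXR_TRIE_BITS = 20 below): a lookup
 * of 10.0.129.7 (0x0a008107) uses the upper 20 bits (0x0a008) to index
 * the trie, and binary-searches on the remaining 12 bits (0x107) within
 * the range chunk the trie entry points at.
 */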
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>
#define DXR_TRIE_BITS 20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);
/* DXR2: two-stage primary trie, instead of a single direct lookup table */
#define DXR2

#if DXR_TRIE_BITS > 16
#define DXR_D 16
#else
#define DXR_D (DXR_TRIE_BITS - 1)
#endif
#define DXR_X (DXR_TRIE_BITS - DXR_D)

#define D_TBL_SIZE (1 << DXR_D)
#define DIRECT_TBL_SIZE (1 << DXR_TRIE_BITS)
#define DXR_RANGE_MASK (0xffffffffU >> DXR_TRIE_BITS)
#define DXR_RANGE_SHIFT (32 - DXR_TRIE_BITS)
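
/*
 * For reference, with the default DXR_TRIE_BITS = 20 the above works out
 * to DXR_D = 16, DXR_X = 4, DIRECT_TBL_SIZE = 1M entries,
 * DXR_RANGE_SHIFT = 12 and DXR_RANGE_MASK = 0xfff.
 */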
#define DESC_BASE_BITS 22
#define DESC_FRAGMENTS_BITS (32 - DESC_BASE_BITS)
#define BASE_MAX ((1 << DESC_BASE_BITS) - 1)
#define RTBL_SIZE_INCR (BASE_MAX / 64)
#if DXR_TRIE_BITS < 24
#define FRAGS_MASK_SHORT ((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define FRAGS_MASK_SHORT 0
#endif
#define FRAGS_PREF_SHORT (((1 << DESC_FRAGMENTS_BITS) - 1) & \
    ~FRAGS_MASK_SHORT)
#define FRAGS_MARK_XL (FRAGS_PREF_SHORT - 1)
#define FRAGS_MARK_HIT (FRAGS_PREF_SHORT - 2)
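
/*
 * Worked out for DXR_TRIE_BITS = 20: DESC_FRAGMENTS_BITS = 10,
 * FRAGS_MASK_SHORT = 0x7, FRAGS_PREF_SHORT = 0x3f8, FRAGS_MARK_XL = 0x3f7
 * and FRAGS_MARK_HIT = 0x3f6, i.e. fragment descriptors 0x3f8..0x3ff
 * denote short-format chunks, 0x3f6 a direct (single next hop) hit,
 * 0x3f7 an XL chunk, and anything smaller a long-format fragment count.
 */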
#define IS_SHORT_FORMAT(x) ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define IS_LONG_FORMAT(x) ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define IS_XL_FORMAT(x) (x == FRAGS_MARK_XL)

#define RE_SHORT_MAX_NH ((1 << (DXR_TRIE_BITS - 8)) - 1)
#define CHUNK_HASH_BITS 16
#define CHUNK_HASH_SIZE (1 << CHUNK_HASH_BITS)
#define CHUNK_HASH_MASK (CHUNK_HASH_SIZE - 1)

#define TRIE_HASH_BITS 16
#define TRIE_HASH_SIZE (1 << TRIE_HASH_BITS)
#define TRIE_HASH_MASK (TRIE_HASH_SIZE - 1)

#define XTBL_SIZE_INCR (DIRECT_TBL_SIZE / 16)

#define UNUSED_BUCKETS 8
/* Lookup structure elements */

struct direct_entry {
	uint32_t	fragments: DESC_FRAGMENTS_BITS,
			base: DESC_BASE_BITS;
};

struct range_entry_long {
	uint32_t	start: DXR_RANGE_SHIFT,
			nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
	uint16_t	start: DXR_RANGE_SHIFT - 8,
			nexthop: DXR_TRIE_BITS - 8;
};
#endif
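
/*
 * With DXR_TRIE_BITS = 20 a range_entry_long packs a 12-bit start offset
 * and a 20-bit next-hop index into 32 bits, while a range_entry_short
 * (usable when all fragment starts are 256-aligned and next-hop indices
 * fit in 12 bits) halves that to 16 bits per fragment.
 */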
/* Auxiliary structures */

struct heap_entry {
	uint32_t	start;
	uint32_t	end;
	uint32_t	preflen;
	uint32_t	nexthop;
};

struct chunk_desc {
	LIST_ENTRY(chunk_desc)	cd_all_le;
	LIST_ENTRY(chunk_desc)	cd_hash_le;
	uint32_t		cd_hash;
	uint32_t		cd_refcnt;
	uint32_t		cd_base;
	uint32_t		cd_cur_size;
	uint32_t		cd_max_size;
};

struct trie_desc {
	LIST_ENTRY(trie_desc)	td_all_le;
	LIST_ENTRY(trie_desc)	td_hash_le;
	uint32_t		td_hash;
	uint16_t		td_index;
	uint16_t		td_refcnt;
};
struct dxr_aux {
	/* Glue to external state */
	struct fib_data		*fd;
	uint32_t		fibnum;
	int			refcnt;

	/* Auxiliary build-time tables */
	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
	uint16_t		d_tbl[D_TBL_SIZE];
	struct direct_entry	*x_tbl;
	union {
		struct range_entry_long	re;
		uint32_t		fragments;
	}			*range_tbl;
	/* Auxiliary internal state */
	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
	struct trie_desc	*trietbl[D_TBL_SIZE];
	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
	LIST_HEAD(, chunk_desc)	all_chunks;
	LIST_HEAD(, chunk_desc)	unused_chunks[UNUSED_BUCKETS];
	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
	LIST_HEAD(, trie_desc)	all_trie;
	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
	struct sockaddr_in	dst;
	struct sockaddr_in	mask;
	struct heap_entry	heap[33];
	uint32_t		prefixes;
	uint32_t		updates_low;
	uint32_t		updates_high;
	uint32_t		all_chunks_cnt;
	uint32_t		unused_chunks_cnt;
	uint32_t		xtbl_size;
	uint32_t		all_trie_cnt;
	uint32_t		unused_trie_cnt;
	uint32_t		trie_rebuilt_prefixes;
	uint32_t		heap_index;
	uint32_t		d_bits;
	uint32_t		rtbl_size;
	uint32_t		rtbl_top;
	uint32_t		rtbl_work_frags;
	uint32_t		work_chunk;
};
/* Main lookup structure container */

struct dxr {
	/* Lookup tables */
	void			*d;
	void			*x;
	void			*r;
	struct nhop_object	**nh_tbl;

	/* Glue to external state */
	struct dxr_aux		*aux;
	struct fib_data		*fd;
	struct epoch_context	epoch_ctx;
	uint32_t		fibnum;
	uint16_t		d_shift;
	uint16_t		x_shift;
	uint32_t		x_mask;
};
static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;
SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

VNET_DEFINE_STATIC(int, max_trie_holes) = 8;
#define V_max_trie_holes VNET(max_trie_holes)
SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_trie_holes,
    CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_trie_holes), 0,
    "Trie fragmentation threshold before triggering a full rebuild");

VNET_DEFINE_STATIC(int, max_range_holes) = 16;
#define V_max_range_holes VNET(max_range_holes)
SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_range_holes,
    CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_range_holes), 0,
    "Range table fragmentation threshold before triggering a full rebuild");
/* Binary search for a matching address range */
#define DXR_LOOKUP_STAGE					\
	if (masked_dst < range[middle].start) {			\
		upperbound = middle;				\
		middle = (middle + lowerbound) / 2;		\
	} else if (masked_dst < range[middle + 1].start)	\
		return (range[middle].nexthop);			\
	else {							\
		lowerbound = middle + 1;			\
		middle = (upperbound + middle + 1) / 2;		\
	}							\
	if (upperbound == lowerbound)				\
		return (range[lowerbound].nexthop);
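
/*
 * Each DXR_LOOKUP_STAGE expansion is one halving step of a classic
 * binary search over [lowerbound, upperbound]: fragment middle matches
 * when masked_dst falls between range[middle].start (inclusive) and
 * range[middle + 1].start (exclusive).  dxr_lookup() below expands the
 * macro twice per loop iteration to keep the search partially unrolled.
 */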
static int
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
#ifdef DXR2
	uint16_t *dt = dxr->d;
	struct direct_entry *xt = dxr->x;
	int xi;
#else
	struct direct_entry *dt = dxr->d;
#endif
	struct direct_entry de;
	struct range_entry_long *rt;
	uint32_t base, upperbound, middle, lowerbound, masked_dst;

#ifdef DXR2
	xi = (dt[dst >> dxr->d_shift] << dxr->x_shift) +
	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask);
	de = xt[xi];
#else
	de = dt[dst >> DXR_RANGE_SHIFT];
#endif
	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
		return (de.base);

	rt = dxr->r;
	base = de.base;
	lowerbound = 0;
	masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
		upperbound = de.fragments & FRAGS_MASK_SHORT;
		struct range_entry_short *range =
		    (struct range_entry_short *) &rt[base];

		masked_dst >>= 8;
		middle = upperbound;
		upperbound = upperbound * 2 + 1;
		for (;;) {
			DXR_LOOKUP_STAGE
			DXR_LOOKUP_STAGE
		}
	}
#endif

	upperbound = de.fragments;
	middle = upperbound / 2;
	struct range_entry_long *range = &rt[base];
	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
		upperbound = *((uint32_t *) range);
		range++;
		middle = upperbound / 2;
	}
	for (;;) {
		DXR_LOOKUP_STAGE
		DXR_LOOKUP_STAGE
	}
}
static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
	struct heap_entry *fhp = &da->heap[0];
	struct rtentry *rt;
	struct route_nhop_data rnd;

	da->heap_index = 0;
	da->dst.sin_addr.s_addr = htonl(dst_u32);
	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
	    &rnd);
	if (rt != NULL) {
		struct in_addr addr;
		uint32_t scopeid;

		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
		fhp->start = ntohl(addr.s_addr);
		fhp->end = fhp->start;
		if (fhp->preflen < 32)
			fhp->end |= (0xffffffffU >> fhp->preflen);
		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
	} else {
		fhp->preflen = fhp->nexthop = fhp->start = 0;
		fhp->end = 0xffffffffU;
	}
}
static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

	if (IS_SHORT_FORMAT(fdesc->fragments))
		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
	else if (IS_XL_FORMAT(fdesc->fragments))
		return (da->range_tbl[fdesc->base].fragments + 2);
	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
		return (fdesc->fragments + 1);
}
static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
	uint32_t size = chunk_size(da, fdesc);
	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
	uint32_t hash = fdesc->fragments;

	for (; p < l; p++)
		hash = (hash << 7) + (hash >> 13) + *p;

	return (hash + (hash >> 16));
}
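
/*
 * chunk_ref() below deduplicates range chunks: a freshly built chunk
 * sitting at the top of the range table heap is either aliased to an
 * existing identical chunk (found via chunk_hashtbl), moved into a
 * recycled hole, or kept in place under a new descriptor.  It returns 0
 * on success and nonzero when an allocation fails or the range table
 * would exceed BASE_MAX elements.
 */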
static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *empty_cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le) {
		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size))
			continue;
		da->rtbl_top = fdesc->base;
		fdesc->base = cdp->cd_base;
		cdp->cd_refcnt++;
		return (0);
	}

	/* No matching chunks found. Find an empty one to recycle. */
	for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
		cdp = LIST_FIRST(&da->unused_chunks[i]);

	if (cdp == NULL)
		LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
			if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
			    empty_cdp->cd_max_size < cdp->cd_max_size)) {
				cdp = empty_cdp;
				if (empty_cdp->cd_max_size == size)
					break;
			}

	if (cdp != NULL) {
		/* Copy from heap into the recycled chunk */
		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
		    size * sizeof(struct range_entry_long));
		fdesc->base = cdp->cd_base;
		da->rtbl_top -= size;
		da->unused_chunks_cnt--;
		if (cdp->cd_max_size > size) {
			/* Split the range in two, need a new descriptor */
			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
			if (empty_cdp == NULL)
				return (1);
			LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
			empty_cdp->cd_base = cdp->cd_base + size;
			empty_cdp->cd_cur_size = 0;
			empty_cdp->cd_max_size = cdp->cd_max_size - size;

			i = empty_cdp->cd_max_size;
			if (i >= UNUSED_BUCKETS)
				i = 0;
			LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
			    cd_hash_le);

			da->all_chunks_cnt++;
			da->unused_chunks_cnt++;
			cdp->cd_max_size = size;
		}
		LIST_REMOVE(cdp, cd_hash_le);
	} else {
		/* Alloc a new descriptor at the top of the heap */
		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
		if (cdp == NULL)
			return (1);
		cdp->cd_max_size = size;
		cdp->cd_base = fdesc->base;
		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
		da->all_chunks_cnt++;
		KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top,
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
	}

	cdp->cd_hash = hash;
	cdp->cd_refcnt = 1;
	cdp->cd_cur_size = size;
	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
	    cd_hash_le);
	if (da->rtbl_top >= da->rtbl_size) {
		if (da->rtbl_top >= BASE_MAX) {
			FIB_PRINTF(LOG_ERR, da->fd,
			    "structural limit exceeded at %d "
			    "range table elements", da->rtbl_top);
			return (1);
		}
		da->rtbl_size += RTBL_SIZE_INCR;
		if (da->rtbl_top >= BASE_MAX / 4)
			FIB_PRINTF(LOG_WARNING, da->fd, "range table at %d%%",
			    da->rtbl_top * 100 / BASE_MAX);
		da->range_tbl = realloc(da->range_tbl,
		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
		    M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL)
			return (1);
	}

	return (0);
}
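
/*
 * chunk_unref() drops one reference to a chunk and, once the chunk is
 * unreferenced, coalesces the resulting hole with empty neighbors,
 * either trimming the top of the range table heap or parking the
 * descriptor in a size-keyed unused_chunks[] bucket for later reuse.
 */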
static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *cdp2;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);
	int i;

	/* Find the corresponding descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le)
		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size) == 0)
			break;

	KASSERT(cdp != NULL, ("dxr: dangling chunk"));
	if (--cdp->cd_refcnt > 0)
		return;

	LIST_REMOVE(cdp, cd_hash_le);
	da->unused_chunks_cnt++;
	cdp->cd_cur_size = 0;

	/* Attempt to merge with the preceding chunk, if empty */
	cdp2 = LIST_NEXT(cdp, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base,
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
		LIST_REMOVE(cdp, cd_all_le);
		da->all_chunks_cnt--;
		LIST_REMOVE(cdp2, cd_hash_le);
		da->unused_chunks_cnt--;
		cdp2->cd_max_size += cdp->cd_max_size;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	/* Attempt to merge with the subsequent chunk, if empty */
	cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
	if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
		KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base,
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
		LIST_REMOVE(cdp, cd_all_le);
		da->all_chunks_cnt--;
		LIST_REMOVE(cdp2, cd_hash_le);
		da->unused_chunks_cnt--;
		cdp2->cd_max_size += cdp->cd_max_size;
		cdp2->cd_base = cdp->cd_base;
		uma_zfree(chunk_zone, cdp);
		cdp = cdp2;
	}

	if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
		/* Free the chunk on the top of the range heap, trim the heap */
		KASSERT(cdp == LIST_FIRST(&da->all_chunks),
		    ("dxr: %s %d", __FUNCTION__, __LINE__));
		da->all_chunks_cnt--;
		da->unused_chunks_cnt--;
		da->rtbl_top -= cdp->cd_max_size;
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
		return;
	}

	i = cdp->cd_max_size;
	if (i >= UNUSED_BUCKETS)
		i = 0;
	LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}
static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
	uint32_t i, *val;
	uint32_t hash = 0;

	for (i = 0; i < (1 << dxr_x); i++) {
		hash = (hash << 3) ^ (hash >> 3);
		val = (uint32_t *)
		    (void *) &da->direct_tbl[(index << dxr_x) + i];
		hash += (*val << 5);
		hash += (*val >> 5);
	}

	return (hash + (hash >> 16));
}
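
/*
 * trie_ref() and trie_unref() manage reference-counted, deduplicated
 * copies of direct_tbl ranges inside x_tbl, mirroring what chunk_ref()
 * and chunk_unref() do for range table chunks.
 */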
static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp;
	uint32_t dxr_d = da->d_bits;
	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
	uint32_t hash = trie_hash(da, dxr_x, index);

	/* Find an existing descriptor */
	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
		if (tp->td_hash == hash &&
		    memcmp(&da->direct_tbl[index << dxr_x],
		    &da->x_tbl[tp->td_index << dxr_x],
		    sizeof(*da->x_tbl) << dxr_x) == 0) {
			tp->td_refcnt++;
			da->trietbl[index] = tp;
			return (tp->td_index);
		}

	tp = LIST_FIRST(&da->unused_trie);
	if (tp != NULL) {
		LIST_REMOVE(tp, td_hash_le);
		da->unused_trie_cnt--;
	} else {
		tp = uma_zalloc(trie_zone, M_NOWAIT);
		if (tp == NULL)
			return (-1);
		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
		tp->td_index = da->all_trie_cnt++;
	}

	tp->td_hash = hash;
	tp->td_refcnt = 1;
	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
	    td_hash_le);
	memcpy(&da->x_tbl[tp->td_index << dxr_x],
	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
	da->trietbl[index] = tp;
	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
		da->xtbl_size += XTBL_SIZE_INCR;
		da->x_tbl = realloc(da->x_tbl,
		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL)
			return (-1);
	}
	return (tp->td_index);
}
static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp = da->trietbl[index];

	if (tp == NULL)
		return;
	da->trietbl[index] = NULL;
	if (--tp->td_refcnt > 0)
		return;

	LIST_REMOVE(tp, td_hash_le);
	da->unused_trie_cnt++;
	if (tp->td_index != da->all_trie_cnt - 1) {
		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
		return;
	}

	do {
		da->all_trie_cnt--;
		da->unused_trie_cnt--;
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
			if (tp->td_index == da->all_trie_cnt - 1) {
				LIST_REMOVE(tp, td_hash_le);
				break;
			}
	} while (tp != NULL);
}
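
/*
 * heap_inject() inserts a prefix into the small per-chunk "heap", which
 * is kept sorted by ascending prefix length, so da->heap[heap_index] is
 * always the most specific prefix covering the current address.
 */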
static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
	struct heap_entry *fhp;
	int i;

	for (i = da->heap_index; i >= 0; i--) {
		if (preflen > da->heap[i].preflen)
			break;
		else if (preflen < da->heap[i].preflen)
			da->heap[i + 1] = da->heap[i];
		else
			return;
	}

	fhp = &da->heap[i + 1];
	fhp->preflen = preflen;
	fhp->start = start;
	fhp->end = end;
	fhp->nexthop = nh;
	da->heap_index++;
}
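
/*
 * dxr_walk() is the rib_walk_from() callback: invoked once per prefix in
 * ascending address order, it maintains the heap of currently covering
 * prefixes and appends a new range fragment whenever the effective next
 * hop changes inside the chunk being rebuilt.
 */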
static int
dxr_walk(struct rtentry *rt, void *arg)
{
	struct dxr_aux *da = arg;
	uint32_t chunk = da->work_chunk;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;
	struct range_entry_long *fp =
	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	struct heap_entry *fhp = &da->heap[da->heap_index];
	uint32_t preflen, nh, start, end, scopeid;
	struct in_addr addr;

	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
	start = ntohl(addr.s_addr);
	if (start > last)
		return (-1);	/* Beyond chunk boundaries, we are done */
	if (start < first)
		return (0);	/* Skip this route */

	end = start;
	if (preflen < 32)
		end |= (0xffffffffU >> preflen);
	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

	if (start == fhp->start)
		heap_inject(da, start, end, preflen, nh);
	else {
		/* start > fhp->start */
		while (start > fhp->end) {
			uint32_t oend = fhp->end;

			if (da->heap_index > 0) {
				fhp--;
				da->heap_index--;
			} else
				initheap(da, fhp->end + 1, chunk);
			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
				fp++;
				da->rtbl_work_frags++;
				fp->start = (oend + 1) & DXR_RANGE_MASK;
				fp->nexthop = fhp->nexthop;
			}
		}
		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
		    nh != fp->nexthop) {
			fp++;
			da->rtbl_work_frags++;
			fp->start = start & DXR_RANGE_MASK;
		} else if (da->rtbl_work_frags) {
			if ((--fp)->nexthop == nh)
				da->rtbl_work_frags--;
			else
				fp++;
		}
		fp->nexthop = nh;
		heap_inject(da, start, end, preflen, nh);
	}

	return (0);
}
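
/*
 * update_chunk() rebuilds the range fragments for a single direct table
 * chunk: it seeds the heap, walks the RIB across the chunk's address
 * span, flushes prefixes remaining on the heap, and finally attempts the
 * compact short (16-bit) encoding, falling back to the XL encoding when
 * the fragment count no longer fits in the direct table descriptor.
 */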
static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
	struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
	struct range_entry_short *fps;
	uint32_t start, nh, i;
#endif
	struct heap_entry *fhp;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;

	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
		chunk_unref(da, chunk);

	initheap(da, first, chunk);

	fp = &da->range_tbl[da->rtbl_top].re;
	da->rtbl_work_frags = 0;
	fp->start = first & DXR_RANGE_MASK;
	fp->nexthop = da->heap[0].nexthop;

	da->dst.sin_addr.s_addr = htonl(first);
	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

	da->work_chunk = chunk;
	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
	    dxr_walk, da);

	/* Flush any remaining objects on the heap */
	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	fhp = &da->heap[da->heap_index];
	while (fhp->preflen > DXR_TRIE_BITS) {
		uint32_t oend = fhp->end;

		if (da->heap_index > 0) {
			fhp--;
			da->heap_index--;
		} else
			initheap(da, fhp->end + 1, chunk);
		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
			/* Have we crossed the upper chunk boundary? */
			if (oend >= last)
				break;
			fp++;
			da->rtbl_work_frags++;
			fp->start = (oend + 1) & DXR_RANGE_MASK;
			fp->nexthop = fhp->nexthop;
		}
	}

	/* Direct hit if the chunk contains only a single fragment */
	if (da->rtbl_work_frags == 0) {
		da->direct_tbl[chunk].base = fp->nexthop;
		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
		return (0);
	}

	da->direct_tbl[chunk].base = da->rtbl_top;
	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
	/* Check whether the chunk can be more compactly encoded */
	fp = &da->range_tbl[da->rtbl_top].re;
	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
			break;
	if (i == da->rtbl_work_frags + 1) {
		fp = &da->range_tbl[da->rtbl_top].re;
		fps = (void *) fp;
		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
			start = fp->start;
			nh = fp->nexthop;
			fps->start = start >> 8;
			fps->nexthop = nh;
		}
		fps->start = start >> 8;
		fps->nexthop = nh;
		da->rtbl_work_frags >>= 1;
		da->direct_tbl[chunk].fragments =
		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
	} else
#endif
	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
		memmove(&da->range_tbl[da->rtbl_top + 1],
		    &da->range_tbl[da->rtbl_top],
		    (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
		da->rtbl_work_frags++;
	}
	da->rtbl_top += (da->rtbl_work_frags + 1);
	return (chunk_ref(da, chunk));
}
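
/*
 * dxr_build() drives a full or incremental rebuild: chunks flagged in
 * updates_mask are reconstructed and, with DXR2, the d_tbl/x_tbl split
 * is iteratively re-balanced, adjusting d_bits until the total lookup
 * structure size stops improving.
 */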
static void
dxr_build(struct dxr *dxr)
{
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct rib_rtable_info rinfo;
	struct timeval t0, t1, t2, t3;
	uint32_t r_size, dxr_tot_size;
	uint32_t i, m, range_rebuild = 0;
#ifdef DXR2
	struct trie_desc *tp;
	uint32_t d_tbl_size, dxr_x, d_size, x_size;
	uint32_t ti, trie_rebuild = 0, prev_size = 0;
#endif

	KASSERT(dxr->d == NULL, ("dxr: d not free"));

	if (da == NULL) {
		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
		if (da == NULL)
			return;
		dxr->aux = da;
		da->fibnum = dxr->fibnum;
		da->refcnt = 1;
		LIST_INIT(&da->all_chunks);
		LIST_INIT(&da->all_trie);
		da->rtbl_size = RTBL_SIZE_INCR;
		da->range_tbl = NULL;
		da->xtbl_size = XTBL_SIZE_INCR;
		da->x_tbl = NULL;
		bzero(&da->dst, sizeof(da->dst));
		bzero(&da->mask, sizeof(da->mask));
		da->dst.sin_len = sizeof(da->dst);
		da->mask.sin_len = sizeof(da->mask);
		da->dst.sin_family = AF_INET;
		da->mask.sin_family = AF_INET;
	}
	if (da->range_tbl == NULL) {
		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL)
			return;
		range_rebuild = 1;
	}
#ifdef DXR2
	if (da->x_tbl == NULL) {
		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
		    M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL)
			return;
		trie_rebuild = 1;
	}
#endif
	da->fd = dxr->fd;

	microuptime(&t0);

	dxr->nh_tbl = fib_get_nhop_array(da->fd);
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

	if (da->updates_low > da->updates_high ||
	    da->unused_chunks_cnt > V_max_range_holes)
		range_rebuild = 1;
	if (range_rebuild) {
		/* Bulk cleanup */
		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
			LIST_REMOVE(cdp, cd_all_le);
			uma_zfree(chunk_zone, cdp);
		}
		for (i = 0; i < UNUSED_BUCKETS; i++)
			LIST_INIT(&da->unused_chunks[i]);
		da->all_chunks_cnt = da->unused_chunks_cnt = 0;
		da->rtbl_top = 0;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
			da->direct_tbl[i].base = 0;
		}
	}
	da->prefixes = rinfo.num_prefixes;

	/* DXR: construct direct & range table */
	for (i = da->updates_low; i <= da->updates_high; i++) {
		m = da->updates_mask[i >> 5] >> (i & 0x1f);
		if (m == 0)
			i |= 0x1f;
		else if (m & 1 && update_chunk(da, i) != 0)
			return;
	}
	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
	microuptime(&t1);

#ifdef DXR2
	if (range_rebuild || da->unused_trie_cnt > V_max_trie_holes ||
	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
		trie_rebuild = 1;
	if (trie_rebuild) {
		da->trie_rebuilt_prefixes = da->prefixes;
		da->d_bits = DXR_D;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
	}

dxr2_try_squeeze:
	if (trie_rebuild) {
		/* Bulk cleanup */
		bzero(da->trietbl, sizeof(da->trietbl));
		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
			LIST_REMOVE(tp, td_all_le);
			uma_zfree(trie_zone, tp);
		}
		LIST_INIT(&da->unused_trie);
		da->all_trie_cnt = da->unused_trie_cnt = 0;
	}

	/* Populate d_tbl, x_tbl */
	dxr_x = DXR_TRIE_BITS - da->d_bits;
	d_tbl_size = (1 << da->d_bits);

	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
	    i++) {
		if (!trie_rebuild) {
			m = 0;
			for (int j = 0; j < (1 << dxr_x); j += 32)
				m |= da->updates_mask[((i << dxr_x) + j) >> 5];
			if (m == 0)
				continue;
			trie_unref(da, i);
		}
		ti = trie_ref(da, i);
		if (ti == (uint32_t) -1)
			return;
		da->d_tbl[i] = ti;
	}

	d_size = sizeof(*da->d_tbl) * d_tbl_size;
	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
	    * da->all_trie_cnt;
	dxr_tot_size = d_size + x_size + r_size;

	if (trie_rebuild == 1) {
		/* Try to find a more compact D/X split */
		if (prev_size == 0 || dxr_tot_size <= prev_size)
			da->d_bits--;
		else {
			da->d_bits++;
			trie_rebuild = 2;
		}
		prev_size = dxr_tot_size;
		goto dxr2_try_squeeze;
	}
	microuptime(&t2);
#else /* !DXR2 */
	dxr_tot_size = sizeof(da->direct_tbl) + r_size;
	t2 = t1;
#endif

	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
	if (dxr->d == NULL)
		return;
#ifdef DXR2
	memcpy(dxr->d, da->d_tbl, d_size);
	dxr->x = ((char *) dxr->d) + d_size;
	memcpy(dxr->x, da->x_tbl, x_size);
	dxr->r = ((char *) dxr->x) + x_size;
	dxr->d_shift = 32 - da->d_bits;
	dxr->x_shift = dxr_x;
	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
#else
	memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
	dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
#endif
	memcpy(dxr->r, da->range_tbl, r_size);

	if (da->updates_low <= da->updates_high)
		bzero(&da->updates_mask[da->updates_low / 32],
		    (da->updates_high - da->updates_low) / 8 + 1);
	da->updates_low = DIRECT_TBL_SIZE - 1;
	da->updates_high = 0;
	microuptime(&t3);

#ifdef DXR2
	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
#else
	FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
	    DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
#endif
	i = dxr_tot_size * 100;
	if (rinfo.num_prefixes)
		i /= rinfo.num_prefixes;
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
	    i / 100, i % 100);
	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#ifdef DXR2
	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#endif
	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
	    i / 1000, i % 1000);
	FIB_PRINTF(LOG_INFO, da->fd, "range table: %d%%, %d chunks, %d holes",
	    da->rtbl_top * 100 / BASE_MAX, da->all_chunks_cnt,
	    da->unused_chunks_cnt);
}
/*
 * Glue functions for attaching to the FreeBSD 13 fib_algo infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct dxr *dxr = algo_data;
	uint32_t nh;

	nh = dxr_lookup(dxr, ntohl(key.addr4.s_addr));

	return (dxr->nh_tbl[nh]);
}
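
/*
 * dxr_fib_lookup() is the hot-path entry point installed in struct
 * fib_dp; it is expected to run under the network epoch, so the tables
 * it reads are only reclaimed via epoch_dxr_destroy() once all
 * concurrent readers have drained.
 */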
static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
	struct dxr *old_dxr = old_data;
	struct dxr_aux *da = NULL;
	struct dxr *dxr;

	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
	if (dxr == NULL)
		return (FLM_REBUILD);

	/* Check whether we may reuse the old auxiliary structures */
	if (old_dxr != NULL && old_dxr->aux != NULL) {
		da = old_dxr->aux;
		atomic_add_int(&da->refcnt, 1);
	}

	dxr->aux = da;
	dxr->d = NULL;
	dxr->fd = fd;
	dxr->fibnum = fibnum;
	*data = dxr;

	return (FLM_SUCCESS);
}
static void
dxr_destroy(void *data)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;
	struct chunk_desc *cdp;
	struct trie_desc *tp;

	free(dxr->d, M_DXRLPM);
	da = dxr->aux;
	free(dxr, M_DXRAUX);

	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
		return;

	/* Release auxiliary structures */
	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
	}
	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
	}
	free(da->range_tbl, M_DXRAUX);
	free(da->x_tbl, M_DXRAUX);
	free(da, M_DXRAUX);
}
static void
epoch_dxr_destroy(epoch_context_t ctx)
{
	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

	dxr_destroy(dxr);
}
static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;

	dxr_build(dxr);

	da = dxr->aux;
	if (da == NULL)
		return (FLM_REBUILD);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
	if (dxr->d == NULL)
		return (FLM_REBUILD);

	dp->f = dxr_fib_lookup;
	dp->arg = dxr;

	return (FLM_SUCCESS);
}
static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

	return (FLM_SUCCESS);
}
static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

	return (FLM_BATCH);
}
static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
	struct dxr *dxr = data;
	struct dxr *new_dxr;
	struct dxr_aux *da;
	struct fib_dp new_dp;
	enum flm_op_result res;
	uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
	struct rib_rtable_info rinfo;
	int update_delta = 0;
#endif

	KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__));
	KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__));
	KASSERT(q->count < q->size, ("%s: q->count %d q->size %d",
	    __FUNCTION__, q->count, q->size));

	da = dxr->aux;
	KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__));

	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
	for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
		if (q->entries[ui].nh_new != NULL)
			update_delta++;
		if (q->entries[ui].nh_old != NULL)
			update_delta--;
#endif
		plen = q->entries[ui].plen;
		ip = ntohl(q->entries[ui].addr4.s_addr);
		if (plen < 32)
			hmask = 0xffffffffU >> plen;
		else
			hmask = 0;
		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
		end = (ip | hmask) >> DXR_RANGE_SHIFT;

		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
			for (i = start >> 5; i <= end >> 5; i++)
				da->updates_mask[i] = 0xffffffffU;
		else
			for (i = start; i <= end; i++)
				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
		if (start < da->updates_low)
			da->updates_low = start;
		if (end > da->updates_high)
			da->updates_high = end;
	}

#ifdef INVARIANTS
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
	KASSERT(da->prefixes + update_delta == rinfo.num_prefixes,
	    ("%s: update count mismatch", __FUNCTION__));
#endif

	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
	if (res != FLM_SUCCESS)
		return (res);

	dxr_build(new_dxr);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX) {
		dxr_destroy(new_dxr);
		return (FLM_ERROR);
	}

	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
	if (new_dxr->d == NULL) {
		dxr_destroy(new_dxr);
		return (FLM_REBUILD);
	}

	new_dp.f = dxr_fib_lookup;
	new_dp.arg = new_dxr;
	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
		fib_set_algo_ptr(dxr->fd, new_dxr);
		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
		return (FLM_SUCCESS);
	}

	dxr_destroy(new_dxr);
	return (FLM_REBUILD);
}
static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

	/* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
	return (251);
}
static struct fib_lookup_module fib_dxr_mod = {
	.flm_name = "dxr",
	.flm_family = AF_INET,
	.flm_init_cb = dxr_init,
	.flm_destroy_cb = dxr_destroy,
	.flm_dump_rib_item_cb = dxr_dump_rib_item,
	.flm_dump_end_cb = dxr_dump_end,
	.flm_change_rib_item_cb = dxr_change_rib_item,
	.flm_change_rib_items_cb = dxr_change_rib_batch,
	.flm_get_pref = dxr_get_pref,
};
static int
dxr_modevent(module_t mod, int type, void *unused)
{
	int error = 0;

	switch (type) {
	case MOD_LOAD:
		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		fib_module_register(&fib_dxr_mod);
		break;
	case MOD_UNLOAD:
		error = fib_module_unregister(&fib_dxr_mod);
		if (error)
			break;
		uma_zdestroy(chunk_zone);
		uma_zdestroy(trie_zone);
		break;
	default:
		error = EOPNOTSUPP;
	}

	return (error);
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);