1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2012-2022 Marko Zec
5  * Copyright (c) 2005, 2018 University of Zagreb
6  * Copyright (c) 2005 International Computer Science Institute
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29
30 /*
31  * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
32  * structures and a trivial search procedure.  More significant bits of
33  * the search key are used to directly index a two-stage trie, while the
34  * remaining bits are used for finding the next hop in a sorted array.
35  * More details in:
36  *
37  * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
38  * second in software, ACM SIGCOMM Computer Communication Review, September
39  * 2012
40  *
41  * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
42  * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
43  */
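
/*
 * Worked example of the key split (illustrative, assuming the default
 * DXR_TRIE_BITS of 20 defined below): for dst = 10.1.2.3 = 0x0a010203,
 * the upper 20 bits (0x0a010) index the direct/trie table, while the
 * remaining 12 bits (0x203) are the key for the binary search over the
 * chunk's sorted range entries.  With DXR2 and e.g. a 16-bit first stage,
 * d_tbl[0x0a01] selects a trie chunk and the next 4 bits (here 0x0) select
 * the direct_entry within it.
 */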
44
45 #include <sys/cdefs.h>
46 #include "opt_inet.h"
47
48 #include <sys/param.h>
49 #include <sys/kernel.h>
50 #include <sys/ctype.h>
51 #include <sys/epoch.h>
52 #include <sys/malloc.h>
53 #include <sys/module.h>
54 #include <sys/socket.h>
55 #include <sys/sysctl.h>
56 #include <sys/syslog.h>
57
58 #include <vm/uma.h>
59
60 #include <netinet/in.h>
61 #include <netinet/in_fib.h>
62
63 #include <net/route.h>
64 #include <net/route/route_ctl.h>
65 #include <net/route/fib_algo.h>
66
67 #define DXR_TRIE_BITS           20
68
69 CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);
70
71 /* DXR2: two-stage primary trie, instead of a single direct lookup table */
72 #define DXR2
73
74 #if DXR_TRIE_BITS > 16
75 #define DXR_D                   16
76 #else
77 #define DXR_D                   (DXR_TRIE_BITS - 1)
78 #endif
79
80 #define D_TBL_SIZE              (1 << DXR_D)
81 #define DIRECT_TBL_SIZE         (1 << DXR_TRIE_BITS)
82 #define DXR_RANGE_MASK          (0xffffffffU >> DXR_TRIE_BITS)
83 #define DXR_RANGE_SHIFT         (32 - DXR_TRIE_BITS)
84
85 #define DESC_BASE_BITS          22
86 #define DESC_FRAGMENTS_BITS     (32 - DESC_BASE_BITS)
87 #define BASE_MAX                ((1 << DESC_BASE_BITS) - 1)
88 #define RTBL_SIZE_INCR          (BASE_MAX / 64)
89
90 #if DXR_TRIE_BITS < 24
91 #define FRAGS_MASK_SHORT        ((1 << (23 - DXR_TRIE_BITS)) - 1)
92 #else
93 #define FRAGS_MASK_SHORT        0
94 #endif
95 #define FRAGS_PREF_SHORT        (((1 << DESC_FRAGMENTS_BITS) - 1) & \
96                                  ~FRAGS_MASK_SHORT)
97 #define FRAGS_MARK_XL           (FRAGS_PREF_SHORT - 1)
98 #define FRAGS_MARK_HIT          (FRAGS_PREF_SHORT - 2)
99
100 #define IS_SHORT_FORMAT(x)      ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
101 #define IS_LONG_FORMAT(x)       ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
102 #define IS_XL_FORMAT(x)         (x == FRAGS_MARK_XL)
103
104 #define RE_SHORT_MAX_NH         ((1 << (DXR_TRIE_BITS - 8)) - 1)
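
/*
 * With DXR_TRIE_BITS == 20 the above evaluate to: DESC_FRAGMENTS_BITS == 10,
 * FRAGS_MASK_SHORT == 0x7, FRAGS_PREF_SHORT == 0x3f8, FRAGS_MARK_XL == 0x3f7
 * and FRAGS_MARK_HIT == 0x3f6.  A direct_entry.fragments value below 0x3f6
 * is a long-format fragment count, 0x3f6 marks a direct hit (base holds the
 * nexthop index itself), 0x3f7 marks an XL chunk whose count is stored in
 * the first range_tbl word, and 0x3f8..0x3ff mark short-format chunks with
 * the count in the low 3 bits.
 */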
105
106 #define CHUNK_HASH_BITS         16
107 #define CHUNK_HASH_SIZE         (1 << CHUNK_HASH_BITS)
108 #define CHUNK_HASH_MASK         (CHUNK_HASH_SIZE - 1)
109
110 #define TRIE_HASH_BITS          16
111 #define TRIE_HASH_SIZE          (1 << TRIE_HASH_BITS)
112 #define TRIE_HASH_MASK          (TRIE_HASH_SIZE - 1)
113
114 #define XTBL_SIZE_INCR          (DIRECT_TBL_SIZE / 16)
115
116 #define UNUSED_BUCKETS          8
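
/*
 * Free space inside the range table is tracked in size-indexed buckets:
 * unused_chunks[i] for 1 <= i < UNUSED_BUCKETS holds empty chunks of exactly
 * i range entries, while unused_chunks[0] collects all larger ones and is
 * scanned linearly for a best fit (see chunk_ref() / chunk_unref()).
 */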
117
118 /* Lookup structure elements */
119
120 struct direct_entry {
121         uint32_t                fragments: DESC_FRAGMENTS_BITS,
122                                 base: DESC_BASE_BITS;
123 };
124
125 struct range_entry_long {
126         uint32_t                start: DXR_RANGE_SHIFT,
127                                 nexthop: DXR_TRIE_BITS;
128 };
129
130 #if DXR_TRIE_BITS < 24
131 struct range_entry_short {
132         uint16_t                start: DXR_RANGE_SHIFT - 8,
133                                 nexthop: DXR_TRIE_BITS - 8;
134 };
135 #endif
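
/*
 * With DXR_TRIE_BITS == 20 a range_entry_long occupies 32 bits (12-bit
 * start offset inside the chunk's 4096-address span, 20-bit nexthop index),
 * while a range_entry_short packs the same information into 16 bits by
 * storing the start in units of 256 addresses and limiting the nexthop
 * index to 12 bits (RE_SHORT_MAX_NH), so two short entries fit in the
 * space of one long entry.
 */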
136
137 /* Auxiliary structures */
138
139 struct heap_entry {
140         uint32_t                start;
141         uint32_t                end;
142         uint32_t                preflen;
143         uint32_t                nexthop;
144 };
145
146 struct chunk_desc {
147         LIST_ENTRY(chunk_desc)  cd_all_le;
148         LIST_ENTRY(chunk_desc)  cd_hash_le;
149         uint32_t                cd_hash;
150         uint32_t                cd_refcnt;
151         uint32_t                cd_base;
152         uint32_t                cd_cur_size;
153         uint32_t                cd_max_size;
154 };
155
156 struct trie_desc {
157         LIST_ENTRY(trie_desc)   td_all_le;
158         LIST_ENTRY(trie_desc)   td_hash_le;
159         uint32_t                td_hash;
160         uint32_t                td_index;
161         uint32_t                td_refcnt;
162 };
163
164 struct dxr_aux {
165         /* Glue to external state */
166         struct fib_data         *fd;
167         uint32_t                fibnum;
168         int                     refcnt;
169
170         /* Auxiliary build-time tables */
171         struct direct_entry     direct_tbl[DIRECT_TBL_SIZE];
172         uint16_t                d_tbl[D_TBL_SIZE];
173         struct direct_entry     *x_tbl;
174         union {
175                 struct range_entry_long re;
176                 uint32_t        fragments;
177         }                       *range_tbl;
178
179         /* Auxiliary internal state */
180         uint32_t                updates_mask[DIRECT_TBL_SIZE / 32];
181         struct trie_desc        *trietbl[D_TBL_SIZE];
182         LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE];
183         LIST_HEAD(, chunk_desc) all_chunks;
184         LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
185         LIST_HEAD(, trie_desc)  trie_hashtbl[TRIE_HASH_SIZE];
186         LIST_HEAD(, trie_desc)  all_trie;
187         LIST_HEAD(, trie_desc)  unused_trie; /* abuses hash link entry */
188         struct sockaddr_in      dst;
189         struct sockaddr_in      mask;
190         struct heap_entry       heap[33];
191         uint32_t                prefixes;
192         uint32_t                updates_low;
193         uint32_t                updates_high;
194         uint32_t                unused_chunks_size;
195         uint32_t                xtbl_size;
196         uint32_t                all_trie_cnt;
197         uint32_t                unused_trie_cnt;
198         uint32_t                trie_rebuilt_prefixes;
199         uint32_t                heap_index;
200         uint32_t                d_bits;
201         uint32_t                rtbl_size;
202         uint32_t                rtbl_top;
203         uint32_t                rtbl_work_frags;
204         uint32_t                work_chunk;
205 };
206
207 /* Main lookup structure container */
208
209 struct dxr {
210         /* Lookup tables */
211         void                    *d;
212         void                    *x;
213         void                    *r;
214         struct nhop_object      **nh_tbl;
215
216         /* Glue to external state */
217         struct dxr_aux          *aux;
218         struct fib_data         *fd;
219         struct epoch_context    epoch_ctx;
220         uint32_t                fibnum;
221         uint16_t                d_shift;
222         uint16_t                x_shift;
223         uint32_t                x_mask;
224 };
225
226 static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
227 static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");
228
229 uma_zone_t chunk_zone;
230 uma_zone_t trie_zone;
231
232 VNET_DEFINE_STATIC(int, frag_limit) = 100;
233 #define V_frag_limit    VNET(frag_limit)
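
/*
 * frag_limit is expressed in hundredths of a percent; the default of 100
 * therefore means 1.00%.  dxr_build() compares it against
 * unused_space * 10000 / total_size for both the range table and the trie,
 * and forces a full rebuild of the corresponding structure when exceeded.
 */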
234
235 /* Binary search for a matching address range */
236 #define DXR_LOOKUP_STAGE                                        \
237         if (masked_dst < range[middle].start) {                 \
238                 upperbound = middle;                            \
239                 middle = (middle + lowerbound) / 2;             \
240         } else if (masked_dst < range[middle + 1].start)        \
241                 return (range[middle].nexthop);                 \
242         else {                                                  \
243                 lowerbound = middle + 1;                        \
244                 middle = (upperbound + middle + 1) / 2;         \
245         }                                                       \
246         if (upperbound == lowerbound)                           \
247                 return (range[lowerbound].nexthop);
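
/*
 * Each DXR_LOOKUP_STAGE expansion performs one probe of a binary search over
 * range entries sorted by their start field: the probe hits when masked_dst
 * falls inside [range[middle].start, range[middle + 1].start).  Callers below
 * expand the macro twice per loop iteration to partially unroll the search.
 */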
248
249 static int
250 range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
251 {
252         uint32_t base;
253         uint32_t upperbound;
254         uint32_t middle;
255         uint32_t lowerbound;
256         uint32_t masked_dst;
257
258         base = de.base;
259         lowerbound = 0;
260         masked_dst = dst & DXR_RANGE_MASK;
261
262 #if DXR_TRIE_BITS < 24
263         if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
264                 upperbound = de.fragments & FRAGS_MASK_SHORT;
265                 struct range_entry_short *range =
266                     (struct range_entry_short *) &rt[base];
267
268                 masked_dst >>= 8;
269                 middle = upperbound;
270                 upperbound = upperbound * 2 + 1;
271
272                 for (;;) {
273                         DXR_LOOKUP_STAGE
274                         DXR_LOOKUP_STAGE
275                 }
276         }
277 #endif
278
279         upperbound = de.fragments;
280         middle = upperbound / 2;
281         struct range_entry_long *range = &rt[base];
282         if (__predict_false(IS_XL_FORMAT(de.fragments))) {
283                 upperbound = *((uint32_t *) range);
284                 range++;
285                 middle = upperbound / 2;
286         }
287
288         for (;;) {
289                 DXR_LOOKUP_STAGE
290                 DXR_LOOKUP_STAGE
291         }
292 }
293
294 #define DXR_LOOKUP_DEFINE(D)                                            \
295         static int inline                                               \
296         dxr_lookup_##D(struct dxr *dxr, uint32_t dst)                   \
297         {                                                               \
298                 struct direct_entry de;                                 \
299                 uint16_t *dt = dxr->d;                                  \
300                 struct direct_entry *xt = dxr->x;                       \
301                                                                         \
302                 de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
303                     + ((dst >> DXR_RANGE_SHIFT) &                       \
304                     (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))];      \
305                 if (__predict_true(de.fragments == FRAGS_MARK_HIT))     \
306                         return (de.base);                               \
307                 return (range_lookup(dxr->r, de, dst));                 \
308         }                                                               \
309                                                                         \
310         static struct nhop_object *                                     \
311         dxr_fib_lookup_##D(void *algo_data,                             \
312             const struct flm_lookup_key key, uint32_t scopeid __unused) \
313         {                                                               \
314                 struct dxr *dxr = algo_data;                            \
315                                                                         \
316                 return (dxr->nh_tbl[dxr_lookup_##D(dxr,                 \
317                     ntohl(key.addr4.s_addr))]);                         \
318         }
319
320 #ifdef DXR2
321 #if DXR_TRIE_BITS > 16
322 DXR_LOOKUP_DEFINE(16)
323 #endif
324 DXR_LOOKUP_DEFINE(15)
325 DXR_LOOKUP_DEFINE(14)
326 DXR_LOOKUP_DEFINE(13)
327 DXR_LOOKUP_DEFINE(12)
328 DXR_LOOKUP_DEFINE(11)
329 DXR_LOOKUP_DEFINE(10)
330 DXR_LOOKUP_DEFINE(9)
331 #endif /* DXR2 */
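
/*
 * The dxr_fib_lookup_<D> variants generated above hard-code the first-stage
 * width, letting the compiler fold the shift and mask amounts into
 * constants.  The generic dxr_lookup() below uses the d_shift/x_shift/x_mask
 * values computed at build time and serves as a fallback;
 * choose_lookup_fn() picks the entry point matching the current d_bits.
 */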
332
333 static int inline
334 dxr_lookup(struct dxr *dxr, uint32_t dst)
335 {
336         struct direct_entry de;
337 #ifdef DXR2
338         uint16_t *dt = dxr->d;
339         struct direct_entry *xt = dxr->x;
340
341         de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
342             ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
343 #else /* !DXR2 */
344         struct direct_entry *dt = dxr->d;
345
346         de = dt[dst >> DXR_RANGE_SHIFT];
347 #endif /* !DXR2 */
348         if (__predict_true(de.fragments == FRAGS_MARK_HIT))
349                 return (de.base);
350         return (range_lookup(dxr->r, de, dst));
351 }
352
353 static void
354 initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
355 {
356         struct heap_entry *fhp = &da->heap[0];
357         struct rtentry *rt;
358         struct route_nhop_data rnd;
359  
360         da->heap_index = 0;
361         da->dst.sin_addr.s_addr = htonl(dst_u32);
362         rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
363             &rnd);
364         if (rt != NULL) {
365                 struct in_addr addr;
366                 uint32_t scopeid;
367
368                 rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
369                 fhp->start = ntohl(addr.s_addr);
370                 fhp->end = fhp->start;
371                 if (fhp->preflen < 32)
372                         fhp->end |= (0xffffffffU >> fhp->preflen);
373                 fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
374         } else {
375                 fhp->preflen = fhp->nexthop = fhp->start = 0;
376                 fhp->end = 0xffffffffU;
377         }
378 }
379
380 static uint32_t
381 chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
382 {
383
384         if (IS_SHORT_FORMAT(fdesc->fragments))
385                 return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
386         else if (IS_XL_FORMAT(fdesc->fragments))
387                 return (da->range_tbl[fdesc->base].fragments + 2);
388         else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
389                 return (fdesc->fragments + 1);
390 }
391
392 static uint32_t
393 chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
394 {
395         uint32_t size = chunk_size(da, fdesc);
396         uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
397         uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
398         uint32_t hash = fdesc->fragments;
399
400         for (; p < l; p++)
401                 hash = (hash << 7) + (hash >> 13) + *p;
402
403         return (hash + (hash >> 16));
404 }
405
406 static int
407 chunk_ref(struct dxr_aux *da, uint32_t chunk)
408 {
409         struct direct_entry *fdesc = &da->direct_tbl[chunk];
410         struct chunk_desc *cdp, *empty_cdp;
411         uint32_t base = fdesc->base;
412         uint32_t size = chunk_size(da, fdesc);
413         uint32_t hash = chunk_hash(da, fdesc);
414         int i;
415
416         /* Find an existing descriptor */
417         LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
418             cd_hash_le) {
419                 if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
420                     memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
421                     sizeof(struct range_entry_long) * size))
422                         continue;
423                 da->rtbl_top = fdesc->base;
424                 fdesc->base = cdp->cd_base;
425                 cdp->cd_refcnt++;
426                 return (0);
427         }
428
429         /* No matching chunks found. Find an empty one to recycle. */
430         for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
431                 cdp = LIST_FIRST(&da->unused_chunks[i]);
432
433         if (cdp == NULL)
434                 LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
435                         if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
436                             empty_cdp->cd_max_size < cdp->cd_max_size)) {
437                                 cdp = empty_cdp;
438                                 if (empty_cdp->cd_max_size == size)
439                                         break;
440                         }
441
442         if (cdp != NULL) {
443                 /* Copy from heap into the recycled chunk */
444                 bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
445                     size * sizeof(struct range_entry_long));
446                 fdesc->base = cdp->cd_base;
447                 da->rtbl_top -= size;
448                 da->unused_chunks_size -= cdp->cd_max_size;
449                 if (cdp->cd_max_size > size) {
450                         /* Split the range in two, need a new descriptor */
451                         empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
452                         if (empty_cdp == NULL)
453                                 return (1);
454                         LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
455                         empty_cdp->cd_base = cdp->cd_base + size;
456                         empty_cdp->cd_cur_size = 0;
457                         empty_cdp->cd_max_size = cdp->cd_max_size - size;
458
459                         i = empty_cdp->cd_max_size;
460                         if (i >= UNUSED_BUCKETS)
461                                 i = 0;
462                         LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
463                             cd_hash_le);
464
465                         da->unused_chunks_size += empty_cdp->cd_max_size;
466                         cdp->cd_max_size = size;
467                 }
468                 LIST_REMOVE(cdp, cd_hash_le);
469         } else {
470                 /* Alloc a new descriptor at the top of the heap */
471                 cdp = uma_zalloc(chunk_zone, M_NOWAIT);
472                 if (cdp == NULL)
473                         return (1);
474                 cdp->cd_max_size = size;
475                 cdp->cd_base = fdesc->base;
476                 LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
477                 KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top,
478                     ("dxr: %s %d", __FUNCTION__, __LINE__));
479         }
480
481         cdp->cd_hash = hash;
482         cdp->cd_refcnt = 1;
483         cdp->cd_cur_size = size;
484         LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
485             cd_hash_le);
486         if (da->rtbl_top >= da->rtbl_size) {
487                 if (da->rtbl_top >= BASE_MAX) {
488                         FIB_PRINTF(LOG_ERR, da->fd,
489                             "structural limit exceeded at %d "
490                             "range table elements", da->rtbl_top);
491                         return (1);
492                 }
493                 da->rtbl_size += RTBL_SIZE_INCR;
494                 i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
495                 FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
496                     da->rtbl_top * 100 / BASE_MAX);
497                 da->range_tbl = realloc(da->range_tbl,
498                     sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
499                     M_DXRAUX, M_NOWAIT);
500                 if (da->range_tbl == NULL)
501                         return (1);
502         }
503
504         return (0);
505 }
506
507 static void
508 chunk_unref(struct dxr_aux *da, uint32_t chunk)
509 {
510         struct direct_entry *fdesc = &da->direct_tbl[chunk];
511         struct chunk_desc *cdp, *cdp2;
512         uint32_t base = fdesc->base;
513         uint32_t size = chunk_size(da, fdesc);
514         uint32_t hash = chunk_hash(da, fdesc);
515         int i;
516
517         /* Find the corresponding descriptor */
518         LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
519             cd_hash_le)
520                 if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
521                     memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
522                     sizeof(struct range_entry_long) * size) == 0)
523                         break;
524
525         KASSERT(cdp != NULL, ("dxr: dangling chunk"));
526         if (--cdp->cd_refcnt > 0)
527                 return;
528
529         LIST_REMOVE(cdp, cd_hash_le);
530         da->unused_chunks_size += cdp->cd_max_size;
531         cdp->cd_cur_size = 0;
532
533         /* Attempt to merge with the preceding chunk, if empty */
534         cdp2 = LIST_NEXT(cdp, cd_all_le);
535         if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
536                 KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base,
537                     ("dxr: %s %d", __FUNCTION__, __LINE__));
538                 LIST_REMOVE(cdp, cd_all_le);
539                 LIST_REMOVE(cdp2, cd_hash_le);
540                 cdp2->cd_max_size += cdp->cd_max_size;
541                 uma_zfree(chunk_zone, cdp);
542                 cdp = cdp2;
543         }
544
545         /* Attempt to merge with the subsequent chunk, if empty */
546         cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
547         if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
548                 KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base,
549                     ("dxr: %s %d", __FUNCTION__, __LINE__));
550                 LIST_REMOVE(cdp, cd_all_le);
551                 LIST_REMOVE(cdp2, cd_hash_le);
552                 cdp2->cd_max_size += cdp->cd_max_size;
553                 cdp2->cd_base = cdp->cd_base;
554                 uma_zfree(chunk_zone, cdp);
555                 cdp = cdp2;
556         }
557
558         if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
559                 /* Free the chunk on the top of the range heap, trim the heap */
560                 KASSERT(cdp == LIST_FIRST(&da->all_chunks),
561                     ("dxr: %s %d", __FUNCTION__, __LINE__));
562                 da->rtbl_top -= cdp->cd_max_size;
563                 da->unused_chunks_size -= cdp->cd_max_size;
564                 LIST_REMOVE(cdp, cd_all_le);
565                 uma_zfree(chunk_zone, cdp);
566                 return;
567         }
568
569         i = cdp->cd_max_size;
570         if (i >= UNUSED_BUCKETS)
571                 i = 0;
572         LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
573 }
574
575 #ifdef DXR2
576 static uint32_t
577 trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
578 {
579         uint32_t i, *val;
580         uint32_t hash = 0;
581
582         for (i = 0; i < (1 << dxr_x); i++) {
583                 hash = (hash << 3) ^ (hash >> 3);
584                 val = (uint32_t *)
585                     (void *) &da->direct_tbl[(index << dxr_x) + i];
586                 hash += (*val << 5);
587                 hash += (*val >> 5);
588         }
589
590         return (hash + (hash >> 16));
591 }
592
593 static int
594 trie_ref(struct dxr_aux *da, uint32_t index)
595 {
596         struct trie_desc *tp;
597         uint32_t dxr_d = da->d_bits;
598         uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
599         uint32_t hash = trie_hash(da, dxr_x, index);
600
601         /* Find an existing descriptor */
602         LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
603                 if (tp->td_hash == hash &&
604                     memcmp(&da->direct_tbl[index << dxr_x],
605                     &da->x_tbl[tp->td_index << dxr_x],
606                     sizeof(*da->x_tbl) << dxr_x) == 0) {
607                         tp->td_refcnt++;
608                         da->trietbl[index] = tp;
609                         return(tp->td_index);
610                 }
611
612         tp = LIST_FIRST(&da->unused_trie);
613         if (tp != NULL) {
614                 LIST_REMOVE(tp, td_hash_le);
615                 da->unused_trie_cnt--;
616         } else {
617                 tp = uma_zalloc(trie_zone, M_NOWAIT);
618                 if (tp == NULL)
619                         return (-1);
620                 LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
621                 tp->td_index = da->all_trie_cnt++;
622         }
623
624         tp->td_hash = hash;
625         tp->td_refcnt = 1;
626         LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
627            td_hash_le);
628         memcpy(&da->x_tbl[tp->td_index << dxr_x],
629             &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
630         da->trietbl[index] = tp;
631         if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
632                 da->xtbl_size += XTBL_SIZE_INCR;
633                 da->x_tbl = realloc(da->x_tbl,
634                     sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
635                 if (da->x_tbl == NULL)
636                         return (-1);
637         }
638         return(tp->td_index);
639 }
640
641 static void
642 trie_unref(struct dxr_aux *da, uint32_t index)
643 {
644         struct trie_desc *tp = da->trietbl[index];
645
646         if (tp == NULL)
647                 return;
648         da->trietbl[index] = NULL;
649         if (--tp->td_refcnt > 0)
650                 return;
651
652         LIST_REMOVE(tp, td_hash_le);
653         da->unused_trie_cnt++;
654         if (tp->td_index != da->all_trie_cnt - 1) {
655                 LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
656                 return;
657         }
658
659         do {
660                 da->all_trie_cnt--;
661                 da->unused_trie_cnt--;
662                 LIST_REMOVE(tp, td_all_le);
663                 uma_zfree(trie_zone, tp);
664                 LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
665                         if (tp->td_index == da->all_trie_cnt - 1) {
666                                 LIST_REMOVE(tp, td_hash_le);
667                                 break;
668                         }
669         } while (tp != NULL);
670 }
671 #endif
672
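/*
 * During a chunk rebuild, da->heap[] holds the chain of prefixes covering
 * the address currently being walked, ordered by increasing prefix length
 * with the most specific entry at da->heap[da->heap_index].  heap_inject()
 * inserts a new prefix while preserving that order (a prefix of an already
 * present length is ignored), and dxr_walk() / update_chunk() pop entries
 * once the walk moves past their end address.
 */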
673 static void
674 heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
675     uint32_t nh)
676 {
677         struct heap_entry *fhp;
678         int i;
679
680         for (i = da->heap_index; i >= 0; i--) {
681                 if (preflen > da->heap[i].preflen)
682                         break;
683                 else if (preflen < da->heap[i].preflen)
684                         da->heap[i + 1] = da->heap[i];
685                 else
686                         return;
687         }
688
689         fhp = &da->heap[i + 1];
690         fhp->preflen = preflen;
691         fhp->start = start;
692         fhp->end = end;
693         fhp->nexthop = nh;
694         da->heap_index++;
695 }
696
697 static int
698 dxr_walk(struct rtentry *rt, void *arg)
699 {
700         struct dxr_aux *da = arg;
701         uint32_t chunk = da->work_chunk;
702         uint32_t first = chunk << DXR_RANGE_SHIFT;
703         uint32_t last = first | DXR_RANGE_MASK;
704         struct range_entry_long *fp =
705             &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
706         struct heap_entry *fhp = &da->heap[da->heap_index];
707         uint32_t preflen, nh, start, end, scopeid;
708         struct in_addr addr;
709
710         rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
711         start = ntohl(addr.s_addr);
712         if (start > last)
713                 return (-1);    /* Beyond chunk boundaries, we are done */
714         if (start < first)
715                 return (0);     /* Skip this route */
716
717         end = start;
718         if (preflen < 32)
719                 end |= (0xffffffffU >> preflen);
720         nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));
721
722         if (start == fhp->start)
723                 heap_inject(da, start, end, preflen, nh);
724         else {
725                 /* start > fhp->start */
726                 while (start > fhp->end) {
727                         uint32_t oend = fhp->end;
728
729                         if (da->heap_index > 0) {
730                                 fhp--;
731                                 da->heap_index--;
732                         } else
733                                 initheap(da, fhp->end + 1, chunk);
734                         if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
735                                 fp++;
736                                 da->rtbl_work_frags++;
737                                 fp->start = (oend + 1) & DXR_RANGE_MASK;
738                                 fp->nexthop = fhp->nexthop;
739                         }
740                 }
741                 if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
742                     nh != fp->nexthop) {
743                         fp++;
744                         da->rtbl_work_frags++;
745                         fp->start = start & DXR_RANGE_MASK;
746                 } else if (da->rtbl_work_frags) {
747                         if ((--fp)->nexthop == nh)
748                                 da->rtbl_work_frags--;
749                         else
750                                 fp++;
751                 }
752                 fp->nexthop = nh;
753                 heap_inject(da, start, end, preflen, nh);
754         }
755
756         return (0);
757 }
758
759 static int
760 update_chunk(struct dxr_aux *da, uint32_t chunk)
761 {
762         struct range_entry_long *fp;
763 #if DXR_TRIE_BITS < 24
764         struct range_entry_short *fps;
765         uint32_t start, nh, i;
766 #endif
767         struct heap_entry *fhp;
768         uint32_t first = chunk << DXR_RANGE_SHIFT;
769         uint32_t last = first | DXR_RANGE_MASK;
770
771         if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
772                 chunk_unref(da, chunk);
773
774         initheap(da, first, chunk);
775
776         fp = &da->range_tbl[da->rtbl_top].re;
777         da->rtbl_work_frags = 0;
778         fp->start = first & DXR_RANGE_MASK;
779         fp->nexthop = da->heap[0].nexthop;
780
781         da->dst.sin_addr.s_addr = htonl(first);
782         da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);
783
784         da->work_chunk = chunk;
785         rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
786             (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
787             dxr_walk, da);
788
789         /* Flush any remaining objects on the heap */
790         fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
791         fhp = &da->heap[da->heap_index];
792         while (fhp->preflen > DXR_TRIE_BITS) {
793                 uint32_t oend = fhp->end;
794
795                 if (da->heap_index > 0) {
796                         fhp--;
797                         da->heap_index--;
798                 } else
799                         initheap(da, fhp->end + 1, chunk);
800                 if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
801                         /* Have we crossed the upper chunk boundary? */
802                         if (oend >= last)
803                                 break;
804                         fp++;
805                         da->rtbl_work_frags++;
806                         fp->start = (oend + 1) & DXR_RANGE_MASK;
807                         fp->nexthop = fhp->nexthop;
808                 }
809         }
810
811         /* Direct hit if the chunk contains only a single fragment */
812         if (da->rtbl_work_frags == 0) {
813                 da->direct_tbl[chunk].base = fp->nexthop;
814                 da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
815                 return (0);
816         }
817
818         da->direct_tbl[chunk].base = da->rtbl_top;
819         da->direct_tbl[chunk].fragments = da->rtbl_work_frags;
820
821 #if DXR_TRIE_BITS < 24
822         /* Check whether the chunk can be more compactly encoded */
823         fp = &da->range_tbl[da->rtbl_top].re;
824         for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
825                 if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
826                         break;
827         if (i == da->rtbl_work_frags + 1) {
828                 fp = &da->range_tbl[da->rtbl_top].re;
829                 fps = (void *) fp;
830                 for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
831                         start = fp->start;
832                         nh = fp->nexthop;
833                         fps->start = start >> 8;
834                         fps->nexthop = nh;
835                 }
836                 fps->start = start >> 8;
837                 fps->nexthop = nh;
838                 da->rtbl_work_frags >>= 1;
839                 da->direct_tbl[chunk].fragments =
840                     da->rtbl_work_frags | FRAGS_PREF_SHORT;
841         } else
842 #endif
843         if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
844                 da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
845                 memmove(&da->range_tbl[da->rtbl_top + 1],
846                    &da->range_tbl[da->rtbl_top],
847                    (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
848                 da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
849                 da->rtbl_work_frags++;
850         }
851         da->rtbl_top += (da->rtbl_work_frags + 1);
852         return (chunk_ref(da, chunk));
853 }
854
855 static void
856 dxr_build(struct dxr *dxr)
857 {
858         struct dxr_aux *da = dxr->aux;
859         struct chunk_desc *cdp;
860         struct rib_rtable_info rinfo;
861         struct timeval t0, t1, t2, t3;
862         uint32_t r_size, dxr_tot_size;
863         uint32_t i, m, range_rebuild = 0;
864         uint32_t range_frag;
865 #ifdef DXR2
866         struct trie_desc *tp;
867         uint32_t d_tbl_size, dxr_x, d_size, x_size;
868         uint32_t ti, trie_rebuild = 0, prev_size = 0;
869         uint32_t trie_frag;
870 #endif
871
872         KASSERT(dxr->d == NULL, ("dxr: d not free"));
873
874         if (da == NULL) {
875                 da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
876                 if (da == NULL)
877                         return;
878                 dxr->aux = da;
879                 da->fibnum = dxr->fibnum;
880                 da->refcnt = 1;
881                 LIST_INIT(&da->all_chunks);
882                 LIST_INIT(&da->all_trie);
883                 da->rtbl_size = RTBL_SIZE_INCR;
884                 da->range_tbl = NULL;
885                 da->xtbl_size = XTBL_SIZE_INCR;
886                 da->x_tbl = NULL;
887                 bzero(&da->dst, sizeof(da->dst));
888                 bzero(&da->mask, sizeof(da->mask));
889                 da->dst.sin_len = sizeof(da->dst);
890                 da->mask.sin_len = sizeof(da->mask);
891                 da->dst.sin_family = AF_INET;
892                 da->mask.sin_family = AF_INET;
893         }
894         if (da->range_tbl == NULL) {
895                 da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
896                     + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
897                 if (da->range_tbl == NULL)
898                         return;
899                 range_rebuild = 1;
900         }
901 #ifdef DXR2
902         if (da->x_tbl == NULL) {
903                 da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
904                     M_DXRAUX, M_NOWAIT);
905                 if (da->x_tbl == NULL)
906                         return;
907                 trie_rebuild = 1;
908         }
909 #endif
910         da->fd = dxr->fd;
911
912         microuptime(&t0);
913
914         dxr->nh_tbl = fib_get_nhop_array(da->fd);
915         fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
916
917         if (da->updates_low > da->updates_high)
918                 range_rebuild = 1;
919
920 range_build:
921         if (range_rebuild) {
922                 /* Bulk cleanup */
923                 bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
924                 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
925                         LIST_REMOVE(cdp, cd_all_le);
926                         uma_zfree(chunk_zone, cdp);
927                 }
928                 for (i = 0; i < UNUSED_BUCKETS; i++)
929                         LIST_INIT(&da->unused_chunks[i]);
930                 da->unused_chunks_size = 0;
931                 da->rtbl_top = 0;
932                 da->updates_low = 0;
933                 da->updates_high = DIRECT_TBL_SIZE - 1;
934                 memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
935                 for (i = 0; i < DIRECT_TBL_SIZE; i++) {
936                         da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
937                         da->direct_tbl[i].base = 0;
938                 }
939         }
940         da->prefixes = rinfo.num_prefixes;
941
942         /* DXR: construct direct & range table */
943         for (i = da->updates_low; i <= da->updates_high; i++) {
944                 m = da->updates_mask[i >> 5] >> (i & 0x1f);
945                 if (m == 0)
946                         i |= 0x1f;
947                 else if (m & 1 && update_chunk(da, i) != 0)
948                         return;
949         }
950
951         range_frag = 0;
952         if (da->rtbl_top)
953                 range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
954         if (range_frag > V_frag_limit) {
955                 range_rebuild = 1;
956                 goto range_build;
957         }
958
959         r_size = sizeof(*da->range_tbl) * da->rtbl_top;
960         microuptime(&t1);
961
962 #ifdef DXR2
963         if (range_rebuild ||
964             abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
965                 trie_rebuild = 1;
966
967 trie_build:
968         if (trie_rebuild) {
969                 da->trie_rebuilt_prefixes = da->prefixes;
970                 da->d_bits = DXR_D;
971                 da->updates_low = 0;
972                 da->updates_high = DIRECT_TBL_SIZE - 1;
973                 if (!range_rebuild)
974                         memset(da->updates_mask, 0xff,
975                             sizeof(da->updates_mask));
976         }
977
978 dxr2_try_squeeze:
979         if (trie_rebuild) {
980                 /* Bulk cleanup */
981                 bzero(da->trietbl, sizeof(da->trietbl));
982                 bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
983                 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
984                         LIST_REMOVE(tp, td_all_le);
985                         uma_zfree(trie_zone, tp);
986                 }
987                 LIST_INIT(&da->unused_trie);
988                 da->all_trie_cnt = da->unused_trie_cnt = 0;
989         }
990
991         /* Populate d_tbl, x_tbl */
992         dxr_x = DXR_TRIE_BITS - da->d_bits;
993         d_tbl_size = (1 << da->d_bits);
994
995         for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
996             i++) {
997                 if (!trie_rebuild) {
998                         m = 0;
999                         for (int j = 0; j < (1 << dxr_x); j += 32)
1000                                 m |= da->updates_mask[((i << dxr_x) + j) >> 5];
1001                         if (m == 0)
1002                                 continue;
1003                         trie_unref(da, i);
1004                 }
1005                 ti = trie_ref(da, i);
1006                 if ((int) ti < 0)
1007                         return;
1008                 da->d_tbl[i] = ti;
1009         }
1010
1011         trie_frag = 0;
1012         if (da->all_trie_cnt)
1013                 trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
1014         if (trie_frag > V_frag_limit) {
1015                 trie_rebuild = 1;
1016                 goto trie_build;
1017         }
1018
1019         d_size = sizeof(*da->d_tbl) * d_tbl_size;
1020         x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
1021             * da->all_trie_cnt;
1022         dxr_tot_size = d_size + x_size + r_size;
1023
1024         if (trie_rebuild == 1) {
1025                 /* Try to find a more compact D/X split */
1026                 if (prev_size == 0 || dxr_tot_size <= prev_size)
1027                         da->d_bits--;
1028                 else {
1029                         da->d_bits++;
1030                         trie_rebuild = 2;
1031                 }
1032                 prev_size = dxr_tot_size;
1033                 goto dxr2_try_squeeze;
1034         }
1035         microuptime(&t2);
1036 #else /* !DXR2 */
1037         dxr_tot_size = sizeof(da->direct_tbl) + r_size;
1038         t2 = t1;
1039 #endif
1040
1041         dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
1042         if (dxr->d == NULL)
1043                 return;
1044 #ifdef DXR2
1045         memcpy(dxr->d, da->d_tbl, d_size);
1046         dxr->x = ((char *) dxr->d) + d_size;
1047         memcpy(dxr->x, da->x_tbl, x_size);
1048         dxr->r = ((char *) dxr->x) + x_size;
1049         dxr->d_shift = 32 - da->d_bits;
1050         dxr->x_shift = dxr_x;
1051         dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
1052 #else /* !DXR2 */
1053         memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
1054         dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
1055 #endif
1056         memcpy(dxr->r, da->range_tbl, r_size);
1057
1058         if (da->updates_low <= da->updates_high)
1059                 bzero(&da->updates_mask[da->updates_low / 32],
1060                     (da->updates_high - da->updates_low) / 8 + 1);
1061         da->updates_low = DIRECT_TBL_SIZE - 1;
1062         da->updates_high = 0;
1063         microuptime(&t3);
1064
1065 #ifdef DXR2
1066         FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
1067             da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
1068 #else
1069         FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
1070             DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
1071 #endif
1072         i = dxr_tot_size * 100;
1073         if (rinfo.num_prefixes)
1074                 i /= rinfo.num_prefixes;
1075         FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
1076             dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
1077             i / 100, i % 100);
1078 #ifdef DXR2
1079         FIB_PRINTF(LOG_INFO, da->fd,
1080             "%d.%02d%% trie, %d.%02d%% range fragmentation",
1081             trie_frag / 100, trie_frag % 100,
1082             range_frag / 100, range_frag % 100);
1083 #else
1084         FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d%% range fragmentation",
1085             range_frag / 100, range_frag % 100);
1086 #endif
1087         i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
1088         FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
1089             range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1090 #ifdef DXR2
1091         i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
1092         FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
1093             trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
1094 #endif
1095         i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
1096         FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
1097             i / 1000, i % 1000);
1098 }
1099
1100 /*
1101  * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure.
1102  */
1103
1104 static struct nhop_object *
1105 dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
1106     uint32_t scopeid)
1107 {
1108         struct dxr *dxr = algo_data;
1109
1110         return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
1111 }
1112
1113 static enum flm_op_result
1114 dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
1115 {
1116         struct dxr *old_dxr = old_data;
1117         struct dxr_aux *da = NULL;
1118         struct dxr *dxr;
1119
1120         dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
1121         if (dxr == NULL)
1122                 return (FLM_REBUILD);
1123
1124         /* Check whether we may reuse the old auxiliary structures */
1125         if (old_dxr != NULL && old_dxr->aux != NULL) {
1126                 da = old_dxr->aux;
1127                 atomic_add_int(&da->refcnt, 1);
1128         }
1129
1130         dxr->aux = da;
1131         dxr->d = NULL;
1132         dxr->fd = fd;
1133         dxr->fibnum = fibnum;
1134         *data = dxr;
1135
1136         return (FLM_SUCCESS);
1137 }
1138
1139 static void
1140 dxr_destroy(void *data)
1141 {
1142         struct dxr *dxr = data;
1143         struct dxr_aux *da;
1144         struct chunk_desc *cdp;
1145         struct trie_desc *tp;
1146
1147         if (dxr->d != NULL)
1148                 free(dxr->d, M_DXRLPM);
1149
1150         da = dxr->aux;
1151         free(dxr, M_DXRAUX);
1152
1153         if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
1154                 return;
1155
1156         /* Release auxiliary structures */
1157         while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
1158                 LIST_REMOVE(cdp, cd_all_le);
1159                 uma_zfree(chunk_zone, cdp);
1160         }
1161         while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
1162                 LIST_REMOVE(tp, td_all_le);
1163                 uma_zfree(trie_zone, tp);
1164         }
1165         free(da->range_tbl, M_DXRAUX);
1166         free(da->x_tbl, M_DXRAUX);
1167         free(da, M_DXRAUX);
1168 }
1169
1170 static void 
1171 epoch_dxr_destroy(epoch_context_t ctx)
1172 {
1173         struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);
1174
1175         dxr_destroy(dxr);
1176 }
1177
1178 static void *
1179 choose_lookup_fn(struct dxr_aux *da)
1180 {
1181
1182 #ifdef DXR2
1183         switch (da->d_bits) {
1184 #if DXR_TRIE_BITS > 16
1185         case 16:
1186                 return (dxr_fib_lookup_16);
1187 #endif
1188         case 15:
1189                 return (dxr_fib_lookup_15);
1190         case 14:
1191                 return (dxr_fib_lookup_14);
1192         case 13:
1193                 return (dxr_fib_lookup_13);
1194         case 12:
1195                 return (dxr_fib_lookup_12);
1196         case 11:
1197                 return (dxr_fib_lookup_11);
1198         case 10:
1199                 return (dxr_fib_lookup_10);
1200         case 9:
1201                 return (dxr_fib_lookup_9);
1202         }
1203 #endif /* DXR2 */
1204         return (dxr_fib_lookup);
1205 }
1206
1207 static enum flm_op_result
1208 dxr_dump_end(void *data, struct fib_dp *dp)
1209 {
1210         struct dxr *dxr = data;
1211         struct dxr_aux *da;
1212
1213         dxr_build(dxr);
1214
1215         da = dxr->aux;
1216         if (da == NULL)
1217                 return (FLM_REBUILD);
1218
1219         /* Structural limit exceeded, hard error */
1220         if (da->rtbl_top >= BASE_MAX)
1221                 return (FLM_ERROR);
1222
1223         /* A malloc(,, M_NOWAIT) failed somewhere, retry later */
1224         if (dxr->d == NULL)
1225                 return (FLM_REBUILD);
1226
1227         dp->f = choose_lookup_fn(da);
1228         dp->arg = dxr;
1229
1230         return (FLM_SUCCESS);
1231 }
1232
1233 static enum flm_op_result
1234 dxr_dump_rib_item(struct rtentry *rt, void *data)
1235 {
1236         
1237         return (FLM_SUCCESS);
1238 }
1239
1240 static enum flm_op_result
1241 dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
1242     void *data)
1243 {
1244
1245         return (FLM_BATCH);
1246 }
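
/*
 * Returning FLM_BATCH asks the fib_algo framework to accumulate individual
 * RIB changes into a fib_change_queue and hand them to
 * dxr_change_rib_batch() below, where only the affected chunks are flagged
 * in updates_mask for an incremental rebuild.
 */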
1247
1248 static enum flm_op_result
1249 dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
1250     void *data)
1251 {
1252         struct dxr *dxr = data;
1253         struct dxr *new_dxr;
1254         struct dxr_aux *da;
1255         struct fib_dp new_dp;
1256         enum flm_op_result res;
1257         uint32_t ip, plen, hmask, start, end, i, ui;
1258 #ifdef INVARIANTS
1259         struct rib_rtable_info rinfo;
1260         int update_delta = 0;
1261 #endif
1262
1263         KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__));
1264         KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__));
1265         KASSERT(q->count < q->size, ("%s: q->count %d q->size %d",
1266             __FUNCTION__, q->count, q->size));
1267
1268         da = dxr->aux;
1269         KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__));
1270
1271         FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
1272         for (ui = 0; ui < q->count; ui++) {
1273 #ifdef INVARIANTS
1274                 if (q->entries[ui].nh_new != NULL)
1275                         update_delta++;
1276                 if (q->entries[ui].nh_old != NULL)
1277                         update_delta--;
1278 #endif
1279                 plen = q->entries[ui].plen;
1280                 ip = ntohl(q->entries[ui].addr4.s_addr);
1281                 if (plen < 32)
1282                         hmask = 0xffffffffU >> plen;
1283                 else
1284                         hmask = 0;
1285                 start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
1286                 end = (ip | hmask) >> DXR_RANGE_SHIFT;
1287
1288                 if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
1289                         for (i = start >> 5; i <= end >> 5; i++)
1290                                 da->updates_mask[i] = 0xffffffffU;
1291                 else
1292                         for (i = start; i <= end; i++)
1293                                 da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
1294                 if (start < da->updates_low)
1295                         da->updates_low = start;
1296                 if (end > da->updates_high)
1297                         da->updates_high = end;
1298         }
1299
1300 #ifdef INVARIANTS
1301         fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
1302         KASSERT(da->prefixes + update_delta == rinfo.num_prefixes,
1303             ("%s: update count mismatch", __FUNCTION__));
1304 #endif
1305
1306         res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
1307         if (res != FLM_SUCCESS)
1308                 return (res);
1309
1310         dxr_build(new_dxr);
1311
1312         /* Structural limit exceeded, hard error */
1313         if (da->rtbl_top >= BASE_MAX) {
1314                 dxr_destroy(new_dxr);
1315                 return (FLM_ERROR);
1316         }
1317
1318         /* A malloc(,, M_NOWAIT) failed somewhere, retry later */
1319         if (new_dxr->d == NULL) {
1320                 dxr_destroy(new_dxr);
1321                 return (FLM_REBUILD);
1322         }
1323
1324         new_dp.f = choose_lookup_fn(da);
1325         new_dp.arg = new_dxr;
1326         if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
1327                 fib_set_algo_ptr(dxr->fd, new_dxr);
1328                 fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
1329                 return (FLM_SUCCESS);
1330         }
1331
1332         dxr_destroy(new_dxr);
1333         return (FLM_REBUILD);
1334 }
1335
1336 static uint8_t
1337 dxr_get_pref(const struct rib_rtable_info *rinfo)
1338 {
1339
1340         /* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
1341         return (251);
1342 }
1343
1344 SYSCTL_DECL(_net_route_algo);
1345 SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1346     "DXR tunables");
1347
1348 static int
1349 sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
1350 {
1351         char buf[8];
1352         int error, new, i;
1353
1354         snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
1355             V_frag_limit % 100);
1356         error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
1357         if (error != 0 || req->newptr == NULL)
1358                 return (error);
1359         if (!isdigit(*buf) && *buf != '.')
1360                 return (EINVAL);
1361         for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
1362                 new = new * 10 + buf[i] - '0';
1363         new *= 100;
1364         if (buf[i++] == '.') {
1365                 if (!isdigit(buf[i]))
1366                         return (EINVAL);
1367                 new += (buf[i++] - '0') * 10;
1368                 if (isdigit(buf[i]))
1369                         new += buf[i++] - '0';
1370         }
1371         if (new > 1000)
1372                 return (EINVAL);
1373         V_frag_limit = new;
1374         snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
1375             V_frag_limit % 100);
1376         return (0);
1377 }
1378
1379 SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
1380     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
1381     0, 0, sysctl_dxr_frag_limit, "A",
1382     "Fragmentation threshold to full rebuild");
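
/*
 * Example usage (hypothetical value): a 2.50% rebuild threshold can be set
 * at runtime with
 *
 *   sysctl net.route.algo.dxr.frag_limit=2.50
 *
 * which sysctl_dxr_frag_limit() above parses into V_frag_limit == 250.
 */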
1383
1384 static struct fib_lookup_module fib_dxr_mod = {
1385         .flm_name = "dxr",
1386         .flm_family = AF_INET,
1387         .flm_init_cb = dxr_init,
1388         .flm_destroy_cb = dxr_destroy,
1389         .flm_dump_rib_item_cb = dxr_dump_rib_item,
1390         .flm_dump_end_cb = dxr_dump_end,
1391         .flm_change_rib_item_cb = dxr_change_rib_item,
1392         .flm_change_rib_items_cb = dxr_change_rib_batch,
1393         .flm_get_pref = dxr_get_pref,
1394 };
1395
1396 static int
1397 dxr_modevent(module_t mod, int type, void *unused)
1398 {
1399         int error;
1400
1401         switch (type) {
1402         case MOD_LOAD:
1403                 chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
1404                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1405                 trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
1406                     NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1407                 fib_module_register(&fib_dxr_mod);
1408                 return(0);
1409         case MOD_UNLOAD:
1410                 error = fib_module_unregister(&fib_dxr_mod);
1411                 if (error)
1412                         return (error);
1413                 uma_zdestroy(chunk_zone);
1414                 uma_zdestroy(trie_zone);
1415                 return(0);
1416         default:
1417                 return(EOPNOTSUPP);
1418         }
1419 }
1420
1421 static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};
1422
1423 DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
1424 MODULE_VERSION(fib_dxr, 1);