/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012-2022 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure.  More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ctype.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

#define DXR_TRIE_BITS           20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

/* DXR2: two-stage primary trie, instead of a single direct lookup table */
#define DXR2

#if DXR_TRIE_BITS > 16
#define DXR_D                   16
#else
#define DXR_D                   (DXR_TRIE_BITS - 1)
#endif

#define D_TBL_SIZE              (1 << DXR_D)
#define DIRECT_TBL_SIZE         (1 << DXR_TRIE_BITS)
#define DXR_RANGE_MASK          (0xffffffffU >> DXR_TRIE_BITS)
#define DXR_RANGE_SHIFT         (32 - DXR_TRIE_BITS)

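/*
 * Lookup key split, e.g. with DXR_TRIE_BITS == 20 and DXR_D == 16: the
 * upper 20 bits of an IPv4 destination index the two-stage trie, while
 * the remaining DXR_RANGE_SHIFT == 12 bits (DXR_RANGE_MASK == 0xfff)
 * are resolved by a search in the chunk's range table.  For example,
 * 198.18.1.1 == 0xc6120101 yields trie index 0xc6120 (first stage
 * 0xc612, second stage offset 0x0) and range search key 0x101.
 */
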
#define DESC_BASE_BITS          22
#define DESC_FRAGMENTS_BITS     (32 - DESC_BASE_BITS)
#define BASE_MAX                ((1 << DESC_BASE_BITS) - 1)
#define RTBL_SIZE_INCR          (BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define FRAGS_MASK_SHORT        ((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define FRAGS_MASK_SHORT        0
#endif
#define FRAGS_PREF_SHORT        (((1 << DESC_FRAGMENTS_BITS) - 1) & \
                                 ~FRAGS_MASK_SHORT)
#define FRAGS_MARK_XL           (FRAGS_PREF_SHORT - 1)
#define FRAGS_MARK_HIT          (FRAGS_PREF_SHORT - 2)

#define IS_SHORT_FORMAT(x)      ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define IS_LONG_FORMAT(x)       ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define IS_XL_FORMAT(x)         (x == FRAGS_MARK_XL)

#define RE_SHORT_MAX_NH         ((1 << (DXR_TRIE_BITS - 8)) - 1)

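/*
 * A struct direct_entry descriptor is interpreted through its
 * "fragments" field: FRAGS_MARK_HIT means "base" holds the nexthop
 * index itself (single-nexthop chunk, no range search); a value with
 * all FRAGS_PREF_SHORT bits set selects the short (16-bit) fragment
 * format, with the chunk size encoded in the low bits; FRAGS_MARK_XL
 * means the chunk size is too large for this field and is kept in the
 * first 32-bit word of the chunk; any other value encodes the size of
 * a chunk in the long (32-bit) fragment format.
 */
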
#define CHUNK_HASH_BITS         16
#define CHUNK_HASH_SIZE         (1 << CHUNK_HASH_BITS)
#define CHUNK_HASH_MASK         (CHUNK_HASH_SIZE - 1)

#define TRIE_HASH_BITS          16
#define TRIE_HASH_SIZE          (1 << TRIE_HASH_BITS)
#define TRIE_HASH_MASK          (TRIE_HASH_SIZE - 1)

#define XTBL_SIZE_INCR          (DIRECT_TBL_SIZE / 16)

#define UNUSED_BUCKETS          8

/* Lookup structure elements */

struct direct_entry {
        uint32_t                fragments: DESC_FRAGMENTS_BITS,
                                base: DESC_BASE_BITS;
};

struct range_entry_long {
        uint32_t                start: DXR_RANGE_SHIFT,
                                nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
        uint16_t                start: DXR_RANGE_SHIFT - 8,
                                nexthop: DXR_TRIE_BITS - 8;
};
#endif

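/*
 * Range table entry layouts, e.g. for DXR_TRIE_BITS == 20: the long
 * format packs a 12-bit range start offset and a 20-bit nexthop index
 * into 32 bits, while the short format shrinks this to a 4-bit start
 * (only offsets that are multiples of 256 are representable) and a
 * 12-bit nexthop, so two short entries fit into one long slot.
 * update_chunk() uses the short encoding only when every fragment in a
 * chunk satisfies both constraints.
 */
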
/* Auxiliary structures */

struct heap_entry {
        uint32_t                start;
        uint32_t                end;
        uint32_t                preflen;
        uint32_t                nexthop;
};

struct chunk_desc {
        LIST_ENTRY(chunk_desc)  cd_all_le;
        LIST_ENTRY(chunk_desc)  cd_hash_le;
        uint32_t                cd_hash;
        uint32_t                cd_refcnt;
        uint32_t                cd_base;
        uint32_t                cd_cur_size;
        uint32_t                cd_max_size;
};

struct trie_desc {
        LIST_ENTRY(trie_desc)   td_all_le;
        LIST_ENTRY(trie_desc)   td_hash_le;
        uint32_t                td_hash;
        uint32_t                td_index;
        uint32_t                td_refcnt;
};

struct dxr_aux {
        /* Glue to external state */
        struct fib_data         *fd;
        uint32_t                fibnum;
        int                     refcnt;

        /* Auxiliary build-time tables */
        struct direct_entry     direct_tbl[DIRECT_TBL_SIZE];
        uint16_t                d_tbl[D_TBL_SIZE];
        struct direct_entry     *x_tbl;
        union {
                struct range_entry_long re;
                uint32_t        fragments;
        }                       *range_tbl;

        /* Auxiliary internal state */
        uint32_t                updates_mask[DIRECT_TBL_SIZE / 32];
        struct trie_desc        *trietbl[D_TBL_SIZE];
        LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE];
        LIST_HEAD(, chunk_desc) all_chunks;
        LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
        LIST_HEAD(, trie_desc)  trie_hashtbl[TRIE_HASH_SIZE];
        LIST_HEAD(, trie_desc)  all_trie;
        LIST_HEAD(, trie_desc)  unused_trie; /* abuses hash link entry */
        struct sockaddr_in      dst;
        struct sockaddr_in      mask;
        struct heap_entry       heap[33];
        uint32_t                prefixes;
        uint32_t                updates_low;
        uint32_t                updates_high;
        uint32_t                unused_chunks_size;
        uint32_t                xtbl_size;
        uint32_t                all_trie_cnt;
        uint32_t                unused_trie_cnt;
        uint32_t                trie_rebuilt_prefixes;
        uint32_t                heap_index;
        uint32_t                d_bits;
        uint32_t                rtbl_size;
        uint32_t                rtbl_top;
        uint32_t                rtbl_work_frags;
        uint32_t                work_chunk;
};

/* Main lookup structure container */

struct dxr {
        /* Lookup tables */
        void                    *d;
        void                    *x;
        void                    *r;
        struct nhop_object      **nh_tbl;

        /* Glue to external state */
        struct dxr_aux          *aux;
        struct fib_data         *fd;
        struct epoch_context    epoch_ctx;
        uint32_t                fibnum;
        uint16_t                d_shift;
        uint16_t                x_shift;
        uint32_t                x_mask;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

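/* Fragmentation threshold for forcing a full rebuild, in units of 0.01% */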
VNET_DEFINE_STATIC(int, frag_limit) = 100;
#define V_frag_limit    VNET(frag_limit)

/* Binary search for a matching address range */
#define DXR_LOOKUP_STAGE                                        \
        if (masked_dst < range[middle].start) {                 \
                upperbound = middle;                            \
                middle = (middle + lowerbound) / 2;             \
        } else if (masked_dst < range[middle + 1].start)        \
                return (range[middle].nexthop);                 \
        else {                                                  \
                lowerbound = middle + 1;                        \
                middle = (upperbound + middle + 1) / 2;         \
        }                                                       \
        if (upperbound == lowerbound)                           \
                return (range[lowerbound].nexthop);

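/*
 * Search the range table chunk described by de for the entry matching
 * dst.  A chunk is an array of range entries sorted by start offset;
 * DXR_LOOKUP_STAGE above is one unrolled step of the binary search over
 * that array.  Short-format chunks are searched as 16-bit entries with
 * the low 8 bits of the key dropped; XL-format chunks keep their entry
 * count in the first 32-bit word of the chunk.
 */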
static int
range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
{
        uint32_t base;
        uint32_t upperbound;
        uint32_t middle;
        uint32_t lowerbound;
        uint32_t masked_dst;

        base = de.base;
        lowerbound = 0;
        masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
        if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
                upperbound = de.fragments & FRAGS_MASK_SHORT;
                struct range_entry_short *range =
                    (struct range_entry_short *) &rt[base];

                masked_dst >>= 8;
                middle = upperbound;
                upperbound = upperbound * 2 + 1;

                for (;;) {
                        DXR_LOOKUP_STAGE
                        DXR_LOOKUP_STAGE
                }
        }
#endif

        upperbound = de.fragments;
        middle = upperbound / 2;
        struct range_entry_long *range = &rt[base];
        if (__predict_false(IS_XL_FORMAT(de.fragments))) {
                upperbound = *((uint32_t *) range);
                range++;
                middle = upperbound / 2;
        }

        for (;;) {
                DXR_LOOKUP_STAGE
                DXR_LOOKUP_STAGE
        }
}

#define DXR_LOOKUP_DEFINE(D)                                            \
        static int inline                                               \
        dxr_lookup_##D(struct dxr *dxr, uint32_t dst)                   \
        {                                                               \
                struct direct_entry de;                                 \
                uint16_t *dt = dxr->d;                                  \
                struct direct_entry *xt = dxr->x;                       \
                                                                        \
                de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
                    + ((dst >> DXR_RANGE_SHIFT) &                       \
                    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))];      \
                if (__predict_true(de.fragments == FRAGS_MARK_HIT))     \
                        return (de.base);                               \
                return (range_lookup(dxr->r, de, dst));                 \
        }                                                               \
                                                                        \
        static struct nhop_object *                                     \
        dxr_fib_lookup_##D(void *algo_data,                             \
            const struct flm_lookup_key key, uint32_t scopeid __unused) \
        {                                                               \
                struct dxr *dxr = algo_data;                            \
                                                                        \
                return (dxr->nh_tbl[dxr_lookup_##D(dxr,                 \
                    ntohl(key.addr4.s_addr))]);                         \
        }

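/*
 * DXR_LOOKUP_DEFINE() emits a lookup function specialized for one
 * particular d_bits split, so that all shifts and masks are
 * compile-time constants; choose_lookup_fn() selects the matching
 * variant after each rebuild.  The generic dxr_lookup() below uses the
 * run-time d_shift / x_shift / x_mask fields instead.
 */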
#ifdef DXR2
#if DXR_TRIE_BITS > 16
DXR_LOOKUP_DEFINE(16)
#endif
DXR_LOOKUP_DEFINE(15)
DXR_LOOKUP_DEFINE(14)
DXR_LOOKUP_DEFINE(13)
DXR_LOOKUP_DEFINE(12)
DXR_LOOKUP_DEFINE(11)
DXR_LOOKUP_DEFINE(10)
DXR_LOOKUP_DEFINE(9)
#endif /* DXR2 */

static int inline
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
        struct direct_entry de;
#ifdef DXR2
        uint16_t *dt = dxr->d;
        struct direct_entry *xt = dxr->x;

        de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
            ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
#else /* !DXR2 */
        struct direct_entry *dt = dxr->d;

        de = dt[dst >> DXR_RANGE_SHIFT];
#endif /* !DXR2 */
        if (__predict_true(de.fragments == FRAGS_MARK_HIT))
                return (de.base);
        return (range_lookup(dxr->r, de, dst));
}

static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
        struct heap_entry *fhp = &da->heap[0];
        struct rtentry *rt;
        struct route_nhop_data rnd;

        da->heap_index = 0;
        da->dst.sin_addr.s_addr = htonl(dst_u32);
        rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
            &rnd);
        if (rt != NULL) {
                struct in_addr addr;
                uint32_t scopeid;

                rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
                fhp->start = ntohl(addr.s_addr);
                fhp->end = fhp->start;
                if (fhp->preflen < 32)
                        fhp->end |= (0xffffffffU >> fhp->preflen);
                fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
        } else {
                fhp->preflen = fhp->nexthop = fhp->start = 0;
                fhp->end = 0xffffffffU;
        }
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

        if (IS_SHORT_FORMAT(fdesc->fragments))
                return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
        else if (IS_XL_FORMAT(fdesc->fragments))
                return (da->range_tbl[fdesc->base].fragments + 2);
        else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
                return (fdesc->fragments + 1);
}

static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
        uint32_t size = chunk_size(da, fdesc);
        uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
        uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
        uint32_t hash = fdesc->fragments;

        for (; p < l; p++)
                hash = (hash << 7) + (hash >> 13) + *p;

        return (hash + (hash >> 16));
}

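/*
 * Register the chunk just assembled at the top of the range table heap.
 * Identical chunks are deduplicated using a hash over their contents:
 * on a match the freshly built copy is dropped and the existing
 * descriptor's reference count is bumped.  Otherwise the chunk is
 * either moved into a suitably sized hole left behind by a previously
 * freed chunk (tracked in the unused_chunks[] buckets), or left in
 * place at the heap top under a new descriptor.
 */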
static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
        struct direct_entry *fdesc = &da->direct_tbl[chunk];
        struct chunk_desc *cdp, *empty_cdp;
        uint32_t base = fdesc->base;
        uint32_t size = chunk_size(da, fdesc);
        uint32_t hash = chunk_hash(da, fdesc);
        int i;

        /* Find an existing descriptor */
        LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
            cd_hash_le) {
                if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
                    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
                    sizeof(struct range_entry_long) * size))
                        continue;
                da->rtbl_top = fdesc->base;
                fdesc->base = cdp->cd_base;
                cdp->cd_refcnt++;
                return (0);
        }

        /* No matching chunks found. Find an empty one to recycle. */
        for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
                cdp = LIST_FIRST(&da->unused_chunks[i]);

        if (cdp == NULL)
                LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
                        if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
                            empty_cdp->cd_max_size < cdp->cd_max_size)) {
                                cdp = empty_cdp;
                                if (empty_cdp->cd_max_size == size)
                                        break;
                        }

        if (cdp != NULL) {
                /* Copy from heap into the recycled chunk */
                bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
                    size * sizeof(struct range_entry_long));
                fdesc->base = cdp->cd_base;
                da->rtbl_top -= size;
                da->unused_chunks_size -= cdp->cd_max_size;
                if (cdp->cd_max_size > size) {
                        /* Split the range in two, need a new descriptor */
                        empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
                        if (empty_cdp == NULL)
                                return (1);
                        LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
                        empty_cdp->cd_base = cdp->cd_base + size;
                        empty_cdp->cd_cur_size = 0;
                        empty_cdp->cd_max_size = cdp->cd_max_size - size;

                        i = empty_cdp->cd_max_size;
                        if (i >= UNUSED_BUCKETS)
                                i = 0;
                        LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
                            cd_hash_le);

                        da->unused_chunks_size += empty_cdp->cd_max_size;
                        cdp->cd_max_size = size;
                }
                LIST_REMOVE(cdp, cd_hash_le);
        } else {
                /* Alloc a new descriptor at the top of the heap*/
                cdp = uma_zalloc(chunk_zone, M_NOWAIT);
                if (cdp == NULL)
                        return (1);
                cdp->cd_max_size = size;
                cdp->cd_base = fdesc->base;
                LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
                KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top,
                    ("dxr: %s %d", __FUNCTION__, __LINE__));
        }

        cdp->cd_hash = hash;
        cdp->cd_refcnt = 1;
        cdp->cd_cur_size = size;
        LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
            cd_hash_le);
        if (da->rtbl_top >= da->rtbl_size) {
                if (da->rtbl_top >= BASE_MAX) {
                        FIB_PRINTF(LOG_ERR, da->fd,
                            "structural limit exceeded at %d "
                            "range table elements", da->rtbl_top);
                        return (1);
                }
                da->rtbl_size += RTBL_SIZE_INCR;
                i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
                FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
                    da->rtbl_top * 100 / BASE_MAX);
                da->range_tbl = realloc(da->range_tbl,
                    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
                    M_DXRAUX, M_NOWAIT);
                if (da->range_tbl == NULL)
                        return (1);
        }

        return (0);
}

static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
        struct direct_entry *fdesc = &da->direct_tbl[chunk];
        struct chunk_desc *cdp, *cdp2;
        uint32_t base = fdesc->base;
        uint32_t size = chunk_size(da, fdesc);
        uint32_t hash = chunk_hash(da, fdesc);
        int i;

        /* Find the corresponding descriptor */
        LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
            cd_hash_le)
                if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
                    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
                    sizeof(struct range_entry_long) * size) == 0)
                        break;

        KASSERT(cdp != NULL, ("dxr: dangling chunk"));
        if (--cdp->cd_refcnt > 0)
                return;

        LIST_REMOVE(cdp, cd_hash_le);
        da->unused_chunks_size += cdp->cd_max_size;
        cdp->cd_cur_size = 0;

        /* Attempt to merge with the preceding chunk, if empty */
        cdp2 = LIST_NEXT(cdp, cd_all_le);
        if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
                KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base,
                    ("dxr: %s %d", __FUNCTION__, __LINE__));
                LIST_REMOVE(cdp, cd_all_le);
                LIST_REMOVE(cdp2, cd_hash_le);
                cdp2->cd_max_size += cdp->cd_max_size;
                uma_zfree(chunk_zone, cdp);
                cdp = cdp2;
        }

        /* Attempt to merge with the subsequent chunk, if empty */
        cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
        if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
                KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base,
                    ("dxr: %s %d", __FUNCTION__, __LINE__));
                LIST_REMOVE(cdp, cd_all_le);
                LIST_REMOVE(cdp2, cd_hash_le);
                cdp2->cd_max_size += cdp->cd_max_size;
                cdp2->cd_base = cdp->cd_base;
                uma_zfree(chunk_zone, cdp);
                cdp = cdp2;
        }

        if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
                /* Free the chunk on the top of the range heap, trim the heap */
                KASSERT(cdp == LIST_FIRST(&da->all_chunks),
                    ("dxr: %s %d", __FUNCTION__, __LINE__));
                da->rtbl_top -= cdp->cd_max_size;
                da->unused_chunks_size -= cdp->cd_max_size;
                LIST_REMOVE(cdp, cd_all_le);
                uma_zfree(chunk_zone, cdp);
                return;
        }

        i = cdp->cd_max_size;
        if (i >= UNUSED_BUCKETS)
                i = 0;
        LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}

#ifdef DXR2
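/*
 * DXR2 second lookup stage: the direct table is cut into blocks of
 * 2^dxr_x entries and identical blocks are deduplicated into x_tbl,
 * again keyed by a hash over their contents.  d_tbl[] then maps the
 * most significant d_bits of an address to the index of its block
 * inside x_tbl.
 */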
static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
        uint32_t i, *val;
        uint32_t hash = 0;

        for (i = 0; i < (1 << dxr_x); i++) {
                hash = (hash << 3) ^ (hash >> 3);
                val = (uint32_t *)
                    (void *) &da->direct_tbl[(index << dxr_x) + i];
                hash += (*val << 5);
                hash += (*val >> 5);
        }

        return (hash + (hash >> 16));
}

static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
        struct trie_desc *tp;
        uint32_t dxr_d = da->d_bits;
        uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
        uint32_t hash = trie_hash(da, dxr_x, index);

        /* Find an existing descriptor */
        LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
                if (tp->td_hash == hash &&
                    memcmp(&da->direct_tbl[index << dxr_x],
                    &da->x_tbl[tp->td_index << dxr_x],
                    sizeof(*da->x_tbl) << dxr_x) == 0) {
                        tp->td_refcnt++;
                        da->trietbl[index] = tp;
                        return(tp->td_index);
                }

        tp = LIST_FIRST(&da->unused_trie);
        if (tp != NULL) {
                LIST_REMOVE(tp, td_hash_le);
                da->unused_trie_cnt--;
        } else {
                tp = uma_zalloc(trie_zone, M_NOWAIT);
                if (tp == NULL)
                        return (-1);
                LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
                tp->td_index = da->all_trie_cnt++;
        }

        tp->td_hash = hash;
        tp->td_refcnt = 1;
        LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
           td_hash_le);
        memcpy(&da->x_tbl[tp->td_index << dxr_x],
            &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
        da->trietbl[index] = tp;
        if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
                da->xtbl_size += XTBL_SIZE_INCR;
                da->x_tbl = realloc(da->x_tbl,
                    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
                if (da->x_tbl == NULL)
                        return (-1);
        }
        return(tp->td_index);
}

static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
        struct trie_desc *tp = da->trietbl[index];

        if (tp == NULL)
                return;
        da->trietbl[index] = NULL;
        if (--tp->td_refcnt > 0)
                return;

        LIST_REMOVE(tp, td_hash_le);
        da->unused_trie_cnt++;
        if (tp->td_index != da->all_trie_cnt - 1) {
                LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
                return;
        }

        do {
                da->all_trie_cnt--;
                da->unused_trie_cnt--;
                LIST_REMOVE(tp, td_all_le);
                uma_zfree(trie_zone, tp);
                LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
                        if (tp->td_index == da->all_trie_cnt - 1) {
                                LIST_REMOVE(tp, td_hash_le);
                                break;
                        }
        } while (tp != NULL);
}
#endif

static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
        struct heap_entry *fhp;
        int i;

        for (i = da->heap_index; i >= 0; i--) {
                if (preflen > da->heap[i].preflen)
                        break;
                else if (preflen < da->heap[i].preflen)
                        da->heap[i + 1] = da->heap[i];
                else
                        return;
        }

        fhp = &da->heap[i + 1];
        fhp->preflen = preflen;
        fhp->start = start;
        fhp->end = end;
        fhp->nexthop = nh;
        da->heap_index++;
}

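/*
 * rib_walk_from() callback, invoked for each prefix falling inside the
 * chunk currently being rebuilt, in ascending address order.  da->heap[]
 * holds the stack of prefixes covering the current address, ordered by
 * prefix length: heap_inject() pushes a more specific prefix onto it,
 * and entries are popped off as the walk moves past their end address.
 * A new range fragment is emitted whenever the effective nexthop
 * changes.
 */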
static int
dxr_walk(struct rtentry *rt, void *arg)
{
        struct dxr_aux *da = arg;
        uint32_t chunk = da->work_chunk;
        uint32_t first = chunk << DXR_RANGE_SHIFT;
        uint32_t last = first | DXR_RANGE_MASK;
        struct range_entry_long *fp =
            &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
        struct heap_entry *fhp = &da->heap[da->heap_index];
        uint32_t preflen, nh, start, end, scopeid;
        struct in_addr addr;

        rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
        start = ntohl(addr.s_addr);
        if (start > last)
                return (-1);    /* Beyond chunk boundaries, we are done */
        if (start < first)
                return (0);     /* Skip this route */

        end = start;
        if (preflen < 32)
                end |= (0xffffffffU >> preflen);
        nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

        if (start == fhp->start)
                heap_inject(da, start, end, preflen, nh);
        else {
                /* start > fhp->start */
                while (start > fhp->end) {
                        uint32_t oend = fhp->end;

                        if (da->heap_index > 0) {
                                fhp--;
                                da->heap_index--;
                        } else
                                initheap(da, fhp->end + 1, chunk);
                        if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
                                fp++;
                                da->rtbl_work_frags++;
                                fp->start = (oend + 1) & DXR_RANGE_MASK;
                                fp->nexthop = fhp->nexthop;
                        }
                }
                if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
                    nh != fp->nexthop) {
                        fp++;
                        da->rtbl_work_frags++;
                        fp->start = start & DXR_RANGE_MASK;
                } else if (da->rtbl_work_frags) {
                        if ((--fp)->nexthop == nh)
                                da->rtbl_work_frags--;
                        else
                                fp++;
                }
                fp->nexthop = nh;
                heap_inject(da, start, end, preflen, nh);
        }

        return (0);
}

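/*
 * Rebuild the range fragments of a single chunk (one direct table
 * slot): drop the reference to the chunk's previous contents, seed the
 * heap with the route covering the chunk's first address, walk all
 * prefixes falling inside the chunk, then pick the most compact
 * encoding: a direct hit for single-fragment chunks, the short format
 * where possible, and the long or XL format otherwise.
 */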
static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
        struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
        struct range_entry_short *fps;
        uint32_t start, nh, i;
#endif
        struct heap_entry *fhp;
        uint32_t first = chunk << DXR_RANGE_SHIFT;
        uint32_t last = first | DXR_RANGE_MASK;

        if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
                chunk_unref(da, chunk);

        initheap(da, first, chunk);

        fp = &da->range_tbl[da->rtbl_top].re;
        da->rtbl_work_frags = 0;
        fp->start = first & DXR_RANGE_MASK;
        fp->nexthop = da->heap[0].nexthop;

        da->dst.sin_addr.s_addr = htonl(first);
        da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

        da->work_chunk = chunk;
        rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
            (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
            dxr_walk, da);

        /* Flush any remaining objects on the heap */
        fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
        fhp = &da->heap[da->heap_index];
        while (fhp->preflen > DXR_TRIE_BITS) {
                uint32_t oend = fhp->end;

                if (da->heap_index > 0) {
                        fhp--;
                        da->heap_index--;
                } else
                        initheap(da, fhp->end + 1, chunk);
                if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
                        /* Have we crossed the upper chunk boundary? */
                        if (oend >= last)
                                break;
                        fp++;
                        da->rtbl_work_frags++;
                        fp->start = (oend + 1) & DXR_RANGE_MASK;
                        fp->nexthop = fhp->nexthop;
                }
        }

        /* Direct hit if the chunk contains only a single fragment */
        if (da->rtbl_work_frags == 0) {
                da->direct_tbl[chunk].base = fp->nexthop;
                da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
                return (0);
        }

        da->direct_tbl[chunk].base = da->rtbl_top;
        da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
        /* Check whether the chunk can be more compactly encoded */
        fp = &da->range_tbl[da->rtbl_top].re;
        for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
                if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
                        break;
        if (i == da->rtbl_work_frags + 1) {
                fp = &da->range_tbl[da->rtbl_top].re;
                fps = (void *) fp;
                for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
                        start = fp->start;
                        nh = fp->nexthop;
                        fps->start = start >> 8;
                        fps->nexthop = nh;
                }
                fps->start = start >> 8;
                fps->nexthop = nh;
                da->rtbl_work_frags >>= 1;
                da->direct_tbl[chunk].fragments =
                    da->rtbl_work_frags | FRAGS_PREF_SHORT;
        } else
#endif
        if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
                da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
                memmove(&da->range_tbl[da->rtbl_top + 1],
                   &da->range_tbl[da->rtbl_top],
                   (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
                da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
                da->rtbl_work_frags++;
        }
        da->rtbl_top += (da->rtbl_work_frags + 1);
        return (chunk_ref(da, chunk));
}

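/*
 * (Re)build the lookup structures.  Normally only the chunks flagged in
 * updates_mask[] between updates_low and updates_high are rebuilt; a
 * full range and/or trie rebuild is forced when the auxiliary tables
 * have just been allocated, when no incremental updates are recorded,
 * when accumulated fragmentation exceeds V_frag_limit, or (for the
 * trie) when the prefix count has changed substantially.  With DXR2 the
 * build also iterates over candidate d_bits splits, trying to find a
 * more compact D/X table combination.
 */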
static void
dxr_build(struct dxr *dxr)
{
        struct dxr_aux *da = dxr->aux;
        struct chunk_desc *cdp;
        struct rib_rtable_info rinfo;
        struct timeval t0, t1, t2, t3;
        uint32_t r_size, dxr_tot_size;
        uint32_t i, m, range_rebuild = 0;
        uint32_t range_frag;
#ifdef DXR2
        struct trie_desc *tp;
        uint32_t d_tbl_size, dxr_x, d_size, x_size;
        int32_t ti;
        uint32_t trie_rebuild = 0, prev_size = 0;
        uint32_t trie_frag;
#endif

        KASSERT(dxr->d == NULL, ("dxr: d not free"));

        if (da == NULL) {
                da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
                if (da == NULL)
                        return;
                dxr->aux = da;
                da->fibnum = dxr->fibnum;
                da->refcnt = 1;
                LIST_INIT(&da->all_chunks);
                LIST_INIT(&da->all_trie);
                da->rtbl_size = RTBL_SIZE_INCR;
                da->range_tbl = NULL;
                da->xtbl_size = XTBL_SIZE_INCR;
                da->x_tbl = NULL;
                bzero(&da->dst, sizeof(da->dst));
                bzero(&da->mask, sizeof(da->mask));
                da->dst.sin_len = sizeof(da->dst);
                da->mask.sin_len = sizeof(da->mask);
                da->dst.sin_family = AF_INET;
                da->mask.sin_family = AF_INET;
        }
        if (da->range_tbl == NULL) {
                da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
                    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
                if (da->range_tbl == NULL)
                        return;
                range_rebuild = 1;
        }
#ifdef DXR2
        if (da->x_tbl == NULL) {
                da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
                    M_DXRAUX, M_NOWAIT);
                if (da->x_tbl == NULL)
                        return;
                trie_rebuild = 1;
        }
#endif
        da->fd = dxr->fd;

        microuptime(&t0);

        dxr->nh_tbl = fib_get_nhop_array(da->fd);
        fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

        if (da->updates_low > da->updates_high)
                range_rebuild = 1;

range_build:
        if (range_rebuild) {
                /* Bulk cleanup */
                bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
                while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
                        LIST_REMOVE(cdp, cd_all_le);
                        uma_zfree(chunk_zone, cdp);
                }
                for (i = 0; i < UNUSED_BUCKETS; i++)
                        LIST_INIT(&da->unused_chunks[i]);
                da->unused_chunks_size = 0;
                da->rtbl_top = 0;
                da->updates_low = 0;
                da->updates_high = DIRECT_TBL_SIZE - 1;
                memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
                for (i = 0; i < DIRECT_TBL_SIZE; i++) {
                        da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
                        da->direct_tbl[i].base = 0;
                }
        }
        da->prefixes = rinfo.num_prefixes;

        /* DXR: construct direct & range table */
        for (i = da->updates_low; i <= da->updates_high; i++) {
                m = da->updates_mask[i >> 5] >> (i & 0x1f);
                if (m == 0)
                        i |= 0x1f;
                else if (m & 1 && update_chunk(da, i) != 0)
                        return;
        }

        range_frag = 0;
        if (da->rtbl_top)
                range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
        if (range_frag > V_frag_limit) {
                range_rebuild = 1;
                goto range_build;
        }

        r_size = sizeof(*da->range_tbl) * da->rtbl_top;
        microuptime(&t1);

#ifdef DXR2
        if (range_rebuild ||
            abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
                trie_rebuild = 1;

trie_build:
        if (trie_rebuild) {
                da->trie_rebuilt_prefixes = da->prefixes;
                da->d_bits = DXR_D;
                da->updates_low = 0;
                da->updates_high = DIRECT_TBL_SIZE - 1;
                if (!range_rebuild)
                        memset(da->updates_mask, 0xff,
                            sizeof(da->updates_mask));
        }

dxr2_try_squeeze:
        if (trie_rebuild) {
                /* Bulk cleanup */
                bzero(da->trietbl, sizeof(da->trietbl));
                bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
                while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
                        LIST_REMOVE(tp, td_all_le);
                        uma_zfree(trie_zone, tp);
                }
                LIST_INIT(&da->unused_trie);
                da->all_trie_cnt = da->unused_trie_cnt = 0;
        }

        /* Populate d_tbl, x_tbl */
        dxr_x = DXR_TRIE_BITS - da->d_bits;
        d_tbl_size = (1 << da->d_bits);

        for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
            i++) {
                if (!trie_rebuild) {
                        m = 0;
                        for (int j = 0; j < (1 << dxr_x); j += 32)
                                m |= da->updates_mask[((i << dxr_x) + j) >> 5];
                        if (m == 0)
                                continue;
                        trie_unref(da, i);
                }
                ti = trie_ref(da, i);
                if (ti < 0)
                        return;
                da->d_tbl[i] = ti;
        }

        trie_frag = 0;
        if (da->all_trie_cnt)
                trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
        if (trie_frag > V_frag_limit) {
                trie_rebuild = 1;
                goto trie_build;
        }

        d_size = sizeof(*da->d_tbl) * d_tbl_size;
        x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
            * da->all_trie_cnt;
        dxr_tot_size = d_size + x_size + r_size;

        if (trie_rebuild == 1) {
                /* Try to find a more compact D/X split */
                if (prev_size == 0 || dxr_tot_size <= prev_size)
                        da->d_bits--;
                else {
                        da->d_bits++;
                        trie_rebuild = 2;
                }
                prev_size = dxr_tot_size;
                goto dxr2_try_squeeze;
        }
        microuptime(&t2);
#else /* !DXR2 */
        dxr_tot_size = sizeof(da->direct_tbl) + r_size;
        t2 = t1;
#endif

        dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
        if (dxr->d == NULL)
                return;
#ifdef DXR2
        memcpy(dxr->d, da->d_tbl, d_size);
        dxr->x = ((char *) dxr->d) + d_size;
        memcpy(dxr->x, da->x_tbl, x_size);
        dxr->r = ((char *) dxr->x) + x_size;
        dxr->d_shift = 32 - da->d_bits;
        dxr->x_shift = dxr_x;
        dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
#else /* !DXR2 */
        memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
        dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
#endif
        memcpy(dxr->r, da->range_tbl, r_size);

        if (da->updates_low <= da->updates_high)
                bzero(&da->updates_mask[da->updates_low / 32],
                    (da->updates_high - da->updates_low) / 8 + 1);
        da->updates_low = DIRECT_TBL_SIZE - 1;
        da->updates_high = 0;
        microuptime(&t3);

#ifdef DXR2
        FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
            da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
#else
        FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
            DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
#endif
        i = dxr_tot_size * 100;
        if (rinfo.num_prefixes)
                i /= rinfo.num_prefixes;
        FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
            dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
            i / 100, i % 100);
#ifdef DXR2
        FIB_PRINTF(LOG_INFO, da->fd,
            "%d.%02d%% trie, %d.%02d%% range fragmentation",
            trie_frag / 100, trie_frag % 100,
            range_frag / 100, range_frag % 100);
#else
        FIB_PRINTF(LOG_INFO, da->fd, "%d.%01d%% range fragmentation",
            range_frag / 100, range_frag % 100);
#endif
        i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
        FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
            range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#ifdef DXR2
        i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
        FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
            trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#endif
        i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
        FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
            i / 1000, i % 1000);
}

/*
 * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
        struct dxr *dxr = algo_data;

        return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
        struct dxr *old_dxr = old_data;
        struct dxr_aux *da = NULL;
        struct dxr *dxr;

        dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
        if (dxr == NULL)
                return (FLM_REBUILD);

        /* Check whether we may reuse the old auxiliary structures */
        if (old_dxr != NULL && old_dxr->aux != NULL) {
                da = old_dxr->aux;
                atomic_add_int(&da->refcnt, 1);
        }

        dxr->aux = da;
        dxr->d = NULL;
        dxr->fd = fd;
        dxr->fibnum = fibnum;
        *data = dxr;

        return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
        struct dxr *dxr = data;
        struct dxr_aux *da;
        struct chunk_desc *cdp;
        struct trie_desc *tp;

        if (dxr->d != NULL)
                free(dxr->d, M_DXRLPM);

        da = dxr->aux;
        free(dxr, M_DXRAUX);

        if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
                return;

        /* Release auxiliary structures */
        while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
                LIST_REMOVE(cdp, cd_all_le);
                uma_zfree(chunk_zone, cdp);
        }
        while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
                LIST_REMOVE(tp, td_all_le);
                uma_zfree(trie_zone, tp);
        }
        free(da->range_tbl, M_DXRAUX);
        free(da->x_tbl, M_DXRAUX);
        free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
        struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

        dxr_destroy(dxr);
}

static void *
choose_lookup_fn(struct dxr_aux *da)
{

#ifdef DXR2
        switch (da->d_bits) {
#if DXR_TRIE_BITS > 16
        case 16:
                return (dxr_fib_lookup_16);
#endif
        case 15:
                return (dxr_fib_lookup_15);
        case 14:
                return (dxr_fib_lookup_14);
        case 13:
                return (dxr_fib_lookup_13);
        case 12:
                return (dxr_fib_lookup_12);
        case 11:
                return (dxr_fib_lookup_11);
        case 10:
                return (dxr_fib_lookup_10);
        case 9:
                return (dxr_fib_lookup_9);
        }
#endif /* DXR2 */
        return (dxr_fib_lookup);
}

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
        struct dxr *dxr = data;
        struct dxr_aux *da;

        dxr_build(dxr);

        da = dxr->aux;
        if (da == NULL)
                return (FLM_REBUILD);

        /* Structural limit exceeded, hard error */
        if (da->rtbl_top >= BASE_MAX)
                return (FLM_ERROR);

        /* A malloc(,, M_NOWAIT) failed somewhere, retry later */
        if (dxr->d == NULL)
                return (FLM_REBUILD);

        dp->f = choose_lookup_fn(da);
        dp->arg = dxr;

        return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

        return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

        return (FLM_BATCH);
}

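/*
 * Batched RIB change callback: mark the direct table chunks covered by
 * each changed prefix in updates_mask[], then fork a new dxr instance,
 * rebuild only the marked chunks, and switch the datapath pointer over
 * to the new snapshot, retiring the old one via an epoch callback.
 */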
static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
        struct dxr *dxr = data;
        struct dxr *new_dxr;
        struct dxr_aux *da;
        struct fib_dp new_dp;
        enum flm_op_result res;
        uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
        struct rib_rtable_info rinfo;
        int update_delta = 0;
#endif

        KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__));
        KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__));
        KASSERT(q->count < q->size, ("%s: q->count %d q->size %d",
            __FUNCTION__, q->count, q->size));

        da = dxr->aux;
        KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__));

        FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
        for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
                if (q->entries[ui].nh_new != NULL)
                        update_delta++;
                if (q->entries[ui].nh_old != NULL)
                        update_delta--;
#endif
                plen = q->entries[ui].plen;
                ip = ntohl(q->entries[ui].addr4.s_addr);
                if (plen < 32)
                        hmask = 0xffffffffU >> plen;
                else
                        hmask = 0;
                start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
                end = (ip | hmask) >> DXR_RANGE_SHIFT;

                if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
                        for (i = start >> 5; i <= end >> 5; i++)
                                da->updates_mask[i] = 0xffffffffU;
                else
                        for (i = start; i <= end; i++)
                                da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
                if (start < da->updates_low)
                        da->updates_low = start;
                if (end > da->updates_high)
                        da->updates_high = end;
        }

#ifdef INVARIANTS
        fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
        KASSERT(da->prefixes + update_delta == rinfo.num_prefixes,
            ("%s: update count mismatch", __FUNCTION__));
#endif

        res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
        if (res != FLM_SUCCESS)
                return (res);

        dxr_build(new_dxr);

        /* Structural limit exceeded, hard error */
        if (da->rtbl_top >= BASE_MAX) {
                dxr_destroy(new_dxr);
                return (FLM_ERROR);
        }

        /* A malloc(,, M_NOWAIT) failed somewhere, retry later */
        if (new_dxr->d == NULL) {
                dxr_destroy(new_dxr);
                return (FLM_REBUILD);
        }

        new_dp.f = choose_lookup_fn(da);
        new_dp.arg = new_dxr;
        if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
                fib_set_algo_ptr(dxr->fd, new_dxr);
                fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
                return (FLM_SUCCESS);
        }

        dxr_destroy(new_dxr);
        return (FLM_REBUILD);
}

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

        /* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
        return (251);
}

SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

static int
sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
{
        char buf[8];
        int error, new, i;

        snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
            V_frag_limit % 100);
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (!isdigit(*buf) && *buf != '.')
                return (EINVAL);
        for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
                new = new * 10 + buf[i] - '0';
        new *= 100;
        if (buf[i++] == '.') {
                if (!isdigit(buf[i]))
                        return (EINVAL);
                new += (buf[i++] - '0') * 10;
                if (isdigit(buf[i]))
                        new += buf[i++] - '0';
        }
        if (new > 1000)
                return (EINVAL);
        V_frag_limit = new;
        snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
            V_frag_limit % 100);
        return (0);
}

SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
    0, 0, sysctl_dxr_frag_limit, "A",
    "Fragmentation threshold to full rebuild");

static struct fib_lookup_module fib_dxr_mod = {
        .flm_name = "dxr",
        .flm_family = AF_INET,
        .flm_init_cb = dxr_init,
        .flm_destroy_cb = dxr_destroy,
        .flm_dump_rib_item_cb = dxr_dump_rib_item,
        .flm_dump_end_cb = dxr_dump_end,
        .flm_change_rib_item_cb = dxr_change_rib_item,
        .flm_change_rib_items_cb = dxr_change_rib_batch,
        .flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
        int error;

        switch (type) {
        case MOD_LOAD:
                chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
                    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
                trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
                    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
                fib_module_register(&fib_dxr_mod);
                return(0);
        case MOD_UNLOAD:
                error = fib_module_unregister(&fib_dxr_mod);
                if (error)
                        return (error);
                uma_zdestroy(chunk_zone);
                uma_zdestroy(trie_zone);
                return(0);
        default:
                return(EOPNOTSUPP);
        }
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);