2 * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
3 * Copyright (c) 2008-2010, BitGravity Inc.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
9 * 1. Redistributions of source code must retain the above copyright notice,
10 * this list of conditions and the following disclaimer.
12 * 2. Neither the name of the BitGravity Corporation nor the names of its
13 * contributors may be used to endorse or promote products derived from
14 * this software without specific prior written permission.
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
29 #include "opt_route.h"
30 #include "opt_mpath.h"
33 #include "opt_inet6.h"
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/bitstring.h>
41 #include <sys/condvar.h>
42 #include <sys/callout.h>
44 #include <sys/kernel.h>
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
51 #include <sys/queue.h>
53 #include <sys/sched.h>
55 #include <sys/socket.h>
56 #include <sys/syslog.h>
57 #include <sys/sysctl.h>
61 #include <net/if_llatbl.h>
62 #include <net/if_var.h>
63 #include <net/route.h>
64 #include <net/flowtable.h>
67 #include <netinet/in.h>
68 #include <netinet/in_systm.h>
69 #include <netinet/in_var.h>
70 #include <netinet/if_ether.h>
71 #include <netinet/ip.h>
73 #include <netinet/ip6.h>
75 #ifdef FLOWTABLE_HASH_ALL
76 #include <netinet/tcp.h>
77 #include <netinet/udp.h>
78 #include <netinet/sctp.h>
/*
 * Flow key layout and the per-flow cache entry (struct flentry).
 * With FLOWTABLE_HASH_ALL the key covers both addresses plus the
 * src/dst port pair; otherwise only the destination address.
 * NOTE(review): this extraction elides lines (struct header, #else/#endif,
 * closing braces); the fragment below is kept verbatim.
 */
83 #ifdef FLOWTABLE_HASH_ALL
84 #define KEY_PORTS (sizeof(uint16_t) * 2)
92 #define KEY_ADDR_LEN sizeof(struct in6_addr)
94 #define KEY_ADDR_LEN sizeof(struct in_addr)
/* KEYLEN is expressed in uint32_t words, matching f_key below. */
97 #define KEYLEN ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))
100 uint32_t f_hash; /* hash flowing forward */
101 uint32_t f_key[KEYLEN]; /* address(es and ports) */
102 uint32_t f_uptime; /* uptime at last access */
103 uint16_t f_fibnum; /* fib index */
104 #ifdef FLOWTABLE_HASH_ALL
105 uint8_t f_proto; /* protocol */
106 uint8_t f_flags; /* stale? */
/* Entries hang off a per-bucket singly-linked collision list. */
109 SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */
110 struct rtentry *f_rt; /* rtentry for flow */
111 struct llentry *f_lle; /* llentry for flow */
115 SLIST_HEAD(flist, flentry);
116 /* Make sure we can use pcpu_zone_ptr for struct flist. */
117 CTASSERT(sizeof(struct flist) == sizeof(void *));
/*
 * struct flowtable fields (struct header elided in this extraction) and
 * the per-cpu statistics accessor macros.  ft_stat is indexed by the
 * offset of the named field within struct flowtable_stat.
 */
120 counter_u64_t *ft_stat;
123 * ft_table is a malloc(9)ed array of pointers. Pointers point to
124 * memory from UMA_ZONE_PCPU zone.
125 * ft_masks is per-cpu pointer itself. Each instance points
126 * to a malloc(9)ed bitset, that is private to corresponding CPU.
128 struct flist **ft_table;
130 bitstr_t *ft_tmpmask;
133 #define FLOWSTAT_ADD(ft, name, v) \
134 counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
135 #define FLOWSTAT_INC(ft, name) FLOWSTAT_ADD(ft, name, 1)
/*
 * File-scope state: the cleaner kthread, hash seed, and the condvar/mutex
 * pair used to hand-shake between flowtable_flush() and the cleaner.
 * NOTE(review): extraction elides intervening lines; fragment kept verbatim.
 */
137 static struct proc *flowcleanerproc;
138 static uint32_t flow_hashjitter;
140 static struct cv flowclean_f_cv;
141 static struct cv flowclean_c_cv;
142 static struct mtx flowclean_lock;
143 static uint32_t flowclean_cycles;
147 * - add sysctls to resize && flush flow tables
148 * - Add per flowtable sysctls for statistics and configuring timeouts
149 * - add saturation counter to rtentry to support per-packet load-balancing
150 * add flag to indicate round-robin flow, add list lookup from head
152 * - add sysctl / device node / syscall to support exporting and importing
153 * of flows with flag to indicate that a flow was imported so should
154 * not be considered for auto-cleaning
155 * - support explicit connection state (currently only ad-hoc for DSR)
156 * - idetach() cleanup for options VIMAGE builds.
/* Per-vnet IPv4 and IPv6 flowtables. */
159 static VNET_DEFINE(struct flowtable, ip4_ft);
160 #define V_ip4_ft VNET(ip4_ft)
163 static VNET_DEFINE(struct flowtable, ip6_ft);
164 #define V_ip6_ft VNET(ip6_ft)
167 static uma_zone_t flow_zone;
/* Runtime enable knob: net.flowtable.enable (per-vnet). */
169 static VNET_DEFINE(int, flowtable_enable) = 1;
170 #define V_flowtable_enable VNET(flowtable_enable)
172 static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
174 SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
175 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
176 SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
177 &flow_zone, "Maximum number of flows allowed");
179 static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");
/* Forward declaration: shared lookup/insert path used by the per-AF helpers. */
181 static struct flentry *
182 flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);
/*
 * Look up (or insert) the flow entry for an outbound IPv4 packet and, on
 * success, fill ro->ro_dst with the packet's destination address.
 * Loopback/self-addressed packets are rejected early.  With
 * FLOWTABLE_HASH_ALL the key is {dst, src, ports} and the protocol and
 * FL_STALE flag are packed into the upper bits of fibnum.
 * NOTE(review): extraction elides lines (declarations, braces, switch
 * labels around the TCP/UDP/SCTP cases); fragment kept verbatim.
 */
185 static struct flentry *
186 flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
189 struct sockaddr_in *sin;
192 #ifdef FLOWTABLE_HASH_ALL
195 uint16_t sport, dport;
199 ip = mtod(m, struct ip *);
/* Don't cache flows to/from loopback or to self. */
201 if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
202 (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
203 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
206 fibnum = M_GETFIB(m);
208 #ifdef FLOWTABLE_HASH_ALL
209 iphlen = ip->ip_hl << 2;
216 th = (struct tcphdr *)((char *)ip + iphlen);
217 sport = th->th_sport;
218 dport = th->th_dport;
/* RST/FIN marks the flow stale so the cleaner reaps it early. */
219 if (th->th_flags & (TH_RST|TH_FIN))
220 fibnum |= (FL_STALE << 24);
226 uh = (struct udphdr *)((char *)ip + iphlen);
227 sport = uh->uh_sport;
228 dport = uh->uh_dport;
234 sh = (struct sctphdr *)((char *)ip + iphlen);
235 sport = sh->src_port;
236 dport = sh->dest_port;
237 /* XXXGL: handle stale? */
/* Key: dst, src, (dport<<16)|sport; proto rides in fibnum bits 16-23. */
245 key[0] = ip->ip_dst.s_addr;
246 key[1] = ip->ip_src.s_addr;
247 key[2] = (dport << 16) | sport;
248 fibnum |= proto << 16;
250 fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
253 #else /* !FLOWTABLE_HASH_ALL */
/* Destination-only key when not hashing the full 4-tuple. */
255 fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
256 sizeof(struct in_addr), fibnum);
258 #endif /* FLOWTABLE_HASH_ALL */
263 sin = (struct sockaddr_in *)&ro->ro_dst;
264 sin->sin_family = AF_INET;
265 sin->sin_len = sizeof(*sin);
266 sin->sin_addr = ip->ip_dst;
274 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
275 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
276 * pointer might become stale after other pullups (but we never use it
279 #define PULLUP_TO(_len, p, T) \
281 int x = (_len) + sizeof(T); \
282 if ((m)->m_len < x) \
284 p = (mtod(m, char *) + (_len)); \
/* Convenience casts for the extension-header walk below. */
287 #define TCP(p) ((struct tcphdr *)(p))
288 #define SCTP(p) ((struct sctphdr *)(p))
289 #define UDP(p) ((struct udphdr *)(p))
/*
 * IPv6 analogue of flowtable_lookup_ipv4(): walk the extension-header
 * chain to find the transport header (FLOWTABLE_HASH_ALL only), build
 * the key, look up/insert the flow, and fill ro->ro_dst.
 * NOTE(review): extraction elides lines (braces, case labels, m_pullup
 * failure paths); fragment kept verbatim.
 */
291 static struct flentry *
292 flowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
295 struct sockaddr_in6 *sin6;
298 #ifdef FLOWTABLE_HASH_ALL
302 uint16_t sport, dport;
309 ip6 = mtod(m, struct ip6_hdr *);
/* Don't cache flows to local destinations. */
310 if (in6_localaddr(&ip6->ip6_dst))
313 fibnum = M_GETFIB(m);
315 #ifdef FLOWTABLE_HASH_ALL
316 hlen = sizeof(struct ip6_hdr);
317 proto = ip6->ip6_nxt;
318 offset = sport = dport = 0;
/* Follow next-header chain until a ULP header is located. */
320 while (ulp == NULL) {
323 case IPPROTO_OSPFIGP:
331 PULLUP_TO(hlen, ulp, struct tcphdr);
332 dport = TCP(ulp)->th_dport;
333 sport = TCP(ulp)->th_sport;
/* RST/FIN marks the flow stale for early cleanup. */
334 if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
335 fibnum |= (FL_STALE << 24);
338 PULLUP_TO(hlen, ulp, struct sctphdr);
/* NOTE(review): dport is assigned src_port and sport dest_port here,
 * the reverse of the TCP/UDP cases above — looks swapped; confirm
 * against upstream before relying on SCTP keying. */
339 dport = SCTP(ulp)->src_port;
340 sport = SCTP(ulp)->dest_port;
341 /* XXXGL: handle stale? */
344 PULLUP_TO(hlen, ulp, struct udphdr);
345 dport = UDP(ulp)->uh_dport;
346 sport = UDP(ulp)->uh_sport;
348 case IPPROTO_HOPOPTS: /* RFC 2460 */
349 PULLUP_TO(hlen, ulp, struct ip6_hbh);
350 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
351 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
354 case IPPROTO_ROUTING: /* RFC 2460 */
355 PULLUP_TO(hlen, ulp, struct ip6_rthdr);
356 hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
357 proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
360 case IPPROTO_FRAGMENT: /* RFC 2460 */
361 PULLUP_TO(hlen, ulp, struct ip6_frag);
362 hlen += sizeof (struct ip6_frag);
363 proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
364 offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
368 case IPPROTO_DSTOPTS: /* RFC 2460 */
369 PULLUP_TO(hlen, ulp, struct ip6_hbh);
370 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
371 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
374 case IPPROTO_AH: /* RFC 2402 */
375 PULLUP_TO(hlen, ulp, struct ip6_ext);
/* AH length unit is 4 bytes (RFC 2402), unlike other ext headers. */
376 hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
377 proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
381 PULLUP_TO(hlen, ulp, struct ip6_ext);
/* Key: 4 words dst, 4 words src, 1 word of ports (9 words total). */
386 bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
387 bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
388 key[8] = (dport << 16) | sport;
389 fibnum |= proto << 16;
391 fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
393 #else /* !FLOWTABLE_HASH_ALL */
394 bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
395 fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
397 #endif /* FLOWTABLE_HASH_ALL */
402 sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
403 sin6->sin6_family = AF_INET6;
404 sin6->sin6_len = sizeof(*sin6);
405 bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));
/*
 * Return the current CPU's private occupancy bitmask for ft.
 * Normally callers hold a critical section; flowtable_free_stale()
 * relies on sched_bind() instead, so the assert is commented out.
 */
412 flowtable_mask(struct flowtable *ft)
416 * flowtable_free_stale() calls w/o critical section, but
417 * with sched_bind(). Since pointer is stable throughout
418 * ft lifetime, it is safe, otherwise...
420 * CRITICAL_ASSERT(curthread);
423 return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
/*
 * Return the current CPU's bucket list for the given hash.
 * Must be called inside a critical section (pins the CPU so the
 * per-cpu pointer stays valid).
 */
426 static struct flist *
427 flowtable_list(struct flowtable *ft, uint32_t hash)
430 CRITICAL_ASSERT(curthread);
431 return (zpcpu_get(ft->ft_table[hash % ft->ft_size]));
/*
 * Decide whether a flow entry should be reaped: its route is a down
 * host route, has no ifp, the link is down, its llentry is no longer
 * valid, it has idled past maxidle, or (FLOWTABLE_HASH_ALL) it was
 * flagged FL_STALE by a RST/FIN.
 * NOTE(review): return statements elided in this extraction.
 */
435 flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
438 if (((fle->f_rt->rt_flags & RTF_HOST) &&
439 ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) ||
440 (fle->f_rt->rt_ifp == NULL) ||
441 !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
442 (fle->f_lle->la_flags & LLE_VALID) == 0)
445 if (time_uptime - fle->f_uptime > maxidle)
448 #ifdef FLOWTABLE_HASH_ALL
449 if (fle->f_flags & FL_STALE)
/*
 * Fragment of the zone-fullness check: true when the flow zone is
 * above 7/8 of its configured maximum.
 */
461 count = uma_zone_get_cur(flow_zone);
462 max = uma_zone_get_max(flow_zone);
464 return (count > (max - (max >> 3)));
/*
 * Compare a candidate entry against the lookup key/fibnum (and, with
 * FLOWTABLE_HASH_ALL, the protocol packed into fibnum bits 16-23),
 * and require that its cached route and llentry are still usable.
 */
468 flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
470 #ifdef FLOWTABLE_HASH_ALL
473 proto = (fibnum >> 16) & 0xff;
477 CRITICAL_ASSERT(curthread);
479 /* Microoptimization for IPv4: don't use bcmp(). */
/* NOTE(review): the first clause reads as (keylen==4 && key mismatch) OR
 * bcmp()==0 — for a 4-byte key a MISmatch would appear to satisfy it.
 * Lines are elided here; confirm the full condition upstream. */
480 if (((keylen == sizeof(uint32_t) && (fle->f_key[0] != key[0])) ||
481 (bcmp(fle->f_key, key, keylen) == 0)) &&
482 fibnum == fle->f_fibnum &&
483 #ifdef FLOWTABLE_HASH_ALL
484 proto == fle->f_proto &&
486 (fle->f_rt->rt_flags & RTF_UP) &&
487 fle->f_rt->rt_ifp != NULL &&
488 (fle->f_lle->la_flags & LLE_VALID))
/*
 * Slow path: allocate a route and llentry for the key, build a new
 * flentry, and insert it into the current CPU's bucket.  If another
 * thread (or a CPU migration) already inserted an equal flow, the new
 * entry is freed and the existing one refreshed.
 * NOTE(review): extraction elides many lines (error unwinding, RTFREE /
 * LLE_FREE paths, braces); fragment kept verbatim.
 */
494 static struct flentry *
495 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
496 int keylen, uint32_t fibnum0)
499 struct route_in6 sro6;
504 struct route *ro = NULL;
506 struct lltable *lt = NULL;
508 struct sockaddr_storage *l3addr;
511 struct flentry *fle, *iter;
513 uint16_t fibnum = fibnum0;
514 #ifdef FLOWTABLE_HASH_ALL
/* Unpack proto (bits 16-23) and the true fib (low 16 bits). */
517 proto = (fibnum0 >> 16) & 0xff;
518 fibnum = fibnum0 & 0xffff;
522 * This bit of code ends up locking the
523 * same route 3 times (just like ip_output + ether_output)
525 * - in rt_check when called by arpresolve
526 * - dropping the refcount for the rtentry
528 * This could be consolidated to one if we wrote a variant
529 * of arpresolve with an rt_check variant that expected to
530 * receive the route locked
/* Rebuild a sockaddr destination from the key for the route lookup. */
533 if (ft == &V_ip4_ft) {
534 struct sockaddr_in *sin;
537 bzero(&sro.ro_dst, sizeof(sro.ro_dst));
539 sin = (struct sockaddr_in *)&sro.ro_dst;
540 sin->sin_family = AF_INET;
541 sin->sin_len = sizeof(*sin);
542 sin->sin_addr.s_addr = key[0];
546 if (ft == &V_ip6_ft) {
547 struct sockaddr_in6 *sin6;
549 ro = (struct route *)&sro6;
552 bzero(sin6, sizeof(*sin6));
553 sin6->sin6_family = AF_INET6;
554 sin6->sin6_len = sizeof(*sin6);
555 bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
/* Multipath builds pick a path keyed by the flow hash. */
561 rtalloc_mpath_fib(ro, hash, fibnum);
563 rtalloc_ign_fib(ro, 0, fibnum);
565 if (ro->ro_rt == NULL)
571 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
/* Resolve the L2 neighbor: gateway if present, else final dst. */
585 if (rt->rt_flags & RTF_GATEWAY)
586 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
588 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
589 lle = llentry_alloc(ifp, lt, l3addr);
596 /* Don't insert the entry if the ARP hasn't yet finished resolving. */
597 if ((lle->la_flags & LLE_VALID) == 0) {
600 FLOWSTAT_INC(ft, ft_fail_lle_invalid);
604 fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
612 bcopy(key, &fle->f_key, keylen);
615 fle->f_fibnum = fibnum;
616 fle->f_uptime = time_uptime;
617 #ifdef FLOWTABLE_HASH_ALL
618 fle->f_proto = proto;
/* f_flags carries FL_STALE etc. from fibnum0's top byte. */
619 fle->f_flags = fibnum0 >> 24;
623 mask = flowtable_mask(ft);
624 flist = flowtable_list(ft, hash);
/* First entry in the bucket also sets the occupancy bit. */
626 if (SLIST_EMPTY(flist)) {
627 bit_set(mask, (hash % ft->ft_size));
628 SLIST_INSERT_HEAD(flist, fle, f_next);
633 * find end of list and make sure that we were not
634 * preempted by another thread handling this flow
636 SLIST_FOREACH(iter, flist, f_next) {
637 KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size,
638 ("%s: wrong hash", __func__));
639 if (flow_matches(iter, key, keylen, fibnum)) {
641 * We probably migrated to an other CPU after
642 * lookup in flowtable_lookup_common() failed.
643 * It appeared that this CPU already has flow
646 iter->f_uptime = time_uptime;
647 #ifdef FLOWTABLE_HASH_ALL
648 iter->f_flags |= fibnum >> 24;
651 FLOWSTAT_INC(ft, ft_collisions);
/* Lost the race: discard our entry, keep the existing one. */
652 uma_zfree(flow_zone, fle);
657 SLIST_INSERT_HEAD(flist, fle, f_next);
660 FLOWSTAT_INC(ft, ft_inserts);
/*
 * Public entry point: dispatch by address family, and on a hit copy the
 * cached route/llentry into *ro with RT_NORTREF (caller must not drop a
 * reference it does not own).  Also seeds m->m_pkthdr.flowid from the
 * flow hash when the mbuf has no hash yet.
 * NOTE(review): switch/case labels and braces elided in this extraction.
 */
666 flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
670 if (V_flowtable_enable == 0)
676 fle = flowtable_lookup_ipv4(m, ro);
681 fle = flowtable_lookup_ipv6(m, ro);
685 panic("%s: sa %d", __func__, sa);
689 return (EHOSTUNREACH);
691 if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) {
692 M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
693 m->m_pkthdr.flowid = fle->f_hash;
696 ro->ro_rt = fle->f_rt;
697 ro->ro_lle = fle->f_lle;
698 ro->ro_flags |= RT_NORTREF;
/*
 * Hash the key with jenkins_hash32 (seeded by flow_hashjitter), scan the
 * current CPU's bucket for a match, refresh its timestamp on a hit, and
 * fall through to flowtable_insert() on a miss.
 * NOTE(review): critical-section enter/exit lines elided here.
 */
703 static struct flentry *
704 flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen,
711 FLOWSTAT_INC(ft, ft_lookups);
713 hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter);
716 flist = flowtable_list(ft, hash);
717 SLIST_FOREACH(fle, flist, f_next) {
718 KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size,
719 ("%s: wrong hash", __func__));
720 if (flow_matches(fle, key, keylen, fibnum)) {
721 fle->f_uptime = time_uptime;
722 #ifdef FLOWTABLE_HASH_ALL
/* Re-arm FL_STALE if the caller flagged it in fibnum's top byte. */
723 fle->f_flags |= fibnum >> 24;
726 FLOWSTAT_INC(ft, ft_hits);
732 FLOWSTAT_INC(ft, ft_misses);
734 return (flowtable_insert(ft, hash, key, keylen, fibnum));
738 * used by the bit_alloc macro
/* Map bit_alloc's calloc onto malloc(9) with our malloc type. */
740 #define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO)
/*
 * Allocate the hash table (per-cpu bucket pointers), the per-cpu
 * occupancy bitmasks, and the shared scratch mask used by the cleaner.
 */
742 flowtable_alloc(struct flowtable *ft)
745 ft->ft_table = malloc(ft->ft_size * sizeof(struct flist),
747 for (int i = 0; i < ft->ft_size; i++)
748 ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
750 ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
751 for (int i = 0; i < mp_ncpus; i++) {
754 b = zpcpu_get_cpu(ft->ft_masks, i);
755 *b = bit_alloc(ft->ft_size);
757 ft->ft_tmpmask = bit_alloc(ft->ft_size);
/*
 * Reap stale flows on the CPU this thread is bound to: walk the set
 * bits of a copy of the occupancy mask, unlink stale entries onto a
 * private freelist, then release their route/llentry references and
 * free them.  If rt != NULL, only flows using that rtentry are eligible
 * (used by flowtable_route_flush).
 * NOTE(review): extraction elides lines (locals, continue statements,
 * fleprev updates, RTFREE call); fragment kept verbatim.
 */
762 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle)
764 struct flist *flist, freelist;
765 struct flentry *fle, *fle1, *fleprev;
766 bitstr_t *mask, *tmpmask;
769 SLIST_INIT(&freelist);
770 mask = flowtable_mask(ft);
771 tmpmask = ft->ft_tmpmask;
772 tmpsize = ft->ft_size;
/* Work on a snapshot so bucket bits can be cleared as we go. */
773 memcpy(tmpmask, mask, ft->ft_size/8);
775 fleprev = NULL; /* pacify gcc */
777 * XXX Note to self, bit_ffs operates at the byte level
778 * and thus adds gratuitous overhead
780 bit_ffs(tmpmask, ft->ft_size, &curbit);
781 while (curbit != -1) {
782 if (curbit >= ft->ft_size || curbit < -1) {
784 "warning: bad curbit value %d \n",
789 FLOWSTAT_INC(ft, ft_free_checks);
792 flist = flowtable_list(ft, curbit);
794 if (SLIST_EMPTY(flist) && curbit > 0) {
796 "warning bit=%d set, but no fle found\n",
800 SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
801 if (rt != NULL && fle->f_rt != rt) {
805 if (!flow_stale(ft, fle, maxidle)) {
/* Unlink the stale entry and park it on the freelist. */
810 if (fle == SLIST_FIRST(flist))
811 SLIST_REMOVE_HEAD(flist, f_next);
813 SLIST_REMOVE_AFTER(fleprev, f_next);
814 SLIST_INSERT_HEAD(&freelist, fle, f_next);
816 if (SLIST_EMPTY(flist))
817 bit_clear(mask, curbit);
820 bit_clear(tmpmask, curbit);
/* Advance the scan window past the processed byte to resume bit_ffs. */
821 tmpmask += (curbit / 8);
822 tmpsize -= (curbit / 8) * 8;
823 bit_ffs(tmpmask, tmpsize, &curbit);
/* Second pass: drop route/lle references outside the bucket walk. */
826 SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
827 FLOWSTAT_INC(ft, ft_frees);
828 if (fle->f_rt != NULL)
830 if (fle->f_lle != NULL)
831 LLE_FREE(fle->f_lle);
832 uma_zfree(flow_zone, fle);
/*
 * Run flowtable_free_stale() once per CPU, binding the cleaner thread
 * to each CPU in turn so the per-cpu buckets/masks are accessed safely
 * (the per-CPU loop itself is elided in this extraction).
 */
837 flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle)
842 if (smp_started == 1) {
843 thread_lock(curthread);
844 sched_bind(curthread, i);
845 thread_unlock(curthread);
848 flowtable_free_stale(ft, rt, maxidle);
850 if (smp_started == 1) {
851 thread_lock(curthread);
852 sched_unbind(curthread);
853 thread_unlock(curthread);
/*
 * Evict every flow referencing rt from the family's table (maxidle 0
 * forces the idle check to match all candidates).  AF dispatch lines
 * are elided in this extraction.
 */
859 flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
861 struct flowtable *ft;
875 panic("%s: sa %d", __func__, sa);
878 flowtable_clean_vnet(ft, rt, 0);
/*
 * Cleaner kthread main loop: periodically sweep every vnet's IPv4 and
 * IPv6 tables, then sleep on flowclean_c_cv.  flowclean_cycles (bumped
 * in elided lines) plus flowclean_f_cv implement the handshake with
 * flowtable_flush().  Sweep period is 4*hz when tables look full,
 * 20*hz otherwise.
 */
882 flowtable_cleaner(void)
884 VNET_ITERATOR_DECL(vnet_iter);
888 log(LOG_INFO, "flowtable cleaner started\n");
891 uint32_t flowclean_freq, maxidle;
894 * The maximum idle time, as well as frequency are arbitrary.
902 VNET_FOREACH(vnet_iter) {
903 CURVNET_SET(vnet_iter);
905 flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
908 flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
915 flowclean_freq = 4*hz;
917 flowclean_freq = 20*hz;
918 mtx_lock(&flowclean_lock);
920 sched_prio(td, PPAUSE);
/* Wake any flowtable_flush() waiters, then sleep until next period. */
923 cv_broadcast(&flowclean_f_cv);
924 cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
925 mtx_unlock(&flowclean_lock);
/*
 * Synchronous flush: kick the cleaner and wait until flowclean_cycles
 * advances, i.e. at least one full sweep has completed.  Registered as
 * an ifnet_departure_event handler in flowtable_init().
 */
930 flowtable_flush(void *unused __unused)
934 mtx_lock(&flowclean_lock);
935 start = flowclean_cycles;
936 while (start == flowclean_cycles) {
937 cv_broadcast(&flowclean_c_cv);
938 cv_wait(&flowclean_f_cv, &flowclean_lock);
940 mtx_unlock(&flowclean_lock);
/* Kthread descriptor for the cleaner, started at SI_SUB_KTHREAD_IDLE. */
943 static struct kproc_desc flow_kp = {
948 SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
/*
 * Resolve a table size from the named tunable; tunable values must be a
 * power of 2.  The default is the next power of 2 above
 * 1024 + maxusers * 64.
 */
951 flowtable_get_size(char *name)
955 if (TUNABLE_INT_FETCH(name, &size)) {
958 if (!powerof2(size)) {
959 printf("%s must be power of 2\n", name);
964 * round up to the next power of 2
966 size = 1 << fls((1024 + maxusers * 64) - 1);
/*
 * One-time global init: seed the hash jitter, create the flow UMA zone
 * (capped at 1024 + maxusers*64 per CPU), set up the cleaner handshake
 * primitives, and register the interface-departure flush hook.
 */
973 flowtable_init(const void *unused __unused)
976 flow_hashjitter = arc4random();
978 flow_zone = uma_zcreate("flows", sizeof(struct flentry),
979 NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
980 uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);
982 cv_init(&flowclean_c_cv, "c_flowcleanwait");
983 cv_init(&flowclean_f_cv, "f_flowcleanwait");
984 mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
985 EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
986 EVENTHANDLER_PRI_ANY);
988 SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
989 flowtable_init, NULL);
/* IPv4 sysctl subtree, per-cpu statistics, and per-vnet table setup. */
992 static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
993 "Flowtable for IPv4");
995 static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
996 VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
997 VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
998 SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
999 ip4_ftstat, "Flowtable statistics for IPv4 "
1000 "(struct flowtable_stat, net/flowtable.h)");
/* Size the v4 table from net.flowtable.ip4.size and allocate it. */
1003 flowtable_init_vnet_v4(const void *unused __unused)
1006 V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size");
1007 V_ip4_ft.ft_stat = VNET(ip4_ftstat);
1008 flowtable_alloc(&V_ip4_ft);
1010 VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
1011 flowtable_init_vnet_v4, NULL);
/* IPv6 sysctl subtree, per-cpu statistics, and per-vnet table setup. */
1015 static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
1016 "Flowtable for IPv6");
1018 static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
1019 VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
1020 VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
1021 SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
1022 ip6_ftstat, "Flowtable statistics for IPv6 "
1023 "(struct flowtable_stat, net/flowtable.h)");
/* Size the v6 table from net.flowtable.ip6.size and allocate it. */
1026 flowtable_init_vnet_v6(const void *unused __unused)
1029 V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size");
1030 V_ip6_ft.ft_stat = VNET(ip6_ftstat);
1031 flowtable_alloc(&V_ip6_ft);
1033 VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
1034 flowtable_init_vnet_v6, NULL);
/*
 * DDB-only accessors: fetch an explicit CPU's occupancy mask and bucket
 * list (the debugger is not running on the target CPU, so the per-cpu
 * data must be addressed by cpuid).
 */
1039 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1042 return (zpcpu_get_cpu(*ft->ft_masks, cpuid));
1045 static struct flist *
1046 flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1049 return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid));
/*
 * DDB: pretty-print one flow entry — key (formatted per family and
 * hash mode), hash, idle time, fib, route, and notable flags.
 * NOTE(review): extraction elides lines (rt/ifp loads, braces, several
 * db_printf calls); fragment kept verbatim.
 */
1053 flow_show(struct flowtable *ft, struct flentry *fle)
1056 int rt_valid, ifp_valid;
1057 volatile struct rtentry *rt;
1058 struct ifnet *ifp = NULL;
1059 uint32_t *hashkey = fle->f_key;
1061 idle_time = (int)(time_uptime - fle->f_uptime);
1063 rt_valid = rt != NULL;
1066 ifp_valid = ifp != NULL;
1069 if (ft == &V_ip4_ft) {
1070 char daddr[4*sizeof "123"];
1071 #ifdef FLOWTABLE_HASH_ALL
1072 char saddr[4*sizeof "123"];
1073 uint16_t sport, dport;
1076 inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr);
1077 #ifdef FLOWTABLE_HASH_ALL
1078 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
/* Ports were stored as (dport<<16)|sport in network order. */
1079 dport = ntohs((uint16_t)(hashkey[2] >> 16));
1080 sport = ntohs((uint16_t)(hashkey[2] & 0xffff));
1081 db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport);
1083 db_printf("%s ", daddr);
1088 if (ft == &V_ip6_ft) {
1089 #ifdef FLOWTABLE_HASH_ALL
1090 db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
1091 hashkey[0], hashkey[1], hashkey[2],
1092 hashkey[3], hashkey[4], hashkey[5],
1093 hashkey[6], hashkey[7], hashkey[8]);
1095 db_printf("\n\tkey=%08x:%08x:%08x ",
1096 hashkey[0], hashkey[1], hashkey[2]);
1101 db_printf("hash=%08x idle_time=%03d"
1102 "\n\tfibnum=%02d rt=%p",
1103 fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt);
1105 #ifdef FLOWTABLE_HASH_ALL
1106 if (fle->f_flags & FL_STALE)
1107 db_printf(" FL_STALE ");
1110 if (rt->rt_flags & RTF_UP)
1111 db_printf(" RTF_UP ");
1114 if (ifp->if_flags & IFF_LOOPBACK)
1115 db_printf(" IFF_LOOPBACK ");
1116 if (ifp->if_flags & IFF_UP)
1117 db_printf(" IFF_UP ");
1118 if (ifp->if_flags & IFF_POINTOPOINT)
1119 db_printf(" IFF_POINTOPOINT ");
/*
 * DDB: dump every flow on one CPU by scanning a copy of that CPU's
 * occupancy mask, mirroring the walk in flowtable_free_stale().
 * NOTE(review): extraction elides lines (db_printf continuation, the
 * flow_show() call in the inner loop); fragment kept verbatim.
 */
1125 flowtable_show(struct flowtable *ft, int cpuid)
1128 bitstr_t *mask, *tmpmask;
1131 db_printf("cpu: %d\n", cpuid);
1132 mask = flowtable_mask_pcpu(ft, cpuid);
1133 tmpmask = ft->ft_tmpmask;
1134 memcpy(tmpmask, mask, ft->ft_size/8);
1136 * XXX Note to self, bit_ffs operates at the byte level
1137 * and thus adds gratuitous overhead
1139 bit_ffs(tmpmask, ft->ft_size, &curbit);
1140 while (curbit != -1) {
1141 struct flist *flist;
1142 struct flentry *fle;
1144 if (curbit >= ft->ft_size || curbit < -1) {
1145 db_printf("warning: bad curbit value %d \n",
1150 flist = flowtable_list_pcpu(ft, curbit, cpuid);
1152 SLIST_FOREACH(fle, flist, f_next)
1154 bit_clear(tmpmask, curbit);
1155 bit_ffs(tmpmask, ft->ft_size, &curbit);
/*
 * DDB: dump a table across all CPUs, and the "show flowtables" command
 * that iterates every vnet's IPv4 and IPv6 tables.
 * NOTE(review): per-CPU loop header and braces elided in this extraction.
 */
1160 flowtable_show_vnet(struct flowtable *ft)
1166 flowtable_show(ft, i);
1169 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1171 VNET_ITERATOR_DECL(vnet_iter);
1173 VNET_FOREACH(vnet_iter) {
1174 CURVNET_SET(vnet_iter);
1176 db_printf("vnet %p\n", vnet_iter);
1180 flowtable_show_vnet(&V_ip4_ft);
1184 flowtable_show_vnet(&V_ip6_ft);