1 /**************************************************************************
3 Copyright (c) 2008-2010, BitGravity Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the BitGravity Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include "opt_route.h"
31 #include "opt_mpath.h"
34 #include "opt_inet6.h"
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
39 #include <sys/param.h>
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/kernel.h>
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
51 #include <sys/sched.h>
53 #include <sys/socket.h>
54 #include <sys/syslog.h>
55 #include <sys/sysctl.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_var.h>
60 #include <net/route.h>
61 #include <net/flowtable.h>
64 #include <netinet/in.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
70 #include <netinet/ip6.h>
72 #include <netinet/tcp.h>
73 #include <netinet/udp.h>
74 #include <netinet/sctp.h>
76 #include <libkern/jenkins.h>
80 uint16_t ip_sport; /* source port */
81 uint16_t ip_dport; /* destination port */
82 in_addr_t ip_saddr; /* source address */
83 in_addr_t ip_daddr; /* destination address */
87 struct ipv4_tuple ipf_ipt;
92 uint16_t ip_sport; /* source port */
93 uint16_t ip_dport; /* destination port */
94 struct in6_addr ip_saddr; /* source address */
95 struct in6_addr ip_daddr; /* destination address */
99 struct ipv6_tuple ipf_ipt;
104 volatile uint32_t f_fhash; /* hash flowing forward */
105 uint16_t f_flags; /* flow flags */
107 uint8_t f_proto; /* protocol */
108 uint32_t f_fibnum; /* fib index */
109 uint32_t f_uptime; /* uptime at last access */
110 struct flentry *f_next; /* pointer to collision entry */
111 volatile struct rtentry *f_rt; /* rtentry for flow */
112 volatile struct llentry *f_lle; /* llentry for flow */
116 struct flentry fl_entry;
117 union ipv4_flow fl_flow;
121 struct flentry fl_entry;
122 union ipv6_flow fl_flow;
/*
 * Shorthand accessors for the common struct flentry header that the
 * per-family flow entries embed as `fl_entry'.  The members of struct
 * flentry carry an `f_' prefix (f_fhash, f_flags, ...), so each macro must
 * expand to that spelling to name a member that actually exists.
 */
#define	fl_fhash	fl_entry.f_fhash
#define	fl_flags	fl_entry.f_flags
#define	fl_proto	fl_entry.f_proto
#define	fl_uptime	fl_entry.f_uptime
#define	fl_rt		fl_entry.f_rt
#define	fl_lle		fl_entry.f_lle
132 #define SECS_PER_HOUR 3600
133 #define SECS_PER_DAY (24*SECS_PER_HOUR)
137 #define FIN_WAIT_IDLE 600
138 #define TCP_IDLE SECS_PER_DAY
141 typedef void fl_lock_t(struct flowtable *, uint32_t);
142 typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
145 struct flentry **global;
146 struct flentry **pcpu[MAXCPU];
149 struct flowtable_stats {
150 uint64_t ft_collisions;
151 uint64_t ft_allocated;
153 uint64_t ft_max_depth;
154 uint64_t ft_free_checks;
158 } __aligned(CACHE_LINE_SIZE);
161 struct flowtable_stats ft_stats[MAXCPU];
167 fl_lock_t *ft_unlock;
168 fl_rtalloc_t *ft_rtalloc;
170 * XXX need to pad out
172 struct mtx *ft_locks;
173 union flentryp ft_table;
174 bitstr_t *ft_masks[MAXCPU];
175 bitstr_t *ft_tmpmask;
176 struct flowtable *ft_next;
178 uint32_t ft_count __aligned(CACHE_LINE_SIZE);
179 uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE);
180 uint32_t ft_fin_wait_idle;
181 uint32_t ft_syn_idle;
182 uint32_t ft_tcp_idle;
184 } __aligned(CACHE_LINE_SIZE);
186 static struct proc *flowcleanerproc;
187 static VNET_DEFINE(struct flowtable *, flow_list_head);
188 static VNET_DEFINE(uint32_t, flow_hashjitter);
189 static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
190 static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
192 #define V_flow_list_head VNET(flow_list_head)
193 #define V_flow_hashjitter VNET(flow_hashjitter)
194 #define V_flow_ipv4_zone VNET(flow_ipv4_zone)
195 #define V_flow_ipv6_zone VNET(flow_ipv6_zone)
198 static struct cv flowclean_f_cv;
199 static struct cv flowclean_c_cv;
200 static struct mtx flowclean_lock;
201 static uint32_t flowclean_cycles;
202 static uint32_t flowclean_freq;
204 #ifdef FLOWTABLE_DEBUG
205 #define FLDPRINTF(ft, flags, fmt, ...) \
207 if ((ft)->ft_flags & (flags)) \
208 printf((fmt), __VA_ARGS__); \
212 #define FLDPRINTF(ft, flags, fmt, ...)
219 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
220 * to avoid extra cache evictions caused by incrementing a shared
222 * - add sysctls to resize && flush flow tables
223 * - Add per flowtable sysctls for statistics and configuring timeouts
224 * - add saturation counter to rtentry to support per-packet load-balancing
225 * add flag to indicate round-robin flow, add list lookup from head
227 * - add sysctl / device node / syscall to support exporting and importing
228 * of flows with flag to indicate that a flow was imported so should
229 * not be considered for auto-cleaning
230 * - support explicit connection state (currently only ad-hoc for DSR)
231 * - idetach() cleanup for options VIMAGE builds.
233 VNET_DEFINE(int, flowtable_enable) = 1;
234 static VNET_DEFINE(int, flowtable_debug);
235 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
236 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
237 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
238 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
239 static VNET_DEFINE(int, flowtable_nmbflows);
240 static VNET_DEFINE(int, flowtable_ready) = 0;
242 #define V_flowtable_enable VNET(flowtable_enable)
243 #define V_flowtable_debug VNET(flowtable_debug)
244 #define V_flowtable_syn_expire VNET(flowtable_syn_expire)
245 #define V_flowtable_udp_expire VNET(flowtable_udp_expire)
246 #define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
247 #define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
248 #define V_flowtable_nmbflows VNET(flowtable_nmbflows)
249 #define V_flowtable_ready VNET(flowtable_ready)
251 SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
252 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
253 &VNET_NAME(flowtable_debug), 0, "print debug info.");
254 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
255 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
258 * XXX This does not end up updating timeouts at runtime
259 * and only reflects the value for the last table added :-/
261 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
262 &VNET_NAME(flowtable_syn_expire), 0,
263 "seconds after which to remove syn allocated flow.");
264 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
265 &VNET_NAME(flowtable_udp_expire), 0,
266 "seconds after which to remove flow allocated to UDP.");
267 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
268 &VNET_NAME(flowtable_fin_wait_expire), 0,
269 "seconds after which to remove a flow in FIN_WAIT.");
270 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
271 &VNET_NAME(flowtable_tcp_expire), 0,
272 "seconds after which to remove flow allocated to a TCP connection.");
276 * Maximum number of flows that can be allocated of a given type.
278 * The table is allocated at boot time (for the pure caching case
279 * there is no reason why this could not be changed at runtime)
280 * and thus (currently) needs to be set with a tunable.
283 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
285 int error, newnmbflows;
287 newnmbflows = V_flowtable_nmbflows;
288 error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
289 if (error == 0 && req->newptr) {
290 if (newnmbflows > V_flowtable_nmbflows) {
291 V_flowtable_nmbflows = newnmbflows;
292 uma_zone_set_max(V_flow_ipv4_zone,
293 V_flowtable_nmbflows);
294 uma_zone_set_max(V_flow_ipv6_zone,
295 V_flowtable_nmbflows);
301 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
302 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
303 "Maximum number of flows allowed");
/*
 * Append one "\t<name>: <value>\n" line for counter ft_<field> of the
 * flowtable_stats reached through the caller's local `fs' pointer.
 * The counters are unsigned 64-bit values, so widen explicitly to
 * uintmax_t and use the unsigned %ju conversion; the signed %jd form
 * would render large counts as negative numbers.
 */
#define	FS_PRINT(sb, field)						\
	sbuf_printf((sb), "\t%s: %ju\n", #field, (uintmax_t)fs->ft_##field)
310 fs_print(struct sbuf *sb, struct flowtable_stats *fs)
313 FS_PRINT(sb, collisions);
314 FS_PRINT(sb, allocated);
315 FS_PRINT(sb, misses);
316 FS_PRINT(sb, max_depth);
317 FS_PRINT(sb, free_checks);
320 FS_PRINT(sb, lookups);
324 flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
327 struct flowtable_stats fs, *pfs;
329 if (ft->ft_flags & FL_PCPU) {
330 bzero(&fs, sizeof(fs));
333 pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
334 pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
335 pfs->ft_misses += ft->ft_stats[i].ft_misses;
336 pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
337 pfs->ft_frees += ft->ft_stats[i].ft_frees;
338 pfs->ft_hits += ft->ft_stats[i].ft_hits;
339 pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
340 if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
341 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
344 pfs = &ft->ft_stats[0];
350 sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
352 struct flowtable *ft;
356 sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
358 ft = V_flow_list_head;
360 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
361 flowtable_show_stats(sb, ft);
365 error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
370 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
371 NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
376 rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
379 rtalloc_ign_fib(ro, 0, fibnum);
384 flowtable_global_lock(struct flowtable *table, uint32_t hash)
386 int lock_index = (hash)&(table->ft_lock_count - 1);
388 mtx_lock(&table->ft_locks[lock_index]);
392 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
394 int lock_index = (hash)&(table->ft_lock_count - 1);
396 mtx_unlock(&table->ft_locks[lock_index]);
400 flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
407 flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
413 #define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
414 #define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
415 #define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
416 #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
418 #define FL_STALE (1<<8)
419 #define FL_OVERWRITE (1<<10)
422 flow_invalidate(struct flentry *fle)
425 fle->f_flags |= FL_STALE;
429 proto_to_flags(uint8_t proto)
452 flags_to_proto(int flags)
454 int proto, protoflags;
456 protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
457 switch (protoflags) {
462 proto = IPPROTO_SCTP;
475 #ifdef FLOWTABLE_DEBUG
477 ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
478 struct sockaddr_in *dsin)
480 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
482 if (flags & FL_HASH_ALL) {
483 inet_ntoa_r(ssin->sin_addr, saddr);
484 inet_ntoa_r(dsin->sin_addr, daddr);
485 printf("proto=%d %s:%d->%s:%d\n",
486 proto, saddr, ntohs(ssin->sin_port), daddr,
487 ntohs(dsin->sin_port));
489 inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
490 printf("proto=%d %s\n", proto, daddr);
497 ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
498 struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
506 uint16_t sport, dport;
508 proto = sport = dport = 0;
509 ip = mtod(m, struct ip *);
510 dsin->sin_family = AF_INET;
511 dsin->sin_len = sizeof(*dsin);
512 dsin->sin_addr = ip->ip_dst;
513 ssin->sin_family = AF_INET;
514 ssin->sin_len = sizeof(*ssin);
515 ssin->sin_addr = ip->ip_src;
518 if ((*flags & FL_HASH_ALL) == 0) {
519 FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
524 iphlen = ip->ip_hl << 2; /* XXX options? */
528 th = (struct tcphdr *)((caddr_t)ip + iphlen);
529 sport = th->th_sport;
530 dport = th->th_dport;
531 if ((*flags & FL_HASH_ALL) &&
532 (th->th_flags & (TH_RST|TH_FIN)))
536 uh = (struct udphdr *)((caddr_t)ip + iphlen);
537 sport = uh->uh_sport;
538 dport = uh->uh_dport;
541 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
542 sport = sh->src_port;
543 dport = sh->dest_port;
546 FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
548 /* no port - hence not a protocol we care about */
554 *flags |= proto_to_flags(proto);
555 ssin->sin_port = sport;
556 dsin->sin_port = dport;
561 ipv4_flow_lookup_hash_internal(
562 struct sockaddr_in *ssin, struct sockaddr_in *dsin,
563 uint32_t *key, uint16_t flags)
565 uint16_t sport, dport;
569 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
571 proto = flags_to_proto(flags);
572 sport = dport = key[2] = key[1] = key[0] = 0;
573 if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
574 key[1] = ssin->sin_addr.s_addr;
575 sport = ssin->sin_port;
578 key[2] = dsin->sin_addr.s_addr;
579 dport = dsin->sin_port;
581 if (flags & FL_HASH_ALL) {
582 ((uint16_t *)key)[0] = sport;
583 ((uint16_t *)key)[1] = dport;
585 offset = V_flow_hashjitter + proto;
587 return (jenkins_hashword(key, 3, offset));
590 static struct flentry *
591 flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
593 struct sockaddr_storage ssa, dsa;
595 struct sockaddr_in *dsin, *ssin;
597 dsin = (struct sockaddr_in *)&dsa;
598 ssin = (struct sockaddr_in *)&ssa;
599 bzero(dsin, sizeof(*dsin));
600 bzero(ssin, sizeof(*ssin));
601 flags = ft->ft_flags;
602 if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
605 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
609 flow_to_route(struct flentry *fle, struct route *ro)
611 uint32_t *hashkey = NULL;
612 struct sockaddr_in *sin;
614 sin = (struct sockaddr_in *)&ro->ro_dst;
615 sin->sin_family = AF_INET;
616 sin->sin_len = sizeof(*sin);
617 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
618 sin->sin_addr.s_addr = hashkey[2];
619 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
620 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
621 ro->ro_flags |= RT_NORTREF;
627 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
628 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
629 * pointer might become stale after other pullups (but we never use it
632 #define PULLUP_TO(_len, p, T) \
634 int x = (_len) + sizeof(T); \
635 if ((m)->m_len < x) { \
636 goto receive_failed; \
638 p = (mtod(m, char *) + (_len)); \
641 #define TCP(p) ((struct tcphdr *)(p))
642 #define SCTP(p) ((struct sctphdr *)(p))
643 #define UDP(p) ((struct udphdr *)(p))
646 ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
647 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
652 uint16_t src_port, dst_port;
656 offset = hlen = src_port = dst_port = 0;
658 ip6 = mtod(m, struct ip6_hdr *);
659 hlen = sizeof(struct ip6_hdr);
660 proto = ip6->ip6_nxt;
662 if ((*flags & FL_HASH_ALL) == 0)
665 while (ulp == NULL) {
668 case IPPROTO_OSPFIGP:
676 PULLUP_TO(hlen, ulp, struct tcphdr);
677 dst_port = TCP(ulp)->th_dport;
678 src_port = TCP(ulp)->th_sport;
679 if ((*flags & FL_HASH_ALL) &&
680 (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
684 PULLUP_TO(hlen, ulp, struct sctphdr);
685 src_port = SCTP(ulp)->src_port;
686 dst_port = SCTP(ulp)->dest_port;
689 PULLUP_TO(hlen, ulp, struct udphdr);
690 dst_port = UDP(ulp)->uh_dport;
691 src_port = UDP(ulp)->uh_sport;
693 case IPPROTO_HOPOPTS: /* RFC 2460 */
694 PULLUP_TO(hlen, ulp, struct ip6_hbh);
695 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
696 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
699 case IPPROTO_ROUTING: /* RFC 2460 */
700 PULLUP_TO(hlen, ulp, struct ip6_rthdr);
701 hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
702 proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
705 case IPPROTO_FRAGMENT: /* RFC 2460 */
706 PULLUP_TO(hlen, ulp, struct ip6_frag);
707 hlen += sizeof (struct ip6_frag);
708 proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
709 offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
713 case IPPROTO_DSTOPTS: /* RFC 2460 */
714 PULLUP_TO(hlen, ulp, struct ip6_hbh);
715 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
716 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
719 case IPPROTO_AH: /* RFC 2402 */
720 PULLUP_TO(hlen, ulp, struct ip6_ext);
721 hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
722 proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
726 PULLUP_TO(hlen, ulp, struct ip6_ext);
737 dsin6->sin6_family = AF_INET6;
738 dsin6->sin6_len = sizeof(*dsin6);
739 dsin6->sin6_port = dst_port;
740 memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
742 ssin6->sin6_family = AF_INET6;
743 ssin6->sin6_len = sizeof(*ssin6);
744 ssin6->sin6_port = src_port;
745 memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
746 *flags |= proto_to_flags(proto);
751 #define zero_key(key) \
765 ipv6_flow_lookup_hash_internal(
766 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
767 uint32_t *key, uint16_t flags)
769 uint16_t sport, dport;
773 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
776 proto = flags_to_proto(flags);
780 memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
781 dport = dsin6->sin6_port;
783 if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
784 memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
785 sport = ssin6->sin6_port;
787 if (flags & FL_HASH_ALL) {
788 ((uint16_t *)key)[0] = sport;
789 ((uint16_t *)key)[1] = dport;
791 offset = V_flow_hashjitter + proto;
793 return (jenkins_hashword(key, 9, offset));
796 static struct flentry *
797 flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
799 struct sockaddr_storage ssa, dsa;
800 struct sockaddr_in6 *dsin6, *ssin6;
803 dsin6 = (struct sockaddr_in6 *)&dsa;
804 ssin6 = (struct sockaddr_in6 *)&ssa;
805 bzero(dsin6, sizeof(*dsin6));
806 bzero(ssin6, sizeof(*ssin6));
807 flags = ft->ft_flags;
809 if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
812 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
816 flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
818 uint32_t *hashkey = NULL;
819 struct sockaddr_in6 *sin6;
821 sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
823 sin6->sin6_family = AF_INET6;
824 sin6->sin6_len = sizeof(*sin6);
825 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
826 memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
827 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
828 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
829 ro->ro_flags |= RT_NORTREF;
834 flowtable_mask(struct flowtable *ft)
838 if (ft->ft_flags & FL_PCPU)
839 mask = ft->ft_masks[curcpu];
841 mask = ft->ft_masks[0];
846 static struct flentry **
847 flowtable_entry(struct flowtable *ft, uint32_t hash)
849 struct flentry **fle;
850 int index = (hash % ft->ft_size);
852 if (ft->ft_flags & FL_PCPU) {
853 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
854 fle = &ft->ft_table.pcpu[curcpu][index];
856 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
857 fle = &ft->ft_table.global[index];
864 flow_stale(struct flowtable *ft, struct flentry *fle)
868 if ((fle->f_fhash == 0)
869 || ((fle->f_rt->rt_flags & RTF_HOST) &&
870 ((fle->f_rt->rt_flags & (RTF_UP))
872 || (fle->f_rt->rt_ifp == NULL)
873 || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
876 idle_time = time_uptime - fle->f_uptime;
878 if ((fle->f_flags & FL_STALE) ||
879 ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
880 && (idle_time > ft->ft_udp_idle)) ||
881 ((fle->f_flags & TH_FIN)
882 && (idle_time > ft->ft_fin_wait_idle)) ||
883 ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
884 && (idle_time > ft->ft_syn_idle)) ||
885 ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
886 && (idle_time > ft->ft_tcp_idle)) ||
887 ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
888 (fle->f_rt->rt_ifp == NULL)))
895 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
900 if (fle->f_flags & FL_IPV6) {
902 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
905 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
908 for (i = 0; i < nwords; i++)
912 static struct flentry *
913 flow_alloc(struct flowtable *ft)
915 struct flentry *newfle;
919 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
921 newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
923 atomic_add_int(&ft->ft_count, 1);
928 flow_free(struct flentry *fle, struct flowtable *ft)
932 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
933 atomic_add_int(&ft->ft_count, -1);
934 uma_zfree(zone, fle);
938 flow_full(struct flowtable *ft)
944 count = ft->ft_count;
946 if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
948 else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
951 if (full && !ft->ft_full) {
952 flowclean_freq = 4*hz;
953 if ((ft->ft_flags & FL_HASH_ALL) == 0)
954 ft->ft_udp_idle = ft->ft_fin_wait_idle =
955 ft->ft_syn_idle = ft->ft_tcp_idle = 5;
956 cv_broadcast(&flowclean_c_cv);
957 } else if (!full && ft->ft_full) {
958 flowclean_freq = 20*hz;
959 if ((ft->ft_flags & FL_HASH_ALL) == 0)
960 ft->ft_udp_idle = ft->ft_fin_wait_idle =
961 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
964 return (ft->ft_full);
968 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
969 uint32_t fibnum, struct route *ro, uint16_t flags)
971 struct flentry *fle, *fletail, *newfle, **flep;
972 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
977 newfle = flow_alloc(ft);
981 newfle->f_flags |= (flags & FL_IPV6);
982 proto = flags_to_proto(flags);
984 FL_ENTRY_LOCK(ft, hash);
985 mask = flowtable_mask(ft);
986 flep = flowtable_entry(ft, hash);
987 fletail = fle = *flep;
990 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
991 *flep = fle = newfle;
998 * find end of list and make sure that we were not
999 * preempted by another thread handling this flow
1001 while (fle != NULL) {
1002 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
1004 * there was either a hash collision
1005 * or we lost a race to insert
1007 FL_ENTRY_UNLOCK(ft, hash);
1008 flow_free(newfle, ft);
1010 if (flags & FL_OVERWRITE)
1015 * re-visit this double condition XXX
1017 if (fletail->f_next != NULL)
1018 fletail = fle->f_next;
1024 if (depth > fs->ft_max_depth)
1025 fs->ft_max_depth = depth;
1026 fletail->f_next = newfle;
1029 flowtable_set_hashkey(fle, key);
1031 fle->f_proto = proto;
1032 fle->f_rt = ro->ro_rt;
1033 fle->f_lle = ro->ro_lle;
1034 fle->f_fhash = hash;
1035 fle->f_fibnum = fibnum;
1036 fle->f_uptime = time_uptime;
1037 FL_ENTRY_UNLOCK(ft, hash);
1042 kern_flowtable_insert(struct flowtable *ft,
1043 struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1044 struct route *ro, uint32_t fibnum, int flags)
1046 uint32_t key[9], hash;
1048 flags = (ft->ft_flags | flags | FL_OVERWRITE);
1052 if (ssa->ss_family == AF_INET)
1053 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1054 (struct sockaddr_in *)dsa, key, flags);
1057 if (ssa->ss_family == AF_INET6)
1058 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1059 (struct sockaddr_in6 *)dsa, key, flags);
1061 if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1064 FLDPRINTF(ft, FL_DEBUG,
1065 "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1066 key[0], key[1], key[2], hash, fibnum, flags);
1067 return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
1071 flowtable_key_equal(struct flentry *fle, uint32_t *key)
1076 if (fle->f_flags & FL_IPV6) {
1078 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1081 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1084 for (i = 0; i < nwords; i++)
1085 if (hashkey[i] != key[i])
1092 flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
1094 struct flentry *fle = NULL;
1098 fle = flowtable_lookup_mbuf4(ft, m);
1102 fle = flowtable_lookup_mbuf6(ft, m);
1104 if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
1105 m->m_flags |= M_FLOWID;
1106 m->m_pkthdr.flowid = fle->f_fhash;
1112 flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
1113 struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
1115 uint32_t key[9], hash;
1116 struct flentry *fle;
1117 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1121 struct llentry *lle;
1122 struct route sro, *ro;
1123 struct route_in6 sro6;
1125 sro.ro_rt = sro6.ro_rt = NULL;
1126 sro.ro_lle = sro6.ro_lle = NULL;
1129 flags |= ft->ft_flags;
1130 proto = flags_to_proto(flags);
1132 if (ssa->ss_family == AF_INET) {
1133 struct sockaddr_in *ssin, *dsin;
1136 memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
1138 * The harvested source and destination addresses
1139 * may contain port information if the packet is
1140 * from a transport protocol (e.g. TCP/UDP). The
1141 * port field must be cleared before performing
1144 ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
1145 dsin = (struct sockaddr_in *)dsa;
1146 ssin = (struct sockaddr_in *)ssa;
1147 if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
1148 (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1149 (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
1152 hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
1156 if (ssa->ss_family == AF_INET6) {
1157 struct sockaddr_in6 *ssin6, *dsin6;
1159 ro = (struct route *)&sro6;
1160 memcpy(&sro6.ro_dst, dsa,
1161 sizeof(struct sockaddr_in6));
1162 ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
1163 dsin6 = (struct sockaddr_in6 *)dsa;
1164 ssin6 = (struct sockaddr_in6 *)ssa;
1167 hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
1171 * Ports are zero and this isn't a transmit cache
1172 * - thus not a protocol for which we need to keep
1174 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
1176 if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
1180 FL_ENTRY_LOCK(ft, hash);
1181 if ((fle = FL_ENTRY(ft, hash)) == NULL) {
1182 FL_ENTRY_UNLOCK(ft, hash);
1186 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1187 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1190 && fle->f_fhash == hash
1191 && flowtable_key_equal(fle, key)
1192 && (proto == fle->f_proto)
1193 && (fibnum == fle->f_fibnum)
1194 && (rt->rt_flags & RTF_UP)
1195 && (rt->rt_ifp != NULL)
1196 && (lle->la_flags & LLE_VALID)) {
1198 fle->f_uptime = time_uptime;
1199 fle->f_flags |= flags;
1200 FL_ENTRY_UNLOCK(ft, hash);
1202 } else if (fle->f_next != NULL) {
1206 FL_ENTRY_UNLOCK(ft, hash);
1208 if (flags & FL_NOAUTO || flow_full(ft))
1213 * This bit of code ends up locking the
1214 * same route 3 times (just like ip_output + ether_output)
1216 * - in rt_check when called by arpresolve
1217 * - dropping the refcount for the rtentry
1219 * This could be consolidated to one if we wrote a variant
1220 * of arpresolve with an rt_check variant that expected to
1221 * receive the route locked
1225 if ((ro->ro_dst.sa_family != AF_INET) &&
1226 (ro->ro_dst.sa_family != AF_INET6))
1227 panic("sa_family == %d\n", ro->ro_dst.sa_family);
1230 ft->ft_rtalloc(ro, hash, fibnum);
1231 if (ro->ro_rt == NULL)
1232 error = ENETUNREACH;
1234 struct llentry *lle = NULL;
1235 struct sockaddr_storage *l3addr;
1236 struct rtentry *rt = ro->ro_rt;
1237 struct ifnet *ifp = rt->rt_ifp;
1239 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
1245 if (ssa->ss_family == AF_INET6) {
1246 struct sockaddr_in6 *dsin6;
1248 dsin6 = (struct sockaddr_in6 *)dsa;
1249 if (in6_localaddr(&dsin6->sin6_addr)) {
1255 if (rt->rt_flags & RTF_GATEWAY)
1256 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1259 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1260 lle = llentry_alloc(ifp, LLTABLE6(ifp), l3addr);
1264 if (ssa->ss_family == AF_INET) {
1265 if (rt->rt_flags & RTF_GATEWAY)
1266 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1268 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1269 lle = llentry_alloc(ifp, LLTABLE(ifp), l3addr);
1280 error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
1290 return ((error) ? NULL : fle);
1294 * used by the bit_alloc macro
1296 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
1299 flowtable_alloc(char *name, int nentry, int flags)
1301 struct flowtable *ft, *fttail;
1304 if (V_flow_hashjitter == 0)
1305 V_flow_hashjitter = arc4random();
1307 KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
1309 ft = malloc(sizeof(struct flowtable),
1310 M_RTABLE, M_WAITOK | M_ZERO);
1313 ft->ft_flags = flags;
1314 ft->ft_size = nentry;
1316 ft->ft_rtalloc = rtalloc_mpath_fib;
1318 ft->ft_rtalloc = rtalloc_ign_wrapper;
1320 if (flags & FL_PCPU) {
1321 ft->ft_lock = flowtable_pcpu_lock;
1322 ft->ft_unlock = flowtable_pcpu_unlock;
1324 for (i = 0; i <= mp_maxid; i++) {
1325 ft->ft_table.pcpu[i] =
1326 malloc(nentry*sizeof(struct flentry *),
1327 M_RTABLE, M_WAITOK | M_ZERO);
1328 ft->ft_masks[i] = bit_alloc(nentry);
1331 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
1332 (fls(mp_maxid + 1) << 1));
1334 ft->ft_lock = flowtable_global_lock;
1335 ft->ft_unlock = flowtable_global_unlock;
1336 ft->ft_table.global =
1337 malloc(nentry*sizeof(struct flentry *),
1338 M_RTABLE, M_WAITOK | M_ZERO);
1339 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
1340 M_RTABLE, M_WAITOK | M_ZERO);
1341 for (i = 0; i < ft->ft_lock_count; i++)
1342 mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
1344 ft->ft_masks[0] = bit_alloc(nentry);
1346 ft->ft_tmpmask = bit_alloc(nentry);
1349 * In the local transmit case the table truly is
1350 * just a cache - so everything is eligible for
1351 * replacement after 5s of non-use
1353 if (flags & FL_HASH_ALL) {
1354 ft->ft_udp_idle = V_flowtable_udp_expire;
1355 ft->ft_syn_idle = V_flowtable_syn_expire;
1356 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
1357 ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
1359 ft->ft_udp_idle = ft->ft_fin_wait_idle =
1360 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
1365 * hook in to the cleaner list
1367 if (V_flow_list_head == NULL)
1368 V_flow_list_head = ft;
1370 fttail = V_flow_list_head;
1371 while (fttail->ft_next != NULL)
1372 fttail = fttail->ft_next;
1373 fttail->ft_next = ft;
1380 * The rest of the code is devoted to garbage collection of expired entries.
1381 * It is a new addition made necessary by the switch to dynamically allocating
1386 fle_free(struct flentry *fle, struct flowtable *ft)
1389 struct llentry *lle;
1391 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1392 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
/*
 * flowtable_free_stale: scan the buckets of a flow table whose bit is set
 * in the table mask, unlink selected entries while holding the bucket lock,
 * and gather them on a private list that is drained after the scan.
 *
 *   ft - flow table to scan.
 *   rt - compared against the f_rt of each entry.  Callers in this file pass
 *        either a specific rtentry or NULL.
 *
 * Entry selection involves the f_rt comparison and flow_stale(); the exact
 * nesting of those tests is not visible in this excerpt, so it is not
 * described further here.  Statistics are updated in the ft_stats slot
 * indexed by curcpu.
 */
1401 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
1403 int curbit = 0, count;
1404 struct flentry *fle, **flehead, *fleprev;
1405 struct flentry *flefreehead, *flefreetail, *fletmp;
1406 bitstr_t *mask, *tmpmask;
1407 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1409 flefreehead = flefreetail = NULL;
1410 mask = flowtable_mask(ft);
1411 tmpmask = ft->ft_tmpmask;
/*
 * Work on a scratch copy of the mask so bits can be cleared as buckets are
 * visited without disturbing the live mask.
 * NOTE(review): ft_size/8 truncates when ft_size is not a multiple of 8;
 * confirm that ft_size is always byte aligned.
 */
1412 memcpy(tmpmask, mask, ft->ft_size/8);
1414 * XXX Note to self, bit_ffs operates at the byte level
1415 * and thus adds gratuitous overhead
1417 bit_ffs(tmpmask, ft->ft_size, &curbit);
1418 while (curbit != -1) {
1419 if (curbit >= ft->ft_size || curbit < -1) {
1421 "warning: bad curbit value %d \n",
/* The bucket is examined and modified only while its entry lock is held. */
1426 FL_ENTRY_LOCK(ft, curbit);
1427 flehead = flowtable_entry(ft, curbit);
1428 fle = fleprev = *flehead;
1430 fs->ft_free_checks++;
1432 if (fle == NULL && curbit > 0) {
1434 "warning bit=%d set, but no fle found\n",
1438 while (fle != NULL) {
1440 if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
1445 } else if (!flow_stale(ft, fle)) {
1451 * delete head of the list
1453 if (fleprev == *flehead) {
1455 if (fle == fleprev) {
1456 fleprev = *flehead = fle->f_next;
1458 fleprev = *flehead = fle;
1462 * don't advance fleprev
1465 fleprev->f_next = fle->f_next;
1466 fle = fleprev->f_next;
/* Append the unlinked entry (fletmp) to the tail of the private free list. */
1469 if (flefreehead == NULL)
1470 flefreehead = flefreetail = fletmp;
1472 flefreetail->f_next = fletmp;
1473 flefreetail = fletmp;
1475 fletmp->f_next = NULL;
/* An emptied bucket has its bit cleared in the live mask before unlocking. */
1477 if (*flehead == NULL)
1478 bit_clear(mask, curbit);
1479 FL_ENTRY_UNLOCK(ft, curbit);
1480 bit_clear(tmpmask, curbit);
1481 bit_ffs(tmpmask, ft->ft_size, &curbit);
/* Drain the private list built during the scan. */
1484 while ((fle = flefreehead) != NULL) {
1485 flefreehead = fle->f_next;
/*
 * NOTE(review): count is declared without an initialiser and its zeroing is
 * not visible in this excerpt; confirm it is set before being read here.
 */
1490 if (V_flowtable_debug && count)
1491 log(LOG_DEBUG, "freed %d flow entries\n", count);
/*
 * flowtable_route_flush: run flowtable_free_stale(ft, rt) over a flow table.
 *
 * For a table flagged FL_PCPU the current thread is bound to CPU i with
 * sched_bind() before the scan and released with sched_unbind() afterwards;
 * both steps are skipped unless smp_started == 1.  The loop that supplies i
 * is not visible in this excerpt.  A second, unbound call to
 * flowtable_free_stale() follows, presumably the branch for tables without
 * FL_PCPU.
 */
1495 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1499 if (ft->ft_flags & FL_PCPU) {
1501 if (smp_started == 1) {
1502 thread_lock(curthread);
1503 sched_bind(curthread, i);
1504 thread_unlock(curthread);
1507 flowtable_free_stale(ft, rt);
1509 if (smp_started == 1) {
1510 thread_lock(curthread);
1511 sched_unbind(curthread);
1512 thread_unlock(curthread);
1516 flowtable_free_stale(ft, rt);
/*
 * flowtable_clean_vnet: scan the flow tables of the current vnet, starting
 * at V_flow_list_head, calling flowtable_free_stale() with a NULL rt.
 *
 * For FL_PCPU tables the thread is bound to CPU i around each scan (only
 * when smp_started == 1) and unbound afterwards.  The loop over i and the
 * step that advances ft are not visible in this excerpt.
 */
1521 flowtable_clean_vnet(void)
1523 struct flowtable *ft;
1526 ft = V_flow_list_head;
1527 while (ft != NULL) {
1528 if (ft->ft_flags & FL_PCPU) {
1530 if (smp_started == 1) {
1531 thread_lock(curthread);
1532 sched_bind(curthread, i);
1533 thread_unlock(curthread);
1536 flowtable_free_stale(ft, NULL);
1538 if (smp_started == 1) {
1539 thread_lock(curthread);
1540 sched_unbind(curthread);
1541 thread_unlock(curthread);
1545 flowtable_free_stale(ft, NULL);
/*
 * flowtable_cleaner: periodic cleaning routine, presumably the entry point
 * of the kernel process started through flow_kp -- TODO confirm.
 *
 * Logs a start message, then for each vnet sets the vnet context and calls
 * flowtable_clean_vnet().  Afterwards it takes flowclean_lock, calls
 * sched_prio(td, PPAUSE), wakes any thread sleeping on flowclean_f_cv and
 * sleeps on flowclean_c_cv for up to flowclean_freq ticks.  The enclosing
 * loop and any update of the cycle counter are not visible in this excerpt.
 *
 * NOTE(review): the comment in the body mentions a 10 second interval,
 * while flowtable_init() sets flowclean_freq to 20*hz; confirm which value
 * is intended.
 */
1552 flowtable_cleaner(void)
1554 VNET_ITERATOR_DECL(vnet_iter);
1558 log(LOG_INFO, "flowtable cleaner started\n");
1562 VNET_FOREACH(vnet_iter) {
1563 CURVNET_SET(vnet_iter);
1564 flowtable_clean_vnet();
1567 VNET_LIST_RUNLOCK();
1570 * The 10 second interval between cleaning checks
1573 mtx_lock(&flowclean_lock);
1575 sched_prio(td, PPAUSE);
/* Wake waiters in flowtable_flush(), then sleep until poked or timed out. */
1578 cv_broadcast(&flowclean_f_cv);
1579 cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
1580 mtx_unlock(&flowclean_lock);
/*
 * flowtable_flush: wait until flowclean_cycles changes (presumably advanced
 * by the cleaner once a pass completes -- the increment is not visible in
 * this excerpt).
 *
 * Registered in flowtable_init() as an ifnet_departure_event handler; the
 * argument is unused.  Under flowclean_lock it records the current
 * flowclean_cycles value, then repeatedly broadcasts on flowclean_c_cv (the
 * condition variable the cleaner sleeps on) and sleeps on flowclean_f_cv
 * until the counter differs from the recorded value.
 */
1585 flowtable_flush(void *unused __unused)
1589 mtx_lock(&flowclean_lock);
1590 start = flowclean_cycles;
1591 while (start == flowclean_cycles) {
1592 cv_broadcast(&flowclean_c_cv);
1593 cv_wait(&flowclean_f_cv, &flowclean_lock);
1595 mtx_unlock(&flowclean_lock);
/*
 * Descriptor handed to kproc_start() by the SYSINIT below.  Its initialiser
 * fields are not visible in this excerpt; presumably it names
 * flowtable_cleaner as the process entry point -- TODO confirm.
 */
1598 static struct kproc_desc flow_kp = {
1603 SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
/*
 * flowtable_init_vnet: per-vnet setup, registered with VNET_SYSINIT at
 * SI_SUB_SMP / SI_ORDER_ANY.  The argument is unused.
 *
 * Sizes the flow limit as 1024 + maxusers * 64 * mp_ncpus, creates the
 * ip4flow and ip6flow UMA zones for struct flentry_v4 and struct
 * flentry_v6, caps both zones at that limit with uma_zone_set_max(), and
 * finally sets V_flowtable_ready to 1.
 */
1606 flowtable_init_vnet(const void *unused __unused)
1609 V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
1610 V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
1611 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1612 V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
1613 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1614 uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
1615 uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
/* Set last, after both zones exist and are capped. */
1616 V_flowtable_ready = 1;
1618 VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
1619 flowtable_init_vnet, NULL);
/*
 * flowtable_init: global setup, registered with SYSINIT at
 * SI_SUB_KTHREAD_INIT / SI_ORDER_FIRST.  The argument is unused.
 *
 * Initialises the two condition variables and the mutex shared by
 * flowtable_cleaner() and flowtable_flush(), registers flowtable_flush as
 * an ifnet_departure_event handler, and sets flowclean_freq (the timeout
 * later passed to cv_timedwait) to 20*hz.
 */
1622 flowtable_init(const void *unused __unused)
1625 cv_init(&flowclean_c_cv, "c_flowcleanwait");
1626 cv_init(&flowclean_f_cv, "f_flowcleanwait");
1627 mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
1628 EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
1629 EVENTHANDLER_PRI_ANY);
1630 flowclean_freq = 20*hz;
1632 SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
1633 flowtable_init, NULL);
/*
 * flowtable_uninit: per-vnet teardown, registered with VNET_SYSUNINIT.
 * The argument is unused.
 *
 * Clears V_flowtable_ready first, then destroys the IPv4 and IPv6 flow
 * entry UMA zones created by flowtable_init_vnet().
 */
1638 flowtable_uninit(const void *unused __unused)
1641 V_flowtable_ready = 0;
1642 uma_zdestroy(V_flow_ipv4_zone);
1643 uma_zdestroy(V_flow_ipv6_zone);
1646 VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1647 flowtable_uninit, NULL);
/*
 * flowtable_get_hashkey: select the key words (fl_flow.ipf_key) stored in a
 * flow entry, according to the address family flag of the entry.
 *
 *   fle - flow entry; when FL_IPV6 is set in f_flags the entry is viewed
 *         as a struct flentry_v6, otherwise as a struct flentry_v4.
 *
 * flow_show() indexes the result as an array of words: nine words for
 * FL_IPV6 entries and three words otherwise.
 *
 * Fix: the two casts were previously swapped, so FL_IPV6 entries were read
 * through the v4 layout and all other entries through the v6 layout.
 * The line joining the two assignments (presumably an else) is not visible
 * in this excerpt and is left untouched.
 */
1652 flowtable_get_hashkey(struct flentry *fle)
1656 if (fle->f_flags & FL_IPV6)
1657 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1659 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
/*
 * flowtable_mask_pcpu: pick the bucket bitmask of a table for a given CPU.
 *
 * Tables flagged FL_PCPU use ft_masks[cpuid]; the other assignment reads
 * ft_masks[0] (the line joining the two, presumably an else, is not visible
 * in this excerpt).  flowtable_show_vnet() passes cpuid == -1 for tables
 * without FL_PCPU.
 */
1665 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1669 if (ft->ft_flags & FL_PCPU)
1670 mask = ft->ft_masks[cpuid];
1672 mask = ft->ft_masks[0];
/*
 * flowtable_entry_pcpu: locate the bucket head slot for a hash value.
 *
 * The bucket index is hash % ft_size.  Tables flagged FL_PCPU index
 * ft_table.pcpu[cpuid]; the other assignment indexes ft_table.global.
 * The result is the address of the slot, not the entry itself.
 */
1677 static struct flentry **
1678 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1680 struct flentry **fle;
1681 int index = (hash % ft->ft_size);
1683 if (ft->ft_flags & FL_PCPU) {
1684 fle = &ft->ft_table.pcpu[cpuid][index];
1686 fle = &ft->ft_table.global[index];
/*
 * flow_show: print one flow entry with db_printf.
 *
 * Output covers the destination address (plus source address and both
 * ports when the table has FL_HASH_ALL), the entry flags FL_STALE, FL_TCP
 * and FL_UDP, selected route and interface flags, the raw key words (nine
 * for FL_IPV6 entries, three otherwise), and finally hash, idle time, fib
 * number and route pointer.  idle_time is time_uptime minus f_uptime.
 *
 * Key layout as read by the address code: word 2 is formatted as the
 * destination address, word 1 as the source address, and the two 16-bit
 * halves at the start of the key are byte-swapped with ntohs into sport and
 * dport.  saddr and daddr are 16-byte buffers (4 * sizeof of a 3-digit
 * string).
 *
 * Several lines of the body (the assignments to rt and ifp, any guards
 * around the rt and ifp flag tests, and the body of the first FL_IPV6
 * test) are not visible in this excerpt, so those conditions are not
 * described here.
 */
1693 flow_show(struct flowtable *ft, struct flentry *fle)
1696 int rt_valid, ifp_valid;
1697 uint16_t sport, dport;
1699 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
1700 volatile struct rtentry *rt;
1701 struct ifnet *ifp = NULL;
1703 idle_time = (int)(time_uptime - fle->f_uptime);
1705 rt_valid = rt != NULL;
1708 ifp_valid = ifp != NULL;
1709 hashkey = flowtable_get_hashkey(fle);
1710 if (fle->f_flags & FL_IPV6)
1713 inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
1714 if (ft->ft_flags & FL_HASH_ALL) {
1715 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
1716 sport = ntohs(((uint16_t *)hashkey)[0]);
1717 dport = ntohs(((uint16_t *)hashkey)[1]);
1718 db_printf("%s:%d->%s:%d",
1719 saddr, sport, daddr,
1722 db_printf("%s ", daddr);
1725 if (fle->f_flags & FL_STALE)
1726 db_printf(" FL_STALE ");
1727 if (fle->f_flags & FL_TCP)
1728 db_printf(" FL_TCP ");
1729 if (fle->f_flags & FL_UDP)
1730 db_printf(" FL_UDP ");
1732 if (rt->rt_flags & RTF_UP)
1733 db_printf(" RTF_UP ");
1736 if (ifp->if_flags & IFF_LOOPBACK)
1737 db_printf(" IFF_LOOPBACK ");
1738 if (ifp->if_flags & IFF_UP)
1739 db_printf(" IFF_UP ");
1740 if (ifp->if_flags & IFF_POINTOPOINT)
1741 db_printf(" IFF_POINTOPOINT ");
1743 if (fle->f_flags & FL_IPV6)
1744 db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
1745 hashkey[0], hashkey[1], hashkey[2],
1746 hashkey[3], hashkey[4], hashkey[5],
1747 hashkey[6], hashkey[7], hashkey[8]);
1749 db_printf("\n\tkey=%08x:%08x:%08x ",
1750 hashkey[0], hashkey[1], hashkey[2]);
1751 db_printf("hash=%08x idle_time=%03d"
1752 "\n\tfibnum=%02d rt=%p",
1753 fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
/*
 * flowtable_show: print the entries of one table, or of one CPU slice of a
 * per-CPU table.
 *
 * Copies the mask chosen by flowtable_mask_pcpu() into ft_tmpmask, then
 * repeatedly asks bit_ffs() for a set bit, looks up that bucket with
 * flowtable_entry_pcpu() and iterates while fle is non-NULL (the loop body
 * is not visible in this excerpt), clearing the bit in the scratch copy
 * before searching again.  The outer loop ends when curbit becomes -1.
 *
 * NOTE(review): ft_tmpmask is the same scratch mask used by
 * flowtable_free_stale(), and the copy length ft_size/8 truncates when
 * ft_size is not a multiple of 8; confirm both are acceptable here.
 */
1758 flowtable_show(struct flowtable *ft, int cpuid)
1761 struct flentry *fle, **flehead;
1762 bitstr_t *mask, *tmpmask;
1765 db_printf("cpu: %d\n", cpuid);
1766 mask = flowtable_mask_pcpu(ft, cpuid);
1767 tmpmask = ft->ft_tmpmask;
1768 memcpy(tmpmask, mask, ft->ft_size/8);
1770 * XXX Note to self, bit_ffs operates at the byte level
1771 * and thus adds gratuitous overhead
1773 bit_ffs(tmpmask, ft->ft_size, &curbit);
1774 while (curbit != -1) {
1775 if (curbit >= ft->ft_size || curbit < -1) {
1776 db_printf("warning: bad curbit value %d \n",
1781 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1784 while (fle != NULL) {
1789 bit_clear(tmpmask, curbit);
1790 bit_ffs(tmpmask, ft->ft_size, &curbit);
/*
 * flowtable_show_vnet: print the flow tables of the current vnet, starting
 * at V_flow_list_head.
 *
 * For each table the name is printed, then flowtable_show() is called with
 * a CPU index i for FL_PCPU tables or with -1 otherwise.  The loop that
 * supplies i and the step that advances ft are not visible in this excerpt.
 *
 * NOTE(review): the table name is printed with printf while the rest of
 * the show path uses db_printf -- confirm whether that is intentional.
 */
1795 flowtable_show_vnet(void)
1797 struct flowtable *ft;
1800 ft = V_flow_list_head;
1801 while (ft != NULL) {
1802 printf("name: %s\n", ft->ft_name);
1803 if (ft->ft_flags & FL_PCPU) {
1805 flowtable_show(ft, i);
1808 flowtable_show(ft, -1);
1814 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1816 VNET_ITERATOR_DECL(vnet_iter);
1818 VNET_FOREACH(vnet_iter) {
1819 CURVNET_SET(vnet_iter);
1821 db_printf("vnet %p\n", vnet_iter);
1823 flowtable_show_vnet();