1 /**************************************************************************
3 Copyright (c) 2008-2010, BitGravity Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the BitGravity Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include "opt_route.h"
31 #include "opt_mpath.h"
34 #include "opt_inet6.h"
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
39 #include <sys/param.h>
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/kernel.h>
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
51 #include <sys/sched.h>
53 #include <sys/socket.h>
54 #include <sys/syslog.h>
55 #include <sys/sysctl.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_var.h>
60 #include <net/route.h>
61 #include <net/flowtable.h>
64 #include <netinet/in.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
70 #include <netinet/ip6.h>
72 #include <netinet/tcp.h>
73 #include <netinet/udp.h>
74 #include <netinet/sctp.h>
76 #include <libkern/jenkins.h>
80 uint16_t ip_sport; /* source port */
81 uint16_t ip_dport; /* destination port */
82 in_addr_t ip_saddr; /* source address */
83 in_addr_t ip_daddr; /* destination address */
87 struct ipv4_tuple ipf_ipt;
92 uint16_t ip_sport; /* source port */
93 uint16_t ip_dport; /* destination port */
94 struct in6_addr ip_saddr; /* source address */
95 struct in6_addr ip_daddr; /* destination address */
99 struct ipv6_tuple ipf_ipt;
104 volatile uint32_t f_fhash; /* hash flowing forward */
105 uint16_t f_flags; /* flow flags */
107 uint8_t f_proto; /* protocol */
108 uint32_t f_fibnum; /* fib index */
109 uint32_t f_uptime; /* uptime at last access */
110 struct flentry *f_next; /* pointer to collision entry */
111 volatile struct rtentry *f_rt; /* rtentry for flow */
112 volatile struct llentry *f_lle; /* llentry for flow */
116 struct flentry fl_entry;
117 union ipv4_flow fl_flow;
121 struct flentry fl_entry;
122 union ipv6_flow fl_flow;
/*
 * Shorthand for reaching the common struct flentry fields through an
 * embedded fl_entry member.  The struct flentry members are spelled with
 * an f_ prefix (f_fhash, f_flags, f_proto, f_uptime, f_rt, f_lle), so the
 * expansions must name those members; the previous fl_-prefixed spellings
 * referred to members that do not exist and could not compile if used.
 */
125 #define fl_fhash fl_entry.f_fhash
126 #define fl_flags fl_entry.f_flags
127 #define fl_proto fl_entry.f_proto
128 #define fl_uptime fl_entry.f_uptime
129 #define fl_rt fl_entry.f_rt
130 #define fl_lle fl_entry.f_lle
/* Time-unit helpers used to express the idle timeouts in seconds. */
132 #define SECS_PER_HOUR 3600
133 #define SECS_PER_DAY (24*SECS_PER_HOUR)
/* Initial value of the flowtable_fin_wait_expire setting (seconds). */
137 #define FIN_WAIT_IDLE 600
/* Initial value of the flowtable_tcp_expire setting: one day. */
138 #define TCP_IDLE SECS_PER_DAY
/*
 * Signature of a table's bucket lock/unlock hooks.  FL_ENTRY_LOCK and
 * FL_ENTRY_UNLOCK invoke the hooks with the table and a hash value.
 */
141 typedef void fl_lock_t(struct flowtable *, uint32_t);
/*
 * Signature of a table's route-allocation hook (ft_rtalloc), which is
 * called with the route to fill in, the flow hash and the fib number.
 */
142 typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
145 struct flentry **global;
146 struct flentry **pcpu[MAXCPU];
149 struct flowtable_stats {
150 uint64_t ft_collisions;
151 uint64_t ft_allocated;
153 uint64_t ft_max_depth;
154 uint64_t ft_free_checks;
158 } __aligned(CACHE_LINE_SIZE);
161 struct flowtable_stats ft_stats[MAXCPU];
167 fl_lock_t *ft_unlock;
168 fl_rtalloc_t *ft_rtalloc;
170 * XXX need to pad out
172 struct mtx *ft_locks;
173 union flentryp ft_table;
174 bitstr_t *ft_masks[MAXCPU];
175 bitstr_t *ft_tmpmask;
176 struct flowtable *ft_next;
178 uint32_t ft_count __aligned(CACHE_LINE_SIZE);
179 uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE);
180 uint32_t ft_fin_wait_idle;
181 uint32_t ft_syn_idle;
182 uint32_t ft_tcp_idle;
184 } __aligned(CACHE_LINE_SIZE);
186 static struct proc *flowcleanerproc;
187 static VNET_DEFINE(struct flowtable *, flow_list_head);
188 static VNET_DEFINE(uint32_t, flow_hashjitter);
189 static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
190 static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
192 #define V_flow_list_head VNET(flow_list_head)
193 #define V_flow_hashjitter VNET(flow_hashjitter)
194 #define V_flow_ipv4_zone VNET(flow_ipv4_zone)
195 #define V_flow_ipv6_zone VNET(flow_ipv6_zone)
198 static struct cv flowclean_f_cv;
199 static struct cv flowclean_c_cv;
200 static struct mtx flowclean_lock;
201 static uint32_t flowclean_cycles;
202 static uint32_t flowclean_freq;
204 #ifdef FLOWTABLE_DEBUG
205 #define FLDPRINTF(ft, flags, fmt, ...) \
207 if ((ft)->ft_flags & (flags)) \
208 printf((fmt), __VA_ARGS__); \
212 #define FLDPRINTF(ft, flags, fmt, ...)
219 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
220 * to avoid extra cache evictions caused by incrementing a shared
222 * - add sysctls to resize && flush flow tables
223 * - Add per flowtable sysctls for statistics and configuring timeouts
224 * - add saturation counter to rtentry to support per-packet load-balancing
225 * add flag to indicate round-robin flow, add list lookup from head
227 * - add sysctl / device node / syscall to support exporting and importing
228 * of flows with flag to indicate that a flow was imported so should
229 * not be considered for auto-cleaning
230 * - support explicit connection state (currently only ad-hoc for DSR)
231 * - idetach() cleanup for options VIMAGE builds.
233 VNET_DEFINE(int, flowtable_enable) = 1;
234 static VNET_DEFINE(int, flowtable_debug);
235 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
236 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
237 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
238 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
239 static VNET_DEFINE(int, flowtable_nmbflows);
240 static VNET_DEFINE(int, flowtable_ready) = 0;
242 #define V_flowtable_enable VNET(flowtable_enable)
243 #define V_flowtable_debug VNET(flowtable_debug)
244 #define V_flowtable_syn_expire VNET(flowtable_syn_expire)
245 #define V_flowtable_udp_expire VNET(flowtable_udp_expire)
246 #define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
247 #define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
248 #define V_flowtable_nmbflows VNET(flowtable_nmbflows)
249 #define V_flowtable_ready VNET(flowtable_ready)
251 SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
252 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
253 &VNET_NAME(flowtable_debug), 0, "print debug info.");
254 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
255 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
258 * XXX This does not end up updating timeouts at runtime
259 * and only reflects the value for the last table added :-/
261 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
262 &VNET_NAME(flowtable_syn_expire), 0,
263 "seconds after which to remove syn allocated flow.");
264 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
265 &VNET_NAME(flowtable_udp_expire), 0,
266 "seconds after which to remove flow allocated to UDP.");
267 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
268 &VNET_NAME(flowtable_fin_wait_expire), 0,
269 "seconds after which to remove a flow in FIN_WAIT.");
270 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
271 &VNET_NAME(flowtable_tcp_expire), 0,
272 "seconds after which to remove flow allocated to a TCP connection.");
276 * Maximum number of flows that can be allocated of a given type.
278 * The table is allocated at boot time (for the pure caching case
279 * there is no reason why this could not be changed at runtime)
280 * and thus (currently) needs to be set with a tunable.
283 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
285 int error, newnmbflows;
287 newnmbflows = V_flowtable_nmbflows;
288 error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
289 if (error == 0 && req->newptr) {
290 if (newnmbflows > V_flowtable_nmbflows) {
291 V_flowtable_nmbflows = newnmbflows;
292 uma_zone_set_max(V_flow_ipv4_zone,
293 V_flowtable_nmbflows);
294 uma_zone_set_max(V_flow_ipv6_zone,
295 V_flowtable_nmbflows);
301 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
302 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
303 "Maximum number of flows allowed");
/*
 * Print one counter of the flowtable_stats pointed to by the local "fs"
 * as "\t<field>: <value>\n".  The counters are uint64_t, so they are
 * widened to uintmax_t and printed unsigned (%ju); the signed %jd used
 * previously would show large counts as negative numbers.
 */
307 #define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %ju\n", #field, (uintmax_t)fs->ft_##field)
310 fs_print(struct sbuf *sb, struct flowtable_stats *fs)
313 FS_PRINT(sb, collisions);
314 FS_PRINT(sb, allocated);
315 FS_PRINT(sb, misses);
316 FS_PRINT(sb, max_depth);
317 FS_PRINT(sb, free_checks);
320 FS_PRINT(sb, lookups);
324 flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
327 struct flowtable_stats fs, *pfs;
329 if (ft->ft_flags & FL_PCPU) {
330 bzero(&fs, sizeof(fs));
333 pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
334 pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
335 pfs->ft_misses += ft->ft_stats[i].ft_misses;
336 pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
337 pfs->ft_frees += ft->ft_stats[i].ft_frees;
338 pfs->ft_hits += ft->ft_stats[i].ft_hits;
339 pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
340 if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
341 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
344 pfs = &ft->ft_stats[0];
350 sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
352 struct flowtable *ft;
356 sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
358 ft = V_flow_list_head;
360 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
361 flowtable_show_stats(sb, ft);
365 error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
370 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
371 NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
376 in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
379 rtalloc_ign_fib(ro, 0, fibnum);
384 flowtable_global_lock(struct flowtable *table, uint32_t hash)
386 int lock_index = (hash)&(table->ft_lock_count - 1);
388 mtx_lock(&table->ft_locks[lock_index]);
392 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
394 int lock_index = (hash)&(table->ft_lock_count - 1);
396 mtx_unlock(&table->ft_locks[lock_index]);
400 flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
407 flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
/*
 * Bucket helpers.  Every expansion is fully parenthesized so the macros
 * compose safely inside larger expressions; in particular FL_ENTRY() is
 * the bucket head itself (an lvalue), so FL_ENTRY(t, h)->f_next reads the
 * head's f_next rather than dereferencing the result of "->".
 *
 * FL_ENTRY_INDEX  - bucket index for a hash (hash modulo table size).
 * FL_ENTRY        - head pointer stored in the bucket for a hash.
 * FL_ENTRY_LOCK   - call the table's lock hook for a hash.
 * FL_ENTRY_UNLOCK - call the table's unlock hook for a hash.
 */
413 #define FL_ENTRY_INDEX(table, hash) ((hash) % (table)->ft_size)
414 #define FL_ENTRY(table, hash) (*flowtable_entry((table), (hash)))
415 #define FL_ENTRY_LOCK(table, hash) ((table)->ft_lock((table), (hash)))
416 #define FL_ENTRY_UNLOCK(table, hash) ((table)->ft_unlock((table), (hash)))
418 #define FL_STALE (1<<8)
419 #define FL_OVERWRITE (1<<10)
422 flow_invalidate(struct flentry *fle)
425 fle->f_flags |= FL_STALE;
429 proto_to_flags(uint8_t proto)
452 flags_to_proto(int flags)
454 int proto, protoflags;
456 protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
457 switch (protoflags) {
462 proto = IPPROTO_SCTP;
475 #ifdef FLOWTABLE_DEBUG
477 ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
478 struct sockaddr_in *dsin)
480 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
482 if (flags & FL_HASH_ALL) {
483 inet_ntoa_r(ssin->sin_addr, saddr);
484 inet_ntoa_r(dsin->sin_addr, daddr);
485 printf("proto=%d %s:%d->%s:%d\n",
486 proto, saddr, ntohs(ssin->sin_port), daddr,
487 ntohs(dsin->sin_port));
489 inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
490 printf("proto=%d %s\n", proto, daddr);
497 ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
498 struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
506 uint16_t sport, dport;
508 proto = sport = dport = 0;
509 ip = mtod(m, struct ip *);
510 dsin->sin_family = AF_INET;
511 dsin->sin_len = sizeof(*dsin);
512 dsin->sin_addr = ip->ip_dst;
513 ssin->sin_family = AF_INET;
514 ssin->sin_len = sizeof(*ssin);
515 ssin->sin_addr = ip->ip_src;
518 if ((*flags & FL_HASH_ALL) == 0) {
519 FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
524 iphlen = ip->ip_hl << 2; /* XXX options? */
528 th = (struct tcphdr *)((caddr_t)ip + iphlen);
529 sport = th->th_sport;
530 dport = th->th_dport;
531 if ((*flags & FL_HASH_ALL) &&
532 (th->th_flags & (TH_RST|TH_FIN)))
536 uh = (struct udphdr *)((caddr_t)ip + iphlen);
537 sport = uh->uh_sport;
538 dport = uh->uh_dport;
541 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
542 sport = sh->src_port;
543 dport = sh->dest_port;
546 FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
548 /* no port - hence not a protocol we care about */
554 *flags |= proto_to_flags(proto);
555 ssin->sin_port = sport;
556 dsin->sin_port = dport;
561 ipv4_flow_lookup_hash_internal(
562 struct sockaddr_in *ssin, struct sockaddr_in *dsin,
563 uint32_t *key, uint16_t flags)
565 uint16_t sport, dport;
569 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
571 proto = flags_to_proto(flags);
572 sport = dport = key[2] = key[1] = key[0] = 0;
573 if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
574 key[1] = ssin->sin_addr.s_addr;
575 sport = ssin->sin_port;
578 key[2] = dsin->sin_addr.s_addr;
579 dport = dsin->sin_port;
581 if (flags & FL_HASH_ALL) {
582 ((uint16_t *)key)[0] = sport;
583 ((uint16_t *)key)[1] = dport;
585 offset = V_flow_hashjitter + proto;
587 return (jenkins_hashword(key, 3, offset));
590 static struct flentry *
591 flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
593 struct sockaddr_storage ssa, dsa;
595 struct sockaddr_in *dsin, *ssin;
597 dsin = (struct sockaddr_in *)&dsa;
598 ssin = (struct sockaddr_in *)&ssa;
599 bzero(dsin, sizeof(*dsin));
600 bzero(ssin, sizeof(*ssin));
601 flags = ft->ft_flags;
602 if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
605 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
609 flow_to_route(struct flentry *fle, struct route *ro)
611 uint32_t *hashkey = NULL;
612 struct sockaddr_in *sin;
614 sin = (struct sockaddr_in *)&ro->ro_dst;
615 sin->sin_family = AF_INET;
616 sin->sin_len = sizeof(*sin);
617 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
618 sin->sin_addr.s_addr = hashkey[2];
619 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
620 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
626 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
627 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
628 * pointer might become stale after other pullups (but we never use it
631 #define PULLUP_TO(_len, p, T) \
633 int x = (_len) + sizeof(T); \
634 if ((m)->m_len < x) { \
635 goto receive_failed; \
637 p = (mtod(m, char *) + (_len)); \
640 #define TCP(p) ((struct tcphdr *)(p))
641 #define SCTP(p) ((struct sctphdr *)(p))
642 #define UDP(p) ((struct udphdr *)(p))
645 ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
646 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
651 uint16_t src_port, dst_port;
655 offset = hlen = src_port = dst_port = 0;
657 ip6 = mtod(m, struct ip6_hdr *);
658 hlen = sizeof(struct ip6_hdr);
659 proto = ip6->ip6_nxt;
661 if ((*flags & FL_HASH_ALL) == 0)
664 while (ulp == NULL) {
667 case IPPROTO_OSPFIGP:
675 PULLUP_TO(hlen, ulp, struct tcphdr);
676 dst_port = TCP(ulp)->th_dport;
677 src_port = TCP(ulp)->th_sport;
678 if ((*flags & FL_HASH_ALL) &&
679 (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
683 PULLUP_TO(hlen, ulp, struct sctphdr);
684 src_port = SCTP(ulp)->src_port;
685 dst_port = SCTP(ulp)->dest_port;
688 PULLUP_TO(hlen, ulp, struct udphdr);
689 dst_port = UDP(ulp)->uh_dport;
690 src_port = UDP(ulp)->uh_sport;
692 case IPPROTO_HOPOPTS: /* RFC 2460 */
693 PULLUP_TO(hlen, ulp, struct ip6_hbh);
694 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
695 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
698 case IPPROTO_ROUTING: /* RFC 2460 */
699 PULLUP_TO(hlen, ulp, struct ip6_rthdr);
700 hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
701 proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
704 case IPPROTO_FRAGMENT: /* RFC 2460 */
705 PULLUP_TO(hlen, ulp, struct ip6_frag);
706 hlen += sizeof (struct ip6_frag);
707 proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
708 offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
712 case IPPROTO_DSTOPTS: /* RFC 2460 */
713 PULLUP_TO(hlen, ulp, struct ip6_hbh);
714 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
715 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
718 case IPPROTO_AH: /* RFC 2402 */
719 PULLUP_TO(hlen, ulp, struct ip6_ext);
720 hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
721 proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
725 PULLUP_TO(hlen, ulp, struct ip6_ext);
736 dsin6->sin6_family = AF_INET6;
737 dsin6->sin6_len = sizeof(*dsin6);
738 dsin6->sin6_port = dst_port;
739 memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
741 ssin6->sin6_family = AF_INET6;
742 ssin6->sin6_len = sizeof(*ssin6);
743 ssin6->sin6_port = src_port;
744 memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
745 *flags |= proto_to_flags(proto);
750 #define zero_key(key) \
764 ipv6_flow_lookup_hash_internal(
765 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
766 uint32_t *key, uint16_t flags)
768 uint16_t sport, dport;
772 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
775 proto = flags_to_proto(flags);
779 memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
780 dport = dsin6->sin6_port;
782 if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
783 memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
784 sport = ssin6->sin6_port;
786 if (flags & FL_HASH_ALL) {
787 ((uint16_t *)key)[0] = sport;
788 ((uint16_t *)key)[1] = dport;
790 offset = V_flow_hashjitter + proto;
792 return (jenkins_hashword(key, 9, offset));
795 static struct flentry *
796 flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
798 struct sockaddr_storage ssa, dsa;
799 struct sockaddr_in6 *dsin6, *ssin6;
802 dsin6 = (struct sockaddr_in6 *)&dsa;
803 ssin6 = (struct sockaddr_in6 *)&ssa;
804 bzero(dsin6, sizeof(*dsin6));
805 bzero(ssin6, sizeof(*ssin6));
806 flags = ft->ft_flags;
808 if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
811 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
815 flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
817 uint32_t *hashkey = NULL;
818 struct sockaddr_in6 *sin6;
820 sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
822 sin6->sin6_family = AF_INET6;
823 sin6->sin6_len = sizeof(*sin6);
824 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
825 memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
826 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
827 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
833 flowtable_mask(struct flowtable *ft)
837 if (ft->ft_flags & FL_PCPU)
838 mask = ft->ft_masks[curcpu];
840 mask = ft->ft_masks[0];
845 static struct flentry **
846 flowtable_entry(struct flowtable *ft, uint32_t hash)
848 struct flentry **fle;
849 int index = (hash % ft->ft_size);
851 if (ft->ft_flags & FL_PCPU) {
852 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
853 fle = &ft->ft_table.pcpu[curcpu][index];
855 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
856 fle = &ft->ft_table.global[index];
863 flow_stale(struct flowtable *ft, struct flentry *fle)
867 if ((fle->f_fhash == 0)
868 || ((fle->f_rt->rt_flags & RTF_HOST) &&
869 ((fle->f_rt->rt_flags & (RTF_UP))
871 || (fle->f_rt->rt_ifp == NULL)
872 || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
875 idle_time = time_uptime - fle->f_uptime;
877 if ((fle->f_flags & FL_STALE) ||
878 ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
879 && (idle_time > ft->ft_udp_idle)) ||
880 ((fle->f_flags & TH_FIN)
881 && (idle_time > ft->ft_fin_wait_idle)) ||
882 ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
883 && (idle_time > ft->ft_syn_idle)) ||
884 ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
885 && (idle_time > ft->ft_tcp_idle)) ||
886 ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
887 (fle->f_rt->rt_ifp == NULL)))
/*
 * Select the key array stored in the flow entry and walk its nwords
 * words together with the caller-supplied key (presumably copying key
 * into the entry; the loop body is outside this view).
 *
 * fle: flow entry; FL_IPV6 in f_flags selects the IPv6 layout.
 * key: caller's key words.
 */
894 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
899 if (fle->f_flags & FL_IPV6) {
/* IPv6 entries are struct flentry_v6; use the matching layout. */
901 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
/* Everything else is an IPv4 entry (struct flentry_v4). */
904 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
907 for (i = 0; i < nwords; i++)
911 static struct flentry *
912 flow_alloc(struct flowtable *ft)
914 struct flentry *newfle;
918 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
920 newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
922 atomic_add_int(&ft->ft_count, 1);
927 flow_free(struct flentry *fle, struct flowtable *ft)
931 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
932 atomic_add_int(&ft->ft_count, -1);
933 uma_zfree(zone, fle);
937 flow_full(struct flowtable *ft)
943 count = ft->ft_count;
945 if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
947 else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
950 if (full && !ft->ft_full) {
951 flowclean_freq = 4*hz;
952 if ((ft->ft_flags & FL_HASH_ALL) == 0)
953 ft->ft_udp_idle = ft->ft_fin_wait_idle =
954 ft->ft_syn_idle = ft->ft_tcp_idle = 5;
955 cv_broadcast(&flowclean_c_cv);
956 } else if (!full && ft->ft_full) {
957 flowclean_freq = 20*hz;
958 if ((ft->ft_flags & FL_HASH_ALL) == 0)
959 ft->ft_udp_idle = ft->ft_fin_wait_idle =
960 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
963 return (ft->ft_full);
967 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
968 uint32_t fibnum, struct route *ro, uint16_t flags)
970 struct flentry *fle, *fletail, *newfle, **flep;
971 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
976 newfle = flow_alloc(ft);
980 newfle->f_flags |= (flags & FL_IPV6);
981 proto = flags_to_proto(flags);
983 FL_ENTRY_LOCK(ft, hash);
984 mask = flowtable_mask(ft);
985 flep = flowtable_entry(ft, hash);
986 fletail = fle = *flep;
989 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
990 *flep = fle = newfle;
997 * find end of list and make sure that we were not
998 * preempted by another thread handling this flow
1000 while (fle != NULL) {
1001 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
1003 * there was either a hash collision
1004 * or we lost a race to insert
1006 FL_ENTRY_UNLOCK(ft, hash);
1007 flow_free(newfle, ft);
1009 if (flags & FL_OVERWRITE)
1014 * re-visit this double condition XXX
1016 if (fletail->f_next != NULL)
1017 fletail = fle->f_next;
1023 if (depth > fs->ft_max_depth)
1024 fs->ft_max_depth = depth;
1025 fletail->f_next = newfle;
1028 flowtable_set_hashkey(fle, key);
1030 fle->f_proto = proto;
1031 fle->f_rt = ro->ro_rt;
1032 fle->f_lle = ro->ro_lle;
1033 fle->f_fhash = hash;
1034 fle->f_fibnum = fibnum;
1035 fle->f_uptime = time_uptime;
1036 FL_ENTRY_UNLOCK(ft, hash);
1041 kern_flowtable_insert(struct flowtable *ft,
1042 struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1043 struct route *ro, uint32_t fibnum, int flags)
1045 uint32_t key[9], hash;
1047 flags = (ft->ft_flags | flags | FL_OVERWRITE);
1051 if (ssa->ss_family == AF_INET)
1052 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1053 (struct sockaddr_in *)dsa, key, flags);
1056 if (ssa->ss_family == AF_INET6)
1057 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1058 (struct sockaddr_in6 *)dsa, key, flags);
1060 if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1063 FLDPRINTF(ft, FL_DEBUG,
1064 "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1065 key[0], key[1], key[2], hash, fibnum, flags);
1066 return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
/*
 * Compare the key stored in a flow entry with a caller-supplied key,
 * word by word over nwords words.
 *
 * fle: flow entry; FL_IPV6 in f_flags selects the IPv6 layout.
 * key: key words to compare against.
 */
1070 flowtable_key_equal(struct flentry *fle, uint32_t *key)
1075 if (fle->f_flags & FL_IPV6) {
/* IPv6 entries are struct flentry_v6; use the matching layout. */
1077 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
/* Everything else is an IPv4 entry (struct flentry_v4). */
1080 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1083 for (i = 0; i < nwords; i++)
/* A single differing word means the keys do not match. */
1084 if (hashkey[i] != key[i])
1091 flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
1093 struct flentry *fle = NULL;
1097 fle = flowtable_lookup_mbuf4(ft, m);
1101 fle = flowtable_lookup_mbuf6(ft, m);
1103 if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
1104 m->m_flags |= M_FLOWID;
1105 m->m_pkthdr.flowid = fle->f_fhash;
1111 flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
1112 struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
1114 uint32_t key[9], hash;
1115 struct flentry *fle;
1116 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1120 struct llentry *lle;
1121 struct route sro, *ro;
1122 struct route_in6 sro6;
1124 sro.ro_rt = sro6.ro_rt = NULL;
1125 sro.ro_lle = sro6.ro_lle = NULL;
1128 flags |= ft->ft_flags;
1129 proto = flags_to_proto(flags);
1131 if (ssa->ss_family == AF_INET) {
1132 struct sockaddr_in *ssin, *dsin;
1135 memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
1137 * The harvested source and destination addresses
1138 * may contain port information if the packet is
1139 * from a transport protocol (e.g. TCP/UDP). The
1140 * port field must be cleared before performing
1143 ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
1144 dsin = (struct sockaddr_in *)dsa;
1145 ssin = (struct sockaddr_in *)ssa;
1146 if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
1147 (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1148 (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
1151 hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
1155 if (ssa->ss_family == AF_INET6) {
1156 struct sockaddr_in6 *ssin6, *dsin6;
1158 ro = (struct route *)&sro6;
1159 memcpy(&sro6.ro_dst, dsa,
1160 sizeof(struct sockaddr_in6));
1161 ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
1162 dsin6 = (struct sockaddr_in6 *)dsa;
1163 ssin6 = (struct sockaddr_in6 *)ssa;
1166 hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
1170 * Ports are zero and this isn't a transmit cache
1171 * - thus not a protocol for which we need to keep
1173 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
1175 if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
1179 FL_ENTRY_LOCK(ft, hash);
1180 if ((fle = FL_ENTRY(ft, hash)) == NULL) {
1181 FL_ENTRY_UNLOCK(ft, hash);
1185 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1186 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1188 && fle->f_fhash == hash
1189 && flowtable_key_equal(fle, key)
1190 && (proto == fle->f_proto)
1191 && (fibnum == fle->f_fibnum)
1192 && (rt->rt_flags & RTF_UP)
1193 && (rt->rt_ifp != NULL)) {
1195 fle->f_uptime = time_uptime;
1196 fle->f_flags |= flags;
1197 FL_ENTRY_UNLOCK(ft, hash);
1199 } else if (fle->f_next != NULL) {
1203 FL_ENTRY_UNLOCK(ft, hash);
1205 if (flags & FL_NOAUTO || flow_full(ft))
1210 * This bit of code ends up locking the
1211 * same route 3 times (just like ip_output + ether_output)
1213 * - in rt_check when called by arpresolve
1214 * - dropping the refcount for the rtentry
1216 * This could be consolidated to one if we wrote a variant
1217 * of arpresolve with an rt_check variant that expected to
1218 * receive the route locked
1222 if ((ro->ro_dst.sa_family != AF_INET) &&
1223 (ro->ro_dst.sa_family != AF_INET6))
1224 panic("sa_family == %d\n", ro->ro_dst.sa_family);
1227 ft->ft_rtalloc(ro, hash, fibnum);
1228 if (ro->ro_rt == NULL)
1229 error = ENETUNREACH;
1231 struct llentry *lle = NULL;
1232 struct sockaddr_storage *l3addr;
1233 struct rtentry *rt = ro->ro_rt;
1234 struct ifnet *ifp = rt->rt_ifp;
1236 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
1242 if (ssa->ss_family == AF_INET6) {
1243 struct sockaddr_in6 *dsin6;
1245 dsin6 = (struct sockaddr_in6 *)dsa;
1246 if (in6_localaddr(&dsin6->sin6_addr)) {
1252 if (rt->rt_flags & RTF_GATEWAY)
1253 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1256 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1257 llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp);
1261 if (ssa->ss_family == AF_INET) {
1262 if (rt->rt_flags & RTF_GATEWAY)
1263 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1265 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1266 llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
1277 error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
1287 return ((error) ? NULL : fle);
1291 * used by the bit_alloc macro
1293 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
/*
 * Allocate and initialize a flow table with nentry buckets and append it
 * to this vnet's list of tables (V_flow_list_head).
 *
 * name:   table name (its use is outside the lines visible here).
 * nentry: number of hash buckets; must be > 0.
 * flags:  FL_PCPU selects one bucket array and bitmask per CPU with the
 *         pcpu lock hooks; otherwise one global bucket array protected by
 *         an array of mutexes is used.  FL_HASH_ALL selects the
 *         per-protocol idle timeouts taken from the expire settings.
 *
 * All allocations use M_WAITOK.
 */
1296 flowtable_alloc(char *name, int nentry, int flags)
1298 struct flowtable *ft, *fttail;
/* Pick the per-vnet hash seed once, on first use. */
1301 if (V_flow_hashjitter == 0)
1302 V_flow_hashjitter = arc4random();
1304 KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
1306 ft = malloc(sizeof(struct flowtable),
1307 M_RTABLE, M_WAITOK | M_ZERO);
1310 ft->ft_flags = flags;
1311 ft->ft_size = nentry;
1313 ft->ft_rtalloc = rtalloc_mpath_fib;
1315 ft->ft_rtalloc = in_rtalloc_ign_wrapper;
1317 if (flags & FL_PCPU) {
1318 ft->ft_lock = flowtable_pcpu_lock;
1319 ft->ft_unlock = flowtable_pcpu_unlock;
1321 for (i = 0; i <= mp_maxid; i++) {
1322 ft->ft_table.pcpu[i] =
1323 malloc(nentry*sizeof(struct flentry *),
1324 M_RTABLE, M_WAITOK | M_ZERO);
1325 ft->ft_masks[i] = bit_alloc(nentry);
/*
 * The lock index is computed as hash & (ft_lock_count - 1), so the count
 * must be a power of two: twice the CPU count when that is a power of
 * two, otherwise twice the next power of two above it (1 << fls(n)).
 */
1328 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
1329 (1 << fls(mp_maxid + 1)));
1331 ft->ft_lock = flowtable_global_lock;
1332 ft->ft_unlock = flowtable_global_unlock;
1333 ft->ft_table.global =
1334 malloc(nentry*sizeof(struct flentry *),
1335 M_RTABLE, M_WAITOK | M_ZERO);
1336 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
1337 M_RTABLE, M_WAITOK | M_ZERO);
1338 for (i = 0; i < ft->ft_lock_count; i++)
1339 mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
1341 ft->ft_masks[0] = bit_alloc(nentry);
1343 ft->ft_tmpmask = bit_alloc(nentry);
1346 * In the local transmit case the table truly is
1347 * just a cache - so everything is eligible for
1348 * replacement after 30s of non-use (5s while the table is full)
1350 if (flags & FL_HASH_ALL) {
1351 ft->ft_udp_idle = V_flowtable_udp_expire;
1352 ft->ft_syn_idle = V_flowtable_syn_expire;
1353 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
/* Established TCP flows use the tcp_expire setting, not fin_wait_expire. */
1354 ft->ft_tcp_idle = V_flowtable_tcp_expire;
1356 ft->ft_udp_idle = ft->ft_fin_wait_idle =
1357 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
1362 * hook in to the cleaner list
1364 if (V_flow_list_head == NULL)
1365 V_flow_list_head = ft;
1367 fttail = V_flow_list_head;
1368 while (fttail->ft_next != NULL)
1369 fttail = fttail->ft_next;
1370 fttail->ft_next = ft;
1377 * The rest of the code is devoted to garbage collection of expired entries.
1378 * It is a new addition made necessary by the switch to dynamically allocating
1383 fle_free(struct flentry *fle, struct flowtable *ft)
1386 struct llentry *lle;
1388 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1389 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
/*
 * Scan the occupied buckets of a flow table and unlink entries onto a
 * private free list, which is drained after the scan.
 *
 * ft: table to scan; the bitmask returned by flowtable_mask() marks the
 *     occupied buckets.
 * rt: route compared against each entry's f_rt; callers in this file
 *     pass either a specific route or NULL (NOTE(review): the handling of
 *     a mismatch is outside the lines visible here).
 *
 * The scan works on a snapshot of the mask held in ft_tmpmask: bits are
 * cleared from the snapshot as buckets are visited, and cleared from the
 * live mask only when a bucket's chain becomes empty.  Each bucket is
 * processed under its entry lock.
 */
1398 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
1400 int curbit = 0, count;
1401 struct flentry *fle, **flehead, *fleprev;
1402 struct flentry *flefreehead, *flefreetail, *fletmp;
1403 bitstr_t *mask, *tmpmask;
1404 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1406 flefreehead = flefreetail = NULL;
1407 mask = flowtable_mask(ft);
1408 tmpmask = ft->ft_tmpmask;
/*
 * Copy the whole bitmap, including a trailing partial byte: ft_size/8
 * rounds down, which left the last ft_size % 8 buckets out of the
 * snapshot so they were never scanned.
 */
1409 memcpy(tmpmask, mask, bitstr_size(ft->ft_size));
1411 * XXX Note to self, bit_ffs operates at the byte level
1412 * and thus adds gratuitous overhead
1414 bit_ffs(tmpmask, ft->ft_size, &curbit);
1415 while (curbit != -1) {
1416 if (curbit >= ft->ft_size || curbit < -1) {
1418 "warning: bad curbit value %d \n",
1423 FL_ENTRY_LOCK(ft, curbit);
1424 flehead = flowtable_entry(ft, curbit);
1425 fle = fleprev = *flehead;
1427 fs->ft_free_checks++;
1429 if (fle == NULL && curbit > 0) {
1431 "warning bit=%d set, but no fle found\n",
1435 while (fle != NULL) {
1437 if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
1442 } else if (!flow_stale(ft, fle)) {
1448 * delete head of the list
1450 if (fleprev == *flehead) {
1452 if (fle == fleprev) {
1453 fleprev = *flehead = fle->f_next;
1455 fleprev = *flehead = fle;
1459 * don't advance fleprev
1462 fleprev->f_next = fle->f_next;
1463 fle = fleprev->f_next;
/* Append the unlinked entry to the tail of the private free list. */
1466 if (flefreehead == NULL)
1467 flefreehead = flefreetail = fletmp;
1469 flefreetail->f_next = fletmp;
1470 flefreetail = fletmp;
1472 fletmp->f_next = NULL;
/* The bucket is now empty: drop it from the live occupancy mask. */
1474 if (*flehead == NULL)
1475 bit_clear(mask, curbit);
1476 FL_ENTRY_UNLOCK(ft, curbit);
1477 bit_clear(tmpmask, curbit);
1478 bit_ffs(tmpmask, ft->ft_size, &curbit);
/* Drain the free list now that no bucket lock is held. */
1481 while ((fle = flefreehead) != NULL) {
1482 flefreehead = fle->f_next;
1487 if (V_flowtable_debug && count)
1488 log(LOG_DEBUG, "freed %d flow entries\n", count);
1492 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1496 if (ft->ft_flags & FL_PCPU) {
1498 if (smp_started == 1) {
1499 thread_lock(curthread);
1500 sched_bind(curthread, i);
1501 thread_unlock(curthread);
1504 flowtable_free_stale(ft, rt);
1506 if (smp_started == 1) {
1507 thread_lock(curthread);
1508 sched_unbind(curthread);
1509 thread_unlock(curthread);
1513 flowtable_free_stale(ft, rt);
1518 flowtable_clean_vnet(void)
1520 struct flowtable *ft;
1523 ft = V_flow_list_head;
1524 while (ft != NULL) {
1525 if (ft->ft_flags & FL_PCPU) {
1527 if (smp_started == 1) {
1528 thread_lock(curthread);
1529 sched_bind(curthread, i);
1530 thread_unlock(curthread);
1533 flowtable_free_stale(ft, NULL);
1535 if (smp_started == 1) {
1536 thread_lock(curthread);
1537 sched_unbind(curthread);
1538 thread_unlock(curthread);
1542 flowtable_free_stale(ft, NULL);
1549 flowtable_cleaner(void)
1551 VNET_ITERATOR_DECL(vnet_iter);
1555 log(LOG_INFO, "flowtable cleaner started\n");
1559 VNET_FOREACH(vnet_iter) {
1560 CURVNET_SET(vnet_iter);
1561 flowtable_clean_vnet();
1564 VNET_LIST_RUNLOCK();
1567 * The 10 second interval between cleaning checks
1570 mtx_lock(&flowclean_lock);
1572 sched_prio(td, PPAUSE);
1575 cv_broadcast(&flowclean_f_cv);
1576 cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
1577 mtx_unlock(&flowclean_lock);
1582 flowtable_flush(void *unused __unused)
1586 mtx_lock(&flowclean_lock);
1587 start = flowclean_cycles;
1588 while (start == flowclean_cycles) {
1589 cv_broadcast(&flowclean_c_cv);
1590 cv_wait(&flowclean_f_cv, &flowclean_lock);
1592 mtx_unlock(&flowclean_lock);
1595 static struct kproc_desc flow_kp = {
1600 SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1603 flowtable_init_vnet(const void *unused __unused)
1606 V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
1607 V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
1608 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1609 V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
1610 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1611 uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
1612 uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
1613 V_flowtable_ready = 1;
1615 VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
1616 flowtable_init_vnet, NULL);
1619 flowtable_init(const void *unused __unused)
1622 cv_init(&flowclean_c_cv, "c_flowcleanwait");
1623 cv_init(&flowclean_f_cv, "f_flowcleanwait");
1624 mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
1625 EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
1626 EVENTHANDLER_PRI_ANY);
1627 flowclean_freq = 20*hz;
1629 SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
1630 flowtable_init, NULL);
1635 flowtable_uninit(const void *unused __unused)
1638 V_flowtable_ready = 0;
1639 uma_zdestroy(V_flow_ipv4_zone);
1640 uma_zdestroy(V_flow_ipv6_zone);
1643 VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1644 flowtable_uninit, NULL);
1649 flowtable_get_hashkey(struct flentry *fle)
1653 if (fle->f_flags & FL_IPV6)
1654 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1656 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1662 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1666 if (ft->ft_flags & FL_PCPU)
1667 mask = ft->ft_masks[cpuid];
1669 mask = ft->ft_masks[0];
1674 static struct flentry **
1675 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1677 struct flentry **fle;
1678 int index = (hash % ft->ft_size);
1680 if (ft->ft_flags & FL_PCPU) {
1681 fle = &ft->ft_table.pcpu[cpuid][index];
1683 fle = &ft->ft_table.global[index];
1690 flow_show(struct flowtable *ft, struct flentry *fle)
1693 int rt_valid, ifp_valid;
1694 uint16_t sport, dport;
1696 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
1697 volatile struct rtentry *rt;
1698 struct ifnet *ifp = NULL;
1700 idle_time = (int)(time_uptime - fle->f_uptime);
1702 rt_valid = rt != NULL;
1705 ifp_valid = ifp != NULL;
1706 hashkey = flowtable_get_hashkey(fle);
1707 if (fle->f_flags & FL_IPV6)
1710 inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
1711 if (ft->ft_flags & FL_HASH_ALL) {
1712 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
1713 sport = ntohs(((uint16_t *)hashkey)[0]);
1714 dport = ntohs(((uint16_t *)hashkey)[1]);
1715 db_printf("%s:%d->%s:%d",
1716 saddr, sport, daddr,
1719 db_printf("%s ", daddr);
1722 if (fle->f_flags & FL_STALE)
1723 db_printf(" FL_STALE ");
1724 if (fle->f_flags & FL_TCP)
1725 db_printf(" FL_TCP ");
1726 if (fle->f_flags & FL_UDP)
1727 db_printf(" FL_UDP ");
1729 if (rt->rt_flags & RTF_UP)
1730 db_printf(" RTF_UP ");
1733 if (ifp->if_flags & IFF_LOOPBACK)
1734 db_printf(" IFF_LOOPBACK ");
1735 if (ifp->if_flags & IFF_UP)
1736 db_printf(" IFF_UP ");
1737 if (ifp->if_flags & IFF_POINTOPOINT)
1738 db_printf(" IFF_POINTOPOINT ");
1740 if (fle->f_flags & FL_IPV6)
1741 db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
1742 hashkey[0], hashkey[1], hashkey[2],
1743 hashkey[3], hashkey[4], hashkey[5],
1744 hashkey[6], hashkey[7], hashkey[8]);
1746 db_printf("\n\tkey=%08x:%08x:%08x ",
1747 hashkey[0], hashkey[1], hashkey[2]);
1748 db_printf("hash=%08x idle_time=%03d"
1749 "\n\tfibnum=%02d rt=%p",
1750 fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
1755 flowtable_show(struct flowtable *ft, int cpuid)
1758 struct flentry *fle, **flehead;
1759 bitstr_t *mask, *tmpmask;
1762 db_printf("cpu: %d\n", cpuid);
1763 mask = flowtable_mask_pcpu(ft, cpuid);
1764 tmpmask = ft->ft_tmpmask;
1765 memcpy(tmpmask, mask, ft->ft_size/8);
1767 * XXX Note to self, bit_ffs operates at the byte level
1768 * and thus adds gratuitous overhead
1770 bit_ffs(tmpmask, ft->ft_size, &curbit);
1771 while (curbit != -1) {
1772 if (curbit >= ft->ft_size || curbit < -1) {
1773 db_printf("warning: bad curbit value %d \n",
1778 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1781 while (fle != NULL) {
1786 bit_clear(tmpmask, curbit);
1787 bit_ffs(tmpmask, ft->ft_size, &curbit);
1792 flowtable_show_vnet(void)
1794 struct flowtable *ft;
1797 ft = V_flow_list_head;
1798 while (ft != NULL) {
1799 printf("name: %s\n", ft->ft_name);
1800 if (ft->ft_flags & FL_PCPU) {
1802 flowtable_show(ft, i);
1805 flowtable_show(ft, -1);
1811 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1813 VNET_ITERATOR_DECL(vnet_iter);
1815 VNET_FOREACH(vnet_iter) {
1816 CURVNET_SET(vnet_iter);
1818 db_printf("vnet %p\n", vnet_iter);
1820 flowtable_show_vnet();