1 /**************************************************************************
3 Copyright (c) 2008-2010, BitGravity Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the BitGravity Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include "opt_route.h"
31 #include "opt_mpath.h"
34 #include "opt_inet6.h"
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
39 #include <sys/param.h>
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/kernel.h>
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
51 #include <sys/sched.h>
53 #include <sys/socket.h>
54 #include <sys/syslog.h>
55 #include <sys/sysctl.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_var.h>
60 #include <net/route.h>
61 #include <net/flowtable.h>
64 #include <netinet/in.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
70 #include <netinet/ip6.h>
72 #include <netinet/tcp.h>
73 #include <netinet/udp.h>
74 #include <netinet/sctp.h>
76 #include <libkern/jenkins.h>
80 uint16_t ip_sport; /* source port */
81 uint16_t ip_dport; /* destination port */
82 in_addr_t ip_saddr; /* source address */
83 in_addr_t ip_daddr; /* destination address */
87 struct ipv4_tuple ipf_ipt;
92 uint16_t ip_sport; /* source port */
93 uint16_t ip_dport; /* destination port */
94 struct in6_addr ip_saddr; /* source address */
95 struct in6_addr ip_daddr; /* destination address */
99 struct ipv6_tuple ipf_ipt;
104 volatile uint32_t f_fhash; /* hash flowing forward */
105 uint16_t f_flags; /* flow flags */
107 uint8_t f_proto; /* protocol */
108 uint32_t f_fibnum; /* fib index */
109 uint32_t f_uptime; /* uptime at last access */
110 struct flentry *f_next; /* pointer to collision entry */
111 volatile struct rtentry *f_rt; /* rtentry for flow */
112 volatile struct llentry *f_lle; /* llentry for flow */
116 struct flentry fl_entry;
117 union ipv4_flow fl_flow;
121 struct flentry fl_entry;
122 union ipv6_flow fl_flow;
125 #define fl_fhash fl_entry.fl_fhash
126 #define fl_flags fl_entry.fl_flags
127 #define fl_proto fl_entry.fl_proto
128 #define fl_uptime fl_entry.fl_uptime
129 #define fl_rt fl_entry.fl_rt
130 #define fl_lle fl_entry.fl_lle
132 #define SECS_PER_HOUR 3600
133 #define SECS_PER_DAY (24*SECS_PER_HOUR)
137 #define FIN_WAIT_IDLE 600
138 #define TCP_IDLE SECS_PER_DAY
141 typedef void fl_lock_t(struct flowtable *, uint32_t);
142 typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
145 struct flentry **global;
146 struct flentry **pcpu[MAXCPU];
149 struct flowtable_stats {
150 uint64_t ft_collisions;
151 uint64_t ft_allocated;
153 uint64_t ft_max_depth;
154 uint64_t ft_free_checks;
158 } __aligned(CACHE_LINE_SIZE);
161 struct flowtable_stats ft_stats[MAXCPU];
167 fl_lock_t *ft_unlock;
168 fl_rtalloc_t *ft_rtalloc;
170 * XXX need to pad out
172 struct mtx *ft_locks;
173 union flentryp ft_table;
174 bitstr_t *ft_masks[MAXCPU];
175 bitstr_t *ft_tmpmask;
176 struct flowtable *ft_next;
178 uint32_t ft_count __aligned(CACHE_LINE_SIZE);
179 uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE);
180 uint32_t ft_fin_wait_idle;
181 uint32_t ft_syn_idle;
182 uint32_t ft_tcp_idle;
184 } __aligned(CACHE_LINE_SIZE);
186 static struct proc *flowcleanerproc;
187 static VNET_DEFINE(struct flowtable *, flow_list_head);
188 static VNET_DEFINE(uint32_t, flow_hashjitter);
189 static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
190 static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
192 #define V_flow_list_head VNET(flow_list_head)
193 #define V_flow_hashjitter VNET(flow_hashjitter)
194 #define V_flow_ipv4_zone VNET(flow_ipv4_zone)
195 #define V_flow_ipv6_zone VNET(flow_ipv6_zone)
198 static struct cv flowclean_f_cv;
199 static struct cv flowclean_c_cv;
200 static struct mtx flowclean_lock;
201 static uint32_t flowclean_cycles;
202 static uint32_t flowclean_freq;
204 #ifdef FLOWTABLE_DEBUG
205 #define FLDPRINTF(ft, flags, fmt, ...) \
207 if ((ft)->ft_flags & (flags)) \
208 printf((fmt), __VA_ARGS__); \
212 #define FLDPRINTF(ft, flags, fmt, ...)
219 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
220 * to avoid extra cache evictions caused by incrementing a shared
222 * - add sysctls to resize && flush flow tables
223 * - Add per flowtable sysctls for statistics and configuring timeouts
224 * - add saturation counter to rtentry to support per-packet load-balancing
225 * add flag to indicate round-robin flow, add list lookup from head
227 * - add sysctl / device node / syscall to support exporting and importing
228 * of flows with flag to indicate that a flow was imported so should
229 * not be considered for auto-cleaning
230 * - support explicit connection state (currently only ad-hoc for DSR)
231 * - idetach() cleanup for options VIMAGE builds.
233 VNET_DEFINE(int, flowtable_enable) = 1;
234 static VNET_DEFINE(int, flowtable_debug);
235 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
236 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
237 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
238 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
239 static VNET_DEFINE(int, flowtable_nmbflows);
240 static VNET_DEFINE(int, flowtable_ready) = 0;
242 #define V_flowtable_enable VNET(flowtable_enable)
243 #define V_flowtable_debug VNET(flowtable_debug)
244 #define V_flowtable_syn_expire VNET(flowtable_syn_expire)
245 #define V_flowtable_udp_expire VNET(flowtable_udp_expire)
246 #define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
247 #define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
248 #define V_flowtable_nmbflows VNET(flowtable_nmbflows)
249 #define V_flowtable_ready VNET(flowtable_ready)
251 SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
252 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
253 &VNET_NAME(flowtable_debug), 0, "print debug info.");
254 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
255 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
258 * XXX This does not end up updating timeouts at runtime
259 * and only reflects the value for the last table added :-/
261 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
262 &VNET_NAME(flowtable_syn_expire), 0,
263 "seconds after which to remove syn allocated flow.");
264 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
265 &VNET_NAME(flowtable_udp_expire), 0,
266 "seconds after which to remove flow allocated to UDP.");
267 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
268 &VNET_NAME(flowtable_fin_wait_expire), 0,
269 "seconds after which to remove a flow in FIN_WAIT.");
270 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
271 &VNET_NAME(flowtable_tcp_expire), 0,
272 "seconds after which to remove flow allocated to a TCP connection.");
276 * Maximum number of flows that can be allocated of a given type.
278 * The table is allocated at boot time (for the pure caching case
279 * there is no reason why this could not be changed at runtime)
280 * and thus (currently) needs to be set with a tunable.
/*
 * Sysctl handler for net.inet.flowtable.nmbflows: the cap on the number of
 * flow entries that may be allocated.  The limit can only be raised at
 * runtime; the new value is pushed into both UMA zones.
 * NOTE(review): lines are missing from this listing, so the rejection /
 * return path of the handler is not visible here.
 */
283 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
285 int error, newnmbflows;
287 newnmbflows = V_flowtable_nmbflows;
288 error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
/* req->newptr != NULL means this is a write (set) request. */
289 if (error == 0 && req->newptr) {
/* Only growing the limit is supported here. */
290 if (newnmbflows > V_flowtable_nmbflows) {
291 V_flowtable_nmbflows = newnmbflows;
292 uma_zone_set_max(V_flow_ipv4_zone,
293 V_flowtable_nmbflows);
294 uma_zone_set_max(V_flow_ipv6_zone,
295 V_flowtable_nmbflows);
301 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
302 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
303 "Maximum number of flows allowed");
307 #define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
/*
 * Dump one flowtable_stats structure into the supplied sbuf, one
 * "\t<field>: <value>" line per counter (see the FS_PRINT macro above).
 */
310 fs_print(struct sbuf *sb, struct flowtable_stats *fs)
313 FS_PRINT(sb, collisions);
314 FS_PRINT(sb, allocated);
315 FS_PRINT(sb, misses);
316 FS_PRINT(sb, max_depth);
317 FS_PRINT(sb, free_checks);
320 FS_PRINT(sb, lookups);
/*
 * Render the statistics for one flowtable into sb.  For per-CPU tables the
 * per-CPU counters are summed (and ft_max_depth maximized) into a local
 * aggregate first; for global tables slot 0 is reported directly.
 */
324 flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
327 struct flowtable_stats fs, *pfs;
329 if (ft->ft_flags & FL_PCPU) {
330 bzero(&fs, sizeof(fs));
/* Accumulate each CPU's counters into the zeroed aggregate. */
333 pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
334 pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
335 pfs->ft_misses += ft->ft_stats[i].ft_misses;
336 pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
337 pfs->ft_frees += ft->ft_stats[i].ft_frees;
338 pfs->ft_hits += ft->ft_stats[i].ft_hits;
339 pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
/* max_depth is a high-water mark, not additive. */
340 if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
341 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
/* Global table: a single stats slot holds everything. */
344 pfs = &ft->ft_stats[0];
/*
 * Sysctl handler for net.inet.flowtable.stats: walks the vnet's flowtable
 * list, prints each table's name and statistics into a fixed 64KB sbuf,
 * then copies the text out to userland (including the NUL terminator).
 */
350 sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
352 struct flowtable *ft;
356 sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
358 ft = V_flow_list_head;
360 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
361 flowtable_show_stats(sb, ft);
/* sbuf_len(sb) + 1 so the terminating NUL is copied out as well. */
365 error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
370 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
371 NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
/*
 * fl_rtalloc_t adapter: discard the hash argument and forward to the
 * plain (non-multipath) fib-aware route allocator.
 */
376 rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
379 rtalloc_ign_fib(ro, 0, fibnum);
/*
 * Lock the bucket for "hash" in a global (shared) flowtable.  The hash is
 * masked into the lock array; ft_lock_count must be a power of two for
 * the & (count - 1) reduction to be valid.
 */
384 flowtable_global_lock(struct flowtable *table, uint32_t hash)
386 int lock_index = (hash)&(table->ft_lock_count - 1);
388 mtx_lock(&table->ft_locks[lock_index]);
/* Counterpart of flowtable_global_lock(): drop the bucket mutex. */
392 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
394 int lock_index = (hash)&(table->ft_lock_count - 1);
396 mtx_unlock(&table->ft_locks[lock_index]);
400 flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
407 flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
413 #define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
414 #define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
415 #define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
416 #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
418 #define FL_STALE (1<<8)
419 #define FL_OVERWRITE (1<<10)
/*
 * Mark a flow entry stale so that flow_stale() reports it for reclamation
 * on the next lookup / cleaner pass.
 */
422 flow_invalidate(struct flentry *fle)
425 fle->f_flags |= FL_STALE;
429 proto_to_flags(uint8_t proto)
452 flags_to_proto(int flags)
454 int proto, protoflags;
456 protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
457 switch (protoflags) {
462 proto = IPPROTO_SCTP;
475 #ifdef FLOWTABLE_DEBUG
/*
 * Debug-only (FLOWTABLE_DEBUG) pretty-printer for an IPv4 tuple.  With
 * FL_HASH_ALL both endpoints and ports are shown; otherwise only the
 * destination address (the transmit-cache case).
 */
477 ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
478 struct sockaddr_in *dsin)
480 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
482 if (flags & FL_HASH_ALL) {
483 inet_ntoa_r(ssin->sin_addr, saddr);
484 inet_ntoa_r(dsin->sin_addr, daddr);
485 printf("proto=%d %s:%d->%s:%d\n",
486 proto, saddr, ntohs(ssin->sin_port), daddr,
487 ntohs(dsin->sin_port));
489 inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
490 printf("proto=%d %s\n", proto, daddr);
/*
 * Extract the IPv4 flow tuple from an mbuf: fill ssin/dsin with the
 * source/destination addresses and, for FL_HASH_ALL tables, the TCP/UDP/
 * SCTP ports.  The protocol is folded into *flags via proto_to_flags().
 * NOTE(review): this listing is missing lines (braces, switch labels,
 * early returns), so the full control flow is not visible here.
 */
497 ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
498 struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
506 uint16_t sport, dport;
508 proto = sport = dport = 0;
509 ip = mtod(m, struct ip *);
510 dsin->sin_family = AF_INET;
511 dsin->sin_len = sizeof(*dsin);
512 dsin->sin_addr = ip->ip_dst;
513 ssin->sin_family = AF_INET;
514 ssin->sin_len = sizeof(*ssin);
515 ssin->sin_addr = ip->ip_src;
/* Transmit-cache tables don't hash ports; addresses suffice. */
518 if ((*flags & FL_HASH_ALL) == 0) {
519 FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
/* Header length in bytes; IP options are not parsed (see XXX). */
524 iphlen = ip->ip_hl << 2; /* XXX options? */
528 th = (struct tcphdr *)((caddr_t)ip + iphlen);
529 sport = th->th_sport;
530 dport = th->th_dport;
/* RST/FIN on a full-hash table: connection teardown in progress. */
531 if ((*flags & FL_HASH_ALL) &&
532 (th->th_flags & (TH_RST|TH_FIN)))
536 uh = (struct udphdr *)((caddr_t)ip + iphlen);
537 sport = uh->uh_sport;
538 dport = uh->uh_dport;
541 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
542 sport = sh->src_port;
543 dport = sh->dest_port;
546 FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
548 /* no port - hence not a protocol we care about */
/* Ports stay in network byte order; they are hashed as-is. */
554 *flags |= proto_to_flags(proto);
555 ssin->sin_port = sport;
556 dsin->sin_port = dport;
/*
 * Build the 3-word IPv4 hash key (key[0] = sport<<16|dport for FL_HASH_ALL,
 * key[1] = src addr, key[2] = dst addr) and return its Jenkins hash,
 * salted with the per-vnet jitter plus the protocol number.  Returns 0
 * (no flow caching) when the flowtable is disabled or not yet ready.
 */
561 ipv4_flow_lookup_hash_internal(
562 struct sockaddr_in *ssin, struct sockaddr_in *dsin,
563 uint32_t *key, uint16_t flags)
565 uint16_t sport, dport;
569 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
571 proto = flags_to_proto(flags);
572 sport = dport = key[2] = key[1] = key[0] = 0;
/* Source address/port only participate for full-tuple hashing. */
573 if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
574 key[1] = ssin->sin_addr.s_addr;
575 sport = ssin->sin_port;
578 key[2] = dsin->sin_addr.s_addr;
579 dport = dsin->sin_port;
/* Pack both ports into key[0]; ports are in network byte order. */
581 if (flags & FL_HASH_ALL) {
582 ((uint16_t *)key)[0] = sport;
583 ((uint16_t *)key)[1] = dport;
585 offset = V_flow_hashjitter + proto;
587 return (jenkins_hashword(key, 3, offset));
/*
 * IPv4 mbuf front-end for flowtable_lookup(): demarshal the packet into
 * sockaddr form and hand off to the generic lookup using the mbuf's fib.
 * Returns NULL if the packet cannot be demarshaled.
 */
590 static struct flentry *
591 flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
593 struct sockaddr_storage ssa, dsa;
595 struct sockaddr_in *dsin, *ssin;
597 dsin = (struct sockaddr_in *)&dsa;
598 ssin = (struct sockaddr_in *)&ssa;
599 bzero(dsin, sizeof(*dsin));
600 bzero(ssin, sizeof(*ssin));
/* Start from the table's flags; demarshal may OR in proto bits. */
601 flags = ft->ft_flags;
602 if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
605 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
/*
 * Populate a struct route from a cached IPv4 flow entry: rebuild ro_dst
 * from key word 2 (the destination address) and hand back the cached
 * rtentry/llentry pointers, casting away the volatile qualifier.
 */
609 flow_to_route(struct flentry *fle, struct route *ro)
611 uint32_t *hashkey = NULL;
612 struct sockaddr_in *sin;
614 sin = (struct sockaddr_in *)&ro->ro_dst;
615 sin->sin_family = AF_INET;
616 sin->sin_len = sizeof(*sin);
617 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
/* key[2] holds the destination address (see the v4 hash builder). */
618 sin->sin_addr.s_addr = hashkey[2];
619 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
620 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
626 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
627 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
628 * pointer might become stale after other pullups (but we never use it
631 #define PULLUP_TO(_len, p, T) \
633 int x = (_len) + sizeof(T); \
634 if ((m)->m_len < x) { \
635 goto receive_failed; \
637 p = (mtod(m, char *) + (_len)); \
640 #define TCP(p) ((struct tcphdr *)(p))
641 #define SCTP(p) ((struct sctphdr *)(p))
642 #define UDP(p) ((struct udphdr *)(p))
/*
 * Extract the IPv6 flow tuple from an mbuf.  Walks the extension-header
 * chain (hop-by-hop, routing, fragment, dstopts, AH, ...) via PULLUP_TO
 * until an upper-layer protocol is found, recording TCP/UDP/SCTP ports
 * for FL_HASH_ALL tables, then fills ssin6/dsin6 and ORs the protocol
 * into *flags.  PULLUP_TO bails to receive_failed (not visible in this
 * listing) if the header is not contiguous in the mbuf.
 */
645 ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
646 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
651 uint16_t src_port, dst_port;
655 offset = hlen = src_port = dst_port = 0;
657 ip6 = mtod(m, struct ip6_hdr *);
658 hlen = sizeof(struct ip6_hdr);
659 proto = ip6->ip6_nxt;
/* Transmit cache: no port hashing, skip upper-layer parsing. */
661 if ((*flags & FL_HASH_ALL) == 0)
/* Loop until an upper-layer pointer is established. */
664 while (ulp == NULL) {
667 case IPPROTO_OSPFIGP:
675 PULLUP_TO(hlen, ulp, struct tcphdr);
676 dst_port = TCP(ulp)->th_dport;
677 src_port = TCP(ulp)->th_sport;
/* RST/FIN: the connection is going away. */
678 if ((*flags & FL_HASH_ALL) &&
679 (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
683 PULLUP_TO(hlen, ulp, struct sctphdr);
684 src_port = SCTP(ulp)->src_port;
685 dst_port = SCTP(ulp)->dest_port;
688 PULLUP_TO(hlen, ulp, struct udphdr);
689 dst_port = UDP(ulp)->uh_dport;
690 src_port = UDP(ulp)->uh_sport;
/* Extension headers: advance hlen/proto and keep walking. */
692 case IPPROTO_HOPOPTS: /* RFC 2460 */
693 PULLUP_TO(hlen, ulp, struct ip6_hbh);
694 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
695 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
698 case IPPROTO_ROUTING: /* RFC 2460 */
699 PULLUP_TO(hlen, ulp, struct ip6_rthdr);
700 hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
701 proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
704 case IPPROTO_FRAGMENT: /* RFC 2460 */
705 PULLUP_TO(hlen, ulp, struct ip6_frag);
706 hlen += sizeof (struct ip6_frag);
707 proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
708 offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
712 case IPPROTO_DSTOPTS: /* RFC 2460 */
713 PULLUP_TO(hlen, ulp, struct ip6_hbh);
714 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
715 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
718 case IPPROTO_AH: /* RFC 2402 */
719 PULLUP_TO(hlen, ulp, struct ip6_ext);
/* AH length is counted in 32-bit words, minus the first two. */
720 hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
721 proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
725 PULLUP_TO(hlen, ulp, struct ip6_ext);
/* Fill in the harvested endpoints; ports stay network order. */
736 dsin6->sin6_family = AF_INET6;
737 dsin6->sin6_len = sizeof(*dsin6);
738 dsin6->sin6_port = dst_port;
739 memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
741 ssin6->sin6_family = AF_INET6;
742 ssin6->sin6_len = sizeof(*ssin6);
743 ssin6->sin6_port = src_port;
744 memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
745 *flags |= proto_to_flags(proto);
750 #define zero_key(key) \
/*
 * Build the 9-word IPv6 hash key (key[0] = ports, key[1..4] = dst addr,
 * key[5..8] = src addr for FL_HASH_ALL) and return its Jenkins hash,
 * salted with the per-vnet jitter plus the protocol.  Returns 0 when the
 * flowtable is disabled or not yet ready.
 */
764 ipv6_flow_lookup_hash_internal(
765 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
766 uint32_t *key, uint16_t flags)
768 uint16_t sport, dport;
772 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
775 proto = flags_to_proto(flags);
/* Destination address occupies key words 1-4. */
779 memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
780 dport = dsin6->sin6_port;
/* Source address (words 5-8) only matters for full-tuple hashing. */
782 if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
783 memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
784 sport = ssin6->sin6_port;
786 if (flags & FL_HASH_ALL) {
787 ((uint16_t *)key)[0] = sport;
788 ((uint16_t *)key)[1] = dport;
790 offset = V_flow_hashjitter + proto;
792 return (jenkins_hashword(key, 9, offset));
/*
 * IPv6 mbuf front-end for flowtable_lookup(): demarshal the packet into
 * sockaddr_in6 form and hand off to the generic lookup using the mbuf's
 * fib.  Returns NULL if demarshaling fails.
 */
795 static struct flentry *
796 flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
798 struct sockaddr_storage ssa, dsa;
799 struct sockaddr_in6 *dsin6, *ssin6;
802 dsin6 = (struct sockaddr_in6 *)&dsa;
803 ssin6 = (struct sockaddr_in6 *)&ssa;
804 bzero(dsin6, sizeof(*dsin6));
805 bzero(ssin6, sizeof(*ssin6));
/* Start from the table's flags; demarshal may OR in proto bits. */
806 flags = ft->ft_flags;
808 if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
811 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
/*
 * Populate a struct route_in6 from a cached IPv6 flow entry and hand back
 * the cached rtentry/llentry pointers.
 * NOTE(review): the address is copied from &hashkey[5], which the v6 hash
 * builder uses for the *source* address (dst lives at key[1..4]) — looks
 * inconsistent; confirm against the key layout before relying on ro_dst.
 */
815 flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
817 uint32_t *hashkey = NULL;
818 struct sockaddr_in6 *sin6;
820 sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
822 sin6->sin6_family = AF_INET6;
823 sin6->sin6_len = sizeof(*sin6);
824 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
825 memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
826 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
827 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
/*
 * Return the occupancy bitmask for this table: the current CPU's mask for
 * per-CPU tables, otherwise the single shared mask in slot 0.
 */
833 flowtable_mask(struct flowtable *ft)
837 if (ft->ft_flags & FL_PCPU)
838 mask = ft->ft_masks[curcpu];
840 mask = ft->ft_masks[0];
/*
 * Return the address of the bucket head for "hash": the current CPU's
 * bucket array for per-CPU tables, else the global array.  The hash is
 * reduced modulo the table size.
 */
845 static struct flentry **
846 flowtable_entry(struct flowtable *ft, uint32_t hash)
848 struct flentry **fle;
849 int index = (hash % ft->ft_size);
851 if (ft->ft_flags & FL_PCPU) {
852 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
853 fle = &ft->ft_table.pcpu[curcpu][index];
855 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
856 fle = &ft->ft_table.global[index];
/*
 * Decide whether a flow entry should be reclaimed: the entry is stale if
 * it was never/no longer valid (hash 0, down/ifp-less route, link down),
 * was explicitly invalidated (FL_STALE), or has idled past the timeout
 * appropriate to its observed TCP state (no SYN/ACK/FIN bits => treat as
 * UDP; FIN seen => fin_wait; SYN without ACK => half-open; SYN|ACK =>
 * established TCP).  Idle time is measured against f_uptime.
 * NOTE(review): lines are missing from this listing, so parts of the
 * first compound condition are not visible.
 */
863 flow_stale(struct flowtable *ft, struct flentry *fle)
867 if ((fle->f_fhash == 0)
868 || ((fle->f_rt->rt_flags & RTF_HOST) &&
869 ((fle->f_rt->rt_flags & (RTF_UP))
871 || (fle->f_rt->rt_ifp == NULL)
872 || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
875 idle_time = time_uptime - fle->f_uptime;
877 if ((fle->f_flags & FL_STALE) ||
878 ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
879 && (idle_time > ft->ft_udp_idle)) ||
880 ((fle->f_flags & TH_FIN)
881 && (idle_time > ft->ft_fin_wait_idle)) ||
882 ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
883 && (idle_time > ft->ft_syn_idle)) ||
884 ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
885 && (idle_time > ft->ft_tcp_idle)) ||
886 ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
887 (fle->f_rt->rt_ifp == NULL)))
894 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
899 if (fle->f_flags & FL_IPV6) {
901 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
904 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
907 for (i = 0; i < nwords; i++)
/*
 * Allocate a zeroed flow entry from the appropriate per-vnet UMA zone
 * (v4 or v6, chosen by the table's FL_IPV6 flag) and bump the table's
 * entry count.  M_NOWAIT: may return NULL under memory pressure.
 */
911 static struct flentry *
912 flow_alloc(struct flowtable *ft)
914 struct flentry *newfle;
918 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
920 newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
922 atomic_add_int(&ft->ft_count, 1);
/*
 * Return a flow entry to its UMA zone and decrement the table's count.
 * Inverse of flow_alloc().
 */
927 flow_free(struct flentry *fle, struct flowtable *ft)
931 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
932 atomic_add_int(&ft->ft_count, -1);
933 uma_zfree(zone, fle);
/*
 * Hysteresis check on table occupancy versus V_flowtable_nmbflows:
 * leave the "full" state below 7/8 of the limit and enter it above
 * 31/32.  On a transition the cleaner frequency and (for transmit-cache
 * tables) the idle timeouts are adjusted, and the cleaner is kicked when
 * the table fills.  Returns the (possibly updated) ft_full flag.
 */
937 flow_full(struct flowtable *ft)
943 count = ft->ft_count;
/* Drop out of "full" below 7/8 of the limit ... */
945 if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
/* ... enter "full" above 31/32 of it. */
947 else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
950 if (full && !ft->ft_full) {
/* Table just filled: clean every 4s, age cache entries in 5s. */
951 flowclean_freq = 4*hz;
952 if ((ft->ft_flags & FL_HASH_ALL) == 0)
953 ft->ft_udp_idle = ft->ft_fin_wait_idle =
954 ft->ft_syn_idle = ft->ft_tcp_idle = 5;
955 cv_broadcast(&flowclean_c_cv);
956 } else if (!full && ft->ft_full) {
/* Pressure relieved: relax back to 20s cleans / 30s idle. */
957 flowclean_freq = 20*hz;
958 if ((ft->ft_flags & FL_HASH_ALL) == 0)
959 ft->ft_udp_idle = ft->ft_fin_wait_idle =
960 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
963 return (ft->ft_full);
/*
 * Insert a new flow entry for (hash, key) into the table.  Allocates an
 * entry, takes the bucket lock, and either installs it as the bucket head
 * (setting the occupancy bit) or appends it after scanning the collision
 * chain; if a live entry with the same hash is already present the new
 * allocation is freed (lost race / collision) unless FL_OVERWRITE.
 * Finally the key, proto, route/llentry, fib and timestamp are written.
 * NOTE(review): lines are missing from this listing, so some branch
 * bodies and the return paths are not visible.
 */
967 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
968 uint32_t fibnum, struct route *ro, uint16_t flags)
970 struct flentry *fle, *fletail, *newfle, **flep;
971 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
976 newfle = flow_alloc(ft);
/* Only the address-family bit is carried on the entry itself. */
980 newfle->f_flags |= (flags & FL_IPV6);
981 proto = flags_to_proto(flags);
983 FL_ENTRY_LOCK(ft, hash);
984 mask = flowtable_mask(ft);
985 flep = flowtable_entry(ft, hash);
986 fletail = fle = *flep;
/* Empty bucket: mark it occupied and install the new entry as head. */
989 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
990 *flep = fle = newfle;
997 * find end of list and make sure that we were not
998 * preempted by another thread handling this flow
1000 while (fle != NULL) {
1001 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
1003 * there was either a hash collision
1004 * or we lost a race to insert
1006 FL_ENTRY_UNLOCK(ft, hash);
1007 flow_free(newfle, ft);
1009 if (flags & FL_OVERWRITE)
1014 * re-visit this double condition XXX
1016 if (fletail->f_next != NULL)
1017 fletail = fle->f_next;
/* Track the deepest chain seen for the stats sysctl. */
1023 if (depth > fs->ft_max_depth)
1024 fs->ft_max_depth = depth;
1025 fletail->f_next = newfle;
/* Publish the key and cached forwarding state on the entry. */
1028 flowtable_set_hashkey(fle, key);
1030 fle->f_proto = proto;
1031 fle->f_rt = ro->ro_rt;
1032 fle->f_lle = ro->ro_lle;
1033 fle->f_fhash = hash;
1034 fle->f_fibnum = fibnum;
1035 fle->f_uptime = time_uptime;
1036 FL_ENTRY_UNLOCK(ft, hash);
/*
 * Kernel-facing insert: compute the hash for the given endpoints (v4 or
 * v6 by ssa->ss_family), force FL_OVERWRITE so an existing entry for the
 * tuple is replaced, and insert with the caller-supplied (already
 * resolved) route and llentry.  The route must carry both ro_rt and
 * ro_lle or the insert is refused.
 */
1041 kern_flowtable_insert(struct flowtable *ft,
1042 struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1043 struct route *ro, uint32_t fibnum, int flags)
1045 uint32_t key[9], hash;
1047 flags = (ft->ft_flags | flags | FL_OVERWRITE);
1051 if (ssa->ss_family == AF_INET)
1052 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1053 (struct sockaddr_in *)dsa, key, flags);
1056 if (ssa->ss_family == AF_INET6)
1057 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1058 (struct sockaddr_in6 *)dsa, key, flags);
/* Refuse inserts without a fully resolved route + link entry. */
1060 if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1063 FLDPRINTF(ft, FL_DEBUG,
1064 "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1065 key[0], key[1], key[2], hash, fibnum, flags);
1066 return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
1070 flowtable_key_equal(struct flentry *fle, uint32_t *key)
1075 if (fle->f_flags & FL_IPV6) {
1077 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1080 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1083 for (i = 0; i < nwords; i++)
1084 if (hashkey[i] != key[i])
/*
 * Address-family dispatcher for mbuf lookups.  On a hit, stamp the mbuf
 * with the flow hash (M_FLOWID + m_pkthdr.flowid) if it is not already
 * marked, so drivers can use it for queue selection.
 */
1091 flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
1093 struct flentry *fle = NULL;
1097 fle = flowtable_lookup_mbuf4(ft, m);
1101 fle = flowtable_lookup_mbuf6(ft, m);
1103 if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
1104 m->m_flags |= M_FLOWID;
1105 m->m_pkthdr.flowid = fle->f_fhash;
/*
 * Core lookup: hash the (ssa, dsa, fibnum, proto) tuple, probe the table
 * and return a validated cached entry, or resolve the route/llentry and
 * insert a new one.  Loopback and self-addressed IPv4 traffic is never
 * cached.  A cached entry is accepted only if hash, key, proto and fib
 * all match and both the route (RTF_UP, ifp set) and llentry (LLE_VALID)
 * are still usable.  Returns NULL on failure or when caching is declined.
 * NOTE(review): lines are missing from this listing, so several branch
 * bodies, gotos and the uncached/kill paths are not visible.
 */
1111 flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
1112 struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
1114 uint32_t key[9], hash;
1115 struct flentry *fle;
1116 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1120 struct llentry *lle;
1121 struct route sro, *ro;
1122 struct route_in6 sro6;
1124 sro.ro_rt = sro6.ro_rt = NULL;
1125 sro.ro_lle = sro6.ro_lle = NULL;
1128 flags |= ft->ft_flags;
1129 proto = flags_to_proto(flags);
1131 if (ssa->ss_family == AF_INET) {
1132 struct sockaddr_in *ssin, *dsin;
1135 memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
1137 * The harvested source and destination addresses
1138 * may contain port information if the packet is
1139 * from a transport protocol (e.g. TCP/UDP). The
1140 * port field must be cleared before performing
1143 ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
1144 dsin = (struct sockaddr_in *)dsa;
1145 ssin = (struct sockaddr_in *)ssa;
/* Never cache self-addressed or loopback (127/8) traffic. */
1146 if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
1147 (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1148 (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
1151 hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
1155 if (ssa->ss_family == AF_INET6) {
1156 struct sockaddr_in6 *ssin6, *dsin6;
1158 ro = (struct route *)&sro6;
1159 memcpy(&sro6.ro_dst, dsa,
1160 sizeof(struct sockaddr_in6));
1161 ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
1162 dsin6 = (struct sockaddr_in6 *)dsa;
1163 ssin6 = (struct sockaddr_in6 *)ssa;
1166 hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
1170 * Ports are zero and this isn't a transmit cache
1171 * - thus not a protocol for which we need to keep
1173 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
1175 if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
1179 FL_ENTRY_LOCK(ft, hash);
1180 if ((fle = FL_ENTRY(ft, hash)) == NULL) {
1181 FL_ENTRY_UNLOCK(ft, hash);
1185 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1186 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
/* Full validity check before trusting the cached entry. */
1189 && fle->f_fhash == hash
1190 && flowtable_key_equal(fle, key)
1191 && (proto == fle->f_proto)
1192 && (fibnum == fle->f_fibnum)
1193 && (rt->rt_flags & RTF_UP)
1194 && (rt->rt_ifp != NULL)
1195 && (lle->la_flags & LLE_VALID)) {
/* Hit: refresh the idle timestamp and accumulate TCP flag bits. */
1197 fle->f_uptime = time_uptime;
1198 fle->f_flags |= flags;
1199 FL_ENTRY_UNLOCK(ft, hash);
1201 } else if (fle->f_next != NULL) {
1205 FL_ENTRY_UNLOCK(ft, hash);
/* Miss: don't populate if auto-insert is off or the table is full. */
1207 if (flags & FL_NOAUTO || flow_full(ft))
1212 * This bit of code ends up locking the
1213 * same route 3 times (just like ip_output + ether_output)
1215 * - in rt_check when called by arpresolve
1216 * - dropping the refcount for the rtentry
1218 * This could be consolidated to one if we wrote a variant
1219 * of arpresolve with an rt_check variant that expected to
1220 * receive the route locked
1224 if ((ro->ro_dst.sa_family != AF_INET) &&
1225 (ro->ro_dst.sa_family != AF_INET6))
1226 panic("sa_family == %d\n", ro->ro_dst.sa_family);
1229 ft->ft_rtalloc(ro, hash, fibnum);
1230 if (ro->ro_rt == NULL)
1231 error = ENETUNREACH;
1233 struct llentry *lle = NULL;
1234 struct sockaddr_storage *l3addr;
1235 struct rtentry *rt = ro->ro_rt;
1236 struct ifnet *ifp = rt->rt_ifp;
/* Point-to-point/loopback interfaces have no link-layer entry. */
1238 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
1244 if (ssa->ss_family == AF_INET6) {
1245 struct sockaddr_in6 *dsin6;
1247 dsin6 = (struct sockaddr_in6 *)dsa;
1248 if (in6_localaddr(&dsin6->sin6_addr)) {
/* Resolve the next hop: gateway if indirect, else destination. */
1254 if (rt->rt_flags & RTF_GATEWAY)
1255 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1258 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1259 llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp);
1263 if (ssa->ss_family == AF_INET) {
1264 if (rt->rt_flags & RTF_GATEWAY)
1265 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1267 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1268 llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
1279 error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
1289 return ((error) ? NULL : fle);
1293 * used by the bit_alloc macro
1295 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
/*
 * Allocate and initialize a flowtable with nentry buckets.  Seeds the
 * per-vnet hash jitter on first use, selects multipath vs. plain route
 * allocation, sets up either per-CPU bucket arrays/masks or a single
 * global array guarded by a power-of-two-sized mutex pool, installs the
 * idle timeouts, and links the table onto the vnet's cleaner list.
 */
1298 flowtable_alloc(char *name, int nentry, int flags)
1300 struct flowtable *ft, *fttail;
/* Lazily seed the hash salt shared by all tables in this vnet. */
1303 if (V_flow_hashjitter == 0)
1304 V_flow_hashjitter = arc4random();
1306 KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
1308 ft = malloc(sizeof(struct flowtable),
1309 M_RTABLE, M_WAITOK | M_ZERO);
1312 ft->ft_flags = flags;
1313 ft->ft_size = nentry;
1315 ft->ft_rtalloc = rtalloc_mpath_fib;
1317 ft->ft_rtalloc = rtalloc_ign_wrapper;
1319 if (flags & FL_PCPU) {
1320 ft->ft_lock = flowtable_pcpu_lock;
1321 ft->ft_unlock = flowtable_pcpu_unlock;
/* One bucket array and occupancy bitmask per possible CPU. */
1323 for (i = 0; i <= mp_maxid; i++) {
1324 ft->ft_table.pcpu[i] =
1325 malloc(nentry*sizeof(struct flentry *),
1326 M_RTABLE, M_WAITOK | M_ZERO);
1327 ft->ft_masks[i] = bit_alloc(nentry);
/* Lock count must end up a power of two (hash is masked into it). */
1330 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
1331 (fls(mp_maxid + 1) << 1));
1333 ft->ft_lock = flowtable_global_lock;
1334 ft->ft_unlock = flowtable_global_unlock;
1335 ft->ft_table.global =
1336 malloc(nentry*sizeof(struct flentry *),
1337 M_RTABLE, M_WAITOK | M_ZERO);
1338 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
1339 M_RTABLE, M_WAITOK | M_ZERO);
1340 for (i = 0; i < ft->ft_lock_count; i++)
1341 mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
1343 ft->ft_masks[0] = bit_alloc(nentry);
/* Scratch mask used by the cleaner to iterate occupied buckets. */
1345 ft->ft_tmpmask = bit_alloc(nentry);
1348 * In the local transmit case the table truly is
1349 * just a cache - so everything is eligible for
1350 * replacement after 5s of non-use
1352 if (flags & FL_HASH_ALL) {
1353 ft->ft_udp_idle = V_flowtable_udp_expire;
1354 ft->ft_syn_idle = V_flowtable_syn_expire;
1355 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
/* NOTE(review): tcp_idle is seeded from fin_wait_expire, not
 * V_flowtable_tcp_expire — possibly intentional, verify. */
1356 ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
1358 ft->ft_udp_idle = ft->ft_fin_wait_idle =
1359 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
1364 * hook in to the cleaner list
1366 if (V_flow_list_head == NULL)
1367 V_flow_list_head = ft;
1369 fttail = V_flow_list_head;
1370 while (fttail->ft_next != NULL)
1371 fttail = fttail->ft_next;
1372 fttail->ft_next = ft;
1379 * The rest of the code is devoted to garbage collection of expired entries.
1380 * It is a new additon made necessary by the switch to dynamically allocating
/*
 * Release a flow entry's cached references: recover the non-volatile
 * rtentry/llentry pointers (the drop/free of those references falls on
 * lines missing from this listing) and return the entry to its zone.
 */
1385 fle_free(struct flentry *fle, struct flowtable *ft)
1388 struct llentry *lle;
1390 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1391 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
/*
 * Sweep the table and reclaim stale entries.  A snapshot of the occupancy
 * mask is walked bit by bit (bit_ffs); for each occupied bucket the
 * collision chain is scanned under the bucket lock, unlinking entries
 * that are stale (or, when rt != NULL, that reference that specific
 * route) onto a private free list which is drained after the locks are
 * dropped.  Bucket bits are cleared when a chain empties.
 * NOTE(review): lines are missing from this listing (several branch
 * bodies, continue/advance statements and the fle_free call).
 */
1400 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
1402 int curbit = 0, count;
1403 struct flentry *fle, **flehead, *fleprev;
1404 struct flentry *flefreehead, *flefreetail, *fletmp;
1405 bitstr_t *mask, *tmpmask;
1406 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1408 flefreehead = flefreetail = NULL;
1409 mask = flowtable_mask(ft);
1410 tmpmask = ft->ft_tmpmask;
/* Work on a byte-granular copy so the live mask stays consistent. */
1411 memcpy(tmpmask, mask, ft->ft_size/8);
1413 * XXX Note to self, bit_ffs operates at the byte level
1414 * and thus adds gratuitous overhead
1416 bit_ffs(tmpmask, ft->ft_size, &curbit);
1417 while (curbit != -1) {
/* Defensive: a corrupt bitmap index would walk off the table. */
1418 if (curbit >= ft->ft_size || curbit < -1) {
1420 "warning: bad curbit value %d \n",
1425 FL_ENTRY_LOCK(ft, curbit);
1426 flehead = flowtable_entry(ft, curbit);
1427 fle = fleprev = *flehead;
1429 fs->ft_free_checks++;
/* Occupied bit with an empty chain indicates bitmap skew. */
1431 if (fle == NULL && curbit > 0) {
1433 "warning bit=%d set, but no fle found\n",
1437 while (fle != NULL) {
/* rt != NULL restricts the sweep to flows over that route. */
1439 if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
1444 } else if (!flow_stale(ft, fle)) {
1450 * delete head of the list
1452 if (fleprev == *flehead) {
1454 if (fle == fleprev) {
1455 fleprev = *flehead = fle->f_next;
1457 fleprev = *flehead = fle;
1461 * don't advance fleprev
1464 fleprev->f_next = fle->f_next;
1465 fle = fleprev->f_next;
/* Queue the unlinked entry on the private free list. */
1468 if (flefreehead == NULL)
1469 flefreehead = flefreetail = fletmp;
1471 flefreetail->f_next = fletmp;
1472 flefreetail = fletmp;
1474 fletmp->f_next = NULL;
1476 if (*flehead == NULL)
1477 bit_clear(mask, curbit);
1478 FL_ENTRY_UNLOCK(ft, curbit);
1479 bit_clear(tmpmask, curbit);
1480 bit_ffs(tmpmask, ft->ft_size, &curbit);
/* Drain the free list outside the bucket locks. */
1483 while ((fle = flefreehead) != NULL) {
1484 flefreehead = fle->f_next;
1489 if (V_flowtable_debug && count)
1490 log(LOG_DEBUG, "freed %d flow entries\n", count);
1494 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1498 if (ft->ft_flags & FL_PCPU) {
1500 if (smp_started == 1) {
1501 thread_lock(curthread);
1502 sched_bind(curthread, i);
1503 thread_unlock(curthread);
1506 flowtable_free_stale(ft, rt);
1508 if (smp_started == 1) {
1509 thread_lock(curthread);
1510 sched_unbind(curthread);
1511 thread_unlock(curthread);
1515 flowtable_free_stale(ft, rt);
/*
 * flowtable_clean_vnet: walk every flowtable registered on the current
 * vnet's cleaner list (V_flow_list_head) and reap stale entries.
 * rt == NULL asks flowtable_free_stale() not to filter by route.
 * Per-CPU tables are scanned with the thread bound to each CPU in
 * turn, mirroring flowtable_route_flush().
 */
1520 flowtable_clean_vnet(void)
1522 struct flowtable *ft;
1525 ft = V_flow_list_head;
1526 while (ft != NULL) {
1527 if (ft->ft_flags & FL_PCPU) {
1529 if (smp_started == 1) {
1530 thread_lock(curthread);
1531 sched_bind(curthread, i);
1532 thread_unlock(curthread);
1535 flowtable_free_stale(ft, NULL);
1537 if (smp_started == 1) {
1538 thread_lock(curthread);
1539 sched_unbind(curthread);
1540 thread_unlock(curthread);
/* global (non-FL_PCPU) table: single unbound scan */
1544 flowtable_free_stale(ft, NULL);
/*
 * flowtable_cleaner: main loop of the flowtable cleaner kernel
 * process.  Each pass iterates all vnets and reaps stale entries via
 * flowtable_clean_vnet(), then signals any waiters in
 * flowtable_flush() (flowclean_f_cv) and sleeps on flowclean_c_cv for
 * up to flowclean_freq ticks -- or until a flush request wakes it
 * early.
 */
1551 flowtable_cleaner(void)
1553 VNET_ITERATOR_DECL(vnet_iter);
1557 log(LOG_INFO, "flowtable cleaner started\n");
1561 VNET_FOREACH(vnet_iter) {
1562 CURVNET_SET(vnet_iter);
1563 flowtable_clean_vnet();
1566 VNET_LIST_RUNLOCK();
1569 * The interval between cleaning passes (flowclean_freq ticks;
1572 mtx_lock(&flowclean_lock);
1574 sched_prio(td, PPAUSE);
/* wake threads blocked in flowtable_flush() waiting on a full pass */
1577 cv_broadcast(&flowclean_f_cv);
1578 cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
1579 mtx_unlock(&flowclean_lock);
/*
 * flowtable_flush: event-handler entry point (registered for ifnet
 * departure) that forces a full cleaning cycle.  It records the
 * current flowclean_cycles value, repeatedly kicks the cleaner via
 * flowclean_c_cv, and blocks on flowclean_f_cv until the cycle counter
 * advances -- i.e. until the cleaner has completed at least one whole
 * pass after this call.
 */
1584 flowtable_flush(void *unused __unused)
1588 mtx_lock(&flowclean_lock);
1589 start = flowclean_cycles;
1590 while (start == flowclean_cycles) {
1591 cv_broadcast(&flowclean_c_cv);
1592 cv_wait(&flowclean_f_cv, &flowclean_lock);
1594 mtx_unlock(&flowclean_lock);
/*
 * Kernel-process descriptor for the cleaner (fields elided in this
 * excerpt; presumably name + flowtable_cleaner entry point -- TODO
 * confirm).  kproc_start() launches it at the idle-kthread SYSINIT
 * stage.
 */
1597 static struct kproc_desc flow_kp = {
1602 SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
/*
 * flowtable_init_vnet: per-vnet initialization, run once SMP is up
 * (SI_SUB_SMP).  Sizes the flow-entry limit from maxusers and the CPU
 * count, creates the UMA zones backing IPv4/IPv6 flow entries, caps
 * both zones at that limit, and finally marks the table usable via
 * V_flowtable_ready.
 */
1605 flowtable_init_vnet(const void *unused __unused)
1608 V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
1609 V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
1610 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1611 V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
1612 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
/* bound both zones by the computed per-vnet flow-entry limit */
1613 uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
1614 uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
1615 V_flowtable_ready = 1;
1617 VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
1618 flowtable_init_vnet, NULL);
/*
 * flowtable_init: global (non-vnet) one-time setup.  Initializes the
 * cleaner/flush condition variables and their mutex, hooks
 * flowtable_flush() to interface-departure events so departing ifnets
 * trigger a full cleaning cycle, and sets the cleaning interval to
 * 20 seconds (20*hz ticks).
 */
1621 flowtable_init(const void *unused __unused)
1624 cv_init(&flowclean_c_cv, "c_flowcleanwait");
1625 cv_init(&flowclean_f_cv, "f_flowcleanwait");
1626 mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
1627 EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
1628 EVENTHANDLER_PRI_ANY);
1629 flowclean_freq = 20*hz;
1631 SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
1632 flowtable_init, NULL);
/*
 * flowtable_uninit: per-vnet teardown.  Clears the ready flag first so
 * no new entries are handed out, then destroys both UMA zones.
 */
1637 flowtable_uninit(const void *unused __unused)
1640 V_flowtable_ready = 0;
1641 uma_zdestroy(V_flow_ipv4_zone);
1642 uma_zdestroy(V_flow_ipv6_zone);
1645 VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1646 flowtable_uninit, NULL);
/*
 * flowtable_get_hashkey: return a pointer to the flow's raw hash-key
 * words inside the address-family specific entry container.
 *
 * Fix: the two casts were inverted -- an entry flagged FL_IPV6 is laid
 * out as struct flentry_v6 and everything else as struct flentry_v4.
 * Casting an IPv6 entry to the v4 container would read ipf_key from
 * the wrong offset.
 */
1651 flowtable_get_hashkey(struct flentry *fle)
1655 if (fle->f_flags & FL_IPV6)
1656 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1658 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
/*
 * flowtable_mask_pcpu: return the in-use bitmap for the given CPU's
 * slice of a per-CPU table; a global (non-FL_PCPU) table keeps its
 * single bitmap in ft_masks[0].
 */
1664 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1668 if (ft->ft_flags & FL_PCPU)
1669 mask = ft->ft_masks[cpuid];
1671 mask = ft->ft_masks[0];
/*
 * flowtable_entry_pcpu: map a hash value to its bucket head in either
 * the given CPU's private table (FL_PCPU) or the shared global table.
 * The bucket index is simply hash modulo the table size.
 */
static struct flentry **
1677 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1679 struct flentry **fle;
1680 int index = (hash % ft->ft_size);
1682 if (ft->ft_flags & FL_PCPU) {
1683 fle = &ft->ft_table.pcpu[cpuid][index];
1685 fle = &ft->ft_table.global[index];
/*
 * flow_show: ddb helper -- pretty-print one flow entry: its addresses
 * and ports (decoded from the raw hash key), flow/route/interface
 * flags, the raw key words, and bookkeeping (hash, idle time, fib,
 * route pointer).  IPv6 entries skip the inet_ntoa_r() IPv4 decoding
 * and print the 9-word key instead.
 */
1692 flow_show(struct flowtable *ft, struct flentry *fle)
1695 int rt_valid, ifp_valid;
1696 uint16_t sport, dport;
/* "123" has 4 bytes incl. NUL; 4x covers a dotted-quad plus NUL */
1698 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
1699 volatile struct rtentry *rt;
1700 struct ifnet *ifp = NULL;
1702 idle_time = (int)(time_uptime - fle->f_uptime);
1704 rt_valid = rt != NULL;
1707 ifp_valid = ifp != NULL;
1708 hashkey = flowtable_get_hashkey(fle);
1709 if (fle->f_flags & FL_IPV6)
/* IPv4 decode: dst addr always; src/ports only when FL_HASH_ALL */
1712 inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
1713 if (ft->ft_flags & FL_HASH_ALL) {
1714 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
1715 sport = ntohs(((uint16_t *)hashkey)[0]);
1716 dport = ntohs(((uint16_t *)hashkey)[1]);
1717 db_printf("%s:%d->%s:%d",
1718 saddr, sport, daddr,
1721 db_printf("%s ", daddr);
/* flow-entry state flags */
1724 if (fle->f_flags & FL_STALE)
1725 db_printf(" FL_STALE ");
1726 if (fle->f_flags & FL_TCP)
1727 db_printf(" FL_TCP ");
1728 if (fle->f_flags & FL_UDP)
1729 db_printf(" FL_UDP ");
1731 if (rt->rt_flags & RTF_UP)
1732 db_printf(" RTF_UP ");
/* interface flags of the cached egress ifp */
1735 if (ifp->if_flags & IFF_LOOPBACK)
1736 db_printf(" IFF_LOOPBACK ");
1737 if (ifp->if_flags & IFF_UP)
1738 db_printf(" IFF_UP ");
1739 if (ifp->if_flags & IFF_POINTOPOINT)
1740 db_printf(" IFF_POINTOPOINT ");
/* raw key dump: 9 words for IPv6, 3 for IPv4 */
1742 if (fle->f_flags & FL_IPV6)
1743 db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
1744 hashkey[0], hashkey[1], hashkey[2],
1745 hashkey[3], hashkey[4], hashkey[5],
1746 hashkey[6], hashkey[7], hashkey[8]);
1748 db_printf("\n\tkey=%08x:%08x:%08x ",
1749 hashkey[0], hashkey[1], hashkey[2]);
1750 db_printf("hash=%08x idle_time=%03d"
1751 "\n\tfibnum=%02d rt=%p",
1752 fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
/*
 * flowtable_show: ddb helper -- dump every live entry in one CPU's
 * view of a flowtable.  Like flowtable_free_stale(), it copies the
 * in-use bitmap into ft_tmpmask and consumes it with
 * bit_ffs()/bit_clear(), calling flow_show() on each entry in each
 * occupied bucket.  cpuid is -1 for a global (non-per-CPU) table.
 */
1757 flowtable_show(struct flowtable *ft, int cpuid)
1760 struct flentry *fle, **flehead;
1761 bitstr_t *mask, *tmpmask;
1764 db_printf("cpu: %d\n", cpuid);
1765 mask = flowtable_mask_pcpu(ft, cpuid);
1766 tmpmask = ft->ft_tmpmask;
/* scratch copy of the bitmap so iteration can clear bits freely */
1767 memcpy(tmpmask, mask, ft->ft_size/8);
1769 * XXX Note to self, bit_ffs operates at the byte level
1770 * and thus adds gratuitous overhead
1772 bit_ffs(tmpmask, ft->ft_size, &curbit);
1773 while (curbit != -1) {
1774 if (curbit >= ft->ft_size || curbit < -1) {
1775 db_printf("warning: bad curbit value %d \n",
1780 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1783 while (fle != NULL) {
1788 bit_clear(tmpmask, curbit);
1789 bit_ffs(tmpmask, ft->ft_size, &curbit);
/*
 * flowtable_show_vnet: ddb helper -- walk the current vnet's cleaner
 * list and dump every flowtable.  Per-CPU tables are shown once per
 * CPU; a global table is shown once with cpuid -1.
 */
1794 flowtable_show_vnet(void)
1796 struct flowtable *ft;
1799 ft = V_flow_list_head;
1800 while (ft != NULL) {
1801 printf("name: %s\n", ft->ft_name);
1802 if (ft->ft_flags & FL_PCPU) {
1804 flowtable_show(ft, i);
1807 flowtable_show(ft, -1);
1813 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1815 VNET_ITERATOR_DECL(vnet_iter);
1817 VNET_FOREACH(vnet_iter) {
1818 CURVNET_SET(vnet_iter);
1820 db_printf("vnet %p\n", vnet_iter);
1822 flowtable_show_vnet();