1 /**************************************************************************
3 Copyright (c) 2008-2010, BitGravity Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the BitGravity Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include "opt_route.h"
31 #include "opt_mpath.h"
34 #include "opt_inet6.h"
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
39 #include <sys/param.h>
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/kernel.h>
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
51 #include <sys/sched.h>
53 #include <sys/socket.h>
54 #include <sys/syslog.h>
55 #include <sys/sysctl.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_var.h>
60 #include <net/route.h>
61 #include <net/flowtable.h>
64 #include <netinet/in.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
70 #include <netinet/ip6.h>
72 #include <netinet/tcp.h>
73 #include <netinet/udp.h>
74 #include <netinet/sctp.h>
76 #include <libkern/jenkins.h>
/*
 * Flow hash-key tuples: the (sport, dport, saddr, daddr) 4-tuple for IPv4
 * and the equivalent for IPv6.  These are overlaid by the ipv4_flow /
 * ipv6_flow unions so the tuple can also be read as an array of words
 * (ipf_key) when hashing.
 * NOTE(review): the enclosing struct/union declarations are not fully
 * visible in this chunk; only their members appear below.
 */
80 uint16_t ip_sport; /* source port */
81 uint16_t ip_dport; /* destination port */
82 in_addr_t ip_saddr; /* source address */
83 in_addr_t ip_daddr; /* destination address */
/* union ipv4_flow member: the v4 tuple viewed as one object */
87 struct ipv4_tuple ipf_ipt;
/* IPv6 tuple members */
92 uint16_t ip_sport; /* source port */
93 uint16_t ip_dport; /* destination port */
94 struct in6_addr ip_saddr; /* source address */
95 struct in6_addr ip_daddr; /* destination address */
/* union ipv6_flow member: the v6 tuple viewed as one object */
99 struct ipv6_tuple ipf_ipt;
/*
 * struct flentry: one cached flow.  Embedded at the head of both
 * flentry_v4 and flentry_v6 so a generic flentry pointer can be cast to
 * the address-family-specific variant (see flowtable_set_hashkey et al.).
 */
104 volatile uint32_t f_fhash; /* hash flowing forward */
105 uint16_t f_flags; /* flow flags */
107 uint8_t f_proto; /* protocol */
108 uint32_t f_fibnum; /* fib index */
109 uint32_t f_uptime; /* uptime at last access */
110 struct flentry *f_next; /* pointer to collision entry */
111 volatile struct rtentry *f_rt; /* rtentry for flow */
112 volatile struct llentry *f_lle; /* llentry for flow */
/* flentry_v4: generic entry plus the IPv4 key */
116 struct flentry fl_entry;
117 union ipv4_flow fl_flow;
/* flentry_v6: generic entry plus the IPv6 key */
121 struct flentry fl_entry;
122 union ipv6_flow fl_flow;
/*
 * Accessor macros for the embedded flentry.
 * NOTE(review): these expand to fl_entry.fl_fhash etc., but the visible
 * flentry members are named f_fhash/f_flags/... — verify against the full
 * source whether these macros are stale/unused or the member names differ.
 */
125 #define fl_fhash fl_entry.fl_fhash
126 #define fl_flags fl_entry.fl_flags
127 #define fl_proto fl_entry.fl_proto
128 #define fl_uptime fl_entry.fl_uptime
129 #define fl_rt fl_entry.fl_rt
130 #define fl_lle fl_entry.fl_lle
/* Default idle timeouts (seconds) before a flow is eligible for cleaning. */
132 #define SECS_PER_HOUR 3600
133 #define SECS_PER_DAY (24*SECS_PER_HOUR)
137 #define FIN_WAIT_IDLE 600
138 #define TCP_IDLE SECS_PER_DAY
/* Per-table lock and route-allocation strategy hooks. */
141 typedef void fl_lock_t(struct flowtable *, uint32_t);
142 typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
/*
 * union flentryp members: the bucket array, either a single global table
 * or one table per CPU (FL_PCPU).
 */
145 struct flentry **global;
146 struct flentry **pcpu[MAXCPU];
/*
 * Per-CPU statistics counters, cache-line aligned to avoid false sharing.
 * NOTE(review): several counters referenced later (ft_misses, ft_frees,
 * ft_hits, ft_lookups) are not visible in this chunk of the declaration.
 */
149 struct flowtable_stats {
150 uint64_t ft_collisions;
151 uint64_t ft_allocated;
153 uint64_t ft_max_depth;
154 uint64_t ft_free_checks;
158 } __aligned(CACHE_LINE_SIZE);
/* struct flowtable (fragment): one flow cache instance. */
161 struct flowtable_stats ft_stats[MAXCPU];
167 fl_lock_t *ft_unlock;
168 fl_rtalloc_t *ft_rtalloc;
170 * XXX need to pad out
172 struct mtx *ft_locks;
173 union flentryp ft_table;
/* one occupancy bitmask per CPU for FL_PCPU tables, else only [0] is used */
174 bitstr_t *ft_masks[MAXCPU];
175 bitstr_t *ft_tmpmask;
176 struct flentry *ft_next;
/* hot counters kept on their own cache lines */
178 uint32_t ft_count __aligned(CACHE_LINE_SIZE);
179 uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE);
180 uint32_t ft_fin_wait_idle;
181 uint32_t ft_syn_idle;
182 uint32_t ft_tcp_idle;
184 } __aligned(CACHE_LINE_SIZE);
/* Cleaner kthread and per-VNET state: table list, hash jitter, UMA zones. */
186 static struct proc *flowcleanerproc;
187 static VNET_DEFINE(struct flowtable *, flow_list_head);
188 static VNET_DEFINE(uint32_t, flow_hashjitter);
189 static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
190 static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
192 #define V_flow_list_head VNET(flow_list_head)
193 #define V_flow_hashjitter VNET(flow_hashjitter)
194 #define V_flow_ipv4_zone VNET(flow_ipv4_zone)
195 #define V_flow_ipv6_zone VNET(flow_ipv6_zone)
/* Synchronization between the cleaner thread and flush/full notifications. */
198 static struct cv flowclean_f_cv;
199 static struct cv flowclean_c_cv;
200 static struct mtx flowclean_lock;
201 static uint32_t flowclean_cycles;
202 static uint32_t flowclean_freq;
/* Debug printf gated on per-table flag bits; compiles away otherwise. */
204 #ifdef FLOWTABLE_DEBUG
205 #define FLDPRINTF(ft, flags, fmt, ...) \
207 if ((ft)->ft_flags & (flags)) \
208 printf((fmt), __VA_ARGS__); \
212 #define FLDPRINTF(ft, flags, fmt, ...)
219 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
220 * to avoid extra cache evictions caused by incrementing a shared
222 * - add sysctls to resize && flush flow tables
223 * - Add per flowtable sysctls for statistics and configuring timeouts
224 * - add saturation counter to rtentry to support per-packet load-balancing
225 * add flag to indicate round-robin flow, add list lookup from head
227 * - add sysctl / device node / syscall to support exporting and importing
228 * of flows with flag to indicate that a flow was imported so should
229 * not be considered for auto-cleaning
230 * - support explicit connection state (currently only ad-hoc for DSR)
231 * - idetach() cleanup for options VIMAGE builds.
/* Per-VNET tunables controlling enable/expiry/sizing; see sysctls below. */
233 VNET_DEFINE(int, flowtable_enable) = 1;
234 static VNET_DEFINE(int, flowtable_debug);
235 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
236 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
237 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
238 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
239 static VNET_DEFINE(int, flowtable_nmbflows);
240 static VNET_DEFINE(int, flowtable_ready) = 0;
242 #define V_flowtable_enable VNET(flowtable_enable)
243 #define V_flowtable_debug VNET(flowtable_debug)
244 #define V_flowtable_syn_expire VNET(flowtable_syn_expire)
245 #define V_flowtable_udp_expire VNET(flowtable_udp_expire)
246 #define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
247 #define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
248 #define V_flowtable_nmbflows VNET(flowtable_nmbflows)
249 #define V_flowtable_ready VNET(flowtable_ready)
/* net.inet.flowtable sysctl tree exposing the tunables above. */
251 SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
252 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
253 &VNET_NAME(flowtable_debug), 0, "print debug info.");
254 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
255 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
258 * XXX This does not end up updating timeouts at runtime
259 * and only reflects the value for the last table added :-/
261 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
262 &VNET_NAME(flowtable_syn_expire), 0,
263 "seconds after which to remove syn allocated flow.");
264 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
265 &VNET_NAME(flowtable_udp_expire), 0,
266 "seconds after which to remove flow allocated to UDP.");
267 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
268 &VNET_NAME(flowtable_fin_wait_expire), 0,
269 "seconds after which to remove a flow in FIN_WAIT.");
270 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
271 &VNET_NAME(flowtable_tcp_expire), 0,
272 "seconds after which to remove flow allocated to a TCP connection.");
276 * Maximum number of flows that can be allocated of a given type.
278 * The table is allocated at boot time (for the pure caching case
279 * there is no reason why this could not be changed at runtime)
280 * and thus (currently) needs to be set with a tunable.
/*
 * Sysctl handler: the flow limit may only grow; a larger value raises the
 * cap on both UMA zones.  (Shrinking would require freeing live entries.)
 */
283 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
285 int error, newnmbflows;
287 newnmbflows = V_flowtable_nmbflows;
288 error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
289 if (error == 0 && req->newptr) {
290 if (newnmbflows > V_flowtable_nmbflows) {
291 V_flowtable_nmbflows = newnmbflows;
292 uma_zone_set_max(V_flow_ipv4_zone,
293 V_flowtable_nmbflows);
294 uma_zone_set_max(V_flow_ipv6_zone,
295 V_flowtable_nmbflows);
/* NOTE(review): the rejection path for shrinking values is not visible here. */
301 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
302 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
303 "Maximum number of flows allowed");
/* Print one ft_<field> counter of a flowtable_stats into the sbuf. */
307 #define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
/* Dump the aggregated statistics counters for one table. */
310 fs_print(struct sbuf *sb, struct flowtable_stats *fs)
313 FS_PRINT(sb, collisions);
314 FS_PRINT(sb, allocated);
315 FS_PRINT(sb, misses);
316 FS_PRINT(sb, max_depth);
317 FS_PRINT(sb, free_checks);
320 FS_PRINT(sb, lookups);
/*
 * Render one table's statistics: for per-CPU tables, sum the counters of
 * every CPU into a stack copy (taking the max of ft_max_depth rather than
 * the sum); for global tables just use slot 0 directly.
 */
324 flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
327 struct flowtable_stats fs, *pfs;
329 if (ft->ft_flags & FL_PCPU) {
330 bzero(&fs, sizeof(fs));
332 for (i = 0; i <= mp_maxid; i++) {
335 pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
336 pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
337 pfs->ft_misses += ft->ft_stats[i].ft_misses;
338 pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
339 pfs->ft_frees += ft->ft_stats[i].ft_frees;
340 pfs->ft_hits += ft->ft_stats[i].ft_hits;
341 pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
/* max depth is a high-water mark, not additive across CPUs */
342 if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
343 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
346 pfs = &ft->ft_stats[0];
/*
 * Sysctl handler: walk the per-VNET table list and emit each table's name
 * and statistics into a fixed 64KB sbuf, then copy it out.
 */
352 sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
354 struct flowtable *ft;
358 sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
360 ft = V_flow_list_head;
362 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
363 flowtable_show_stats(sb, ft);
/* +1 to include the NUL terminator in the copyout */
367 error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
372 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
373 NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
/*
 * fl_rtalloc_t adapter: discards the hash and does a plain fib-aware
 * route allocation (the multipath variant uses the hash — see ft_rtalloc
 * assignment in flowtable_alloc).
 */
378 in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
381 rtalloc_ign_fib(ro, 0, fibnum);
/*
 * Global-table locking: hash selects one of ft_lock_count mutexes.
 * ft_lock_count must be a power of two for the mask to be valid.
 */
386 flowtable_global_lock(struct flowtable *table, uint32_t hash)
388 int lock_index = (hash)&(table->ft_lock_count - 1);
390 mtx_lock(&table->ft_locks[lock_index]);
394 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
396 int lock_index = (hash)&(table->ft_lock_count - 1);
398 mtx_unlock(&table->ft_locks[lock_index]);
/* Per-CPU tables need no mutex; bodies not visible in this chunk. */
402 flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
409 flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
/* Bucket addressing and lock dispatch helpers. */
415 #define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
416 #define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
417 #define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
418 #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
/* Internal f_flags bits, above the TH_* bits reused for TCP state. */
420 #define FL_STALE (1<<8)
421 #define FL_IPV6 (1<<9)
422 #define FL_OVERWRITE (1<<10)
/* Mark a flow stale so flow_stale() evicts it on the next pass. */
425 flow_invalidate(struct flentry *fle)
428 fle->f_flags |= FL_STALE;
/* Map an IP protocol number to FL_* flag bits; body mostly not visible. */
432 proto_to_flags(uint8_t proto)
/* Inverse mapping: recover the protocol number from FL_TCP/FL_SCTP/FL_UDP. */
455 flags_to_proto(int flags)
457 int proto, protoflags;
459 protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
460 switch (protoflags) {
465 proto = IPPROTO_SCTP;
478 #ifdef FLOWTABLE_DEBUG
/*
 * Debug-only: print a v4 tuple.  With FL_HASH_ALL the full 4-tuple is
 * shown; otherwise only the destination address.
 */
480 ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
481 struct sockaddr_in *dsin)
483 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
485 if (flags & FL_HASH_ALL) {
486 inet_ntoa_r(ssin->sin_addr, saddr);
487 inet_ntoa_r(dsin->sin_addr, daddr);
488 printf("proto=%d %s:%d->%s:%d\n",
489 proto, saddr, ntohs(ssin->sin_port), daddr,
490 ntohs(dsin->sin_port));
492 inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
493 printf("proto=%d %s\n", proto, daddr);
/*
 * Extract the v4 flow tuple from an mbuf into ssin/dsin.  Addresses are
 * always copied; ports are only extracted when the table hashes the full
 * tuple (FL_HASH_ALL) and the protocol is TCP/UDP/SCTP.  Ports are kept
 * in network byte order.  Adds the protocol's FL_* bits to *flags.
 */
500 ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
501 struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
509 uint16_t sport, dport;
511 proto = sport = dport = 0;
512 ip = mtod(m, struct ip *);
513 dsin->sin_family = AF_INET;
514 dsin->sin_len = sizeof(*dsin);
515 dsin->sin_addr = ip->ip_dst;
516 ssin->sin_family = AF_INET;
517 ssin->sin_len = sizeof(*ssin);
518 ssin->sin_addr = ip->ip_src;
/* Destination-only hashing: skip the transport header entirely. */
521 if ((*flags & FL_HASH_ALL) == 0) {
522 FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
527 iphlen = ip->ip_hl << 2; /* XXX options? */
/* TCP: RST/FIN marks the flow for eager teardown (switch labels not visible) */
531 th = (struct tcphdr *)((caddr_t)ip + iphlen);
532 sport = th->th_sport;
533 dport = th->th_dport;
534 if ((*flags & FL_HASH_ALL) &&
535 (th->th_flags & (TH_RST|TH_FIN)))
539 uh = (struct udphdr *)((caddr_t)ip + iphlen);
540 sport = uh->uh_sport;
541 dport = uh->uh_dport;
544 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
545 sport = sh->src_port;
546 dport = sh->dest_port;
549 FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
551 /* no port - hence not a protocol we care about */
557 *flags |= proto_to_flags(proto);
558 ssin->sin_port = sport;
559 dsin->sin_port = dport;
/*
 * Build the 3-word v4 hash key (word0 = sport|dport, word1 = saddr,
 * word2 = daddr) and return its Jenkins hash seeded with the per-VNET
 * jitter plus the protocol number.  Returns 0 (no flow) when the
 * flowtable is disabled or not yet ready.
 */
564 ipv4_flow_lookup_hash_internal(
565 struct sockaddr_in *ssin, struct sockaddr_in *dsin,
566 uint32_t *key, uint16_t flags)
568 uint16_t sport, dport;
572 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
574 proto = flags_to_proto(flags);
575 sport = dport = key[2] = key[1] = key[0] = 0;
/* source address/port only participate for full-tuple tables */
576 if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
577 key[1] = ssin->sin_addr.s_addr;
578 sport = ssin->sin_port;
581 key[2] = dsin->sin_addr.s_addr;
582 dport = dsin->sin_port;
584 if (flags & FL_HASH_ALL) {
585 ((uint16_t *)key)[0] = sport;
586 ((uint16_t *)key)[1] = dport;
588 offset = V_flow_hashjitter + proto;
590 return (jenkins_hashword(key, 3, offset));
/*
 * mbuf front-end for v4 lookups: demarshal the packet into sockaddr form
 * and defer to the generic flowtable_lookup() with the mbuf's fib.
 */
593 static struct flentry *
594 flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
596 struct sockaddr_storage ssa, dsa;
598 struct sockaddr_in *dsin, *ssin;
600 dsin = (struct sockaddr_in *)&dsa;
601 ssin = (struct sockaddr_in *)&ssa;
602 bzero(dsin, sizeof(*dsin));
603 bzero(ssin, sizeof(*ssin));
604 flags = ft->ft_flags;
605 if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
608 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
/*
 * Populate a struct route from a cached v4 flow: destination address is
 * recovered from key word 2, rt/lle pointers are de-volatilized copies.
 */
612 flow_to_route(struct flentry *fle, struct route *ro)
614 uint32_t *hashkey = NULL;
615 struct sockaddr_in *sin;
617 sin = (struct sockaddr_in *)&ro->ro_dst;
618 sin->sin_family = AF_INET;
619 sin->sin_len = sizeof(*sin);
620 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
621 sin->sin_addr.s_addr = hashkey[2];
622 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
623 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
629 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
630 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
631 * pointer might become stale after other pullups (but we never use it
634 #define PULLUP_TO(_len, p, T) \
636 int x = (_len) + sizeof(T); \
637 if ((m)->m_len < x) { \
638 goto receive_failed; \
640 p = (mtod(m, char *) + (_len)); \
/* Typed views onto the upper-layer-protocol pointer used below. */
643 #define TCP(p) ((struct tcphdr *)(p))
644 #define SCTP(p) ((struct sctphdr *)(p))
645 #define UDP(p) ((struct udphdr *)(p))
/*
 * Extract the v6 flow tuple from an mbuf into ssin6/dsin6.  Walks the
 * IPv6 extension-header chain (hop-by-hop, routing, fragment, dstopts,
 * AH, ...) to find the upper-layer protocol; ports are only taken for
 * TCP/UDP/SCTP and only when the table hashes the full tuple
 * (FL_HASH_ALL).  Ports stay in network byte order; the protocol's FL_*
 * bits are OR'd into *flags.
 */
648 ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
649 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
654 uint16_t src_port, dst_port;
658 offset = hlen = src_port = dst_port = 0;
660 ip6 = mtod(m, struct ip6_hdr *);
661 hlen = sizeof(struct ip6_hdr);
662 proto = ip6->ip6_nxt;
/* destination-only hashing: no need to parse the extension chain */
664 if ((*flags & FL_HASH_ALL) == 0)
/* loop until the upper-layer header has been located (ulp set) */
667 while (ulp == NULL) {
670 case IPPROTO_OSPFIGP:
678 PULLUP_TO(hlen, ulp, struct tcphdr);
679 dst_port = TCP(ulp)->th_dport;
680 src_port = TCP(ulp)->th_sport;
/* RST/FIN marks the flow for eager teardown (handling lines not visible) */
681 if ((*flags & FL_HASH_ALL) &&
682 (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
686 PULLUP_TO(hlen, ulp, struct sctphdr);
687 src_port = SCTP(ulp)->src_port;
688 dst_port = SCTP(ulp)->dest_port;
691 PULLUP_TO(hlen, ulp, struct udphdr);
692 dst_port = UDP(ulp)->uh_dport;
693 src_port = UDP(ulp)->uh_sport;
695 case IPPROTO_HOPOPTS: /* RFC 2460 */
696 PULLUP_TO(hlen, ulp, struct ip6_hbh);
697 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
698 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
701 case IPPROTO_ROUTING: /* RFC 2460 */
702 PULLUP_TO(hlen, ulp, struct ip6_rthdr);
703 hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
704 proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
707 case IPPROTO_FRAGMENT: /* RFC 2460 */
708 PULLUP_TO(hlen, ulp, struct ip6_frag);
709 hlen += sizeof (struct ip6_frag);
710 proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
711 offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
715 case IPPROTO_DSTOPTS: /* RFC 2460 */
716 PULLUP_TO(hlen, ulp, struct ip6_hbh);
717 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
718 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
721 case IPPROTO_AH: /* RFC 2402 */
722 PULLUP_TO(hlen, ulp, struct ip6_ext);
/* AH length is in 32-bit words minus 2, hence the different arithmetic */
723 hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
724 proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
728 PULLUP_TO(hlen, ulp, struct ip6_ext);
/* fill in the sockaddrs from the fixed IPv6 header */
739 dsin6->sin6_family = AF_INET6;
740 dsin6->sin6_len = sizeof(*dsin6);
741 dsin6->sin6_port = dst_port;
742 memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
744 ssin6->sin6_family = AF_INET6;
745 ssin6->sin6_len = sizeof(*ssin6);
746 ssin6->sin6_port = src_port;
747 memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
748 *flags |= proto_to_flags(proto);
/*
 * zero_key clears the 9-word v6 key (continuation lines not visible).
 * ipv6_flow_lookup_hash_internal builds that key — word0 = sport|dport,
 * words 1-4 = daddr, words 5-8 = saddr — and returns its Jenkins hash
 * seeded with the per-VNET jitter plus the protocol.  Returns 0 when
 * the flowtable is disabled or not ready.
 */
753 #define zero_key(key) \
767 ipv6_flow_lookup_hash_internal(
768 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
769 uint32_t *key, uint16_t flags)
771 uint16_t sport, dport;
775 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
778 proto = flags_to_proto(flags);
782 memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
783 dport = dsin6->sin6_port;
/* source address/port only participate for full-tuple tables */
785 if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
786 memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
787 sport = ssin6->sin6_port;
789 if (flags & FL_HASH_ALL) {
790 ((uint16_t *)key)[0] = sport;
791 ((uint16_t *)key)[1] = dport;
793 offset = V_flow_hashjitter + proto;
795 return (jenkins_hashword(key, 9, offset));
/*
 * mbuf front-end for v6 lookups: demarshal the packet into sockaddr form
 * and defer to the generic flowtable_lookup() with the mbuf's fib.
 */
798 static struct flentry *
799 flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
801 struct sockaddr_storage ssa, dsa;
802 struct sockaddr_in6 *dsin6, *ssin6;
805 dsin6 = (struct sockaddr_in6 *)&dsa;
806 ssin6 = (struct sockaddr_in6 *)&ssa;
807 bzero(dsin6, sizeof(*dsin6));
808 bzero(ssin6, sizeof(*ssin6));
809 flags = ft->ft_flags;
811 if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
814 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
/*
 * Populate a route_in6 from a cached v6 flow: destination address comes
 * from key words 5-8, rt/lle pointers are de-volatilized copies.
 * NOTE(review): key words 5-8 hold the *source* address in the hash
 * builder above — verify which end this is meant to recover.
 */
818 flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
820 uint32_t *hashkey = NULL;
821 struct sockaddr_in6 *sin6;
823 sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
825 sin6->sin6_family = AF_INET6;
826 sin6->sin6_len = sizeof(*sin6);
827 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
828 memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
829 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
830 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
/* Return the occupancy bitmask: this CPU's for FL_PCPU tables, else [0]. */
836 flowtable_mask(struct flowtable *ft)
840 if (ft->ft_flags & FL_PCPU)
841 mask = ft->ft_masks[curcpu];
843 mask = ft->ft_masks[0];
/*
 * Return the address of the bucket head for `hash', picking this CPU's
 * table for FL_PCPU tables and the shared one otherwise.
 */
848 static struct flentry **
849 flowtable_entry(struct flowtable *ft, uint32_t hash)
851 struct flentry **fle;
852 int index = (hash % ft->ft_size);
854 if (ft->ft_flags & FL_PCPU) {
855 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
856 fle = &ft->ft_table.pcpu[curcpu][index];
858 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
859 fle = &ft->ft_table.global[index];
/*
 * Decide whether a flow entry may be evicted: true when it was never
 * hashed, its route is down/host-less, it was explicitly invalidated
 * (FL_STALE), or it has been idle longer than the timeout matching its
 * TCP handshake state (the TH_SYN/TH_ACK/TH_FIN bits cached in f_flags):
 * no TCP bits -> udp idle, FIN seen -> fin_wait idle, SYN only -> syn
 * idle, SYN|ACK -> established tcp idle.
 */
866 flow_stale(struct flowtable *ft, struct flentry *fle)
870 if ((fle->f_fhash == 0)
871 || ((fle->f_rt->rt_flags & RTF_HOST) &&
872 ((fle->f_rt->rt_flags & (RTF_UP))
874 || (fle->f_rt->rt_ifp == NULL)
875 || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
878 idle_time = time_uptime - fle->f_uptime;
880 if ((fle->f_flags & FL_STALE) ||
881 ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
882 && (idle_time > ft->ft_udp_idle)) ||
883 ((fle->f_flags & TH_FIN)
884 && (idle_time > ft->ft_fin_wait_idle)) ||
885 ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
886 && (idle_time > ft->ft_syn_idle)) ||
887 ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
888 && (idle_time > ft->ft_tcp_idle)) ||
/* route going down at any state also forces eviction */
889 ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
890 (fle->f_rt->rt_ifp == NULL)))
/*
 * Copy the computed hash key into the entry's address-family-specific
 * key storage (3 words for v4, 9 for v6).
 * NOTE(review): as visible here, the FL_IPV6 branch takes the
 * flentry_v4 key and the other branch the flentry_v6 key — this looks
 * inverted; intermediate lines are missing, so confirm the branch
 * bodies against the full source.
 */
897 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
902 if (fle->f_flags & FL_IPV6) {
904 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
907 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
910 for (i = 0; i < nwords; i++)
/*
 * Allocate a flow entry from the zone matching the table's address
 * family and bump the table's entry count (count is adjusted before the
 * NULL check lines, which are not visible here).
 */
914 static struct flentry *
915 flow_alloc(struct flowtable *ft)
917 struct flentry *newfle;
921 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
923 newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
925 atomic_add_int(&ft->ft_count, 1);
/* Release a flow entry back to its zone and drop the count. */
930 flow_free(struct flentry *fle, struct flowtable *ft)
934 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
935 atomic_add_int(&ft->ft_count, -1);
936 uma_zfree(zone, fle);
/*
 * Hysteresis check on table occupancy against V_flowtable_nmbflows:
 * "full" clears below 7/8 of the limit and sets above 31/32.  On the
 * full transition the cleaner is sped up (4*hz) and, for transmit-cache
 * tables (no FL_HASH_ALL), all idle timeouts collapse to 5s; the
 * not-full transition restores 20*hz and 30s.
 */
940 flow_full(struct flowtable *ft)
946 count = ft->ft_count;
948 if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
950 else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
953 if (full && !ft->ft_full) {
954 flowclean_freq = 4*hz;
955 if ((ft->ft_flags & FL_HASH_ALL) == 0)
956 ft->ft_udp_idle = ft->ft_fin_wait_idle =
957 ft->ft_syn_idle = ft->ft_tcp_idle = 5;
/* kick the cleaner thread immediately */
958 cv_broadcast(&flowclean_c_cv);
959 } else if (!full && ft->ft_full) {
960 flowclean_freq = 20*hz;
961 if ((ft->ft_flags & FL_HASH_ALL) == 0)
962 ft->ft_udp_idle = ft->ft_fin_wait_idle =
963 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
966 return (ft->ft_full);
/*
 * Insert a new flow keyed by `hash'/`key' into the table, taking
 * ownership of the route/llentry references in `ro'.  Under the bucket
 * lock: empty bucket -> set the occupancy bit and install directly;
 * otherwise walk the chain — if a live entry with the same hash already
 * exists the allocation is freed (EEXIST race unless FL_OVERWRITE),
 * else the new entry is appended at the tail and the per-CPU max-depth
 * statistic updated.
 */
970 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
971 uint32_t fibnum, struct route *ro, uint16_t flags)
973 struct flentry *fle, *fletail, *newfle, **flep;
974 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
979 newfle = flow_alloc(ft);
983 newfle->f_flags |= (flags & FL_IPV6);
984 proto = flags_to_proto(flags);
986 FL_ENTRY_LOCK(ft, hash);
987 mask = flowtable_mask(ft);
988 flep = flowtable_entry(ft, hash);
989 fletail = fle = *flep;
/* empty bucket: mark occupancy and install the new entry as the head */
992 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
993 *flep = fle = newfle;
1000 * find end of list and make sure that we were not
1001 * preempted by another thread handling this flow
1003 while (fle != NULL) {
1004 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
1006 * there was either a hash collision
1007 * or we lost a race to insert
1009 FL_ENTRY_UNLOCK(ft, hash);
1010 flow_free(newfle, ft);
1012 if (flags & FL_OVERWRITE)
1017 * re-visit this double condition XXX
/* NOTE(review): advancing fletail from fle->f_next under a fletail test
 * looks suspect (the original author flagged it XXX too) — verify. */
1019 if (fletail->f_next != NULL)
1020 fletail = fle->f_next;
1026 if (depth > fs->ft_max_depth)
1027 fs->ft_max_depth = depth;
1028 fletail->f_next = newfle;
/* fill in the entry fields; f_fhash last lines not visible past here */
1031 flowtable_set_hashkey(fle, key);
1033 fle->f_proto = proto;
1034 fle->f_rt = ro->ro_rt;
1035 fle->f_lle = ro->ro_lle;
1036 fle->f_fhash = hash;
1037 fle->f_fibnum = fibnum;
1038 fle->f_uptime = time_uptime;
1039 FL_ENTRY_UNLOCK(ft, hash);
/*
 * Public insertion API: compute the hash for the given sockaddr pair
 * (v4 or v6) with FL_OVERWRITE forced, require both a route and an
 * llentry in `ro', and hand off to flowtable_insert().
 */
1044 kern_flowtable_insert(struct flowtable *ft,
1045 struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1046 struct route *ro, uint32_t fibnum, int flags)
1048 uint32_t key[9], hash;
1050 flags = (ft->ft_flags | flags | FL_OVERWRITE);
1054 if (ssa->ss_family == AF_INET)
1055 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1056 (struct sockaddr_in *)dsa, key, flags);
1059 if (ssa->ss_family == AF_INET6)
1060 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1061 (struct sockaddr_in6 *)dsa, key, flags);
/* both the rtentry and llentry must be supplied by the caller */
1063 if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1066 FLDPRINTF(ft, FL_DEBUG,
1067 "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1068 key[0], key[1], key[2], hash, fibnum, flags);
1069 return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
/*
 * Word-wise compare of the entry's stored key against `key' (3 words
 * for v4, 9 for v6).
 * NOTE(review): as in flowtable_set_hashkey above, the FL_IPV6 branch
 * appears to pick the flentry_v4 key — intermediate lines are missing,
 * confirm against the full source.
 */
1073 flowtable_key_equal(struct flentry *fle, uint32_t *key)
1078 if (fle->f_flags & FL_IPV6) {
1080 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1083 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1086 for (i = 0; i < nwords; i++)
1087 if (hashkey[i] != key[i])
/*
 * Address-family dispatcher for mbuf lookups.  On a hit, stamps the
 * flow hash into the mbuf (M_FLOWID) if the packet doesn't carry one.
 */
1094 flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
1096 struct flentry *fle = NULL;
1100 fle = flowtable_lookup_mbuf4(ft, m);
1104 fle = flowtable_lookup_mbuf6(ft, m);
1106 if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
1107 m->m_flags |= M_FLOWID;
1108 m->m_pkthdr.flowid = fle->f_fhash;
/*
 * Core lookup: hash the sockaddr pair, search the bucket chain for a
 * live entry matching (hash, key, proto, fibnum) with a usable route;
 * on a hit, refresh f_uptime and return the entry.  On a miss (unless
 * FL_NOAUTO or the table is full), allocate a route + llentry for the
 * destination and insert a new flow.  Loopback and hairpin v4 traffic
 * is never cached.
 */
1114 flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
1115 struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
1117 uint32_t key[9], hash;
1118 struct flentry *fle;
1119 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1123 struct llentry *lle;
1124 struct route sro, *ro;
1125 struct route_in6 sro6;
1127 sro.ro_rt = sro6.ro_rt = NULL;
1128 sro.ro_lle = sro6.ro_lle = NULL;
1131 flags |= ft->ft_flags;
1132 proto = flags_to_proto(flags);
1134 if (ssa->ss_family == AF_INET) {
1135 struct sockaddr_in *ssin, *dsin;
1138 memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
1140 * The harvested source and destination addresses
1141 * may contain port information if the packet is
1142 * from a transport protocol (e.g. TCP/UDP). The
1143 * port field must be cleared before performing
1146 ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
1147 dsin = (struct sockaddr_in *)dsa;
1148 ssin = (struct sockaddr_in *)ssa;
/* refuse to cache hairpin (src==dst) and 127/8 loopback traffic */
1149 if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
1150 (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1151 (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
1154 hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
1158 if (ssa->ss_family == AF_INET6) {
1159 struct sockaddr_in6 *ssin6, *dsin6;
1161 ro = (struct route *)&sro6;
1162 memcpy(&sro6.ro_dst, dsa,
1163 sizeof(struct sockaddr_in6));
1164 ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
1165 dsin6 = (struct sockaddr_in6 *)dsa;
1166 ssin6 = (struct sockaddr_in6 *)ssa;
1169 hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
1173 * Ports are zero and this isn't a transmit cache
1174 * - thus not a protocol for which we need to keep
1176 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
1178 if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
1182 FL_ENTRY_LOCK(ft, hash);
1183 if ((fle = FL_ENTRY(ft, hash)) == NULL) {
1184 FL_ENTRY_UNLOCK(ft, hash);
/* chain walk: match on hash, full key, proto, fib and a live route */
1188 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1189 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1191 && fle->f_fhash == hash
1192 && flowtable_key_equal(fle, key)
1193 && (proto == fle->f_proto)
1194 && (fibnum == fle->f_fibnum)
1195 && (rt->rt_flags & RTF_UP)
1196 && (rt->rt_ifp != NULL)) {
1198 fle->f_uptime = time_uptime;
1199 fle->f_flags |= flags;
1200 FL_ENTRY_UNLOCK(ft, hash);
1202 } else if (fle->f_next != NULL) {
1206 FL_ENTRY_UNLOCK(ft, hash);
/* miss path: bail out if auto-creation is off or the table is full */
1208 if (flags & FL_NOAUTO || flow_full(ft))
1213 * This bit of code ends up locking the
1214 * same route 3 times (just like ip_output + ether_output)
1216 * - in rt_check when called by arpresolve
1217 * - dropping the refcount for the rtentry
1219 * This could be consolidated to one if we wrote a variant
1220 * of arpresolve with an rt_check variant that expected to
1221 * receive the route locked
1225 if ((ro->ro_dst.sa_family != AF_INET) &&
1226 (ro->ro_dst.sa_family != AF_INET6))
1227 panic("sa_family == %d\n", ro->ro_dst.sa_family);
1230 ft->ft_rtalloc(ro, hash, fibnum);
1231 if (ro->ro_rt == NULL)
1232 error = ENETUNREACH;
1234 struct llentry *lle = NULL;
1235 struct sockaddr_storage *l3addr;
1236 struct rtentry *rt = ro->ro_rt;
1237 struct ifnet *ifp = rt->rt_ifp;
/* point-to-point/loopback interfaces have no L2 entry to resolve */
1239 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
1245 if (ssa->ss_family == AF_INET6) {
1246 struct sockaddr_in6 *dsin6;
1248 dsin6 = (struct sockaddr_in6 *)dsa;
1249 if (in6_localaddr(&dsin6->sin6_addr)) {
/* resolve the gateway if routed, else the destination itself */
1255 if (rt->rt_flags & RTF_GATEWAY)
1256 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1259 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1260 llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp);
1264 if (ssa->ss_family == AF_INET) {
1265 if (rt->rt_flags & RTF_GATEWAY)
1266 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1268 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1269 llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
1280 error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
/* NOTE(review): `fle' here is only meaningful on the insert-success
 * path; intermediate lines assigning it are not visible. */
1290 return ((error) ? NULL : fle);
1294 * used by the bit_alloc macro
/* bit_alloc(9) expands to calloc(); map it onto kernel malloc. */
1296 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
/*
 * Create a flowtable with `nentry' buckets.  Picks multipath vs plain
 * route allocation, sets up per-CPU tables and bitmasks (FL_PCPU) or a
 * global table with a power-of-two array of bucket mutexes, seeds the
 * idle timeouts, and links the table onto the per-VNET cleaner list.
 */
1299 flowtable_alloc(char *name, int nentry, int flags)
1301 struct flowtable *ft, *fttail;
/* first table created seeds the per-VNET hash jitter */
1304 if (V_flow_hashjitter == 0)
1305 V_flow_hashjitter = arc4random();
1307 KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
1309 ft = malloc(sizeof(struct flowtable),
1310 M_RTABLE, M_WAITOK | M_ZERO);
1313 ft->ft_flags = flags;
1314 ft->ft_size = nentry;
1316 ft->ft_rtalloc = rtalloc_mpath_fib;
1318 ft->ft_rtalloc = in_rtalloc_ign_wrapper;
1320 if (flags & FL_PCPU) {
1321 ft->ft_lock = flowtable_pcpu_lock;
1322 ft->ft_unlock = flowtable_pcpu_unlock;
1324 for (i = 0; i <= mp_maxid; i++) {
1325 ft->ft_table.pcpu[i] =
1326 malloc(nentry*sizeof(struct flentry *),
1327 M_RTABLE, M_WAITOK | M_ZERO);
1328 ft->ft_masks[i] = bit_alloc(nentry);
/* lock count rounded so the hash&(count-1) masking in
 * flowtable_global_lock() is valid */
1331 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
1332 (fls(mp_maxid + 1) << 1));
1334 ft->ft_lock = flowtable_global_lock;
1335 ft->ft_unlock = flowtable_global_unlock;
1336 ft->ft_table.global =
1337 malloc(nentry*sizeof(struct flentry *),
1338 M_RTABLE, M_WAITOK | M_ZERO);
1339 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
1340 M_RTABLE, M_WAITOK | M_ZERO);
1341 for (i = 0; i < ft->ft_lock_count; i++)
1342 mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
1344 ft->ft_masks[0] = bit_alloc(nentry);
1346 ft->ft_tmpmask = bit_alloc(nentry);
1349 * In the local transmit case the table truly is
1350 * just a cache - so everything is eligible for
1351 * replacement after 5s of non-use
1353 if (flags & FL_HASH_ALL) {
1354 ft->ft_udp_idle = V_flowtable_udp_expire;
1355 ft->ft_syn_idle = V_flowtable_syn_expire;
1356 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
/* NOTE(review): tcp idle is seeded from the fin_wait tunable, not
 * V_flowtable_tcp_expire — looks like a copy/paste defect; confirm. */
1357 ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
1359 ft->ft_udp_idle = ft->ft_fin_wait_idle =
1360 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
1365 * hook in to the cleaner list
1367 if (V_flow_list_head == NULL)
1368 V_flow_list_head = ft;
1370 fttail = V_flow_list_head;
1371 while (fttail->ft_next != NULL)
1372 fttail = fttail->ft_next;
1373 fttail->ft_next = ft;
1380 * The rest of the code is devoted to garbage collection of expired entries.
1381 * It is a new addition made necessary by the switch to dynamically allocating
/*
 * Release one flow entry: drop its rtentry/llentry references (those
 * release calls are not visible in this chunk) and free the entry.
 */
1386 fle_free(struct flentry *fle, struct flowtable *ft)
1389 struct llentry *lle;
1391 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1392 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
/*
 * Scan one table (the current CPU's view for FL_PCPU tables) and unlink
 * every stale entry — or, when `rt' is non-NULL, every entry bound to
 * that specific route.  Works bucket-by-bucket using a snapshot of the
 * occupancy bitmask; unlinked entries are collected on a private list
 * and freed after the bucket locks are dropped.
 */
1399 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
1401 int curbit = 0, count;
1402 struct flentry *fle, **flehead, *fleprev;
1403 struct flentry *flefreehead, *flefreetail, *fletmp;
1404 bitstr_t *mask, *tmpmask;
1405 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1407 flefreehead = flefreetail = NULL;
1408 mask = flowtable_mask(ft);
1409 tmpmask = ft->ft_tmpmask;
/* snapshot the mask so bits can be cleared as buckets are visited */
1410 memcpy(tmpmask, mask, ft->ft_size/8);
1412 * XXX Note to self, bit_ffs operates at the byte level
1413 * and thus adds gratuitous overhead
1415 bit_ffs(tmpmask, ft->ft_size, &curbit);
1416 while (curbit != -1) {
1417 if (curbit >= ft->ft_size || curbit < -1) {
1419 "warning: bad curbit value %d \n",
1424 FL_ENTRY_LOCK(ft, curbit);
1425 flehead = flowtable_entry(ft, curbit);
1426 fle = fleprev = *flehead;
1428 fs->ft_free_checks++;
/* occupancy bit set but bucket empty — bookkeeping bug elsewhere */
1430 if (fle == NULL && curbit > 0) {
1432 "warning bit=%d set, but no fle found\n",
1436 while (fle != NULL) {
/* route-targeted flush: skip entries on other routes */
1438 if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
1443 } else if (!flow_stale(ft, fle)) {
1449 * delete head of the list
1451 if (fleprev == *flehead) {
1453 if (fle == fleprev) {
1454 fleprev = *flehead = fle->f_next;
1456 fleprev = *flehead = fle;
1460 * don't advance fleprev
1463 fleprev->f_next = fle->f_next;
1464 fle = fleprev->f_next;
/* append the unlinked entry to the private free list */
1467 if (flefreehead == NULL)
1468 flefreehead = flefreetail = fletmp;
1470 flefreetail->f_next = fletmp;
1471 flefreetail = fletmp;
1473 fletmp->f_next = NULL;
1475 if (*flehead == NULL)
1476 bit_clear(mask, curbit);
1477 FL_ENTRY_UNLOCK(ft, curbit);
1478 bit_clear(tmpmask, curbit);
1479 bit_ffs(tmpmask, ft->ft_size, &curbit);
/* now free the collected entries outside any bucket lock */
1482 while ((fle = flefreehead) != NULL) {
1483 flefreehead = fle->f_next;
1488 if (V_flowtable_debug && count)
1489 log(LOG_DEBUG, "freed %d flow entries\n", count);
/*
 * Purge every entry in 'ft' that references route 'rt'.
 *
 * For a per-CPU table (FL_PCPU) the current thread is temporarily bound
 * to each CPU in turn (only once SMP is started) so that
 * flowtable_free_stale() runs against that CPU's private table and
 * stats; for a global table a single sweep suffices.
 */
1493 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1497 if (ft->ft_flags & FL_PCPU) {
1498 for (i = 0; i <= mp_maxid; i++) {
1502 if (smp_started == 1) {
1503 thread_lock(curthread);
1504 sched_bind(curthread, i);
1505 thread_unlock(curthread);
1508 flowtable_free_stale(ft, rt);
/* Undo the CPU binding before moving to the next CPU. */
1510 if (smp_started == 1) {
1511 thread_lock(curthread);
1512 sched_unbind(curthread);
1513 thread_unlock(curthread);
1517 flowtable_free_stale(ft, rt);
/*
 * Expire stale entries in every flowtable of the current vnet.
 *
 * Walks the V_flow_list_head linked list; passing rt == NULL to
 * flowtable_free_stale() makes the sweep an age-based expiry pass rather
 * than a route-specific flush.  Per-CPU tables are handled by binding
 * the thread to each CPU, mirroring flowtable_route_flush().
 */
1522 flowtable_clean_vnet(void)
1524 struct flowtable *ft;
1527 ft = V_flow_list_head;
1528 while (ft != NULL) {
1529 if (ft->ft_flags & FL_PCPU) {
1530 for (i = 0; i <= mp_maxid; i++) {
1534 if (smp_started == 1) {
1535 thread_lock(curthread);
1536 sched_bind(curthread, i);
1537 thread_unlock(curthread);
1540 flowtable_free_stale(ft, NULL);
1542 if (smp_started == 1) {
1543 thread_lock(curthread);
1544 sched_unbind(curthread);
1545 thread_unlock(curthread);
1549 flowtable_free_stale(ft, NULL);
/*
 * Main loop of the flowtable cleaner kernel thread (started via flow_kp
 * / kproc_start).  Each pass iterates all vnets, cleaning every
 * flowtable, then wakes any flusher waiting on flowclean_f_cv and sleeps
 * on flowclean_c_cv for flowclean_freq ticks (or until kicked early by
 * flowtable_flush()).
 * NOTE(review): the "10 second" comment below looks stale --
 * flowtable_init() sets flowclean_freq = 20*hz; confirm which is intended.
 */
1556 flowtable_cleaner(void)
1558 VNET_ITERATOR_DECL(vnet_iter);
1561 log(LOG_INFO, "flowtable cleaner started\n");
1564 VNET_FOREACH(vnet_iter) {
1565 CURVNET_SET(vnet_iter);
1566 flowtable_clean_vnet();
1569 VNET_LIST_RUNLOCK();
1572 * The 10 second interval between cleaning checks
1575 mtx_lock(&flowclean_lock);
1577 cv_broadcast(&flowclean_f_cv);
1578 cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
1579 mtx_unlock(&flowclean_lock);
/*
 * Synchronously wait for the cleaner to complete a full cycle.
 * Registered as the ifnet_departure_event handler in flowtable_init().
 *
 * Kicks the cleaner via flowclean_c_cv and sleeps on flowclean_f_cv
 * until flowclean_cycles advances past the value sampled at entry --
 * i.e. until at least one complete cleaning pass has run after the call.
 */
1584 flowtable_flush(void *unused __unused)
1588 mtx_lock(&flowclean_lock);
1589 start = flowclean_cycles;
1590 while (start == flowclean_cycles) {
1591 cv_broadcast(&flowclean_c_cv);
1592 cv_wait(&flowclean_f_cv, &flowclean_lock);
1594 mtx_unlock(&flowclean_lock);
/*
 * Kernel-process descriptor for the flowtable cleaner; kproc_start()
 * launches it at SI_SUB_KTHREAD_IDLE via the SYSINIT below.
 */
1597 static struct kproc_desc flow_kp = {
1602 SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
/*
 * Per-vnet initialization, run at SI_SUB_SMP via the VNET_SYSINIT below.
 * Scales the per-vnet flow-entry limit with maxusers and CPU count,
 * creates the IPv4/IPv6 flow-entry UMA zones (64-byte alignment,
 * MAXBUCKET caching), caps both zones at the computed limit, and marks
 * the flowtable subsystem ready for use.
 */
1605 flowtable_init_vnet(const void *unused __unused)
1608 V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
1609 V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
1610 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1611 V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
1612 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1613 uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
1614 uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
1615 V_flowtable_ready = 1;
1617 VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
1618 flowtable_init_vnet, NULL);
/*
 * Global one-time initialization, run at SI_SUB_KTHREAD_INIT.
 * Sets up the cleaner/flusher condition variables and their mutex,
 * registers flowtable_flush() on interface departure so routes cached
 * through a vanishing ifnet are purged, and sets the cleaner interval
 * to 20 seconds (20*hz ticks).
 */
1621 flowtable_init(const void *unused __unused)
1624 cv_init(&flowclean_c_cv, "c_flowcleanwait");
1625 cv_init(&flowclean_f_cv, "f_flowcleanwait");
1626 mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
1627 EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
1628 EVENTHANDLER_PRI_ANY);
1629 flowclean_freq = 20*hz;
1631 SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
1632 flowtable_init, NULL);
/*
 * Per-vnet teardown (VIMAGE builds): mark the subsystem not-ready so new
 * lookups stop, then destroy the per-vnet flow-entry zones.
 */
1637 flowtable_uninit(const void *unused __unused)
1640 V_flowtable_ready = 0;
1641 uma_zdestroy(V_flow_ipv4_zone);
1642 uma_zdestroy(V_flow_ipv6_zone);
1645 VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1646 flowtable_uninit, NULL);
/*
 * Return a pointer to the raw hash-key words (ipf_key) embedded in a
 * flow entry, for the DDB show helpers.
 * NOTE(review): the casts look inverted -- FL_IPV6 selects the
 * flentry_v4 cast and vice versa.  This is only harmless if ipf_key
 * sits at the same offset in both struct layouts; confirm against the
 * struct definitions, or swap the casts.
 */
1651 flowtable_get_hashkey(struct flentry *fle)
1655 if (fle->f_flags & FL_IPV6)
1656 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1658 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
/*
 * Return the occupied-bit mask for the given CPU's slice of a per-CPU
 * table; global (non-FL_PCPU) tables keep their single mask in slot 0.
 */
1664 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1668 if (ft->ft_flags & FL_PCPU)
1669 mask = ft->ft_masks[cpuid];
1671 mask = ft->ft_masks[0];
/*
 * Return the address of the hash-bucket head slot for 'hash' in the
 * given CPU's slice of a per-CPU table, or in the shared global table
 * otherwise.  The bucket index is simply hash % ft_size.
 */
1676 static struct flentry **
1677 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1679 struct flentry **fle;
1680 int index = (hash % ft->ft_size);
1682 if (ft->ft_flags & FL_PCPU) {
1683 fle = &ft->ft_table.pcpu[cpuid][index];
1685 fle = &ft->ft_table.global[index];
/*
 * DDB helper: pretty-print a single flow entry -- addresses/ports,
 * flag bits, route and interface state, raw key words and hash.
 * Runs from the in-kernel debugger, hence db_printf and no locking.
 */
1692 flow_show(struct flowtable *ft, struct flentry *fle)
1695 int rt_valid, ifp_valid;
1696 uint16_t sport, dport;
1698 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
1699 volatile struct rtentry *rt;
1700 struct ifnet *ifp = NULL;
1702 idle_time = (int)(time_uptime - fle->f_uptime);
/* NOTE(review): the loads of rt (and ifp) are on lines not visible here. */
1704 rt_valid = rt != NULL;
1707 ifp_valid = ifp != NULL;
1708 hashkey = flowtable_get_hashkey(fle);
/* IPv6 entries presumably skip the v4 address formatting below -- the
 * branch body is on lines not visible here; confirm. */
1709 if (fle->f_flags & FL_IPV6)
1712 inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
/* FL_HASH_ALL tables key on the full 4-tuple, so source address and
 * ports are available to print as well. */
1713 if (ft->ft_flags & FL_HASH_ALL) {
1714 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
1715 sport = ntohs(((uint16_t *)hashkey)[0]);
1716 dport = ntohs(((uint16_t *)hashkey)[1]);
1717 db_printf("%s:%d->%s:%d",
1718 saddr, sport, daddr,
1721 db_printf("%s ", daddr);
/* Decode the entry's flag bits. */
1724 if (fle->f_flags & FL_STALE)
1725 db_printf(" FL_STALE ");
1726 if (fle->f_flags & FL_TCP)
1727 db_printf(" FL_TCP ");
1728 if (fle->f_flags & FL_UDP)
1729 db_printf(" FL_UDP ");
1731 if (rt->rt_flags & RTF_UP)
1732 db_printf(" RTF_UP ");
1735 if (ifp->if_flags & IFF_LOOPBACK)
1736 db_printf(" IFF_LOOPBACK ");
1737 if (ifp->if_flags & IFF_UP)
1738 db_printf(" IFF_UP ");
1739 if (ifp->if_flags & IFF_POINTOPOINT)
1740 db_printf(" IFF_POINTOPOINT ");
/* Dump the raw key: 9 words for IPv6, 3 for IPv4. */
1742 if (fle->f_flags & FL_IPV6)
1743 db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
1744 hashkey[0], hashkey[1], hashkey[2],
1745 hashkey[3], hashkey[4], hashkey[5],
1746 hashkey[6], hashkey[7], hashkey[8]);
1748 db_printf("\n\tkey=%08x:%08x:%08x ",
1749 hashkey[0], hashkey[1], hashkey[2]);
1750 db_printf("hash=%08x idle_time=%03d"
1751 "\n\tfibnum=%02d rt=%p",
1752 fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
/*
 * DDB helper: dump every entry of one CPU's slice of a flowtable
 * (cpuid of -1 is passed for global tables by flowtable_show_vnet()).
 * Iterates occupied buckets via a scratch copy of the bit mask, the
 * same traversal pattern flowtable_free_stale() uses.
 */
1757 flowtable_show(struct flowtable *ft, int cpuid)
1760 struct flentry *fle, **flehead;
1761 bitstr_t *mask, *tmpmask;
1764 db_printf("cpu: %d\n", cpuid);
1765 mask = flowtable_mask_pcpu(ft, cpuid);
1766 tmpmask = ft->ft_tmpmask;
/* Scratch copy so bits can be cleared as buckets are visited. */
1767 memcpy(tmpmask, mask, ft->ft_size/8);
1769 * XXX Note to self, bit_ffs operates at the byte level
1770 * and thus adds gratuitous overhead
1772 bit_ffs(tmpmask, ft->ft_size, &curbit);
1773 while (curbit != -1) {
1774 if (curbit >= ft->ft_size || curbit < -1) {
1775 db_printf("warning: bad curbit value %d \n",
1780 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1783 while (fle != NULL) {
1788 bit_clear(tmpmask, curbit);
1789 bit_ffs(tmpmask, ft->ft_size, &curbit);
/*
 * DDB helper: dump every flowtable registered in the current vnet.
 * Per-CPU tables are shown one CPU slice at a time; global tables are
 * dumped with the sentinel cpuid of -1.
 */
1794 flowtable_show_vnet(void)
1796 struct flowtable *ft;
1799 ft = V_flow_list_head;
1800 while (ft != NULL) {
1801 printf("name: %s\n", ft->ft_name);
1802 if (ft->ft_flags & FL_PCPU) {
1803 for (i = 0; i <= mp_maxid; i++) {
1806 flowtable_show(ft, i);
1809 flowtable_show(ft, -1);
1815 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1817 VNET_ITERATOR_DECL(vnet_iter);
1819 VNET_FOREACH(vnet_iter) {
1820 CURVNET_SET(vnet_iter);
1822 db_printf("vnet %p\n", vnet_iter);
1824 flowtable_show_vnet();