/**************************************************************************

Copyright (c) 2008-2009, BitGravity Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the BitGravity Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
#include "opt_route.h"
#include "opt_mpath.h"
#include "opt_ddb.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitstring.h>
#include <sys/condvar.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/flowtable.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

#include <vm/uma.h>

#include <libkern/jenkins.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif
struct ipv4_tuple {
	uint16_t	ip_sport;	/* source port */
	uint16_t	ip_dport;	/* destination port */
	in_addr_t	ip_saddr;	/* source address */
	in_addr_t	ip_daddr;	/* destination address */
};

union ipv4_flow {
	struct ipv4_tuple ipf_ipt;
	uint32_t	ipf_key[3];
};

struct ipv6_tuple {
	uint16_t	ip_sport;	/* source port */
	uint16_t	ip_dport;	/* destination port */
	struct in6_addr	ip_saddr;	/* source address */
	struct in6_addr	ip_daddr;	/* destination address */
};

union ipv6_flow {
	struct ipv6_tuple ipf_ipt;
	uint32_t	ipf_key[9];
};

struct flentry {
	volatile uint32_t	f_fhash;	/* hash flowing forward */
	uint16_t		f_flags;	/* flow flags */
	uint8_t			f_pad;
	uint8_t			f_proto;	/* protocol */
	uint32_t		f_fibnum;	/* fib index */
	uint32_t		f_uptime;	/* uptime at last access */
	struct flentry		*f_next;	/* pointer to collision entry */
	volatile struct rtentry *f_rt;		/* rtentry for flow */
	volatile struct llentry *f_lle;		/* llentry for flow */
};

struct flentry_v4 {
	struct flentry	fl_entry;
	union ipv4_flow	fl_flow;
};

struct flentry_v6 {
	struct flentry	fl_entry;
	union ipv6_flow	fl_flow;
};

#define	fl_fhash	fl_entry.f_fhash
#define	fl_flags	fl_entry.f_flags
#define	fl_proto	fl_entry.f_proto
#define	fl_uptime	fl_entry.f_uptime
#define	fl_rt		fl_entry.f_rt
#define	fl_lle		fl_entry.f_lle
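
/*
 * Layout note: struct flentry above is the common header embedded at the
 * start of both flentry_v4 and flentry_v6, so a struct flentry * can be
 * cast to either variant once (f_flags & FL_IPV6) has been checked; the
 * fl_* accessor macros reach the shared header fields through either
 * wrapper type.
 */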
#define	SECS_PER_HOUR		3600
#define	SECS_PER_DAY		(24*SECS_PER_HOUR)

#define	SYN_IDLE		300
#define	UDP_IDLE		300
#define	FIN_WAIT_IDLE		600
#define	TCP_IDLE		SECS_PER_DAY
typedef	void fl_lock_t(struct flowtable *, uint32_t);
typedef	void fl_rtalloc_t(struct route *, uint32_t, u_int);

union flentryp {
	struct flentry		**global;
	struct flentry		**pcpu[MAXCPU];
};

struct flowtable {
	int		ft_size;
	int		ft_lock_count;
	uint32_t	ft_flags;
	uint32_t	ft_collisions;
	uint32_t	ft_allocated;

	uint32_t	ft_udp_idle;
	uint32_t	ft_fin_wait_idle;
	uint32_t	ft_syn_idle;
	uint32_t	ft_tcp_idle;

	fl_lock_t	*ft_lock;
	fl_lock_t	*ft_unlock;
	fl_rtalloc_t	*ft_rtalloc;
	struct mtx	*ft_locks;

	union flentryp	ft_table;
	bitstr_t	*ft_masks[MAXCPU];
	bitstr_t	*ft_tmpmask;
	struct flowtable *ft_next;
};
static struct proc *flowcleanerproc;
static VNET_DEFINE(struct flowtable *, flow_list_head);
static VNET_DEFINE(uint32_t, flow_hashjitter);
static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);

#define	V_flow_list_head	VNET(flow_list_head)
#define	V_flow_hashjitter	VNET(flow_hashjitter)
#define	V_flow_ipv4_zone	VNET(flow_ipv4_zone)
#define	V_flow_ipv6_zone	VNET(flow_ipv6_zone)

static struct cv	flowclean_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;
/*
 * TODO:
 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
 *   to avoid extra cache evictions caused by incrementing a shared
 *   counter
 * - add IPv6 support to flow lookup
 * - add sysctls to resize && flush flow tables
 * - Add per-flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing;
 *   add a flag to indicate a round-robin flow, and do a list lookup from
 *   the head for such flows
 * - add sysctl / device node / syscall to support exporting and importing
 *   of flows with a flag to indicate that a flow was imported and so
 *   should not be considered for auto-cleaning
 * - support explicit connection state (currently only ad-hoc for DSR)
 * - ifdetach() cleanup for "options VIMAGE" builds.
 */
VNET_DEFINE(int, flowtable_enable) = 1;
static VNET_DEFINE(int, flowtable_hits);
static VNET_DEFINE(int, flowtable_lookups);
static VNET_DEFINE(int, flowtable_misses);
static VNET_DEFINE(int, flowtable_frees);
static VNET_DEFINE(int, flowtable_free_checks);
static VNET_DEFINE(int, flowtable_max_depth);
static VNET_DEFINE(int, flowtable_collisions);
static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
static VNET_DEFINE(int, flowtable_nmbflows) = 4096;
static VNET_DEFINE(int, flowtable_ready) = 0;
#define	V_flowtable_enable		VNET(flowtable_enable)
#define	V_flowtable_hits		VNET(flowtable_hits)
#define	V_flowtable_lookups		VNET(flowtable_lookups)
#define	V_flowtable_misses		VNET(flowtable_misses)
#define	V_flowtable_frees		VNET(flowtable_frees)
#define	V_flowtable_free_checks		VNET(flowtable_free_checks)
#define	V_flowtable_max_depth		VNET(flowtable_max_depth)
#define	V_flowtable_collisions		VNET(flowtable_collisions)
#define	V_flowtable_syn_expire		VNET(flowtable_syn_expire)
#define	V_flowtable_udp_expire		VNET(flowtable_udp_expire)
#define	V_flowtable_fin_wait_expire	VNET(flowtable_fin_wait_expire)
#define	V_flowtable_tcp_expire		VNET(flowtable_tcp_expire)
#define	V_flowtable_nmbflows		VNET(flowtable_nmbflows)
#define	V_flowtable_ready		VNET(flowtable_ready)
SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
    &VNET_NAME(flowtable_hits), 0, "# flowtable hits.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
    &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
    &VNET_NAME(flowtable_misses), 0, "# flowtable misses.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
    &VNET_NAME(flowtable_frees), 0, "# flows freed.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
    &VNET_NAME(flowtable_free_checks), 0, "# flows checked for staleness.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
    &VNET_NAME(flowtable_max_depth), 0, "max collision list length.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
    &VNET_NAME(flowtable_collisions), 0, "# flowtable collisions.");
/*
 * XXX Changing these sysctls does not update the timeouts of tables that
 * already exist: the values are latched by flowtable_alloc() into the
 * per-table ft_*_idle fields, so a new setting only takes effect for
 * tables created afterwards. :-/
 */
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_syn_expire), 0,
    "seconds after which to remove a SYN-allocated flow.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_udp_expire), 0,
    "seconds after which to remove a flow allocated to UDP.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_fin_wait_expire), 0,
    "seconds after which to remove a flow in FIN_WAIT.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_tcp_expire), 0,
    "seconds after which to remove a flow allocated to a TCP connection.");
/*
 * Maximum number of flows that can be allocated of a given type.
 *
 * The table is allocated at boot time (for the pure caching case
 * there is no reason why this could not be changed at runtime)
 * and thus (currently) needs to be set with a tunable.
 */
static int
sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbflows;

	newnmbflows = V_flowtable_nmbflows;
	error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
	if (error == 0 && req->newptr) {
		if (newnmbflows > V_flowtable_nmbflows) {
			V_flowtable_nmbflows = newnmbflows;
			uma_zone_set_max(V_flow_ipv4_zone,
			    V_flowtable_nmbflows);
			uma_zone_set_max(V_flow_ipv6_zone,
			    V_flowtable_nmbflows);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
    "Maximum number of flows allowed");
static void
in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
{

	rtalloc_ign_fib(ro, 0, fibnum);
}
static void
flowtable_global_lock(struct flowtable *table, uint32_t hash)
{
	int lock_index = hash & (table->ft_lock_count - 1);

	mtx_lock(&table->ft_locks[lock_index]);
}

static void
flowtable_global_unlock(struct flowtable *table, uint32_t hash)
{
	int lock_index = hash & (table->ft_lock_count - 1);

	mtx_unlock(&table->ft_locks[lock_index]);
}

static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

	/* per-cpu buckets are only touched by curcpu, so pinning the
	 * thread with a critical section suffices */
	critical_enter();
}

static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

	critical_exit();
}
#define	FL_ENTRY_INDEX(table, hash)	((hash) % (table)->ft_size)
#define	FL_ENTRY(table, hash)		*flowtable_entry((table), (hash))
#define	FL_ENTRY_LOCK(table, hash)	(table)->ft_lock((table), (hash))
#define	FL_ENTRY_UNLOCK(table, hash)	(table)->ft_unlock((table), (hash))

#define	FL_STALE	(1<<8)
#define	FL_IPV6		(1<<9)
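
/*
 * The low byte of f_flags holds the TH_* TCP flags observed on the flow
 * (copied from th_flags in the hash path below), which is why the private
 * per-flow flags above start at bit 8. FL_PCPU and FL_HASH_PORTS are not
 * defined here and are assumed to come from net/flowtable.h; they describe
 * the table as a whole rather than individual flows.
 */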
static uint32_t
ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
    uint32_t *key, uint16_t *flags, uint8_t *protop)
{
	uint16_t sport = 0, dport = 0;
	struct ip *ip = NULL;
	uint8_t proto = 0;
	int iphlen;
	uint32_t hash;
	struct sockaddr_in *sin;
	struct tcphdr *th;
	struct udphdr *uh;
	struct sctphdr *sh;

	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
		return (0);

	key[1] = key[2] = 0;
	sin = (struct sockaddr_in *)&ro->ro_dst;
	if (m != NULL) {
		ip = mtod(m, struct ip *);
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr = ip->ip_dst;
	} else
		*flags &= ~FL_HASH_PORTS;

	key[2] = sin->sin_addr.s_addr;

	if ((*flags & FL_HASH_PORTS) == 0)
		goto skipports;

	proto = ip->ip_p;
	iphlen = ip->ip_hl << 2; /* XXX options? */
	key[1] = ip->ip_src.s_addr;

	switch (proto) {
	case IPPROTO_TCP:
		th = (struct tcphdr *)((caddr_t)ip + iphlen);
		sport = ntohs(th->th_sport);
		dport = ntohs(th->th_dport);
		*flags |= th->th_flags;
		if (*flags & TH_RST)
			*flags |= FL_STALE;
		break;
	case IPPROTO_UDP:
		uh = (struct udphdr *)((caddr_t)ip + iphlen);
		sport = ntohs(uh->uh_sport);
		dport = ntohs(uh->uh_dport);
		break;
	case IPPROTO_SCTP:
		sh = (struct sctphdr *)((caddr_t)ip + iphlen);
		sport = ntohs(sh->src_port);
		dport = ntohs(sh->dest_port);
		break;
	default:
		if (*flags & FL_HASH_PORTS)
			goto noop;
		/* no port - hence not a protocol we care about */
		break;
	}
	*protop = proto;

	/*
	 * If this is a transmit route cache then
	 * hash all flows to a given destination to
	 * the same bucket.
	 */
	if ((*flags & FL_HASH_PORTS) == 0)
		proto = sport = dport = 0;

skipports:
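	/*
	 * Key layout (IPv4): key[0] holds the two 16-bit ports stored
	 * below, key[1] the source address, key[2] the destination
	 * address, giving the three 32-bit words hashed by
	 * jenkins_hashword() with the per-boot jitter plus the protocol
	 * as the seed. An IPv6 key would occupy all nine words of the
	 * caller's array.
	 */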
	((uint16_t *)key)[0] = sport;
	((uint16_t *)key)[1] = dport;

	hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto);
	if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
		m->m_flags |= M_FLOWID;
		m->m_pkthdr.flowid = hash;
	}

	return (hash);
noop:
	*protop = proto;
	return (0);
}
static bitstr_t *
flowtable_mask(struct flowtable *ft)
{
	bitstr_t *mask;

	if (ft->ft_flags & FL_PCPU)
		mask = ft->ft_masks[curcpu];
	else
		mask = ft->ft_masks[0];

	return (mask);
}
static struct flentry **
flowtable_entry(struct flowtable *ft, uint32_t hash)
{
	struct flentry **fle;
	int index = (hash % ft->ft_size);

	if (ft->ft_flags & FL_PCPU) {
		KASSERT(ft->ft_table.pcpu[curcpu] != NULL, ("pcpu not set"));
		fle = &ft->ft_table.pcpu[curcpu][index];
	} else {
		KASSERT(ft->ft_table.global != NULL, ("global not set"));
		fle = &ft->ft_table.global[index];
	}

	return (fle);
}
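
/*
 * Staleness policy, in the order checked below: a flow is reclaimable
 * if it was never filled in (f_fhash == 0) or its route has gone away,
 * if it was explicitly marked FL_STALE (e.g. after a RST was seen), or
 * if it has idled past the limit for its class: no TCP flags recorded
 * (UDP/SCTP or pure route cache), FIN seen, embryonic (SYN without
 * ACK), or established (SYN|ACK).
 */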
static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
	time_t idle_time;

	if ((fle->f_fhash == 0)
	    || ((fle->f_rt->rt_flags & RTF_HOST) &&
		((fle->f_rt->rt_flags & (RTF_UP))
		    != (RTF_UP)))
	    || (fle->f_rt->rt_ifp == NULL))
		return (1);

	idle_time = time_uptime - fle->f_uptime;

	if ((fle->f_flags & FL_STALE) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
		&& (idle_time > ft->ft_udp_idle)) ||
	    ((fle->f_flags & TH_FIN)
		&& (idle_time > ft->ft_fin_wait_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
		&& (idle_time > ft->ft_syn_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
		&& (idle_time > ft->ft_tcp_idle)) ||
	    ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
		(fle->f_rt->rt_ifp == NULL)))
		return (1);

	return (0);
}
static void
flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
{
	uint32_t *hashkey;
	int i, nwords;

	if (fle->f_flags & FL_IPV6) {
		nwords = 9;
		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
	} else {
		nwords = 3;
		hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
	}

	for (i = 0; i < nwords; i++)
		hashkey[i] = key[i];
}
static int
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
    uint8_t proto, uint32_t fibnum, struct route *ro, uint16_t flags)
{
	struct flentry *fle, *fletail, *newfle, **flep;
	int depth;
	uma_zone_t flezone;
	bitstr_t *mask;

	flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
	newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO);
	if (newfle == NULL)
		return (ENOMEM);

	newfle->f_flags |= (flags & FL_IPV6);

	FL_ENTRY_LOCK(ft, hash);
	mask = flowtable_mask(ft);
	flep = flowtable_entry(ft, hash);
	fletail = fle = *flep;

	if (fle == NULL) {
		bit_set(mask, FL_ENTRY_INDEX(ft, hash));
		*flep = fle = newfle;
		goto skip;
	}

	depth = 0;
	V_flowtable_collisions++;
	/*
	 * find the end of the list and make sure that we were not
	 * preempted by another thread handling this flow
	 */
	while (fle != NULL) {
		if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
			/*
			 * there was either a hash collision
			 * or we lost a race to insert
			 */
			FL_ENTRY_UNLOCK(ft, hash);
			uma_zfree((newfle->f_flags & FL_IPV6) ?
			    V_flow_ipv6_zone : V_flow_ipv4_zone, newfle);
			return (EEXIST);
		}
		/* advance, remembering the tail for the append below */
		fletail = fle;
		fle = fle->f_next;
		depth++;
	}

	if (depth > V_flowtable_max_depth)
		V_flowtable_max_depth = depth;
	fletail->f_next = newfle;
	fle = newfle;
skip:
	flowtable_set_hashkey(fle, key);

	fle->f_proto = proto;
	fle->f_rt = ro->ro_rt;
	fle->f_lle = ro->ro_lle;
	fle->f_fhash = hash;
	fle->f_fibnum = fibnum;
	fle->f_uptime = time_uptime;
	FL_ENTRY_UNLOCK(ft, hash);
	return (0);
}
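
/*
 * A quick sketch of the bucket bookkeeping that insertion above and the
 * cleaner below both rely on: each table slot anchors a singly-linked
 * collision chain, and the per-cpu (or global) bitmask records which
 * slots are non-empty so the cleaner can skip straight to occupied
 * buckets with bit_ffs():
 *
 *	ft_table[i]  -> flentry -> flentry -> NULL
 *	ft_masks[cpu]:  bit i set iff the chain at slot i is non-empty
 */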
static int
flowtable_key_equal(struct flentry *fle, uint32_t *key)
{
	uint32_t *hashkey;
	int i, nwords;

	if (fle->f_flags & FL_IPV6) {
		nwords = 9;
		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
	} else {
		nwords = 3;
		hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
	}

	for (i = 0; i < nwords; i++)
		if (hashkey[i] != key[i])
			return (0);

	return (1);
}
int
flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro, uint32_t fibnum)
{
	uint32_t key[9], hash;
	struct flentry *fle;
	struct rtentry *rt;
	struct llentry *lle;
	uint16_t flags;
	uint8_t proto = 0;
	int error = 0;

	flags = ft->ft_flags;
	ro->ro_rt = NULL;
	ro->ro_lle = NULL;

	/*
	 * The internal hash lookup is the only IPv4-specific bit
	 * remaining.
	 *
	 * XXX BZ: to add IPv6 support just add a check for the
	 * address type in m and ro and an equivalent ipv6 lookup
	 * function - the rest of the code should automatically
	 * handle an ipv6 flow (note that m can be NULL, in which
	 * case ro will be set)
	 */
	hash = ipv4_flow_lookup_hash_internal(m, ro, key,
	    &flags, &proto);

	/*
	 * Ports are zero and this isn't a transmit cache
	 * - thus not a protocol for which we need to keep state.
	 * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP
	 */
	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS)))
		return (ENOENT);

	V_flowtable_lookups++;
	FL_ENTRY_LOCK(ft, hash);
	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
		FL_ENTRY_UNLOCK(ft, hash);
		goto uncached;
	}
keycheck:
	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	if ((rt != NULL)
	    && fle->f_fhash == hash
	    && flowtable_key_equal(fle, key)
	    && (proto == fle->f_proto)
	    && (fibnum == fle->f_fibnum)
	    && (rt->rt_flags & RTF_UP)
	    && (rt->rt_ifp != NULL)) {
		V_flowtable_hits++;
		fle->f_uptime = time_uptime;
		fle->f_flags |= flags;
		ro->ro_rt = rt;
		ro->ro_lle = lle;
		FL_ENTRY_UNLOCK(ft, hash);
		return (0);
	} else if (fle->f_next != NULL) {
		fle = fle->f_next;
		goto keycheck;
	}
	FL_ENTRY_UNLOCK(ft, hash);

uncached:
	V_flowtable_misses++;
	/*
	 * This bit of code ends up locking the same route three times
	 * (just like ip_output + ether_output):
	 * - here at lookup
	 * - in rt_check when called by arpresolve
	 * - when dropping the refcount on the rtentry
	 *
	 * This could be consolidated to one lock if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked.
	 */
	ft->ft_rtalloc(ro, hash, fibnum);
	if (ro->ro_rt == NULL)
		error = ENETUNREACH;
	else {
		struct llentry *lle = NULL;
		struct sockaddr *l3addr;
		struct rtentry *rt = ro->ro_rt;
		struct ifnet *ifp = rt->rt_ifp;

		if (rt->rt_flags & RTF_GATEWAY)
			l3addr = rt->rt_gateway;
		else
			l3addr = &ro->ro_dst;
		llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
		ro->ro_lle = lle;

		if (lle == NULL) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (ENOENT);
		}
		error = flowtable_insert(ft, hash, key, proto, fibnum,
		    ro, flags);
		if (error) {
			RTFREE(rt);
			LLE_FREE(lle);
			ro->ro_rt = NULL;
			ro->ro_lle = NULL;
		}
	}

	return (error);
}
/*
 * used by the bit_alloc macro from sys/bitstring.h, which expands
 * to a calloc() call that does not otherwise exist in the kernel
 */
#define	calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
struct flowtable *
flowtable_alloc(int nentry, int flags)
{
	struct flowtable *ft, *fttail;
	int i;

	if (V_flow_hashjitter == 0)
		V_flow_hashjitter = arc4random();

	KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));

	ft = malloc(sizeof(struct flowtable),
	    M_RTABLE, M_WAITOK | M_ZERO);

	ft->ft_flags = flags;
	ft->ft_size = nentry;
#ifdef RADIX_MPATH
	ft->ft_rtalloc = rtalloc_mpath_fib;
#else
	ft->ft_rtalloc = in_rtalloc_ign_wrapper;
#endif
	if (flags & FL_PCPU) {
		ft->ft_lock = flowtable_pcpu_lock;
		ft->ft_unlock = flowtable_pcpu_unlock;

		for (i = 0; i <= mp_maxid; i++) {
			ft->ft_table.pcpu[i] =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
			ft->ft_masks[i] = bit_alloc(nentry);
		}
	} else {
		ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
		    (fls(mp_maxid + 1) << 1));

		ft->ft_lock = flowtable_global_lock;
		ft->ft_unlock = flowtable_global_unlock;
		ft->ft_table.global =
		    malloc(nentry*sizeof(struct flentry *),
			M_RTABLE, M_WAITOK | M_ZERO);
		ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
		    M_RTABLE, M_WAITOK | M_ZERO);
		for (i = 0; i < ft->ft_lock_count; i++)
			mtx_init(&ft->ft_locks[i], "flow", NULL,
			    MTX_DEF|MTX_DUPOK);

		ft->ft_masks[0] = bit_alloc(nentry);
	}
	ft->ft_tmpmask = bit_alloc(nentry);

	/*
	 * In the local transmit case the table truly is just a
	 * cache - so everything is eligible for replacement after
	 * 30s of non-use.
	 */
	if (flags & FL_HASH_PORTS) {
		ft->ft_udp_idle = V_flowtable_udp_expire;
		ft->ft_syn_idle = V_flowtable_syn_expire;
		ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
		ft->ft_tcp_idle = V_flowtable_tcp_expire;
	} else {
		ft->ft_udp_idle = ft->ft_fin_wait_idle =
		    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
	}

	/*
	 * hook in to the cleaner list
	 */
	if (V_flow_list_head == NULL)
		V_flow_list_head = ft;
	else {
		fttail = V_flow_list_head;
		while (fttail->ft_next != NULL)
			fttail = fttail->ft_next;
		fttail->ft_next = ft;
	}

	return (ft);
}
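
/*
 * Usage sketch (hypothetical caller; the real consumer sits in the
 * ip_output() path): a forwarding table that keys on ports and keeps
 * per-cpu buckets, versus a plain global table used purely as a
 * transmit route cache (no FL_HASH_PORTS):
 *
 *	struct route ro;
 *	struct flowtable *ft;
 *
 *	ft = flowtable_alloc(2048, FL_PCPU | FL_HASH_PORTS);
 *	...
 *	bzero(&ro, sizeof(ro));
 *	if (flowtable_lookup(ft, m, &ro, M_GETFIB(m)) == 0) {
 *		// ro.ro_rt and ro.ro_lle now reference the cached
 *		// (or freshly inserted) route and L2 entry
 *	}
 */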
/*
 * The rest of the code is devoted to garbage collection of expired
 * entries. It is a new addition made necessary by the switch to
 * dynamically allocating flow tables.
 */
static void
fle_free(struct flentry *fle)
{
	struct rtentry *rt;
	struct llentry *lle;

	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	RTFREE(rt);
	LLE_FREE(lle);
	uma_zfree((fle->f_flags & FL_IPV6) ?
	    V_flow_ipv6_zone : V_flow_ipv4_zone, fle);
}
static void
flowtable_free_stale(struct flowtable *ft)
{
	int curbit = 0, count;
	struct flentry *fle, **flehead, *fleprev;
	struct flentry *flefreehead, *flefreetail, *fletmp;
	bitstr_t *mask, *tmpmask;

	flefreehead = flefreetail = NULL;
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d\n",
			    curbit);
			break;
		}

		FL_ENTRY_LOCK(ft, curbit);
		flehead = flowtable_entry(ft, curbit);
		fle = fleprev = *flehead;

		V_flowtable_free_checks++;

		if (fle == NULL && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}

		while (fle != NULL) {
			if (!flow_stale(ft, fle)) {
				fleprev = fle;
				fle = fle->f_next;
				continue;
			}
			/*
			 * delete head of the list
			 */
			if (fleprev == *flehead) {
				fletmp = fleprev;
				if (fle == fleprev) {
					fleprev = *flehead = fle->f_next;
				} else
					fleprev = *flehead = fle;
				fle = fle->f_next;
			} else {
				/*
				 * don't advance fleprev
				 */
				fletmp = fle;
				fleprev->f_next = fle->f_next;
				fle = fleprev->f_next;
			}

			if (flefreehead == NULL)
				flefreehead = flefreetail = fletmp;
			else {
				flefreetail->f_next = fletmp;
				flefreetail = fletmp;
			}
			fletmp->f_next = NULL;
		}
		if (*flehead == NULL)
			bit_clear(mask, curbit);
		FL_ENTRY_UNLOCK(ft, curbit);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
	count = 0;
	while ((fle = flefreehead) != NULL) {
		flefreehead = fle->f_next;
		count++;
		fle_free(fle);
	}
	V_flowtable_frees += count;
	if (bootverbose && count)
		log(LOG_DEBUG, "freed %d flow entries\n", count);
}
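
/*
 * Note the two-phase design above: stale entries are only unlinked onto
 * a private free list while the bucket lock (or critical section) is
 * held; the route and L2 references are then dropped in fle_free() once
 * every bucket has been unlocked, keeping lock-hold times short and
 * avoiding calls into the routing code with a flowtable lock held.
 */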
static void
flowtable_clean_vnet(void)
{
	struct flowtable *ft;
	int i;

	ft = V_flow_list_head;
	while (ft != NULL) {
		if (ft->ft_flags & FL_PCPU) {
			for (i = 0; i <= mp_maxid; i++) {
				if (CPU_ABSENT(i))
					continue;

				thread_lock(curthread);
				sched_bind(curthread, i);
				thread_unlock(curthread);

				flowtable_free_stale(ft);

				thread_lock(curthread);
				sched_unbind(curthread);
				thread_unlock(curthread);
			}
		} else {
			flowtable_free_stale(ft);
		}
		ft = ft->ft_next;
	}
}
static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	log(LOG_INFO, "flowtable cleaner started\n");
	while (1) {
		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
			flowtable_clean_vnet();
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		flowclean_cycles++;
		/*
		 * The 10 second interval between cleaning checks
		 * is arbitrary
		 */
		mtx_lock(&flowclean_lock);
		cv_broadcast(&flowclean_cv);
		cv_timedwait(&flowclean_cv, &flowclean_lock, 10*hz);
		mtx_unlock(&flowclean_lock);
	}
}
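
/*
 * flowtable_flush() below rendezvouses with the cleaner thread: it
 * wakes the cleaner and then sleeps until flowclean_cycles advances,
 * which guarantees that at least one full cleaning pass has completed
 * after the event (e.g. an interface departing) that triggered the
 * flush.
 */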
static void
flowtable_flush(void *unused __unused)
{
	uint32_t start;

	mtx_lock(&flowclean_lock);
	start = flowclean_cycles;
	while (start == flowclean_cycles) {
		cv_broadcast(&flowclean_cv);
		cv_wait(&flowclean_cv, &flowclean_lock);
	}
	mtx_unlock(&flowclean_lock);
}
static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
static void
flowtable_init_vnet(const void *unused __unused)
{

	V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
	uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
}
VNET_SYSINIT(flowtable_init_vnet, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE,
    flowtable_init_vnet, NULL);
static void
flowtable_init(const void *unused __unused)
{

	cv_init(&flowclean_cv, "flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
	V_flowtable_ready = 1;
}
SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_init, NULL);
#ifdef VIMAGE
static void
flowtable_uninit(const void *unused __unused)
{

	uma_zdestroy(V_flow_ipv4_zone);
	uma_zdestroy(V_flow_ipv6_zone);
}
VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_uninit, NULL);
#endif
#ifdef DDB
static bitstr_t *
flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
{
	bitstr_t *mask;

	if (ft->ft_flags & FL_PCPU)
		mask = ft->ft_masks[cpuid];
	else
		mask = ft->ft_masks[0];

	return (mask);
}

static struct flentry **
flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
{
	struct flentry **fle;
	int index = (hash % ft->ft_size);

	if (ft->ft_flags & FL_PCPU)
		fle = &ft->ft_table.pcpu[cpuid][index];
	else
		fle = &ft->ft_table.global[index];

	return (fle);
}

static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	int rt_valid;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt_valid = fle->f_rt != NULL;
	db_printf("hash=0x%08x idle_time=%03d rt=%p ifp=%p",
	    fle->f_fhash, idle_time,
	    fle->f_rt, rt_valid ? fle->f_rt->rt_ifp : NULL);
	if (rt_valid && (fle->f_rt->rt_flags & RTF_UP))
		db_printf(" RTF_UP ");
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
	db_printf("\n");
}

static void
flowtable_show(struct flowtable *ft, int cpuid)
{
	int curbit = 0;
	struct flentry *fle, **flehead;
	bitstr_t *mask, *tmpmask;

	db_printf("cpu: %d\n", cpuid);
	mask = flowtable_mask_pcpu(ft, cpuid);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			db_printf("warning: bad curbit value %d\n",
			    curbit);
			break;
		}

		flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
		fle = *flehead;

		while (fle != NULL) {
			flow_show(ft, fle);
			fle = fle->f_next;
		}
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
}

static void
flowtable_show_vnet(void)
{
	struct flowtable *ft;
	int i;

	ft = V_flow_list_head;
	while (ft != NULL) {
		if (ft->ft_flags & FL_PCPU) {
			for (i = 0; i <= mp_maxid; i++) {
				if (CPU_ABSENT(i))
					continue;
				flowtable_show(ft, i);
			}
		} else {
			flowtable_show(ft, 0);
		}
		ft = ft->ft_next;
	}
}

DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		flowtable_show_vnet();
		CURVNET_RESTORE();
	}
}
#endif