/**************************************************************************

Copyright (c) 2008-2009, BitGravity Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the BitGravity Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
#include "opt_route.h"
#include "opt_mpath.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitstring.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/flowtable.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

#include <libkern/jenkins.h>
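
/*
 * A flow table caches the routing decision (rtentry) and, where
 * applicable, the resolved link-layer entry (llentry) for a flow so
 * that the forwarding and transmit paths can skip repeated routing
 * and ARP lookups.  Flows are keyed on the protocol, addresses and,
 * for the transmit cache, the ports of a packet; the key is hashed
 * with the Jenkins hash and used to index either a global bucket
 * array or a per-CPU bucket array, depending on how the table was
 * allocated.  A kernel process periodically sweeps the tables and
 * frees entries that have gone stale (see flowtable_cleaner() below).
 */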
struct ipv4_tuple {
	uint16_t	ip_sport;	/* source port */
	uint16_t	ip_dport;	/* destination port */
	in_addr_t	ip_saddr;	/* source address */
	in_addr_t	ip_daddr;	/* destination address */
};

union ipv4_flow {
	struct ipv4_tuple ipf_ipt;
	uint32_t	ipf_key[3];
};

struct ipv6_tuple {
	uint16_t	ip_sport;	/* source port */
	uint16_t	ip_dport;	/* destination port */
	struct in6_addr	ip_saddr;	/* source address */
	struct in6_addr	ip_daddr;	/* destination address */
};

union ipv6_flow {
	struct ipv6_tuple ipf_ipt;
	uint32_t	ipf_key[9];
};
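
/*
 * The tuple and the flat ipf_key array alias the same storage: the
 * lookup and insertion code below treats a v4 key as 3 32-bit words
 * (ports, source address, destination address) and a v6 key as 9
 * words, both hashed and compared word by word.
 */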
struct flentry {
	volatile uint32_t	f_fhash;	/* hash flowing forward */
	uint16_t		f_flags;	/* flow flags */
	uint8_t			f_pad;		/* alignment */
	uint8_t			f_proto;	/* protocol */
	uint32_t		f_uptime;	/* uptime at last access */
	struct flentry		*f_next;	/* pointer to collision entry */
	volatile struct rtentry *f_rt;		/* rtentry for flow */
	volatile struct llentry *f_lle;		/* llentry for flow */
};

struct flentry_v4 {
	struct flentry	fl_entry;
	union ipv4_flow	fl_flow;
};

struct flentry_v6 {
	struct flentry	fl_entry;
	union ipv6_flow	fl_flow;
};
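
/*
 * Both address families share the flentry header, so the generic code
 * can operate on a struct flentry pointer and only cast to flentry_v4
 * or flentry_v6 (based on the FL_IPV6 flag) when it needs the key.
 */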
#define	fl_fhash	fl_entry.f_fhash
#define	fl_flags	fl_entry.f_flags
#define	fl_proto	fl_entry.f_proto
#define	fl_uptime	fl_entry.f_uptime
#define	fl_rt		fl_entry.f_rt
#define	fl_lle		fl_entry.f_lle

#define	SECS_PER_HOUR		3600
#define	SECS_PER_DAY		(24*SECS_PER_HOUR)

#define	SYN_IDLE		300
#define	UDP_IDLE		300
#define	FIN_WAIT_IDLE		600
#define	TCP_IDLE		SECS_PER_DAY
typedef void fl_lock_t(struct flowtable *, uint32_t);
typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);

union flentryp {
	struct flentry	**global;
	struct flentry	**pcpu[MAXCPU];
};

struct flowtable {
	int		ft_size;
	int		ft_lock_count;
	uint32_t	ft_flags;
	uint32_t	ft_collisions;
	uint32_t	ft_allocated;

	uint32_t	ft_udp_idle;
	uint32_t	ft_fin_wait_idle;
	uint32_t	ft_syn_idle;
	uint32_t	ft_tcp_idle;

	fl_lock_t	*ft_lock;
	fl_lock_t	*ft_unlock;
	fl_rtalloc_t	*ft_rtalloc;
	struct mtx	*ft_locks;

	union flentryp	ft_table;
	bitstr_t	*ft_masks[MAXCPU];
	bitstr_t	*ft_tmpmask;
	struct flowtable *ft_next;
};
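
/*
 * A flowtable can be built in one of two modes.  In per-CPU mode
 * (FL_PCPU) every CPU gets its own bucket array and stale-entry
 * bitmask, so lookups on the transmit path do not contend with other
 * CPUs; the cleaner binds itself to each CPU in turn when sweeping
 * these tables.  In global mode a single bucket array is shared and
 * protected by a pool of mutexes indexed by the flow hash.
 */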
static struct proc *flowcleanerproc;
static VNET_DEFINE(struct flowtable *, flow_list_head);
static VNET_DEFINE(uint32_t, flow_hashjitter);
static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);

#define	V_flow_list_head	VNET(flow_list_head)
#define	V_flow_hashjitter	VNET(flow_hashjitter)
#define	V_flow_ipv4_zone	VNET(flow_ipv4_zone)
#define	V_flow_ipv6_zone	VNET(flow_ipv6_zone)
/*
 * TODO:
 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
 *   to avoid extra cache evictions caused by incrementing a shared
 *   counter
 * - add IPv6 support to flow lookup
 * - add sysctls to resize && flush flow tables
 * - Add per flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing,
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
 * - add sysctl / device node / syscall to support exporting and importing
 *   of flows with flag to indicate that a flow was imported so should
 *   not be considered for auto-cleaning
 * - support explicit connection state (currently only ad-hoc for DSR)
 * - ifdetach() cleanup for options VIMAGE builds.
 */
VNET_DEFINE(int, flowtable_enable) = 1;
static VNET_DEFINE(int, flowtable_hits);
static VNET_DEFINE(int, flowtable_lookups);
static VNET_DEFINE(int, flowtable_misses);
static VNET_DEFINE(int, flowtable_frees);
static VNET_DEFINE(int, flowtable_free_checks);
static VNET_DEFINE(int, flowtable_max_depth);
static VNET_DEFINE(int, flowtable_collisions);
static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
static VNET_DEFINE(int, flowtable_nmbflows) = 4096;

#define	V_flowtable_enable		VNET(flowtable_enable)
#define	V_flowtable_hits		VNET(flowtable_hits)
#define	V_flowtable_lookups		VNET(flowtable_lookups)
#define	V_flowtable_misses		VNET(flowtable_misses)
#define	V_flowtable_frees		VNET(flowtable_frees)
#define	V_flowtable_free_checks		VNET(flowtable_free_checks)
#define	V_flowtable_max_depth		VNET(flowtable_max_depth)
#define	V_flowtable_collisions		VNET(flowtable_collisions)
#define	V_flowtable_syn_expire		VNET(flowtable_syn_expire)
#define	V_flowtable_udp_expire		VNET(flowtable_udp_expire)
#define	V_flowtable_fin_wait_expire	VNET(flowtable_fin_wait_expire)
#define	V_flowtable_tcp_expire		VNET(flowtable_tcp_expire)
#define	V_flowtable_nmbflows		VNET(flowtable_nmbflows)
SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
    &VNET_NAME(flowtable_hits), 0, "# flowtable hits.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
    &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
    &VNET_NAME(flowtable_misses), 0, "# flowtable misses.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
    &VNET_NAME(flowtable_frees), 0, "# flows freed.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
    &VNET_NAME(flowtable_free_checks), 0, "# flow free checks.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
    &VNET_NAME(flowtable_max_depth), 0, "max collision list length.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
    &VNET_NAME(flowtable_collisions), 0, "# flowtable collisions.");
/*
 * XXX This does not end up updating timeouts at runtime
 * and only reflects the value for the last table added :-/
 */
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_syn_expire), 0,
    "seconds after which to remove a SYN-allocated flow.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_udp_expire), 0,
    "seconds after which to remove a flow allocated to UDP.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_fin_wait_expire), 0,
    "seconds after which to remove a flow in FIN_WAIT.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_tcp_expire), 0,
    "seconds after which to remove a flow allocated to a TCP connection.");
/*
 * Maximum number of flows that can be allocated of a given type.
 *
 * The table is allocated at boot time (for the pure caching case
 * there is no reason why this could not be changed at runtime)
 * and thus (currently) needs to be set with a tunable.
 */
static int
sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbflows;

	newnmbflows = V_flowtable_nmbflows;
	error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
	if (error == 0 && req->newptr) {
		if (newnmbflows > V_flowtable_nmbflows) {
			V_flowtable_nmbflows = newnmbflows;
			uma_zone_set_max(V_flow_ipv4_zone,
			    V_flowtable_nmbflows);
			uma_zone_set_max(V_flow_ipv6_zone,
			    V_flowtable_nmbflows);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
    "Maximum number of flows allowed");
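
/*
 * Route allocation and table locking are indirected through the
 * function pointers in struct flowtable so that a table can be built
 * either with a shared bucket array protected by a hashed pool of
 * mutexes (global mode) or with one bucket array per CPU (per-CPU
 * mode), which needs no mutex on the lookup path.
 */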
#ifndef RADIX_MPATH
static void
in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fib)
{

	rtalloc_ign_fib(ro, 0, fib);
}
#endif

static void
flowtable_global_lock(struct flowtable *table, uint32_t hash)
{
	int lock_index = (hash)&(table->ft_lock_count - 1);

	mtx_lock(&table->ft_locks[lock_index]);
}

static void
flowtable_global_unlock(struct flowtable *table, uint32_t hash)
{
	int lock_index = (hash)&(table->ft_lock_count - 1);

	mtx_unlock(&table->ft_locks[lock_index]);
}

static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

	critical_enter();
}

static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

	critical_exit();
}
#define	FL_ENTRY_INDEX(table, hash)	((hash) % (table)->ft_size)
#define	FL_ENTRY(table, hash)		*flowtable_entry((table), (hash))
#define	FL_ENTRY_LOCK(table, hash)	(table)->ft_lock((table), (hash))
#define	FL_ENTRY_UNLOCK(table, hash)	(table)->ft_unlock((table), (hash))

#define	FL_STALE	(1<<8)
#define	FL_IPV6		(1<<9)
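
/*
 * Build the hash key for an IPv4 packet.  Word 0 of the key packs the
 * source and destination ports, word 1 holds the source address and
 * word 2 the destination address; for a pure transmit route cache
 * (FL_HASH_PORTS clear) ports and protocol are zeroed so that all
 * traffic to a destination shares one flow.  The resulting Jenkins
 * hash is also stored in the mbuf as its flowid.
 */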
static uint32_t
ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
    uint32_t *key, uint16_t *flags, uint8_t *protop)
{
	uint16_t sport = 0, dport = 0;
	struct ip *ip = NULL;
	uint8_t proto = 0;
	int iphlen;
	uint32_t hash;
	struct sockaddr_in *sin;
	struct tcphdr *th;
	struct udphdr *uh;
	struct sctphdr *sh;

	if (V_flowtable_enable == 0)
		return (0);

	key[1] = key[0] = 0;
	sin = (struct sockaddr_in *)&ro->ro_dst;
	if (m != NULL) {
		ip = mtod(m, struct ip *);
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr = ip->ip_dst;
	} else
		*flags &= ~FL_HASH_PORTS;

	key[2] = sin->sin_addr.s_addr;

	if ((*flags & FL_HASH_PORTS) == 0)
		goto skipports;

	proto = ip->ip_p;
	iphlen = ip->ip_hl << 2; /* XXX options? */
	key[1] = ip->ip_src.s_addr;

	switch (proto) {
	case IPPROTO_TCP:
		th = (struct tcphdr *)((caddr_t)ip + iphlen);
		sport = ntohs(th->th_sport);
		dport = ntohs(th->th_dport);
		*flags |= th->th_flags;
		if (*flags & TH_RST)
			*flags |= FL_STALE;
		break;
	case IPPROTO_UDP:
		uh = (struct udphdr *)((caddr_t)ip + iphlen);
		sport = uh->uh_sport;
		dport = uh->uh_dport;
		break;
	case IPPROTO_SCTP:
		sh = (struct sctphdr *)((caddr_t)ip + iphlen);
		sport = sh->src_port;
		dport = sh->dest_port;
		break;
	default:
		if (*flags & FL_HASH_PORTS)
			goto noop;
		/* no port - hence not a protocol we care about */
		break;
	}
	*protop = proto;

	/*
	 * If this is a transmit route cache then
	 * hash all flows to a given destination to
	 * the same bucket
	 */
	if ((*flags & FL_HASH_PORTS) == 0)
		proto = sport = dport = 0;
skipports:
	((uint16_t *)key)[0] = sport;
	((uint16_t *)key)[1] = dport;

	hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto);
	if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
		m->m_flags |= M_FLOWID;
		m->m_pkthdr.flowid = hash;
	}

	return (hash);
noop:
	*protop = proto;
	return (0);
}
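
/*
 * flowtable_mask() and flowtable_entry() hide the difference between
 * the per-CPU and global layouts: they return the stale-entry bitmask
 * and the hash bucket for the current CPU in per-CPU mode, or the
 * shared ones in global mode.
 */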
static bitstr_t *
flowtable_mask(struct flowtable *ft)
{
	bitstr_t *mask;

	if (ft->ft_flags & FL_PCPU)
		mask = ft->ft_masks[curcpu];
	else
		mask = ft->ft_masks[0];

	return (mask);
}

static struct flentry **
flowtable_entry(struct flowtable *ft, uint32_t hash)
{
	struct flentry **fle;
	int index = (hash % ft->ft_size);

	if (ft->ft_flags & FL_PCPU) {
		KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
		fle = &ft->ft_table.pcpu[curcpu][index];
	} else {
		KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
		fle = &ft->ft_table.global[index];
	}

	return (fle);
}
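
/*
 * A flow is considered stale when its route has gone away or stopped
 * being usable, when it has been marked FL_STALE (e.g. after a TCP
 * RST), or when it has been idle longer than the limit for its state:
 * the UDP limit for flows with no TCP flags, the SYN limit for
 * half-open flows, the FIN_WAIT limit once a FIN has been seen, and
 * the (much longer) TCP limit for established connections.
 */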
static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
	time_t idle_time;

	if ((fle->f_fhash == 0)
	    || ((fle->f_rt->rt_flags & RTF_HOST) &&
		((fle->f_rt->rt_flags & (RTF_UP))
		    != (RTF_UP)))
	    || (fle->f_rt->rt_ifp == NULL))
		return (1);

	idle_time = time_uptime - fle->f_uptime;

	if ((fle->f_flags & FL_STALE) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
		&& (idle_time > ft->ft_udp_idle)) ||
	    ((fle->f_flags & TH_FIN)
		&& (idle_time > ft->ft_fin_wait_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
		&& (idle_time > ft->ft_syn_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
		&& (idle_time > ft->ft_tcp_idle)) ||
	    ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
		(fle->f_rt->rt_ifp == NULL)))
		return (1);

	return (0);
}
static void
flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
{
	uint32_t *hashkey;
	int i, nwords;

	if (fle->f_flags & FL_IPV6) {
		nwords = 9;
		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
	} else {
		nwords = 3;
		hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
	}

	for (i = 0; i < nwords; i++)
		hashkey[i] = key[i];
}
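
/*
 * Insert a newly allocated flow entry into the bucket for its hash.
 * The bucket's collision chain is walked first: if a live entry with
 * the same hash already exists the new entry is freed again (either a
 * true collision or a lost race with another thread inserting the
 * same flow), otherwise the new entry is appended and the route,
 * llentry and key are recorded under the bucket lock.
 */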
static int
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
    uint8_t proto, struct route *ro, uint16_t flags)
{
	struct flentry *fle, *fletail, *newfle, **flep;
	int depth;
	uma_zone_t flezone;
	bitstr_t *mask;

	flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
	newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO);
	if (newfle == NULL)
		return (ENOMEM);

	newfle->f_flags |= (flags & FL_IPV6);

	FL_ENTRY_LOCK(ft, hash);
	mask = flowtable_mask(ft);
	flep = flowtable_entry(ft, hash);
	fletail = fle = *flep;

	if (fle == NULL) {
		bit_set(mask, FL_ENTRY_INDEX(ft, hash));
		*flep = fle = newfle;
		goto skip;
	}

	depth = 0;
	V_flowtable_collisions++;
	/*
	 * find end of list and make sure that we were not
	 * preempted by another thread handling this flow
	 */
	while (fle != NULL) {
		if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
			/*
			 * there was either a hash collision
			 * or we lost a race to insert
			 */
			FL_ENTRY_UNLOCK(ft, hash);
			uma_zfree((newfle->f_flags & FL_IPV6) ?
			    V_flow_ipv6_zone : V_flow_ipv4_zone, newfle);
			return (EEXIST);
		}
		/*
		 * re-visit this double condition XXX
		 */
		if (fletail->f_next != NULL)
			fletail = fle->f_next;

		depth++;
		fle = fle->f_next;
	}

	if (depth > V_flowtable_max_depth)
		V_flowtable_max_depth = depth;
	fletail->f_next = newfle;
	fle = newfle;
skip:
	flowtable_set_hashkey(fle, key);

	fle->f_proto = proto;
	fle->f_rt = ro->ro_rt;
	fle->f_lle = ro->ro_lle;
	fle->f_fhash = hash;
	fle->f_uptime = time_uptime;
	FL_ENTRY_UNLOCK(ft, hash);
	return (0);
}
static int
flowtable_key_equal(struct flentry *fle, uint32_t *key)
{
	uint32_t *hashkey;
	int i, nwords;

	if (fle->f_flags & FL_IPV6) {
		nwords = 9;
		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
	} else {
		nwords = 3;
		hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
	}

	for (i = 0; i < nwords; i++)
		if (hashkey[i] != key[i])
			return (0);

	return (1);
}
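
/*
 * Look up (and if necessary create) the flow for a packet and fill in
 * ro->ro_rt and ro->ro_lle from the cached entry.  On a miss the route
 * is allocated, the link-layer entry is resolved and the result is
 * inserted for subsequent lookups.  A transmit path caller is expected
 * to use it roughly as follows (sketch only, not taken verbatim from
 * any caller):
 *
 *	struct route ro;
 *
 *	bzero(&ro, sizeof(ro));
 *	if (flowtable_lookup(ft, m, &ro) == 0) {
 *		... transmit using ro.ro_rt and ro.ro_lle ...
 *	}
 */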
int
flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro)
{
	uint32_t key[9], hash;
	struct flentry *fle;
	uint16_t flags;
	uint8_t proto = 0;
	int error = 0, fib = 0;
	struct rtentry *rt;
	struct llentry *lle;

	flags = ft->ft_flags;

	/*
	 * The internal hash lookup is the only IPv4 specific bit
	 * remaining
	 *
	 * XXX BZ: to add IPv6 support just add a check for the
	 * address type in m and ro and an equivalent ipv6 lookup
	 * function - the rest of the code should automatically
	 * handle an ipv6 flow (note that m can be NULL in which
	 * case ro will be set)
	 */
	hash = ipv4_flow_lookup_hash_internal(m, ro, key,
	    &flags, &proto);

	/*
	 * Ports are zero and this isn't a transmit cache
	 * - thus not a protocol for which we need to keep
	 * state
	 * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP
	 */
	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS)))
		return (ENOENT);

	V_flowtable_lookups++;
	FL_ENTRY_LOCK(ft, hash);
	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
		FL_ENTRY_UNLOCK(ft, hash);
		goto uncached;
	}
keycheck:
	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	if ((rt != NULL)
	    && fle->f_fhash == hash
	    && flowtable_key_equal(fle, key)
	    && (proto == fle->f_proto)
	    && (rt->rt_flags & RTF_UP)
	    && (rt->rt_ifp != NULL)) {
		V_flowtable_hits++;
		fle->f_uptime = time_uptime;
		fle->f_flags |= flags;
		ro->ro_rt = rt;
		ro->ro_lle = lle;
		FL_ENTRY_UNLOCK(ft, hash);
		return (0);
	} else if (fle->f_next != NULL) {
		fle = fle->f_next;
		goto keycheck;
	}
	FL_ENTRY_UNLOCK(ft, hash);

uncached:
	V_flowtable_misses++;
	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */
	if (m != NULL)
		fib = M_GETFIB(m);

	ft->ft_rtalloc(ro, hash, fib);
	if (ro->ro_rt == NULL)
		error = ENETUNREACH;
	else {
		struct llentry *lle = NULL;
		struct sockaddr *l3addr;
		struct rtentry *rt = ro->ro_rt;
		struct ifnet *ifp = rt->rt_ifp;

		if (rt->rt_flags & RTF_GATEWAY)
			l3addr = rt->rt_gateway;
		else
			l3addr = &ro->ro_dst;
		llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
		ro->ro_lle = lle;

		if (lle == NULL) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (ENOENT);
		}
		error = flowtable_insert(ft, hash, key, proto,
		    ro, flags);

		if (error) {
			RTFREE(rt);
			LLE_FREE(lle);
			ro->ro_rt = NULL;
			ro->ro_lle = NULL;
		}
	}

	return (error);
}
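
/*
 * Allocate a flow table with nentry buckets.  FL_PCPU selects the
 * per-CPU layout; FL_HASH_PORTS selects full 5-tuple keys with the
 * per-protocol timeouts, otherwise the table acts as a plain transmit
 * route cache with a short fixed timeout.  New tables are linked onto
 * the cleaner's list so their stale entries get swept.
 */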
/*
 * used by the bit_alloc macro
 */
#define	calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)

struct flowtable *
flowtable_alloc(int nentry, int flags)
{
	struct flowtable *ft, *fttail;
	int i;

	if (V_flow_hashjitter == 0)
		V_flow_hashjitter = arc4random();

	KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));

	ft = malloc(sizeof(struct flowtable),
	    M_RTABLE, M_WAITOK | M_ZERO);

	ft->ft_flags = flags;
	ft->ft_size = nentry;
#ifdef RADIX_MPATH
	ft->ft_rtalloc = rtalloc_mpath_fib;
#else
	ft->ft_rtalloc = in_rtalloc_ign_wrapper;
#endif
	if (flags & FL_PCPU) {
		ft->ft_lock = flowtable_pcpu_lock;
		ft->ft_unlock = flowtable_pcpu_unlock;

		for (i = 0; i <= mp_maxid; i++) {
			ft->ft_table.pcpu[i] =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
			ft->ft_masks[i] = bit_alloc(nentry);
		}
	} else {
		ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
		    (fls(mp_maxid + 1) << 1));

		ft->ft_lock = flowtable_global_lock;
		ft->ft_unlock = flowtable_global_unlock;
		ft->ft_table.global =
		    malloc(nentry*sizeof(struct flentry *),
			M_RTABLE, M_WAITOK | M_ZERO);
		ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
		    M_RTABLE, M_WAITOK | M_ZERO);
		for (i = 0; i < ft->ft_lock_count; i++)
			mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);

		ft->ft_masks[0] = bit_alloc(nentry);
	}
	ft->ft_tmpmask = bit_alloc(nentry);

	/*
	 * In the local transmit case the table truly is
	 * just a cache - so everything is eligible for
	 * replacement after 30s of non-use
	 */
	if (flags & FL_HASH_PORTS) {
		ft->ft_udp_idle = V_flowtable_udp_expire;
		ft->ft_syn_idle = V_flowtable_syn_expire;
		ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
		ft->ft_tcp_idle = V_flowtable_tcp_expire;
	} else {
		ft->ft_udp_idle = ft->ft_fin_wait_idle =
		    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
	}

	/*
	 * hook in to the cleaner list
	 */
	if (V_flow_list_head == NULL)
		V_flow_list_head = ft;
	else {
		fttail = V_flow_list_head;
		while (fttail->ft_next != NULL)
			fttail = fttail->ft_next;
		fttail->ft_next = ft;
	}

	return (ft);
}
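
/*
 * Per-vnet initialization: the v4 and v6 flow entries come from their
 * own UMA zones, capped at V_flowtable_nmbflows entries each.
 */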
static void
flowtable_init(const void *unused __unused)
{

	V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
	uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
}

VNET_SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_init, NULL);

#ifdef VIMAGE
static void
flowtable_uninit(const void *unused __unused)
{

	uma_zdestroy(V_flow_ipv4_zone);
	uma_zdestroy(V_flow_ipv6_zone);
}

VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_uninit, NULL);
#endif
/*
 * The rest of the code is devoted to garbage collection of expired
 * entries.  It is a new addition made necessary by the switch to
 * dynamically allocating flow table entries.
 */
static void
fle_free(struct flentry *fle)
{
	struct rtentry *rt;
	struct llentry *lle;

	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	RTFREE(rt);
	LLE_FREE(lle);
	uma_zfree((fle->f_flags & FL_IPV6) ?
	    V_flow_ipv6_zone : V_flow_ipv4_zone, fle);
}
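
/*
 * Sweep one table (or, for a per-CPU table, the current CPU's part of
 * it) for stale entries.  Stale entries are unlinked from their bucket
 * while the bucket lock is held and collected on a private list; they
 * are only freed, and their route and llentry references dropped, once
 * all bucket locks have been released.  The ft_tmpmask copy of the
 * occupancy bitmask is used to visit only buckets that actually
 * contain entries.
 */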
static void
flowtable_free_stale(struct flowtable *ft)
{
	int curbit = 0, count;
	struct flentry *fle, **flehead, *fleprev;
	struct flentry *flefreehead, *flefreetail, *fletmp;
	bitstr_t *mask, *tmpmask;

	flefreehead = flefreetail = NULL;
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FL_ENTRY_LOCK(ft, curbit);
		flehead = flowtable_entry(ft, curbit);
		fle = fleprev = *flehead;

		V_flowtable_free_checks++;

		if (fle == NULL && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}

		while (fle != NULL) {
			if (!flow_stale(ft, fle)) {
				fleprev = fle;
				fle = fle->f_next;
				continue;
			}
			/*
			 * delete head of the list
			 */
			if (fleprev == *flehead) {
				fletmp = fleprev;
				if (fle == fleprev) {
					fleprev = *flehead = fle->f_next;
				} else
					fleprev = *flehead = fle;
				fle = fle->f_next;
			} else {
				/*
				 * don't advance fleprev
				 */
				fletmp = fle;
				fleprev->f_next = fle->f_next;
				fle = fleprev->f_next;
			}

			if (flefreehead == NULL)
				flefreehead = flefreetail = fletmp;
			else {
				flefreetail->f_next = fletmp;
				flefreetail = fletmp;
			}
			fletmp->f_next = NULL;
		}
		if (*flehead == NULL)
			bit_clear(mask, curbit);
		FL_ENTRY_UNLOCK(ft, curbit);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
	count = 0;
	while ((fle = flefreehead) != NULL) {
		flefreehead = fle->f_next;
		count++;
		V_flowtable_frees++;
		fle_free(fle);
	}
	if (bootverbose && count)
		log(LOG_DEBUG, "freed %d flow entries\n", count);
}
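
/*
 * Sweep every table registered for the current vnet.  Per-CPU tables
 * are only safe to touch from the CPU that owns them, so the cleaner
 * binds itself to each CPU in turn before sweeping that CPU's part of
 * the table.
 */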
static void
flowtable_clean_vnet(void)
{
	struct flowtable *ft;
	int i;

	ft = V_flow_list_head;
	while (ft != NULL) {
		if (ft->ft_flags & FL_PCPU) {
			for (i = 0; i <= mp_maxid; i++) {
				if (CPU_ABSENT(i))
					continue;

				thread_lock(curthread);
				sched_bind(curthread, i);
				thread_unlock(curthread);

				flowtable_free_stale(ft);

				thread_lock(curthread);
				sched_unbind(curthread);
				thread_unlock(curthread);
			}
		} else {
			flowtable_free_stale(ft);
		}
		ft = ft->ft_next;
	}
}
static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	log(LOG_INFO, "flowtable cleaner started\n");
	while (1) {
		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
			flowtable_clean_vnet();
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		/*
		 * The 20 second interval between cleaning checks
		 * is arbitrary
		 */
		pause("flowcleanwait", 20*hz);
	}
}
static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);