1 /**************************************************************************
3 Copyright (c) 2008-2009, BitGravity Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the BitGravity Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include "opt_route.h"
31 #include "opt_mpath.h"
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/bitstring.h>
40 #include <sys/condvar.h>
41 #include <sys/callout.h>
42 #include <sys/kernel.h>
43 #include <sys/kthread.h>
44 #include <sys/limits.h>
45 #include <sys/malloc.h>
48 #include <sys/sched.h>
50 #include <sys/socket.h>
51 #include <sys/syslog.h>
52 #include <sys/sysctl.h>
55 #include <net/if_llatbl.h>
56 #include <net/if_var.h>
57 #include <net/route.h>
58 #include <net/flowtable.h>
61 #include <netinet/in.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in_var.h>
64 #include <netinet/if_ether.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
67 #include <netinet/udp.h>
68 #include <netinet/sctp.h>
70 #include <libkern/jenkins.h>
/*
 * IPv4 flow key: transport ports plus source/destination address.
 * The transport protocol is mixed into the hash seed rather than stored
 * in the key itself (see ipv4_flow_lookup_hash_internal()).
 */
struct ipv4_tuple {
	uint16_t 	ip_sport;	/* source port */
	uint16_t 	ip_dport;	/* destination port */
	in_addr_t 	ip_saddr;	/* source address */
	in_addr_t 	ip_daddr;	/* destination address */
};

/*
 * Overlay of the tuple as an array of 32-bit words for hashing and
 * comparison: 3 words == 12 bytes == sizeof(struct ipv4_tuple).
 * ipf_key is the view used by flowtable_set_hashkey()/flowtable_key_equal().
 */
union ipv4_flow {
	struct ipv4_tuple ipf_ipt;
	uint32_t 	ipf_key[3];
};
/*
 * IPv6 flow key: transport ports plus source/destination address.
 * Same layout idea as struct ipv4_tuple, with 128-bit addresses.
 */
struct ipv6_tuple {
	uint16_t 	ip_sport;	/* source port */
	uint16_t 	ip_dport;	/* destination port */
	struct in6_addr	ip_saddr;	/* source address */
	struct in6_addr	ip_daddr;	/* destination address */
};

/*
 * Word-array overlay for hashing/comparison:
 * 9 words == 36 bytes == sizeof(struct ipv6_tuple).
 */
union ipv6_flow {
	struct ipv6_tuple ipf_ipt;
	uint32_t 	ipf_key[9];
};
/*
 * Common per-flow cache entry header, embedded at the start of
 * struct flentry_v4 and struct flentry_v6 so either can be treated
 * as a struct flentry.  Entries hang off hash buckets via f_next.
 */
struct flentry {
	volatile uint32_t	f_fhash;	/* hash flowing forward */
	uint16_t		f_flags;	/* flow flags (TH_* low bits, FL_* high bits) */
	uint8_t			f_pad;		/* NOTE(review): one 1-byte member was lost in this view; assumed padding -- confirm */
	uint8_t			f_proto;	/* protocol */
	uint32_t		f_fibnum;	/* fib index */
	uint32_t		f_uptime;	/* uptime at last access */
	struct flentry		*f_next;	/* pointer to collision entry */
	volatile struct rtentry *f_rt;		/* rtentry for flow */
	volatile struct llentry *f_lle;		/* llentry for flow */
};
110 struct flentry fl_entry;
111 union ipv4_flow fl_flow;
115 struct flentry fl_entry;
116 union ipv6_flow fl_flow;
/*
 * Shorthand accessors for the embedded fl_entry header of the
 * flentry_v4/flentry_v6 types above.
 * NOTE(review): the expansions name fl_* members, but struct flentry's
 * members are f_* (f_fhash, f_flags, ...), and nothing in the code
 * visible here uses these macros -- verify before relying on them.
 */
119 #define fl_fhash fl_entry.fl_fhash
120 #define fl_flags fl_entry.fl_flags
121 #define fl_proto fl_entry.fl_proto
122 #define fl_uptime fl_entry.fl_uptime
123 #define fl_rt fl_entry.fl_rt
124 #define fl_lle fl_entry.fl_lle

/* Time-unit helpers for the idle-timeout defaults below. */
126 #define SECS_PER_HOUR 3600
127 #define SECS_PER_DAY (24*SECS_PER_HOUR)

/*
 * Default flow idle timeouts in seconds: 10 minutes for flows that have
 * seen a FIN, a full day for established TCP.  (SYN_IDLE and UDP_IDLE are
 * used below but their definitions are on lines not visible in this view.)
 */
131 #define FIN_WAIT_IDLE 600
132 #define TCP_IDLE SECS_PER_DAY
/* Per-table bucket lock/unlock operation: (table, hash). */
135 typedef void fl_lock_t(struct flowtable *, uint32_t);
/* Per-table route allocation backend: (route, hash, fibnum). */
136 typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
139 struct flentry **global;
140 struct flentry **pcpu[MAXCPU];
147 uint32_t ft_collisions;
148 uint32_t ft_allocated;
152 uint32_t ft_udp_idle;
153 uint32_t ft_fin_wait_idle;
154 uint32_t ft_syn_idle;
155 uint32_t ft_tcp_idle;
158 fl_lock_t *ft_unlock;
159 fl_rtalloc_t *ft_rtalloc;
160 struct mtx *ft_locks;
163 union flentryp ft_table;
164 bitstr_t *ft_masks[MAXCPU];
165 bitstr_t *ft_tmpmask;
166 struct flowtable *ft_next;
/* Kernel process handle for the flow cleaner kthread (see flow_kp below). */
169 static struct proc *flowcleanerproc;
/* Per-vnet state: table list, hash seed, and the two flow-entry UMA zones. */
170 static VNET_DEFINE(struct flowtable *, flow_list_head);
171 static VNET_DEFINE(uint32_t, flow_hashjitter);
172 static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
173 static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);

/* Curvnet-relative accessors for the variables above. */
175 #define V_flow_list_head VNET(flow_list_head)
176 #define V_flow_hashjitter VNET(flow_hashjitter)
177 #define V_flow_ipv4_zone VNET(flow_ipv4_zone)
178 #define V_flow_ipv6_zone VNET(flow_ipv6_zone)

/*
 * Cleaner <-> flusher handshake: flowtable_flush() waits on flowclean_cv
 * until flowclean_cycles changes (presumably bumped by the cleaner after a
 * full pass -- the incrementing line is not visible in this view).
 */
180 static struct cv flowclean_cv;
181 static struct mtx flowclean_lock;
182 static uint32_t flowclean_cycles;
186 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
187 * to avoid extra cache evictions caused by incrementing a shared
189 * - add IPv6 support to flow lookup
190 * - add sysctls to resize && flush flow tables
191 * - Add per flowtable sysctls for statistics and configuring timeouts
192 * - add saturation counter to rtentry to support per-packet load-balancing
193 * add flag to indicate round-robin flow, add list lookup from head
195 * - add sysctl / device node / syscall to support exporting and importing
196 * of flows with flag to indicate that a flow was imported so should
197 * not be considered for auto-cleaning
198 * - support explicit connection state (currently only ad-hoc for DSR)
199 * - idetach() cleanup for options VIMAGE builds.
/*
 * Per-vnet tunables and counters, exported via the sysctls below.
 * flowtable_enable is the only non-static one (checked from the fast path).
 */
201 VNET_DEFINE(int, flowtable_enable) = 1;
202 static VNET_DEFINE(int, flowtable_debug);
203 static VNET_DEFINE(int, flowtable_hits);
204 static VNET_DEFINE(int, flowtable_lookups);
205 static VNET_DEFINE(int, flowtable_misses);
206 static VNET_DEFINE(int, flowtable_frees);
207 static VNET_DEFINE(int, flowtable_free_checks);
208 static VNET_DEFINE(int, flowtable_max_depth);
209 static VNET_DEFINE(int, flowtable_collisions);
/* Timeout defaults; copied into each table at flowtable_alloc() time. */
210 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
211 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
212 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
213 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
/* Zone limit for flow entries; grow-only via sysctl_nmbflows(). */
214 static VNET_DEFINE(int, flowtable_nmbflows) = 4096;
/* Set once initialization completes; lookups bail out until then. */
215 static VNET_DEFINE(int, flowtable_ready) = 0;

/* Curvnet-relative accessors for the variables above. */
217 #define V_flowtable_enable VNET(flowtable_enable)
218 #define V_flowtable_debug VNET(flowtable_debug)
219 #define V_flowtable_hits VNET(flowtable_hits)
220 #define V_flowtable_lookups VNET(flowtable_lookups)
221 #define V_flowtable_misses VNET(flowtable_misses)
222 #define V_flowtable_frees VNET(flowtable_frees)
223 #define V_flowtable_free_checks VNET(flowtable_free_checks)
224 #define V_flowtable_max_depth VNET(flowtable_max_depth)
225 #define V_flowtable_collisions VNET(flowtable_collisions)
226 #define V_flowtable_syn_expire VNET(flowtable_syn_expire)
227 #define V_flowtable_udp_expire VNET(flowtable_udp_expire)
228 #define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
229 #define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
230 #define V_flowtable_nmbflows VNET(flowtable_nmbflows)
231 #define V_flowtable_ready VNET(flowtable_ready)
/* net.inet.flowtable sysctl tree: knobs first, read-only counters after. */
233 SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
234 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
235 &VNET_NAME(flowtable_debug), 0, "print debug info.");
236 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
237 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
238 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
239 &VNET_NAME(flowtable_hits), 0, "# flowtable hits.");
240 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
241 &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups.");
242 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
243 &VNET_NAME(flowtable_misses), 0, "#flowtable misses.");
244 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
245 &VNET_NAME(flowtable_frees), 0, "#flows freed.");
246 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
247 &VNET_NAME(flowtable_free_checks), 0, "#flows free checks.");
248 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
249 &VNET_NAME(flowtable_max_depth), 0, "max collision list length.");
250 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
251 &VNET_NAME(flowtable_collisions), 0, "#flowtable collisions.");

/*
 * Timeout knobs.  Writable, but tables copy these values at allocation
 * time only (see flowtable_alloc()), hence the XXX below.
 */
254 * XXX This does not end up updating timeouts at runtime
255 * and only reflects the value for the last table added :-/
257 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
258 &VNET_NAME(flowtable_syn_expire), 0,
259 "seconds after which to remove syn allocated flow.");
260 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
261 &VNET_NAME(flowtable_udp_expire), 0,
262 "seconds after which to remove flow allocated to UDP.");
263 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
264 &VNET_NAME(flowtable_fin_wait_expire), 0,
265 "seconds after which to remove a flow in FIN_WAIT.");
266 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
267 &VNET_NAME(flowtable_tcp_expire), 0,
268 "seconds after which to remove flow allocated to a TCP connection.");
272 * Maximum number of flows that can be allocated of a given type.
274 * The table is allocated at boot time (for the pure caching case
275 * there is no reason why this could not be changed at runtime)
276 * and thus (currently) needs to be set with a tunable.
279 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
281 int error, newnmbflows;
283 newnmbflows = V_flowtable_nmbflows;
284 error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
285 if (error == 0 && req->newptr) {
286 if (newnmbflows > V_flowtable_nmbflows) {
287 V_flowtable_nmbflows = newnmbflows;
288 uma_zone_set_max(V_flow_ipv4_zone,
289 V_flowtable_nmbflows);
290 uma_zone_set_max(V_flow_ipv6_zone,
291 V_flowtable_nmbflows);
297 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
298 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
299 "Maximum number of flows allowed");
303 in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
306 rtalloc_ign_fib(ro, 0, fibnum);
311 flowtable_global_lock(struct flowtable *table, uint32_t hash)
313 int lock_index = (hash)&(table->ft_lock_count - 1);
315 mtx_lock(&table->ft_locks[lock_index]);
319 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
321 int lock_index = (hash)&(table->ft_lock_count - 1);
323 mtx_unlock(&table->ft_locks[lock_index]);
327 flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
334 flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
/* Map a hash to a bucket index, bucket head, or bucket lock op. */
340 #define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
341 #define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
342 #define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
343 #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))

/*
 * Flow-entry flag bits kept above the TCP TH_* bits, which occupy the
 * low byte of f_flags (see flow_stale()).
 */
345 #define FL_STALE (1<<8)
346 #define FL_IPV6 (1<<9)
/*
 * Build the 3-word IPv4 flow key (ports, source address, destination
 * address) from the packet 'm' and/or destination in 'ro', hash it with
 * the per-vnet jitter, and cache the hash in the mbuf's flowid.
 * NOTE(review): several scaffolding lines of this function (declarations,
 * switch/case labels, early returns) are missing from this view; the
 * comments below describe only what the visible lines establish.
 */
349 ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
350     uint32_t *key, uint16_t *flags, uint8_t *protop)
352 uint16_t sport = 0, dport = 0;
353 struct ip *ip = NULL;
357 struct sockaddr_in *sin;
/* Bail out unless the flowtable is both enabled and fully initialized. */
362 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
366 sin = (struct sockaddr_in *)&ro->ro_dst;
/* Packet path: populate ro_dst from the IP header's destination. */
368 ip = mtod(m, struct ip *);
369 sin->sin_family = AF_INET;
370 sin->sin_len = sizeof(*sin);
371 sin->sin_addr = ip->ip_dst;
373 *flags &= ~FL_HASH_PORTS;
/* Word 2 of the key is always the destination address. */
375 key[2] = sin->sin_addr.s_addr;
377 if ((*flags & FL_HASH_PORTS) == 0)
381 iphlen = ip->ip_hl << 2; /* XXX options? */
382 key[1] = ip->ip_src.s_addr;
/*
 * Per-protocol port (and TCP flag) extraction.
 * NOTE(review): TCP ports are byte-swapped with ntohs() while UDP/SCTP
 * ports are used in network order -- inconsistent across protocols,
 * though each protocol hashes consistently with itself.
 */
386 th = (struct tcphdr *)((caddr_t)ip + iphlen);
387 sport = ntohs(th->th_sport);
388 dport = ntohs(th->th_dport);
/* TCP flags are folded into the flow flags for staleness tracking. */
389 *flags |= th->th_flags;
394 uh = (struct udphdr *)((caddr_t)ip + iphlen);
395 sport = uh->uh_sport;
396 dport = uh->uh_dport;
399 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
400 sport = sh->src_port;
401 dport = sh->dest_port;
404 if (*flags & FL_HASH_PORTS)
406 /* no port - hence not a protocol we care about */
413 * If this is a transmit route cache then
414 * hash all flows to a given destination to
/* Transmit-cache case: collapse all flows to the destination address. */
417 if ((*flags & FL_HASH_PORTS) == 0)
418 proto = sport = dport = 0;
/* Word 0 packs both 16-bit ports. */
420 ((uint16_t *)key)[0] = sport;
421 ((uint16_t *)key)[1] = dport;
/* Protocol is mixed into the seed, not the key words. */
424 hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto);
/* Cache the hash in the mbuf so later stages can reuse it. */
425 if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
426 m->m_flags |= M_FLOWID;
427 m->m_pkthdr.flowid = hash;
437 flowtable_mask(struct flowtable *ft)
441 if (ft->ft_flags & FL_PCPU)
442 mask = ft->ft_masks[curcpu];
444 mask = ft->ft_masks[0];
449 static struct flentry **
450 flowtable_entry(struct flowtable *ft, uint32_t hash)
452 struct flentry **fle;
453 int index = (hash % ft->ft_size);
455 if (ft->ft_flags & FL_PCPU) {
456 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
457 fle = &ft->ft_table.pcpu[curcpu][index];
459 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
460 fle = &ft->ft_table.global[index];
/*
 * Decide whether a flow entry may be evicted or replaced.  An entry is
 * stale when it was never filled in (hash 0), its cached route is
 * unusable, or it has been idle longer than the timeout for its state
 * (inferred from the cached TH_SYN/TH_ACK/TH_FIN bits in f_flags).
 * NOTE(review): connective lines of the conditionals are missing from
 * this view, so the exact grouping of the route checks is unverifiable.
 */
467 flow_stale(struct flowtable *ft, struct flentry *fle)
/* Never-filled entries and dead host routes are immediately stale. */
471 if ((fle->f_fhash == 0)
472 || ((fle->f_rt->rt_flags & RTF_HOST) &&
473 ((fle->f_rt->rt_flags & (RTF_UP))
475 || (fle->f_rt->rt_ifp == NULL))
478 idle_time = time_uptime - fle->f_uptime;
/*
 * State-dependent idle timeouts: no TCP flags => treat as UDP;
 * FIN seen => FIN_WAIT timeout; SYN without ACK => embryonic
 * connection; SYN+ACK => established TCP.  A down or detached route
 * also makes the entry stale regardless of idle time.
 */
480 if ((fle->f_flags & FL_STALE) ||
481 ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
482 && (idle_time > ft->ft_udp_idle)) ||
483 ((fle->f_flags & TH_FIN)
484 && (idle_time > ft->ft_fin_wait_idle)) ||
485 ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
486 && (idle_time > ft->ft_syn_idle)) ||
487 ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
488 && (idle_time > ft->ft_tcp_idle)) ||
489 ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
490 (fle->f_rt->rt_ifp == NULL)))
497 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
502 if (fle->f_flags & FL_IPV6) {
504 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
507 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
510 for (i = 0; i < nwords; i++)
/*
 * Allocate a new flow entry for (hash, key) and link it into the bucket,
 * either as the new head of an empty bucket or appended to the collision
 * chain.  Aborts (freeing the new entry) if an equivalent live entry is
 * found, i.e. a hash collision or a lost insert race.
 * NOTE(review): declarations, braces and return statements of this
 * function are missing from this view; comments describe visible lines.
 */
515 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
516 uint8_t proto, uint32_t fibnum, struct route *ro, uint16_t flags)
518 struct flentry *fle, *fletail, *newfle, **flep;
/* Pick the zone matching the address family encoded in 'flags'. */
523 flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
524 newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO);
528 newfle->f_flags |= (flags & FL_IPV6);
530 FL_ENTRY_LOCK(ft, hash);
531 mask = flowtable_mask(ft);
532 flep = flowtable_entry(ft, hash);
533 fletail = fle = *flep;
/* Empty bucket: mark it occupied and install the new entry as head. */
536 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
537 *flep = fle = newfle;
542 V_flowtable_collisions++;
544 * find end of list and make sure that we were not
545 * preempted by another thread handling this flow
547 while (fle != NULL) {
548 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
550 * there was either a hash collision
551 * or we lost a race to insert
/* Discard the freshly allocated entry and keep the existing one. */
553 FL_ENTRY_UNLOCK(ft, hash);
554 uma_zfree((newfle->f_flags & FL_IPV6) ?
555 V_flow_ipv6_zone : V_flow_ipv4_zone, newfle);
559 * re-visit this double condition XXX
/*
 * NOTE(review): advancing fletail via fle->f_next guarded by
 * fletail->f_next looks suspect (per the XXX above) -- verify
 * against the unabridged original before changing.
 */
561 if (fletail->f_next != NULL)
562 fletail = fle->f_next;
/* Track the longest collision chain seen, then append at the tail. */
568 if (depth > V_flowtable_max_depth)
569 V_flowtable_max_depth = depth;
570 fletail->f_next = newfle;
/* Fill in the new entry under the bucket lock. */
573 flowtable_set_hashkey(fle, key);
575 fle->f_proto = proto;
/* The entry takes over the route/llentry references from 'ro'. */
576 fle->f_rt = ro->ro_rt;
577 fle->f_lle = ro->ro_lle;
579 fle->f_fibnum = fibnum;
580 fle->f_uptime = time_uptime;
581 FL_ENTRY_UNLOCK(ft, hash);
586 flowtable_key_equal(struct flentry *fle, uint32_t *key)
591 if (fle->f_flags & FL_IPV6) {
593 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
596 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
599 for (i = 0; i < nwords; i++)
600 if (hashkey[i] != key[i])
/*
 * Main entry point: hash the packet/route into a key, return the cached
 * route/llentry on a hit (refreshing f_uptime), or fall through to a real
 * route lookup + arp resolution and insert the result as a new flow.
 * NOTE(review): the tail of this function (uncached path completion,
 * error handling, returns) is on lines missing from this view.
 */
607 flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro, uint32_t fibnum)
/* key[] is sized for the larger (IPv6, 9-word) key. */
609 uint32_t key[9], hash;
617 flags = ft->ft_flags;
622 * The internal hash lookup is the only IPv4 specific bit
625 * XXX BZ: to add IPv6 support just add a check for the
626 * address type in m and ro and an equivalent ipv6 lookup
627 * function - the rest of the code should automatically
628 * handle an ipv6 flow (note that m can be NULL in which
629 * case ro will be set)
631 hash = ipv4_flow_lookup_hash_internal(m, ro, key,
635 * Ports are zero and this isn't a transmit cache
636 * - thus not a protocol for which we need to keep
638 * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP
640 if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS)))
643 V_flowtable_lookups++;
644 FL_ENTRY_LOCK(ft, hash);
/* Empty bucket: miss; fall through to the uncached path below. */
645 if ((fle = FL_ENTRY(ft, hash)) == NULL) {
646 FL_ENTRY_UNLOCK(ft, hash);
650 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
651 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
/* Full match: same hash, key, proto and fib, with a usable route. */
653 && fle->f_fhash == hash
654 && flowtable_key_equal(fle, key)
655 && (proto == fle->f_proto)
656 && (fibnum == fle->f_fibnum)
657 && (rt->rt_flags & RTF_UP)
658 && (rt->rt_ifp != NULL)) {
/* Hit: refresh the idle timestamp and accumulate TCP flag bits. */
660 fle->f_uptime = time_uptime;
661 fle->f_flags |= flags;
664 FL_ENTRY_UNLOCK(ft, hash);
/* Chain continues: keep scanning the collision list. */
666 } else if (fle->f_next != NULL) {
670 FL_ENTRY_UNLOCK(ft, hash);
673 V_flowtable_misses++;
675 * This bit of code ends up locking the
676 * same route 3 times (just like ip_output + ether_output)
678 * - in rt_check when called by arpresolve
679 * - dropping the refcount for the rtentry
681 * This could be consolidated to one if we wrote a variant
682 * of arpresolve with an rt_check variant that expected to
683 * receive the route locked
/* Uncached path: do a real route lookup for this fib. */
686 ft->ft_rtalloc(ro, hash, fibnum);
687 if (ro->ro_rt == NULL)
690 struct llentry *lle = NULL;
691 struct sockaddr *l3addr;
692 struct rtentry *rt = ro->ro_rt;
693 struct ifnet *ifp = rt->rt_ifp;
/* Resolve the link-layer entry for the gateway or final destination. */
695 if (rt->rt_flags & RTF_GATEWAY)
696 l3addr = rt->rt_gateway;
698 l3addr = &ro->ro_dst;
699 llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
/* Cache the freshly resolved route/llentry as a new flow entry. */
707 error = flowtable_insert(ft, hash, key, proto, fibnum,
/*
 * The kernel has no calloc(); bit_alloc() expands to a calloc() call, so
 * map it onto kernel malloc with M_ZERO.
 * NOTE(review): this shim charges M_DEVBUF while the rest of the file
 * allocates under M_RTABLE -- possibly unintentional; confirm.
 */
722 * used by the bit_alloc macro
724 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
/*
 * Allocate and initialize a flowtable with 'nentry' buckets, configure
 * its lock/rtalloc backends from 'flags' (FL_PCPU selects per-CPU bucket
 * arrays, otherwise a shared array with a mutex pool), copy in the idle
 * timeouts, and append the table to the per-vnet cleaner list.
 * NOTE(review): declarations, else-branches and the return are on lines
 * missing from this view.
 */
727 flowtable_alloc(int nentry, int flags)
729 struct flowtable *ft, *fttail;
/* Lazily seed the per-vnet hash jitter on first table creation. */
732 if (V_flow_hashjitter == 0)
733 V_flow_hashjitter = arc4random();
735 KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
737 ft = malloc(sizeof(struct flowtable),
738 M_RTABLE, M_WAITOK | M_ZERO);
740 ft->ft_flags = flags;
741 ft->ft_size = nentry;
/* Multipath builds route via the mpath backend, else plain in_rtalloc. */
743 ft->ft_rtalloc = rtalloc_mpath_fib;
745 ft->ft_rtalloc = in_rtalloc_ign_wrapper;
747 if (flags & FL_PCPU) {
748 ft->ft_lock = flowtable_pcpu_lock;
749 ft->ft_unlock = flowtable_pcpu_unlock;
/* One bucket array and occupancy bitmap per possible CPU. */
751 for (i = 0; i <= mp_maxid; i++) {
752 ft->ft_table.pcpu[i] =
753 malloc(nentry*sizeof(struct flentry *),
754 M_RTABLE, M_WAITOK | M_ZERO);
755 ft->ft_masks[i] = bit_alloc(nentry);
/* Global mode: size the mutex pool relative to the CPU count. */
758 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
759 (fls(mp_maxid + 1) << 1));
761 ft->ft_lock = flowtable_global_lock;
762 ft->ft_unlock = flowtable_global_unlock;
763 ft->ft_table.global =
764 malloc(nentry*sizeof(struct flentry *),
765 M_RTABLE, M_WAITOK | M_ZERO);
766 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
767 M_RTABLE, M_WAITOK | M_ZERO);
768 for (i = 0; i < ft->ft_lock_count; i++)
769 mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
771 ft->ft_masks[0] = bit_alloc(nentry);
773 ft->ft_tmpmask = bit_alloc(nentry);
/* NOTE(review): the comment says 5s but the code below sets 30s. */
776 * In the local transmit case the table truly is
777 * just a cache - so everything is eligible for
778 * replacement after 5s of non-use
780 if (flags & FL_HASH_PORTS) {
781 ft->ft_udp_idle = V_flowtable_udp_expire;
782 ft->ft_syn_idle = V_flowtable_syn_expire;
783 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
/*
 * NOTE(review): ft_tcp_idle is set from the *fin_wait* expire
 * value -- looks like a copy/paste slip; V_flowtable_tcp_expire
 * was probably intended.  Confirm before changing.
 */
784 ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
786 ft->ft_udp_idle = ft->ft_fin_wait_idle =
787 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
792 * hook in to the cleaner list
794 if (V_flow_list_head == NULL)
795 V_flow_list_head = ft;
797 fttail = V_flow_list_head;
798 while (fttail->ft_next != NULL)
799 fttail = fttail->ft_next;
800 fttail->ft_next = ft;
807 * The rest of the code is devoted to garbage collection of expired entries.
808 * It is a new addition made necessary by the switch to dynamically allocating
813 fle_free(struct flentry *fle)
818 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
819 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
822 uma_zfree((fle->f_flags & FL_IPV6) ?
823 V_flow_ipv6_zone : V_flow_ipv4_zone, fle);
/*
 * Cleaner pass over one table (current CPU's buckets for FL_PCPU tables):
 * walk every occupied bucket per the occupancy bitmap, unlink entries
 * that flow_stale() deems dead onto a private free list under the bucket
 * lock, then free them all outside the lock.
 * NOTE(review): declarations, braces and several statements are on lines
 * missing from this view.
 */
827 flowtable_free_stale(struct flowtable *ft)
829 int curbit = 0, count;
830 struct flentry *fle, **flehead, *fleprev;
831 struct flentry *flefreehead, *flefreetail, *fletmp;
832 bitstr_t *mask, *tmpmask;
834 flefreehead = flefreetail = NULL;
835 mask = flowtable_mask(ft);
/* Work on a copy of the occupancy bitmap so we can consume bits. */
836 tmpmask = ft->ft_tmpmask;
837 memcpy(tmpmask, mask, ft->ft_size/8);
839 * XXX Note to self, bit_ffs operates at the byte level
840 * and thus adds gratuitous overhead
842 bit_ffs(tmpmask, ft->ft_size, &curbit);
843 while (curbit != -1) {
/* Defensive range check on the bitmap scan result. */
844 if (curbit >= ft->ft_size || curbit < -1) {
846 "warning: bad curbit value %d \n",
851 FL_ENTRY_LOCK(ft, curbit);
852 flehead = flowtable_entry(ft, curbit);
853 fle = fleprev = *flehead;
855 V_flowtable_free_checks++;
/* A set bit with an empty bucket indicates bitmap inconsistency. */
857 if (fle == NULL && curbit > 0) {
859 "warning bit=%d set, but no fle found\n",
/* Unlink stale entries; live ones just advance the cursor. */
863 while (fle != NULL) {
864 if (!flow_stale(ft, fle)) {
870 * delete head of the list
872 if (fleprev == *flehead) {
874 if (fle == fleprev) {
875 fleprev = *flehead = fle->f_next;
877 fleprev = *flehead = fle;
881 * don't advance fleprev
884 fleprev->f_next = fle->f_next;
885 fle = fleprev->f_next;
/* Append the unlinked entry to the private free list. */
888 if (flefreehead == NULL)
889 flefreehead = flefreetail = fletmp;
891 flefreetail->f_next = fletmp;
892 flefreetail = fletmp;
894 fletmp->f_next = NULL;
/* Bucket emptied: clear its bit in the real occupancy bitmap. */
896 if (*flehead == NULL)
897 bit_clear(mask, curbit);
898 FL_ENTRY_UNLOCK(ft, curbit);
899 bit_clear(tmpmask, curbit);
900 bit_ffs(tmpmask, ft->ft_size, &curbit);
/* Free the collected entries outside all bucket locks. */
903 while ((fle = flefreehead) != NULL) {
904 flefreehead = fle->f_next;
909 if (V_flowtable_debug && count)
910 log(LOG_DEBUG, "freed %d flow entries\n", count);
/*
 * Run a cleaning pass over every table of the current vnet.  For per-CPU
 * tables the cleaner binds itself to each CPU in turn so that
 * flowtable_free_stale() operates on that CPU's private buckets.
 * NOTE(review): the table-list loop scaffolding is on lines missing from
 * this view.
 */
914 flowtable_clean_vnet(void)
916 struct flowtable *ft;
919 ft = V_flow_list_head;
921 if (ft->ft_flags & FL_PCPU) {
922 for (i = 0; i <= mp_maxid; i++) {
/* Pin this thread to CPU i for the duration of the pass. */
926 thread_lock(curthread);
927 sched_bind(curthread, i);
928 thread_unlock(curthread);
930 flowtable_free_stale(ft);
932 thread_lock(curthread);
933 sched_unbind(curthread);
934 thread_unlock(curthread);
/* Global tables need no CPU binding. */
937 flowtable_free_stale(ft);
/*
 * Body of the flowcleaner kernel process (started via flow_kp/SYSINIT):
 * loop forever, cleaning every vnet's tables, then sleep up to 10 seconds
 * on flowclean_cv.  The broadcast wakes any flowtable_flush() waiters.
 * NOTE(review): the outer for(;;) and CURVNET_RESTORE lines are missing
 * from this view.
 */
944 flowtable_cleaner(void)
946 VNET_ITERATOR_DECL(vnet_iter);
949 log(LOG_INFO, "flowtable cleaner started\n");
952 VNET_FOREACH(vnet_iter) {
953 CURVNET_SET(vnet_iter);
954 flowtable_clean_vnet();
961 * The 10 second interval between cleaning checks
964 mtx_lock(&flowclean_lock);
/* Wake flushers waiting for a completed pass, then nap for 10s. */
965 cv_broadcast(&flowclean_cv);
966 cv_timedwait(&flowclean_cv, &flowclean_lock, 10*hz);
967 mtx_unlock(&flowclean_lock);
972 flowtable_flush(void *unused __unused)
976 mtx_lock(&flowclean_lock);
977 start = flowclean_cycles;
978 while (start == flowclean_cycles) {
979 cv_broadcast(&flowclean_cv);
980 cv_wait(&flowclean_cv, &flowclean_lock);
982 mtx_unlock(&flowclean_lock);
985 static struct kproc_desc flow_kp = {
990 SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
993 flowtable_init_vnet(const void *unused __unused)
996 V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
997 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
998 V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
999 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1000 uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
1001 uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
1003 VNET_SYSINIT(flowtable_init_vnet, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE,
1004 flowtable_init_vnet, NULL);
1007 flowtable_init(const void *unused __unused)
1010 cv_init(&flowclean_cv, "flowcleanwait");
1011 mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
1012 EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
1013 EVENTHANDLER_PRI_ANY);
1014 V_flowtable_ready = 1;
1016 SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1017 flowtable_init, NULL);
1022 flowtable_uninit(const void *unused __unused)
1025 uma_zdestroy(V_flow_ipv4_zone);
1026 uma_zdestroy(V_flow_ipv6_zone);
1029 VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1030 flowtable_uninit, NULL);
1035 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1039 if (ft->ft_flags & FL_PCPU)
1040 mask = ft->ft_masks[cpuid];
1042 mask = ft->ft_masks[0];
1047 static struct flentry **
1048 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1050 struct flentry **fle;
1051 int index = (hash % ft->ft_size);
1053 if (ft->ft_flags & FL_PCPU) {
1054 fle = &ft->ft_table.pcpu[cpuid][index];
1056 fle = &ft->ft_table.global[index];
/*
 * DDB: print one flow entry -- hash, idle time, cached route/ifp and the
 * RTF_UP / FL_STALE state bits.
 * NOTE(review): declarations and the trailing newline print are on lines
 * missing from this view.
 */
1063 flow_show(struct flowtable *ft, struct flentry *fle)
1068 idle_time = (int)(time_uptime - fle->f_uptime);
/* Guard against dereferencing a NULL cached route below. */
1069 rt_valid = fle->f_rt != NULL;
1070 db_printf("hash=0x%08x idle_time=%03d rt=%p ifp=%p",
1071 fle->f_fhash, idle_time,
1072 fle->f_rt, rt_valid ? fle->f_rt->rt_ifp : NULL);
1073 if (rt_valid && (fle->f_rt->rt_flags & RTF_UP))
1074 db_printf(" RTF_UP ");
1075 if (fle->f_flags & FL_STALE)
1076 db_printf(" FL_STALE ");
/*
 * DDB: dump every entry of the given table for one CPU, scanning a copy
 * of its occupancy bitmap (same technique as flowtable_free_stale()).
 * NOTE(review): declarations and some loop scaffolding are on lines
 * missing from this view.
 */
1081 flowtable_show(struct flowtable *ft, int cpuid)
1084 struct flentry *fle, **flehead;
1085 bitstr_t *mask, *tmpmask;
1087 db_printf("cpu: %d\n", cpuid);
1088 mask = flowtable_mask_pcpu(ft, cpuid);
/* Scan a scratch copy so bits can be consumed as buckets are visited. */
1089 tmpmask = ft->ft_tmpmask;
1090 memcpy(tmpmask, mask, ft->ft_size/8);
1092 * XXX Note to self, bit_ffs operates at the byte level
1093 * and thus adds gratuitous overhead
1095 bit_ffs(tmpmask, ft->ft_size, &curbit);
1096 while (curbit != -1) {
/* Defensive range check on the bitmap scan result. */
1097 if (curbit >= ft->ft_size || curbit < -1) {
1098 db_printf("warning: bad curbit value %d \n",
1103 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
/* Print each entry on this bucket's collision chain. */
1106 while (fle != NULL) {
1111 bit_clear(tmpmask, curbit);
1112 bit_ffs(tmpmask, ft->ft_size, &curbit);
/*
 * DDB: dump every table of the current vnet; per-CPU tables are dumped
 * once per CPU, global tables once (as CPU 0).
 * NOTE(review): the list-advance and closing braces are on lines missing
 * from this view.
 */
1117 flowtable_show_vnet(void)
1119 struct flowtable *ft;
1122 ft = V_flow_list_head;
1123 while (ft != NULL) {
1124 if (ft->ft_flags & FL_PCPU) {
1125 for (i = 0; i <= mp_maxid; i++) {
1128 flowtable_show(ft, i);
1131 flowtable_show(ft, 0);
1137 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1139 VNET_ITERATOR_DECL(vnet_iter);
1141 VNET_FOREACH(vnet_iter) {
1142 CURVNET_SET(vnet_iter);
1143 flowtable_show_vnet();