1 /**************************************************************************
3 Copyright (c) 2008-2009, BitGravity Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the BitGravity Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include "opt_route.h"
31 #include "opt_mpath.h"
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/bitstring.h>
40 #include <sys/condvar.h>
41 #include <sys/callout.h>
42 #include <sys/kernel.h>
43 #include <sys/kthread.h>
44 #include <sys/limits.h>
45 #include <sys/malloc.h>
48 #include <sys/sched.h>
50 #include <sys/socket.h>
51 #include <sys/syslog.h>
52 #include <sys/sysctl.h>
55 #include <net/if_llatbl.h>
56 #include <net/if_var.h>
57 #include <net/route.h>
58 #include <net/flowtable.h>
61 #include <netinet/in.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in_var.h>
64 #include <netinet/if_ether.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
67 #include <netinet/udp.h>
68 #include <netinet/sctp.h>
70 #include <libkern/jenkins.h>
74 uint16_t ip_sport; /* source port */
75 uint16_t ip_dport; /* destination port */
76 in_addr_t ip_saddr; /* source address */
77 in_addr_t ip_daddr; /* destination address */
81 struct ipv4_tuple ipf_ipt;
86 uint16_t ip_sport; /* source port */
87 uint16_t ip_dport; /* destination port */
88 struct in6_addr ip_saddr; /* source address */
89 struct in6_addr ip_daddr; /* destination address */
93 struct ipv6_tuple ipf_ipt;
98 volatile uint32_t f_fhash; /* hash flowing forward */
99 uint16_t f_flags; /* flow flags */
101 uint8_t f_proto; /* protocol */
102 uint32_t f_fibnum; /* fib index */
103 uint32_t f_uptime; /* uptime at last access */
104 struct flentry *f_next; /* pointer to collision entry */
105 volatile struct rtentry *f_rt; /* rtentry for flow */
106 volatile struct llentry *f_lle; /* llentry for flow */
110 struct flentry fl_entry;
111 union ipv4_flow fl_flow;
115 struct flentry fl_entry;
116 union ipv6_flow fl_flow;
119 #define fl_fhash fl_entry.fl_fhash
120 #define fl_flags fl_entry.fl_flags
121 #define fl_proto fl_entry.fl_proto
122 #define fl_uptime fl_entry.fl_uptime
123 #define fl_rt fl_entry.fl_rt
124 #define fl_lle fl_entry.fl_lle
126 #define SECS_PER_HOUR 3600
127 #define SECS_PER_DAY (24*SECS_PER_HOUR)
131 #define FIN_WAIT_IDLE 600
132 #define TCP_IDLE SECS_PER_DAY
135 typedef void fl_lock_t(struct flowtable *, uint32_t);
136 typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
139 struct flentry **global;
140 struct flentry **pcpu[MAXCPU];
147 uint32_t ft_collisions;
148 uint32_t ft_allocated;
152 uint32_t ft_udp_idle;
153 uint32_t ft_fin_wait_idle;
154 uint32_t ft_syn_idle;
155 uint32_t ft_tcp_idle;
158 fl_lock_t *ft_unlock;
159 fl_rtalloc_t *ft_rtalloc;
160 struct mtx *ft_locks;
163 union flentryp ft_table;
164 bitstr_t *ft_masks[MAXCPU];
165 bitstr_t *ft_tmpmask;
166 struct flowtable *ft_next;
169 static struct proc *flowcleanerproc;
170 static VNET_DEFINE(struct flowtable *, flow_list_head);
171 static VNET_DEFINE(uint32_t, flow_hashjitter);
172 static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
173 static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
175 #define V_flow_list_head VNET(flow_list_head)
176 #define V_flow_hashjitter VNET(flow_hashjitter)
177 #define V_flow_ipv4_zone VNET(flow_ipv4_zone)
178 #define V_flow_ipv6_zone VNET(flow_ipv6_zone)
180 static struct cv flowclean_cv;
181 static struct mtx flowclean_lock;
182 static uint32_t flowclean_cycles;
186 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
187 * to avoid extra cache evictions caused by incrementing a shared
189 * - add IPv6 support to flow lookup
190 * - add sysctls to resize && flush flow tables
191 * - Add per flowtable sysctls for statistics and configuring timeouts
192 * - add saturation counter to rtentry to support per-packet load-balancing
193 * add flag to indicate round-robin flow, add list lookup from head
195 * - add sysctl / device node / syscall to support exporting and importing
196 * of flows with flag to indicate that a flow was imported so should
197 * not be considered for auto-cleaning
198 * - support explicit connection state (currently only ad-hoc for DSR)
199 * - idetach() cleanup for options VIMAGE builds.
201 VNET_DEFINE(int, flowtable_enable) = 1;
202 static VNET_DEFINE(int, flowtable_debug);
203 static VNET_DEFINE(int, flowtable_hits);
204 static VNET_DEFINE(int, flowtable_lookups);
205 static VNET_DEFINE(int, flowtable_misses);
206 static VNET_DEFINE(int, flowtable_frees);
207 static VNET_DEFINE(int, flowtable_free_checks);
208 static VNET_DEFINE(int, flowtable_max_depth);
209 static VNET_DEFINE(int, flowtable_collisions);
210 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
211 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
212 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
213 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
214 static VNET_DEFINE(int, flowtable_nmbflows) = 4096;
215 static VNET_DEFINE(int, flowtable_ready) = 0;
217 #define V_flowtable_enable VNET(flowtable_enable)
218 #define V_flowtable_debug VNET(flowtable_debug)
219 #define V_flowtable_hits VNET(flowtable_hits)
220 #define V_flowtable_lookups VNET(flowtable_lookups)
221 #define V_flowtable_misses VNET(flowtable_misses)
222 #define V_flowtable_frees VNET(flowtable_frees)
223 #define V_flowtable_free_checks VNET(flowtable_free_checks)
224 #define V_flowtable_max_depth VNET(flowtable_max_depth)
225 #define V_flowtable_collisions VNET(flowtable_collisions)
226 #define V_flowtable_syn_expire VNET(flowtable_syn_expire)
227 #define V_flowtable_udp_expire VNET(flowtable_udp_expire)
228 #define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
229 #define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
230 #define V_flowtable_nmbflows VNET(flowtable_nmbflows)
231 #define V_flowtable_ready VNET(flowtable_ready)
233 SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
234 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
235 &VNET_NAME(flowtable_debug), 0, "print debug info.");
236 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
237 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
238 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
239 &VNET_NAME(flowtable_hits), 0, "# flowtable hits.");
240 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
241 &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups.");
242 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
243 &VNET_NAME(flowtable_misses), 0, "#flowtable misses.");
244 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
245 &VNET_NAME(flowtable_frees), 0, "#flows freed.");
246 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
247 &VNET_NAME(flowtable_free_checks), 0, "#flows free checks.");
248 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
249 &VNET_NAME(flowtable_max_depth), 0, "max collision list length.");
250 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
251 &VNET_NAME(flowtable_collisions), 0, "#flowtable collisions.");
254 * XXX This does not end up updating timeouts at runtime
255 * and only reflects the value for the last table added :-/
257 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
258 &VNET_NAME(flowtable_syn_expire), 0,
259 "seconds after which to remove syn allocated flow.");
260 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
261 &VNET_NAME(flowtable_udp_expire), 0,
262 "seconds after which to remove flow allocated to UDP.");
263 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
264 &VNET_NAME(flowtable_fin_wait_expire), 0,
265 "seconds after which to remove a flow in FIN_WAIT.");
266 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
267 &VNET_NAME(flowtable_tcp_expire), 0,
268 "seconds after which to remove flow allocated to a TCP connection.");
272 * Maximum number of flows that can be allocated of a given type.
274 * The table is allocated at boot time (for the pure caching case
275 * there is no reason why this could not be changed at runtime)
276 * and thus (currently) needs to be set with a tunable.
/*
 * sysctl handler for net.inet.flowtable.nmbflows: per-family cap on the
 * number of allocated flow entries.  The visible code only ever raises
 * the limit (newnmbflows > current); lowering is silently ignored.
 * NOTE(review): brace/return lines are elided in this view.
 */
279 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
281 	int error, newnmbflows;
283 	newnmbflows = V_flowtable_nmbflows;
284 	error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
	/* Only act on a successful write request (req->newptr != NULL). */
285 	if (error == 0 && req->newptr) {
286 		if (newnmbflows > V_flowtable_nmbflows) {
287 			V_flowtable_nmbflows = newnmbflows;
			/* Push the new ceiling into both UMA zones. */
288 			uma_zone_set_max(V_flow_ipv4_zone,
289 			    V_flowtable_nmbflows)
290 			uma_zone_set_max(V_flow_ipv6_zone,
291 			    V_flowtable_nmbflows);
/* Register the handler as a read/write integer sysctl. */
297 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
298     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
299     "Maximum number of flows allowed");
/*
 * Adapt rtalloc_ign_fib() to the flowtable's fl_rtalloc_t signature;
 * the hash argument is unused by this non-multipath variant.
 */
303 in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
306 	rtalloc_ign_fib(ro, 0, fibnum);
/*
 * Acquire the bucket mutex covering 'hash' in a shared (global) table.
 * Masking assumes ft_lock_count is a power of two.
 */
311 flowtable_global_lock(struct flowtable *table, uint32_t hash)
313 	int lock_index = (hash)&(table->ft_lock_count - 1);
315 	mtx_lock(&table->ft_locks[lock_index]);
/* Release the bucket mutex taken by flowtable_global_lock() for 'hash'. */
319 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
321 	int lock_index = (hash)&(table->ft_lock_count - 1);
323 	mtx_unlock(&table->ft_locks[lock_index]);
/*
 * Per-cpu table variants: bodies are elided in this view; presumably
 * no-ops because each CPU only touches its own table -- TODO confirm
 * against the full file (critical-section handling may be elided).
 */
327 flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
334 flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
/* Bucket index/entry helpers; lock ops dispatch through ft_lock/ft_unlock. */
340 #define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
341 #define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
342 #define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
343 #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
/* Flag bits kept above the low byte that mirrors TCP th_flags. */
345 #define FL_STALE (1<<8)
346 #define FL_IPV6 (1<<9)
/*
 * Build the 3-word hash key (ports word, saddr, daddr) for an IPv4
 * packet (m) and/or destination route (ro), jenkins-hash it with the
 * per-vnet jitter, and stash the result as the mbuf flowid when the
 * mbuf does not already carry one.  A zero return means "don't cache".
 * NOTE(review): many lines are elided here; also TCP ports pass through
 * ntohs() while UDP/SCTP ports are used raw -- looks inconsistent,
 * verify against the full file before relying on cross-protocol hashes.
 */
349 ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
350     uint32_t *key, uint16_t *flags, uint8_t *protop)
352 	uint16_t sport = 0, dport = 0;
353 	struct ip *ip = NULL;
357 	struct sockaddr_in *sin;
	/* Bail out while the flowtable is disabled or not yet initialized. */
362 	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
366 	sin = (struct sockaddr_in *)&ro->ro_dst;
368 		ip = mtod(m, struct ip *);
369 		sin->sin_family = AF_INET;
370 		sin->sin_len = sizeof(*sin);
371 		sin->sin_addr = ip->ip_dst;
373 		*flags &= ~FL_HASH_PORTS;
375 	key[2] = sin->sin_addr.s_addr;
377 	if ((*flags & FL_HASH_PORTS) == 0)
381 	iphlen = ip->ip_hl << 2; /* XXX options? */
382 	key[1] = ip->ip_src.s_addr;
	/* Transport header immediately follows the IP header + options. */
386 		th = (struct tcphdr *)((caddr_t)ip + iphlen);
387 		sport = ntohs(th->th_sport);
388 		dport = ntohs(th->th_dport);
		/* Record SYN/FIN/ACK etc. so flow_stale() can age by TCP state. */
389 		*flags |= th->th_flags;
394 		uh = (struct udphdr *)((caddr_t)ip + iphlen);
395 		sport = uh->uh_sport;
396 		dport = uh->uh_dport;
399 		sh = (struct sctphdr *)((caddr_t)ip + iphlen);
400 		sport = sh->src_port;
401 		dport = sh->dest_port;
404 	if (*flags & FL_HASH_PORTS)
406 	/* no port - hence not a protocol we care about */
413 	 * If this is a transmit route cache then
414 	 * hash all flows to a given destination to
417 	if ((*flags & FL_HASH_PORTS) == 0)
418 		proto = sport = dport = 0;
420 	((uint16_t *)key)[0] = sport;
421 	((uint16_t *)key)[1] = dport;
424 	hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto);
	/* Cache the computed hash on the mbuf for downstream consumers. */
425 	if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
426 		m->m_flags |= M_FLOWID;
427 		m->m_pkthdr.flowid = hash;
/*
 * Return the occupancy bitmask for the caller's view of 'ft':
 * the current CPU's mask for per-cpu tables, slot 0 otherwise.
 */
437 flowtable_mask(struct flowtable *ft)
441 	if (ft->ft_flags & FL_PCPU)
442 		mask = ft->ft_masks[curcpu];
444 		mask = ft->ft_masks[0];
/*
 * Return the address of the bucket head for 'hash', selecting the
 * current CPU's array for per-cpu tables and the shared array otherwise.
 */
449 static struct flentry **
450 flowtable_entry(struct flowtable *ft, uint32_t hash)
452 	struct flentry **fle;
453 	int index = (hash % ft->ft_size);
455 	if (ft->ft_flags & FL_PCPU) {
456 		KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
457 		fle = &ft->ft_table.pcpu[curcpu][index];
459 		KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
460 		fle = &ft->ft_table.global[index];
/*
 * Decide whether a flow entry is stale and may be evicted: an unhashed
 * entry, a dead/downed route, an explicitly FL_STALE-marked entry, or an
 * entry idle longer than the timeout matching its TCP handshake state
 * (no flags => UDP-style, FIN seen, SYN only, or established SYN|ACK).
 * NOTE(review): lines are elided in this view; the route-flag condition
 * at the top looks truncated -- verify against the full file.
 */
467 flow_stale(struct flowtable *ft, struct flentry *fle)
471 	if ((fle->f_fhash == 0)
472 	    || ((fle->f_rt->rt_flags & RTF_HOST) &&
473 	    ((fle->f_rt->rt_flags & (RTF_UP))
475 	    || (fle->f_rt->rt_ifp == NULL))
	/* Seconds since the entry was last touched. */
478 	idle_time = time_uptime - fle->f_uptime;
480 	if ((fle->f_flags & FL_STALE) ||
481 	    ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
482 	    && (idle_time > ft->ft_udp_idle)) ||
483 	    ((fle->f_flags & TH_FIN)
484 	    && (idle_time > ft->ft_fin_wait_idle)) ||
485 	    ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
486 	    && (idle_time > ft->ft_syn_idle)) ||
487 	    ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
488 	    && (idle_time > ft->ft_tcp_idle)) ||
489 	    ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
490 	    (fle->f_rt->rt_ifp == NULL)))
/*
 * Copy the lookup key words into the flow entry's embedded tuple so the
 * entry can later be matched by flowtable_key_equal().
 *
 * BUG FIX: the casts were inverted -- the FL_IPV6 branch cast to
 * struct flentry_v4 * and the IPv4 branch to struct flentry_v6 *.
 * Swapped so each branch addresses the matching entry layout, which
 * also keeps this in agreement with flowtable_key_equal().
 */
497 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
502 	if (fle->f_flags & FL_IPV6) {
504 		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
507 		hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
	/* nwords is 9 for IPv6 and 3 for IPv4 (set on elided lines). */
510 	for (i = 0; i < nwords; i++)
/*
 * Allocate a new flow entry for (key, proto, fibnum), insert it into the
 * bucket for 'hash', and record the route/llentry references from 'ro'.
 * On a hash collision or a lost insertion race the new entry is freed
 * and (per the elided return) the existing entry wins.
 * NOTE(review): heavily elided view -- error/return paths are missing.
 */
515 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
516     uint8_t proto, uint32_t fibnum, struct route *ro, uint16_t flags)
518 	struct flentry *fle, *fletail, *newfle, **flep;
	/* Pick the UMA zone matching the address family of the new flow. */
523 	flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
524 	newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO);
528 	newfle->f_flags |= (flags & FL_IPV6);
530 	FL_ENTRY_LOCK(ft, hash);
531 	mask = flowtable_mask(ft);
532 	flep = flowtable_entry(ft, hash);
533 	fletail = fle = *flep;
	/* Empty bucket: mark it occupied and link the new entry as head. */
536 		bit_set(mask, FL_ENTRY_INDEX(ft, hash));
537 		*flep = fle = newfle;
542 		V_flowtable_collisions++;
544 		 * find end of list and make sure that we were not
545 		 * preempted by another thread handling this flow
547 		while (fle != NULL) {
548 			if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
550 				 * there was either a hash collision
551 				 * or we lost a race to insert
553 				FL_ENTRY_UNLOCK(ft, hash);
554 				uma_zfree((newfle->f_flags & FL_IPV6) ?
555 				    V_flow_ipv6_zone : V_flow_ipv4_zone, newfle);
559 			 * re-visit this double condition XXX
			/*
			 * NOTE(review): mixing fletail->f_next with
			 * fle->f_next looks suspect (tail tracking may lag
			 * the walk); confirm against the full file.
			 */
561 			if (fletail->f_next != NULL)
562 				fletail = fle->f_next;
568 		if (depth > V_flowtable_max_depth)
569 			V_flowtable_max_depth = depth;
570 		fletail->f_next = newfle;
	/* Populate the (new) entry under the bucket lock. */
573 	flowtable_set_hashkey(fle, key);
575 	fle->f_proto = proto;
576 	fle->f_rt = ro->ro_rt;
577 	fle->f_lle = ro->ro_lle;
579 	fle->f_fibnum = fibnum;
580 	fle->f_uptime = time_uptime;
581 	FL_ENTRY_UNLOCK(ft, hash);
/*
 * Compare the key words stored in a flow entry against 'key'; returns
 * inequality on the first differing word (per the elided return lines).
 *
 * BUG FIX: the casts were inverted -- the FL_IPV6 branch cast to
 * struct flentry_v4 * and the IPv4 branch to struct flentry_v6 *.
 * Swapped so each branch addresses the matching entry layout, which
 * also keeps this in agreement with flowtable_set_hashkey().
 */
586 flowtable_key_equal(struct flentry *fle, uint32_t *key)
591 	if (fle->f_flags & FL_IPV6) {
593 		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
596 		hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
	/* nwords is 9 for IPv6 and 3 for IPv4 (set on elided lines). */
599 	for (i = 0; i < nwords; i++)
600 		if (hashkey[i] != key[i])
/*
 * Look up (and on miss, create) the cached flow for packet 'm' /
 * destination 'ro' in table 'ft'.  On a hit the entry's route and
 * llentry are handed back through 'ro'; on a miss a fresh route (and,
 * for ethernet-style interfaces, llentry) is resolved and inserted.
 * NOTE(review): heavily elided view -- return paths, ref-count
 * manipulation and the miss/insert glue are largely missing.
 */
607 flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro, uint32_t fibnum)
609 	uint32_t key[9], hash;
617 	flags = ft->ft_flags;
622 	 * The internal hash lookup is the only IPv4 specific bit
625 	 * XXX BZ: to add IPv6 support just add a check for the
626 	 * address type in m and ro and an equivalent ipv6 lookup
627 	 * function - the rest of the code should automatically
628 	 * handle an ipv6 flow (note that m can be NULL in which
629 	 * case ro will be set)
631 	hash = ipv4_flow_lookup_hash_internal(m, ro, key,
635 	 * Ports are zero and this isn't a transmit cache
636 	 * - thus not a protocol for which we need to keep
638 	 * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP
640 	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS)))
643 	V_flowtable_lookups++;
644 	FL_ENTRY_LOCK(ft, hash);
645 	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
646 		FL_ENTRY_UNLOCK(ft, hash);
	/* Strip volatile for local use; entries publish these fields volatile. */
650 	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
651 	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	/* Full match: same hash, key, proto, fib, and a live route. */
653 	    && fle->f_fhash == hash
654 	    && flowtable_key_equal(fle, key)
655 	    && (proto == fle->f_proto)
656 	    && (fibnum == fle->f_fibnum)
657 	    && (rt->rt_flags & RTF_UP)
658 	    && (rt->rt_ifp != NULL)) {
660 		fle->f_uptime = time_uptime;
661 		fle->f_flags |= flags;
664 		FL_ENTRY_UNLOCK(ft, hash);
666 	} else if (fle->f_next != NULL) {
670 	FL_ENTRY_UNLOCK(ft, hash);
673 	V_flowtable_misses++;
675 	 * This bit of code ends up locking the
676 	 * same route 3 times (just like ip_output + ether_output)
678 	 * - in rt_check when called by arpresolve
679 	 * - dropping the refcount for the rtentry
681 	 * This could be consolidated to one if we wrote a variant
682 	 * of arpresolve with an rt_check variant that expected to
683 	 * receive the route locked
	/* Miss path: resolve a route (and llentry) before inserting. */
686 	ft->ft_rtalloc(ro, hash, fibnum);
687 	if (ro->ro_rt == NULL)
690 		struct llentry *lle = NULL;
691 		struct sockaddr *l3addr;
692 		struct rtentry *rt = ro->ro_rt;
693 		struct ifnet *ifp = rt->rt_ifp;
		/* No L2 resolution needed on p2p/loopback interfaces. */
695 		if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
701 		if (rt->rt_flags & RTF_GATEWAY)
702 			l3addr = rt->rt_gateway;
704 			l3addr = &ro->ro_dst;
705 		llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
713 	error = flowtable_insert(ft, hash, key, proto, fibnum,
728 * used by the bit_alloc macro
730 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
/*
 * Allocate and initialize a flowtable with 'nentry' buckets, wire up the
 * lock/rtalloc ops according to 'flags' (FL_PCPU vs. global, multipath),
 * set the per-state idle timeouts, and append the table to the per-vnet
 * cleaner list.
 *
 * BUG FIX: ft_tcp_idle was initialized from V_flowtable_fin_wait_expire
 * (copy-paste from the line above), so established TCP flows aged out on
 * the FIN_WAIT timeout; it now uses V_flowtable_tcp_expire.
 * NOTE(review): elided view -- #ifdef and brace lines are missing.
 */
733 flowtable_alloc(int nentry, int flags)
735 	struct flowtable *ft, *fttail;
	/* Lazily seed the per-vnet hash jitter on first table creation. */
738 	if (V_flow_hashjitter == 0)
739 		V_flow_hashjitter = arc4random();
741 	KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
743 	ft = malloc(sizeof(struct flowtable),
744 	    M_RTABLE, M_WAITOK | M_ZERO);
746 	ft->ft_flags = flags;
747 	ft->ft_size = nentry;
749 	ft->ft_rtalloc = rtalloc_mpath_fib;
751 	ft->ft_rtalloc = in_rtalloc_ign_wrapper;
753 	if (flags & FL_PCPU) {
754 		ft->ft_lock = flowtable_pcpu_lock;
755 		ft->ft_unlock = flowtable_pcpu_unlock;
		/* One bucket array and occupancy bitmask per CPU. */
757 		for (i = 0; i <= mp_maxid; i++) {
758 			ft->ft_table.pcpu[i] =
759 			    malloc(nentry*sizeof(struct flentry *),
760 				M_RTABLE, M_WAITOK | M_ZERO);
761 			ft->ft_masks[i] = bit_alloc(nentry);
764 		ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
765 		    (fls(mp_maxid + 1) << 1));
767 		ft->ft_lock = flowtable_global_lock;
768 		ft->ft_unlock = flowtable_global_unlock;
769 		ft->ft_table.global =
770 			    malloc(nentry*sizeof(struct flentry *),
771 				M_RTABLE, M_WAITOK | M_ZERO);
772 		ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
773 				M_RTABLE, M_WAITOK | M_ZERO);
774 		for (i = 0; i < ft->ft_lock_count; i++)
775 			mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
777 		ft->ft_masks[0] = bit_alloc(nentry);
779 	ft->ft_tmpmask = bit_alloc(nentry);
782 	 * In the local transmit case the table truly is
783 	 * just a cache - so everything is eligible for
784 	 * replacement after 5s of non-use
786 	if (flags & FL_HASH_PORTS) {
787 		ft->ft_udp_idle = V_flowtable_udp_expire;
788 		ft->ft_syn_idle = V_flowtable_syn_expire;
789 		ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
790 		ft->ft_tcp_idle = V_flowtable_tcp_expire;
792 		ft->ft_udp_idle = ft->ft_fin_wait_idle =
793 			ft->ft_syn_idle = ft->ft_tcp_idle = 30;
798 	 * hook in to the cleaner list
800 	if (V_flow_list_head == NULL)
801 		V_flow_list_head = ft;
803 		fttail = V_flow_list_head;
804 		while (fttail->ft_next != NULL)
805 			fttail = fttail->ft_next;
806 		fttail->ft_next = ft;
813 * The rest of the code is devoted to garbage collection of expired entries.
814 * It is a new additon made necessary by the switch to dynamically allocating
/*
 * Release a flow entry: pick up its (volatile) route and llentry
 * references, then return the entry to the matching family's UMA zone.
 * NOTE(review): the reference-dropping lines are elided in this view.
 */
819 fle_free(struct flentry *fle)
824 	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
825 	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
828 	uma_zfree((fle->f_flags & FL_IPV6) ?
829 	    V_flow_ipv6_zone : V_flow_ipv4_zone, fle);
/*
 * Walk all occupied buckets of 'ft' (guided by a snapshot of the
 * occupancy bitmask) and unlink entries that are stale or that reference
 * 'rt' (when rt != NULL), collecting them on a private list that is
 * freed after all bucket locks have been dropped.
 * NOTE(review): elided view -- the unlink/collect steps around fletmp
 * are intricate; code left untouched.
 */
833 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
835 int curbit = 0, count;
836 struct flentry *fle, **flehead, *fleprev;
837 struct flentry *flefreehead, *flefreetail, *fletmp;
838 bitstr_t *mask, *tmpmask;
840 flefreehead = flefreetail = NULL;
841 mask = flowtable_mask(ft);
842 tmpmask = ft->ft_tmpmask;
/* Snapshot the mask so bits can be consumed without racing setters. */
843 memcpy(tmpmask, mask, ft->ft_size/8);
845 * XXX Note to self, bit_ffs operates at the byte level
846 * and thus adds gratuitous overhead
848 bit_ffs(tmpmask, ft->ft_size, &curbit);
849 while (curbit != -1) {
850 if (curbit >= ft->ft_size || curbit < -1) {
852 "warning: bad curbit value %d \n",
857 FL_ENTRY_LOCK(ft, curbit);
858 flehead = flowtable_entry(ft, curbit);
859 fle = fleprev = *flehead;
861 V_flowtable_free_checks++;
863 if (fle == NULL && curbit > 0) {
865 "warning bit=%d set, but no fle found\n",
869 while (fle != NULL) {
/* Keep entries for other routes and entries that are still fresh. */
871 if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
876 } else if (!flow_stale(ft, fle)) {
882 * delete head of the list
884 if (fleprev == *flehead) {
886 if (fle == fleprev) {
887 fleprev = *flehead = fle->f_next;
889 fleprev = *flehead = fle;
893 * don't advance fleprev
896 fleprev->f_next = fle->f_next;
897 fle = fleprev->f_next;
/* Append the victim to the deferred-free list. */
900 if (flefreehead == NULL)
901 flefreehead = flefreetail = fletmp;
903 flefreetail->f_next = fletmp;
904 flefreetail = fletmp;
906 fletmp->f_next = NULL;
908 if (*flehead == NULL)
909 bit_clear(mask, curbit);
910 FL_ENTRY_UNLOCK(ft, curbit);
911 bit_clear(tmpmask, curbit);
912 bit_ffs(tmpmask, ft->ft_size, &curbit);
/* Locks dropped: now actually free everything we collected. */
915 while ((fle = flefreehead) != NULL) {
916 flefreehead = fle->f_next;
921 if (V_flowtable_debug && count)
922 log(LOG_DEBUG, "freed %d flow entries\n", count);
/*
 * Flush all flows referencing 'rt'.  Per-cpu tables are private to each
 * CPU, so the cleaner binds itself to each CPU in turn (once SMP is up)
 * before scanning that CPU's table; global tables are scanned directly.
 */
926 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
929 if (ft->ft_flags & FL_PCPU) {
930 for (i = 0; i <= mp_maxid; i++) {
934 if (smp_started == 1) {
935 thread_lock(curthread);
936 sched_bind(curthread, i);
937 thread_unlock(curthread);
940 flowtable_free_stale(ft, rt);
942 if (smp_started == 1) {
943 thread_lock(curthread);
944 sched_unbind(curthread);
945 thread_unlock(curthread);
949 flowtable_free_stale(ft, rt);
/*
 * Age out stale flows in every table of the current vnet (rt == NULL
 * means "stale entries only").  Same CPU-binding dance as
 * flowtable_route_flush() for per-cpu tables.
 */
954 flowtable_clean_vnet(void)
956 struct flowtable *ft;
959 ft = V_flow_list_head;
961 if (ft->ft_flags & FL_PCPU) {
962 for (i = 0; i <= mp_maxid; i++) {
966 if (smp_started == 1) {
967 thread_lock(curthread);
968 sched_bind(curthread, i);
969 thread_unlock(curthread);
972 flowtable_free_stale(ft, NULL);
974 if (smp_started == 1) {
975 thread_lock(curthread);
976 sched_unbind(curthread);
977 thread_unlock(curthread);
981 flowtable_free_stale(ft, NULL);
/*
 * Kernel thread body: periodically sweep every vnet's flowtables,
 * waking waiters (flowtable_flush) via flowclean_cv each pass, then
 * sleeping up to 10s or until kicked.
 */
988 flowtable_cleaner(void)
990 VNET_ITERATOR_DECL(vnet_iter);
993 log(LOG_INFO, "flowtable cleaner started\n");
996 VNET_FOREACH(vnet_iter) {
997 CURVNET_SET(vnet_iter);
998 flowtable_clean_vnet();
1001 VNET_LIST_RUNLOCK();
1005 * The 10 second interval between cleaning checks
1008 mtx_lock(&flowclean_lock);
/* Wake anyone waiting in flowtable_flush() for a completed cycle. */
1009 cv_broadcast(&flowclean_cv);
1010 cv_timedwait(&flowclean_cv, &flowclean_lock, 10*hz);
1011 mtx_unlock(&flowclean_lock);
/*
 * Event handler (ifnet departure): kick the cleaner and block until
 * flowclean_cycles advances, i.e. a full cleaning pass has completed.
 */
1016 flowtable_flush(void *unused __unused)
1020 mtx_lock(&flowclean_lock);
1021 start = flowclean_cycles;
1022 while (start == flowclean_cycles) {
1023 cv_broadcast(&flowclean_cv);
1024 cv_wait(&flowclean_cv, &flowclean_lock);
1026 mtx_unlock(&flowclean_lock);
/* Descriptor for the cleaner kproc; started at SI_SUB_KTHREAD_IDLE. */
1029 static struct kproc_desc flow_kp = {
1034 SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
/*
 * Per-vnet initialization: create the v4/v6 flow-entry UMA zones,
 * apply the nmbflows cap, and mark the flowtable usable.
 */
1037 flowtable_init_vnet(const void *unused __unused)
1040 V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
1041 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1042 V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
1043 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1044 uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
1045 uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
/* Gate checked by ipv4_flow_lookup_hash_internal(). */
1046 V_flowtable_ready = 1;
1048 VNET_SYSINIT(flowtable_init_vnet, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE,
1049 flowtable_init_vnet, NULL);
/*
 * Global (non-vnet) initialization: cleaner condvar/mutex, plus the
 * ifnet-departure hook that forces a flush when an interface goes away.
 */
1052 flowtable_init(const void *unused __unused)
1055 cv_init(&flowclean_cv, "flowcleanwait");
1056 mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
1057 EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
1058 EVENTHANDLER_PRI_ANY);
1060 SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1061 flowtable_init, NULL);
/*
 * Per-vnet teardown: mark the flowtable unusable before destroying
 * the UMA zones so concurrent lookups bail out early.
 */
1066 flowtable_uninit(const void *unused __unused)
1069 V_flowtable_ready = 0;
1070 uma_zdestroy(V_flow_ipv4_zone);
1071 uma_zdestroy(V_flow_ipv6_zone);
1074 VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1075 flowtable_uninit, NULL);
/*
 * DDB helper: like flowtable_mask() but for an explicit cpu id rather
 * than curcpu, so the debugger can inspect any CPU's mask.
 */
1080 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1084 if (ft->ft_flags & FL_PCPU)
1085 mask = ft->ft_masks[cpuid];
1087 mask = ft->ft_masks[0];
/*
 * DDB helper: like flowtable_entry() but for an explicit cpu id,
 * returning the bucket head address for 'hash'.
 */
1092 static struct flentry **
1093 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1095 struct flentry **fle;
1096 int index = (hash % ft->ft_size);
1098 if (ft->ft_flags & FL_PCPU) {
1099 fle = &ft->ft_table.pcpu[cpuid][index];
1101 fle = &ft->ft_table.global[index];
/*
 * DDB helper: print one flow entry -- hash, idle time, route/ifp
 * pointers, and the RTF_UP / FL_STALE status flags.
 */
1108 flow_show(struct flowtable *ft, struct flentry *fle)
1113 idle_time = (int)(time_uptime - fle->f_uptime);
1114 rt_valid = fle->f_rt != NULL;
1115 db_printf("hash=0x%08x idle_time=%03d rt=%p ifp=%p",
1116 fle->f_fhash, idle_time,
1117 fle->f_rt, rt_valid ? fle->f_rt->rt_ifp : NULL);
1118 if (rt_valid && (fle->f_rt->rt_flags & RTF_UP))
1119 db_printf(" RTF_UP ");
1120 if (fle->f_flags & FL_STALE)
1121 db_printf(" FL_STALE ");
/*
 * DDB helper: dump every occupied bucket of one CPU's view of 'ft',
 * walking the occupancy bitmask the same way flowtable_free_stale()
 * does (snapshot into ft_tmpmask, then consume bits with bit_ffs).
 */
1126 flowtable_show(struct flowtable *ft, int cpuid)
1129 struct flentry *fle, **flehead;
1130 bitstr_t *mask, *tmpmask;
1132 db_printf("cpu: %d\n", cpuid);
1133 mask = flowtable_mask_pcpu(ft, cpuid);
1134 tmpmask = ft->ft_tmpmask;
1135 memcpy(tmpmask, mask, ft->ft_size/8);
1137 * XXX Note to self, bit_ffs operates at the byte level
1138 * and thus adds gratuitous overhead
1140 bit_ffs(tmpmask, ft->ft_size, &curbit);
1141 while (curbit != -1) {
1142 if (curbit >= ft->ft_size || curbit < -1) {
1143 db_printf("warning: bad curbit value %d \n",
1148 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1151 while (fle != NULL) {
1156 bit_clear(tmpmask, curbit);
1157 bit_ffs(tmpmask, ft->ft_size, &curbit);
/*
 * DDB helper: dump every flowtable in the current vnet, iterating all
 * CPUs for per-cpu tables and slot 0 for global ones.
 */
1162 flowtable_show_vnet(void)
1164 struct flowtable *ft;
1167 ft = V_flow_list_head;
1168 while (ft != NULL) {
1169 if (ft->ft_flags & FL_PCPU) {
1170 for (i = 0; i <= mp_maxid; i++) {
1173 flowtable_show(ft, i);
1176 flowtable_show(ft, 0);
1182 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1184 VNET_ITERATOR_DECL(vnet_iter);
1186 VNET_FOREACH(vnet_iter) {
1187 CURVNET_SET(vnet_iter);
1188 flowtable_show_vnet();