/**************************************************************************

Copyright (c) 2008-2009, BitGravity Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the BitGravity Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include "opt_route.h"
#include "opt_mpath.h"
#include "opt_ddb.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitstring.h>
#include <sys/condvar.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>

#include <vm/uma.h>		/* uma_zone_t, uma_zalloc(), uma_zcreate() */

#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/flowtable.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

#include <libkern/jenkins.h>
#include <ddb/ddb.h>

struct ipv4_tuple {
	uint16_t	ip_sport;	/* source port */
	uint16_t	ip_dport;	/* destination port */
	in_addr_t	ip_saddr;	/* source address */
	in_addr_t	ip_daddr;	/* destination address */
};

union ipv4_flow {
	struct ipv4_tuple ipf_ipt;
	uint32_t	ipf_key[3];
};

struct ipv6_tuple {
	uint16_t	ip_sport;	/* source port */
	uint16_t	ip_dport;	/* destination port */
	struct in6_addr ip_saddr;	/* source address */
	struct in6_addr ip_daddr;	/* destination address */
};

union ipv6_flow {
	struct ipv6_tuple ipf_ipt;
	uint32_t	ipf_key[9];
};

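/*
 * Each *_flow union overlays the human-readable tuple with the flat
 * word array that is actually hashed: 3 x 32-bit words for IPv4
 * (ports, source address, destination address), 9 words for IPv6.
 * Illustrative sketch only (this is what the hash function below
 * effectively builds):
 *
 *	union ipv4_flow f;
 *	((uint16_t *)f.ipf_key)[0] = sport;
 *	((uint16_t *)f.ipf_key)[1] = dport;
 *	f.ipf_key[1] = ip->ip_src.s_addr;
 *	f.ipf_key[2] = ip->ip_dst.s_addr;
 */
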
struct flentry {
	volatile uint32_t	f_fhash;	/* hash flowing forward */
	uint16_t		f_flags;	/* flow flags */
	uint8_t			f_pad;
	uint8_t			f_proto;	/* protocol */
	uint32_t		f_fibnum;	/* fib index */
	uint32_t		f_uptime;	/* uptime at last access */
	struct flentry		*f_next;	/* pointer to collision entry */
	volatile struct rtentry *f_rt;		/* rtentry for flow */
	volatile struct llentry *f_lle;		/* llentry for flow */
};

struct flentry_v4 {
	struct flentry	fl_entry;
	union ipv4_flow	fl_flow;
};

struct flentry_v6 {
	struct flentry	fl_entry;
	union ipv6_flow	fl_flow;
};

#define	fl_fhash	fl_entry.f_fhash
#define	fl_flags	fl_entry.f_flags
#define	fl_proto	fl_entry.f_proto
#define	fl_uptime	fl_entry.f_uptime
#define	fl_rt		fl_entry.f_rt
#define	fl_lle		fl_entry.f_lle

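/*
 * The fl_* macros let a "struct flentry_v4 *" or "struct flentry_v6 *"
 * reach the embedded common header, e.g. (illustrative):
 *
 *	struct flentry_v4 *fle4;
 *	fle4->fl_fhash;		(expands to fle4->fl_entry.f_fhash)
 *
 * Conversely, code below downcasts a generic "struct flentry *" to the
 * address-family specific type to reach the flow key; this is safe
 * because fl_entry is the first member of both wrapper structs.
 */
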
#define	SECS_PER_HOUR		3600
#define	SECS_PER_DAY		(24*SECS_PER_HOUR)

#define	SYN_IDLE		300
#define	UDP_IDLE		300
#define	FIN_WAIT_IDLE		600
#define	TCP_IDLE		SECS_PER_DAY


typedef void fl_lock_t(struct flowtable *, uint32_t);
typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);

union flentryp {
	struct flentry		**global;
	struct flentry		**pcpu[MAXCPU];
};

struct flowtable {
	int		ft_size;
	int		ft_lock_count;
	uint32_t	ft_flags;
	uint32_t	ft_collisions;
	uint32_t	ft_allocated;
	uint32_t	ft_misses;
	uint64_t	ft_hits;

	uint32_t	ft_udp_idle;
	uint32_t	ft_fin_wait_idle;
	uint32_t	ft_syn_idle;
	uint32_t	ft_tcp_idle;

	fl_lock_t	*ft_lock;
	fl_lock_t	*ft_unlock;
	fl_rtalloc_t	*ft_rtalloc;
	struct mtx	*ft_locks;

	union flentryp	ft_table;
	bitstr_t	*ft_masks[MAXCPU];
	bitstr_t	*ft_tmpmask;
	struct flowtable *ft_next;
};

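/*
 * A flowtable operates in one of two modes, selected by FL_PCPU at
 * allocation time: a single global bucket array protected by a pool
 * of mutexes hashed on the flow hash, or one bucket array per CPU
 * protected only by critical sections (each CPU touches only its own
 * table).  ft_masks tracks which buckets are occupied so the cleaner
 * can skip empty ones.
 */
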
static struct proc *flowcleanerproc;
static VNET_DEFINE(struct flowtable *, flow_list_head);
static VNET_DEFINE(uint32_t, flow_hashjitter);
static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);

#define	V_flow_list_head	VNET(flow_list_head)
#define	V_flow_hashjitter	VNET(flow_hashjitter)
#define	V_flow_ipv4_zone	VNET(flow_ipv4_zone)
#define	V_flow_ipv6_zone	VNET(flow_ipv6_zone)

static struct cv	flowclean_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;

/*
 * TODO:
 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
 *   to avoid extra cache evictions caused by incrementing a shared
 *   counter
 * - add IPv6 support to flow lookup
 * - add sysctls to resize and flush flow tables
 * - Add per flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing,
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
 * - add sysctl / device node / syscall to support exporting and importing
 *   of flows with flag to indicate that a flow was imported so should
 *   not be considered for auto-cleaning
 * - support explicit connection state (currently only ad-hoc for DSR)
 * - ifdetach() cleanup for options VIMAGE builds.
 */
VNET_DEFINE(int, flowtable_enable) = 1;
static VNET_DEFINE(int, flowtable_debug);
static VNET_DEFINE(int, flowtable_hits);
static VNET_DEFINE(int, flowtable_lookups);
static VNET_DEFINE(int, flowtable_misses);
static VNET_DEFINE(int, flowtable_frees);
static VNET_DEFINE(int, flowtable_free_checks);
static VNET_DEFINE(int, flowtable_max_depth);
static VNET_DEFINE(int, flowtable_collisions);
static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
static VNET_DEFINE(int, flowtable_nmbflows) = 4096;
static VNET_DEFINE(int, flowtable_ready) = 0;

#define	V_flowtable_enable		VNET(flowtable_enable)
#define	V_flowtable_debug		VNET(flowtable_debug)
#define	V_flowtable_hits		VNET(flowtable_hits)
#define	V_flowtable_lookups		VNET(flowtable_lookups)
#define	V_flowtable_misses		VNET(flowtable_misses)
#define	V_flowtable_frees		VNET(flowtable_frees)
#define	V_flowtable_free_checks		VNET(flowtable_free_checks)
#define	V_flowtable_max_depth		VNET(flowtable_max_depth)
#define	V_flowtable_collisions		VNET(flowtable_collisions)
#define	V_flowtable_syn_expire		VNET(flowtable_syn_expire)
#define	V_flowtable_udp_expire		VNET(flowtable_udp_expire)
#define	V_flowtable_fin_wait_expire	VNET(flowtable_fin_wait_expire)
#define	V_flowtable_tcp_expire		VNET(flowtable_tcp_expire)
#define	V_flowtable_nmbflows		VNET(flowtable_nmbflows)
#define	V_flowtable_ready		VNET(flowtable_ready)

SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
    &VNET_NAME(flowtable_debug), 0, "print debug info.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
    &VNET_NAME(flowtable_hits), 0, "# flowtable hits.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
    &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
    &VNET_NAME(flowtable_misses), 0, "# flowtable misses.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
    &VNET_NAME(flowtable_frees), 0, "# flows freed.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
    &VNET_NAME(flowtable_free_checks), 0, "# flow free checks.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
    &VNET_NAME(flowtable_max_depth), 0, "max collision list length.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
    &VNET_NAME(flowtable_collisions), 0, "# flowtable collisions.");

/*
 * XXX This does not end up updating timeouts at runtime
 * and only reflects the value for the last table added :-/
 */
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_syn_expire), 0,
    "seconds after which to remove syn allocated flow.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_udp_expire), 0,
    "seconds after which to remove flow allocated to UDP.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_fin_wait_expire), 0,
    "seconds after which to remove a flow in FIN_WAIT.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_tcp_expire), 0,
    "seconds after which to remove flow allocated to a TCP connection.");

/*
 * Maximum number of flows that can be allocated for a given type.
 *
 * The table is allocated at boot time (for the pure caching case
 * there is no reason why this could not be changed at runtime)
 * and thus (currently) needs to be set with a tunable.
 */
static int
sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbflows;

	newnmbflows = V_flowtable_nmbflows;
	error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
	if (error == 0 && req->newptr) {
		if (newnmbflows > V_flowtable_nmbflows) {
			V_flowtable_nmbflows = newnmbflows;
			uma_zone_set_max(V_flow_ipv4_zone,
			    V_flowtable_nmbflows);
			uma_zone_set_max(V_flow_ipv6_zone,
			    V_flowtable_nmbflows);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
    "Maximum number of flows allowed");

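/*
 * Illustrative use: raising the flow limit at runtime, which is the
 * only direction the handler above permits since the UMA zone limits
 * can only grow safely:
 *
 *	# sysctl net.inet.flowtable.nmbflows=65536
 */
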
#ifndef RADIX_MPATH
static void
in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
{

	rtalloc_ign_fib(ro, 0, fibnum);
}
#endif

static void
flowtable_global_lock(struct flowtable *table, uint32_t hash)
{
	int lock_index = hash & (table->ft_lock_count - 1);

	mtx_lock(&table->ft_locks[lock_index]);
}

static void
flowtable_global_unlock(struct flowtable *table, uint32_t hash)
{
	int lock_index = hash & (table->ft_lock_count - 1);

	mtx_unlock(&table->ft_locks[lock_index]);
}

static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

	critical_enter();
}

static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

	critical_exit();
}

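/*
 * Two locking implementations back the ft_lock/ft_unlock hooks: the
 * global variant hashes into the ft_locks mutex pool, while the
 * per-CPU variant only needs to disable preemption, because a given
 * CPU's table is never touched from another CPU (the cleaner binds
 * itself to each CPU in turn before scanning).
 */
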
#define	FL_ENTRY_INDEX(table, hash)	((hash) % (table)->ft_size)
#define	FL_ENTRY(table, hash)		*flowtable_entry((table), (hash))
#define	FL_ENTRY_LOCK(table, hash)	(table)->ft_lock((table), (hash))
#define	FL_ENTRY_UNLOCK(table, hash)	(table)->ft_unlock((table), (hash))

#define	FL_STALE	(1<<8)
#define	FL_IPV6		(1<<9)

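/*
 * Typical access pattern using the macros above (illustrative sketch,
 * not compiled):
 *
 *	FL_ENTRY_LOCK(ft, hash);
 *	fle = FL_ENTRY(ft, hash);	(head of the collision chain)
 *	...walk fle->f_next, read or update the entry...
 *	FL_ENTRY_UNLOCK(ft, hash);
 */
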
static uint32_t
ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
    uint32_t *key, uint16_t *flags, uint8_t *protop)
{
	uint16_t sport = 0, dport = 0;
	struct ip *ip = NULL;
	uint8_t proto = 0;
	int iphlen;
	uint32_t hash;
	struct sockaddr_in *sin;
	struct tcphdr *th;
	struct udphdr *uh;
	struct sctphdr *sh;

	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
		return (0);

	key[1] = key[0] = 0;
	sin = (struct sockaddr_in *)&ro->ro_dst;
	if (m != NULL) {
		ip = mtod(m, struct ip *);
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr = ip->ip_dst;
	} else
		*flags &= ~FL_HASH_PORTS;

	key[2] = sin->sin_addr.s_addr;

	if ((*flags & FL_HASH_PORTS) == 0)
		goto skipports;

	proto = ip->ip_p;
	iphlen = ip->ip_hl << 2; /* XXX options? */
	key[1] = ip->ip_src.s_addr;

	switch (proto) {
	case IPPROTO_TCP:
		th = (struct tcphdr *)((caddr_t)ip + iphlen);
		sport = ntohs(th->th_sport);
		dport = ntohs(th->th_dport);
		*flags |= th->th_flags;
		if (*flags & TH_RST)
			*flags |= FL_STALE;
		break;
	case IPPROTO_UDP:
		uh = (struct udphdr *)((caddr_t)ip + iphlen);
		sport = uh->uh_sport;
		dport = uh->uh_dport;
		break;
	case IPPROTO_SCTP:
		sh = (struct sctphdr *)((caddr_t)ip + iphlen);
		sport = sh->src_port;
		dport = sh->dest_port;
		break;
	default:
		if (*flags & FL_HASH_PORTS)
			goto noop;
		/* no port - hence not a protocol we care about */
		break;
	}
	*protop = proto;

	/*
	 * If this is a transmit route cache then
	 * hash all flows to a given destination to
	 * the same bucket
	 */
	if ((*flags & FL_HASH_PORTS) == 0)
		proto = sport = dport = 0;

	((uint16_t *)key)[0] = sport;
	((uint16_t *)key)[1] = dport;

skipports:
	hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto);
	if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
		m->m_flags |= M_FLOWID;
		m->m_pkthdr.flowid = hash;
	}

	return (hash);
noop:
	*protop = proto;
	return (0);
}

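/*
 * On return the key is laid out as: key[0] holds the two 16-bit ports,
 * key[1] the source address and key[2] the destination address; with
 * FL_HASH_PORTS clear only key[2] is filled in, so a pure transmit
 * cache keys on destination alone.  The Jenkins hash is seeded with
 * the per-boot jitter plus the protocol number, so flows differing
 * only in protocol still hash differently.  Note the port byte order
 * differs between TCP (host order) and UDP/SCTP (network order); this
 * is harmless for hashing as long as each flow is keyed consistently.
 */
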
static bitstr_t *
flowtable_mask(struct flowtable *ft)
{
	bitstr_t *mask;

	if (ft->ft_flags & FL_PCPU)
		mask = ft->ft_masks[curcpu];
	else
		mask = ft->ft_masks[0];

	return (mask);
}

static struct flentry **
flowtable_entry(struct flowtable *ft, uint32_t hash)
{
	struct flentry **fle;
	int index = (hash % ft->ft_size);

	if (ft->ft_flags & FL_PCPU) {
		KASSERT(ft->ft_table.pcpu[curcpu] != NULL, ("pcpu not set"));
		fle = &ft->ft_table.pcpu[curcpu][index];
	} else {
		KASSERT(ft->ft_table.global != NULL, ("global not set"));
		fle = &ft->ft_table.global[index];
	}

	return (fle);
}

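/*
 * Note that in the FL_PCPU case the bucket is selected with curcpu, so
 * the caller must already be inside the critical section established
 * by flowtable_pcpu_lock() (or otherwise pinned); a preemption between
 * bucket selection and use could cross per-CPU tables.
 */
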
static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
	time_t idle_time;

	if ((fle->f_fhash == 0)
	    || ((fle->f_rt->rt_flags & RTF_HOST) &&
		(fle->f_rt->rt_flags & RTF_UP) == 0)
	    || (fle->f_rt->rt_ifp == NULL))
		return (1);

	idle_time = time_uptime - fle->f_uptime;

	if ((fle->f_flags & FL_STALE) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
		&& (idle_time > ft->ft_udp_idle)) ||
	    ((fle->f_flags & TH_FIN)
		&& (idle_time > ft->ft_fin_wait_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
		&& (idle_time > ft->ft_syn_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
		&& (idle_time > ft->ft_tcp_idle)) ||
	    ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
		(fle->f_rt->rt_ifp == NULL)))
		return (1);

	return (0);
}

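/*
 * In short, a flow is stale if it was never filled in, its route has
 * gone down or lost its interface, it saw a RST, or it has idled past
 * the limit for its class: no TCP flags => UDP idle, FIN seen =>
 * fin_wait idle, SYN without ACK => syn idle, established (SYN|ACK)
 * => tcp idle.
 */
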
static void
flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
{
	uint32_t *hashkey;
	int i, nwords;

	if (fle->f_flags & FL_IPV6) {
		nwords = 9;
		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
	} else {
		nwords = 3;
		hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
	}

	for (i = 0; i < nwords; i++)
		hashkey[i] = key[i];
}

static int
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
    uint8_t proto, uint32_t fibnum, struct route *ro, uint16_t flags)
{
	struct flentry *fle, *fletail, *newfle, **flep;
	int depth;
	uma_zone_t flezone;
	bitstr_t *mask;

	flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
	newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO);
	if (newfle == NULL)
		return (ENOMEM);

	newfle->f_flags |= (flags & FL_IPV6);

	FL_ENTRY_LOCK(ft, hash);
	mask = flowtable_mask(ft);
	flep = flowtable_entry(ft, hash);
	fletail = fle = *flep;

	if (fle == NULL) {
		bit_set(mask, FL_ENTRY_INDEX(ft, hash));
		*flep = fle = newfle;
		goto skip;
	}

	depth = 0;
	V_flowtable_collisions++;
	/*
	 * find end of list and make sure that we were not
	 * preempted by another thread handling this flow
	 */
	while (fle != NULL) {
		if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
			/*
			 * there was either a hash collision
			 * or we lost a race to insert
			 */
			FL_ENTRY_UNLOCK(ft, hash);
			uma_zfree((newfle->f_flags & FL_IPV6) ?
			    V_flow_ipv6_zone : V_flow_ipv4_zone, newfle);
			return (EEXIST);
		}
		/*
		 * re-visit this double condition XXX
		 */
		if (fletail->f_next != NULL)
			fletail = fle->f_next;

		depth++;
		fle = fle->f_next;
	}

	if (depth > V_flowtable_max_depth)
		V_flowtable_max_depth = depth;
	fletail->f_next = newfle;
	fle = newfle;
skip:
	flowtable_set_hashkey(fle, key);

	fle->f_proto = proto;
	fle->f_rt = ro->ro_rt;
	fle->f_lle = ro->ro_lle;
	fle->f_fhash = hash;
	fle->f_fibnum = fibnum;
	fle->f_uptime = time_uptime;
	FL_ENTRY_UNLOCK(ft, hash);
	return (0);
}

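/*
 * Insertion walks the collision chain under the bucket lock; finding a
 * live entry with the same hash means either a genuine collision or a
 * lost race with another thread inserting the same flow, and in both
 * cases the preallocated entry is freed and EEXIST returned so the
 * caller can fall back to an ordinary route lookup.
 */
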
static int
flowtable_key_equal(struct flentry *fle, uint32_t *key)
{
	uint32_t *hashkey;
	int i, nwords;

	if (fle->f_flags & FL_IPV6) {
		nwords = 9;
		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
	} else {
		nwords = 3;
		hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
	}

	for (i = 0; i < nwords; i++)
		if (hashkey[i] != key[i])
			return (0);

	return (1);
}

int
flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro,
    uint32_t fibnum)
{
	uint32_t key[9], hash;
	struct flentry *fle;
	uint16_t flags;
	uint8_t proto = 0;
	int error = 0;
	struct rtentry *rt;
	struct llentry *lle;

	flags = ft->ft_flags;
	ro->ro_rt = NULL;
	ro->ro_lle = NULL;

	/*
	 * The internal hash lookup is the only IPv4 specific bit
	 * remaining
	 *
	 * XXX BZ: to add IPv6 support just add a check for the
	 * address type in m and ro and an equivalent ipv6 lookup
	 * function - the rest of the code should automatically
	 * handle an ipv6 flow (note that m can be NULL in which
	 * case ro will be set)
	 */
	hash = ipv4_flow_lookup_hash_internal(m, ro, key,
	    &flags, &proto);

	/*
	 * Ports are zero and this isn't a transmit cache
	 * - thus not a protocol for which we need to keep
	 * state
	 * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP
	 */
	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS)))
		return (ENOENT);

	V_flowtable_lookups++;
	FL_ENTRY_LOCK(ft, hash);
	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
		FL_ENTRY_UNLOCK(ft, hash);
		goto uncached;
	}
keycheck:
	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	if ((rt != NULL)
	    && fle->f_fhash == hash
	    && flowtable_key_equal(fle, key)
	    && (proto == fle->f_proto)
	    && (fibnum == fle->f_fibnum)
	    && (rt->rt_flags & RTF_UP)
	    && (rt->rt_ifp != NULL)) {
		V_flowtable_hits++;
		fle->f_uptime = time_uptime;
		fle->f_flags |= flags;
		ro->ro_rt = rt;
		ro->ro_lle = lle;
		FL_ENTRY_UNLOCK(ft, hash);
		return (0);
	} else if (fle->f_next != NULL) {
		fle = fle->f_next;
		goto keycheck;
	}
	FL_ENTRY_UNLOCK(ft, hash);

uncached:
	V_flowtable_misses++;
	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */

	ft->ft_rtalloc(ro, hash, fibnum);
	if (ro->ro_rt == NULL)
		error = ENETUNREACH;
	else {
		struct llentry *lle = NULL;
		struct sockaddr *l3addr;
		struct rtentry *rt = ro->ro_rt;
		struct ifnet *ifp = rt->rt_ifp;

		if (rt->rt_flags & RTF_GATEWAY)
			l3addr = rt->rt_gateway;
		else
			l3addr = &ro->ro_dst;
		llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
		ro->ro_lle = lle;

		if (lle == NULL) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (ENOENT);
		}
		error = flowtable_insert(ft, hash, key, proto, fibnum,
		    ro, flags);

		if (error) {
			RTFREE(rt);
			LLE_FREE(lle);
			ro->ro_rt = NULL;
			ro->ro_lle = NULL;
		}
	}

	return (error);
}

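/*
 * Illustrative caller sketch (hypothetical, not compiled) -- roughly
 * how ip_output-style code would consume the flowtable: on success ro
 * is populated with a referenced route and llentry, otherwise the
 * caller falls back to its own route lookup:
 *
 *	struct route ro;
 *
 *	bzero(&ro, sizeof(ro));
 *	if (flowtable_lookup(ft, m, &ro, M_GETFIB(m)) == 0) {
 *		...transmit using ro.ro_rt / ro.ro_lle...
 *	} else {
 *		...fall back to rtalloc_ign_fib() et al....
 *	}
 */
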
/*
 * used by the bit_alloc macro
 */
#define	calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)

struct flowtable *
flowtable_alloc(int nentry, int flags)
{
	struct flowtable *ft, *fttail;
	int i;

	if (V_flow_hashjitter == 0)
		V_flow_hashjitter = arc4random();

	KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));

	ft = malloc(sizeof(struct flowtable),
	    M_RTABLE, M_WAITOK | M_ZERO);

	ft->ft_flags = flags;
	ft->ft_size = nentry;
#ifdef RADIX_MPATH
	ft->ft_rtalloc = rtalloc_mpath_fib;
#else
	ft->ft_rtalloc = in_rtalloc_ign_wrapper;
#endif
	if (flags & FL_PCPU) {
		ft->ft_lock = flowtable_pcpu_lock;
		ft->ft_unlock = flowtable_pcpu_unlock;

		for (i = 0; i <= mp_maxid; i++) {
			ft->ft_table.pcpu[i] =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
			ft->ft_masks[i] = bit_alloc(nentry);
		}
	} else {
		ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
		    (fls(mp_maxid + 1) << 1));

		ft->ft_lock = flowtable_global_lock;
		ft->ft_unlock = flowtable_global_unlock;
		ft->ft_table.global =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
		ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
				M_RTABLE, M_WAITOK | M_ZERO);
		for (i = 0; i < ft->ft_lock_count; i++)
			mtx_init(&ft->ft_locks[i], "flow", NULL,
			    MTX_DEF|MTX_DUPOK);

		ft->ft_masks[0] = bit_alloc(nentry);
	}
	ft->ft_tmpmask = bit_alloc(nentry);

	/*
	 * In the local transmit case the table truly is
	 * just a cache - so everything is eligible for
	 * replacement after 30 seconds of non-use
	 */
	if (flags & FL_HASH_PORTS) {
		ft->ft_udp_idle = V_flowtable_udp_expire;
		ft->ft_syn_idle = V_flowtable_syn_expire;
		ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
		ft->ft_tcp_idle = V_flowtable_tcp_expire;
	} else {
		ft->ft_udp_idle = ft->ft_fin_wait_idle =
		    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
	}

	/*
	 * hook in to the cleaner list
	 */
	if (V_flow_list_head == NULL)
		V_flow_list_head = ft;
	else {
		fttail = V_flow_list_head;
		while (fttail->ft_next != NULL)
			fttail = fttail->ft_next;
		fttail->ft_next = ft;
	}

	return (ft);
}

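/*
 * Illustrative use (hypothetical values): a protocol would create its
 * table at init time, e.g.
 *
 *	ft = flowtable_alloc(2048, FL_PCPU | FL_HASH_PORTS);
 *
 * FL_HASH_PORTS yields a per-connection flow cache using the protocol
 * timeouts above; without it the table degrades to a per-destination
 * route cache with the short 30 second idle time.
 */
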
/*
 * The rest of the code is devoted to garbage collection of expired
 * entries.  It is a new addition made necessary by the switch to
 * dynamically allocating flow tables.
 */
static void
fle_free(struct flentry *fle)
{
	struct rtentry *rt;
	struct llentry *lle;

	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	RTFREE(rt);
	LLE_FREE(lle);
	uma_zfree((fle->f_flags & FL_IPV6) ?
	    V_flow_ipv6_zone : V_flow_ipv4_zone, fle);
}

static void
flowtable_free_stale(struct flowtable *ft)
{
	int curbit = 0, count;
	struct flentry *fle, **flehead, *fleprev;
	struct flentry *flefreehead, *flefreetail, *fletmp;
	bitstr_t *mask, *tmpmask;

	flefreehead = flefreetail = NULL;
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FL_ENTRY_LOCK(ft, curbit);
		flehead = flowtable_entry(ft, curbit);
		fle = fleprev = *flehead;

		V_flowtable_free_checks++;
#ifdef DIAGNOSTIC
		if (fle == NULL && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif
		while (fle != NULL) {
			if (!flow_stale(ft, fle)) {
				fleprev = fle;
				fle = fle->f_next;
				continue;
			}
			/*
			 * delete head of the list
			 */
			if (fleprev == *flehead) {
				fletmp = fleprev;
				if (fle == fleprev) {
					fleprev = *flehead = fle->f_next;
				} else
					fleprev = *flehead = fle;
				fle = fle->f_next;
			} else {
				/*
				 * don't advance fleprev
				 */
				fletmp = fle;
				fleprev->f_next = fle->f_next;
				fle = fleprev->f_next;
			}

			if (flefreehead == NULL)
				flefreehead = flefreetail = fletmp;
			else {
				flefreetail->f_next = fletmp;
				flefreetail = fletmp;
			}
			fletmp->f_next = NULL;
		}
		if (*flehead == NULL)
			bit_clear(mask, curbit);
		FL_ENTRY_UNLOCK(ft, curbit);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
	count = 0;
	while ((fle = flefreehead) != NULL) {
		flefreehead = fle->f_next;
		count++;
		V_flowtable_frees++;
		fle_free(fle);
	}
	if (V_flowtable_debug && count)
		log(LOG_DEBUG, "freed %d flow entries\n", count);
}

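/*
 * Reclamation is two-phase: stale entries are unlinked from their
 * bucket only while the bucket lock is held, collected on a private
 * list, and then freed after all bucket locks have been dropped,
 * since fle_free() releases route and llentry references and may
 * take further locks.
 */
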
static void
flowtable_clean_vnet(void)
{
	struct flowtable *ft;
	int i;

	ft = V_flow_list_head;
	while (ft != NULL) {
		if (ft->ft_flags & FL_PCPU) {
			for (i = 0; i <= mp_maxid; i++) {
				if (CPU_ABSENT(i))
					continue;

				thread_lock(curthread);
				sched_bind(curthread, i);
				thread_unlock(curthread);

				flowtable_free_stale(ft);

				thread_lock(curthread);
				sched_unbind(curthread);
				thread_unlock(curthread);
			}
		} else {
			flowtable_free_stale(ft);
		}
		ft = ft->ft_next;
	}
}

static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	if (bootverbose)
		log(LOG_INFO, "flowtable cleaner started\n");
	while (1) {
		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
			flowtable_clean_vnet();
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		flowclean_cycles++;
		/*
		 * The 10 second interval between cleaning checks
		 * is arbitrary
		 */
		mtx_lock(&flowclean_lock);
		cv_broadcast(&flowclean_cv);
		cv_timedwait(&flowclean_cv, &flowclean_lock, 10*hz);
		mtx_unlock(&flowclean_lock);
	}
}

static void
flowtable_flush(void *unused __unused)
{
	uint64_t start;

	mtx_lock(&flowclean_lock);
	start = flowclean_cycles;
	while (start == flowclean_cycles) {
		cv_broadcast(&flowclean_cv);
		cv_wait(&flowclean_cv, &flowclean_lock);
	}
	mtx_unlock(&flowclean_lock);
}

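/*
 * flowtable_flush() synchronizes with the cleaner rather than scanning
 * itself: it wakes the cleaner and waits until flowclean_cycles
 * advances, i.e. until at least one full cleaning pass has completed
 * after the call.  It is registered below on ifnet departure so that
 * flows referencing a vanishing interface are purged.
 */
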
static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);

static void
flowtable_init_vnet(const void *unused __unused)
{

	V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
	uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
}
VNET_SYSINIT(flowtable_init_vnet, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE,
    flowtable_init_vnet, NULL);

static void
flowtable_init(const void *unused __unused)
{

	cv_init(&flowclean_cv, "flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
	V_flowtable_ready = 1;
}
SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_init, NULL);

#ifdef VIMAGE
static void
flowtable_uninit(const void *unused __unused)
{

	uma_zdestroy(V_flow_ipv4_zone);
	uma_zdestroy(V_flow_ipv6_zone);
}

VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_uninit, NULL);
#endif

#ifdef DDB
static bitstr_t *
flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
{
	bitstr_t *mask;

	if (ft->ft_flags & FL_PCPU)
		mask = ft->ft_masks[cpuid];
	else
		mask = ft->ft_masks[0];

	return (mask);
}

static struct flentry **
flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
{
	struct flentry **fle;
	int index = (hash % ft->ft_size);

	if (ft->ft_flags & FL_PCPU) {
		fle = &ft->ft_table.pcpu[cpuid][index];
	} else {
		fle = &ft->ft_table.global[index];
	}

	return (fle);
}

static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	int rt_valid;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt_valid = fle->f_rt != NULL;
	db_printf("hash=0x%08x idle_time=%03d rt=%p ifp=%p",
	    fle->f_fhash, idle_time,
	    fle->f_rt, rt_valid ? fle->f_rt->rt_ifp : NULL);
	if (rt_valid && (fle->f_rt->rt_flags & RTF_UP))
		db_printf(" RTF_UP ");
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
	db_printf("\n");
}

static void
flowtable_show(struct flowtable *ft, int cpuid)
{
	int curbit = 0;
	struct flentry *fle, **flehead;
	bitstr_t *mask, *tmpmask;

	db_printf("cpu: %d\n", cpuid);
	mask = flowtable_mask_pcpu(ft, cpuid);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			db_printf("warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
		fle = *flehead;

		while (fle != NULL) {
			flow_show(ft, fle);
			fle = fle->f_next;
		}
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
}

static void
flowtable_show_vnet(void)
{
	struct flowtable *ft;
	int i;

	ft = V_flow_list_head;
	while (ft != NULL) {
		if (ft->ft_flags & FL_PCPU) {
			for (i = 0; i <= mp_maxid; i++) {
				if (CPU_ABSENT(i))
					continue;
				flowtable_show(ft, i);
			}
		} else {
			flowtable_show(ft, 0);
		}
		ft = ft->ft_next;
	}
}

DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		flowtable_show_vnet();
		CURVNET_RESTORE();
	}
}
#endif