/**************************************************************************

Copyright (c) 2008-2009, BitGravity Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the BitGravity Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include "opt_route.h"
#include "opt_mpath.h"
#include "opt_ddb.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitstring.h>
#include <sys/condvar.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/flowtable.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

#include <libkern/jenkins.h>
#include <ddb/ddb.h>

struct ipv4_tuple {
        uint16_t        ip_sport;       /* source port */
        uint16_t        ip_dport;       /* destination port */
        in_addr_t       ip_saddr;       /* source address */
        in_addr_t       ip_daddr;       /* destination address */
};

union ipv4_flow {
        struct ipv4_tuple ipf_ipt;
        uint32_t        ipf_key[3];
};

struct ipv6_tuple {
        uint16_t        ip_sport;       /* source port */
        uint16_t        ip_dport;       /* destination port */
        struct in6_addr ip_saddr;       /* source address */
        struct in6_addr ip_daddr;       /* destination address */
};

union ipv6_flow {
        struct ipv6_tuple ipf_ipt;
        uint32_t        ipf_key[9];
};
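
/*
 * Layout of ipf_key as filled in by the lookup path below (IPv4 case):
 * the 16-bit source and destination ports are packed into ipf_key[0],
 * the source address goes in ipf_key[1] and the destination address in
 * ipf_key[2].  For a pure destination (transmit route) cache the ports
 * and the source word are left zero.
 */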

struct flentry {
        volatile uint32_t       f_fhash;        /* hash flowing forward */
        uint16_t                f_flags;        /* flow flags */
        uint8_t                 f_pad;
        uint8_t                 f_proto;        /* protocol */
        uint32_t                f_fibnum;       /* fib index */
        uint32_t                f_uptime;       /* uptime at last access */
        struct flentry          *f_next;        /* pointer to collision entry */
        volatile struct rtentry *f_rt;          /* rtentry for flow */
        volatile struct llentry *f_lle;         /* llentry for flow */
};

struct flentry_v4 {
        struct flentry  fl_entry;
        union ipv4_flow fl_flow;
};

struct flentry_v6 {
        struct flentry  fl_entry;
        union ipv6_flow fl_flow;
};

#define fl_fhash        fl_entry.f_fhash
#define fl_flags        fl_entry.f_flags
#define fl_proto        fl_entry.f_proto
#define fl_uptime       fl_entry.f_uptime
#define fl_rt           fl_entry.f_rt
#define fl_lle          fl_entry.f_lle

#define SECS_PER_HOUR           3600
#define SECS_PER_DAY            (24*SECS_PER_HOUR)

#define SYN_IDLE                300
#define UDP_IDLE                300
#define FIN_WAIT_IDLE           600
#define TCP_IDLE                SECS_PER_DAY
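
/*
 * The timeouts above map onto the TCP flag state recorded per flow
 * (see flow_stale()): flows with no flags recorded use UDP_IDLE,
 * flows that saw a FIN use FIN_WAIT_IDLE, half-open flows (SYN
 * without ACK) use SYN_IDLE, and established connections (SYN|ACK)
 * use TCP_IDLE.
 */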

typedef void fl_lock_t(struct flowtable *, uint32_t);
typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);

union flentryp {
        struct flentry          **global;
        struct flentry          **pcpu[MAXCPU];
};
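
/*
 * A table is either a single (global) bucket array protected by an
 * array of hashed mutexes, or one bucket array per CPU (FL_PCPU).
 * In the per-cpu case a bucket is only ever touched by its owning
 * CPU, so a critical section suffices in place of a lock.
 */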

struct flowtable {
        int             ft_size;
        int             ft_lock_count;
        uint32_t        ft_flags;
        uint32_t        ft_collisions;
        uint32_t        ft_allocated;
        uint32_t        ft_misses;
        uint64_t        ft_hits;

        uint32_t        ft_udp_idle;
        uint32_t        ft_fin_wait_idle;
        uint32_t        ft_syn_idle;
        uint32_t        ft_tcp_idle;

        fl_lock_t       *ft_lock;
        fl_lock_t       *ft_unlock;
        fl_rtalloc_t    *ft_rtalloc;
        struct mtx      *ft_locks;

        union flentryp  ft_table;
        bitstr_t        *ft_masks[MAXCPU];
        bitstr_t        *ft_tmpmask;
        struct flowtable *ft_next;
};

static struct proc *flowcleanerproc;
static VNET_DEFINE(struct flowtable *, flow_list_head);
static VNET_DEFINE(uint32_t, flow_hashjitter);
static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);

#define V_flow_list_head        VNET(flow_list_head)
#define V_flow_hashjitter       VNET(flow_hashjitter)
#define V_flow_ipv4_zone        VNET(flow_ipv4_zone)
#define V_flow_ipv6_zone        VNET(flow_ipv6_zone)

static struct cv        flowclean_cv;
static struct mtx       flowclean_lock;
static uint32_t         flowclean_cycles;

/*
 * TODO:
 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
 *   to avoid extra cache evictions caused by incrementing a shared
 *   counter
 * - add IPv6 support to flow lookup
 * - add sysctls to resize && flush flow tables
 * - Add per flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
 * - add sysctl / device node / syscall to support exporting and importing
 *   of flows with flag to indicate that a flow was imported so should
 *   not be considered for auto-cleaning
 * - support explicit connection state (currently only ad-hoc for DSR)
 * - idetach() cleanup for options VIMAGE builds.
 */
VNET_DEFINE(int, flowtable_enable) = 1;
static VNET_DEFINE(int, flowtable_debug);
static VNET_DEFINE(int, flowtable_hits);
static VNET_DEFINE(int, flowtable_lookups);
static VNET_DEFINE(int, flowtable_misses);
static VNET_DEFINE(int, flowtable_frees);
static VNET_DEFINE(int, flowtable_free_checks);
static VNET_DEFINE(int, flowtable_max_depth);
static VNET_DEFINE(int, flowtable_collisions);
static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
static VNET_DEFINE(int, flowtable_nmbflows) = 4096;
static VNET_DEFINE(int, flowtable_ready) = 0;

#define V_flowtable_enable              VNET(flowtable_enable)
#define V_flowtable_debug               VNET(flowtable_debug)
#define V_flowtable_hits                VNET(flowtable_hits)
#define V_flowtable_lookups             VNET(flowtable_lookups)
#define V_flowtable_misses              VNET(flowtable_misses)
#define V_flowtable_frees               VNET(flowtable_frees)
#define V_flowtable_free_checks         VNET(flowtable_free_checks)
#define V_flowtable_max_depth           VNET(flowtable_max_depth)
#define V_flowtable_collisions          VNET(flowtable_collisions)
#define V_flowtable_syn_expire          VNET(flowtable_syn_expire)
#define V_flowtable_udp_expire          VNET(flowtable_udp_expire)
#define V_flowtable_fin_wait_expire     VNET(flowtable_fin_wait_expire)
#define V_flowtable_tcp_expire          VNET(flowtable_tcp_expire)
#define V_flowtable_nmbflows            VNET(flowtable_nmbflows)
#define V_flowtable_ready               VNET(flowtable_ready)

SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
    &VNET_NAME(flowtable_debug), 0, "print debug info.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
    &VNET_NAME(flowtable_hits), 0, "# flowtable hits.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
    &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
    &VNET_NAME(flowtable_misses), 0, "# flowtable misses.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
    &VNET_NAME(flowtable_frees), 0, "# flows freed.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
    &VNET_NAME(flowtable_free_checks), 0, "# flows free checks.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
    &VNET_NAME(flowtable_max_depth), 0, "max collision list length.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
    &VNET_NAME(flowtable_collisions), 0, "# flowtable collisions.");

/*
 * XXX This does not end up updating timeouts at runtime
 * and only reflects the value for the last table added :-/
 */
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_syn_expire), 0,
    "seconds after which to remove syn allocated flow.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_udp_expire), 0,
    "seconds after which to remove flow allocated to UDP.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_fin_wait_expire), 0,
    "seconds after which to remove a flow in FIN_WAIT.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_tcp_expire), 0,
    "seconds after which to remove flow allocated to a TCP connection.");


/*
 * Maximum number of flows that can be allocated of a given type.
 *
 * The table is allocated at boot time (for the pure caching case
 * there is no reason why this could not be changed at runtime)
 * and thus (currently) needs to be set with a tunable.
 */
static int
sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbflows;

        newnmbflows = V_flowtable_nmbflows;
        error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
        if (error == 0 && req->newptr) {
                if (newnmbflows > V_flowtable_nmbflows) {
                        V_flowtable_nmbflows = newnmbflows;
                        uma_zone_set_max(V_flow_ipv4_zone,
                            V_flowtable_nmbflows);
                        uma_zone_set_max(V_flow_ipv6_zone,
                            V_flowtable_nmbflows);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
    "Maximum number of flows allowed");
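
/*
 * Usage note: since the handler above only grows the UMA zone limits,
 * the flow limit can be raised but never lowered at runtime, e.g.:
 *
 *      # sysctl net.inet.flowtable.nmbflows=65536
 *
 * Attempts to set a smaller value return EINVAL.
 */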

#ifndef RADIX_MPATH
static void
in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
{

        rtalloc_ign_fib(ro, 0, fibnum);
}
#endif

static void
flowtable_global_lock(struct flowtable *table, uint32_t hash)
{
        int lock_index = (hash)&(table->ft_lock_count - 1);

        mtx_lock(&table->ft_locks[lock_index]);
}

static void
flowtable_global_unlock(struct flowtable *table, uint32_t hash)
{
        int lock_index = (hash)&(table->ft_lock_count - 1);

        mtx_unlock(&table->ft_locks[lock_index]);
}

static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

        critical_enter();
}

static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

        critical_exit();
}

#define FL_ENTRY_INDEX(table, hash)     ((hash) % (table)->ft_size)
#define FL_ENTRY(table, hash)           *flowtable_entry((table), (hash))
#define FL_ENTRY_LOCK(table, hash)      (table)->ft_lock((table), (hash))
#define FL_ENTRY_UNLOCK(table, hash)    (table)->ft_unlock((table), (hash))

#define FL_STALE (1<<8)
#define FL_IPV6  (1<<9)

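/*
 * Bucket access discipline (sketch): a bucket chain is only walked or
 * modified between FL_ENTRY_LOCK() and FL_ENTRY_UNLOCK() calls on the
 * same hash value:
 *
 *      FL_ENTRY_LOCK(ft, hash);
 *      fle = FL_ENTRY(ft, hash);
 *      ... examine or update the chain ...
 *      FL_ENTRY_UNLOCK(ft, hash);
 */
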
static uint32_t
ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
    uint32_t *key, uint16_t *flags, uint8_t *protop)
{
        uint16_t sport = 0, dport = 0;
        struct ip *ip = NULL;
        uint8_t proto = 0;
        int iphlen;
        uint32_t hash;
        struct sockaddr_in *sin;
        struct tcphdr *th;
        struct udphdr *uh;
        struct sctphdr *sh;

        if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
                return (0);

        key[1] = key[0] = 0;
        sin = (struct sockaddr_in *)&ro->ro_dst;
        if (m != NULL) {
                ip = mtod(m, struct ip *);
                sin->sin_family = AF_INET;
                sin->sin_len = sizeof(*sin);
                sin->sin_addr = ip->ip_dst;
        } else
                *flags &= ~FL_HASH_PORTS;

        key[2] = sin->sin_addr.s_addr;

        if ((*flags & FL_HASH_PORTS) == 0)
                goto skipports;

        proto = ip->ip_p;
        iphlen = ip->ip_hl << 2; /* XXX options? */
        key[1] = ip->ip_src.s_addr;

        switch (proto) {
        case IPPROTO_TCP:
                th = (struct tcphdr *)((caddr_t)ip + iphlen);
                sport = ntohs(th->th_sport);
                dport = ntohs(th->th_dport);
                *flags |= th->th_flags;
                if (*flags & TH_RST)
                        *flags |= FL_STALE;
                break;
        case IPPROTO_UDP:
                uh = (struct udphdr *)((caddr_t)ip + iphlen);
                sport = uh->uh_sport;
                dport = uh->uh_dport;
                break;
        case IPPROTO_SCTP:
                sh = (struct sctphdr *)((caddr_t)ip + iphlen);
                sport = sh->src_port;
                dport = sh->dest_port;
                break;
        default:
                if (*flags & FL_HASH_PORTS)
                        goto noop;
                /* no port - hence not a protocol we care about */
                break;
        }
        *protop = proto;

        /*
         * If this is a transmit route cache then
         * hash all flows to a given destination to
         * the same bucket
         */
        if ((*flags & FL_HASH_PORTS) == 0)
                proto = sport = dport = 0;

        ((uint16_t *)key)[0] = sport;
        ((uint16_t *)key)[1] = dport;

skipports:
        hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto);
        if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
                m->m_flags |= M_FLOWID;
                m->m_pkthdr.flowid = hash;
        }

        return (hash);
noop:
        *protop = proto;
        return (0);
}

static bitstr_t *
flowtable_mask(struct flowtable *ft)
{
        bitstr_t *mask;

        if (ft->ft_flags & FL_PCPU)
                mask = ft->ft_masks[curcpu];
        else
                mask = ft->ft_masks[0];

        return (mask);
}

static struct flentry **
flowtable_entry(struct flowtable *ft, uint32_t hash)
{
        struct flentry **fle;
        int index = (hash % ft->ft_size);

        if (ft->ft_flags & FL_PCPU) {
                KASSERT(ft->ft_table.pcpu[curcpu] != NULL, ("pcpu not set"));
                fle = &ft->ft_table.pcpu[curcpu][index];
        } else {
                KASSERT(ft->ft_table.global != NULL, ("global not set"));
                fle = &ft->ft_table.global[index];
        }

        return (fle);
}

static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
        time_t idle_time;

        if ((fle->f_fhash == 0)
            || ((fle->f_rt->rt_flags & RTF_HOST) &&
                ((fle->f_rt->rt_flags & (RTF_UP))
                    != (RTF_UP)))
            || (fle->f_rt->rt_ifp == NULL))
                return (1);

        idle_time = time_uptime - fle->f_uptime;

        if ((fle->f_flags & FL_STALE) ||
            ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
                && (idle_time > ft->ft_udp_idle)) ||
            ((fle->f_flags & TH_FIN)
                && (idle_time > ft->ft_fin_wait_idle)) ||
            ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
                && (idle_time > ft->ft_syn_idle)) ||
            ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
                && (idle_time > ft->ft_tcp_idle)) ||
            ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
                (fle->f_rt->rt_ifp == NULL)))
                return (1);

        return (0);
}

static void
flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
{
        uint32_t *hashkey;
        int i, nwords;

        if (fle->f_flags & FL_IPV6) {
                nwords = 9;
                hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
        } else {
                nwords = 3;
                hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
        }

        for (i = 0; i < nwords; i++)
                hashkey[i] = key[i];
}

static int
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
    uint8_t proto, uint32_t fibnum, struct route *ro, uint16_t flags)
{
        struct flentry *fle, *fletail, *newfle, **flep;
        int depth;
        uma_zone_t flezone;
        bitstr_t *mask;

        flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
        newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO);
        if (newfle == NULL)
                return (ENOMEM);

        newfle->f_flags |= (flags & FL_IPV6);

        FL_ENTRY_LOCK(ft, hash);
        mask = flowtable_mask(ft);
        flep = flowtable_entry(ft, hash);
        fletail = fle = *flep;

        if (fle == NULL) {
                bit_set(mask, FL_ENTRY_INDEX(ft, hash));
                *flep = fle = newfle;
                goto skip;
        }

        depth = 0;
        V_flowtable_collisions++;
        /*
         * find end of list and make sure that we were not
         * preempted by another thread handling this flow
         */
        while (fle != NULL) {
                if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
                        /*
                         * there was either a hash collision
                         * or we lost a race to insert
                         */
                        FL_ENTRY_UNLOCK(ft, hash);
                        uma_zfree((newfle->f_flags & FL_IPV6) ?
                            V_flow_ipv6_zone : V_flow_ipv4_zone, newfle);
                        return (EEXIST);
                }
                /*
                 * re-visit this double condition XXX
                 */
                if (fletail->f_next != NULL)
                        fletail = fle->f_next;

                depth++;
                fle = fle->f_next;
        }

        if (depth > V_flowtable_max_depth)
                V_flowtable_max_depth = depth;
        fletail->f_next = newfle;
        fle = newfle;
skip:
        flowtable_set_hashkey(fle, key);

        fle->f_proto = proto;
        fle->f_rt = ro->ro_rt;
        fle->f_lle = ro->ro_lle;
        fle->f_fhash = hash;
        fle->f_fibnum = fibnum;
        fle->f_uptime = time_uptime;
        FL_ENTRY_UNLOCK(ft, hash);
        return (0);
}

static int
flowtable_key_equal(struct flentry *fle, uint32_t *key)
{
        uint32_t *hashkey;
        int i, nwords;

        if (fle->f_flags & FL_IPV6) {
                nwords = 9;
                hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
        } else {
                nwords = 3;
                hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
        }

        for (i = 0; i < nwords; i++)
                if (hashkey[i] != key[i])
                        return (0);

        return (1);
}

int
flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro,
    uint32_t fibnum)
{
        uint32_t key[9], hash;
        struct flentry *fle;
        uint16_t flags;
        uint8_t proto = 0;
        int error = 0;
        struct rtentry *rt;
        struct llentry *lle;

        flags = ft->ft_flags;
        ro->ro_rt = NULL;
        ro->ro_lle = NULL;

        /*
         * The internal hash lookup is the only IPv4 specific bit
         * remaining
         *
         * XXX BZ: to add IPv6 support just add a check for the
         * address type in m and ro and an equivalent ipv6 lookup
         * function - the rest of the code should automatically
         * handle an ipv6 flow (note that m can be NULL in which
         * case ro will be set)
         */
        hash = ipv4_flow_lookup_hash_internal(m, ro, key,
            &flags, &proto);

        /*
         * Ports are zero and this isn't a transmit cache
         * - thus not a protocol for which we need to keep
         * state
         * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP
         */
        if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS)))
                return (ENOENT);

        V_flowtable_lookups++;
        FL_ENTRY_LOCK(ft, hash);
        if ((fle = FL_ENTRY(ft, hash)) == NULL) {
                FL_ENTRY_UNLOCK(ft, hash);
                goto uncached;
        }
keycheck:
        rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
        lle = __DEVOLATILE(struct llentry *, fle->f_lle);
        if ((rt != NULL)
            && fle->f_fhash == hash
            && flowtable_key_equal(fle, key)
            && (proto == fle->f_proto)
            && (fibnum == fle->f_fibnum)
            && (rt->rt_flags & RTF_UP)
            && (rt->rt_ifp != NULL)) {
                V_flowtable_hits++;
                fle->f_uptime = time_uptime;
                fle->f_flags |= flags;
                ro->ro_rt = rt;
                ro->ro_lle = lle;
                FL_ENTRY_UNLOCK(ft, hash);
                return (0);
        } else if (fle->f_next != NULL) {
                fle = fle->f_next;
                goto keycheck;
        }
        FL_ENTRY_UNLOCK(ft, hash);

uncached:
        V_flowtable_misses++;
        /*
         * This bit of code ends up locking the
         * same route 3 times (just like ip_output + ether_output)
         * - at lookup
         * - in rt_check when called by arpresolve
         * - dropping the refcount for the rtentry
         *
         * This could be consolidated to one if we wrote a variant
         * of arpresolve with an rt_check variant that expected to
         * receive the route locked
         */

        ft->ft_rtalloc(ro, hash, fibnum);
        if (ro->ro_rt == NULL)
                error = ENETUNREACH;
        else {
                struct llentry *lle = NULL;
                struct sockaddr *l3addr;
                struct rtentry *rt = ro->ro_rt;
                struct ifnet *ifp = rt->rt_ifp;

                if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
                        RTFREE(rt);
                        ro->ro_rt = NULL;
                        return (ENOENT);
                }

                if (rt->rt_flags & RTF_GATEWAY)
                        l3addr = rt->rt_gateway;
                else
                        l3addr = &ro->ro_dst;
                llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
                ro->ro_lle = lle;

                if (lle == NULL) {
                        RTFREE(rt);
                        ro->ro_rt = NULL;
                        return (ENOENT);
                }
                error = flowtable_insert(ft, hash, key, proto, fibnum,
                    ro, flags);

                if (error) {
                        RTFREE(rt);
                        LLE_FREE(lle);
                        ro->ro_rt = NULL;
                        ro->ro_lle = NULL;
                }
        }

        return (error);
}
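
/*
 * Usage sketch (hypothetical caller; obtaining the fib via M_GETFIB()
 * is an assumption about the caller's context): a transmit path can
 * try the flowtable before doing a full routing lookup.  A miss
 * populates the table as a side effect, so a zero return leaves a
 * valid ro_rt/ro_lle whose references are held by the flow cache,
 * meaning the caller must not drop them itself:
 */
#if 0
        struct route ro;

        bzero(&ro, sizeof(ro));
        if (flowtable_lookup(ft, m, &ro, M_GETFIB(m)) == 0) {
                /* transmit using ro.ro_rt and ro.ro_lle */
        } else {
                /* fall back to an uncached rtalloc + arpresolve path */
        }
#endif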

/*
 * used by the bit_alloc macro
 */
#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)

struct flowtable *
flowtable_alloc(int nentry, int flags)
{
        struct flowtable *ft, *fttail;
        int i;

        if (V_flow_hashjitter == 0)
                V_flow_hashjitter = arc4random();

        KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));

        ft = malloc(sizeof(struct flowtable),
            M_RTABLE, M_WAITOK | M_ZERO);

        ft->ft_flags = flags;
        ft->ft_size = nentry;
#ifdef RADIX_MPATH
        ft->ft_rtalloc = rtalloc_mpath_fib;
#else
        ft->ft_rtalloc = in_rtalloc_ign_wrapper;
#endif
        if (flags & FL_PCPU) {
                ft->ft_lock = flowtable_pcpu_lock;
                ft->ft_unlock = flowtable_pcpu_unlock;

                for (i = 0; i <= mp_maxid; i++) {
                        ft->ft_table.pcpu[i] =
                            malloc(nentry*sizeof(struct flentry *),
                                M_RTABLE, M_WAITOK | M_ZERO);
                        ft->ft_masks[i] = bit_alloc(nentry);
                }
        } else {
                ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
                    (fls(mp_maxid + 1) << 1));

                ft->ft_lock = flowtable_global_lock;
                ft->ft_unlock = flowtable_global_unlock;
                ft->ft_table.global =
                    malloc(nentry*sizeof(struct flentry *),
                        M_RTABLE, M_WAITOK | M_ZERO);
                ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
                    M_RTABLE, M_WAITOK | M_ZERO);
                for (i = 0; i < ft->ft_lock_count; i++)
                        mtx_init(&ft->ft_locks[i], "flow", NULL,
                            MTX_DEF|MTX_DUPOK);

                ft->ft_masks[0] = bit_alloc(nentry);
        }
        ft->ft_tmpmask = bit_alloc(nentry);

        /*
         * In the local transmit case the table truly is
         * just a cache - so everything is eligible for
         * replacement after 30s of non-use
         */
        if (flags & FL_HASH_PORTS) {
                ft->ft_udp_idle = V_flowtable_udp_expire;
                ft->ft_syn_idle = V_flowtable_syn_expire;
                ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
                ft->ft_tcp_idle = V_flowtable_tcp_expire;
        } else {
                ft->ft_udp_idle = ft->ft_fin_wait_idle =
                    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
        }

        /*
         * hook in to the cleaner list
         */
        if (V_flow_list_head == NULL)
                V_flow_list_head = ft;
        else {
                fttail = V_flow_list_head;
                while (fttail->ft_next != NULL)
                        fttail = fttail->ft_next;
                fttail->ft_next = ft;
        }

        return (ft);
}
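
/*
 * Usage sketch (names hypothetical): a protocol allocates its table
 * once at initialization time, e.g. a per-cpu transmit cache keyed
 * on ports for TCP/UDP/SCTP:
 */
#if 0
        struct flowtable *ipv4_ft;

        ipv4_ft = flowtable_alloc(2048, FL_PCPU | FL_HASH_PORTS);
#endif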

/*
 * The rest of the code is devoted to garbage collection of expired
 * entries.  It is a new addition made necessary by the switch to
 * dynamically allocating flow tables.
 */
static void
fle_free(struct flentry *fle)
{
        struct rtentry *rt;
        struct llentry *lle;

        rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
        lle = __DEVOLATILE(struct llentry *, fle->f_lle);
        RTFREE(rt);
        LLE_FREE(lle);
        uma_zfree((fle->f_flags & FL_IPV6) ?
            V_flow_ipv6_zone : V_flow_ipv4_zone, fle);
}

static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
{
        int curbit = 0, count;
        struct flentry *fle, **flehead, *fleprev;
        struct flentry *flefreehead, *flefreetail, *fletmp;
        bitstr_t *mask, *tmpmask;

        flefreehead = flefreetail = NULL;
        mask = flowtable_mask(ft);
        tmpmask = ft->ft_tmpmask;
        memcpy(tmpmask, mask, ft->ft_size/8);
        /*
         * XXX Note to self, bit_ffs operates at the byte level
         * and thus adds gratuitous overhead
         */
        bit_ffs(tmpmask, ft->ft_size, &curbit);
        while (curbit != -1) {
                if (curbit >= ft->ft_size || curbit < -1) {
                        log(LOG_ALERT,
                            "warning: bad curbit value %d \n",
                            curbit);
                        break;
                }

                FL_ENTRY_LOCK(ft, curbit);
                flehead = flowtable_entry(ft, curbit);
                fle = fleprev = *flehead;

                V_flowtable_free_checks++;
#ifdef DIAGNOSTIC
                if (fle == NULL && curbit > 0) {
                        log(LOG_ALERT,
                            "warning bit=%d set, but no fle found\n",
                            curbit);
                }
#endif
                while (fle != NULL) {
                        if (rt != NULL) {
                                if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
                                        fleprev = fle;
                                        fle = fle->f_next;
                                        continue;
                                }
                        } else if (!flow_stale(ft, fle)) {
                                fleprev = fle;
                                fle = fle->f_next;
                                continue;
                        }
                        /*
                         * unlink fle; delete the head of the list
                         * if fle is still at the front
                         */
                        fletmp = fle;
                        if (fle == *flehead) {
                                fleprev = *flehead = fle->f_next;
                                fle = fle->f_next;
                        } else {
                                /*
                                 * don't advance fleprev
                                 */
                                fleprev->f_next = fle->f_next;
                                fle = fleprev->f_next;
                        }

                        if (flefreehead == NULL)
                                flefreehead = flefreetail = fletmp;
                        else {
                                flefreetail->f_next = fletmp;
                                flefreetail = fletmp;
                        }
                        fletmp->f_next = NULL;
                }
                if (*flehead == NULL)
                        bit_clear(mask, curbit);
                FL_ENTRY_UNLOCK(ft, curbit);
                bit_clear(tmpmask, curbit);
                bit_ffs(tmpmask, ft->ft_size, &curbit);
        }
        count = 0;
        while ((fle = flefreehead) != NULL) {
                flefreehead = fle->f_next;
                count++;
                V_flowtable_frees++;
                fle_free(fle);
        }
        if (V_flowtable_debug && count)
                log(LOG_DEBUG, "freed %d flow entries\n", count);
}

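/*
 * In the per-cpu case each CPU's buckets are private to that CPU, so
 * the flush binds the calling thread to each CPU in turn and scans
 * the local buckets there, instead of taking bucket locks across
 * CPUs.
 */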
void
flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
{
        int i;

        if (ft->ft_flags & FL_PCPU) {
                for (i = 0; i <= mp_maxid; i++) {
                        if (CPU_ABSENT(i))
                                continue;

                        if (smp_started == 1) {
                                thread_lock(curthread);
                                sched_bind(curthread, i);
                                thread_unlock(curthread);
                        }

                        flowtable_free_stale(ft, rt);

                        if (smp_started == 1) {
                                thread_lock(curthread);
                                sched_unbind(curthread);
                                thread_unlock(curthread);
                        }
                }
        } else {
                flowtable_free_stale(ft, rt);
        }
}

static void
flowtable_clean_vnet(void)
{
        struct flowtable *ft;
        int i;

        ft = V_flow_list_head;
        while (ft != NULL) {
                if (ft->ft_flags & FL_PCPU) {
                        for (i = 0; i <= mp_maxid; i++) {
                                if (CPU_ABSENT(i))
                                        continue;

                                if (smp_started == 1) {
                                        thread_lock(curthread);
                                        sched_bind(curthread, i);
                                        thread_unlock(curthread);
                                }

                                flowtable_free_stale(ft, NULL);

                                if (smp_started == 1) {
                                        thread_lock(curthread);
                                        sched_unbind(curthread);
                                        thread_unlock(curthread);
                                }
                        }
                } else {
                        flowtable_free_stale(ft, NULL);
                }
                ft = ft->ft_next;
        }
}

static void
flowtable_cleaner(void)
{
        VNET_ITERATOR_DECL(vnet_iter);

        if (bootverbose)
                log(LOG_INFO, "flowtable cleaner started\n");
        while (1) {
                VNET_LIST_RLOCK();
                VNET_FOREACH(vnet_iter) {
                        CURVNET_SET(vnet_iter);
                        flowtable_clean_vnet();
                        CURVNET_RESTORE();
                }
                VNET_LIST_RUNLOCK();

                flowclean_cycles++;
                /*
                 * The 10 second interval between cleaning checks
                 * is arbitrary
                 */
                mtx_lock(&flowclean_lock);
                cv_broadcast(&flowclean_cv);
                cv_timedwait(&flowclean_cv, &flowclean_lock, 10*hz);
                mtx_unlock(&flowclean_lock);
        }
}

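/*
 * Wait for the cleaner to complete at least one full pass: record the
 * current cycle count, wake the cleaner, and sleep until the count
 * advances.  This is registered below as an ifnet departure event
 * handler so that flows referencing a departing interface can be
 * reclaimed before it goes away.
 */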
static void
flowtable_flush(void *unused __unused)
{
        uint64_t start;

        mtx_lock(&flowclean_lock);
        start = flowclean_cycles;
        while (start == flowclean_cycles) {
                cv_broadcast(&flowclean_cv);
                cv_wait(&flowclean_cv, &flowclean_lock);
        }
        mtx_unlock(&flowclean_lock);
}

static struct kproc_desc flow_kp = {
        "flowcleaner",
        flowtable_cleaner,
        &flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);

static void
flowtable_init_vnet(const void *unused __unused)
{

        V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
            NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
        V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
            NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
        uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
        uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
        V_flowtable_ready = 1;
}
VNET_SYSINIT(flowtable_init_vnet, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE,
    flowtable_init_vnet, NULL);

static void
flowtable_init(const void *unused __unused)
{

        cv_init(&flowclean_cv, "flowcleanwait");
        mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
        EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
            EVENTHANDLER_PRI_ANY);
}
SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_init, NULL);


#ifdef VIMAGE
static void
flowtable_uninit(const void *unused __unused)
{

        V_flowtable_ready = 0;
        uma_zdestroy(V_flow_ipv4_zone);
        uma_zdestroy(V_flow_ipv6_zone);
}

VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_uninit, NULL);
#endif

#ifdef DDB
static bitstr_t *
flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
{
        bitstr_t *mask;

        if (ft->ft_flags & FL_PCPU)
                mask = ft->ft_masks[cpuid];
        else
                mask = ft->ft_masks[0];

        return (mask);
}

static struct flentry **
flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
{
        struct flentry **fle;
        int index = (hash % ft->ft_size);

        if (ft->ft_flags & FL_PCPU) {
                fle = &ft->ft_table.pcpu[cpuid][index];
        } else {
                fle = &ft->ft_table.global[index];
        }

        return (fle);
}

static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
        int idle_time;
        int rt_valid;

        idle_time = (int)(time_uptime - fle->f_uptime);
        rt_valid = fle->f_rt != NULL;
        db_printf("hash=0x%08x idle_time=%03d rt=%p ifp=%p",
            fle->f_fhash, idle_time,
            fle->f_rt, rt_valid ? fle->f_rt->rt_ifp : NULL);
        if (rt_valid && (fle->f_rt->rt_flags & RTF_UP))
                db_printf(" RTF_UP ");
        if (fle->f_flags & FL_STALE)
                db_printf(" FL_STALE ");
        db_printf("\n");
}

static void
flowtable_show(struct flowtable *ft, int cpuid)
{
        int curbit = 0;
        struct flentry *fle, **flehead;
        bitstr_t *mask, *tmpmask;

        db_printf("cpu: %d\n", cpuid);
        mask = flowtable_mask_pcpu(ft, cpuid);
        tmpmask = ft->ft_tmpmask;
        memcpy(tmpmask, mask, ft->ft_size/8);
        /*
         * XXX Note to self, bit_ffs operates at the byte level
         * and thus adds gratuitous overhead
         */
        bit_ffs(tmpmask, ft->ft_size, &curbit);
        while (curbit != -1) {
                if (curbit >= ft->ft_size || curbit < -1) {
                        db_printf("warning: bad curbit value %d \n",
                            curbit);
                        break;
                }

                flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
                fle = *flehead;

                while (fle != NULL) {
                        flow_show(ft, fle);
                        fle = fle->f_next;
                }
                bit_clear(tmpmask, curbit);
                bit_ffs(tmpmask, ft->ft_size, &curbit);
        }
}

static void
flowtable_show_vnet(void)
{
        struct flowtable *ft;
        int i;

        ft = V_flow_list_head;
        while (ft != NULL) {
                if (ft->ft_flags & FL_PCPU) {
                        for (i = 0; i <= mp_maxid; i++) {
                                if (CPU_ABSENT(i))
                                        continue;
                                flowtable_show(ft, i);
                        }
                } else {
                        flowtable_show(ft, 0);
                }
                ft = ft->ft_next;
        }
}

DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
        VNET_ITERATOR_DECL(vnet_iter);

        VNET_FOREACH(vnet_iter) {
                CURVNET_SET(vnet_iter);
                flowtable_show_vnet();
                CURVNET_RESTORE();
        }
}
#endif