/**************************************************************************

Copyright (c) 2008-2009, BitGravity Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the BitGravity Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include "opt_route.h"
#include "opt_mpath.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitstring.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/flowtable.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

#include <libkern/jenkins.h>

struct ipv4_tuple {
        uint16_t        ip_sport;       /* source port */
        uint16_t        ip_dport;       /* destination port */
        in_addr_t       ip_saddr;       /* source address */
        in_addr_t       ip_daddr;       /* destination address */
};

union ipv4_flow {
        struct ipv4_tuple ipf_ipt;
        uint32_t        ipf_key[3];
};
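
/*
 * The union lets the 12-byte tuple double as the 3-word hash key.
 * ipv4_flow_lookup_hash_internal() below fills it as:
 *
 *	((uint16_t *)key)[0] = sport;	word 0: source + destination port
 *	((uint16_t *)key)[1] = dport;
 *	key[1] = ip->ip_src.s_addr;	word 1: source address
 *	key[2] = ip->ip_dst.s_addr;	word 2: destination address
 *
 * which is exactly the struct ipv4_tuple overlaid by ipf_key[] above.
 */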

struct ipv6_tuple {
        uint16_t        ip_sport;       /* source port */
        uint16_t        ip_dport;       /* destination port */
        struct in6_addr ip_saddr;       /* source address */
        struct in6_addr ip_daddr;       /* destination address */
};

union ipv6_flow {
        struct ipv6_tuple ipf_ipt;
        uint32_t        ipf_key[9];
};

struct flentry {
        volatile uint32_t       f_fhash;        /* hash flowing forward */
        uint16_t                f_flags;        /* flow flags */
        uint8_t                 f_pad;          /* alignment */
        uint8_t                 f_proto;        /* protocol */
        uint32_t                f_uptime;       /* uptime at last access */
        struct flentry          *f_next;        /* pointer to collision entry */
        volatile struct rtentry *f_rt;          /* rtentry for flow */
        volatile struct llentry *f_lle;         /* llentry for flow */
};

struct flentry_v4 {
        struct flentry  fl_entry;
        union ipv4_flow fl_flow;
};

struct flentry_v6 {
        struct flentry  fl_entry;
        union ipv6_flow fl_flow;
};

#define fl_fhash        fl_entry.f_fhash
#define fl_flags        fl_entry.f_flags
#define fl_proto        fl_entry.f_proto
#define fl_uptime       fl_entry.f_uptime
#define fl_rt           fl_entry.f_rt
#define fl_lle          fl_entry.f_lle

#define SECS_PER_HOUR           3600
#define SECS_PER_DAY            (24*SECS_PER_HOUR)

#define SYN_IDLE                300
#define UDP_IDLE                300
#define FIN_WAIT_IDLE           600
#define TCP_IDLE                SECS_PER_DAY

typedef void fl_lock_t(struct flowtable *, uint32_t);
typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);

union flentryp {
        struct flentry          **global;
        struct flentry          **pcpu[MAXCPU];
};
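
/*
 * A flowtable keeps its buckets either in one global array of
 * collision-chain heads (FL_PCPU clear, buckets protected by the
 * striped ft_locks mutexes) or in one array per CPU (FL_PCPU set,
 * each CPU touching only its own array under a critical section);
 * flowtable_entry() selects the array via curcpu.
 */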

struct flowtable {
        int             ft_size;
        int             ft_lock_count;
        uint32_t        ft_flags;
        uint32_t        ft_collisions;
        uint32_t        ft_allocated;
        uint32_t        ft_misses;
        uint64_t        ft_hits;

        uint32_t        ft_udp_idle;
        uint32_t        ft_fin_wait_idle;
        uint32_t        ft_syn_idle;
        uint32_t        ft_tcp_idle;

        fl_lock_t       *ft_lock;
        fl_lock_t       *ft_unlock;
        fl_rtalloc_t    *ft_rtalloc;
        struct mtx      *ft_locks;

        union flentryp  ft_table;
        bitstr_t        *ft_masks[MAXCPU];
        bitstr_t        *ft_tmpmask;
        struct flowtable *ft_next;
};

static struct proc *flowcleanerproc;
static VNET_DEFINE(struct flowtable *, flow_list_head);
static VNET_DEFINE(uint32_t, flow_hashjitter);
static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);

#define V_flow_list_head        VNET(flow_list_head)
#define V_flow_hashjitter       VNET(flow_hashjitter)
#define V_flow_ipv4_zone        VNET(flow_ipv4_zone)
#define V_flow_ipv6_zone        VNET(flow_ipv6_zone)

/*
 * TODO:
 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
 *   to avoid extra cache evictions caused by incrementing a shared
 *   counter
 * - add IPv6 support to flow lookup
 * - add sysctls to resize and flush flow tables
 * - Add per flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
 * - add sysctl / device node / syscall to support exporting and importing
 *   of flows with flag to indicate that a flow was imported so should
 *   not be considered for auto-cleaning
 * - support explicit connection state (currently only ad-hoc for DSR)
 * - ifdetach() cleanup for options VIMAGE builds.
 */
VNET_DEFINE(int, flowtable_enable) = 1;
static VNET_DEFINE(int, flowtable_hits);
static VNET_DEFINE(int, flowtable_lookups);
static VNET_DEFINE(int, flowtable_misses);
static VNET_DEFINE(int, flowtable_frees);
static VNET_DEFINE(int, flowtable_free_checks);
static VNET_DEFINE(int, flowtable_max_depth);
static VNET_DEFINE(int, flowtable_collisions);
static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
static VNET_DEFINE(int, flowtable_nmbflows) = 4096;

#define V_flowtable_enable              VNET(flowtable_enable)
#define V_flowtable_hits                VNET(flowtable_hits)
#define V_flowtable_lookups             VNET(flowtable_lookups)
#define V_flowtable_misses              VNET(flowtable_misses)
#define V_flowtable_frees               VNET(flowtable_frees)
#define V_flowtable_free_checks         VNET(flowtable_free_checks)
#define V_flowtable_max_depth           VNET(flowtable_max_depth)
#define V_flowtable_collisions          VNET(flowtable_collisions)
#define V_flowtable_syn_expire          VNET(flowtable_syn_expire)
#define V_flowtable_udp_expire          VNET(flowtable_udp_expire)
#define V_flowtable_fin_wait_expire     VNET(flowtable_fin_wait_expire)
#define V_flowtable_tcp_expire          VNET(flowtable_tcp_expire)
#define V_flowtable_nmbflows            VNET(flowtable_nmbflows)

SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
    &VNET_NAME(flowtable_hits), 0, "# flowtable hits.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
    &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
    &VNET_NAME(flowtable_misses), 0, "# flowtable misses.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
    &VNET_NAME(flowtable_frees), 0, "# flows freed.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
    &VNET_NAME(flowtable_free_checks), 0, "# flows free checks.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
    &VNET_NAME(flowtable_max_depth), 0, "max collision list length.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
    &VNET_NAME(flowtable_collisions), 0, "# flowtable collisions.");

/*
 * XXX This does not end up updating timeouts at runtime
 * and only reflects the value for the last table added :-/
 */
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_syn_expire), 0,
    "seconds after which to remove syn allocated flow.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_udp_expire), 0,
    "seconds after which to remove flow allocated to UDP.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_fin_wait_expire), 0,
    "seconds after which to remove a flow in FIN_WAIT.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_tcp_expire), 0,
    "seconds after which to remove flow allocated to a TCP connection.");

/*
 * Maximum number of flows that can be allocated of a given type.
 *
 * The table is allocated at boot time (for the pure caching case
 * there is no reason why this could not be changed at runtime)
 * and thus (currently) needs to be set with a tunable.
 */
static int
sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbflows;

        newnmbflows = V_flowtable_nmbflows;
        error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
        if (error == 0 && req->newptr) {
                if (newnmbflows > V_flowtable_nmbflows) {
                        V_flowtable_nmbflows = newnmbflows;
                        uma_zone_set_max(V_flow_ipv4_zone,
                            V_flowtable_nmbflows);
                        uma_zone_set_max(V_flow_ipv6_zone,
                            V_flowtable_nmbflows);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
    "Maximum number of flows allowed");
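
/*
 * Example (sketch): the handler above only allows the limit to grow,
 * so at runtime one can do e.g.
 *
 *	# sysctl net.inet.flowtable.nmbflows=8192
 *
 * while any attempt to shrink it is rejected with EINVAL.
 */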

#ifndef RADIX_MPATH
static void
in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fib)
{

        rtalloc_ign_fib(ro, 0, fib);
}
#endif

static void
flowtable_global_lock(struct flowtable *table, uint32_t hash)
{
        int lock_index = hash & (table->ft_lock_count - 1);

        mtx_lock(&table->ft_locks[lock_index]);
}

static void
flowtable_global_unlock(struct flowtable *table, uint32_t hash)
{
        int lock_index = hash & (table->ft_lock_count - 1);

        mtx_unlock(&table->ft_locks[lock_index]);
}

static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

        critical_enter();
}

static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

        critical_exit();
}
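
/*
 * Two locking models: the global table stripes its buckets across
 * ft_lock_count mutexes (hash & (ft_lock_count - 1) selects the
 * stripe), while the per-cpu table needs no mutex at all - a
 * critical section suffices because each CPU only ever touches its
 * own array (the cleaner binds itself to each CPU in turn, see
 * flowtable_clean_vnet()).
 */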

#define FL_ENTRY_INDEX(table, hash)     ((hash) % (table)->ft_size)
#define FL_ENTRY(table, hash)           *flowtable_entry((table), (hash))
#define FL_ENTRY_LOCK(table, hash)      (table)->ft_lock((table), (hash))
#define FL_ENTRY_UNLOCK(table, hash)    (table)->ft_unlock((table), (hash))

#define FL_STALE (1<<8)
#define FL_IPV6  (1<<9)
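
/*
 * The low 8 bits of f_flags accumulate the TCP th_flags (TH_SYN,
 * TH_FIN, ...) seen on the flow; flowtable-private flags such as
 * FL_STALE and FL_IPV6 sit above them, alongside the FL_PCPU and
 * FL_HASH_PORTS table flags (not defined in this file).
 */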

static uint32_t
ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
    uint32_t *key, uint16_t *flags, uint8_t *protop)
{
        uint16_t sport = 0, dport = 0;
        struct ip *ip = NULL;
        uint8_t proto = 0;
        int iphlen;
        uint32_t hash;
        struct sockaddr_in *sin;
        struct tcphdr *th;
        struct udphdr *uh;
        struct sctphdr *sh;

        if (V_flowtable_enable == 0)
                return (0);

        key[1] = key[0] = 0;
        sin = (struct sockaddr_in *)&ro->ro_dst;
        if (m != NULL) {
                ip = mtod(m, struct ip *);
                sin->sin_family = AF_INET;
                sin->sin_len = sizeof(*sin);
                sin->sin_addr = ip->ip_dst;
        } else
                *flags &= ~FL_HASH_PORTS;

        key[2] = sin->sin_addr.s_addr;

        if ((*flags & FL_HASH_PORTS) == 0)
                goto skipports;

        proto = ip->ip_p;
        iphlen = ip->ip_hl << 2; /* XXX options? */
        key[1] = ip->ip_src.s_addr;

        switch (proto) {
        case IPPROTO_TCP:
                th = (struct tcphdr *)((caddr_t)ip + iphlen);
                sport = ntohs(th->th_sport);
                dport = ntohs(th->th_dport);
                *flags |= th->th_flags;
                if (*flags & TH_RST)
                        *flags |= FL_STALE;
                break;
        case IPPROTO_UDP:
                uh = (struct udphdr *)((caddr_t)ip + iphlen);
                sport = uh->uh_sport;
                dport = uh->uh_dport;
                break;
        case IPPROTO_SCTP:
                sh = (struct sctphdr *)((caddr_t)ip + iphlen);
                sport = sh->src_port;
                dport = sh->dest_port;
                break;
        default:
                if (*flags & FL_HASH_PORTS)
                        goto noop;
                /* no port - hence not a protocol we care about */
                break;
        }
        *protop = proto;

        /*
         * If this is a transmit route cache then
         * hash all flows to a given destination to
         * the same bucket
         */
        if ((*flags & FL_HASH_PORTS) == 0)
                proto = sport = dport = 0;

        ((uint16_t *)key)[0] = sport;
        ((uint16_t *)key)[1] = dport;

skipports:
        hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto);
        if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
                m->m_flags |= M_FLOWID;
                m->m_pkthdr.flowid = hash;
        }

        return (hash);
noop:
        *protop = proto;
        return (0);
}

static bitstr_t *
flowtable_mask(struct flowtable *ft)
{
        bitstr_t *mask;

        if (ft->ft_flags & FL_PCPU)
                mask = ft->ft_masks[curcpu];
        else
                mask = ft->ft_masks[0];

        return (mask);
}

static struct flentry **
flowtable_entry(struct flowtable *ft, uint32_t hash)
{
        struct flentry **fle;
        int index = (hash % ft->ft_size);

        if (ft->ft_flags & FL_PCPU) {
                KASSERT(ft->ft_table.pcpu[curcpu] != NULL, ("pcpu not set"));
                fle = &ft->ft_table.pcpu[curcpu][index];
        } else {
                KASSERT(ft->ft_table.global != NULL, ("global not set"));
                fle = &ft->ft_table.global[index];
        }

        return (fle);
}

static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
        time_t idle_time;

        if ((fle->f_fhash == 0)
            || ((fle->f_rt->rt_flags & RTF_HOST) &&
                ((fle->f_rt->rt_flags & RTF_UP) != RTF_UP))
            || (fle->f_rt->rt_ifp == NULL))
                return (1);

        idle_time = time_uptime - fle->f_uptime;

        if ((fle->f_flags & FL_STALE) ||
            ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
                && (idle_time > ft->ft_udp_idle)) ||
            ((fle->f_flags & TH_FIN)
                && (idle_time > ft->ft_fin_wait_idle)) ||
            ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
                && (idle_time > ft->ft_syn_idle)) ||
            ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
                && (idle_time > ft->ft_tcp_idle)) ||
            ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
                (fle->f_rt->rt_ifp == NULL)))
                return (1);

        return (0);
}
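
/*
 * Staleness rules in summary (idle_time = time_uptime - f_uptime):
 *	RST seen (FL_STALE)		stale immediately
 *	no SYN/ACK/FIN (UDP, SCTP)	idle_time > ft_udp_idle
 *	FIN seen			idle_time > ft_fin_wait_idle
 *	SYN but no ACK (half-open)	idle_time > ft_syn_idle
 *	SYN and ACK (established)	idle_time > ft_tcp_idle
 * plus any flow whose route is no longer up or has lost its ifp.
 */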

static void
flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
{
        uint32_t *hashkey;
        int i, nwords;

        if (fle->f_flags & FL_IPV6) {
                nwords = 9;
                hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
        } else {
                nwords = 3;
                hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
        }

        for (i = 0; i < nwords; i++)
                hashkey[i] = key[i];
}

static int
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
    uint8_t proto, struct route *ro, uint16_t flags)
{
        struct flentry *fle, *fletail, *newfle, **flep;
        int depth;
        uma_zone_t flezone;
        bitstr_t *mask;

        flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
        newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO);
        if (newfle == NULL)
                return (ENOMEM);

        newfle->f_flags |= (flags & FL_IPV6);

        FL_ENTRY_LOCK(ft, hash);
        mask = flowtable_mask(ft);
        flep = flowtable_entry(ft, hash);
        fletail = fle = *flep;

        if (fle == NULL) {
                bit_set(mask, FL_ENTRY_INDEX(ft, hash));
                *flep = fle = newfle;
                goto skip;
        }

        depth = 0;
        V_flowtable_collisions++;
        /*
         * find end of list and make sure that we were not
         * preempted by another thread handling this flow
         */
        while (fle != NULL) {
                if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
                        /*
                         * there was either a hash collision
                         * or we lost a race to insert
                         */
                        FL_ENTRY_UNLOCK(ft, hash);
                        uma_zfree((newfle->f_flags & FL_IPV6) ?
                            V_flow_ipv6_zone : V_flow_ipv4_zone, newfle);
                        return (EEXIST);
                }
                /*
                 * re-visit this double condition XXX
                 */
                if (fletail->f_next != NULL)
                        fletail = fle->f_next;

                depth++;
                fle = fle->f_next;
        }

        if (depth > V_flowtable_max_depth)
                V_flowtable_max_depth = depth;
        fletail->f_next = newfle;
        fle = newfle;
skip:
        flowtable_set_hashkey(fle, key);

        fle->f_proto = proto;
        fle->f_rt = ro->ro_rt;
        fle->f_lle = ro->ro_lle;
        fle->f_fhash = hash;
        fle->f_uptime = time_uptime;
        FL_ENTRY_UNLOCK(ft, hash);
        return (0);
}

static int
flowtable_key_equal(struct flentry *fle, uint32_t *key)
{
        uint32_t *hashkey;
        int i, nwords;

        if (fle->f_flags & FL_IPV6) {
                nwords = 9;
                hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
        } else {
                nwords = 3;
                hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
        }

        for (i = 0; i < nwords; i++)
                if (hashkey[i] != key[i])
                        return (0);

        return (1);
}

int
flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro)
{
        uint32_t key[9], hash;
        struct flentry *fle;
        uint16_t flags;
        uint8_t proto = 0;
        int error = 0, fib = 0;
        struct rtentry *rt;
        struct llentry *lle;

        flags = ft->ft_flags;
        ro->ro_rt = NULL;
        ro->ro_lle = NULL;

        /*
         * The internal hash lookup is the only IPv4 specific bit
         * remaining
         *
         * XXX BZ: to add IPv6 support just add a check for the
         * address type in m and ro and an equivalent ipv6 lookup
         * function - the rest of the code should automatically
         * handle an ipv6 flow (note that m can be NULL in which
         * case ro will be set)
         */
        hash = ipv4_flow_lookup_hash_internal(m, ro, key,
            &flags, &proto);

        /*
         * Ports are zero and this isn't a transmit cache
         * - thus not a protocol for which we need to keep
         * state
         * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP
         */
        if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS)))
                return (ENOENT);

        V_flowtable_lookups++;
        FL_ENTRY_LOCK(ft, hash);
        if ((fle = FL_ENTRY(ft, hash)) == NULL) {
                FL_ENTRY_UNLOCK(ft, hash);
                goto uncached;
        }
keycheck:
        rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
        lle = __DEVOLATILE(struct llentry *, fle->f_lle);
        if ((rt != NULL)
            && fle->f_fhash == hash
            && flowtable_key_equal(fle, key)
            && (proto == fle->f_proto)
            && (rt->rt_flags & RTF_UP)
            && (rt->rt_ifp != NULL)) {
                V_flowtable_hits++;
                fle->f_uptime = time_uptime;
                fle->f_flags |= flags;
                ro->ro_rt = rt;
                ro->ro_lle = lle;
                FL_ENTRY_UNLOCK(ft, hash);
                return (0);
        } else if (fle->f_next != NULL) {
                fle = fle->f_next;
                goto keycheck;
        }
        FL_ENTRY_UNLOCK(ft, hash);

uncached:
        V_flowtable_misses++;
        /*
         * This bit of code ends up locking the
         * same route 3 times (just like ip_output + ether_output)
         * - at lookup
         * - in rt_check when called by arpresolve
         * - dropping the refcount for the rtentry
         *
         * This could be consolidated to one if we wrote a variant
         * of arpresolve with an rt_check variant that expected to
         * receive the route locked
         */
        if (m != NULL)
                fib = M_GETFIB(m);

        ft->ft_rtalloc(ro, hash, fib);
        if (ro->ro_rt == NULL)
                error = ENETUNREACH;
        else {
                struct llentry *lle = NULL;
                struct sockaddr *l3addr;
                struct rtentry *rt = ro->ro_rt;
                struct ifnet *ifp = rt->rt_ifp;

                if (rt->rt_flags & RTF_GATEWAY)
                        l3addr = rt->rt_gateway;
                else
                        l3addr = &ro->ro_dst;
                llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
                ro->ro_lle = lle;

                if (lle == NULL) {
                        RTFREE(rt);
                        ro->ro_rt = NULL;
                        return (ENOENT);
                }
                error = flowtable_insert(ft, hash, key, proto,
                    ro, flags);

                if (error) {
                        RTFREE(rt);
                        LLE_FREE(lle);
                        ro->ro_rt = NULL;
                        ro->ro_lle = NULL;
                }
        }

        return (error);
}
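
/*
 * Usage sketch (hypothetical caller, not taken from this file): a
 * transmit path holding a flowtable `ft' might do roughly
 *
 *	struct route ro;
 *
 *	bzero(&ro, sizeof(ro));
 *	if (flowtable_lookup(ft, m, &ro) == 0) {
 *		... transmit using ro.ro_rt and ro.ro_lle ...
 *	}
 *
 * When m is non-NULL the destination is taken from the packet; when
 * m is NULL the caller must have filled in ro.ro_dst itself.  On
 * ENOENT or ENETUNREACH the caller falls back to an ordinary route
 * lookup.
 */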

/*
 * used by the bit_alloc macro
 */
#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)

struct flowtable *
flowtable_alloc(int nentry, int flags)
{
        struct flowtable *ft, *fttail;
        int i;

        if (V_flow_hashjitter == 0)
                V_flow_hashjitter = arc4random();

        KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));

        ft = malloc(sizeof(struct flowtable),
            M_RTABLE, M_WAITOK | M_ZERO);

        ft->ft_flags = flags;
        ft->ft_size = nentry;
#ifdef RADIX_MPATH
        ft->ft_rtalloc = rtalloc_mpath_fib;
#else
        ft->ft_rtalloc = in_rtalloc_ign_wrapper;
#endif
        if (flags & FL_PCPU) {
                ft->ft_lock = flowtable_pcpu_lock;
                ft->ft_unlock = flowtable_pcpu_unlock;

                for (i = 0; i <= mp_maxid; i++) {
                        ft->ft_table.pcpu[i] =
                            malloc(nentry*sizeof(struct flentry *),
                                M_RTABLE, M_WAITOK | M_ZERO);
                        ft->ft_masks[i] = bit_alloc(nentry);
                }
        } else {
                ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
                    (fls(mp_maxid + 1) << 1));

                ft->ft_lock = flowtable_global_lock;
                ft->ft_unlock = flowtable_global_unlock;
                ft->ft_table.global =
                            malloc(nentry*sizeof(struct flentry *),
                                M_RTABLE, M_WAITOK | M_ZERO);
                ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
                                M_RTABLE, M_WAITOK | M_ZERO);
                for (i = 0; i < ft->ft_lock_count; i++)
                        mtx_init(&ft->ft_locks[i], "flow", NULL,
                            MTX_DEF|MTX_DUPOK);

                ft->ft_masks[0] = bit_alloc(nentry);
        }
        ft->ft_tmpmask = bit_alloc(nentry);

        /*
         * In the local transmit case the table truly is
         * just a cache - so everything is eligible for
         * replacement after 30s of non-use
         */
        if (flags & FL_HASH_PORTS) {
                ft->ft_udp_idle = V_flowtable_udp_expire;
                ft->ft_syn_idle = V_flowtable_syn_expire;
                ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
                ft->ft_tcp_idle = V_flowtable_tcp_expire;
        } else {
                ft->ft_udp_idle = ft->ft_fin_wait_idle =
                    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
        }

        /*
         * hook in to the cleaner list
         */
        if (V_flow_list_head == NULL)
                V_flow_list_head = ft;
        else {
                fttail = V_flow_list_head;
                while (fttail->ft_next != NULL)
                        fttail = fttail->ft_next;
                fttail->ft_next = ft;
        }

        return (ft);
}
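
/*
 * Usage sketch (hypothetical values): a per-cpu transmit cache could
 * be created at boot with something like
 *
 *	ft = flowtable_alloc(2048, FL_PCPU | FL_HASH_PORTS);
 *
 * nentry fixes the bucket count for the lifetime of the table, and
 * the flags select the locking model and whether ports participate
 * in the hash.
 */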

static void
flowtable_init(const void *unused __unused)
{

        V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
            NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
        V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
            NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
        uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
        uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
}

VNET_SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_init, NULL);

#ifdef VIMAGE
static void
flowtable_uninit(const void *unused __unused)
{

        uma_zdestroy(V_flow_ipv4_zone);
        uma_zdestroy(V_flow_ipv6_zone);
}

VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_uninit, NULL);
#endif

/*
 * The rest of the code is devoted to garbage collection of expired entries.
 * It is a new addition made necessary by the switch to dynamically
 * allocating flow tables.
 */
static void
fle_free(struct flentry *fle)
{
        struct rtentry *rt;
        struct llentry *lle;

        rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
        lle = __DEVOLATILE(struct llentry *, fle->f_lle);
        RTFREE(rt);
        LLE_FREE(lle);
        uma_zfree((fle->f_flags & FL_IPV6) ?
            V_flow_ipv6_zone : V_flow_ipv4_zone, fle);
}

static void
flowtable_free_stale(struct flowtable *ft)
{
        int curbit = 0, count;
        struct flentry *fle,  **flehead, *fleprev;
        struct flentry *flefreehead, *flefreetail, *fletmp;
        bitstr_t *mask, *tmpmask;

        flefreehead = flefreetail = NULL;
        mask = flowtable_mask(ft);
        tmpmask = ft->ft_tmpmask;
        memcpy(tmpmask, mask, ft->ft_size/8);
        /*
         * XXX Note to self, bit_ffs operates at the byte level
         * and thus adds gratuitous overhead
         */
        bit_ffs(tmpmask, ft->ft_size, &curbit);
        while (curbit != -1) {
                if (curbit >= ft->ft_size || curbit < -1) {
                        log(LOG_ALERT,
                            "warning: bad curbit value %d\n",
                            curbit);
                        break;
                }

                FL_ENTRY_LOCK(ft, curbit);
                flehead = flowtable_entry(ft, curbit);
                fle = fleprev = *flehead;

                V_flowtable_free_checks++;
#ifdef DIAGNOSTIC
                if (fle == NULL && curbit > 0) {
                        log(LOG_ALERT,
                            "warning: bit=%d set, but no fle found\n",
                            curbit);
                }
#endif
                while (fle != NULL) {
                        if (!flow_stale(ft, fle)) {
                                fleprev = fle;
                                fle = fle->f_next;
                                continue;
                        }
                        /*
                         * delete head of the list
                         */
                        if (fleprev == *flehead) {
                                fletmp = fleprev;
                                if (fle == fleprev) {
                                        fleprev = *flehead = fle->f_next;
                                } else
                                        fleprev = *flehead = fle;
                                fle = fle->f_next;
                        } else {
                                /*
                                 * don't advance fleprev
                                 */
                                fletmp = fle;
                                fleprev->f_next = fle->f_next;
                                fle = fleprev->f_next;
                        }

                        if (flefreehead == NULL)
                                flefreehead = flefreetail = fletmp;
                        else {
                                flefreetail->f_next = fletmp;
                                flefreetail = fletmp;
                        }
                        fletmp->f_next = NULL;
                }
                if (*flehead == NULL)
                        bit_clear(mask, curbit);
                FL_ENTRY_UNLOCK(ft, curbit);
                bit_clear(tmpmask, curbit);
                bit_ffs(tmpmask, ft->ft_size, &curbit);
        }
        count = 0;
        while ((fle = flefreehead) != NULL) {
                flefreehead = fle->f_next;
                count++;
                V_flowtable_frees++;
                fle_free(fle);
        }
        if (bootverbose && count)
                log(LOG_DEBUG, "freed %d flow entries\n", count);
}

static void
flowtable_clean_vnet(void)
{
        struct flowtable *ft;
        int i;

        ft = V_flow_list_head;
        while (ft != NULL) {
                if (ft->ft_flags & FL_PCPU) {
                        for (i = 0; i <= mp_maxid; i++) {
                                if (CPU_ABSENT(i))
                                        continue;

                                thread_lock(curthread);
                                sched_bind(curthread, i);
                                thread_unlock(curthread);

                                flowtable_free_stale(ft);

                                thread_lock(curthread);
                                sched_unbind(curthread);
                                thread_unlock(curthread);
                        }
                } else {
                        flowtable_free_stale(ft);
                }
                ft = ft->ft_next;
        }
}

static void
flowtable_cleaner(void)
{
        VNET_ITERATOR_DECL(vnet_iter);

        if (bootverbose)
                log(LOG_INFO, "flowtable cleaner started\n");
        while (1) {
                VNET_LIST_RLOCK();
                VNET_FOREACH(vnet_iter) {
                        CURVNET_SET(vnet_iter);
                        flowtable_clean_vnet();
                        CURVNET_RESTORE();
                }
                VNET_LIST_RUNLOCK();

                /*
                 * The 20 second interval between cleaning checks
                 * is arbitrary
                 */
                pause("flowcleanwait", 20*hz);
        }
}

static struct kproc_desc flow_kp = {
        "flowcleaner",
        flowtable_cleaner,
        &flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);