/*
 * sys/net/flowtable.c -- FreeBSD stable/8
 * (MFC of r205066, r205069, r205093, r205097, r205488)
 */
1 /**************************************************************************
2
3 Copyright (c) 2008-2010, BitGravity Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11
12  2. Neither the name of the BitGravity Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include "opt_route.h"
31 #include "opt_mpath.h"
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38
39 #include <sys/param.h>  
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/kernel.h>  
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/socket.h>
54 #include <sys/syslog.h>
55 #include <sys/sysctl.h>
56
57 #include <net/if.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_var.h>
60 #include <net/route.h> 
61 #include <net/flowtable.h>
62 #include <net/vnet.h>
63
64 #include <netinet/in.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
69 #ifdef INET6
70 #include <netinet/ip6.h>
71 #endif
72 #include <netinet/tcp.h>
73 #include <netinet/udp.h>
74 #include <netinet/sctp.h>
75
76 #include <libkern/jenkins.h>
77 #include <ddb/ddb.h>
78
79 struct ipv4_tuple {
80         uint16_t        ip_sport;       /* source port */
81         uint16_t        ip_dport;       /* destination port */
82         in_addr_t       ip_saddr;       /* source address */
83         in_addr_t       ip_daddr;       /* destination address */
84 };
85
86 union ipv4_flow {
87         struct ipv4_tuple ipf_ipt;
88         uint32_t        ipf_key[3];
89 };
90
91 struct ipv6_tuple {
92         uint16_t        ip_sport;       /* source port */
93         uint16_t        ip_dport;       /* destination port */
94         struct in6_addr ip_saddr;       /* source address */
95         struct in6_addr ip_daddr;       /* destination address */
96 };
97
98 union ipv6_flow {
99         struct ipv6_tuple ipf_ipt;
100         uint32_t        ipf_key[9];
101 };
102
/*
 * Common flow-entry header shared by the IPv4 and IPv6 entry types;
 * the family-specific address/port key (union ipv4_flow / ipv6_flow)
 * immediately follows it in struct flentry_v4 / struct flentry_v6.
 */
struct flentry {
        volatile uint32_t       f_fhash;        /* hash flowing forward */
        uint16_t                f_flags;        /* flow flags */
        uint8_t                 f_pad;          /* alignment padding */
        uint8_t                 f_proto;        /* protocol */
        uint32_t                f_fibnum;       /* fib index */
        uint32_t                f_uptime;       /* uptime at last access */
        struct flentry          *f_next;        /* pointer to collision entry */
        volatile struct rtentry *f_rt;          /* rtentry for flow */
        volatile struct llentry *f_lle;         /* llentry for flow */
};
114
115 struct flentry_v4 {
116         struct flentry  fl_entry;
117         union ipv4_flow fl_flow;
118 };
119
120 struct flentry_v6 {
121         struct flentry  fl_entry;
122         union ipv6_flow fl_flow;
123 };
124
/*
 * Convenience accessors for the struct flentry embedded in the
 * family-specific entry types above.  The embedded fields are named
 * f_*; the previous definitions expanded to the non-existent members
 * fl_entry.fl_* (a macro is not re-expanded inside its own replacement,
 * so e.g. fl_fhash became the literal member name fl_fhash) and would
 * not have compiled if used.
 */
#define fl_fhash        fl_entry.f_fhash
#define fl_flags        fl_entry.f_flags
#define fl_proto        fl_entry.f_proto
#define fl_uptime       fl_entry.f_uptime
#define fl_rt           fl_entry.f_rt
#define fl_lle          fl_entry.f_lle
131
132 #define SECS_PER_HOUR           3600
133 #define SECS_PER_DAY            (24*SECS_PER_HOUR)
134
135 #define SYN_IDLE                300
136 #define UDP_IDLE                300
137 #define FIN_WAIT_IDLE           600
138 #define TCP_IDLE                SECS_PER_DAY
139
140
141 typedef void fl_lock_t(struct flowtable *, uint32_t);
142 typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
143
/*
 * Hash-bucket array for a flowtable: a single shared bucket array for
 * global tables, or one bucket array per CPU when FL_PCPU is set
 * (selected in flowtable_entry()).
 */
union flentryp {
        struct flentry          **global;
        struct flentry          **pcpu[MAXCPU];
};
148
149 struct flowtable_stats {
150         uint64_t        ft_collisions;
151         uint64_t        ft_allocated;
152         uint64_t        ft_misses;
153         uint64_t        ft_max_depth;
154         uint64_t        ft_free_checks;
155         uint64_t        ft_frees;
156         uint64_t        ft_hits;
157         uint64_t        ft_lookups;
158 } __aligned(CACHE_LINE_SIZE);
159
struct flowtable {
        struct  flowtable_stats ft_stats[MAXCPU]; /* per-cpu statistics */
        int             ft_size;        /* number of hash buckets */
        int             ft_lock_count;  /* number of bucket mutexes; assumed
                                         * a power of 2 (masked in
                                         * flowtable_global_lock()) */
        uint32_t        ft_flags;       /* FL_* table flags */
        char            *ft_name;       /* name reported via sysctl stats */
        fl_lock_t       *ft_lock;       /* lock op (global mtx vs pcpu) */
        fl_lock_t       *ft_unlock;     /* matching unlock op */
        fl_rtalloc_t    *ft_rtalloc;    /* route lookup op */
        /*
         * XXX need to pad out 
         */ 
        struct mtx      *ft_locks;      /* bucket mutexes (global mode) */
        union flentryp  ft_table;       /* hash buckets (global or pcpu) */
        bitstr_t        *ft_masks[MAXCPU]; /* in-use entry bitmaps */
        bitstr_t        *ft_tmpmask;    /* scratch bitmap */
        struct flowtable *ft_next;      /* next table on V_flow_list_head */

        /* Hot counters/timeouts on their own cache lines. */
        uint32_t        ft_count __aligned(CACHE_LINE_SIZE); /* entry count */
        uint32_t        ft_udp_idle __aligned(CACHE_LINE_SIZE); /* UDP timeout */
        uint32_t        ft_fin_wait_idle;       /* FIN_WAIT timeout */
        uint32_t        ft_syn_idle;    /* half-open (SYN) timeout */
        uint32_t        ft_tcp_idle;    /* established TCP timeout */
        boolean_t       ft_full;        /* table at capacity */
} __aligned(CACHE_LINE_SIZE);
185
186 static struct proc *flowcleanerproc;
187 static VNET_DEFINE(struct flowtable *, flow_list_head);
188 static VNET_DEFINE(uint32_t, flow_hashjitter);
189 static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
190 static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
191
192 #define V_flow_list_head        VNET(flow_list_head)
193 #define V_flow_hashjitter       VNET(flow_hashjitter)
194 #define V_flow_ipv4_zone        VNET(flow_ipv4_zone)
195 #define V_flow_ipv6_zone        VNET(flow_ipv6_zone)
196
197
198 static struct cv        flowclean_cv;
199 static struct mtx       flowclean_lock;
200 static uint32_t         flowclean_cycles;
201 static uint32_t         flowclean_freq;
202
#ifdef FLOWTABLE_DEBUG
/*
 * Conditional debug printf: emits only when one of 'flags' is set in
 * the table's ft_flags.  Defined without a trailing semicolon so the
 * macro behaves as a single statement and is safe inside unbraced
 * if/else bodies (the old definition carried a stray ';' and a
 * dangling line continuation after the do/while).
 */
#define FLDPRINTF(ft, flags, fmt, ...)          \
do {                                            \
        if ((ft)->ft_flags & (flags))           \
                printf((fmt), __VA_ARGS__);     \
} while (0)
#else
#define FLDPRINTF(ft, flags, fmt, ...)
#endif
214
215
216 /*
217  * TODO:
218  * - Make flowtable stats per-cpu, aggregated at sysctl call time,
219  *   to avoid extra cache evictions caused by incrementing a shared
220  *   counter
221  * - add sysctls to resize && flush flow tables 
222  * - Add per flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
226  * - add sysctl / device node / syscall to support exporting and importing
227  *   of flows with flag to indicate that a flow was imported so should
228  *   not be considered for auto-cleaning
229  * - support explicit connection state (currently only ad-hoc for DSR)
230  * - idetach() cleanup for options VIMAGE builds.
231  */
232 VNET_DEFINE(int, flowtable_enable) = 1;
233 static VNET_DEFINE(int, flowtable_debug);
234 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
235 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
236 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
237 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
238 static VNET_DEFINE(int, flowtable_nmbflows);
239 static VNET_DEFINE(int, flowtable_ready) = 0;
240
241 #define V_flowtable_enable              VNET(flowtable_enable)
242 #define V_flowtable_debug               VNET(flowtable_debug)
243 #define V_flowtable_syn_expire          VNET(flowtable_syn_expire)
244 #define V_flowtable_udp_expire          VNET(flowtable_udp_expire)
245 #define V_flowtable_fin_wait_expire     VNET(flowtable_fin_wait_expire)
246 #define V_flowtable_tcp_expire          VNET(flowtable_tcp_expire)
247 #define V_flowtable_nmbflows            VNET(flowtable_nmbflows)
248 #define V_flowtable_ready               VNET(flowtable_ready)
249
250 SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
251 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
252     &VNET_NAME(flowtable_debug), 0, "print debug info.");
253 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
254     &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
255
256 /*
257  * XXX This does not end up updating timeouts at runtime
258  * and only reflects the value for the last table added :-/
259  */
260 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
261     &VNET_NAME(flowtable_syn_expire), 0,
262     "seconds after which to remove syn allocated flow.");
263 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
264     &VNET_NAME(flowtable_udp_expire), 0,
265     "seconds after which to remove flow allocated to UDP.");
266 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
267     &VNET_NAME(flowtable_fin_wait_expire), 0,
268     "seconds after which to remove a flow in FIN_WAIT.");
269 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
270     &VNET_NAME(flowtable_tcp_expire), 0,
271     "seconds after which to remove flow allocated to a TCP connection.");
272
273
274 /*
275  * Maximum number of flows that can be allocated of a given type.
276  *
277  * The table is allocated at boot time (for the pure caching case
278  * there is no reason why this could not be changed at runtime)
279  * and thus (currently) needs to be set with a tunable.
280  */
281 static int
282 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
283 {
284         int error, newnmbflows;
285
286         newnmbflows = V_flowtable_nmbflows;
287         error = sysctl_handle_int(oidp, &newnmbflows, 0, req); 
288         if (error == 0 && req->newptr) {
289                 if (newnmbflows > V_flowtable_nmbflows) {
290                         V_flowtable_nmbflows = newnmbflows;
291                         uma_zone_set_max(V_flow_ipv4_zone,
292                             V_flowtable_nmbflows);
293                         uma_zone_set_max(V_flow_ipv6_zone,
294                             V_flowtable_nmbflows);
295                 } else
296                         error = EINVAL;
297         }
298         return (error);
299 }
300 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
301     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
302     "Maximum number of flows allowed");
303
304
305
306 #define FS_PRINT(sb, field)     sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
307
308 static void
309 fs_print(struct sbuf *sb, struct flowtable_stats *fs)
310 {
311
312         FS_PRINT(sb, collisions);
313         FS_PRINT(sb, allocated);
314         FS_PRINT(sb, misses);
315         FS_PRINT(sb, max_depth);
316         FS_PRINT(sb, free_checks);
317         FS_PRINT(sb, frees);
318         FS_PRINT(sb, hits);
319         FS_PRINT(sb, lookups);
320 }
321
322 static void
323 flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
324 {
325         int i;
326         struct flowtable_stats fs, *pfs;
327
328         if (ft->ft_flags & FL_PCPU) {
329                 bzero(&fs, sizeof(fs));
330                 pfs = &fs;
331                 for (i = 0; i <= mp_maxid; i++) {
332                         if (CPU_ABSENT(i))
333                                 continue;
334                         pfs->ft_collisions  += ft->ft_stats[i].ft_collisions;
335                         pfs->ft_allocated   += ft->ft_stats[i].ft_allocated;
336                         pfs->ft_misses      += ft->ft_stats[i].ft_misses;
337                         pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
338                         pfs->ft_frees       += ft->ft_stats[i].ft_frees;
339                         pfs->ft_hits        += ft->ft_stats[i].ft_hits;
340                         pfs->ft_lookups     += ft->ft_stats[i].ft_lookups;
341                         if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
342                                 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
343                 }
344         } else {
345                 pfs = &ft->ft_stats[0];
346         }
347         fs_print(sb, pfs);
348 }
349
350 static int
351 sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
352 {
353         struct flowtable *ft;
354         struct sbuf *sb;
355         int error;
356
357         sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
358
359         ft = V_flow_list_head;
360         while (ft != NULL) {
361                 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
362                 flowtable_show_stats(sb, ft);
363                 ft = ft->ft_next;
364         }
365         sbuf_finish(sb);
366         error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
367         sbuf_delete(sb);
368
369         return (error);
370 }
371 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
372     NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
373
374
#ifndef RADIX_MPATH
/*
 * fl_rtalloc_t adapter for the non-multipath case: the flow hash is not
 * needed to pick among routes, so it is ignored and the lookup is
 * delegated to the plain per-FIB rtalloc.
 */
static void
in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
{

        rtalloc_ign_fib(ro, 0, fibnum);
}
#endif
383
384 static void
385 flowtable_global_lock(struct flowtable *table, uint32_t hash)
386 {       
387         int lock_index = (hash)&(table->ft_lock_count - 1);
388
389         mtx_lock(&table->ft_locks[lock_index]);
390 }
391
392 static void
393 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
394 {       
395         int lock_index = (hash)&(table->ft_lock_count - 1);
396
397         mtx_unlock(&table->ft_locks[lock_index]);
398 }
399
/*
 * "Lock" op for per-cpu tables: entries are only touched from the
 * owning CPU, so pinning the thread with a critical section is
 * sufficient; table and hash are unused.
 */
static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

        critical_enter();
}
406
/*
 * Matching "unlock" op for per-cpu tables: leave the critical section
 * entered by flowtable_pcpu_lock().
 */
static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

        critical_exit();
}
413
414 #define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
415 #define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
416 #define FL_ENTRY_LOCK(table, hash)  (table)->ft_lock((table), (hash))
417 #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
418
419 #define FL_STALE        (1<<8)
420 #define FL_IPV6         (1<<9)
421 #define FL_OVERWRITE    (1<<10)
422
/*
 * Mark a flow entry stale so flow_stale() reports it reclaimable on the
 * cleaner's next pass.
 */
void
flow_invalidate(struct flentry *fle)
{

        fle->f_flags |= FL_STALE;
}
429
430 static __inline int
431 proto_to_flags(uint8_t proto)
432 {
433         int flag;
434
435         switch (proto) {
436         case IPPROTO_TCP:
437                 flag = FL_TCP;
438                 break;
439         case IPPROTO_SCTP:
440                 flag = FL_SCTP;
441                 break;          
442         case IPPROTO_UDP:
443                 flag = FL_UDP;
444                 break;
445         default:
446                 flag = 0;
447                 break;
448         }
449
450         return (flag);
451 }
452
453 static __inline int
454 flags_to_proto(int flags)
455 {
456         int proto, protoflags;
457
458         protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
459         switch (protoflags) {
460         case FL_TCP:
461                 proto = IPPROTO_TCP;
462                 break;
463         case FL_SCTP:
464                 proto = IPPROTO_SCTP;
465                 break;
466         case FL_UDP:
467                 proto = IPPROTO_UDP;
468                 break;
469         default:
470                 proto = 0;
471                 break;
472         }
473         return (proto);
474 }
475
476 #ifdef INET
#ifdef FLOWTABLE_DEBUG
/*
 * Debug helper: print an IPv4 flow tuple.  With FL_HASH_ALL the full
 * 4-tuple (proto, saddr:sport -> daddr:dport) is printed; otherwise
 * only the protocol and destination address are printed.
 */
static void
ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
    struct sockaddr_in *dsin)
{
        /* Room for "255.255.255.255" plus terminating NUL. */
        char saddr[4*sizeof "123"], daddr[4*sizeof "123"];

        if (flags & FL_HASH_ALL) {
                inet_ntoa_r(ssin->sin_addr, saddr);
                inet_ntoa_r(dsin->sin_addr, daddr);
                printf("proto=%d %s:%d->%s:%d\n",
                    proto, saddr, ntohs(ssin->sin_port), daddr,
                    ntohs(dsin->sin_port));
        } else {
                inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
                printf("proto=%d %s\n", proto, daddr);
        }

}
#endif
497
498 static int
499 ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
500     struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
501 {
502         struct ip *ip;
503         uint8_t proto;
504         int iphlen;
505         struct tcphdr *th;
506         struct udphdr *uh;
507         struct sctphdr *sh;
508         uint16_t sport, dport;
509
510         proto = sport = dport = 0;
511         ip = mtod(m, struct ip *);
512         dsin->sin_family = AF_INET;
513         dsin->sin_len = sizeof(*dsin);
514         dsin->sin_addr = ip->ip_dst;
515         ssin->sin_family = AF_INET;
516         ssin->sin_len = sizeof(*ssin);
517         ssin->sin_addr = ip->ip_src;    
518
519         proto = ip->ip_p;
520         if ((*flags & FL_HASH_ALL) == 0) {
521                 FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
522                     *flags);
523                 goto skipports;
524         }
525
526         iphlen = ip->ip_hl << 2; /* XXX options? */
527
528         switch (proto) {
529         case IPPROTO_TCP:
530                 th = (struct tcphdr *)((caddr_t)ip + iphlen);
531                 sport = th->th_sport;
532                 dport = th->th_dport;
533                 if ((*flags & FL_HASH_ALL) &&
534                     (th->th_flags & (TH_RST|TH_FIN)))
535                         *flags |= FL_STALE;
536         break;
537         case IPPROTO_UDP:
538                 uh = (struct udphdr *)((caddr_t)ip + iphlen);
539                 sport = uh->uh_sport;
540                 dport = uh->uh_dport;
541         break;
542         case IPPROTO_SCTP:
543                 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
544                 sport = sh->src_port;
545                 dport = sh->dest_port;
546         break;
547         default:
548                 FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
549                 return (ENOTSUP);
550                 /* no port - hence not a protocol we care about */
551                 break;
552         
553         }
554
555 skipports:
556         *flags |= proto_to_flags(proto);
557         ssin->sin_port = sport;
558         dsin->sin_port = dport;
559         return (0);
560 }
561
/*
 * Compute the jenkins hash over the 3-word IPv4 flow key.
 * Key layout:
 *   key[0] - source/destination ports packed as two 16-bit halves
 *            (FL_HASH_ALL only; both halves depend on host byte layout)
 *   key[1] - source address (FL_HASH_ALL only)
 *   key[2] - destination address
 * Returns 0 when the flowtable is disabled or not yet initialized.
 */
static uint32_t
ipv4_flow_lookup_hash_internal(
        struct sockaddr_in *ssin, struct sockaddr_in *dsin, 
            uint32_t *key, uint16_t flags)
{
        uint16_t sport, dport;
        uint8_t proto;
        int offset = 0;

        if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
                return (0);
        proto = flags_to_proto(flags);
        sport = dport = key[2] = key[1] = key[0] = 0;
        if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
                key[1] = ssin->sin_addr.s_addr;
                sport = ssin->sin_port;
        }
        if (dsin != NULL) {
                key[2] = dsin->sin_addr.s_addr;
                dport = dsin->sin_port;
        }
        if (flags & FL_HASH_ALL) {
                /* Pack both ports into the first key word. */
                ((uint16_t *)key)[0] = sport;
                ((uint16_t *)key)[1] = dport; 
        } else
                /*
                 * No ports in the key: perturb the hash with the
                 * boot-time jitter plus the protocol so that different
                 * protocols to the same destination hash differently.
                 */
                offset = V_flow_hashjitter + proto;

        return (jenkins_hashword(key, 3, offset));
}
591
592 static struct flentry *
593 flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
594 {
595         struct sockaddr_storage ssa, dsa;
596         uint16_t flags;
597         struct sockaddr_in *dsin, *ssin;
598
599         dsin = (struct sockaddr_in *)&dsa;
600         ssin = (struct sockaddr_in *)&ssa;
601         flags = ft->ft_flags;
602         if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
603                 return (NULL);
604
605         return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
606 }
607
/*
 * Reconstitute a struct route from a cached IPv4 flow entry: rebuild
 * the destination sockaddr from the stored flow key (key[2] holds the
 * destination address, see ipv4_flow_lookup_hash_internal()) and hand
 * out the cached rtentry/llentry pointers.  No references are acquired
 * here; the caller relies on the flow holding them.
 */
void
flow_to_route(struct flentry *fle, struct route *ro)
{
        uint32_t *hashkey = NULL;
        struct sockaddr_in *sin;

        sin = (struct sockaddr_in *)&ro->ro_dst;
        sin->sin_family = AF_INET;
        sin->sin_len = sizeof(*sin);
        hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
        sin->sin_addr.s_addr = hashkey[2];
        ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
        ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
}
622 #endif /* INET */
623
624 #ifdef INET6
/*
 * PULLUP_TO(len, p, T) checks that len + sizeof(T) bytes are contiguous
 * in the first mbuf; if they are not it bails out to receive_failed (no
 * actual m_pullup() is performed).  On success it sets p to point at
 * offset "len" in the mbuf.  WARNING: the pointer might become stale
 * after other pullups (but we never use it this way).
 */
631 #define PULLUP_TO(_len, p, T)                                           \
632 do {                                                                    \
633         int x = (_len) + sizeof(T);                                     \
634         if ((m)->m_len < x) {                                           \
635                 goto receive_failed;                                    \
636         }                                                               \
637         p = (mtod(m, char *) + (_len));                                 \
638 } while (0)
639
640 #define TCP(p)          ((struct tcphdr *)(p))
641 #define SCTP(p)         ((struct sctphdr *)(p))
642 #define UDP(p)          ((struct udphdr *)(p))
643
/*
 * Extract the IPv6 flow tuple from an mbuf into ssin6/dsin6.  When the
 * table hashes the full 4-tuple (FL_HASH_ALL), the extension-header
 * chain is walked to locate the transport header and its ports; the
 * walk terminates once 'ulp' points at a recognized upper-layer header.
 * Ports are stored in network byte order.  Returns ENOTSUP for packets
 * we do not hash: truncated headers (receive_failed) or no source port
 * found.
 *
 * NOTE(review): the fragment-header 'offset' is extracted but never
 * used afterwards; non-first fragments (src_port == 0) are rejected by
 * the port check below.
 */
static int
ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
    struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
{
        struct ip6_hdr *ip6;
        uint8_t proto;
        int hlen;
        uint16_t src_port, dst_port;
        u_short offset;
        void *ulp;

        offset = hlen = src_port = dst_port = 0;
        ulp = NULL;
        ip6 = mtod(m, struct ip6_hdr *);
        hlen = sizeof(struct ip6_hdr);
        proto = ip6->ip6_nxt;

        /* Address-only hashing: no need to find the transport header. */
        if ((*flags & FL_HASH_ALL) == 0)
                goto skipports;

        while (ulp == NULL) {
                switch (proto) {
                case IPPROTO_ICMPV6:
                case IPPROTO_OSPFIGP:
                case IPPROTO_PIM:
                case IPPROTO_CARP:
                case IPPROTO_ESP:
                case IPPROTO_NONE:
                        /* Portless protocols: stop the walk. */
                        ulp = ip6;
                        break;
                case IPPROTO_TCP:
                        PULLUP_TO(hlen, ulp, struct tcphdr);
                        dst_port = TCP(ulp)->th_dport;
                        src_port = TCP(ulp)->th_sport;
                        /* RST/FIN ends the connection - mark flow stale. */
                        if ((*flags & FL_HASH_ALL) &&
                            (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
                                *flags |= FL_STALE;
                        break;
                case IPPROTO_SCTP:
                        PULLUP_TO(hlen, ulp, struct sctphdr);
                        src_port = SCTP(ulp)->src_port;
                        dst_port = SCTP(ulp)->dest_port;
                        break;
                case IPPROTO_UDP:
                        PULLUP_TO(hlen, ulp, struct udphdr);
                        dst_port = UDP(ulp)->uh_dport;
                        src_port = UDP(ulp)->uh_sport;
                        break;
                case IPPROTO_HOPOPTS:   /* RFC 2460 */
                        PULLUP_TO(hlen, ulp, struct ip6_hbh);
                        /* ip6h_len counts 8-byte units beyond the first. */
                        hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
                        proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
                        ulp = NULL;
                        break;
                case IPPROTO_ROUTING:   /* RFC 2460 */
                        PULLUP_TO(hlen, ulp, struct ip6_rthdr); 
                        hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
                        proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
                        ulp = NULL;
                        break;
                case IPPROTO_FRAGMENT:  /* RFC 2460 */
                        PULLUP_TO(hlen, ulp, struct ip6_frag);
                        hlen += sizeof (struct ip6_frag);
                        proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
                        offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
                            IP6F_OFF_MASK;
                        ulp = NULL;
                        break;
                case IPPROTO_DSTOPTS:   /* RFC 2460 */
                        PULLUP_TO(hlen, ulp, struct ip6_hbh);
                        hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
                        proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
                        ulp = NULL;
                        break;
                case IPPROTO_AH:        /* RFC 2402 */
                        PULLUP_TO(hlen, ulp, struct ip6_ext);
                        /* AH length is in 4-byte units, minus two. */
                        hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
                        proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
                        ulp = NULL;
                        break;
                default:
                        /* Unknown header: just verify it is present. */
                        PULLUP_TO(hlen, ulp, struct ip6_ext);
                        break;
                }
        }

        /* PULLUP_TO failures land on the label inside this branch. */
        if (src_port == 0) {
        receive_failed:
                return (ENOTSUP);
        }

skipports:
        dsin6->sin6_family = AF_INET6;
        dsin6->sin6_len = sizeof(*dsin6);
        dsin6->sin6_port = dst_port;
        memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));

        ssin6->sin6_family = AF_INET6;
        ssin6->sin6_len = sizeof(*ssin6);
        ssin6->sin6_port = src_port;
        memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
        *flags |= proto_to_flags(proto);

        return (0);
}
749
/* Clear all nine 32-bit words of an IPv6 flow key. */
#define zero_key(key)                                   \
do {                                                    \
        bzero((key), 9 * sizeof((key)[0]));             \
} while (0)
762         
/*
 * Compute the jenkins hash over the 9-word IPv6 flow key.
 * Key layout:
 *   key[0]    - source/destination ports packed as two 16-bit halves
 *               (FL_HASH_ALL only)
 *   key[1..4] - destination address
 *   key[5..8] - source address (FL_HASH_ALL only)
 * Returns 0 when the flowtable is disabled or not yet initialized.
 */
static uint32_t
ipv6_flow_lookup_hash_internal(
        struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, 
            uint32_t *key, uint16_t flags)
{
        uint16_t sport, dport;
        uint8_t proto;
        int offset = 0;

        if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
                return (0);

        proto = flags_to_proto(flags);
        zero_key(key);
        sport = dport = 0;
        if (dsin6 != NULL) {
                memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
                dport = dsin6->sin6_port;
        }
        if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
                memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
                sport = ssin6->sin6_port;
        }
        if (flags & FL_HASH_ALL) {
                /* Pack both ports into the first key word. */
                ((uint16_t *)key)[0] = sport;
                ((uint16_t *)key)[1] = dport; 
        } else
                /*
                 * No ports in the key: perturb the hash per-protocol
                 * with the boot-time jitter.
                 */
                offset = V_flow_hashjitter + proto;

        return (jenkins_hashword(key, 9, offset));
}
794
795 static struct flentry *
796 flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
797 {
798         struct sockaddr_storage ssa, dsa;
799         struct sockaddr_in6 *dsin6, *ssin6;     
800         uint16_t flags;
801
802         dsin6 = (struct sockaddr_in6 *)&dsa;
803         ssin6 = (struct sockaddr_in6 *)&ssa;
804         flags = ft->ft_flags;
805         
806         if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
807                 return (NULL);
808
809         return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
810 }
811
812 void
813 flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
814 {
815         uint32_t *hashkey = NULL;
816         struct sockaddr_in6 *sin6;
817
818         sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
819
820         sin6->sin6_family = AF_INET6;
821         sin6->sin6_len = sizeof(*sin6);
822         hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
823         memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
824         ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
825         ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
826
827 }
828 #endif /* INET6 */
829
830 static bitstr_t *
831 flowtable_mask(struct flowtable *ft)
832 {
833         bitstr_t *mask;
834
835         if (ft->ft_flags & FL_PCPU)
836                 mask = ft->ft_masks[curcpu];
837         else
838                 mask = ft->ft_masks[0];
839
840         return (mask);
841 }
842
843 static struct flentry **
844 flowtable_entry(struct flowtable *ft, uint32_t hash)
845 {
846         struct flentry **fle;
847         int index = (hash % ft->ft_size);
848
849         if (ft->ft_flags & FL_PCPU) {
850                 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
851                 fle = &ft->ft_table.pcpu[curcpu][index];
852         } else {
853                 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
854                 fle = &ft->ft_table.global[index];
855         }
856         
857         return (fle);
858 }
859
/*
 * Decide whether a flow entry may be reclaimed.  A flow is stale when:
 *  - it was never filled in (zero hash), or its cached host route has
 *    lost RTF_UP, or the route has no interface;
 *  - it was explicitly invalidated (FL_STALE);
 *  - it has idled past the timeout for its connection state.  The
 *    state is tracked by reusing the TH_SYN/TH_ACK/TH_FIN bits in
 *    f_flags: none set means a portless/UDP-style flow, SYN alone a
 *    half-open connection, SYN|ACK an established one, FIN a closing
 *    one;
 *  - its route lost RTF_UP or its interface (re-checked here without
 *    the RTF_HOST qualifier used in the first test).
 */
static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
        time_t idle_time;

        if ((fle->f_fhash == 0)
            || ((fle->f_rt->rt_flags & RTF_HOST) &&
                ((fle->f_rt->rt_flags & (RTF_UP))
                    != (RTF_UP)))
            || (fle->f_rt->rt_ifp == NULL))
                return (1);

        idle_time = time_uptime - fle->f_uptime;

        if ((fle->f_flags & FL_STALE) ||
            ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
                && (idle_time > ft->ft_udp_idle)) ||
            ((fle->f_flags & TH_FIN)
                && (idle_time > ft->ft_fin_wait_idle)) ||
            ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
                && (idle_time > ft->ft_syn_idle)) ||
            ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
                && (idle_time > ft->ft_tcp_idle)) ||
            ((fle->f_rt->rt_flags & RTF_UP) == 0 || 
                (fle->f_rt->rt_ifp == NULL)))
                return (1);

        return (0);
}
889
890 static void
891 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
892 {
893         uint32_t *hashkey;
894         int i, nwords;
895
896         if (fle->f_flags & FL_IPV6) {
897                 nwords = 9;
898                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
899         } else {
900                 nwords = 3;
901                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
902         }
903         
904         for (i = 0; i < nwords; i++) 
905                 hashkey[i] = key[i];
906 }
907
908 static struct flentry *
909 flow_alloc(struct flowtable *ft)
910 {
911         struct flentry *newfle;
912         uma_zone_t zone;
913
914         newfle = NULL;
915         zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
916
917         newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
918         if (newfle != NULL)
919                 atomic_add_int(&ft->ft_count, 1);
920         return (newfle);
921 }
922
923 static void
924 flow_free(struct flentry *fle, struct flowtable *ft)
925 {
926         uma_zone_t zone;
927
928         zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
929         atomic_add_int(&ft->ft_count, -1);
930         uma_zfree(zone, fle);
931 }
932
/*
 * Return whether the table is considered full, maintaining the cached
 * ft_full flag with hysteresis: the table becomes full once ft_count
 * exceeds ~97% of V_flowtable_nmbflows (nmbflows - nmbflows/32) and
 * stops being full once it drops below ~87.5% (nmbflows - nmbflows/8).
 * On a state transition the global cleaner frequency and, for
 * transmit-cache tables (no FL_HASH_ALL), the idle timeouts are
 * adjusted, and the cleaner is kicked via flowclean_cv.
 */
static int
flow_full(struct flowtable *ft)
{
	boolean_t full;
	uint32_t count;
	
	full = ft->ft_full;
	count = ft->ft_count;

	if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
		ft->ft_full = FALSE;
	else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
		ft->ft_full = TRUE;
	
	/*
	 * NOTE(review): the first branch fires on a full -> not-full
	 * transition yet installs the aggressive settings (4*hz sweep,
	 * 5s timeouts), while the second (not-full -> full) relaxes
	 * them (20*hz, 30s).  This looks inverted relative to intent --
	 * confirm before relying on it.
	 */
	if (full && !ft->ft_full) {
		flowclean_freq = 4*hz;
		if ((ft->ft_flags & FL_HASH_ALL) == 0)
			ft->ft_udp_idle = ft->ft_fin_wait_idle =
			    ft->ft_syn_idle = ft->ft_tcp_idle = 5;
		cv_broadcast(&flowclean_cv);
	} else if (!full && ft->ft_full) {
		flowclean_freq = 20*hz;
		if ((ft->ft_flags & FL_HASH_ALL) == 0)
			ft->ft_udp_idle = ft->ft_fin_wait_idle =
			    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
	}

	return (ft->ft_full);
}
962
/*
 * Insert a flow entry for (hash, key) into the table, caching the
 * rtentry/llentry pointers carried in `ro'.  Returns 0 on success,
 * ENOMEM if no entry could be allocated, and EEXIST if a live entry
 * with the same hash already exists -- unless FL_OVERWRITE is set,
 * in which case the existing entry is re-initialized in place.
 */
static int
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
    uint32_t fibnum, struct route *ro, uint16_t flags)
{
	struct flentry *fle, *fletail, *newfle, **flep;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];
	int depth;
	bitstr_t *mask;
	uint8_t proto;

	newfle = flow_alloc(ft);
	if (newfle == NULL)
		return (ENOMEM);

	newfle->f_flags |= (flags & FL_IPV6);
	proto = flags_to_proto(flags);

	FL_ENTRY_LOCK(ft, hash);
	mask = flowtable_mask(ft);
	flep = flowtable_entry(ft, hash);
	fletail = fle = *flep;

	if (fle == NULL) {
		/* empty bucket: mark it in-use and link the new entry */
		bit_set(mask, FL_ENTRY_INDEX(ft, hash));
		*flep = fle = newfle;
		goto skip;
	} 
	
	depth = 0;
	fs->ft_collisions++;
	/*
	 * find end of list and make sure that we were not
	 * preempted by another thread handling this flow
	 */
	while (fle != NULL) {
		if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
			/*
			 * there was either a hash collision
			 * or we lost a race to insert
			 */
			FL_ENTRY_UNLOCK(ft, hash);
			flow_free(newfle, ft);
			
			/*
			 * NOTE(review): the FL_OVERWRITE path jumps to
			 * "skip" after the bucket lock has been dropped,
			 * so the existing entry is rewritten unlocked and
			 * FL_ENTRY_UNLOCK runs a second time at the end
			 * -- confirm this is intentional.
			 */
			if (flags & FL_OVERWRITE) 
				goto skip;
			return (EEXIST);
		}
		/*
		 * re-visit this double condition XXX
		 * NOTE(review): the test reads fletail->f_next but the
		 * advance uses fle->f_next, so fletail may not track the
		 * true tail of the chain -- verify.
		 */
		if (fletail->f_next != NULL)
			fletail = fle->f_next;

		depth++;
		fle = fle->f_next;
	} 

	if (depth > fs->ft_max_depth)
		fs->ft_max_depth = depth;
	/* append the new entry to the collision chain */
	fletail->f_next = newfle;
	fle = newfle;
skip:
	/* (re)initialize the entry's key, route and bookkeeping fields */
	flowtable_set_hashkey(fle, key);

	fle->f_proto = proto;
	fle->f_rt = ro->ro_rt;
	fle->f_lle = ro->ro_lle;
	fle->f_fhash = hash;
	fle->f_fibnum = fibnum;
	fle->f_uptime = time_uptime;
	FL_ENTRY_UNLOCK(ft, hash);
	return (0);
}
1036
1037 int
1038 kern_flowtable_insert(struct flowtable *ft,
1039     struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1040     struct route *ro, uint32_t fibnum, int flags)
1041 {
1042         uint32_t key[9], hash;
1043
1044         flags = (ft->ft_flags | flags | FL_OVERWRITE);
1045         hash = 0;
1046
1047 #ifdef INET
1048         if (ssa->ss_family == AF_INET) 
1049                 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1050                     (struct sockaddr_in *)dsa, key, flags);
1051 #endif
1052 #ifdef INET6
1053         if (ssa->ss_family == AF_INET6) 
1054                 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1055                     (struct sockaddr_in6 *)dsa, key, flags);
1056 #endif  
1057         if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1058                 return (EINVAL);
1059
1060         FLDPRINTF(ft, FL_DEBUG,
1061             "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1062             key[0], key[1], key[2], hash, fibnum, flags);
1063         return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
1064 }
1065
1066 static int
1067 flowtable_key_equal(struct flentry *fle, uint32_t *key)
1068 {
1069         uint32_t *hashkey;
1070         int i, nwords;
1071
1072         if (fle->f_flags & FL_IPV6) {
1073                 nwords = 9;
1074                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1075         } else {
1076                 nwords = 3;
1077                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1078         }
1079
1080         for (i = 0; i < nwords; i++) 
1081                 if (hashkey[i] != key[i])
1082                         return (0);
1083
1084         return (1);
1085 }
1086
1087 struct flentry *
1088 flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
1089 {
1090         struct flentry *fle = NULL;
1091
1092 #ifdef INET
1093         if (af == AF_INET)
1094                 fle = flowtable_lookup_mbuf4(ft, m);
1095 #endif
1096 #ifdef INET6
1097         if (af == AF_INET6)
1098                 fle = flowtable_lookup_mbuf6(ft, m);
1099 #endif  
1100         if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
1101                 m->m_flags |= M_FLOWID;
1102                 m->m_pkthdr.flowid = fle->f_fhash;
1103         }
1104         return (fle);
1105 }
1106         
/*
 * Look up the flow for (ssa, dsa, fibnum) in the table.  On a hit the
 * matching entry's timestamp is refreshed and it is returned.  On a
 * miss (unless FL_NOAUTO is set or the table is full) a route and
 * llentry are resolved and a new entry is inserted.  Returns NULL for
 * flows that are not cached: same-host or loopback IPv4 traffic,
 * unhashable flows, point-to-point/loopback interfaces, local IPv6
 * destinations, or on any resolution/insertion failure.
 */
struct flentry *
flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
    struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
{
	uint32_t key[9], hash;
	struct flentry *fle;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];
	uint8_t proto = 0;
	int error = 0;
	struct rtentry *rt;
	struct llentry *lle;
	struct route sro, *ro;
	struct route_in6 sro6;

	sro.ro_rt = sro6.ro_rt = NULL;
	sro.ro_lle = sro6.ro_lle = NULL;
	ro = NULL;
	hash = 0;
	flags |= ft->ft_flags;
	proto = flags_to_proto(flags);
#ifdef INET
	if (ssa->ss_family == AF_INET) {
		struct sockaddr_in *ssin, *dsin;

		ro = &sro;
		memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
		dsin = (struct sockaddr_in *)dsa;
		ssin = (struct sockaddr_in *)ssa;
		/* don't cache flows to self or to/from loopback */
		if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
		    (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
		    (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
			return (NULL);

		hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
	}
#endif
#ifdef INET6
	if (ssa->ss_family == AF_INET6) {
		struct sockaddr_in6 *ssin6, *dsin6;

		ro = (struct route *)&sro6;
		memcpy(&sro6.ro_dst, dsa,
		    sizeof(struct sockaddr_in6));
		dsin6 = (struct sockaddr_in6 *)dsa;
		ssin6 = (struct sockaddr_in6 *)ssa;

		flags |= FL_IPV6;
		hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
	}
#endif
	/*
	 * Ports are zero and this isn't a transmit cache
	 * - thus not a protocol for which we need to keep 
	 * state
	 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
	 */
	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
		return (NULL);

	fs->ft_lookups++;
	FL_ENTRY_LOCK(ft, hash);
	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
		FL_ENTRY_UNLOCK(ft, hash);
		goto uncached;
	}
keycheck:	
	/* walk the collision chain looking for an exact, live match */
	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	if ((rt != NULL)
	    && fle->f_fhash == hash
	    && flowtable_key_equal(fle, key)
	    && (proto == fle->f_proto)
	    && (fibnum == fle->f_fibnum)
	    && (rt->rt_flags & RTF_UP)
	    && (rt->rt_ifp != NULL)) {
		fs->ft_hits++;
		fle->f_uptime = time_uptime;
		fle->f_flags |= flags;
		FL_ENTRY_UNLOCK(ft, hash);
		return (fle);
	} else if (fle->f_next != NULL) {
		fle = fle->f_next;
		goto keycheck;
	}
	FL_ENTRY_UNLOCK(ft, hash);
uncached:
	if (flags & FL_NOAUTO || flow_full(ft))
		return (NULL);

	fs->ft_misses++;
	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */

#ifdef INVARIANTS
	if ((ro->ro_dst.sa_family != AF_INET) &&
	    (ro->ro_dst.sa_family != AF_INET6))
		panic("sa_family == %d\n", ro->ro_dst.sa_family);
#endif

	ft->ft_rtalloc(ro, hash, fibnum);
	if (ro->ro_rt == NULL) 
		error = ENETUNREACH;
	else {
		/* NOTE(review): these locals shadow the outer rt/lle */
		struct llentry *lle = NULL;
		struct sockaddr_storage *l3addr;
		struct rtentry *rt = ro->ro_rt;
		struct ifnet *ifp = rt->rt_ifp;

		/* don't cache flows over p2p or loopback interfaces */
		if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
#ifdef INET6
		if (ssa->ss_family == AF_INET6) {
			struct sockaddr_in6 *dsin6;

			dsin6 = (struct sockaddr_in6 *)dsa;			
			if (in6_localaddr(&dsin6->sin6_addr)) {
				RTFREE(rt);
				ro->ro_rt = NULL;
				return (NULL);				
			}

			/* resolve via the gateway for indirect routes */
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;
			
			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp);
		}
#endif	
#ifdef INET
		if (ssa->ss_family == AF_INET) {
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;
			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);	
		}
			
#endif
		ro->ro_lle = lle;

		if (lle == NULL) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
		error = flowtable_insert(ft, hash, key, fibnum, ro, flags);

		if (error) {
			RTFREE(rt);
			LLE_FREE(lle);
			ro->ro_rt = NULL;
			ro->ro_lle = NULL;
		}
	} 

	/*
	 * NOTE(review): on a successful insert, `fle' here is either
	 * NULL (the bucket was empty) or the last, non-matching entry
	 * of the collision chain from the keycheck loop -- it is NOT
	 * the freshly inserted entry.  Confirm callers tolerate this.
	 */
	return ((error) ? NULL : fle);
}
1277
1278 /*
1279  * used by the bit_alloc macro
1280  */
1281 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
1282         
/*
 * Allocate and initialize a flow table with `nentry' buckets.  With
 * FL_PCPU each cpu gets its own bucket array and in-use bitmask
 * (lockless per-cpu access); otherwise a single shared array is
 * protected by an array of hashed mutexes.  The table is linked onto
 * the global cleaner list before returning.
 */
struct flowtable *
flowtable_alloc(char *name, int nentry, int flags)
{
	struct flowtable *ft, *fttail;
	int i;

	/* lazily seed the hash jitter used by the flow hash functions */
	if (V_flow_hashjitter == 0)
		V_flow_hashjitter = arc4random();

	KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));

	ft = malloc(sizeof(struct flowtable),
	    M_RTABLE, M_WAITOK | M_ZERO);

	ft->ft_name = name;
	ft->ft_flags = flags;
	ft->ft_size = nentry;
#ifdef RADIX_MPATH
	ft->ft_rtalloc = rtalloc_mpath_fib;
#else
	ft->ft_rtalloc = in_rtalloc_ign_wrapper;
#endif
	if (flags & FL_PCPU) {
		ft->ft_lock = flowtable_pcpu_lock;
		ft->ft_unlock = flowtable_pcpu_unlock;

		for (i = 0; i <= mp_maxid; i++) {
			ft->ft_table.pcpu[i] =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
			ft->ft_masks[i] = bit_alloc(nentry);
		}
	} else {
		/* size the mutex pool relative to the number of cpus */
		ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
		    (fls(mp_maxid + 1) << 1));
		
		ft->ft_lock = flowtable_global_lock;
		ft->ft_unlock = flowtable_global_unlock;
		ft->ft_table.global =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
		ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
				M_RTABLE, M_WAITOK | M_ZERO);
		for (i = 0; i < ft->ft_lock_count; i++)
			mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);

		ft->ft_masks[0] = bit_alloc(nentry);
	}
	/* scratch bitmask used by the cleaner and DDB dump code */
	ft->ft_tmpmask = bit_alloc(nentry);

	/*
	 * In the local transmit case the table truly is 
	 * just a cache - so everything is eligible for
	 * replacement after 5s of non-use
	 */
	if (flags & FL_HASH_ALL) {
		ft->ft_udp_idle = V_flowtable_udp_expire;
		ft->ft_syn_idle = V_flowtable_syn_expire;
		ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
		/*
		 * NOTE(review): ft_tcp_idle is seeded from the fin-wait
		 * expire value rather than a TCP-specific one -- looks
		 * like a copy/paste slip, confirm against the sysctls.
		 */
		ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
	} else {
		ft->ft_udp_idle = ft->ft_fin_wait_idle =
		    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
		
	}

	/*
	 * hook in to the cleaner list
	 */
	if (V_flow_list_head == NULL)
		V_flow_list_head = ft;
	else {
		fttail = V_flow_list_head;
		while (fttail->ft_next != NULL)
			fttail = fttail->ft_next;
		fttail->ft_next = ft;
	}

	return (ft);
}
1363
1364 /*
1365  * The rest of the code is devoted to garbage collection of expired entries.
1366  * It is a new additon made necessary by the switch to dynamically allocating
1367  * flow tables.
1368  * 
1369  */
1370 static void
1371 fle_free(struct flentry *fle, struct flowtable *ft)
1372 {
1373         struct rtentry *rt;
1374         struct llentry *lle;
1375
1376         rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1377         lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1378         RTFREE(rt);
1379         LLE_FREE(lle);
1380         flow_free(fle, ft);
1381 }
1382
/*
 * Sweep the current cpu's view of the table, unlinking entries that
 * reference route `rt' (when rt != NULL) or that flow_stale() reports
 * as expired (when rt == NULL).  Unlinked entries are collected on a
 * private list and freed after all bucket locks have been dropped.
 */
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
{
	int curbit = 0, count;
	struct flentry *fle,  **flehead, *fleprev;
	struct flentry *flefreehead, *flefreetail, *fletmp;
	bitstr_t *mask, *tmpmask;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];

	flefreehead = flefreetail = NULL;
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	/* snapshot the in-use mask so we only visit occupied buckets */
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FL_ENTRY_LOCK(ft, curbit);
		flehead = flowtable_entry(ft, curbit);
		fle = fleprev = *flehead;

		fs->ft_free_checks++;
#ifdef DIAGNOSTIC
		if (fle == NULL && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif		
		while (fle != NULL) {
			/* keep entries that don't match the eviction rule */
			if (rt != NULL) {
				if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
					fleprev = fle;
					fle = fle->f_next;
					continue;
				}
			} else if (!flow_stale(ft, fle)) {
				fleprev = fle;
				fle = fle->f_next;
				continue;
			}
			/*
			 * delete head of the list
			 *
			 * NOTE(review): in the fle != fleprev case below,
			 * fleprev (the surviving head) is the entry moved
			 * to the free list while fle (the stale one)
			 * becomes the new head -- this looks like it frees
			 * the wrong entry; verify against a trace.
			 */
			if (fleprev == *flehead) {
				fletmp = fleprev;
				if (fle == fleprev) {
					fleprev = *flehead = fle->f_next;
				} else
					fleprev = *flehead = fle;
				fle = fle->f_next;
			} else {
				/*
				 * don't advance fleprev
				 */
				fletmp = fle;
				fleprev->f_next = fle->f_next;
				fle = fleprev->f_next;
			}

			/* append the victim to the deferred-free list */
			if (flefreehead == NULL)
				flefreehead = flefreetail = fletmp;
			else {
				flefreetail->f_next = fletmp;
				flefreetail = fletmp;
			}
			fletmp->f_next = NULL;
		}
		/* bucket emptied: clear its in-use bit */
		if (*flehead == NULL)
			bit_clear(mask, curbit);
		FL_ENTRY_UNLOCK(ft, curbit);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
	/* free victims outside the bucket locks */
	count = 0;
	while ((fle = flefreehead) != NULL) {
		flefreehead = fle->f_next;
		count++;
		fs->ft_frees++;
		fle_free(fle, ft);
	}
	if (V_flowtable_debug && count)
		log(LOG_DEBUG, "freed %d flow entries\n", count);
}
1476
1477 void
1478 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1479 {
1480         int i;
1481
1482         if (ft->ft_flags & FL_PCPU) {
1483                 for (i = 0; i <= mp_maxid; i++) {
1484                         if (CPU_ABSENT(i))
1485                                 continue;
1486                         
1487                         if (smp_started == 1) {
1488                                 thread_lock(curthread);
1489                                 sched_bind(curthread, i);
1490                                 thread_unlock(curthread);
1491                         }
1492
1493                         flowtable_free_stale(ft, rt);
1494
1495                         if (smp_started == 1) {
1496                                 thread_lock(curthread);
1497                                 sched_unbind(curthread);
1498                                 thread_unlock(curthread);
1499                         }
1500                 }
1501         } else {
1502                 flowtable_free_stale(ft, rt);
1503         }
1504 }
1505
1506 static void
1507 flowtable_clean_vnet(void)
1508 {
1509         struct flowtable *ft;
1510         int i;
1511
1512         ft = V_flow_list_head;
1513         while (ft != NULL) {
1514                 if (ft->ft_flags & FL_PCPU) {
1515                         for (i = 0; i <= mp_maxid; i++) {
1516                                 if (CPU_ABSENT(i))
1517                                         continue;
1518
1519                                 if (smp_started == 1) {
1520                                         thread_lock(curthread);
1521                                         sched_bind(curthread, i);
1522                                         thread_unlock(curthread);
1523                                 }
1524
1525                                 flowtable_free_stale(ft, NULL);
1526
1527                                 if (smp_started == 1) {
1528                                         thread_lock(curthread);
1529                                         sched_unbind(curthread);
1530                                         thread_unlock(curthread);
1531                                 }
1532                         }
1533                 } else {
1534                         flowtable_free_stale(ft, NULL);
1535                 }
1536                 ft = ft->ft_next;
1537         }
1538 }
1539
1540 static void
1541 flowtable_cleaner(void)
1542 {
1543         VNET_ITERATOR_DECL(vnet_iter);
1544
1545         if (bootverbose)
1546                 log(LOG_INFO, "flowtable cleaner started\n");
1547         while (1) {
1548                 VNET_LIST_RLOCK();
1549                 VNET_FOREACH(vnet_iter) {
1550                         CURVNET_SET(vnet_iter);
1551                         flowtable_clean_vnet();
1552                         CURVNET_RESTORE();
1553                 }
1554                 VNET_LIST_RUNLOCK();
1555
1556                 flowclean_cycles++;
1557                 /*
1558                  * The 10 second interval between cleaning checks
1559                  * is arbitrary
1560                  */
1561                 mtx_lock(&flowclean_lock);
1562                 cv_broadcast(&flowclean_cv);
1563                 cv_timedwait(&flowclean_cv, &flowclean_lock, flowclean_freq);
1564                 mtx_unlock(&flowclean_lock);
1565         }
1566 }
1567
1568 static void
1569 flowtable_flush(void *unused __unused)
1570 {
1571         uint64_t start;
1572
1573         mtx_lock(&flowclean_lock);
1574         start = flowclean_cycles;
1575         while (start == flowclean_cycles) {
1576                 cv_broadcast(&flowclean_cv);
1577                 cv_wait(&flowclean_cv, &flowclean_lock);
1578         }
1579         mtx_unlock(&flowclean_lock);
1580 }
1581
/* kernel process descriptor for the flowtable cleaner daemon */
static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1588
/*
 * Per-vnet initialization: size the flow limit from maxusers/ncpus,
 * create the IPv4 and IPv6 flow-entry UMA zones capped at that limit,
 * and mark the subsystem ready.
 */
static void
flowtable_init_vnet(const void *unused __unused)
{

	V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
	V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);	
	uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
	uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
	V_flowtable_ready = 1;
}
VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
    flowtable_init_vnet, NULL);
1604
/*
 * Global (non-vnet) initialization: set up the cleaner's condvar,
 * mutex and default sweep interval, and arrange for flowtable_flush()
 * to run when an interface departs.
 */
static void
flowtable_init(const void *unused __unused)
{

	cv_init(&flowclean_cv, "flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
	flowclean_freq = 20*hz;
}
SYSINIT(flowtable_init, SI_SUB_SMP, SI_ORDER_MIDDLE,
    flowtable_init, NULL);
1617
1618
1619 #ifdef VIMAGE
/*
 * Per-vnet teardown (VIMAGE only): mark the subsystem unavailable
 * and destroy the flow-entry zones.
 */
static void
flowtable_uninit(const void *unused __unused)
{

	V_flowtable_ready = 0;
	uma_zdestroy(V_flow_ipv4_zone);
	uma_zdestroy(V_flow_ipv6_zone);
}

VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_uninit, NULL);
1631 #endif
1632
1633 #ifdef DDB
1634 static uint32_t *
1635 flowtable_get_hashkey(struct flentry *fle)
1636 {
1637         uint32_t *hashkey;
1638
1639         if (fle->f_flags & FL_IPV6)
1640                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1641         else
1642                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1643
1644         return (hashkey);
1645 }
1646
1647 static bitstr_t *
1648 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1649 {
1650         bitstr_t *mask;
1651
1652         if (ft->ft_flags & FL_PCPU)
1653                 mask = ft->ft_masks[cpuid];
1654         else
1655                 mask = ft->ft_masks[0];
1656
1657         return (mask);
1658 }
1659
1660 static struct flentry **
1661 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1662 {
1663         struct flentry **fle;
1664         int index = (hash % ft->ft_size);
1665
1666         if (ft->ft_flags & FL_PCPU) {
1667                 fle = &ft->ft_table.pcpu[cpuid][index];
1668         } else {
1669                 fle = &ft->ft_table.global[index];
1670         }
1671         
1672         return (fle);
1673 }
1674
/*
 * DDB helper: pretty-print one flow entry -- addresses/ports (IPv4
 * only), flag names, route/interface state, the raw key words, and
 * bookkeeping fields.
 */
static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	int rt_valid, ifp_valid;
	uint16_t sport, dport;
	uint32_t *hashkey;
	/* room for dotted-quad plus NUL: 4 x "123." */
	char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
	volatile struct rtentry *rt;
	struct ifnet *ifp = NULL;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt = fle->f_rt;
	rt_valid = rt != NULL;
	if (rt_valid) 
		ifp = rt->rt_ifp;
	ifp_valid = ifp != NULL;
	hashkey = flowtable_get_hashkey(fle);
	if (fle->f_flags & FL_IPV6)
		goto skipaddr;

	/* IPv4 key layout: word 0 ports, word 1 saddr, word 2 daddr */
	inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
	if (ft->ft_flags & FL_HASH_ALL) {
		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);		
		sport = ntohs(((uint16_t *)hashkey)[0]);
		dport = ntohs(((uint16_t *)hashkey)[1]);
		db_printf("%s:%d->%s:%d",
		    saddr, sport, daddr,
		    dport);
	} else 
		db_printf("%s ", daddr);
    
skipaddr:
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
	if (fle->f_flags & FL_TCP)
		db_printf(" FL_TCP ");
	if (fle->f_flags & FL_UDP)
		db_printf(" FL_UDP ");
	if (rt_valid) {
		if (rt->rt_flags & RTF_UP)
			db_printf(" RTF_UP ");
	}
	if (ifp_valid) {
		if (ifp->if_flags & IFF_LOOPBACK)
			db_printf(" IFF_LOOPBACK ");
		if (ifp->if_flags & IFF_UP)
			db_printf(" IFF_UP ");		
		if (ifp->if_flags & IFF_POINTOPOINT)
			db_printf(" IFF_POINTOPOINT ");		
	}
	/* dump the raw key: 9 words for IPv6, 3 for IPv4 */
	if (fle->f_flags & FL_IPV6)
		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
		    hashkey[0], hashkey[1], hashkey[2],
		    hashkey[3], hashkey[4], hashkey[5],
		    hashkey[6], hashkey[7], hashkey[8]);
	else
		db_printf("\n\tkey=%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2]);
	db_printf("hash=%08x idle_time=%03d"
	    "\n\tfibnum=%02d rt=%p",
	    fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
	db_printf("\n");
}
1739
1740 static void
1741 flowtable_show(struct flowtable *ft, int cpuid)
1742 {
1743         int curbit = 0;
1744         struct flentry *fle,  **flehead;
1745         bitstr_t *mask, *tmpmask;
1746
1747         if (cpuid != -1)
1748                 db_printf("cpu: %d\n", cpuid);
1749         mask = flowtable_mask_pcpu(ft, cpuid);
1750         tmpmask = ft->ft_tmpmask;
1751         memcpy(tmpmask, mask, ft->ft_size/8);
1752         /*
1753          * XXX Note to self, bit_ffs operates at the byte level
1754          * and thus adds gratuitous overhead
1755          */
1756         bit_ffs(tmpmask, ft->ft_size, &curbit);
1757         while (curbit != -1) {
1758                 if (curbit >= ft->ft_size || curbit < -1) {
1759                         db_printf("warning: bad curbit value %d \n",
1760                             curbit);
1761                         break;
1762                 }
1763
1764                 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1765                 fle = *flehead;
1766
1767                 while (fle != NULL) {   
1768                         flow_show(ft, fle);
1769                         fle = fle->f_next;
1770                         continue;
1771                 }
1772                 bit_clear(tmpmask, curbit);
1773                 bit_ffs(tmpmask, ft->ft_size, &curbit);
1774         }
1775 }
1776
1777 static void
1778 flowtable_show_vnet(void)
1779 {
1780         struct flowtable *ft;
1781         int i;
1782
1783         ft = V_flow_list_head;
1784         while (ft != NULL) {
1785                 printf("name: %s\n", ft->ft_name);
1786                 if (ft->ft_flags & FL_PCPU) {
1787                         for (i = 0; i <= mp_maxid; i++) {
1788                                 if (CPU_ABSENT(i))
1789                                         continue;
1790                                 flowtable_show(ft, i);
1791                         }
1792                 } else {
1793                         flowtable_show(ft, -1);
1794                 }
1795                 ft = ft->ft_next;
1796         }
1797 }
1798
/*
 * DDB command "show flowtables": iterate over every virtual network
 * stack and dump its flowtables.  CURVNET_SET/CURVNET_RESTORE must
 * bracket the call so the V_ (vnet-virtualized) globals read by
 * flowtable_show_vnet() resolve to the vnet being iterated.
 */
DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		flowtable_show_vnet();
		CURVNET_RESTORE();
	}
}
1809 #endif