/* sys/net/flowtable.c — FreeBSD stable/8 (MFC of r205077). */
1 /**************************************************************************
2
3 Copyright (c) 2008-2010, BitGravity Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11
12  2. Neither the name of the BitGravity Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include "opt_route.h"
31 #include "opt_mpath.h"
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38
39 #include <sys/param.h>  
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/kernel.h>  
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/socket.h>
54 #include <sys/syslog.h>
55 #include <sys/sysctl.h>
56
57 #include <net/if.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_var.h>
60 #include <net/route.h> 
61 #include <net/flowtable.h>
62 #include <net/vnet.h>
63
64 #include <netinet/in.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
69 #ifdef INET6
70 #include <netinet/ip6.h>
71 #endif
72 #include <netinet/tcp.h>
73 #include <netinet/udp.h>
74 #include <netinet/sctp.h>
75
76 #include <libkern/jenkins.h>
77 #include <ddb/ddb.h>
78
/* IPv4 flow tuple: transport ports plus source/destination addresses. */
struct ipv4_tuple {
        uint16_t        ip_sport;       /* source port */
        uint16_t        ip_dport;       /* destination port */
        in_addr_t       ip_saddr;       /* source address */
        in_addr_t       ip_daddr;       /* destination address */
};

/* View of the IPv4 tuple as three 32-bit hash-key words. */
union ipv4_flow {
        struct ipv4_tuple ipf_ipt;
        uint32_t        ipf_key[3];
};

/* IPv6 flow tuple: transport ports plus source/destination addresses. */
struct ipv6_tuple {
        uint16_t        ip_sport;       /* source port */
        uint16_t        ip_dport;       /* destination port */
        struct in6_addr ip_saddr;       /* source address */
        struct in6_addr ip_daddr;       /* destination address */
};

/* View of the IPv6 tuple as nine 32-bit hash-key words. */
union ipv6_flow {
        struct ipv6_tuple ipf_ipt;
        uint32_t        ipf_key[9];
};
102
/*
 * Common header of a cached flow entry; the address-family specific
 * key follows it in memory (see flentry_v4 / flentry_v6 below).
 */
struct flentry {
        volatile uint32_t       f_fhash;        /* hash flowing forward */
        uint16_t                f_flags;        /* flow flags */
        uint8_t                 f_pad;
        uint8_t                 f_proto;        /* protocol */
        uint32_t                f_fibnum;       /* fib index */
        uint32_t                f_uptime;       /* uptime at last access */
        struct flentry          *f_next;        /* pointer to collision entry */
        volatile struct rtentry *f_rt;          /* rtentry for flow */
        volatile struct llentry *f_lle;         /* llentry for flow */
};

/* IPv4 entry: common header plus 3-word key. */
struct flentry_v4 {
        struct flentry  fl_entry;
        union ipv4_flow fl_flow;
};

/* IPv6 entry: common header plus 9-word key. */
struct flentry_v6 {
        struct flentry  fl_entry;
        union ipv6_flow fl_flow;
};
124
/*
 * Convenience accessors for the embedded struct flentry inside
 * flentry_v4/flentry_v6.  These must name the actual flentry members
 * (f_*); the previous definitions expanded to non-existent fl_*
 * members and could not have compiled at any use site.
 */
#define fl_fhash        fl_entry.f_fhash
#define fl_flags        fl_entry.f_flags
#define fl_proto        fl_entry.f_proto
#define fl_uptime       fl_entry.f_uptime
#define fl_rt           fl_entry.f_rt
#define fl_lle          fl_entry.f_lle

#define SECS_PER_HOUR           3600
#define SECS_PER_DAY            (24*SECS_PER_HOUR)

/* Default idle timeouts (seconds) before a flow is eligible for reclaim. */
#define SYN_IDLE                300
#define UDP_IDLE                300
#define FIN_WAIT_IDLE           600
#define TCP_IDLE                SECS_PER_DAY
139
140
/* Per-table hooks: bucket locking strategy and route allocation. */
typedef void fl_lock_t(struct flowtable *, uint32_t);
typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);

/* Bucket array: one shared array, or one array per CPU (FL_PCPU). */
union flentryp {
        struct flentry          **global;
        struct flentry          **pcpu[MAXCPU];
};

/* Counters, one cache line per slot to avoid false sharing. */
struct flowtable_stats {
        uint64_t        ft_collisions;
        uint64_t        ft_allocated;
        uint64_t        ft_misses;
        uint64_t        ft_max_depth;
        uint64_t        ft_free_checks;
        uint64_t        ft_frees;
        uint64_t        ft_hits;
        uint64_t        ft_lookups;
} __aligned(CACHE_LINE_SIZE);

struct flowtable {
        struct  flowtable_stats ft_stats[MAXCPU];
        int             ft_size;        /* number of hash buckets */
        int             ft_lock_count;  /* number of bucket mutexes (masked, so presumably a power of two) */
        uint32_t        ft_flags;
        char            *ft_name;
        fl_lock_t       *ft_lock;
        fl_lock_t       *ft_unlock;
        fl_rtalloc_t    *ft_rtalloc;
        /*
         * XXX need to pad out 
         */ 
        struct mtx      *ft_locks;
        union flentryp  ft_table;
        bitstr_t        *ft_masks[MAXCPU];      /* see flowtable_mask() */
        bitstr_t        *ft_tmpmask;
        struct flowtable *ft_next;              /* next table in vnet list */

        /* Hot counters and per-table timeouts on their own cache lines. */
        uint32_t        ft_count __aligned(CACHE_LINE_SIZE);
        uint32_t        ft_udp_idle __aligned(CACHE_LINE_SIZE);
        uint32_t        ft_fin_wait_idle;
        uint32_t        ft_syn_idle;
        uint32_t        ft_tcp_idle;
        boolean_t       ft_full;
} __aligned(CACHE_LINE_SIZE);
185
/* Kernel process that periodically walks the tables to expire flows. */
static struct proc *flowcleanerproc;
static VNET_DEFINE(struct flowtable *, flow_list_head);
static VNET_DEFINE(uint32_t, flow_hashjitter);
static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);

#define V_flow_list_head        VNET(flow_list_head)
#define V_flow_hashjitter       VNET(flow_hashjitter)
#define V_flow_ipv4_zone        VNET(flow_ipv4_zone)
#define V_flow_ipv6_zone        VNET(flow_ipv6_zone)


/* Synchronization and pacing state for the cleaner process. */
static struct cv        flowclean_cv;
static struct mtx       flowclean_lock;
static uint32_t         flowclean_cycles;
static uint32_t         flowclean_freq;
202
#ifdef FLOWTABLE_DEBUG
/*
 * Print a debug message when any of `flags' is set in ft->ft_flags.
 * The do/while(0) wrapper must NOT carry its own semicolon: the
 * previous definition ended in `while (0);' plus a stray line
 * continuation, which breaks uses such as `if (x) FLDPRINTF(...);
 * else ...' by injecting an extra empty statement.
 */
#define FLDPRINTF(ft, flags, fmt, ...)          \
do {                                            \
        if ((ft)->ft_flags & (flags))           \
                printf((fmt), __VA_ARGS__);     \
} while (0)
#else
#define FLDPRINTF(ft, flags, fmt, ...)
#endif
214
215
216 /*
217  * TODO:
218  * - Make flowtable stats per-cpu, aggregated at sysctl call time,
219  *   to avoid extra cache evictions caused by incrementing a shared
220  *   counter
221  * - add sysctls to resize && flush flow tables 
222  * - Add per flowtable sysctls for statistics and configuring timeouts
223  * - add saturation counter to rtentry to support per-packet load-balancing
224  *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
226  * - add sysctl / device node / syscall to support exporting and importing
227  *   of flows with flag to indicate that a flow was imported so should
228  *   not be considered for auto-cleaning
229  * - support explicit connection state (currently only ad-hoc for DSR)
230  * - idetach() cleanup for options VIMAGE builds.
231  */
232 VNET_DEFINE(int, flowtable_enable) = 1;
233 static VNET_DEFINE(int, flowtable_debug);
234 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
235 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
236 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
237 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
238 static VNET_DEFINE(int, flowtable_nmbflows);
239 static VNET_DEFINE(int, flowtable_ready) = 0;
240
241 #define V_flowtable_enable              VNET(flowtable_enable)
242 #define V_flowtable_debug               VNET(flowtable_debug)
243 #define V_flowtable_syn_expire          VNET(flowtable_syn_expire)
244 #define V_flowtable_udp_expire          VNET(flowtable_udp_expire)
245 #define V_flowtable_fin_wait_expire     VNET(flowtable_fin_wait_expire)
246 #define V_flowtable_tcp_expire          VNET(flowtable_tcp_expire)
247 #define V_flowtable_nmbflows            VNET(flowtable_nmbflows)
248 #define V_flowtable_ready               VNET(flowtable_ready)
249
250 SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
251 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
252     &VNET_NAME(flowtable_debug), 0, "print debug info.");
253 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
254     &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
255
256 /*
257  * XXX This does not end up updating timeouts at runtime
258  * and only reflects the value for the last table added :-/
259  */
260 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
261     &VNET_NAME(flowtable_syn_expire), 0,
262     "seconds after which to remove syn allocated flow.");
263 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
264     &VNET_NAME(flowtable_udp_expire), 0,
265     "seconds after which to remove flow allocated to UDP.");
266 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
267     &VNET_NAME(flowtable_fin_wait_expire), 0,
268     "seconds after which to remove a flow in FIN_WAIT.");
269 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
270     &VNET_NAME(flowtable_tcp_expire), 0,
271     "seconds after which to remove flow allocated to a TCP connection.");
272
273
274 /*
275  * Maximum number of flows that can be allocated of a given type.
276  *
277  * The table is allocated at boot time (for the pure caching case
278  * there is no reason why this could not be changed at runtime)
279  * and thus (currently) needs to be set with a tunable.
280  */
281 static int
282 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
283 {
284         int error, newnmbflows;
285
286         newnmbflows = V_flowtable_nmbflows;
287         error = sysctl_handle_int(oidp, &newnmbflows, 0, req); 
288         if (error == 0 && req->newptr) {
289                 if (newnmbflows > V_flowtable_nmbflows) {
290                         V_flowtable_nmbflows = newnmbflows;
291                         uma_zone_set_max(V_flow_ipv4_zone,
292                             V_flowtable_nmbflows);
293                         uma_zone_set_max(V_flow_ipv6_zone,
294                             V_flowtable_nmbflows);
295                 } else
296                         error = EINVAL;
297         }
298         return (error);
299 }
300 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
301     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
302     "Maximum number of flows allowed");
303
304
305
306 #define FS_PRINT(sb, field)     sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
307
308 static void
309 fs_print(struct sbuf *sb, struct flowtable_stats *fs)
310 {
311
312         FS_PRINT(sb, collisions);
313         FS_PRINT(sb, allocated);
314         FS_PRINT(sb, misses);
315         FS_PRINT(sb, max_depth);
316         FS_PRINT(sb, free_checks);
317         FS_PRINT(sb, frees);
318         FS_PRINT(sb, hits);
319         FS_PRINT(sb, lookups);
320 }
321
/*
 * Render a table's statistics into `sb'.  For FL_PCPU tables the
 * per-CPU counters are folded into a stack-local aggregate (sums for
 * the counters, max for ft_max_depth); otherwise slot 0 holds the
 * only live counters and is printed directly.
 */
static void
flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
{
        int i;
        struct flowtable_stats fs, *pfs;

        if (ft->ft_flags & FL_PCPU) {
                bzero(&fs, sizeof(fs));
                pfs = &fs;
                for (i = 0; i <= mp_maxid; i++) {
                        if (CPU_ABSENT(i))
                                continue;
                        pfs->ft_collisions  += ft->ft_stats[i].ft_collisions;
                        pfs->ft_allocated   += ft->ft_stats[i].ft_allocated;
                        pfs->ft_misses      += ft->ft_stats[i].ft_misses;
                        pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
                        pfs->ft_frees       += ft->ft_stats[i].ft_frees;
                        pfs->ft_hits        += ft->ft_stats[i].ft_hits;
                        pfs->ft_lookups     += ft->ft_stats[i].ft_lookups;
                        if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
                                pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
                }
        } else {
                pfs = &ft->ft_stats[0];
        }
        fs_print(sb, pfs);
}
349
350 static int
351 sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
352 {
353         struct flowtable *ft;
354         struct sbuf *sb;
355         int error;
356
357         sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
358
359         ft = V_flow_list_head;
360         while (ft != NULL) {
361                 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
362                 flowtable_show_stats(sb, ft);
363                 ft = ft->ft_next;
364         }
365         sbuf_finish(sb);
366         error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
367         sbuf_delete(sb);
368
369         return (error);
370 }
371 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
372     NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
373
374
375 #ifndef RADIX_MPATH
/*
 * Adapt rtalloc_ign_fib() to the fl_rtalloc_t signature used by the
 * flowtable; the flow hash argument is ignored here (only the
 * RADIX_MPATH build uses it — this wrapper is compiled otherwise).
 */
static void
in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
{

        rtalloc_ign_fib(ro, 0, fibnum);
}
382 #endif
383
384 static void
385 flowtable_global_lock(struct flowtable *table, uint32_t hash)
386 {       
387         int lock_index = (hash)&(table->ft_lock_count - 1);
388
389         mtx_lock(&table->ft_locks[lock_index]);
390 }
391
392 static void
393 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
394 {       
395         int lock_index = (hash)&(table->ft_lock_count - 1);
396
397         mtx_unlock(&table->ft_locks[lock_index]);
398 }
399
/*
 * "Lock" for per-cpu tables: the bucket arrays are private to each
 * CPU, so pinning the thread with critical_enter() suffices; both
 * arguments exist only to satisfy the fl_lock_t signature.
 */
static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

        critical_enter();
}
406
/* Counterpart of flowtable_pcpu_lock(): leave the critical section. */
static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

        critical_exit();
}
413
/* Bucket index and entry accessors; locking dispatches per table type. */
#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
#define FL_ENTRY_LOCK(table, hash)  (table)->ft_lock((table), (hash))
#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))

/* f_flags bits private to this file; low bits carry TH_*/FL_* state. */
#define FL_STALE        (1<<8)
#define FL_IPV6         (1<<9)
#define FL_OVERWRITE    (1<<10)
422
/*
 * Mark a flow entry stale so flow_stale() reports it reclaimable on
 * the next cleaning pass.
 */
void
flow_invalidate(struct flentry *fle)
{

        fle->f_flags |= FL_STALE;
}
429
430 static __inline int
431 proto_to_flags(uint8_t proto)
432 {
433         int flag;
434
435         switch (proto) {
436         case IPPROTO_TCP:
437                 flag = FL_TCP;
438                 break;
439         case IPPROTO_SCTP:
440                 flag = FL_SCTP;
441                 break;          
442         case IPPROTO_UDP:
443                 flag = FL_UDP;
444                 break;
445         default:
446                 flag = 0;
447                 break;
448         }
449
450         return (flag);
451 }
452
453 static __inline int
454 flags_to_proto(int flags)
455 {
456         int proto, protoflags;
457
458         protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
459         switch (protoflags) {
460         case FL_TCP:
461                 proto = IPPROTO_TCP;
462                 break;
463         case FL_SCTP:
464                 proto = IPPROTO_SCTP;
465                 break;
466         case FL_UDP:
467                 proto = IPPROTO_UDP;
468                 break;
469         default:
470                 proto = 0;
471                 break;
472         }
473         return (proto);
474 }
475
476 #ifdef INET
477 #ifdef FLOWTABLE_DEBUG
/*
 * Debug helper: print the flow tuple.  With FL_HASH_ALL the full
 * 4-tuple is shown; otherwise only the destination address.
 */
static void
ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
    struct sockaddr_in *dsin)
{
        /* Room for "255.255.255.255" plus the NUL. */
        char saddr[4*sizeof "123"], daddr[4*sizeof "123"];

        if (flags & FL_HASH_ALL) {
                inet_ntoa_r(ssin->sin_addr, saddr);
                inet_ntoa_r(dsin->sin_addr, daddr);
                printf("proto=%d %s:%d->%s:%d\n",
                    proto, saddr, ntohs(ssin->sin_port), daddr,
                    ntohs(dsin->sin_port));
        } else {
                inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
                printf("proto=%d %s\n", proto, daddr);
        }

}
496 #endif
497
/*
 * Extract the IPv4 flow tuple from the packet in `m' into ssin/dsin.
 * Addresses are always filled in; ports only when the table hashes
 * the full tuple (FL_HASH_ALL).  A TCP RST/FIN marks the flow stale.
 * Returns ENOTSUP for transport protocols without ports.
 *
 * NOTE(review): assumes the IP header (and, for FL_HASH_ALL, the
 * transport header) is contiguous in the first mbuf — no pullup is
 * performed here; confirm at the call sites.
 */
static int
ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
    struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
{
        struct ip *ip;
        uint8_t proto;
        int iphlen;
        struct tcphdr *th;
        struct udphdr *uh;
        struct sctphdr *sh;
        uint16_t sport, dport;

        proto = sport = dport = 0;
        ip = mtod(m, struct ip *);
        dsin->sin_family = AF_INET;
        dsin->sin_len = sizeof(*dsin);
        dsin->sin_addr = ip->ip_dst;
        ssin->sin_family = AF_INET;
        ssin->sin_len = sizeof(*ssin);
        ssin->sin_addr = ip->ip_src;    

        proto = ip->ip_p;
        if ((*flags & FL_HASH_ALL) == 0) {
                FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
                    *flags);
                goto skipports;
        }

        iphlen = ip->ip_hl << 2; /* XXX options? */

        switch (proto) {
        case IPPROTO_TCP:
                th = (struct tcphdr *)((caddr_t)ip + iphlen);
                sport = th->th_sport;
                dport = th->th_dport;
                /* A closing/reset connection should not be re-cached. */
                if ((*flags & FL_HASH_ALL) &&
                    (th->th_flags & (TH_RST|TH_FIN)))
                        *flags |= FL_STALE;
        break;
        case IPPROTO_UDP:
                uh = (struct udphdr *)((caddr_t)ip + iphlen);
                sport = uh->uh_sport;
                dport = uh->uh_dport;
        break;
        case IPPROTO_SCTP:
                sh = (struct sctphdr *)((caddr_t)ip + iphlen);
                sport = sh->src_port;
                dport = sh->dest_port;
        break;
        default:
                FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
                return (ENOTSUP);
                /* no port - hence not a protocol we care about */
                break;
        
        }

skipports:
        *flags |= proto_to_flags(proto);
        ssin->sin_port = sport;
        dsin->sin_port = dport;
        return (0);
}
561
/*
 * Build the 3-word IPv4 hash key and hash it.  Layout: key[1] holds
 * the source address (only under FL_HASH_ALL), key[2] the destination
 * address; with FL_HASH_ALL the two ports are packed into key[0] via
 * 16-bit stores (byte-order dependent).  Without FL_HASH_ALL the
 * per-boot jitter plus protocol seeds the hash instead, so distinct
 * protocols to the same destination hash differently.  Returns 0 when
 * the flowtable is disabled or not yet initialized.
 */
static uint32_t
ipv4_flow_lookup_hash_internal(
        struct sockaddr_in *ssin, struct sockaddr_in *dsin, 
            uint32_t *key, uint16_t flags)
{
        uint16_t sport, dport;
        uint8_t proto;
        int offset = 0;

        if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
                return (0);
        proto = flags_to_proto(flags);
        sport = dport = key[2] = key[1] = key[0] = 0;
        if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
                key[1] = ssin->sin_addr.s_addr;
                sport = ssin->sin_port;
        }
        if (dsin != NULL) {
                key[2] = dsin->sin_addr.s_addr;
                dport = dsin->sin_port;
        }
        if (flags & FL_HASH_ALL) {
                ((uint16_t *)key)[0] = sport;
                ((uint16_t *)key)[1] = dport; 
        } else
                offset = V_flow_hashjitter + proto;

        return (jenkins_hashword(key, 3, offset));
}
591
592 static struct flentry *
593 flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
594 {
595         struct sockaddr_storage ssa, dsa;
596         uint16_t flags;
597         struct sockaddr_in *dsin, *ssin;
598
599         dsin = (struct sockaddr_in *)&dsa;
600         ssin = (struct sockaddr_in *)&ssa;
601         bzero(dsin, sizeof(*dsin));
602         bzero(ssin, sizeof(*ssin));
603         flags = ft->ft_flags;
604         if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
605                 return (NULL);
606
607         return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
608 }
609
/*
 * Populate a struct route from a cached IPv4 flow: rebuild ro_dst
 * from the stored hash key and hand back the cached rtentry/llentry.
 */
void
flow_to_route(struct flentry *fle, struct route *ro)
{
        uint32_t *hashkey = NULL;
        struct sockaddr_in *sin;

        sin = (struct sockaddr_in *)&ro->ro_dst;
        sin->sin_family = AF_INET;
        sin->sin_len = sizeof(*sin);
        hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
        /* key[2] is the destination address (see the v4 hash routine). */
        sin->sin_addr.s_addr = hashkey[2];
        ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
        ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
}
624 #endif /* INET */
625
626 #ifdef INET6
627 /*
628  * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
629  * then it sets p to point at the offset "len" in the mbuf. WARNING: the
630  * pointer might become stale after other pullups (but we never use it
631  * this way).
632  */
/*
 * NOTE: PULLUP_TO relies on an mbuf `m' and a `receive_failed' label
 * being in scope at the expansion site; despite the name it does not
 * pull data up — it only checks the first mbuf is long enough and
 * sets p to the requested offset.
 */
#define PULLUP_TO(_len, p, T)                                           \
do {                                                                    \
        int x = (_len) + sizeof(T);                                     \
        if ((m)->m_len < x) {                                           \
                goto receive_failed;                                    \
        }                                                               \
        p = (mtod(m, char *) + (_len));                                 \
} while (0)

/* Header-cast helpers for the pulled-up pointer. */
#define TCP(p)          ((struct tcphdr *)(p))
#define SCTP(p)         ((struct sctphdr *)(p))
#define UDP(p)          ((struct udphdr *)(p))
645
/*
 * Extract the IPv6 flow tuple from the packet in `m' into
 * ssin6/dsin6.  With FL_HASH_ALL the extension-header chain is walked
 * to locate the transport header and its ports; without it only the
 * addresses are taken.  Returns ENOTSUP when the packet is truncated
 * or no transport port was found.
 */
static int
ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
    struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
{
        struct ip6_hdr *ip6;
        uint8_t proto;
        int hlen;
        uint16_t src_port, dst_port;
        u_short offset;
        void *ulp;

        offset = hlen = src_port = dst_port = 0;
        ulp = NULL;
        ip6 = mtod(m, struct ip6_hdr *);
        hlen = sizeof(struct ip6_hdr);
        proto = ip6->ip6_nxt;

        if ((*flags & FL_HASH_ALL) == 0)
                goto skipports;

        /* Walk extension headers until a terminal header is reached. */
        while (ulp == NULL) {
                switch (proto) {
                case IPPROTO_ICMPV6:
                case IPPROTO_OSPFIGP:
                case IPPROTO_PIM:
                case IPPROTO_CARP:
                case IPPROTO_ESP:
                case IPPROTO_NONE:
                        /* Portless protocols: terminate the walk. */
                        ulp = ip6;
                        break;
                case IPPROTO_TCP:
                        PULLUP_TO(hlen, ulp, struct tcphdr);
                        dst_port = TCP(ulp)->th_dport;
                        src_port = TCP(ulp)->th_sport;
                        /* A closing/reset connection should not be re-cached. */
                        if ((*flags & FL_HASH_ALL) &&
                            (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
                                *flags |= FL_STALE;
                        break;
                case IPPROTO_SCTP:
                        PULLUP_TO(hlen, ulp, struct sctphdr);
                        src_port = SCTP(ulp)->src_port;
                        dst_port = SCTP(ulp)->dest_port;
                        break;
                case IPPROTO_UDP:
                        PULLUP_TO(hlen, ulp, struct udphdr);
                        dst_port = UDP(ulp)->uh_dport;
                        src_port = UDP(ulp)->uh_sport;
                        break;
                case IPPROTO_HOPOPTS:   /* RFC 2460 */
                        PULLUP_TO(hlen, ulp, struct ip6_hbh);
                        hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
                        proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
                        ulp = NULL;
                        break;
                case IPPROTO_ROUTING:   /* RFC 2460 */
                        PULLUP_TO(hlen, ulp, struct ip6_rthdr); 
                        hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
                        proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
                        ulp = NULL;
                        break;
                case IPPROTO_FRAGMENT:  /* RFC 2460 */
                        PULLUP_TO(hlen, ulp, struct ip6_frag);
                        hlen += sizeof (struct ip6_frag);
                        proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
                        offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
                            IP6F_OFF_MASK;
                        ulp = NULL;
                        break;
                case IPPROTO_DSTOPTS:   /* RFC 2460 */
                        PULLUP_TO(hlen, ulp, struct ip6_hbh);
                        hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
                        proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
                        ulp = NULL;
                        break;
                case IPPROTO_AH:        /* RFC 2402 */
                        PULLUP_TO(hlen, ulp, struct ip6_ext);
                        hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
                        proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
                        ulp = NULL;
                        break;
                default:
                        PULLUP_TO(hlen, ulp, struct ip6_ext);
                        break;
                }
        }

        /* No source port found (portless protocol or truncated mbuf). */
        if (src_port == 0) {
        receive_failed:
                return (ENOTSUP);
        }

skipports:
        dsin6->sin6_family = AF_INET6;
        dsin6->sin6_len = sizeof(*dsin6);
        dsin6->sin6_port = dst_port;
        memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));

        ssin6->sin6_family = AF_INET6;
        ssin6->sin6_len = sizeof(*ssin6);
        ssin6->sin6_port = src_port;
        memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
        *flags |= proto_to_flags(proto);

        return (0);
}
751
/*
 * Clear all nine 32-bit words of an IPv6-sized flow key.  The
 * argument is parenthesized so the macro stays safe with non-trivial
 * pointer expressions.
 */
#define zero_key(key)           \
do {                            \
        (key)[0] = 0;           \
        (key)[1] = 0;           \
        (key)[2] = 0;           \
        (key)[3] = 0;           \
        (key)[4] = 0;           \
        (key)[5] = 0;           \
        (key)[6] = 0;           \
        (key)[7] = 0;           \
        (key)[8] = 0;           \
} while (0)
764         
/*
 * Build the 9-word IPv6 hash key and hash it.  Layout: key[1..4] hold
 * the destination address, key[5..8] the source address (only under
 * FL_HASH_ALL); with FL_HASH_ALL the ports are packed into key[0] via
 * 16-bit stores (byte-order dependent).  Without FL_HASH_ALL the
 * per-boot jitter plus protocol seeds the hash instead.  Returns 0
 * when the flowtable is disabled or not yet initialized.
 */
static uint32_t
ipv6_flow_lookup_hash_internal(
        struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, 
            uint32_t *key, uint16_t flags)
{
        uint16_t sport, dport;
        uint8_t proto;
        int offset = 0;

        if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
                return (0);

        proto = flags_to_proto(flags);
        zero_key(key);
        sport = dport = 0;
        if (dsin6 != NULL) {
                memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
                dport = dsin6->sin6_port;
        }
        if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
                memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
                sport = ssin6->sin6_port;
        }
        if (flags & FL_HASH_ALL) {
                ((uint16_t *)key)[0] = sport;
                ((uint16_t *)key)[1] = dport; 
        } else
                offset = V_flow_hashjitter + proto;

        return (jenkins_hashword(key, 9, offset));
}
796
797 static struct flentry *
798 flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
799 {
800         struct sockaddr_storage ssa, dsa;
801         struct sockaddr_in6 *dsin6, *ssin6;     
802         uint16_t flags;
803
804         dsin6 = (struct sockaddr_in6 *)&dsa;
805         ssin6 = (struct sockaddr_in6 *)&ssa;
806         bzero(dsin6, sizeof(*dsin6));
807         bzero(ssin6, sizeof(*ssin6));
808         flags = ft->ft_flags;
809         
810         if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
811                 return (NULL);
812
813         return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
814 }
815
816 void
817 flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
818 {
819         uint32_t *hashkey = NULL;
820         struct sockaddr_in6 *sin6;
821
822         sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
823
824         sin6->sin6_family = AF_INET6;
825         sin6->sin6_len = sizeof(*sin6);
826         hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
827         memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
828         ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
829         ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
830
831 }
832 #endif /* INET6 */
833
834 static bitstr_t *
835 flowtable_mask(struct flowtable *ft)
836 {
837         bitstr_t *mask;
838
839         if (ft->ft_flags & FL_PCPU)
840                 mask = ft->ft_masks[curcpu];
841         else
842                 mask = ft->ft_masks[0];
843
844         return (mask);
845 }
846
847 static struct flentry **
848 flowtable_entry(struct flowtable *ft, uint32_t hash)
849 {
850         struct flentry **fle;
851         int index = (hash % ft->ft_size);
852
853         if (ft->ft_flags & FL_PCPU) {
854                 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
855                 fle = &ft->ft_table.pcpu[curcpu][index];
856         } else {
857                 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
858                 fle = &ft->ft_table.global[index];
859         }
860         
861         return (fle);
862 }
863
/*
 * Decide whether a flow entry is reclaimable: the entry is unhashed,
 * its cached route is unusable, it was explicitly marked FL_STALE, or
 * it has idled past the timeout for its state.  The TH_SYN/TH_ACK/
 * TH_FIN bits recorded in f_flags approximate connection state: none
 * set -> UDP-style timeout, SYN only -> half-open, SYN|ACK ->
 * established, FIN -> closing.
 */
static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
        time_t idle_time;

        /* Unhashed entry, downed host route, or no interface. */
        if ((fle->f_fhash == 0)
            || ((fle->f_rt->rt_flags & RTF_HOST) &&
                ((fle->f_rt->rt_flags & (RTF_UP))
                    != (RTF_UP)))
            || (fle->f_rt->rt_ifp == NULL))
                return (1);

        idle_time = time_uptime - fle->f_uptime;

        /*
         * Idle timeouts keyed on the recorded TCP flag bits; the final
         * clause re-checks route validity (RTF_UP for non-host routes
         * as well; the rt_ifp test repeats the check above).
         */
        if ((fle->f_flags & FL_STALE) ||
            ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
                && (idle_time > ft->ft_udp_idle)) ||
            ((fle->f_flags & TH_FIN)
                && (idle_time > ft->ft_fin_wait_idle)) ||
            ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
                && (idle_time > ft->ft_syn_idle)) ||
            ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
                && (idle_time > ft->ft_tcp_idle)) ||
            ((fle->f_rt->rt_flags & RTF_UP) == 0 || 
                (fle->f_rt->rt_ifp == NULL)))
                return (1);

        return (0);
}
893
894 static void
895 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
896 {
897         uint32_t *hashkey;
898         int i, nwords;
899
900         if (fle->f_flags & FL_IPV6) {
901                 nwords = 9;
902                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
903         } else {
904                 nwords = 3;
905                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
906         }
907         
908         for (i = 0; i < nwords; i++) 
909                 hashkey[i] = key[i];
910 }
911
912 static struct flentry *
913 flow_alloc(struct flowtable *ft)
914 {
915         struct flentry *newfle;
916         uma_zone_t zone;
917
918         newfle = NULL;
919         zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
920
921         newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
922         if (newfle != NULL)
923                 atomic_add_int(&ft->ft_count, 1);
924         return (newfle);
925 }
926
927 static void
928 flow_free(struct flentry *fle, struct flowtable *ft)
929 {
930         uma_zone_t zone;
931
932         zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
933         atomic_add_int(&ft->ft_count, -1);
934         uma_zfree(zone, fle);
935 }
936
/*
 * Return whether the table should be treated as full, refusing new
 * cached flows.  The cached ft_full flag is updated with hysteresis:
 * set once ft_count exceeds ~31/32 of V_flowtable_nmbflows, cleared
 * only after it falls back below ~7/8.  On a state transition the
 * cleaner interval and (for transmit-cache tables) the idle timeouts
 * are retuned, and the cleaner thread may be woken.
 *
 * NOTE(review): called without synchronization -- concurrent callers
 * can race on ft_full/flowclean_freq; presumably tolerated since these
 * are advisory.  Also note the transition directions: leaving the full
 * state speeds cleaning up (4*hz, 5s idle) while entering it slows
 * cleaning down (20*hz) -- confirm that is the intended tuning.
 */
static int
flow_full(struct flowtable *ft)
{
	boolean_t full;
	uint32_t count;
	
	full = ft->ft_full;
	count = ft->ft_count;

	if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
		ft->ft_full = FALSE;
	else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
		ft->ft_full = TRUE;
	
	if (full && !ft->ft_full) {
		/* Transition full -> not-full. */
		flowclean_freq = 4*hz;
		if ((ft->ft_flags & FL_HASH_ALL) == 0)
			ft->ft_udp_idle = ft->ft_fin_wait_idle =
			    ft->ft_syn_idle = ft->ft_tcp_idle = 5;
		cv_broadcast(&flowclean_cv);
	} else if (!full && ft->ft_full) {
		/* Transition not-full -> full. */
		flowclean_freq = 20*hz;
		if ((ft->ft_flags & FL_HASH_ALL) == 0)
			ft->ft_udp_idle = ft->ft_fin_wait_idle =
			    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
	}

	return (ft->ft_full);
}
966
967 static int
968 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
969     uint32_t fibnum, struct route *ro, uint16_t flags)
970 {
971         struct flentry *fle, *fletail, *newfle, **flep;
972         struct flowtable_stats *fs = &ft->ft_stats[curcpu];
973         int depth;
974         bitstr_t *mask;
975         uint8_t proto;
976
977         newfle = flow_alloc(ft);
978         if (newfle == NULL)
979                 return (ENOMEM);
980
981         newfle->f_flags |= (flags & FL_IPV6);
982         proto = flags_to_proto(flags);
983
984         FL_ENTRY_LOCK(ft, hash);
985         mask = flowtable_mask(ft);
986         flep = flowtable_entry(ft, hash);
987         fletail = fle = *flep;
988
989         if (fle == NULL) {
990                 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
991                 *flep = fle = newfle;
992                 goto skip;
993         } 
994         
995         depth = 0;
996         fs->ft_collisions++;
997         /*
998          * find end of list and make sure that we were not
999          * preempted by another thread handling this flow
1000          */
1001         while (fle != NULL) {
1002                 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
1003                         /*
1004                          * there was either a hash collision
1005                          * or we lost a race to insert
1006                          */
1007                         FL_ENTRY_UNLOCK(ft, hash);
1008                         flow_free(newfle, ft);
1009                         
1010                         if (flags & FL_OVERWRITE) 
1011                                 goto skip;
1012                         return (EEXIST);
1013                 }
1014                 /*
1015                  * re-visit this double condition XXX
1016                  */
1017                 if (fletail->f_next != NULL)
1018                         fletail = fle->f_next;
1019
1020                 depth++;
1021                 fle = fle->f_next;
1022         } 
1023
1024         if (depth > fs->ft_max_depth)
1025                 fs->ft_max_depth = depth;
1026         fletail->f_next = newfle;
1027         fle = newfle;
1028 skip:
1029         flowtable_set_hashkey(fle, key);
1030
1031         fle->f_proto = proto;
1032         fle->f_rt = ro->ro_rt;
1033         fle->f_lle = ro->ro_lle;
1034         fle->f_fhash = hash;
1035         fle->f_fibnum = fibnum;
1036         fle->f_uptime = time_uptime;
1037         FL_ENTRY_UNLOCK(ft, hash);
1038         return (0);
1039 }
1040
1041 int
1042 kern_flowtable_insert(struct flowtable *ft,
1043     struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1044     struct route *ro, uint32_t fibnum, int flags)
1045 {
1046         uint32_t key[9], hash;
1047
1048         flags = (ft->ft_flags | flags | FL_OVERWRITE);
1049         hash = 0;
1050
1051 #ifdef INET
1052         if (ssa->ss_family == AF_INET) 
1053                 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1054                     (struct sockaddr_in *)dsa, key, flags);
1055 #endif
1056 #ifdef INET6
1057         if (ssa->ss_family == AF_INET6) 
1058                 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1059                     (struct sockaddr_in6 *)dsa, key, flags);
1060 #endif  
1061         if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1062                 return (EINVAL);
1063
1064         FLDPRINTF(ft, FL_DEBUG,
1065             "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1066             key[0], key[1], key[2], hash, fibnum, flags);
1067         return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
1068 }
1069
1070 static int
1071 flowtable_key_equal(struct flentry *fle, uint32_t *key)
1072 {
1073         uint32_t *hashkey;
1074         int i, nwords;
1075
1076         if (fle->f_flags & FL_IPV6) {
1077                 nwords = 9;
1078                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1079         } else {
1080                 nwords = 3;
1081                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1082         }
1083
1084         for (i = 0; i < nwords; i++) 
1085                 if (hashkey[i] != key[i])
1086                         return (0);
1087
1088         return (1);
1089 }
1090
1091 struct flentry *
1092 flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
1093 {
1094         struct flentry *fle = NULL;
1095
1096 #ifdef INET
1097         if (af == AF_INET)
1098                 fle = flowtable_lookup_mbuf4(ft, m);
1099 #endif
1100 #ifdef INET6
1101         if (af == AF_INET6)
1102                 fle = flowtable_lookup_mbuf6(ft, m);
1103 #endif  
1104         if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
1105                 m->m_flags |= M_FLOWID;
1106                 m->m_pkthdr.flowid = fle->f_fhash;
1107         }
1108         return (fle);
1109 }
1110         
/*
 * Core lookup: find (or create) the flow entry for the ssa/dsa pair in
 * fib 'fibnum'.  On a miss (unless FL_NOAUTO or the table is full) a
 * route and llentry are resolved and a new entry is inserted; the entry
 * keeps those references.  Returns NULL when the flow cannot or should
 * not be cached.
 *
 * NOTE(review): on the miss path, after a successful insert the
 * function returns the stale local 'fle' (NULL for an empty bucket, or
 * the last chain entry examined) rather than the entry just inserted --
 * verify callers tolerate this.
 */
struct flentry *
flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
    struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
{
	uint32_t key[9], hash;
	struct flentry *fle;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];
	uint8_t proto = 0;
	int error = 0;
	struct rtentry *rt;
	struct llentry *lle;
	struct route sro, *ro;
	struct route_in6 sro6;

	sro.ro_rt = sro6.ro_rt = NULL;
	sro.ro_lle = sro6.ro_lle = NULL;
	ro = NULL;
	hash = 0;
	flags |= ft->ft_flags;
	proto = flags_to_proto(flags);
#ifdef INET
	if (ssa->ss_family == AF_INET) {
		struct sockaddr_in *ssin, *dsin;

		ro = &sro;
		memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
		/*
		 * The harvested source and destination addresses
		 * may contain port information if the packet is 
		 * from a transport protocol (e.g. TCP/UDP). The 
		 * port field must be cleared before performing 
		 * a route lookup.
		 */
		((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
		dsin = (struct sockaddr_in *)dsa;
		ssin = (struct sockaddr_in *)ssa;
		/* Never cache self-addressed or loopback (127/8) flows. */
		if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
		    (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
		    (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
			return (NULL);

		hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
	}
#endif
#ifdef INET6
	if (ssa->ss_family == AF_INET6) {
		struct sockaddr_in6 *ssin6, *dsin6;

		ro = (struct route *)&sro6;
		memcpy(&sro6.ro_dst, dsa,
		    sizeof(struct sockaddr_in6));
		((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
		dsin6 = (struct sockaddr_in6 *)dsa;
		ssin6 = (struct sockaddr_in6 *)ssa;

		flags |= FL_IPV6;
		hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
	}
#endif
	/*
	 * Ports are zero and this isn't a transmit cache
	 * - thus not a protocol for which we need to keep 
	 * state
	 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
	 */
	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
		return (NULL);

	fs->ft_lookups++;
	FL_ENTRY_LOCK(ft, hash);
	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
		FL_ENTRY_UNLOCK(ft, hash);
		goto uncached;
	}
keycheck:	
	/*
	 * Walk the bucket chain looking for an exact, still-usable
	 * match: same hash, key, protocol and fib, with a live route.
	 */
	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	if ((rt != NULL)
	    && fle->f_fhash == hash
	    && flowtable_key_equal(fle, key)
	    && (proto == fle->f_proto)
	    && (fibnum == fle->f_fibnum)
	    && (rt->rt_flags & RTF_UP)
	    && (rt->rt_ifp != NULL)) {
		fs->ft_hits++;
		fle->f_uptime = time_uptime;
		fle->f_flags |= flags;
		FL_ENTRY_UNLOCK(ft, hash);
		return (fle);
	} else if (fle->f_next != NULL) {
		fle = fle->f_next;
		goto keycheck;
	}
	FL_ENTRY_UNLOCK(ft, hash);
uncached:
	if (flags & FL_NOAUTO || flow_full(ft))
		return (NULL);

	fs->ft_misses++;
	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */

#ifdef INVARIANTS
	if ((ro->ro_dst.sa_family != AF_INET) &&
	    (ro->ro_dst.sa_family != AF_INET6))
		panic("sa_family == %d\n", ro->ro_dst.sa_family);
#endif

	ft->ft_rtalloc(ro, hash, fibnum);
	if (ro->ro_rt == NULL) 
		error = ENETUNREACH;
	else {
		/* These locals intentionally shadow the outer rt/lle. */
		struct llentry *lle = NULL;
		struct sockaddr_storage *l3addr;
		struct rtentry *rt = ro->ro_rt;
		struct ifnet *ifp = rt->rt_ifp;

		/* Point-to-point and loopback traffic is never cached. */
		if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
#ifdef INET6
		if (ssa->ss_family == AF_INET6) {
			struct sockaddr_in6 *dsin6;

			dsin6 = (struct sockaddr_in6 *)dsa;			
			if (in6_localaddr(&dsin6->sin6_addr)) {
				RTFREE(rt);
				ro->ro_rt = NULL;
				return (NULL);				
			}

			/* Resolve L2 for the gateway or the final dst. */
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;
			
			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp);
		}
#endif	
#ifdef INET
		if (ssa->ss_family == AF_INET) {
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;
			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);	
		}
			
#endif
		ro->ro_lle = lle;

		if (lle == NULL) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
		/* On success the entry takes over the rt/lle references. */
		error = flowtable_insert(ft, hash, key, fibnum, ro, flags);

		if (error) {
			RTFREE(rt);
			LLE_FREE(lle);
			ro->ro_rt = NULL;
			ro->ro_lle = NULL;
		}
	} 

	return ((error) ? NULL : fle);
}
1290
1291 /*
1292  * used by the bit_alloc macro
1293  */
1294 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
1295         
/*
 * Allocate and register a flowtable with 'nentry' hash buckets.
 * FL_PCPU gives each CPU its own bucket array and mask (buckets are
 * then protected by critical sections via the pcpu lock callbacks);
 * otherwise a single shared array is protected by an array of mutexes.
 * The new table is appended to the vnet's cleaner list.
 */
struct flowtable *
flowtable_alloc(char *name, int nentry, int flags)
{
	struct flowtable *ft, *fttail;
	int i;

	/* Lazily seed the hash perturbation value used by all tables. */
	if (V_flow_hashjitter == 0)
		V_flow_hashjitter = arc4random();

	KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));

	ft = malloc(sizeof(struct flowtable),
	    M_RTABLE, M_WAITOK | M_ZERO);

	ft->ft_name = name;
	ft->ft_flags = flags;
	ft->ft_size = nentry;
#ifdef RADIX_MPATH
	ft->ft_rtalloc = rtalloc_mpath_fib;
#else
	ft->ft_rtalloc = in_rtalloc_ign_wrapper;
#endif
	if (flags & FL_PCPU) {
		ft->ft_lock = flowtable_pcpu_lock;
		ft->ft_unlock = flowtable_pcpu_unlock;

		for (i = 0; i <= mp_maxid; i++) {
			ft->ft_table.pcpu[i] =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
			ft->ft_masks[i] = bit_alloc(nentry);
		}
	} else {
		/*
		 * NOTE(review): this sizes the lock array from the CPU
		 * count (2x CPUs when a power of two, else based on
		 * fls()); the non-power-of-two result is not itself a
		 * power of two -- confirm FL_ENTRY_LOCK's index
		 * computation tolerates that.
		 */
		ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
		    (fls(mp_maxid + 1) << 1));
		
		ft->ft_lock = flowtable_global_lock;
		ft->ft_unlock = flowtable_global_unlock;
		ft->ft_table.global =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
		ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
				M_RTABLE, M_WAITOK | M_ZERO);
		for (i = 0; i < ft->ft_lock_count; i++)
			mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);

		ft->ft_masks[0] = bit_alloc(nentry);
	}
	ft->ft_tmpmask = bit_alloc(nentry);

	/*
	 * In the local transmit case the table truly is 
	 * just a cache - so everything is eligible for
	 * replacement after 5s of non-use
	 */
	if (flags & FL_HASH_ALL) {
		ft->ft_udp_idle = V_flowtable_udp_expire;
		ft->ft_syn_idle = V_flowtable_syn_expire;
		ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
		/*
		 * NOTE(review): ft_tcp_idle is set from the fin_wait
		 * expire tunable -- looks like a copy/paste of the line
		 * above; confirm whether a tcp-expire tunable was meant.
		 */
		ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
	} else {
		ft->ft_udp_idle = ft->ft_fin_wait_idle =
		    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
		
	}

	/*
	 * hook in to the cleaner list
	 */
	if (V_flow_list_head == NULL)
		V_flow_list_head = ft;
	else {
		fttail = V_flow_list_head;
		while (fttail->ft_next != NULL)
			fttail = fttail->ft_next;
		fttail->ft_next = ft;
	}

	return (ft);
}
1376
1377 /*
1378  * The rest of the code is devoted to garbage collection of expired entries.
1379  * It is a new additon made necessary by the switch to dynamically allocating
1380  * flow tables.
1381  * 
1382  */
1383 static void
1384 fle_free(struct flentry *fle, struct flowtable *ft)
1385 {
1386         struct rtentry *rt;
1387         struct llentry *lle;
1388
1389         rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1390         lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1391         RTFREE(rt);
1392         LLE_FREE(lle);
1393         flow_free(fle, ft);
1394 }
1395
/*
 * Sweep the (current CPU's) table and unlink flow entries that are
 * stale -- or, when 'rt' is non-NULL, exactly those entries caching
 * that route.  Unlinked entries are collected on a private list and
 * destroyed only after all bucket locks have been dropped, since
 * fle_free() may sleep/lock in RTFREE/LLE_FREE.
 */
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
{
	int curbit = 0, count;
	struct flentry *fle,  **flehead, *fleprev;
	struct flentry *flefreehead, *flefreetail, *fletmp;
	bitstr_t *mask, *tmpmask;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];

	flefreehead = flefreetail = NULL;
	/* Snapshot the occupancy mask; only set bits need visiting. */
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FL_ENTRY_LOCK(ft, curbit);
		flehead = flowtable_entry(ft, curbit);
		fle = fleprev = *flehead;

		fs->ft_free_checks++;
#ifdef DIAGNOSTIC
		if (fle == NULL && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif		
		while (fle != NULL) {
			/* Keep entries that don't match the criterion. */
			if (rt != NULL) {
				if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
					fleprev = fle;
					fle = fle->f_next;
					continue;
				}
			} else if (!flow_stale(ft, fle)) {
				fleprev = fle;
				fle = fle->f_next;
				continue;
			}
			/*
			 * delete head of the list
			 *
			 * NOTE(review): the fle != fleprev sub-case here
			 * (reached after keeping at least one entry and
			 * then removing a later one while fleprev still
			 * equals *flehead) rewrites *flehead to 'fle',
			 * dropping the kept entries in between -- verify
			 * this branch against the intended list surgery.
			 */
			if (fleprev == *flehead) {
				fletmp = fleprev;
				if (fle == fleprev) {
					fleprev = *flehead = fle->f_next;
				} else
					fleprev = *flehead = fle;
				fle = fle->f_next;
			} else {
				/*
				 * don't advance fleprev
				 */
				fletmp = fle;
				fleprev->f_next = fle->f_next;
				fle = fleprev->f_next;
			}

			/* Append the victim to the deferred-free list. */
			if (flefreehead == NULL)
				flefreehead = flefreetail = fletmp;
			else {
				flefreetail->f_next = fletmp;
				flefreetail = fletmp;
			}
			fletmp->f_next = NULL;
		}
		/* Bucket emptied: clear its bit in the live mask. */
		if (*flehead == NULL)
			bit_clear(mask, curbit);
		FL_ENTRY_UNLOCK(ft, curbit);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
	/* All locks dropped; now actually destroy the collected entries. */
	count = 0;
	while ((fle = flefreehead) != NULL) {
		flefreehead = fle->f_next;
		count++;
		fs->ft_frees++;
		fle_free(fle, ft);
	}
	if (V_flowtable_debug && count)
		log(LOG_DEBUG, "freed %d flow entries\n", count);
}
1489
1490 void
1491 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1492 {
1493         int i;
1494
1495         if (ft->ft_flags & FL_PCPU) {
1496                 for (i = 0; i <= mp_maxid; i++) {
1497                         if (CPU_ABSENT(i))
1498                                 continue;
1499                         
1500                         if (smp_started == 1) {
1501                                 thread_lock(curthread);
1502                                 sched_bind(curthread, i);
1503                                 thread_unlock(curthread);
1504                         }
1505
1506                         flowtable_free_stale(ft, rt);
1507
1508                         if (smp_started == 1) {
1509                                 thread_lock(curthread);
1510                                 sched_unbind(curthread);
1511                                 thread_unlock(curthread);
1512                         }
1513                 }
1514         } else {
1515                 flowtable_free_stale(ft, rt);
1516         }
1517 }
1518
1519 static void
1520 flowtable_clean_vnet(void)
1521 {
1522         struct flowtable *ft;
1523         int i;
1524
1525         ft = V_flow_list_head;
1526         while (ft != NULL) {
1527                 if (ft->ft_flags & FL_PCPU) {
1528                         for (i = 0; i <= mp_maxid; i++) {
1529                                 if (CPU_ABSENT(i))
1530                                         continue;
1531
1532                                 if (smp_started == 1) {
1533                                         thread_lock(curthread);
1534                                         sched_bind(curthread, i);
1535                                         thread_unlock(curthread);
1536                                 }
1537
1538                                 flowtable_free_stale(ft, NULL);
1539
1540                                 if (smp_started == 1) {
1541                                         thread_lock(curthread);
1542                                         sched_unbind(curthread);
1543                                         thread_unlock(curthread);
1544                                 }
1545                         }
1546                 } else {
1547                         flowtable_free_stale(ft, NULL);
1548                 }
1549                 ft = ft->ft_next;
1550         }
1551 }
1552
/*
 * Body of the "flowcleaner" kernel process: forever sweep all vnets'
 * flowtables, bump the cycle counter (observed by flowtable_flush()),
 * wake any waiters, and sleep for flowclean_freq ticks or until kicked
 * via flowclean_cv.  Never returns.
 *
 * NOTE(review): flowclean_cycles is incremented outside flowclean_lock
 * while flowtable_flush() reads it under the lock -- presumably benign
 * (monotonic counter), but confirm.
 */
static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	if (bootverbose)
		log(LOG_INFO, "flowtable cleaner started\n");
	while (1) {
		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
			flowtable_clean_vnet();
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		flowclean_cycles++;
		/*
		 * The 10 second interval between cleaning checks
		 * is arbitrary
		 */
		mtx_lock(&flowclean_lock);
		cv_broadcast(&flowclean_cv);
		cv_timedwait(&flowclean_cv, &flowclean_lock, flowclean_freq);
		mtx_unlock(&flowclean_lock);
	}
}
1580
1581 static void
1582 flowtable_flush(void *unused __unused)
1583 {
1584         uint64_t start;
1585
1586         mtx_lock(&flowclean_lock);
1587         start = flowclean_cycles;
1588         while (start == flowclean_cycles) {
1589                 cv_broadcast(&flowclean_cv);
1590                 cv_wait(&flowclean_cv, &flowclean_lock);
1591         }
1592         mtx_unlock(&flowclean_lock);
1593 }
1594
/* Kernel-process descriptor for the periodic cleaner; started at boot. */
static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1601
/*
 * Per-vnet initialization: derive the global flow-entry limit from
 * maxusers and CPU count, create the capped UMA zones for IPv4/IPv6
 * entries, and mark the flowtable subsystem ready for this vnet.
 */
static void
flowtable_init_vnet(const void *unused __unused)
{

	V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
	V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);	
	uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
	uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
	V_flowtable_ready = 1;
}
VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
    flowtable_init_vnet, NULL);
1617
/*
 * One-time global initialization: set up the cleaner's condvar/mutex,
 * flush flows referencing a departing interface, and start with the
 * relaxed (20 s) cleaning interval.
 */
static void
flowtable_init(const void *unused __unused)
{

	cv_init(&flowclean_cv, "flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
	flowclean_freq = 20*hz;
}
SYSINIT(flowtable_init, SI_SUB_SMP, SI_ORDER_MIDDLE,
    flowtable_init, NULL);
1630
1631
1632 #ifdef VIMAGE
/*
 * Per-vnet teardown (VIMAGE only): mark the subsystem not ready and
 * destroy the flow-entry zones.
 *
 * NOTE(review): the zones are destroyed without freeing the tables'
 * entries or the tables themselves -- confirm a cleanup pass runs
 * before this SYSUNINIT.
 */
static void
flowtable_uninit(const void *unused __unused)
{

	V_flowtable_ready = 0;
	uma_zdestroy(V_flow_ipv4_zone);
	uma_zdestroy(V_flow_ipv6_zone);
}

VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_uninit, NULL);
1644 #endif
1645
1646 #ifdef DDB
1647 static uint32_t *
1648 flowtable_get_hashkey(struct flentry *fle)
1649 {
1650         uint32_t *hashkey;
1651
1652         if (fle->f_flags & FL_IPV6)
1653                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1654         else
1655                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1656
1657         return (hashkey);
1658 }
1659
1660 static bitstr_t *
1661 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1662 {
1663         bitstr_t *mask;
1664
1665         if (ft->ft_flags & FL_PCPU)
1666                 mask = ft->ft_masks[cpuid];
1667         else
1668                 mask = ft->ft_masks[0];
1669
1670         return (mask);
1671 }
1672
1673 static struct flentry **
1674 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1675 {
1676         struct flentry **fle;
1677         int index = (hash % ft->ft_size);
1678
1679         if (ft->ft_flags & FL_PCPU) {
1680                 fle = &ft->ft_table.pcpu[cpuid][index];
1681         } else {
1682                 fle = &ft->ft_table.global[index];
1683         }
1684         
1685         return (fle);
1686 }
1687
/*
 * DDB helper: pretty-print a single flow entry -- addresses/ports
 * (IPv4 only), flag bits, route/interface state, raw key words, hash,
 * idle time and fib.
 */
static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	int rt_valid, ifp_valid;
	uint16_t sport, dport;
	uint32_t *hashkey;
	char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
	volatile struct rtentry *rt;
	struct ifnet *ifp = NULL;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt = fle->f_rt;
	rt_valid = rt != NULL;
	if (rt_valid) 
		ifp = rt->rt_ifp;
	ifp_valid = ifp != NULL;
	hashkey = flowtable_get_hashkey(fle);
	if (fle->f_flags & FL_IPV6)
		goto skipaddr;

	/*
	 * IPv4 key layout as used here: key[0] = ports (dport,sport),
	 * key[1] = source address, key[2] = destination address.
	 */
	inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
	if (ft->ft_flags & FL_HASH_ALL) {
		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);		
		sport = ntohs(((uint16_t *)hashkey)[0]);
		dport = ntohs(((uint16_t *)hashkey)[1]);
		db_printf("%s:%d->%s:%d",
		    saddr, sport, daddr,
		    dport);
	} else 
		db_printf("%s ", daddr);
    
skipaddr:
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
	if (fle->f_flags & FL_TCP)
		db_printf(" FL_TCP ");
	if (fle->f_flags & FL_UDP)
		db_printf(" FL_UDP ");
	if (rt_valid) {
		if (rt->rt_flags & RTF_UP)
			db_printf(" RTF_UP ");
	}
	if (ifp_valid) {
		if (ifp->if_flags & IFF_LOOPBACK)
			db_printf(" IFF_LOOPBACK ");
		if (ifp->if_flags & IFF_UP)
			db_printf(" IFF_UP ");		
		if (ifp->if_flags & IFF_POINTOPOINT)
			db_printf(" IFF_POINTOPOINT ");		
	}
	if (fle->f_flags & FL_IPV6)
		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
		    hashkey[0], hashkey[1], hashkey[2],
		    hashkey[3], hashkey[4], hashkey[5],
		    hashkey[6], hashkey[7], hashkey[8]);
	else
		db_printf("\n\tkey=%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2]);
	db_printf("hash=%08x idle_time=%03d"
	    "\n\tfibnum=%02d rt=%p",
	    fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
	db_printf("\n");
}
1752
1753 static void
1754 flowtable_show(struct flowtable *ft, int cpuid)
1755 {
1756         int curbit = 0;
1757         struct flentry *fle,  **flehead;
1758         bitstr_t *mask, *tmpmask;
1759
1760         if (cpuid != -1)
1761                 db_printf("cpu: %d\n", cpuid);
1762         mask = flowtable_mask_pcpu(ft, cpuid);
1763         tmpmask = ft->ft_tmpmask;
1764         memcpy(tmpmask, mask, ft->ft_size/8);
1765         /*
1766          * XXX Note to self, bit_ffs operates at the byte level
1767          * and thus adds gratuitous overhead
1768          */
1769         bit_ffs(tmpmask, ft->ft_size, &curbit);
1770         while (curbit != -1) {
1771                 if (curbit >= ft->ft_size || curbit < -1) {
1772                         db_printf("warning: bad curbit value %d \n",
1773                             curbit);
1774                         break;
1775                 }
1776
1777                 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1778                 fle = *flehead;
1779
1780                 while (fle != NULL) {   
1781                         flow_show(ft, fle);
1782                         fle = fle->f_next;
1783                         continue;
1784                 }
1785                 bit_clear(tmpmask, curbit);
1786                 bit_ffs(tmpmask, ft->ft_size, &curbit);
1787         }
1788 }
1789
1790 static void
1791 flowtable_show_vnet(void)
1792 {
1793         struct flowtable *ft;
1794         int i;
1795
1796         ft = V_flow_list_head;
1797         while (ft != NULL) {
1798                 printf("name: %s\n", ft->ft_name);
1799                 if (ft->ft_flags & FL_PCPU) {
1800                         for (i = 0; i <= mp_maxid; i++) {
1801                                 if (CPU_ABSENT(i))
1802                                         continue;
1803                                 flowtable_show(ft, i);
1804                         }
1805                 } else {
1806                         flowtable_show(ft, -1);
1807                 }
1808                 ft = ft->ft_next;
1809         }
1810 }
1811
/*
 * DDB "show flowtables" command: iterate over every virtual network
 * stack instance and dump its flowtables.  CURVNET_SET/RESTORE bracket
 * each iteration so the V_ accessors in flowtable_show_vnet() resolve
 * against the right vnet.
 */
DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		flowtable_show_vnet();
		CURVNET_RESTORE();
	}
}
1822 #endif