[FreeBSD/stable/9.git] / sys / net / flowtable.c
Merge r238990 (manually resolving absence of r237263):
1 /**************************************************************************
2
3 Copyright (c) 2008-2010, BitGravity Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11
12  2. Neither the name of the BitGravity Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include "opt_route.h"
31 #include "opt_mpath.h"
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38
39 #include <sys/param.h>  
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/kernel.h>  
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/socket.h>
54 #include <sys/syslog.h>
55 #include <sys/sysctl.h>
56
57 #include <net/if.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_var.h>
60 #include <net/route.h> 
61 #include <net/flowtable.h>
62 #include <net/vnet.h>
63
64 #include <netinet/in.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
69 #ifdef INET6
70 #include <netinet/ip6.h>
71 #endif
72 #include <netinet/tcp.h>
73 #include <netinet/udp.h>
74 #include <netinet/sctp.h>
75
76 #include <libkern/jenkins.h>
77 #include <ddb/ddb.h>
78
79 struct ipv4_tuple {
80         uint16_t        ip_sport;       /* source port */
81         uint16_t        ip_dport;       /* destination port */
82         in_addr_t       ip_saddr;       /* source address */
83         in_addr_t       ip_daddr;       /* destination address */
84 };
85
86 union ipv4_flow {
87         struct ipv4_tuple ipf_ipt;
88         uint32_t        ipf_key[3];
89 };
90
91 struct ipv6_tuple {
92         uint16_t        ip_sport;       /* source port */
93         uint16_t        ip_dport;       /* destination port */
94         struct in6_addr ip_saddr;       /* source address */
95         struct in6_addr ip_daddr;       /* destination address */
96 };
97
98 union ipv6_flow {
99         struct ipv6_tuple ipf_ipt;
100         uint32_t        ipf_key[9];
101 };
102
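/*
 * A cached flow.  struct flentry is the protocol-independent part;
 * flentry_v4/flentry_v6 append the IPv4/IPv6 key (addresses plus, for
 * FL_HASH_ALL tables, the ports) that the forward hash was computed
 * over.  Each entry caches the rtentry and llentry resolved for the
 * flow and records the uptime of its last use for staleness checks.
 */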
103 struct flentry {
104         volatile uint32_t       f_fhash;        /* hash flowing forward */
105         uint16_t                f_flags;        /* flow flags */
106         uint8_t                 f_pad;          
107         uint8_t                 f_proto;        /* protocol */
108         uint32_t                f_fibnum;       /* fib index */
109         uint32_t                f_uptime;       /* uptime at last access */
110         struct flentry          *f_next;        /* pointer to collision entry */
111         volatile struct rtentry *f_rt;          /* rtentry for flow */
112         volatile struct llentry *f_lle;         /* llentry for flow */
113 };
114
115 struct flentry_v4 {
116         struct flentry  fl_entry;
117         union ipv4_flow fl_flow;
118 };
119
120 struct flentry_v6 {
121         struct flentry  fl_entry;
122         union ipv6_flow fl_flow;
123 };
124
125 #define fl_fhash        fl_entry.f_fhash
126 #define fl_flags        fl_entry.f_flags
127 #define fl_proto        fl_entry.f_proto
128 #define fl_uptime       fl_entry.f_uptime
129 #define fl_rt           fl_entry.f_rt
130 #define fl_lle          fl_entry.f_lle
131
132 #define SECS_PER_HOUR           3600
133 #define SECS_PER_DAY            (24*SECS_PER_HOUR)
134
135 #define SYN_IDLE                300
136 #define UDP_IDLE                300
137 #define FIN_WAIT_IDLE           600
138 #define TCP_IDLE                SECS_PER_DAY
139
140
141 typedef void fl_lock_t(struct flowtable *, uint32_t);
142 typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
143
144 union flentryp {
145         struct flentry          **global;
146         struct flentry          **pcpu[MAXCPU];
147 };
148
149 struct flowtable_stats {
150         uint64_t        ft_collisions;
151         uint64_t        ft_allocated;
152         uint64_t        ft_misses;
153         uint64_t        ft_max_depth;
154         uint64_t        ft_free_checks;
155         uint64_t        ft_frees;
156         uint64_t        ft_hits;
157         uint64_t        ft_lookups;
158 } __aligned(CACHE_LINE_SIZE);
159
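/*
 * The table proper: per-cpu statistics, either one global bucket array
 * protected by a small pool of mutexes or a bucket array per CPU
 * protected by critical sections, a per-cpu bitmask of occupied buckets
 * for the cleaner, and the idle timeouts consulted by flow_stale().
 * Tables are chained through ft_next so the cleaner can walk all of them.
 */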
160 struct flowtable {
161         struct  flowtable_stats ft_stats[MAXCPU];
162         int             ft_size;
163         int             ft_lock_count;
164         uint32_t        ft_flags;
165         char            *ft_name;
166         fl_lock_t       *ft_lock;
167         fl_lock_t       *ft_unlock;
168         fl_rtalloc_t    *ft_rtalloc;
169         /*
170          * XXX need to pad out 
171          */ 
172         struct mtx      *ft_locks;
173         union flentryp  ft_table;
174         bitstr_t        *ft_masks[MAXCPU];
175         bitstr_t        *ft_tmpmask;
176         struct flowtable *ft_next;
177
178         uint32_t        ft_count __aligned(CACHE_LINE_SIZE);
179         uint32_t        ft_udp_idle __aligned(CACHE_LINE_SIZE);
180         uint32_t        ft_fin_wait_idle;
181         uint32_t        ft_syn_idle;
182         uint32_t        ft_tcp_idle;
183         boolean_t       ft_full;
184 } __aligned(CACHE_LINE_SIZE);
185
186 static struct proc *flowcleanerproc;
187 static VNET_DEFINE(struct flowtable *, flow_list_head);
188 static VNET_DEFINE(uint32_t, flow_hashjitter);
189 static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
190 static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
191
192 #define V_flow_list_head        VNET(flow_list_head)
193 #define V_flow_hashjitter       VNET(flow_hashjitter)
194 #define V_flow_ipv4_zone        VNET(flow_ipv4_zone)
195 #define V_flow_ipv6_zone        VNET(flow_ipv6_zone)
196
197
198 static struct cv        flowclean_f_cv;
199 static struct cv        flowclean_c_cv;
200 static struct mtx       flowclean_lock;
201 static uint32_t         flowclean_cycles;
202 static uint32_t         flowclean_freq;
203
204 #ifdef FLOWTABLE_DEBUG
205 #define FLDPRINTF(ft, flags, fmt, ...)          \
206 do {                                            \
207         if ((ft)->ft_flags & (flags))           \
208                 printf((fmt), __VA_ARGS__);     \
209 } while (0)
210
211 #else
212 #define FLDPRINTF(ft, flags, fmt, ...)
213
214 #endif
215
216
217 /*
218  * TODO:
219  * - Make flowtable stats per-cpu, aggregated at sysctl call time,
220  *   to avoid extra cache evictions caused by incrementing a shared
221  *   counter
222  * - add sysctls to resize && flush flow tables 
223  * - Add per flowtable sysctls for statistics and configuring timeouts
224  * - add saturation counter to rtentry to support per-packet load-balancing
225  *   add flag to indicate round-robin flow, add list lookup from head
226  *   for flows
227  * - add sysctl / device node / syscall to support exporting and importing
228  *   of flows with flag to indicate that a flow was imported so should
229  *   not be considered for auto-cleaning
230  * - support explicit connection state (currently only ad-hoc for DSR)
231  * - idetach() cleanup for options VIMAGE builds.
232  */
233 VNET_DEFINE(int, flowtable_enable) = 1;
234 static VNET_DEFINE(int, flowtable_debug);
235 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
236 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
237 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
238 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
239 static VNET_DEFINE(int, flowtable_nmbflows);
240 static VNET_DEFINE(int, flowtable_ready) = 0;
241
242 #define V_flowtable_enable              VNET(flowtable_enable)
243 #define V_flowtable_debug               VNET(flowtable_debug)
244 #define V_flowtable_syn_expire          VNET(flowtable_syn_expire)
245 #define V_flowtable_udp_expire          VNET(flowtable_udp_expire)
246 #define V_flowtable_fin_wait_expire     VNET(flowtable_fin_wait_expire)
247 #define V_flowtable_tcp_expire          VNET(flowtable_tcp_expire)
248 #define V_flowtable_nmbflows            VNET(flowtable_nmbflows)
249 #define V_flowtable_ready               VNET(flowtable_ready)
250
251 SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
252 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
253     &VNET_NAME(flowtable_debug), 0, "print debug info.");
254 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
255     &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
256
257 /*
258  * XXX This does not end up updating timeouts at runtime
259  * and only reflects the value for the last table added :-/
260  */
261 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
262     &VNET_NAME(flowtable_syn_expire), 0,
263     "seconds after which to remove syn allocated flow.");
264 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
265     &VNET_NAME(flowtable_udp_expire), 0,
266     "seconds after which to remove flow allocated to UDP.");
267 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
268     &VNET_NAME(flowtable_fin_wait_expire), 0,
269     "seconds after which to remove a flow in FIN_WAIT.");
270 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
271     &VNET_NAME(flowtable_tcp_expire), 0,
272     "seconds after which to remove flow allocated to a TCP connection.");
273
274
275 /*
276  * Maximum number of flows that can be allocated of a given type.
277  *
278  * The table is allocated at boot time (for the pure caching case
279  * there is no reason why this could not be changed at runtime)
280  * and thus (currently) needs to be set with a tunable.
281  */
282 static int
283 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
284 {
285         int error, newnmbflows;
286
287         newnmbflows = V_flowtable_nmbflows;
288         error = sysctl_handle_int(oidp, &newnmbflows, 0, req); 
289         if (error == 0 && req->newptr) {
290                 if (newnmbflows > V_flowtable_nmbflows) {
291                         V_flowtable_nmbflows = newnmbflows;
292                         uma_zone_set_max(V_flow_ipv4_zone,
293                             V_flowtable_nmbflows);
294                         uma_zone_set_max(V_flow_ipv6_zone,
295                             V_flowtable_nmbflows);
296                 } else
297                         error = EINVAL;
298         }
299         return (error);
300 }
301 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
302     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
303     "Maximum number of flows allowed");
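/*
 * Illustrative usage note: the handler above only accepts increases, so
 * e.g. "sysctl net.inet.flowtable.nmbflows=262144" (value hypothetical)
 * raises the limit on both UMA zones at runtime, while attempts to
 * lower it return EINVAL.
 */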
304
305
306
307 #define FS_PRINT(sb, field)     sbuf_printf((sb), "\t%s: %ju\n", #field, (uintmax_t)fs->ft_##field)
308
309 static void
310 fs_print(struct sbuf *sb, struct flowtable_stats *fs)
311 {
312
313         FS_PRINT(sb, collisions);
314         FS_PRINT(sb, allocated);
315         FS_PRINT(sb, misses);
316         FS_PRINT(sb, max_depth);
317         FS_PRINT(sb, free_checks);
318         FS_PRINT(sb, frees);
319         FS_PRINT(sb, hits);
320         FS_PRINT(sb, lookups);
321 }
322
323 static void
324 flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
325 {
326         int i;
327         struct flowtable_stats fs, *pfs;
328
329         if (ft->ft_flags & FL_PCPU) {
330                 bzero(&fs, sizeof(fs));
331                 pfs = &fs;
332                 CPU_FOREACH(i) {
333                         pfs->ft_collisions  += ft->ft_stats[i].ft_collisions;
334                         pfs->ft_allocated   += ft->ft_stats[i].ft_allocated;
335                         pfs->ft_misses      += ft->ft_stats[i].ft_misses;
336                         pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
337                         pfs->ft_frees       += ft->ft_stats[i].ft_frees;
338                         pfs->ft_hits        += ft->ft_stats[i].ft_hits;
339                         pfs->ft_lookups     += ft->ft_stats[i].ft_lookups;
340                         if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
341                                 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
342                 }
343         } else {
344                 pfs = &ft->ft_stats[0];
345         }
346         fs_print(sb, pfs);
347 }
348
349 static int
350 sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
351 {
352         struct flowtable *ft;
353         struct sbuf *sb;
354         int error;
355
356         sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
357
358         ft = V_flow_list_head;
359         while (ft != NULL) {
360                 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
361                 flowtable_show_stats(sb, ft);
362                 ft = ft->ft_next;
363         }
364         sbuf_finish(sb);
365         error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
366         sbuf_delete(sb);
367
368         return (error);
369 }
370 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
371     NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
372
373
374 #ifndef RADIX_MPATH
375 static void
376 rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
377 {
378
379         rtalloc_ign_fib(ro, 0, fibnum);
380 }
381 #endif
382
383 static void
384 flowtable_global_lock(struct flowtable *table, uint32_t hash)
385 {       
386         int lock_index = (hash)&(table->ft_lock_count - 1);
387
388         mtx_lock(&table->ft_locks[lock_index]);
389 }
390
391 static void
392 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
393 {       
394         int lock_index = (hash)&(table->ft_lock_count - 1);
395
396         mtx_unlock(&table->ft_locks[lock_index]);
397 }
398
399 static void
400 flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
401 {
402
403         critical_enter();
404 }
405
406 static void
407 flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
408 {
409
410         critical_exit();
411 }
412
413 #define FL_ENTRY_INDEX(table, hash)     ((hash) % (table)->ft_size)
414 #define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
415 #define FL_ENTRY_LOCK(table, hash)  (table)->ft_lock((table), (hash))
416 #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
417
418 #define FL_STALE        (1<<8)
419 #define FL_OVERWRITE    (1<<10)
420
421 void
422 flow_invalidate(struct flentry *fle)
423 {
424
425         fle->f_flags |= FL_STALE;
426 }
427
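/*
 * Map an IP protocol number to the FL_* flow-type flag kept in f_flags,
 * and back again.  Only TCP, UDP and SCTP are tracked; anything else
 * maps to zero.
 */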
428 static __inline int
429 proto_to_flags(uint8_t proto)
430 {
431         int flag;
432
433         switch (proto) {
434         case IPPROTO_TCP:
435                 flag = FL_TCP;
436                 break;
437         case IPPROTO_SCTP:
438                 flag = FL_SCTP;
439                 break;          
440         case IPPROTO_UDP:
441                 flag = FL_UDP;
442                 break;
443         default:
444                 flag = 0;
445                 break;
446         }
447
448         return (flag);
449 }
450
451 static __inline int
452 flags_to_proto(int flags)
453 {
454         int proto, protoflags;
455
456         protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
457         switch (protoflags) {
458         case FL_TCP:
459                 proto = IPPROTO_TCP;
460                 break;
461         case FL_SCTP:
462                 proto = IPPROTO_SCTP;
463                 break;
464         case FL_UDP:
465                 proto = IPPROTO_UDP;
466                 break;
467         default:
468                 proto = 0;
469                 break;
470         }
471         return (proto);
472 }
473
474 #ifdef INET
475 #ifdef FLOWTABLE_DEBUG
476 static void
477 ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
478     struct sockaddr_in *dsin)
479 {
480         char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
481
482         if (flags & FL_HASH_ALL) {
483                 inet_ntoa_r(ssin->sin_addr, saddr);
484                 inet_ntoa_r(dsin->sin_addr, daddr);
485                 printf("proto=%d %s:%d->%s:%d\n",
486                     proto, saddr, ntohs(ssin->sin_port), daddr,
487                     ntohs(dsin->sin_port));
488         } else {
489                 inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
490                 printf("proto=%d %s\n", proto, daddr);
491         }
492
493 }
494 #endif
495
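/*
 * Pull the IPv4 addresses (and, for FL_HASH_ALL tables, the TCP/UDP/SCTP
 * ports) out of the mbuf into the caller's sockaddrs.  A TCP segment
 * carrying RST or FIN marks the flow FL_STALE; protocols without ports
 * return ENOTSUP so the packet is not cached.
 */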
496 static int
497 ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
498     struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
499 {
500         struct ip *ip;
501         uint8_t proto;
502         int iphlen;
503         struct tcphdr *th;
504         struct udphdr *uh;
505         struct sctphdr *sh;
506         uint16_t sport, dport;
507
508         proto = sport = dport = 0;
509         ip = mtod(m, struct ip *);
510         dsin->sin_family = AF_INET;
511         dsin->sin_len = sizeof(*dsin);
512         dsin->sin_addr = ip->ip_dst;
513         ssin->sin_family = AF_INET;
514         ssin->sin_len = sizeof(*ssin);
515         ssin->sin_addr = ip->ip_src;    
516
517         proto = ip->ip_p;
518         if ((*flags & FL_HASH_ALL) == 0) {
519                 FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
520                     *flags);
521                 goto skipports;
522         }
523
524         iphlen = ip->ip_hl << 2; /* XXX options? */
525
526         switch (proto) {
527         case IPPROTO_TCP:
528                 th = (struct tcphdr *)((caddr_t)ip + iphlen);
529                 sport = th->th_sport;
530                 dport = th->th_dport;
531                 if ((*flags & FL_HASH_ALL) &&
532                     (th->th_flags & (TH_RST|TH_FIN)))
533                         *flags |= FL_STALE;
534         break;
535         case IPPROTO_UDP:
536                 uh = (struct udphdr *)((caddr_t)ip + iphlen);
537                 sport = uh->uh_sport;
538                 dport = uh->uh_dport;
539         break;
540         case IPPROTO_SCTP:
541                 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
542                 sport = sh->src_port;
543                 dport = sh->dest_port;
544         break;
545         default:
546                 /* no port - hence not a protocol we care about */
547                 FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
548                 return (ENOTSUP);
549         }
552
553 skipports:
554         *flags |= proto_to_flags(proto);
555         ssin->sin_port = sport;
556         dsin->sin_port = dport;
557         return (0);
558 }
559
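/*
 * Build the 3-word IPv4 hash key: key[0] carries the source/destination
 * ports (FL_HASH_ALL only), key[1] the source address and key[2] the
 * destination address, then feed it to jenkins_hashword().  For
 * destination-only tables the per-vnet jitter plus the protocol is used
 * as the hash offset instead of the ports.
 */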
560 static uint32_t
561 ipv4_flow_lookup_hash_internal(
562         struct sockaddr_in *ssin, struct sockaddr_in *dsin, 
563             uint32_t *key, uint16_t flags)
564 {
565         uint16_t sport, dport;
566         uint8_t proto;
567         int offset = 0;
568
569         if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
570                 return (0);
571         proto = flags_to_proto(flags);
572         sport = dport = key[2] = key[1] = key[0] = 0;
573         if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
574                 key[1] = ssin->sin_addr.s_addr;
575                 sport = ssin->sin_port;
576         }
577         if (dsin != NULL) {
578                 key[2] = dsin->sin_addr.s_addr;
579                 dport = dsin->sin_port;
580         }
581         if (flags & FL_HASH_ALL) {
582                 ((uint16_t *)key)[0] = sport;
583                 ((uint16_t *)key)[1] = dport; 
584         } else
585                 offset = V_flow_hashjitter + proto;
586
587         return (jenkins_hashword(key, 3, offset));
588 }
589
590 static struct flentry *
591 flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
592 {
593         struct sockaddr_storage ssa, dsa;
594         uint16_t flags;
595         struct sockaddr_in *dsin, *ssin;
596
597         dsin = (struct sockaddr_in *)&dsa;
598         ssin = (struct sockaddr_in *)&ssa;
599         bzero(dsin, sizeof(*dsin));
600         bzero(ssin, sizeof(*ssin));
601         flags = ft->ft_flags;
602         if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
603                 return (NULL);
604
605         return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
606 }
607
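/*
 * Reconstruct a struct route from a cached flow entry: the destination
 * comes from the stored hash key and the cached rtentry/llentry are
 * handed back with RT_NORTREF set, so the reference stays with the flow
 * entry and the caller must not release it.
 */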
608 void
609 flow_to_route(struct flentry *fle, struct route *ro)
610 {
611         uint32_t *hashkey = NULL;
612         struct sockaddr_in *sin;
613
614         sin = (struct sockaddr_in *)&ro->ro_dst;
615         sin->sin_family = AF_INET;
616         sin->sin_len = sizeof(*sin);
617         hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
618         sin->sin_addr.s_addr = hashkey[2];
619         ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
620         ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
621         ro->ro_flags |= RT_NORTREF;
622 }
623 #endif /* INET */
624
625 #ifdef INET6
626 /*
627  * PULLUP_TO(len, p, T) checks that len + sizeof(T) is already contiguous
628  * in the first mbuf (bailing out to receive_failed when it is not) and
629  * then sets p to point at offset "len".  WARNING: the pointer might
630  * become stale after other pullups (but we never use it this way).
631  */
632 #define PULLUP_TO(_len, p, T)                                           \
633 do {                                                                    \
634         int x = (_len) + sizeof(T);                                     \
635         if ((m)->m_len < x) {                                           \
636                 goto receive_failed;                                    \
637         }                                                               \
638         p = (mtod(m, char *) + (_len));                                 \
639 } while (0)
640
641 #define TCP(p)          ((struct tcphdr *)(p))
642 #define SCTP(p)         ((struct sctphdr *)(p))
643 #define UDP(p)          ((struct udphdr *)(p))
644
645 static int
646 ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
647     struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
648 {
649         struct ip6_hdr *ip6;
650         uint8_t proto;
651         int hlen;
652         uint16_t src_port, dst_port;
653         u_short offset;
654         void *ulp;
655
656         offset = hlen = src_port = dst_port = 0;
657         ulp = NULL;
658         ip6 = mtod(m, struct ip6_hdr *);
659         hlen = sizeof(struct ip6_hdr);
660         proto = ip6->ip6_nxt;
661
662         if ((*flags & FL_HASH_ALL) == 0)
663                 goto skipports;
664
665         while (ulp == NULL) {
666                 switch (proto) {
667                 case IPPROTO_ICMPV6:
668                 case IPPROTO_OSPFIGP:
669                 case IPPROTO_PIM:
670                 case IPPROTO_CARP:
671                 case IPPROTO_ESP:
672                 case IPPROTO_NONE:
673                         ulp = ip6;
674                         break;
675                 case IPPROTO_TCP:
676                         PULLUP_TO(hlen, ulp, struct tcphdr);
677                         dst_port = TCP(ulp)->th_dport;
678                         src_port = TCP(ulp)->th_sport;
679                         if ((*flags & FL_HASH_ALL) &&
680                             (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
681                                 *flags |= FL_STALE;
682                         break;
683                 case IPPROTO_SCTP:
684                         PULLUP_TO(hlen, ulp, struct sctphdr);
685                         src_port = SCTP(ulp)->src_port;
686                         dst_port = SCTP(ulp)->dest_port;
687                         break;
688                 case IPPROTO_UDP:
689                         PULLUP_TO(hlen, ulp, struct udphdr);
690                         dst_port = UDP(ulp)->uh_dport;
691                         src_port = UDP(ulp)->uh_sport;
692                         break;
693                 case IPPROTO_HOPOPTS:   /* RFC 2460 */
694                         PULLUP_TO(hlen, ulp, struct ip6_hbh);
695                         hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
696                         proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
697                         ulp = NULL;
698                         break;
699                 case IPPROTO_ROUTING:   /* RFC 2460 */
700                         PULLUP_TO(hlen, ulp, struct ip6_rthdr); 
701                         hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
702                         proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
703                         ulp = NULL;
704                         break;
705                 case IPPROTO_FRAGMENT:  /* RFC 2460 */
706                         PULLUP_TO(hlen, ulp, struct ip6_frag);
707                         hlen += sizeof (struct ip6_frag);
708                         proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
709                         offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
710                             IP6F_OFF_MASK;
711                         ulp = NULL;
712                         break;
713                 case IPPROTO_DSTOPTS:   /* RFC 2460 */
714                         PULLUP_TO(hlen, ulp, struct ip6_hbh);
715                         hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
716                         proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
717                         ulp = NULL;
718                         break;
719                 case IPPROTO_AH:        /* RFC 2402 */
720                         PULLUP_TO(hlen, ulp, struct ip6_ext);
721                         hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
722                         proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
723                         ulp = NULL;
724                         break;
725                 default:
726                         PULLUP_TO(hlen, ulp, struct ip6_ext);
727                         break;
728                 }
729         }
730
731         if (src_port == 0) {
732         receive_failed:
733                 return (ENOTSUP);
734         }
735
736 skipports:
737         dsin6->sin6_family = AF_INET6;
738         dsin6->sin6_len = sizeof(*dsin6);
739         dsin6->sin6_port = dst_port;
740         memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
741
742         ssin6->sin6_family = AF_INET6;
743         ssin6->sin6_len = sizeof(*ssin6);
744         ssin6->sin6_port = src_port;
745         memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
746         *flags |= proto_to_flags(proto);
747
748         return (0);
749 }
750
751 #define zero_key(key)           \
752 do {                            \
753         key[0] = 0;             \
754         key[1] = 0;             \
755         key[2] = 0;             \
756         key[3] = 0;             \
757         key[4] = 0;             \
758         key[5] = 0;             \
759         key[6] = 0;             \
760         key[7] = 0;             \
761         key[8] = 0;             \
762 } while (0)
763         
764 static uint32_t
765 ipv6_flow_lookup_hash_internal(
766         struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, 
767             uint32_t *key, uint16_t flags)
768 {
769         uint16_t sport, dport;
770         uint8_t proto;
771         int offset = 0;
772
773         if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
774                 return (0);
775
776         proto = flags_to_proto(flags);
777         zero_key(key);
778         sport = dport = 0;
779         if (dsin6 != NULL) {
780                 memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
781                 dport = dsin6->sin6_port;
782         }
783         if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
784                 memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
785                 sport = ssin6->sin6_port;
786         }
787         if (flags & FL_HASH_ALL) {
788                 ((uint16_t *)key)[0] = sport;
789                 ((uint16_t *)key)[1] = dport; 
790         } else
791                 offset = V_flow_hashjitter + proto;
792
793         return (jenkins_hashword(key, 9, offset));
794 }
795
796 static struct flentry *
797 flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
798 {
799         struct sockaddr_storage ssa, dsa;
800         struct sockaddr_in6 *dsin6, *ssin6;     
801         uint16_t flags;
802
803         dsin6 = (struct sockaddr_in6 *)&dsa;
804         ssin6 = (struct sockaddr_in6 *)&ssa;
805         bzero(dsin6, sizeof(*dsin6));
806         bzero(ssin6, sizeof(*ssin6));
807         flags = ft->ft_flags;
808         
809         if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
810                 return (NULL);
811
812         return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
813 }
814
815 void
816 flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
817 {
818         uint32_t *hashkey = NULL;
819         struct sockaddr_in6 *sin6;
820
821         sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
822
823         sin6->sin6_family = AF_INET6;
824         sin6->sin6_len = sizeof(*sin6);
825         hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
826         memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
827         ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
828         ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
829         ro->ro_flags |= RT_NORTREF;
830 }
831 #endif /* INET6 */
832
833 static bitstr_t *
834 flowtable_mask(struct flowtable *ft)
835 {
836         bitstr_t *mask;
837
838         if (ft->ft_flags & FL_PCPU)
839                 mask = ft->ft_masks[curcpu];
840         else
841                 mask = ft->ft_masks[0];
842
843         return (mask);
844 }
845
846 static struct flentry **
847 flowtable_entry(struct flowtable *ft, uint32_t hash)
848 {
849         struct flentry **fle;
850         int index = (hash % ft->ft_size);
851
852         if (ft->ft_flags & FL_PCPU) {
853                 KASSERT(ft->ft_table.pcpu[curcpu] != NULL, ("pcpu not set"));
854                 fle = &ft->ft_table.pcpu[curcpu][index];
855         } else {
856                 KASSERT(ft->ft_table.global != NULL, ("global not set"));
857                 fle = &ft->ft_table.global[index];
858         }
859         
860         return (fle);
861 }
862
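/*
 * A flow is stale when it was never fully initialized (hash of 0), its
 * cached route is down or has lost its interface, it was explicitly
 * marked FL_STALE, or it has sat idle longer than the timeout selected
 * by the TCP flags recorded in f_flags (no-flags/UDP, FIN_WAIT,
 * half-open SYN, or established TCP).
 */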
863 static int
864 flow_stale(struct flowtable *ft, struct flentry *fle)
865 {
866         time_t idle_time;
867
868         if ((fle->f_fhash == 0)
869             || ((fle->f_rt->rt_flags & RTF_HOST) &&
870                 ((fle->f_rt->rt_flags & (RTF_UP))
871                     != (RTF_UP)))
872             || (fle->f_rt->rt_ifp == NULL)
873             || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
874                 return (1);
875
876         idle_time = time_uptime - fle->f_uptime;
877
878         if ((fle->f_flags & FL_STALE) ||
879             ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
880                 && (idle_time > ft->ft_udp_idle)) ||
881             ((fle->f_flags & TH_FIN)
882                 && (idle_time > ft->ft_fin_wait_idle)) ||
883             ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
884                 && (idle_time > ft->ft_syn_idle)) ||
885             ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
886                 && (idle_time > ft->ft_tcp_idle)) ||
887             ((fle->f_rt->rt_flags & RTF_UP) == 0 || 
888                 (fle->f_rt->rt_ifp == NULL)))
889                 return (1);
890
891         return (0);
892 }
893
894 static void
895 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
896 {
897         uint32_t *hashkey;
898         int i, nwords;
899
900         if (fle->f_flags & FL_IPV6) {
901                 nwords = 9;
902                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
903         } else {
904                 nwords = 3;
905                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
906         }
907         
908         for (i = 0; i < nwords; i++) 
909                 hashkey[i] = key[i];
910 }
911
912 static struct flentry *
913 flow_alloc(struct flowtable *ft)
914 {
915         struct flentry *newfle;
916         uma_zone_t zone;
917
918         newfle = NULL;
919         zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
920
921         newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
922         if (newfle != NULL)
923                 atomic_add_int(&ft->ft_count, 1);
924         return (newfle);
925 }
926
927 static void
928 flow_free(struct flentry *fle, struct flowtable *ft)
929 {
930         uma_zone_t zone;
931
932         zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
933         atomic_add_int(&ft->ft_count, -1);
934         uma_zfree(zone, fle);
935 }
936
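/*
 * Hysteresis check against the V_flowtable_nmbflows limit: the table is
 * flagged full above roughly 31/32 of the limit and not-full again once
 * it drops below roughly 7/8 of it.  A state change also retunes
 * flowclean_freq and, for destination-only tables, the idle timeouts,
 * and may wake the cleaner thread.
 */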
937 static int
938 flow_full(struct flowtable *ft)
939 {
940         boolean_t full;
941         uint32_t count;
942         
943         full = ft->ft_full;
944         count = ft->ft_count;
945
946         if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
947                 ft->ft_full = FALSE;
948         else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
949                 ft->ft_full = TRUE;
950         
951         if (full && !ft->ft_full) {
952                 flowclean_freq = 4*hz;
953                 if ((ft->ft_flags & FL_HASH_ALL) == 0)
954                         ft->ft_udp_idle = ft->ft_fin_wait_idle =
955                             ft->ft_syn_idle = ft->ft_tcp_idle = 5;
956                 cv_broadcast(&flowclean_c_cv);
957         } else if (!full && ft->ft_full) {
958                 flowclean_freq = 20*hz;
959                 if ((ft->ft_flags & FL_HASH_ALL) == 0)
960                         ft->ft_udp_idle = ft->ft_fin_wait_idle =
961                             ft->ft_syn_idle = ft->ft_tcp_idle = 30;
962         }
963
964         return (ft->ft_full);
965 }
966
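/*
 * Link a freshly resolved flow into its hash bucket: the new entry
 * either becomes the bucket head (setting the bucket's bit in the
 * occupancy mask) or is appended to the collision chain.  If a matching,
 * non-stale entry is found first the new allocation is freed and EEXIST
 * returned, unless FL_OVERWRITE was requested, in which case the
 * existing entry is refreshed instead.
 */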
967 static int
968 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
969     uint32_t fibnum, struct route *ro, uint16_t flags)
970 {
971         struct flentry *fle, *fletail, *newfle, **flep;
972         struct flowtable_stats *fs = &ft->ft_stats[curcpu];
973         int depth;
974         bitstr_t *mask;
975         uint8_t proto;
976
977         newfle = flow_alloc(ft);
978         if (newfle == NULL)
979                 return (ENOMEM);
980
981         newfle->f_flags |= (flags & FL_IPV6);
982         proto = flags_to_proto(flags);
983
984         FL_ENTRY_LOCK(ft, hash);
985         mask = flowtable_mask(ft);
986         flep = flowtable_entry(ft, hash);
987         fletail = fle = *flep;
988
989         if (fle == NULL) {
990                 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
991                 *flep = fle = newfle;
992                 goto skip;
993         } 
994         
995         depth = 0;
996         fs->ft_collisions++;
997         /*
998          * find end of list and make sure that we were not
999          * preempted by another thread handling this flow
1000          */
1001         while (fle != NULL) {
1002                 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
1003                         /*
1004                          * there was either a hash collision
1005                          * or we lost a race to insert
1006                          */
1007                         FL_ENTRY_UNLOCK(ft, hash);
1008                         flow_free(newfle, ft);
1009                         
1010                         if (flags & FL_OVERWRITE) 
1011                                 goto skip;
1012                         return (EEXIST);
1013                 }
1014                 /*
1015                  * re-visit this double condition XXX
1016                  */
1017                 if (fletail->f_next != NULL)
1018                         fletail = fle->f_next;
1019
1020                 depth++;
1021                 fle = fle->f_next;
1022         } 
1023
1024         if (depth > fs->ft_max_depth)
1025                 fs->ft_max_depth = depth;
1026         fletail->f_next = newfle;
1027         fle = newfle;
1028 skip:
1029         flowtable_set_hashkey(fle, key);
1030
1031         fle->f_proto = proto;
1032         fle->f_rt = ro->ro_rt;
1033         fle->f_lle = ro->ro_lle;
1034         fle->f_fhash = hash;
1035         fle->f_fibnum = fibnum;
1036         fle->f_uptime = time_uptime;
1037         FL_ENTRY_UNLOCK(ft, hash);
1038         return (0);
1039 }
1040
1041 int
1042 kern_flowtable_insert(struct flowtable *ft,
1043     struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1044     struct route *ro, uint32_t fibnum, int flags)
1045 {
1046         uint32_t key[9], hash;
1047
1048         flags = (ft->ft_flags | flags | FL_OVERWRITE);
1049         hash = 0;
1050
1051 #ifdef INET
1052         if (ssa->ss_family == AF_INET) 
1053                 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1054                     (struct sockaddr_in *)dsa, key, flags);
1055 #endif
1056 #ifdef INET6
1057         if (ssa->ss_family == AF_INET6) 
1058                 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1059                     (struct sockaddr_in6 *)dsa, key, flags);
1060 #endif  
1061         if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1062                 return (EINVAL);
1063
1064         FLDPRINTF(ft, FL_DEBUG,
1065             "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1066             key[0], key[1], key[2], hash, fibnum, flags);
1067         return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
1068 }
1069
1070 static int
1071 flowtable_key_equal(struct flentry *fle, uint32_t *key)
1072 {
1073         uint32_t *hashkey;
1074         int i, nwords;
1075
1076         if (fle->f_flags & FL_IPV6) {
1077                 nwords = 9;
1078                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1079         } else {
1080                 nwords = 3;
1081                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1082         }
1083
1084         for (i = 0; i < nwords; i++) 
1085                 if (hashkey[i] != key[i])
1086                         return (0);
1087
1088         return (1);
1089 }
1090
1091 struct flentry *
1092 flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
1093 {
1094         struct flentry *fle = NULL;
1095
1096 #ifdef INET
1097         if (af == AF_INET)
1098                 fle = flowtable_lookup_mbuf4(ft, m);
1099 #endif
1100 #ifdef INET6
1101         if (af == AF_INET6)
1102                 fle = flowtable_lookup_mbuf6(ft, m);
1103 #endif  
1104         if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
1105                 m->m_flags |= M_FLOWID;
1106                 m->m_pkthdr.flowid = fle->f_fhash;
1107         }
1108         return (fle);
1109 }
1110         
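/*
 * Core lookup: hash the sockaddr pair, walk the bucket's collision chain
 * under the bucket lock, and return a hit only if the key, fib, protocol,
 * route and L2 entry all still check out.  On a miss (unless FL_NOAUTO is
 * set or the table is full) the route and llentry are resolved here and a
 * new entry is inserted.
 */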
1111 struct flentry *
1112 flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
1113     struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
1114 {
1115         uint32_t key[9], hash;
1116         struct flentry *fle;
1117         struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1118         uint8_t proto = 0;
1119         int error = 0;
1120         struct rtentry *rt;
1121         struct llentry *lle;
1122         struct route sro, *ro;
1123         struct route_in6 sro6;
1124
1125         sro.ro_rt = sro6.ro_rt = NULL;
1126         sro.ro_lle = sro6.ro_lle = NULL;
1127         ro = NULL;
1128         hash = 0;
1129         flags |= ft->ft_flags;
1130         proto = flags_to_proto(flags);
1131 #ifdef INET
1132         if (ssa->ss_family == AF_INET) {
1133                 struct sockaddr_in *ssin, *dsin;
1134
1135                 ro = &sro;
1136                 memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
1137                 /*
1138                  * The harvested source and destination addresses
1139                  * may contain port information if the packet is 
1140                  * from a transport protocol (e.g. TCP/UDP). The 
1141                  * port field must be cleared before performing 
1142                  * a route lookup.
1143                  */
1144                 ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
1145                 dsin = (struct sockaddr_in *)dsa;
1146                 ssin = (struct sockaddr_in *)ssa;
1147                 if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
1148                     (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1149                     (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
1150                         return (NULL);
1151
1152                 hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
1153         }
1154 #endif
1155 #ifdef INET6
1156         if (ssa->ss_family == AF_INET6) {
1157                 struct sockaddr_in6 *ssin6, *dsin6;
1158
1159                 ro = (struct route *)&sro6;
1160                 memcpy(&sro6.ro_dst, dsa,
1161                     sizeof(struct sockaddr_in6));
1162                 ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
1163                 dsin6 = (struct sockaddr_in6 *)dsa;
1164                 ssin6 = (struct sockaddr_in6 *)ssa;
1165
1166                 flags |= FL_IPV6;
1167                 hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
1168         }
1169 #endif
1170         /*
1171          * Ports are zero and this isn't a transmit cache
1172          * - thus not a protocol for which we need to keep 
1173          * state
1174          * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
1175          */
1176         if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
1177                 return (NULL);
1178
1179         fs->ft_lookups++;
1180         FL_ENTRY_LOCK(ft, hash);
1181         if ((fle = FL_ENTRY(ft, hash)) == NULL) {
1182                 FL_ENTRY_UNLOCK(ft, hash);
1183                 goto uncached;
1184         }
1185 keycheck:       
1186         rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1187         lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1188         if ((rt != NULL)
1189             && lle != NULL
1190             && fle->f_fhash == hash
1191             && flowtable_key_equal(fle, key)
1192             && (proto == fle->f_proto)
1193             && (fibnum == fle->f_fibnum)
1194             && (rt->rt_flags & RTF_UP)
1195             && (rt->rt_ifp != NULL)
1196             && (lle->la_flags & LLE_VALID)) {
1197                 fs->ft_hits++;
1198                 fle->f_uptime = time_uptime;
1199                 fle->f_flags |= flags;
1200                 FL_ENTRY_UNLOCK(ft, hash);
1201                 return (fle);
1202         } else if (fle->f_next != NULL) {
1203                 fle = fle->f_next;
1204                 goto keycheck;
1205         }
1206         FL_ENTRY_UNLOCK(ft, hash);
1207 uncached:
1208         if (flags & FL_NOAUTO || flow_full(ft))
1209                 return (NULL);
1210
1211         fs->ft_misses++;
1212         /*
1213          * This bit of code ends up locking the
1214          * same route 3 times (just like ip_output + ether_output)
1215          * - at lookup
1216          * - in rt_check when called by arpresolve
1217          * - dropping the refcount for the rtentry
1218          *
1219          * This could be consolidated to one if we wrote a variant
1220          * of arpresolve with an rt_check variant that expected to
1221          * receive the route locked
1222          */
1223
1224 #ifdef INVARIANTS
1225         if ((ro->ro_dst.sa_family != AF_INET) &&
1226             (ro->ro_dst.sa_family != AF_INET6))
1227                 panic("sa_family == %d\n", ro->ro_dst.sa_family);
1228 #endif
1229
1230         ft->ft_rtalloc(ro, hash, fibnum);
1231         if (ro->ro_rt == NULL) 
1232                 error = ENETUNREACH;
1233         else {
1234                 struct llentry *lle = NULL;
1235                 struct sockaddr_storage *l3addr;
1236                 struct rtentry *rt = ro->ro_rt;
1237                 struct ifnet *ifp = rt->rt_ifp;
1238
1239                 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
1240                         RTFREE(rt);
1241                         ro->ro_rt = NULL;
1242                         return (NULL);
1243                 }
1244 #ifdef INET6
1245                 if (ssa->ss_family == AF_INET6) {
1246                         struct sockaddr_in6 *dsin6;
1247
1248                         dsin6 = (struct sockaddr_in6 *)dsa;                     
1249                         if (in6_localaddr(&dsin6->sin6_addr)) {
1250                                 RTFREE(rt);
1251                                 ro->ro_rt = NULL;
1252                                 return (NULL);                          
1253                         }
1254
1255                         if (rt->rt_flags & RTF_GATEWAY)
1256                                 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1257                         
1258                         else
1259                                 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1260                         lle = llentry_alloc(ifp, LLTABLE6(ifp), l3addr);
1261                 }
1262 #endif  
1263 #ifdef INET
1264                 if (ssa->ss_family == AF_INET) {
1265                         if (rt->rt_flags & RTF_GATEWAY)
1266                                 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1267                         else
1268                                 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1269                         lle = llentry_alloc(ifp, LLTABLE(ifp), l3addr); 
1270                 }
1271                         
1272 #endif
1273                 ro->ro_lle = lle;
1274
1275                 if (lle == NULL) {
1276                         RTFREE(rt);
1277                         ro->ro_rt = NULL;
1278                         return (NULL);
1279                 }
1280                 error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
1281
1282                 if (error) {
1283                         RTFREE(rt);
1284                         LLE_FREE(lle);
1285                         ro->ro_rt = NULL;
1286                         ro->ro_lle = NULL;
1287                 }
1288         } 
1289
1290         return ((error) ? NULL : fle);
1291 }
1292
1293 /*
1294  * used by the bit_alloc macro
1295  */
1296 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
1297         
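/*
 * Allocate a flow table with nentry buckets and link it onto the
 * cleaner's list.  FL_PCPU tables get a bucket array and occupancy mask
 * per CPU and use critical sections for locking; otherwise a single
 * global array is protected by a small pool of mutexes.  A consumer
 * would create one roughly like
 *
 *      ft = flowtable_alloc("ipv4", 2048, FL_PCPU | FL_HASH_ALL);
 *
 * (sketch only - the name, size and flags above are illustrative, not
 * the values any particular caller uses).
 */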
1298 struct flowtable *
1299 flowtable_alloc(char *name, int nentry, int flags)
1300 {
1301         struct flowtable *ft, *fttail;
1302         int i;
1303
1304         if (V_flow_hashjitter == 0)
1305                 V_flow_hashjitter = arc4random();
1306
1307         KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
1308
1309         ft = malloc(sizeof(struct flowtable),
1310             M_RTABLE, M_WAITOK | M_ZERO);
1311
1312         ft->ft_name = name;
1313         ft->ft_flags = flags;
1314         ft->ft_size = nentry;
1315 #ifdef RADIX_MPATH
1316         ft->ft_rtalloc = rtalloc_mpath_fib;
1317 #else
1318         ft->ft_rtalloc = rtalloc_ign_wrapper;
1319 #endif
1320         if (flags & FL_PCPU) {
1321                 ft->ft_lock = flowtable_pcpu_lock;
1322                 ft->ft_unlock = flowtable_pcpu_unlock;
1323
1324                 for (i = 0; i <= mp_maxid; i++) {
1325                         ft->ft_table.pcpu[i] =
1326                             malloc(nentry*sizeof(struct flentry *),
1327                                 M_RTABLE, M_WAITOK | M_ZERO);
1328                         ft->ft_masks[i] = bit_alloc(nentry);
1329                 }
1330         } else {
1331                 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
1332                     (fls(mp_maxid + 1) << 1));
1333                 
1334                 ft->ft_lock = flowtable_global_lock;
1335                 ft->ft_unlock = flowtable_global_unlock;
1336                 ft->ft_table.global =
1337                             malloc(nentry*sizeof(struct flentry *),
1338                                 M_RTABLE, M_WAITOK | M_ZERO);
1339                 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
1340                                 M_RTABLE, M_WAITOK | M_ZERO);
1341                 for (i = 0; i < ft->ft_lock_count; i++)
1342                         mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
1343
1344                 ft->ft_masks[0] = bit_alloc(nentry);
1345         }
1346         ft->ft_tmpmask = bit_alloc(nentry);
1347
1348         /*
1349          * In the local transmit case the table truly is just a cache -
1350          * so everything is eligible for replacement after brief non-use
1351          * (30 seconds here; flow_full() can tighten this to 5)
1352          */
1353         if (flags & FL_HASH_ALL) {
1354                 ft->ft_udp_idle = V_flowtable_udp_expire;
1355                 ft->ft_syn_idle = V_flowtable_syn_expire;
1356                 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
1357                 ft->ft_tcp_idle = V_flowtable_tcp_expire;
1358         } else {
1359                 ft->ft_udp_idle = ft->ft_fin_wait_idle =
1360                     ft->ft_syn_idle = ft->ft_tcp_idle = 30;
1361                 
1362         }
1363
1364         /*
1365          * hook in to the cleaner list
1366          */
1367         if (V_flow_list_head == NULL)
1368                 V_flow_list_head = ft;
1369         else {
1370                 fttail = V_flow_list_head;
1371                 while (fttail->ft_next != NULL)
1372                         fttail = fttail->ft_next;
1373                 fttail->ft_next = ft;
1374         }
1375
1376         return (ft);
1377 }
1378
1379 /*
1380  * The rest of the code is devoted to garbage collection of expired entries.
1381  * It is a new addition made necessary by the switch to dynamically allocating
1382  * flow tables.
1383  * 
1384  */
1385 static void
1386 fle_free(struct flentry *fle, struct flowtable *ft)
1387 {
1388         struct rtentry *rt;
1389         struct llentry *lle;
1390
1391         rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1392         lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1393         if (rt != NULL)
1394                 RTFREE(rt);
1395         if (lle != NULL)
1396                 LLE_FREE(lle);
1397         flow_free(fle, ft);
1398 }
1399
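/*
 * One garbage-collection pass over the current CPU's (or the global)
 * buckets: copy the occupancy mask, visit each set bucket, unlink
 * entries that are stale - or, when rt is non-NULL, every entry caching
 * that rtentry - onto a private list, and free them once the bucket
 * locks have been dropped.
 */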
1400 static void
1401 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
1402 {
1403         int curbit = 0, count;
1404         struct flentry *fle,  **flehead, *fleprev;
1405         struct flentry *flefreehead, *flefreetail, *fletmp;
1406         bitstr_t *mask, *tmpmask;
1407         struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1408
1409         flefreehead = flefreetail = NULL;
1410         mask = flowtable_mask(ft);
1411         tmpmask = ft->ft_tmpmask;
1412         memcpy(tmpmask, mask, ft->ft_size/8);
1413         /*
1414          * XXX Note to self, bit_ffs operates at the byte level
1415          * and thus adds gratuitous overhead
1416          */
1417         bit_ffs(tmpmask, ft->ft_size, &curbit);
1418         while (curbit != -1) {
1419                 if (curbit >= ft->ft_size || curbit < -1) {
1420                         log(LOG_ALERT,
1421                             "warning: bad curbit value %d \n",
1422                             curbit);
1423                         break;
1424                 }
1425
1426                 FL_ENTRY_LOCK(ft, curbit);
1427                 flehead = flowtable_entry(ft, curbit);
1428                 fle = fleprev = *flehead;
1429
1430                 fs->ft_free_checks++;
1431 #ifdef DIAGNOSTIC
1432                 if (fle == NULL && curbit > 0) {
1433                         log(LOG_ALERT,
1434                             "warning bit=%d set, but no fle found\n",
1435                             curbit);
1436                 }
1437 #endif          
1438                 while (fle != NULL) {
1439                         if (rt != NULL) {
1440                                 if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
1441                                         fleprev = fle;
1442                                         fle = fle->f_next;
1443                                         continue;
1444                                 }
1445                         } else if (!flow_stale(ft, fle)) {
1446                                 fleprev = fle;
1447                                 fle = fle->f_next;
1448                                 continue;
1449                         }
1450                         /*
1451                          * delete head of the list
1452                          */
1453                         if (fleprev == *flehead) {
1454                                 fletmp = fleprev;
1455                                 if (fle == fleprev) {
1456                                         fleprev = *flehead = fle->f_next;
1457                                 } else
1458                                         fleprev = *flehead = fle;
1459                                 fle = fle->f_next;
1460                         } else {
1461                                 /*
1462                                  * don't advance fleprev
1463                                  */
1464                                 fletmp = fle;
1465                                 fleprev->f_next = fle->f_next;
1466                                 fle = fleprev->f_next;
1467                         }
1468
1469                         if (flefreehead == NULL)
1470                                 flefreehead = flefreetail = fletmp;
1471                         else {
1472                                 flefreetail->f_next = fletmp;
1473                                 flefreetail = fletmp;
1474                         }
1475                         fletmp->f_next = NULL;
1476                 }
1477                 if (*flehead == NULL)
1478                         bit_clear(mask, curbit);
1479                 FL_ENTRY_UNLOCK(ft, curbit);
1480                 bit_clear(tmpmask, curbit);
1481                 bit_ffs(tmpmask, ft->ft_size, &curbit);
1482         }
1483         count = 0;
1484         while ((fle = flefreehead) != NULL) {
1485                 flefreehead = fle->f_next;
1486                 count++;
1487                 fs->ft_frees++;
1488                 fle_free(fle, ft);
1489         }
1490         if (V_flowtable_debug && count)
1491                 log(LOG_DEBUG, "freed %d flow entries\n", count);
1492 }
1493
1494 void
1495 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1496 {
1497         int i;
1498
1499         if (ft->ft_flags & FL_PCPU) {
1500                 CPU_FOREACH(i) {
1501                         if (smp_started == 1) {
1502                                 thread_lock(curthread);
1503                                 sched_bind(curthread, i);
1504                                 thread_unlock(curthread);
1505                         }
1506
1507                         flowtable_free_stale(ft, rt);
1508
1509                         if (smp_started == 1) {
1510                                 thread_lock(curthread);
1511                                 sched_unbind(curthread);
1512                                 thread_unlock(curthread);
1513                         }
1514                 }
1515         } else {
1516                 flowtable_free_stale(ft, rt);
1517         }
1518 }
1519
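/*
 * Walk every flowtable in the current vnet and free entries that have
 * gone stale, binding to each CPU for per-CPU tables.
 */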
1520 static void
1521 flowtable_clean_vnet(void)
1522 {
1523         struct flowtable *ft;
1524         int i;
1525
1526         ft = V_flow_list_head;
1527         while (ft != NULL) {
1528                 if (ft->ft_flags & FL_PCPU) {
1529                         CPU_FOREACH(i) {
1530                                 if (smp_started == 1) {
1531                                         thread_lock(curthread);
1532                                         sched_bind(curthread, i);
1533                                         thread_unlock(curthread);
1534                                 }
1535
1536                                 flowtable_free_stale(ft, NULL);
1537
1538                                 if (smp_started == 1) {
1539                                         thread_lock(curthread);
1540                                         sched_unbind(curthread);
1541                                         thread_unlock(curthread);
1542                                 }
1543                         }
1544                 } else {
1545                         flowtable_free_stale(ft, NULL);
1546                 }
1547                 ft = ft->ft_next;
1548         }
1549 }
1550
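/*
 * Kernel process that periodically sweeps the flowtables of every
 * vnet, then announces the completed pass via flowclean_cycles and
 * flowclean_f_cv before sleeping for flowclean_freq ticks.
 */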
1551 static void
1552 flowtable_cleaner(void)
1553 {
1554         VNET_ITERATOR_DECL(vnet_iter);
1555         struct thread *td;
1556
1557         if (bootverbose)
1558                 log(LOG_INFO, "flowtable cleaner started\n");
1559         td = curthread;
1560         while (1) {
1561                 VNET_LIST_RLOCK();
1562                 VNET_FOREACH(vnet_iter) {
1563                         CURVNET_SET(vnet_iter);
1564                         flowtable_clean_vnet();
1565                         CURVNET_RESTORE();
1566                 }
1567                 VNET_LIST_RUNLOCK();
1568
1569                 /*
1570                  * The interval between cleaning checks
1571                  * (flowclean_freq, 20 seconds by default) is arbitrary.
1572                  */
1573                 mtx_lock(&flowclean_lock);
1574                 thread_lock(td);
1575                 sched_prio(td, PPAUSE);
1576                 thread_unlock(td);
1577                 flowclean_cycles++;
1578                 cv_broadcast(&flowclean_f_cv);
1579                 cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
1580                 mtx_unlock(&flowclean_lock);
1581         }
1582 }
1583
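/*
 * Interface departure handler (registered in flowtable_init()): wake
 * the cleaner and wait until flowclean_cycles advances, i.e. until at
 * least one full cleaning pass has completed.
 */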
1584 static void
1585 flowtable_flush(void *unused __unused)
1586 {
1587         uint64_t start;
1588
1589         mtx_lock(&flowclean_lock);
1590         start = flowclean_cycles;
1591         while (start == flowclean_cycles) {
1592                 cv_broadcast(&flowclean_c_cv);
1593                 cv_wait(&flowclean_f_cv, &flowclean_lock);
1594         }
1595         mtx_unlock(&flowclean_lock);
1596 }
1597
1598 static struct kproc_desc flow_kp = {
1599         "flowcleaner",
1600         flowtable_cleaner,
1601         &flowcleanerproc
1602 };
1603 SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1604
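/*
 * Per-vnet initialization, run once SMP is up: size the flow entry
 * limit from maxusers and the CPU count, create the IPv4 and IPv6 UMA
 * zones, cap them at that limit and mark the flowtable ready.
 */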
1605 static void
1606 flowtable_init_vnet(const void *unused __unused)
1607 {
1608
1609         V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
1610         V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
1611             NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1612         V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
1613             NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);    
1614         uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
1615         uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
1616         V_flowtable_ready = 1;
1617 }
1618 VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
1619     flowtable_init_vnet, NULL);
1620
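/*
 * Global initialization: the cleaner's condition variables and lock,
 * the interface departure hook and the default 20 second cleaning
 * interval.
 */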
1621 static void
1622 flowtable_init(const void *unused __unused)
1623 {
1624
1625         cv_init(&flowclean_c_cv, "c_flowcleanwait");
1626         cv_init(&flowclean_f_cv, "f_flowcleanwait");
1627         mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
1628         EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
1629             EVENTHANDLER_PRI_ANY);
1630         flowclean_freq = 20*hz;
1631 }
1632 SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
1633     flowtable_init, NULL);
1634
1635
1636 #ifdef VIMAGE
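/*
 * Per-vnet teardown: mark the flowtable unusable and destroy the UMA
 * zones backing its entries.
 */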
1637 static void
1638 flowtable_uninit(const void *unused __unused)
1639 {
1640
1641         V_flowtable_ready = 0;
1642         uma_zdestroy(V_flow_ipv4_zone);
1643         uma_zdestroy(V_flow_ipv6_zone);
1644 }
1645
1646 VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1647     flowtable_uninit, NULL);
1648 #endif
1649
1650 #ifdef DDB
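/*
 * Return a pointer to the words of an entry's flow key, regardless of
 * address family.
 */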
1651 static uint32_t *
1652 flowtable_get_hashkey(struct flentry *fle)
1653 {
1654         uint32_t *hashkey;
1655
1656         if (fle->f_flags & FL_IPV6)
1657                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1658         else
1659                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1660
1661         return (hashkey);
1662 }
1663
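/*
 * Return the bitmask of occupied buckets: the given CPU's mask for
 * per-CPU tables, otherwise the single global mask.
 */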
1664 static bitstr_t *
1665 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1666 {
1667         bitstr_t *mask;
1668
1669         if (ft->ft_flags & FL_PCPU)
1670                 mask = ft->ft_masks[cpuid];
1671         else
1672                 mask = ft->ft_masks[0];
1673
1674         return (mask);
1675 }
1676
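/*
 * Return the head pointer of the bucket that the hash maps to, in
 * either the per-CPU or the global table.
 */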
1677 static struct flentry **
1678 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1679 {
1680         struct flentry **fle;
1681         int index = (hash % ft->ft_size);
1682
1683         if (ft->ft_flags & FL_PCPU) {
1684                 fle = &ft->ft_table.pcpu[cpuid][index];
1685         } else {
1686                 fle = &ft->ft_table.global[index];
1687         }
1688         
1689         return (fle);
1690 }
1691
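/*
 * Pretty-print one flow entry from DDB: addresses and ports (when the
 * full tuple is hashed), entry/route/interface flags, the raw hash key
 * and the cached route.
 */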
1692 static void
1693 flow_show(struct flowtable *ft, struct flentry *fle)
1694 {
1695         int idle_time;
1696         int rt_valid, ifp_valid;
1697         uint16_t sport, dport;
1698         uint32_t *hashkey;
1699         char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
1700         volatile struct rtentry *rt;
1701         struct ifnet *ifp = NULL;
1702
1703         idle_time = (int)(time_uptime - fle->f_uptime);
1704         rt = fle->f_rt;
1705         rt_valid = rt != NULL;
1706         if (rt_valid) 
1707                 ifp = rt->rt_ifp;
1708         ifp_valid = ifp != NULL;
1709         hashkey = flowtable_get_hashkey(fle);
1710         if (fle->f_flags & FL_IPV6)
1711                 goto skipaddr;
1712
1713         inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
1714         if (ft->ft_flags & FL_HASH_ALL) {
1715                 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);            
1716                 sport = ntohs(((uint16_t *)hashkey)[0]);
1717                 dport = ntohs(((uint16_t *)hashkey)[1]);
1718                 db_printf("%s:%d->%s:%d",
1719                     saddr, sport, daddr,
1720                     dport);
1721         } else 
1722                 db_printf("%s ", daddr);
1723     
1724 skipaddr:
1725         if (fle->f_flags & FL_STALE)
1726                 db_printf(" FL_STALE ");
1727         if (fle->f_flags & FL_TCP)
1728                 db_printf(" FL_TCP ");
1729         if (fle->f_flags & FL_UDP)
1730                 db_printf(" FL_UDP ");
1731         if (rt_valid) {
1732                 if (rt->rt_flags & RTF_UP)
1733                         db_printf(" RTF_UP ");
1734         }
1735         if (ifp_valid) {
1736                 if (ifp->if_flags & IFF_LOOPBACK)
1737                         db_printf(" IFF_LOOPBACK ");
1738                 if (ifp->if_flags & IFF_UP)
1739                         db_printf(" IFF_UP ");          
1740                 if (ifp->if_flags & IFF_POINTOPOINT)
1741                         db_printf(" IFF_POINTOPOINT ");         
1742         }
1743         if (fle->f_flags & FL_IPV6)
1744                 db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
1745                     hashkey[0], hashkey[1], hashkey[2],
1746                     hashkey[3], hashkey[4], hashkey[5],
1747                     hashkey[6], hashkey[7], hashkey[8]);
1748         else
1749                 db_printf("\n\tkey=%08x:%08x:%08x ",
1750                     hashkey[0], hashkey[1], hashkey[2]);
1751         db_printf("hash=%08x idle_time=%03d"
1752             "\n\tfibnum=%02d rt=%p",
1753             fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
1754         db_printf("\n");
1755 }
1756
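/*
 * Dump every entry of one table (one CPU's slice for per-CPU tables)
 * by walking the bitmask of occupied buckets.
 */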
1757 static void
1758 flowtable_show(struct flowtable *ft, int cpuid)
1759 {
1760         int curbit = 0;
1761         struct flentry *fle,  **flehead;
1762         bitstr_t *mask, *tmpmask;
1763
1764         if (cpuid != -1)
1765                 db_printf("cpu: %d\n", cpuid);
1766         mask = flowtable_mask_pcpu(ft, cpuid);
1767         tmpmask = ft->ft_tmpmask;
1768         memcpy(tmpmask, mask, ft->ft_size/8);
1769         /*
1770          * XXX Note to self, bit_ffs operates at the byte level
1771          * and thus adds gratuitous overhead
1772          */
1773         bit_ffs(tmpmask, ft->ft_size, &curbit);
1774         while (curbit != -1) {
1775                 if (curbit >= ft->ft_size || curbit < -1) {
1776                         db_printf("warning: bad curbit value %d\n",
1777                             curbit);
1778                         break;
1779                 }
1780
1781                 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1782                 fle = *flehead;
1783
1784                 while (fle != NULL) {   
1785                         flow_show(ft, fle);
1786                         fle = fle->f_next;
1788                 }
1789                 bit_clear(tmpmask, curbit);
1790                 bit_ffs(tmpmask, ft->ft_size, &curbit);
1791         }
1792 }
1793
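/*
 * Dump every flowtable attached to the current vnet.
 */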
1794 static void
1795 flowtable_show_vnet(void)
1796 {
1797         struct flowtable *ft;
1798         int i;
1799
1800         ft = V_flow_list_head;
1801         while (ft != NULL) {
1802                 printf("name: %s\n", ft->ft_name);
1803                 if (ft->ft_flags & FL_PCPU) {
1804                         CPU_FOREACH(i) {
1805                                 flowtable_show(ft, i);
1806                         }
1807                 } else {
1808                         flowtable_show(ft, -1);
1809                 }
1810                 ft = ft->ft_next;
1811         }
1812 }
1813
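/*
 * DDB "show flowtables" command: iterate all vnets and dump their
 * tables.
 */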
1814 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1815 {
1816         VNET_ITERATOR_DECL(vnet_iter);
1817
1818         VNET_FOREACH(vnet_iter) {
1819                 CURVNET_SET(vnet_iter);
1820 #ifdef VIMAGE
1821                 db_printf("vnet %p\n", vnet_iter);
1822 #endif
1823                 flowtable_show_vnet();
1824                 CURVNET_RESTORE();
1825         }
1826 }
1827 #endif