1 /**************************************************************************
2
3 Copyright (c) 2008-2010, BitGravity Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11
12  2. Neither the name of the BitGravity Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include "opt_route.h"
31 #include "opt_mpath.h"
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38
39 #include <sys/param.h>  
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/kernel.h>  
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/socket.h>
54 #include <sys/syslog.h>
55 #include <sys/sysctl.h>
56
57 #include <net/if.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_var.h>
60 #include <net/route.h> 
61 #include <net/flowtable.h>
62 #include <net/vnet.h>
63
64 #include <netinet/in.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
69 #ifdef INET6
70 #include <netinet/ip6.h>
71 #endif
72 #include <netinet/tcp.h>
73 #include <netinet/udp.h>
74 #include <netinet/sctp.h>
75
76 #include <libkern/jenkins.h>
77 #include <ddb/ddb.h>
78
79 struct ipv4_tuple {
80         uint16_t        ip_sport;       /* source port */
81         uint16_t        ip_dport;       /* destination port */
82         in_addr_t       ip_saddr;       /* source address */
83         in_addr_t       ip_daddr;       /* destination address */
84 };
85
86 union ipv4_flow {
87         struct ipv4_tuple ipf_ipt;
88         uint32_t        ipf_key[3];
89 };
90
91 struct ipv6_tuple {
92         uint16_t        ip_sport;       /* source port */
93         uint16_t        ip_dport;       /* destination port */
94         struct in6_addr ip_saddr;       /* source address */
95         struct in6_addr ip_daddr;       /* destination address */
96 };
97
98 union ipv6_flow {
99         struct ipv6_tuple ipf_ipt;
100         uint32_t        ipf_key[9];
101 };
102
103 struct flentry {
104         volatile uint32_t       f_fhash;        /* hash flowing forward */
105         uint16_t                f_flags;        /* flow flags */
106         uint8_t                 f_pad;          
107         uint8_t                 f_proto;        /* protocol */
108         uint32_t                f_fibnum;       /* fib index */
109         uint32_t                f_uptime;       /* uptime at last access */
110         struct flentry          *f_next;        /* pointer to collision entry */
111         volatile struct rtentry *f_rt;          /* rtentry for flow */
112         volatile struct llentry *f_lle;         /* llentry for flow */
113 };
114
115 struct flentry_v4 {
116         struct flentry  fl_entry;
117         union ipv4_flow fl_flow;
118 };
119
120 struct flentry_v6 {
121         struct flentry  fl_entry;
122         union ipv6_flow fl_flow;
123 };
124
125 #define fl_fhash        fl_entry.f_fhash
126 #define fl_flags        fl_entry.f_flags
127 #define fl_proto        fl_entry.f_proto
128 #define fl_uptime       fl_entry.f_uptime
129 #define fl_rt           fl_entry.f_rt
130 #define fl_lle          fl_entry.f_lle
131
132 #define SECS_PER_HOUR           3600
133 #define SECS_PER_DAY            (24*SECS_PER_HOUR)
134
135 #define SYN_IDLE                300
136 #define UDP_IDLE                300
137 #define FIN_WAIT_IDLE           600
138 #define TCP_IDLE                SECS_PER_DAY
139
140
141 typedef void fl_lock_t(struct flowtable *, uint32_t);
142 typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
143
144 union flentryp {
145         struct flentry          **global;
146         struct flentry          **pcpu[MAXCPU];
147 };
148
149 struct flowtable_stats {
150         uint64_t        ft_collisions;
151         uint64_t        ft_allocated;
152         uint64_t        ft_misses;
153         uint64_t        ft_max_depth;
154         uint64_t        ft_free_checks;
155         uint64_t        ft_frees;
156         uint64_t        ft_hits;
157         uint64_t        ft_lookups;
158 } __aligned(CACHE_LINE_SIZE);
159
160 struct flowtable {
161         struct  flowtable_stats ft_stats[MAXCPU];
162         int             ft_size;
163         int             ft_lock_count;
164         uint32_t        ft_flags;
165         char            *ft_name;
166         fl_lock_t       *ft_lock;
167         fl_lock_t       *ft_unlock;
168         fl_rtalloc_t    *ft_rtalloc;
169         /*
170          * XXX need to pad out 
171          */ 
172         struct mtx      *ft_locks;
173         union flentryp  ft_table;
174         bitstr_t        *ft_masks[MAXCPU];
175         bitstr_t        *ft_tmpmask;
176         struct flowtable *ft_next;
177
178         uint32_t        ft_count __aligned(CACHE_LINE_SIZE);
179         uint32_t        ft_udp_idle __aligned(CACHE_LINE_SIZE);
180         uint32_t        ft_fin_wait_idle;
181         uint32_t        ft_syn_idle;
182         uint32_t        ft_tcp_idle;
183         boolean_t       ft_full;
184 } __aligned(CACHE_LINE_SIZE);
185
186 static struct proc *flowcleanerproc;
187 static VNET_DEFINE(struct flowtable *, flow_list_head);
188 static VNET_DEFINE(uint32_t, flow_hashjitter);
189 static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
190 static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
191
192 #define V_flow_list_head        VNET(flow_list_head)
193 #define V_flow_hashjitter       VNET(flow_hashjitter)
194 #define V_flow_ipv4_zone        VNET(flow_ipv4_zone)
195 #define V_flow_ipv6_zone        VNET(flow_ipv6_zone)
196
197
198 static struct cv        flowclean_f_cv;
199 static struct cv        flowclean_c_cv;
200 static struct mtx       flowclean_lock;
201 static uint32_t         flowclean_cycles;
202 static uint32_t         flowclean_freq;
203
204 #ifdef FLOWTABLE_DEBUG
205 #define FLDPRINTF(ft, flags, fmt, ...)          \
206 do {                                            \
207         if ((ft)->ft_flags & (flags))           \
208                 printf((fmt), __VA_ARGS__);     \
209 } while (0)
210
211 #else
212 #define FLDPRINTF(ft, flags, fmt, ...)
213
214 #endif
215
216
217 /*
218  * TODO:
219  * - Make flowtable stats per-cpu, aggregated at sysctl call time,
220  *   to avoid extra cache evictions caused by incrementing a shared
221  *   counter
222  * - add sysctls to resize && flush flow tables 
223  * - Add per flowtable sysctls for statistics and configuring timeouts
224  * - add saturation counter to rtentry to support per-packet load-balancing
225  *   add flag to indicate round-robin flow, add list lookup from head
226  *   for flows
227  * - add sysctl / device node / syscall to support exporting and importing
228  *   of flows with flag to indicate that a flow was imported so should
229  *   not be considered for auto-cleaning
230  * - support explicit connection state (currently only ad-hoc for DSR)
231  * - idetach() cleanup for options VIMAGE builds.
232  */
233 VNET_DEFINE(int, flowtable_enable) = 1;
234 static VNET_DEFINE(int, flowtable_debug);
235 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
236 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
237 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
238 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
239 static VNET_DEFINE(int, flowtable_nmbflows);
240 static VNET_DEFINE(int, flowtable_ready) = 0;
241
242 #define V_flowtable_enable              VNET(flowtable_enable)
243 #define V_flowtable_debug               VNET(flowtable_debug)
244 #define V_flowtable_syn_expire          VNET(flowtable_syn_expire)
245 #define V_flowtable_udp_expire          VNET(flowtable_udp_expire)
246 #define V_flowtable_fin_wait_expire     VNET(flowtable_fin_wait_expire)
247 #define V_flowtable_tcp_expire          VNET(flowtable_tcp_expire)
248 #define V_flowtable_nmbflows            VNET(flowtable_nmbflows)
249 #define V_flowtable_ready               VNET(flowtable_ready)
250
251 SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
252 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
253     &VNET_NAME(flowtable_debug), 0, "print debug info.");
254 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
255     &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
256
257 /*
258  * XXX This does not end up updating timeouts at runtime
259  * and only reflects the value for the last table added :-/
260  */
261 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
262     &VNET_NAME(flowtable_syn_expire), 0,
263     "seconds after which to remove syn allocated flow.");
264 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
265     &VNET_NAME(flowtable_udp_expire), 0,
266     "seconds after which to remove flow allocated to UDP.");
267 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
268     &VNET_NAME(flowtable_fin_wait_expire), 0,
269     "seconds after which to remove a flow in FIN_WAIT.");
270 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
271     &VNET_NAME(flowtable_tcp_expire), 0,
272     "seconds after which to remove flow allocated to a TCP connection.");
273
274
275 /*
276  * Maximum number of flows that can be allocated of a given type.
277  *
278  * The table is allocated at boot time (for the pure caching case
279  * there is no reason why this could not be changed at runtime)
280  * and thus (currently) needs to be set with a tunable.
281  */
282 static int
283 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
284 {
285         int error, newnmbflows;
286
287         newnmbflows = V_flowtable_nmbflows;
288         error = sysctl_handle_int(oidp, &newnmbflows, 0, req); 
289         if (error == 0 && req->newptr) {
290                 if (newnmbflows > V_flowtable_nmbflows) {
291                         V_flowtable_nmbflows = newnmbflows;
292                         uma_zone_set_max(V_flow_ipv4_zone,
293                             V_flowtable_nmbflows);
294                         uma_zone_set_max(V_flow_ipv6_zone,
295                             V_flowtable_nmbflows);
296                 } else
297                         error = EINVAL;
298         }
299         return (error);
300 }
301 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
302     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
303     "Maximum number of flows allowed");
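/*
 * Example (assumed usage, not part of this file): the limit can only be
 * raised at runtime, never lowered, e.g.
 *
 *      # sysctl net.inet.flowtable.nmbflows=262144
 */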
304
305
306
307 #define FS_PRINT(sb, field)     sbuf_printf((sb), "\t%s: %ju\n", #field, (uintmax_t)fs->ft_##field)
308
309 static void
310 fs_print(struct sbuf *sb, struct flowtable_stats *fs)
311 {
312
313         FS_PRINT(sb, collisions);
314         FS_PRINT(sb, allocated);
315         FS_PRINT(sb, misses);
316         FS_PRINT(sb, max_depth);
317         FS_PRINT(sb, free_checks);
318         FS_PRINT(sb, frees);
319         FS_PRINT(sb, hits);
320         FS_PRINT(sb, lookups);
321 }
322
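/*
 * For per-CPU tables, aggregate the per-CPU counters into a local copy
 * (summing everything except ft_max_depth, for which the maximum is kept)
 * before printing.
 */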
323 static void
324 flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
325 {
326         int i;
327         struct flowtable_stats fs, *pfs;
328
329         if (ft->ft_flags & FL_PCPU) {
330                 bzero(&fs, sizeof(fs));
331                 pfs = &fs;
332                 for (i = 0; i <= mp_maxid; i++) {
333                         if (CPU_ABSENT(i))
334                                 continue;
335                         pfs->ft_collisions  += ft->ft_stats[i].ft_collisions;
336                         pfs->ft_allocated   += ft->ft_stats[i].ft_allocated;
337                         pfs->ft_misses      += ft->ft_stats[i].ft_misses;
338                         pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
339                         pfs->ft_frees       += ft->ft_stats[i].ft_frees;
340                         pfs->ft_hits        += ft->ft_stats[i].ft_hits;
341                         pfs->ft_lookups     += ft->ft_stats[i].ft_lookups;
342                         if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
343                                 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
344                 }
345         } else {
346                 pfs = &ft->ft_stats[0];
347         }
348         fs_print(sb, pfs);
349 }
350
351 static int
352 sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
353 {
354         struct flowtable *ft;
355         struct sbuf *sb;
356         int error;
357
358         sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
359
360         ft = V_flow_list_head;
361         while (ft != NULL) {
362                 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
363                 flowtable_show_stats(sb, ft);
364                 ft = ft->ft_next;
365         }
366         sbuf_finish(sb);
367         error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
368         sbuf_delete(sb);
369
370         return (error);
371 }
372 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
373     NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
374
375
376 #ifndef RADIX_MPATH
377 static void
378 in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
379 {
380
381         rtalloc_ign_fib(ro, 0, fibnum);
382 }
383 #endif
384
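/*
 * Two locking models: global tables hash the flow onto one of
 * ft_lock_count mutexes, while per-CPU tables only need a critical
 * section because each CPU owns its own buckets.
 */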
385 static void
386 flowtable_global_lock(struct flowtable *table, uint32_t hash)
387 {       
388         int lock_index = (hash)&(table->ft_lock_count - 1);
389
390         mtx_lock(&table->ft_locks[lock_index]);
391 }
392
393 static void
394 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
395 {       
396         int lock_index = (hash)&(table->ft_lock_count - 1);
397
398         mtx_unlock(&table->ft_locks[lock_index]);
399 }
400
401 static void
402 flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
403 {
404
405         critical_enter();
406 }
407
408 static void
409 flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
410 {
411
412         critical_exit();
413 }
414
415 #define FL_ENTRY_INDEX(table, hash) ((hash) % (table)->ft_size)
416 #define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
417 #define FL_ENTRY_LOCK(table, hash)  (table)->ft_lock((table), (hash))
418 #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
419
420 #define FL_STALE        (1<<8)
421 #define FL_IPV6         (1<<9)
422 #define FL_OVERWRITE    (1<<10)
423
424 void
425 flow_invalidate(struct flentry *fle)
426 {
427
428         fle->f_flags |= FL_STALE;
429 }
430
431 static __inline int
432 proto_to_flags(uint8_t proto)
433 {
434         int flag;
435
436         switch (proto) {
437         case IPPROTO_TCP:
438                 flag = FL_TCP;
439                 break;
440         case IPPROTO_SCTP:
441                 flag = FL_SCTP;
442                 break;          
443         case IPPROTO_UDP:
444                 flag = FL_UDP;
445                 break;
446         default:
447                 flag = 0;
448                 break;
449         }
450
451         return (flag);
452 }
453
454 static __inline int
455 flags_to_proto(int flags)
456 {
457         int proto, protoflags;
458
459         protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
460         switch (protoflags) {
461         case FL_TCP:
462                 proto = IPPROTO_TCP;
463                 break;
464         case FL_SCTP:
465                 proto = IPPROTO_SCTP;
466                 break;
467         case FL_UDP:
468                 proto = IPPROTO_UDP;
469                 break;
470         default:
471                 proto = 0;
472                 break;
473         }
474         return (proto);
475 }
476
477 #ifdef INET
478 #ifdef FLOWTABLE_DEBUG
479 static void
480 ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
481     struct sockaddr_in *dsin)
482 {
483         char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
484
485         if (flags & FL_HASH_ALL) {
486                 inet_ntoa_r(ssin->sin_addr, saddr);
487                 inet_ntoa_r(dsin->sin_addr, daddr);
488                 printf("proto=%d %s:%d->%s:%d\n",
489                     proto, saddr, ntohs(ssin->sin_port), daddr,
490                     ntohs(dsin->sin_port));
491         } else {
492                 inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
493                 printf("proto=%d %s\n", proto, daddr);
494         }
495
496 }
497 #endif
498
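/*
 * Pull the protocol, addresses and (for TCP/UDP/SCTP) ports out of an IPv4
 * packet into the supplied sockaddrs.  A TCP RST or FIN marks the flow
 * FL_STALE; portless protocols return ENOTSUP when the table hashes the
 * full 4-tuple.
 */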
499 static int
500 ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
501     struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
502 {
503         struct ip *ip;
504         uint8_t proto;
505         int iphlen;
506         struct tcphdr *th;
507         struct udphdr *uh;
508         struct sctphdr *sh;
509         uint16_t sport, dport;
510
511         proto = sport = dport = 0;
512         ip = mtod(m, struct ip *);
513         dsin->sin_family = AF_INET;
514         dsin->sin_len = sizeof(*dsin);
515         dsin->sin_addr = ip->ip_dst;
516         ssin->sin_family = AF_INET;
517         ssin->sin_len = sizeof(*ssin);
518         ssin->sin_addr = ip->ip_src;    
519
520         proto = ip->ip_p;
521         if ((*flags & FL_HASH_ALL) == 0) {
522                 FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
523                     *flags);
524                 goto skipports;
525         }
526
527         iphlen = ip->ip_hl << 2; /* XXX options? */
528
529         switch (proto) {
530         case IPPROTO_TCP:
531                 th = (struct tcphdr *)((caddr_t)ip + iphlen);
532                 sport = th->th_sport;
533                 dport = th->th_dport;
534                 if ((*flags & FL_HASH_ALL) &&
535                     (th->th_flags & (TH_RST|TH_FIN)))
536                         *flags |= FL_STALE;
537                 break;
538         case IPPROTO_UDP:
539                 uh = (struct udphdr *)((caddr_t)ip + iphlen);
540                 sport = uh->uh_sport;
541                 dport = uh->uh_dport;
542                 break;
543         case IPPROTO_SCTP:
544                 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
545                 sport = sh->src_port;
546                 dport = sh->dest_port;
547                 break;
548         default:
549                 /* no port - hence not a protocol we care about */
550                 FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
551                 return (ENOTSUP);
552                 break;
553
554         }
555
556 skipports:
557         *flags |= proto_to_flags(proto);
558         ssin->sin_port = sport;
559         dsin->sin_port = dport;
560         return (0);
561 }
562
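/*
 * Hash the 3-word IPv4 flow key with jenkins_hashword().  With FL_HASH_ALL
 * the source address and both ports are folded in; otherwise only the
 * destination address is hashed, salted with the per-boot jitter plus the
 * protocol number.  Returns 0 if the flowtable is disabled or not ready.
 */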
563 static uint32_t
564 ipv4_flow_lookup_hash_internal(
565         struct sockaddr_in *ssin, struct sockaddr_in *dsin, 
566             uint32_t *key, uint16_t flags)
567 {
568         uint16_t sport, dport;
569         uint8_t proto;
570         int offset = 0;
571
572         if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
573                 return (0);
574         proto = flags_to_proto(flags);
575         sport = dport = key[2] = key[1] = key[0] = 0;
576         if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
577                 key[1] = ssin->sin_addr.s_addr;
578                 sport = ssin->sin_port;
579         }
580         if (dsin != NULL) {
581                 key[2] = dsin->sin_addr.s_addr;
582                 dport = dsin->sin_port;
583         }
584         if (flags & FL_HASH_ALL) {
585                 ((uint16_t *)key)[0] = sport;
586                 ((uint16_t *)key)[1] = dport; 
587         } else
588                 offset = V_flow_hashjitter + proto;
589
590         return (jenkins_hashword(key, 3, offset));
591 }
592
593 static struct flentry *
594 flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
595 {
596         struct sockaddr_storage ssa, dsa;
597         uint16_t flags;
598         struct sockaddr_in *dsin, *ssin;
599
600         dsin = (struct sockaddr_in *)&dsa;
601         ssin = (struct sockaddr_in *)&ssa;
602         bzero(dsin, sizeof(*dsin));
603         bzero(ssin, sizeof(*ssin));
604         flags = ft->ft_flags;
605         if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
606                 return (NULL);
607
608         return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
609 }
610
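/*
 * Populate a struct route from a cached flow: the destination address is
 * recovered from the stored flow key and the cached rtentry and llentry
 * pointers are copied out for the caller.
 */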
611 void
612 flow_to_route(struct flentry *fle, struct route *ro)
613 {
614         uint32_t *hashkey = NULL;
615         struct sockaddr_in *sin;
616
617         sin = (struct sockaddr_in *)&ro->ro_dst;
618         sin->sin_family = AF_INET;
619         sin->sin_len = sizeof(*sin);
620         hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
621         sin->sin_addr.s_addr = hashkey[2];
622         ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
623         ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
624 }
625 #endif /* INET */
626
627 #ifdef INET6
628 /*
629  * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
630  * then it sets p to point at the offset "len" in the mbuf. WARNING: the
631  * pointer might become stale after other pullups (but we never use it
632  * this way).
633  */
634 #define PULLUP_TO(_len, p, T)                                           \
635 do {                                                                    \
636         int x = (_len) + sizeof(T);                                     \
637         if ((m)->m_len < x) {                                           \
638                 goto receive_failed;                                    \
639         }                                                               \
640         p = (mtod(m, char *) + (_len));                                 \
641 } while (0)
642
643 #define TCP(p)          ((struct tcphdr *)(p))
644 #define SCTP(p)         ((struct sctphdr *)(p))
645 #define UDP(p)          ((struct udphdr *)(p))
646
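/*
 * IPv6 counterpart of ipv4_mbuf_demarshal(): walk the extension header
 * chain until a transport header (or a header that cannot carry one) is
 * found and record the addresses, ports and protocol.
 */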
647 static int
648 ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
649     struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
650 {
651         struct ip6_hdr *ip6;
652         uint8_t proto;
653         int hlen;
654         uint16_t src_port, dst_port;
655         u_short offset;
656         void *ulp;
657
658         offset = hlen = src_port = dst_port = 0;
659         ulp = NULL;
660         ip6 = mtod(m, struct ip6_hdr *);
661         hlen = sizeof(struct ip6_hdr);
662         proto = ip6->ip6_nxt;
663
664         if ((*flags & FL_HASH_ALL) == 0)
665                 goto skipports;
666
667         while (ulp == NULL) {
668                 switch (proto) {
669                 case IPPROTO_ICMPV6:
670                 case IPPROTO_OSPFIGP:
671                 case IPPROTO_PIM:
672                 case IPPROTO_CARP:
673                 case IPPROTO_ESP:
674                 case IPPROTO_NONE:
675                         ulp = ip6;
676                         break;
677                 case IPPROTO_TCP:
678                         PULLUP_TO(hlen, ulp, struct tcphdr);
679                         dst_port = TCP(ulp)->th_dport;
680                         src_port = TCP(ulp)->th_sport;
681                         if ((*flags & FL_HASH_ALL) &&
682                             (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
683                                 *flags |= FL_STALE;
684                         break;
685                 case IPPROTO_SCTP:
686                         PULLUP_TO(hlen, ulp, struct sctphdr);
687                         src_port = SCTP(ulp)->src_port;
688                         dst_port = SCTP(ulp)->dest_port;
689                         break;
690                 case IPPROTO_UDP:
691                         PULLUP_TO(hlen, ulp, struct udphdr);
692                         dst_port = UDP(ulp)->uh_dport;
693                         src_port = UDP(ulp)->uh_sport;
694                         break;
695                 case IPPROTO_HOPOPTS:   /* RFC 2460 */
696                         PULLUP_TO(hlen, ulp, struct ip6_hbh);
697                         hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
698                         proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
699                         ulp = NULL;
700                         break;
701                 case IPPROTO_ROUTING:   /* RFC 2460 */
702                         PULLUP_TO(hlen, ulp, struct ip6_rthdr); 
703                         hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
704                         proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
705                         ulp = NULL;
706                         break;
707                 case IPPROTO_FRAGMENT:  /* RFC 2460 */
708                         PULLUP_TO(hlen, ulp, struct ip6_frag);
709                         hlen += sizeof (struct ip6_frag);
710                         proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
711                         offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
712                             IP6F_OFF_MASK;
713                         ulp = NULL;
714                         break;
715                 case IPPROTO_DSTOPTS:   /* RFC 2460 */
716                         PULLUP_TO(hlen, ulp, struct ip6_hbh);
717                         hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
718                         proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
719                         ulp = NULL;
720                         break;
721                 case IPPROTO_AH:        /* RFC 2402 */
722                         PULLUP_TO(hlen, ulp, struct ip6_ext);
723                         hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
724                         proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
725                         ulp = NULL;
726                         break;
727                 default:
728                         PULLUP_TO(hlen, ulp, struct ip6_ext);
729                         break;
730                 }
731         }
732
733         if (src_port == 0) {
734         receive_failed:
735                 return (ENOTSUP);
736         }
737
738 skipports:
739         dsin6->sin6_family = AF_INET6;
740         dsin6->sin6_len = sizeof(*dsin6);
741         dsin6->sin6_port = dst_port;
742         memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
743
744         ssin6->sin6_family = AF_INET6;
745         ssin6->sin6_len = sizeof(*ssin6);
746         ssin6->sin6_port = src_port;
747         memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
748         *flags |= proto_to_flags(proto);
749
750         return (0);
751 }
752
753 #define zero_key(key)           \
754 do {                            \
755         key[0] = 0;             \
756         key[1] = 0;             \
757         key[2] = 0;             \
758         key[3] = 0;             \
759         key[4] = 0;             \
760         key[5] = 0;             \
761         key[6] = 0;             \
762         key[7] = 0;             \
763         key[8] = 0;             \
764 } while (0)
765         
766 static uint32_t
767 ipv6_flow_lookup_hash_internal(
768         struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, 
769             uint32_t *key, uint16_t flags)
770 {
771         uint16_t sport, dport;
772         uint8_t proto;
773         int offset = 0;
774
775         if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
776                 return (0);
777
778         proto = flags_to_proto(flags);
779         zero_key(key);
780         sport = dport = 0;
781         if (dsin6 != NULL) {
782                 memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
783                 dport = dsin6->sin6_port;
784         }
785         if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
786                 memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
787                 sport = ssin6->sin6_port;
788         }
789         if (flags & FL_HASH_ALL) {
790                 ((uint16_t *)key)[0] = sport;
791                 ((uint16_t *)key)[1] = dport; 
792         } else
793                 offset = V_flow_hashjitter + proto;
794
795         return (jenkins_hashword(key, 9, offset));
796 }
797
798 static struct flentry *
799 flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
800 {
801         struct sockaddr_storage ssa, dsa;
802         struct sockaddr_in6 *dsin6, *ssin6;     
803         uint16_t flags;
804
805         dsin6 = (struct sockaddr_in6 *)&dsa;
806         ssin6 = (struct sockaddr_in6 *)&ssa;
807         bzero(dsin6, sizeof(*dsin6));
808         bzero(ssin6, sizeof(*ssin6));
809         flags = ft->ft_flags;
810         
811         if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
812                 return (NULL);
813
814         return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
815 }
816
817 void
818 flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
819 {
820         uint32_t *hashkey = NULL;
821         struct sockaddr_in6 *sin6;
822
823         sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
824
825         sin6->sin6_family = AF_INET6;
826         sin6->sin6_len = sizeof(*sin6);
827         hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
828         memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
829         ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
830         ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
831
832 }
833 #endif /* INET6 */
834
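/*
 * Return the occupancy bitmap for this table: the current CPU's bitmap for
 * per-CPU tables, the shared one otherwise.
 */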
835 static bitstr_t *
836 flowtable_mask(struct flowtable *ft)
837 {
838         bitstr_t *mask;
839
840         if (ft->ft_flags & FL_PCPU)
841                 mask = ft->ft_masks[curcpu];
842         else
843                 mask = ft->ft_masks[0];
844
845         return (mask);
846 }
847
848 static struct flentry **
849 flowtable_entry(struct flowtable *ft, uint32_t hash)
850 {
851         struct flentry **fle;
852         int index = (hash % ft->ft_size);
853
854         if (ft->ft_flags & FL_PCPU) {
855                 KASSERT(ft->ft_table.pcpu[curcpu] != NULL, ("pcpu not set"));
856                 fle = &ft->ft_table.pcpu[curcpu][index];
857         } else {
858                 KASSERT(ft->ft_table.global != NULL, ("global not set"));
859                 fle = &ft->ft_table.global[index];
860         }
861         
862         return (fle);
863 }
864
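/*
 * Return non-zero if a flow entry should be reclaimed: its cached route is
 * no longer usable (down, detached or link down), it has been explicitly
 * marked FL_STALE, or it has exceeded the idle timeout for its state
 * (UDP, SYN-only, FIN_WAIT or established TCP).
 */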
865 static int
866 flow_stale(struct flowtable *ft, struct flentry *fle)
867 {
868         time_t idle_time;
869
870         if ((fle->f_fhash == 0)
871             || ((fle->f_rt->rt_flags & RTF_HOST) &&
872                 ((fle->f_rt->rt_flags & (RTF_UP))
873                     != (RTF_UP)))
874             || (fle->f_rt->rt_ifp == NULL)
875             || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
876                 return (1);
877
878         idle_time = time_uptime - fle->f_uptime;
879
880         if ((fle->f_flags & FL_STALE) ||
881             ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
882                 && (idle_time > ft->ft_udp_idle)) ||
883             ((fle->f_flags & TH_FIN)
884                 && (idle_time > ft->ft_fin_wait_idle)) ||
885             ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
886                 && (idle_time > ft->ft_syn_idle)) ||
887             ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
888                 && (idle_time > ft->ft_tcp_idle)) ||
889             ((fle->f_rt->rt_flags & RTF_UP) == 0 || 
890                 (fle->f_rt->rt_ifp == NULL)))
891                 return (1);
892
893         return (0);
894 }
895
896 static void
897 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
898 {
899         uint32_t *hashkey;
900         int i, nwords;
901
902         if (fle->f_flags & FL_IPV6) {
903                 nwords = 9;
904                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
905         } else {
906                 nwords = 3;
907                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
908         }
909         
910         for (i = 0; i < nwords; i++) 
911                 hashkey[i] = key[i];
912 }
913
914 static struct flentry *
915 flow_alloc(struct flowtable *ft)
916 {
917         struct flentry *newfle;
918         uma_zone_t zone;
919
920         newfle = NULL;
921         zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
922
923         newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
924         if (newfle != NULL)
925                 atomic_add_int(&ft->ft_count, 1);
926         return (newfle);
927 }
928
929 static void
930 flow_free(struct flentry *fle, struct flowtable *ft)
931 {
932         uma_zone_t zone;
933
934         zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
935         atomic_add_int(&ft->ft_count, -1);
936         uma_zfree(zone, fle);
937 }
938
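/*
 * Update and return the table's "full" state with hysteresis: the flag is
 * set once ft_count comes within 1/32 of V_flowtable_nmbflows and cleared
 * again when it drops more than 1/8 below it.  Transitions also retune the
 * cleaner wakeup frequency and, for transmit-only tables, the idle timeouts.
 */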
939 static int
940 flow_full(struct flowtable *ft)
941 {
942         boolean_t full;
943         uint32_t count;
944         
945         full = ft->ft_full;
946         count = ft->ft_count;
947
948         if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
949                 ft->ft_full = FALSE;
950         else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
951                 ft->ft_full = TRUE;
952         
953         if (full && !ft->ft_full) {
954                 flowclean_freq = 4*hz;
955                 if ((ft->ft_flags & FL_HASH_ALL) == 0)
956                         ft->ft_udp_idle = ft->ft_fin_wait_idle =
957                             ft->ft_syn_idle = ft->ft_tcp_idle = 5;
958                 cv_broadcast(&flowclean_c_cv);
959         } else if (!full && ft->ft_full) {
960                 flowclean_freq = 20*hz;
961                 if ((ft->ft_flags & FL_HASH_ALL) == 0)
962                         ft->ft_udp_idle = ft->ft_fin_wait_idle =
963                             ft->ft_syn_idle = ft->ft_tcp_idle = 30;
964         }
965
966         return (ft->ft_full);
967 }
968
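/*
 * Insert a newly allocated entry for hash/key into the table.  On a bucket
 * collision the entry is appended to the chain unless an equivalent live
 * entry already exists, in which case EEXIST is returned (or, with
 * FL_OVERWRITE, the existing entry is simply refreshed in place).
 */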
969 static int
970 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
971     uint32_t fibnum, struct route *ro, uint16_t flags)
972 {
973         struct flentry *fle, *fletail, *newfle, **flep;
974         struct flowtable_stats *fs = &ft->ft_stats[curcpu];
975         int depth;
976         bitstr_t *mask;
977         uint8_t proto;
978
979         newfle = flow_alloc(ft);
980         if (newfle == NULL)
981                 return (ENOMEM);
982
983         newfle->f_flags |= (flags & FL_IPV6);
984         proto = flags_to_proto(flags);
985
986         FL_ENTRY_LOCK(ft, hash);
987         mask = flowtable_mask(ft);
988         flep = flowtable_entry(ft, hash);
989         fletail = fle = *flep;
990
991         if (fle == NULL) {
992                 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
993                 *flep = fle = newfle;
994                 goto skip;
995         } 
996         
997         depth = 0;
998         fs->ft_collisions++;
999         /*
1000          * find end of list and make sure that we were not
1001          * preempted by another thread handling this flow
1002          */
1003         while (fle != NULL) {
1004                 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
1005                         /*
1006                          * there was either a hash collision
1007                          * or we lost a race to insert
1008                          */
1009                         FL_ENTRY_UNLOCK(ft, hash);
1010                         flow_free(newfle, ft);
1011                         
1012                         if (flags & FL_OVERWRITE) 
1013                                 goto skip;
1014                         return (EEXIST);
1015                 }
1016                 /*
1017                  * re-visit this double condition XXX
1018                  */
1019                 if (fletail->f_next != NULL)
1020                         fletail = fle->f_next;
1021
1022                 depth++;
1023                 fle = fle->f_next;
1024         } 
1025
1026         if (depth > fs->ft_max_depth)
1027                 fs->ft_max_depth = depth;
1028         fletail->f_next = newfle;
1029         fle = newfle;
1030 skip:
1031         flowtable_set_hashkey(fle, key);
1032
1033         fle->f_proto = proto;
1034         fle->f_rt = ro->ro_rt;
1035         fle->f_lle = ro->ro_lle;
1036         fle->f_fhash = hash;
1037         fle->f_fibnum = fibnum;
1038         fle->f_uptime = time_uptime;
1039         FL_ENTRY_UNLOCK(ft, hash);
1040         return (0);
1041 }
1042
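/*
 * Kernel-facing insert for flows whose route and llentry the caller has
 * already resolved; FL_OVERWRITE is forced so a matching entry is replaced,
 * and EINVAL is returned if either pointer is missing.
 */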
1043 int
1044 kern_flowtable_insert(struct flowtable *ft,
1045     struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1046     struct route *ro, uint32_t fibnum, int flags)
1047 {
1048         uint32_t key[9], hash;
1049
1050         flags = (ft->ft_flags | flags | FL_OVERWRITE);
1051         hash = 0;
1052
1053 #ifdef INET
1054         if (ssa->ss_family == AF_INET) 
1055                 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1056                     (struct sockaddr_in *)dsa, key, flags);
1057 #endif
1058 #ifdef INET6
1059         if (ssa->ss_family == AF_INET6) 
1060                 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1061                     (struct sockaddr_in6 *)dsa, key, flags);
1062 #endif  
1063         if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1064                 return (EINVAL);
1065
1066         FLDPRINTF(ft, FL_DEBUG,
1067             "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1068             key[0], key[1], key[2], hash, fibnum, flags);
1069         return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
1070 }
1071
1072 static int
1073 flowtable_key_equal(struct flentry *fle, uint32_t *key)
1074 {
1075         uint32_t *hashkey;
1076         int i, nwords;
1077
1078         if (fle->f_flags & FL_IPV6) {
1079                 nwords = 9;
1080                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1081         } else {
1082                 nwords = 3;
1083                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1084         }
1085
1086         for (i = 0; i < nwords; i++) 
1087                 if (hashkey[i] != key[i])
1088                         return (0);
1089
1090         return (1);
1091 }
1092
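/*
 * Address-family dispatch for mbuf lookups; on success the packet is
 * stamped with the flow hash (M_FLOWID / m_pkthdr.flowid).
 */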
1093 struct flentry *
1094 flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
1095 {
1096         struct flentry *fle = NULL;
1097
1098 #ifdef INET
1099         if (af == AF_INET)
1100                 fle = flowtable_lookup_mbuf4(ft, m);
1101 #endif
1102 #ifdef INET6
1103         if (af == AF_INET6)
1104                 fle = flowtable_lookup_mbuf6(ft, m);
1105 #endif  
1106         if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
1107                 m->m_flags |= M_FLOWID;
1108                 m->m_pkthdr.flowid = fle->f_fhash;
1109         }
1110         return (fle);
1111 }
1112         
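/*
 * Core lookup: hash the addresses, scan the bucket for a live entry whose
 * key, protocol and fib match, and return it with its last-use time
 * refreshed.  On a miss the route and llentry are resolved and cached in a
 * new entry for later lookups, unless FL_NOAUTO is set or the table is full.
 */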
1113 struct flentry *
1114 flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
1115     struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
1116 {
1117         uint32_t key[9], hash;
1118         struct flentry *fle;
1119         struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1120         uint8_t proto = 0;
1121         int error = 0;
1122         struct rtentry *rt;
1123         struct llentry *lle;
1124         struct route sro, *ro;
1125         struct route_in6 sro6;
1126
1127         sro.ro_rt = sro6.ro_rt = NULL;
1128         sro.ro_lle = sro6.ro_lle = NULL;
1129         ro = NULL;
1130         hash = 0;
1131         flags |= ft->ft_flags;
1132         proto = flags_to_proto(flags);
1133 #ifdef INET
1134         if (ssa->ss_family == AF_INET) {
1135                 struct sockaddr_in *ssin, *dsin;
1136
1137                 ro = &sro;
1138                 memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
1139                 /*
1140                  * The harvested source and destination addresses
1141                  * may contain port information if the packet is 
1142                  * from a transport protocol (e.g. TCP/UDP). The 
1143                  * port field must be cleared before performing 
1144                  * a route lookup.
1145                  */
1146                 ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
1147                 dsin = (struct sockaddr_in *)dsa;
1148                 ssin = (struct sockaddr_in *)ssa;
1149                 if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
1150                     (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1151                     (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
1152                         return (NULL);
1153
1154                 hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
1155         }
1156 #endif
1157 #ifdef INET6
1158         if (ssa->ss_family == AF_INET6) {
1159                 struct sockaddr_in6 *ssin6, *dsin6;
1160
1161                 ro = (struct route *)&sro6;
1162                 memcpy(&sro6.ro_dst, dsa,
1163                     sizeof(struct sockaddr_in6));
1164                 ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
1165                 dsin6 = (struct sockaddr_in6 *)dsa;
1166                 ssin6 = (struct sockaddr_in6 *)ssa;
1167
1168                 flags |= FL_IPV6;
1169                 hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
1170         }
1171 #endif
1172         /*
1173          * Ports are zero and this isn't a transmit cache
1174          * - thus not a protocol for which we need to keep 
1175          * state
1176          * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
1177          */
1178         if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
1179                 return (NULL);
1180
1181         fs->ft_lookups++;
1182         FL_ENTRY_LOCK(ft, hash);
1183         if ((fle = FL_ENTRY(ft, hash)) == NULL) {
1184                 FL_ENTRY_UNLOCK(ft, hash);
1185                 goto uncached;
1186         }
1187 keycheck:       
1188         rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1189         lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1190         if ((rt != NULL)
1191             && fle->f_fhash == hash
1192             && flowtable_key_equal(fle, key)
1193             && (proto == fle->f_proto)
1194             && (fibnum == fle->f_fibnum)
1195             && (rt->rt_flags & RTF_UP)
1196             && (rt->rt_ifp != NULL)) {
1197                 fs->ft_hits++;
1198                 fle->f_uptime = time_uptime;
1199                 fle->f_flags |= flags;
1200                 FL_ENTRY_UNLOCK(ft, hash);
1201                 return (fle);
1202         } else if (fle->f_next != NULL) {
1203                 fle = fle->f_next;
1204                 goto keycheck;
1205         }
1206         FL_ENTRY_UNLOCK(ft, hash);
1207 uncached:
1208         if (flags & FL_NOAUTO || flow_full(ft))
1209                 return (NULL);
1210
1211         fs->ft_misses++;
1212         /*
1213          * This bit of code ends up locking the
1214          * same route 3 times (just like ip_output + ether_output)
1215          * - at lookup
1216          * - in rt_check when called by arpresolve
1217          * - dropping the refcount for the rtentry
1218          *
1219          * This could be consolidated to one if we wrote a variant
1220          * of arpresolve with an rt_check variant that expected to
1221          * receive the route locked
1222          */
1223
1224 #ifdef INVARIANTS
1225         if ((ro->ro_dst.sa_family != AF_INET) &&
1226             (ro->ro_dst.sa_family != AF_INET6))
1227                 panic("sa_family == %d\n", ro->ro_dst.sa_family);
1228 #endif
1229
1230         ft->ft_rtalloc(ro, hash, fibnum);
1231         if (ro->ro_rt == NULL) 
1232                 error = ENETUNREACH;
1233         else {
1234                 struct llentry *lle = NULL;
1235                 struct sockaddr_storage *l3addr;
1236                 struct rtentry *rt = ro->ro_rt;
1237                 struct ifnet *ifp = rt->rt_ifp;
1238
1239                 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
1240                         RTFREE(rt);
1241                         ro->ro_rt = NULL;
1242                         return (NULL);
1243                 }
1244 #ifdef INET6
1245                 if (ssa->ss_family == AF_INET6) {
1246                         struct sockaddr_in6 *dsin6;
1247
1248                         dsin6 = (struct sockaddr_in6 *)dsa;                     
1249                         if (in6_localaddr(&dsin6->sin6_addr)) {
1250                                 RTFREE(rt);
1251                                 ro->ro_rt = NULL;
1252                                 return (NULL);                          
1253                         }
1254
1255                         if (rt->rt_flags & RTF_GATEWAY)
1256                                 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1257                         
1258                         else
1259                                 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1260                         llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp);
1261                 }
1262 #endif  
1263 #ifdef INET
1264                 if (ssa->ss_family == AF_INET) {
1265                         if (rt->rt_flags & RTF_GATEWAY)
1266                                 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1267                         else
1268                                 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1269                         llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);        
1270                 }
1271                         
1272 #endif
1273                 ro->ro_lle = lle;
1274
1275                 if (lle == NULL) {
1276                         RTFREE(rt);
1277                         ro->ro_rt = NULL;
1278                         return (NULL);
1279                 }
1280                 error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
1281
1282                 if (error) {
1283                         RTFREE(rt);
1284                         LLE_FREE(lle);
1285                         ro->ro_rt = NULL;
1286                         ro->ro_lle = NULL;
1287                 }
1288         } 
1289
1290         return ((error) ? NULL : fle);
1291 }
1292
1293 /*
1294  * used by the bit_alloc macro
1295  */
1296 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
1297         
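/*
 * Allocate a flow table with nentry buckets.  FL_PCPU tables get a bucket
 * array and occupancy bitmap per CPU and rely on critical sections for
 * synchronization; global tables share one array guarded by a pool of
 * mutexes.  The new table is linked onto the list walked by the cleaner.
 *
 * Assumed usage (sketch, not taken from this file):
 *      ft = flowtable_alloc("ipv4", 2048, FL_PCPU | FL_HASH_ALL);
 *      fle = flowtable_lookup_mbuf(ft, m, AF_INET);
 */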
1298 struct flowtable *
1299 flowtable_alloc(char *name, int nentry, int flags)
1300 {
1301         struct flowtable *ft, *fttail;
1302         int i;
1303
1304         if (V_flow_hashjitter == 0)
1305                 V_flow_hashjitter = arc4random();
1306
1307         KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
1308
1309         ft = malloc(sizeof(struct flowtable),
1310             M_RTABLE, M_WAITOK | M_ZERO);
1311
1312         ft->ft_name = name;
1313         ft->ft_flags = flags;
1314         ft->ft_size = nentry;
1315 #ifdef RADIX_MPATH
1316         ft->ft_rtalloc = rtalloc_mpath_fib;
1317 #else
1318         ft->ft_rtalloc = in_rtalloc_ign_wrapper;
1319 #endif
1320         if (flags & FL_PCPU) {
1321                 ft->ft_lock = flowtable_pcpu_lock;
1322                 ft->ft_unlock = flowtable_pcpu_unlock;
1323
1324                 for (i = 0; i <= mp_maxid; i++) {
1325                         ft->ft_table.pcpu[i] =
1326                             malloc(nentry*sizeof(struct flentry *),
1327                                 M_RTABLE, M_WAITOK | M_ZERO);
1328                         ft->ft_masks[i] = bit_alloc(nentry);
1329                 }
1330         } else {
1331                 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
1332                     (fls(mp_maxid + 1) << 1));
1333                 
1334                 ft->ft_lock = flowtable_global_lock;
1335                 ft->ft_unlock = flowtable_global_unlock;
1336                 ft->ft_table.global =
1337                             malloc(nentry*sizeof(struct flentry *),
1338                                 M_RTABLE, M_WAITOK | M_ZERO);
1339                 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
1340                                 M_RTABLE, M_WAITOK | M_ZERO);
1341                 for (i = 0; i < ft->ft_lock_count; i++)
1342                         mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
1343
1344                 ft->ft_masks[0] = bit_alloc(nentry);
1345         }
1346         ft->ft_tmpmask = bit_alloc(nentry);
1347
1348         /*
1349          * In the local transmit case the table truly is 
1350          * just a cache - so everything is eligible for
1351          * replacement after 30s of non-use
1352          */
1353         if (flags & FL_HASH_ALL) {
1354                 ft->ft_udp_idle = V_flowtable_udp_expire;
1355                 ft->ft_syn_idle = V_flowtable_syn_expire;
1356                 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
1357                 ft->ft_tcp_idle = V_flowtable_tcp_expire;
1358         } else {
1359                 ft->ft_udp_idle = ft->ft_fin_wait_idle =
1360                     ft->ft_syn_idle = ft->ft_tcp_idle = 30;
1361                 
1362         }
1363
1364         /*
1365          * hook in to the cleaner list
1366          */
1367         if (V_flow_list_head == NULL)
1368                 V_flow_list_head = ft;
1369         else {
1370                 fttail = V_flow_list_head;
1371                 while (fttail->ft_next != NULL)
1372                         fttail = fttail->ft_next;
1373                 fttail->ft_next = ft;
1374         }
1375
1376         return (ft);
1377 }
1378
1379 /*
1380  * The rest of the code is devoted to garbage collection of expired entries.
1381  * It is a new addition made necessary by the switch to dynamically allocating
1382  * flow tables.
1383  * 
1384  */
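/*
 * Drop the references a flow entry holds on its rtentry and llentry and
 * return the entry to its UMA zone.
 */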
1385 static void
1386 fle_free(struct flentry *fle, struct flowtable *ft)
1387 {
1388         struct rtentry *rt;
1389         struct llentry *lle;
1390
1391         rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1392         lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1393         RTFREE(rt);
1394         LLE_FREE(lle);
1395         flow_free(fle, ft);
1396 }
1397
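/*
 * Walk the buckets marked in the occupancy bitmap (via a scratch copy in
 * ft_tmpmask) and unlink entries that are stale, or every entry using rt
 * when rt is non-NULL; the unlinked entries are freed once the bucket
 * locks have been dropped.
 */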
1398 static void
1399 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
1400 {
1401         int curbit = 0, count;
1402         struct flentry *fle,  **flehead, *fleprev;
1403         struct flentry *flefreehead, *flefreetail, *fletmp;
1404         bitstr_t *mask, *tmpmask;
1405         struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1406
1407         flefreehead = flefreetail = NULL;
1408         mask = flowtable_mask(ft);
1409         tmpmask = ft->ft_tmpmask;
1410         memcpy(tmpmask, mask, ft->ft_size/8);
1411         /*
1412          * XXX Note to self, bit_ffs operates at the byte level
1413          * and thus adds gratuitous overhead
1414          */
1415         bit_ffs(tmpmask, ft->ft_size, &curbit);
1416         while (curbit != -1) {
1417                 if (curbit >= ft->ft_size || curbit < -1) {
1418                         log(LOG_ALERT,
1419                             "warning: bad curbit value %d \n",
1420                             curbit);
1421                         break;
1422                 }
1423
1424                 FL_ENTRY_LOCK(ft, curbit);
1425                 flehead = flowtable_entry(ft, curbit);
1426                 fle = fleprev = *flehead;
1427
1428                 fs->ft_free_checks++;
1429 #ifdef DIAGNOSTIC
1430                 if (fle == NULL && curbit > 0) {
1431                         log(LOG_ALERT,
1432                             "warning bit=%d set, but no fle found\n",
1433                             curbit);
1434                 }
1435 #endif          
1436                 while (fle != NULL) {
1437                         if (rt != NULL) {
1438                                 if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
1439                                         fleprev = fle;
1440                                         fle = fle->f_next;
1441                                         continue;
1442                                 }
1443                         } else if (!flow_stale(ft, fle)) {
1444                                 fleprev = fle;
1445                                 fle = fle->f_next;
1446                                 continue;
1447                         }
1448                         /*
1449                          * delete head of the list
1450                          */
1451                         if (fleprev == *flehead) {
1452                                 fletmp = fleprev;
1453                                 if (fle == fleprev) {
1454                                         fleprev = *flehead = fle->f_next;
1455                                 } else
1456                                         fleprev = *flehead = fle;
1457                                 fle = fle->f_next;
1458                         } else {
1459                                 /*
1460                                  * don't advance fleprev
1461                                  */
1462                                 fletmp = fle;
1463                                 fleprev->f_next = fle->f_next;
1464                                 fle = fleprev->f_next;
1465                         }
1466
1467                         if (flefreehead == NULL)
1468                                 flefreehead = flefreetail = fletmp;
1469                         else {
1470                                 flefreetail->f_next = fletmp;
1471                                 flefreetail = fletmp;
1472                         }
1473                         fletmp->f_next = NULL;
1474                 }
1475                 if (*flehead == NULL)
1476                         bit_clear(mask, curbit);
1477                 FL_ENTRY_UNLOCK(ft, curbit);
1478                 bit_clear(tmpmask, curbit);
1479                 bit_ffs(tmpmask, ft->ft_size, &curbit);
1480         }
1481         count = 0;
1482         while ((fle = flefreehead) != NULL) {
1483                 flefreehead = fle->f_next;
1484                 count++;
1485                 fs->ft_frees++;
1486                 fle_free(fle, ft);
1487         }
1488         if (V_flowtable_debug && count)
1489                 log(LOG_DEBUG, "freed %d flow entries\n", count);
1490 }
1491
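/*
 * Flush every cached flow that references the given route; for per-CPU
 * tables the scan is run bound to each CPU in turn.
 */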
1492 void
1493 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1494 {
1495         int i;
1496
1497         if (ft->ft_flags & FL_PCPU) {
1498                 for (i = 0; i <= mp_maxid; i++) {
1499                         if (CPU_ABSENT(i))
1500                                 continue;
1501                         
1502                         if (smp_started == 1) {
1503                                 thread_lock(curthread);
1504                                 sched_bind(curthread, i);
1505                                 thread_unlock(curthread);
1506                         }
1507
1508                         flowtable_free_stale(ft, rt);
1509
1510                         if (smp_started == 1) {
1511                                 thread_lock(curthread);
1512                                 sched_unbind(curthread);
1513                                 thread_unlock(curthread);
1514                         }
1515                 }
1516         } else {
1517                 flowtable_free_stale(ft, rt);
1518         }
1519 }
1520
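/*
 * Walk every flowtable in the current vnet and free stale entries,
 * binding to each CPU for per-CPU tables as flowtable_route_flush() does.
 */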
1521 static void
1522 flowtable_clean_vnet(void)
1523 {
1524         struct flowtable *ft;
1525         int i;
1526
1527         ft = V_flow_list_head;
1528         while (ft != NULL) {
1529                 if (ft->ft_flags & FL_PCPU) {
1530                         for (i = 0; i <= mp_maxid; i++) {
1531                                 if (CPU_ABSENT(i))
1532                                         continue;
1533
1534                                 if (smp_started == 1) {
1535                                         thread_lock(curthread);
1536                                         sched_bind(curthread, i);
1537                                         thread_unlock(curthread);
1538                                 }
1539
1540                                 flowtable_free_stale(ft, NULL);
1541
1542                                 if (smp_started == 1) {
1543                                         thread_lock(curthread);
1544                                         sched_unbind(curthread);
1545                                         thread_unlock(curthread);
1546                                 }
1547                         }
1548                 } else {
1549                         flowtable_free_stale(ft, NULL);
1550                 }
1551                 ft = ft->ft_next;
1552         }
1553 }
1554
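/*
 * Body of the flowcleaner kernel process: periodically clean the
 * flowtables of every vnet and wake any threads sleeping in
 * flowtable_flush() once a cleaning cycle completes.
 */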
1555 static void
1556 flowtable_cleaner(void)
1557 {
1558         VNET_ITERATOR_DECL(vnet_iter);
1559
1560         if (bootverbose)
1561                 log(LOG_INFO, "flowtable cleaner started\n");
1562         while (1) {
1563                 VNET_LIST_RLOCK();
1564                 VNET_FOREACH(vnet_iter) {
1565                         CURVNET_SET(vnet_iter);
1566                         flowtable_clean_vnet();
1567                         CURVNET_RESTORE();
1568                 }
1569                 VNET_LIST_RUNLOCK();
1570
1571                 /*
1572                  * The interval between cleaning checks
1573                  * (flowclean_freq, 20 seconds by default) is arbitrary
1574                  */
1575                 mtx_lock(&flowclean_lock);
1576                 flowclean_cycles++;
1577                 cv_broadcast(&flowclean_f_cv);
1578                 cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
1579                 mtx_unlock(&flowclean_lock);
1580         }
1581 }
1582
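/*
 * Kick the cleaner and wait until at least one full cleaning cycle has
 * completed.  Registered below as an ifnet departure event handler.
 */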
1583 static void
1584 flowtable_flush(void *unused __unused)
1585 {
1586         uint64_t start;
1587
1588         mtx_lock(&flowclean_lock);
1589         start = flowclean_cycles;
1590         while (start == flowclean_cycles) {
1591                 cv_broadcast(&flowclean_c_cv);
1592                 cv_wait(&flowclean_f_cv, &flowclean_lock);
1593         }
1594         mtx_unlock(&flowclean_lock);
1595 }
1596
1597 static struct kproc_desc flow_kp = {
1598         "flowcleaner",
1599         flowtable_cleaner,
1600         &flowcleanerproc
1601 };
1602 SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1603
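/*
 * Per-vnet initialization: size the flow-entry limit from maxusers and
 * the CPU count, create the IPv4 and IPv6 flow-entry UMA zones, cap them
 * at that limit and mark the flowtable ready for use.
 */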
1604 static void
1605 flowtable_init_vnet(const void *unused __unused)
1606 {
1607
1608         V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
1609         V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
1610             NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1611         V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
1612             NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);    
1613         uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
1614         uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
1615         V_flowtable_ready = 1;
1616 }
1617 VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
1618     flowtable_init_vnet, NULL);
1619
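/*
 * Global initialization: set up the cleaner's condition variables and
 * lock, register flowtable_flush() for interface departure events and
 * set the default cleaning frequency (20 seconds).
 */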
1620 static void
1621 flowtable_init(const void *unused __unused)
1622 {
1623
1624         cv_init(&flowclean_c_cv, "c_flowcleanwait");
1625         cv_init(&flowclean_f_cv, "f_flowcleanwait");
1626         mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
1627         EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
1628             EVENTHANDLER_PRI_ANY);
1629         flowclean_freq = 20*hz;
1630 }
1631 SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
1632     flowtable_init, NULL);
1633
1634
1635 #ifdef VIMAGE
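/*
 * Per-vnet teardown: mark the flowtable unavailable and destroy the
 * flow-entry UMA zones.
 */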
1636 static void
1637 flowtable_uninit(const void *unused __unused)
1638 {
1639
1640         V_flowtable_ready = 0;
1641         uma_zdestroy(V_flow_ipv4_zone);
1642         uma_zdestroy(V_flow_ipv6_zone);
1643 }
1644
1645 VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1646     flowtable_uninit, NULL);
1647 #endif
1648
1649 #ifdef DDB
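/*
 * DDB support: helpers for the "show flowtables" command defined at the
 * end of this file.
 */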
1650 static uint32_t *
1651 flowtable_get_hashkey(struct flentry *fle)
1652 {
1653         uint32_t *hashkey;
1654
1655         if (fle->f_flags & FL_IPV6)
1656                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1657         else
1658                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1659
1660         return (hashkey);
1661 }
1662
1663 static bitstr_t *
1664 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1665 {
1666         bitstr_t *mask;
1667
1668         if (ft->ft_flags & FL_PCPU)
1669                 mask = ft->ft_masks[cpuid];
1670         else
1671                 mask = ft->ft_masks[0];
1672
1673         return (mask);
1674 }
1675
1676 static struct flentry **
1677 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1678 {
1679         struct flentry **fle;
1680         int index = (hash % ft->ft_size);
1681
1682         if (ft->ft_flags & FL_PCPU) {
1683                 fle = &ft->ft_table.pcpu[cpuid][index];
1684         } else {
1685                 fle = &ft->ft_table.global[index];
1686         }
1687         
1688         return (fle);
1689 }
1690
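/*
 * Print one flow entry: addresses and ports (IPv4 only), flag bits, the
 * raw hash key and the cached route.
 */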
1691 static void
1692 flow_show(struct flowtable *ft, struct flentry *fle)
1693 {
1694         int idle_time;
1695         int rt_valid, ifp_valid;
1696         uint16_t sport, dport;
1697         uint32_t *hashkey;
1698         char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
1699         volatile struct rtentry *rt;
1700         struct ifnet *ifp = NULL;
1701
1702         idle_time = (int)(time_uptime - fle->f_uptime);
1703         rt = fle->f_rt;
1704         rt_valid = rt != NULL;
1705         if (rt_valid) 
1706                 ifp = rt->rt_ifp;
1707         ifp_valid = ifp != NULL;
1708         hashkey = flowtable_get_hashkey(fle);
1709         if (fle->f_flags & FL_IPV6)
1710                 goto skipaddr;
1711
1712         inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
1713         if (ft->ft_flags & FL_HASH_ALL) {
1714                 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);            
1715                 sport = ntohs(((uint16_t *)hashkey)[0]);
1716                 dport = ntohs(((uint16_t *)hashkey)[1]);
1717                 db_printf("%s:%d->%s:%d",
1718                     saddr, sport, daddr,
1719                     dport);
1720         } else 
1721                 db_printf("%s ", daddr);
1722     
1723 skipaddr:
1724         if (fle->f_flags & FL_STALE)
1725                 db_printf(" FL_STALE ");
1726         if (fle->f_flags & FL_TCP)
1727                 db_printf(" FL_TCP ");
1728         if (fle->f_flags & FL_UDP)
1729                 db_printf(" FL_UDP ");
1730         if (rt_valid) {
1731                 if (rt->rt_flags & RTF_UP)
1732                         db_printf(" RTF_UP ");
1733         }
1734         if (ifp_valid) {
1735                 if (ifp->if_flags & IFF_LOOPBACK)
1736                         db_printf(" IFF_LOOPBACK ");
1737                 if (ifp->if_flags & IFF_UP)
1738                         db_printf(" IFF_UP ");          
1739                 if (ifp->if_flags & IFF_POINTOPOINT)
1740                         db_printf(" IFF_POINTOPOINT ");         
1741         }
1742         if (fle->f_flags & FL_IPV6)
1743                 db_printf("\n\tkey=%08x:%08x:%08x:%08x:%08x:%08x:%08x:%08x:%08x",
1744                     hashkey[0], hashkey[1], hashkey[2],
1745                     hashkey[3], hashkey[4], hashkey[5],
1746                     hashkey[6], hashkey[7], hashkey[8]);
1747         else
1748                 db_printf("\n\tkey=%08x:%08x:%08x ",
1749                     hashkey[0], hashkey[1], hashkey[2]);
1750         db_printf("hash=%08x idle_time=%03d"
1751             "\n\tfibnum=%02d rt=%p",
1752             fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
1753         db_printf("\n");
1754 }
1755
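/*
 * Dump every entry of one CPU's hash table (or of the global table when
 * cpuid is -1), using the occupancy bitmask to skip empty buckets.
 */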
1756 static void
1757 flowtable_show(struct flowtable *ft, int cpuid)
1758 {
1759         int curbit = 0;
1760         struct flentry *fle,  **flehead;
1761         bitstr_t *mask, *tmpmask;
1762
1763         if (cpuid != -1)
1764                 db_printf("cpu: %d\n", cpuid);
1765         mask = flowtable_mask_pcpu(ft, cpuid);
1766         tmpmask = ft->ft_tmpmask;
1767         memcpy(tmpmask, mask, ft->ft_size/8);
1768         /*
1769          * XXX Note to self, bit_ffs operates at the byte level
1770          * and thus adds gratuitous overhead
1771          */
1772         bit_ffs(tmpmask, ft->ft_size, &curbit);
1773         while (curbit != -1) {
1774                 if (curbit >= ft->ft_size || curbit < -1) {
1775                         db_printf("warning: bad curbit value %d \n",
1776                             curbit);
1777                         break;
1778                 }
1779
1780                 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1781                 fle = *flehead;
1782
1783                 while (fle != NULL) {
1784                         flow_show(ft, fle);
1785                         fle = fle->f_next;
1786                 }
1788                 bit_clear(tmpmask, curbit);
1789                 bit_ffs(tmpmask, ft->ft_size, &curbit);
1790         }
1791 }
1792
1793 static void
1794 flowtable_show_vnet(void)
1795 {
1796         struct flowtable *ft;
1797         int i;
1798
1799         ft = V_flow_list_head;
1800         while (ft != NULL) {
1801                 db_printf("name: %s\n", ft->ft_name);
1802                 if (ft->ft_flags & FL_PCPU) {
1803                         for (i = 0; i <= mp_maxid; i++) {
1804                                 if (CPU_ABSENT(i))
1805                                         continue;
1806                                 flowtable_show(ft, i);
1807                         }
1808                 } else {
1809                         flowtable_show(ft, -1);
1810                 }
1811                 ft = ft->ft_next;
1812         }
1813 }
1814
1815 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1816 {
1817         VNET_ITERATOR_DECL(vnet_iter);
1818
1819         VNET_FOREACH(vnet_iter) {
1820                 CURVNET_SET(vnet_iter);
1821 #ifdef VIMAGE
1822                 db_printf("vnet %p\n", vnet_iter);
1823 #endif
1824                 flowtable_show_vnet();
1825                 CURVNET_RESTORE();
1826         }
1827 }
1828 #endif