]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/net/flowtable.c
o Revamp API between flowtable and netinet, netinet6.
[FreeBSD/FreeBSD.git] / sys / net / flowtable.c
1 /**************************************************************************
2
3 Copyright (c) 2008-2010, BitGravity Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11
12  2. Neither the name of the BitGravity Corporation nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include "opt_route.h"
31 #include "opt_mpath.h"
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38
39 #include <sys/param.h>
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/hash.h>
45 #include <sys/kernel.h>
46 #include <sys/kthread.h>
47 #include <sys/limits.h>
48 #include <sys/malloc.h>
49 #include <sys/mbuf.h>
50 #include <sys/proc.h>
51 #include <sys/sbuf.h>
52 #include <sys/sched.h>
53 #include <sys/smp.h>
54 #include <sys/socket.h>
55 #include <sys/syslog.h>
56 #include <sys/sysctl.h>
57
58 #include <net/if.h>
59 #include <net/if_llatbl.h>
60 #include <net/if_var.h>
61 #include <net/route.h>
62 #include <net/flowtable.h>
63 #include <net/vnet.h>
64
65 #include <netinet/in.h>
66 #include <netinet/in_systm.h>
67 #include <netinet/in_var.h>
68 #include <netinet/if_ether.h>
69 #include <netinet/ip.h>
70 #ifdef INET6
71 #include <netinet/ip6.h>
72 #endif
73 #include <netinet/tcp.h>
74 #include <netinet/udp.h>
75 #include <netinet/sctp.h>
76
77 #include <ddb/ddb.h>
78
/*
 * Ports and addresses describing one IPv4 flow.  union ipv4_flow overlays
 * this structure with an array of 32-bit words, so the member order and
 * sizes are part of the key layout.
 */
struct ipv4_tuple {
	uint16_t	ip_sport;	/* source port */
	uint16_t	ip_dport;	/* destination port */
	in_addr_t	ip_saddr;	/* source address */
	in_addr_t	ip_daddr;	/* destination address */
};
85
86 union ipv4_flow {
87         struct ipv4_tuple ipf_ipt;
88         uint32_t        ipf_key[3];
89 };
90
/*
 * Ports and addresses describing one IPv6 flow.  union ipv6_flow overlays
 * this structure with an array of 32-bit words, so the member order and
 * sizes are part of the key layout.
 */
struct ipv6_tuple {
	uint16_t	ip_sport;	/* source port */
	uint16_t	ip_dport;	/* destination port */
	struct in6_addr	ip_saddr;	/* source address */
	struct in6_addr	ip_daddr;	/* destination address */
};
97
98 union ipv6_flow {
99         struct ipv6_tuple ipf_ipt;
100         uint32_t        ipf_key[9];
101 };
102
/*
 * Address-family independent header of a cached flow.  The v4/v6 entry
 * structures embed it as their first member and append the flow key.
 */
struct flentry {
	volatile uint32_t	f_fhash;	/* flow hash; 0 is treated as stale */
	uint16_t		f_flags;	/* flow flags */
	uint8_t			f_pad;		/* explicit padding */
	uint8_t			f_proto;	/* IP protocol number */
	uint32_t		f_fibnum;	/* fib index */
	uint32_t		f_uptime;	/* time_uptime at last access */
	struct flentry		*f_next;	/* next entry in collision chain */
	volatile struct rtentry	*f_rt;		/* cached route for the flow */
	volatile struct llentry	*f_lle;		/* cached link-layer entry */
};
114
115 struct flentry_v4 {
116         struct flentry  fl_entry;
117         union ipv4_flow fl_flow;
118 };
119
120 struct flentry_v6 {
121         struct flentry  fl_entry;
122         union ipv6_flow fl_flow;
123 };
124
/*
 * Shorthand for reaching the common header members through a
 * struct flentry_v4 / struct flentry_v6.  The members of struct flentry
 * carry an "f_" prefix, so the expansions must name f_fhash, f_flags, ...;
 * the previous "fl_entry.fl_*" spellings named members that do not exist.
 */
#define	fl_fhash	fl_entry.f_fhash
#define	fl_flags	fl_entry.f_flags
#define	fl_proto	fl_entry.f_proto
#define	fl_uptime	fl_entry.f_uptime
#define	fl_rt		fl_entry.f_rt
#define	fl_lle		fl_entry.f_lle
131
/* Time unit helpers, in seconds. */
#define	SECS_PER_HOUR		3600
#define	SECS_PER_DAY		(24 * SECS_PER_HOUR)

/* Default idle limits, in seconds, used to seed the expiry tunables. */
#define	SYN_IDLE		300
#define	UDP_IDLE		300
#define	FIN_WAIT_IDLE		600
#define	TCP_IDLE		SECS_PER_DAY
139
140
141 typedef void fl_lock_t(struct flowtable *, uint32_t);
142 typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
143
144 union flentryp {
145         struct flentry          **global;
146         struct flentry          **pcpu[MAXCPU];
147 };
148
149 struct flowtable {
150         counter_u64_t   *ft_stat;
151         uma_zone_t      ft_zone;
152         int             ft_size;
153         int             ft_lock_count;
154         uint32_t        ft_flags;
155         uint32_t        ft_max_depth;
156         fl_lock_t       *ft_lock;
157         fl_lock_t       *ft_unlock;
158         fl_rtalloc_t    *ft_rtalloc;
159         /*
160          * XXX need to pad out
161          */
162         struct mtx      *ft_locks;
163         union flentryp  ft_table;
164         bitstr_t        *ft_masks[MAXCPU];
165         bitstr_t        *ft_tmpmask;
166
167         uint32_t        ft_udp_idle __aligned(CACHE_LINE_SIZE);
168         uint32_t        ft_fin_wait_idle;
169         uint32_t        ft_syn_idle;
170         uint32_t        ft_tcp_idle;
171         boolean_t       ft_full;
172 } __aligned(CACHE_LINE_SIZE);
173
/*
 * Bump a per-table statistic.  'name' is a member of struct flowtable_stat;
 * its byte offset divided by sizeof(uint64_t) selects the counter.
 */
#define	FLOWSTAT_ADD(ft, name, v)					\
	counter_u64_add((ft)->ft_stat[					\
	    offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
#define	FLOWSTAT_INC(ft, name)	FLOWSTAT_ADD(ft, name, 1)
177
178 static struct proc *flowcleanerproc;
179 static uint32_t flow_hashjitter;
180
181 static struct cv        flowclean_f_cv;
182 static struct cv        flowclean_c_cv;
183 static struct mtx       flowclean_lock;
184 static uint32_t         flowclean_cycles;
185 static uint32_t         flowclean_freq;
186
187 /*
188  * TODO:
189  * - add sysctls to resize && flush flow tables
190  * - Add per flowtable sysctls for statistics and configuring timeouts
191  * - add saturation counter to rtentry to support per-packet load-balancing
192  *   add flag to indicate round-robin flow, add list lookup from head
193      for flows
194  * - add sysctl / device node / syscall to support exporting and importing
195  *   of flows with flag to indicate that a flow was imported so should
196  *   not be considered for auto-cleaning
197  * - support explicit connection state (currently only ad-hoc for DSR)
198  * - idetach() cleanup for options VIMAGE builds.
199  */
#ifdef INET
/* Per-vnet IPv4 flow table. */
static VNET_DEFINE(struct flowtable, ip4_ft);
#define	V_ip4_ft	VNET(ip4_ft)
/* NOTE(review): presumably the zone backing IPv4 flow entries - confirm. */
static uma_zone_t	flow_ipv4_zone;
#endif
#ifdef INET6
/* Per-vnet IPv6 flow table. */
static VNET_DEFINE(struct flowtable, ip6_ft);
#define	V_ip6_ft	VNET(ip6_ft)
/* NOTE(review): presumably the zone backing IPv6 flow entries - confirm. */
static uma_zone_t	flow_ipv6_zone;
#endif
210
211 static VNET_DEFINE(int, flowtable_enable) = 1;
212 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
213 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
214 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
215 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
216
217 #define V_flowtable_enable              VNET(flowtable_enable)
218 #define V_flowtable_syn_expire          VNET(flowtable_syn_expire)
219 #define V_flowtable_udp_expire          VNET(flowtable_udp_expire)
220 #define V_flowtable_fin_wait_expire     VNET(flowtable_fin_wait_expire)
221 #define V_flowtable_tcp_expire          VNET(flowtable_tcp_expire)
222
223 static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
224     "flowtable");
225 SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
226     &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
227
228 /*
229  * XXX This does not end up updating timeouts at runtime
230  * and only reflects the value for the last table added :-/
231  */
232 SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
233     &VNET_NAME(flowtable_syn_expire), 0,
234     "seconds after which to remove syn allocated flow.");
235 SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
236     &VNET_NAME(flowtable_udp_expire), 0,
237     "seconds after which to remove flow allocated to UDP.");
238 SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
239     &VNET_NAME(flowtable_fin_wait_expire), 0,
240     "seconds after which to remove a flow in FIN_WAIT.");
241 SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
242     &VNET_NAME(flowtable_tcp_expire), 0,
243     "seconds after which to remove flow allocated to a TCP connection.");
244
245 #ifndef RADIX_MPATH
246 static void
247 rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
248 {
249
250         rtalloc_ign_fib(ro, 0, fibnum);
251 }
252 #endif
253
254 static void
255 flowtable_global_lock(struct flowtable *table, uint32_t hash)
256 {       
257         int lock_index = (hash)&(table->ft_lock_count - 1);
258
259         mtx_lock(&table->ft_locks[lock_index]);
260 }
261
262 static void
263 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
264 {       
265         int lock_index = (hash)&(table->ft_lock_count - 1);
266
267         mtx_unlock(&table->ft_locks[lock_index]);
268 }
269
/*
 * "Lock" hook for per-CPU tables: no mutex is taken, the caller simply
 * enters a critical section.  Both arguments are unused.
 */
static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

	(void)table;
	(void)hash;
	critical_enter();
}
276
/*
 * "Unlock" hook for per-CPU tables: leaves the critical section entered
 * by flowtable_pcpu_lock().  Both arguments are unused.
 */
static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

	(void)table;
	(void)hash;
	critical_exit();
}
283
/* Bucket index of a hash within a table. */
#define	FL_ENTRY_INDEX(table, hash)	((hash) % (table)->ft_size)
/* Head of the collision chain for a hash (an lvalue). */
#define	FL_ENTRY(table, hash)		(*flowtable_entry((table), (hash)))
/* Acquire/release the bucket through the table's locking hooks. */
#define	FL_ENTRY_LOCK(table, hash)	((table)->ft_lock((table), (hash)))
#define	FL_ENTRY_UNLOCK(table, hash)	((table)->ft_unlock((table), (hash)))

/* Flow flags private to this file. */
#define	FL_STALE	(1 << 8)
#define	FL_OVERWRITE	(1 << 10)
291
/* Forward declaration; the per-family lookup helpers call it. */
static struct flentry *flowtable_lookup_common(struct flowtable *ft,
    struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
    struct mbuf *m, int flags);
294
295 static __inline int
296 proto_to_flags(uint8_t proto)
297 {
298         int flag;
299
300         switch (proto) {
301         case IPPROTO_TCP:
302                 flag = FL_TCP;
303                 break;
304         case IPPROTO_SCTP:
305                 flag = FL_SCTP;
306                 break;          
307         case IPPROTO_UDP:
308                 flag = FL_UDP;
309                 break;
310         default:
311                 flag = 0;
312                 break;
313         }
314
315         return (flag);
316 }
317
318 static __inline int
319 flags_to_proto(int flags)
320 {
321         int proto, protoflags;
322
323         protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
324         switch (protoflags) {
325         case FL_TCP:
326                 proto = IPPROTO_TCP;
327                 break;
328         case FL_SCTP:
329                 proto = IPPROTO_SCTP;
330                 break;
331         case FL_UDP:
332                 proto = IPPROTO_UDP;
333                 break;
334         default:
335                 proto = 0;
336                 break;
337         }
338         return (proto);
339 }
340
341 #ifdef INET
#ifdef FLOWTABLE_DEBUG
/*
 * Debug aid: print the protocol and the destination address of a flow,
 * plus the source address and both ports when FL_HASH_ALL is set.
 */
static void
ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
    struct sockaddr_in *dsin)
{
	/* Room for four "123" octets, each with a separator/terminator. */
	char src[4 * sizeof("123")], dst[4 * sizeof("123")];

	inet_ntoa_r(dsin->sin_addr, dst);
	if ((flags & FL_HASH_ALL) == 0) {
		printf("proto=%d %s\n", proto, dst);
		return;
	}

	inet_ntoa_r(ssin->sin_addr, src);
	printf("proto=%d %s:%d->%s:%d\n",
	    proto, src, ntohs(ssin->sin_port), dst,
	    ntohs(dsin->sin_port));
}
#endif
362
363 static int
364 ipv4_mbuf_demarshal(struct mbuf *m, struct sockaddr_in *ssin,
365     struct sockaddr_in *dsin, uint16_t *flags)
366 {
367         struct ip *ip;
368         uint8_t proto;
369         int iphlen;
370         struct tcphdr *th;
371         struct udphdr *uh;
372         struct sctphdr *sh;
373         uint16_t sport, dport;
374
375         proto = sport = dport = 0;
376         ip = mtod(m, struct ip *);
377         dsin->sin_family = AF_INET;
378         dsin->sin_len = sizeof(*dsin);
379         dsin->sin_addr = ip->ip_dst;
380         ssin->sin_family = AF_INET;
381         ssin->sin_len = sizeof(*ssin);
382         ssin->sin_addr = ip->ip_src;    
383
384         proto = ip->ip_p;
385         if ((*flags & FL_HASH_ALL) == 0)
386                 goto skipports;
387
388         iphlen = ip->ip_hl << 2; /* XXX options? */
389
390         switch (proto) {
391         case IPPROTO_TCP:
392                 th = (struct tcphdr *)((caddr_t)ip + iphlen);
393                 sport = th->th_sport;
394                 dport = th->th_dport;
395                 if ((*flags & FL_HASH_ALL) &&
396                     (th->th_flags & (TH_RST|TH_FIN)))
397                         *flags |= FL_STALE;
398                 break;
399         case IPPROTO_UDP:
400                 uh = (struct udphdr *)((caddr_t)ip + iphlen);
401                 sport = uh->uh_sport;
402                 dport = uh->uh_dport;
403                 break;
404         case IPPROTO_SCTP:
405                 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
406                 sport = sh->src_port;
407                 dport = sh->dest_port;
408                 break;
409         default:
410                 return (ENOTSUP);
411                 /* no port - hence not a protocol we care about */
412                 break;
413         
414         }
415
416 skipports:
417         *flags |= proto_to_flags(proto);
418         ssin->sin_port = sport;
419         dsin->sin_port = dport;
420         return (0);
421 }
422
423 static uint32_t
424 ipv4_flow_lookup_hash(
425         struct sockaddr_in *ssin, struct sockaddr_in *dsin,
426             uint32_t *key, uint16_t flags)
427 {
428         uint16_t sport, dport;
429         uint8_t proto;
430         int offset = 0;
431
432         proto = flags_to_proto(flags);
433         sport = dport = key[2] = key[1] = key[0] = 0;
434         if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
435                 key[1] = ssin->sin_addr.s_addr;
436                 sport = ssin->sin_port;
437         }
438         if (dsin != NULL) {
439                 key[2] = dsin->sin_addr.s_addr;
440                 dport = dsin->sin_port;
441         }
442         if (flags & FL_HASH_ALL) {
443                 ((uint16_t *)key)[0] = sport;
444                 ((uint16_t *)key)[1] = dport;
445         } else
446                 offset = flow_hashjitter + proto;
447
448         return (jenkins_hash32(key, 3, offset));
449 }
450
451 static struct flentry *
452 flowtable_lookup_ipv4(struct mbuf *m)
453 {
454         struct sockaddr_storage ssa, dsa;
455         uint16_t flags;
456         struct sockaddr_in *dsin, *ssin;
457
458         dsin = (struct sockaddr_in *)&dsa;
459         ssin = (struct sockaddr_in *)&ssa;
460         bzero(dsin, sizeof(*dsin));
461         bzero(ssin, sizeof(*ssin));
462         flags = V_ip4_ft.ft_flags;
463         if (ipv4_mbuf_demarshal(m, ssin, dsin, &flags) != 0)
464                 return (NULL);
465
466         return (flowtable_lookup_common(&V_ip4_ft, &ssa, &dsa, m, flags));
467 }
468
469 void
470 flow_to_route(struct flentry *fle, struct route *ro)
471 {
472         uint32_t *hashkey = NULL;
473         struct sockaddr_in *sin;
474
475         sin = (struct sockaddr_in *)&ro->ro_dst;
476         sin->sin_family = AF_INET;
477         sin->sin_len = sizeof(*sin);
478         hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
479         sin->sin_addr.s_addr = hashkey[2];
480         ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
481         ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
482         ro->ro_flags |= RT_NORTREF;
483 }
484 #endif /* INET */
485
486 #ifdef INET6
/*
 * PULLUP_TO(len, p, T) checks that the first mbuf of the local variable
 * 'm' holds at least len + sizeof(T) bytes and then points p at offset
 * "len" in it.  No data is actually pulled up: when the mbuf is too
 * short the macro jumps to the caller's 'receive_failed' label, which
 * the enclosing function must therefore provide.
 */
#define	PULLUP_TO(_len, p, T)						\
do {									\
	int pullup_end = (_len) + sizeof(T);				\
	if (pullup_end > (m)->m_len)					\
		goto receive_failed;					\
	p = (mtod(m, char *) + (_len));					\
} while (0)

/* View an upper-layer pointer as a specific transport header. */
#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))
505
506 static int
507 ipv6_mbuf_demarshal(struct mbuf *m, struct sockaddr_in6 *ssin6,
508     struct sockaddr_in6 *dsin6, uint16_t *flags)
509 {
510         struct ip6_hdr *ip6;
511         uint8_t proto;
512         int hlen;
513         uint16_t src_port, dst_port;
514         u_short offset;
515         void *ulp;
516
517         offset = hlen = src_port = dst_port = 0;
518         ulp = NULL;
519         ip6 = mtod(m, struct ip6_hdr *);
520         hlen = sizeof(struct ip6_hdr);
521         proto = ip6->ip6_nxt;
522
523         if ((*flags & FL_HASH_ALL) == 0)
524                 goto skipports;
525
526         while (ulp == NULL) {
527                 switch (proto) {
528                 case IPPROTO_ICMPV6:
529                 case IPPROTO_OSPFIGP:
530                 case IPPROTO_PIM:
531                 case IPPROTO_CARP:
532                 case IPPROTO_ESP:
533                 case IPPROTO_NONE:
534                         ulp = ip6;
535                         break;
536                 case IPPROTO_TCP:
537                         PULLUP_TO(hlen, ulp, struct tcphdr);
538                         dst_port = TCP(ulp)->th_dport;
539                         src_port = TCP(ulp)->th_sport;
540                         if ((*flags & FL_HASH_ALL) &&
541                             (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
542                                 *flags |= FL_STALE;
543                         break;
544                 case IPPROTO_SCTP:
545                         PULLUP_TO(hlen, ulp, struct sctphdr);
546                         src_port = SCTP(ulp)->src_port;
547                         dst_port = SCTP(ulp)->dest_port;
548                         break;
549                 case IPPROTO_UDP:
550                         PULLUP_TO(hlen, ulp, struct udphdr);
551                         dst_port = UDP(ulp)->uh_dport;
552                         src_port = UDP(ulp)->uh_sport;
553                         break;
554                 case IPPROTO_HOPOPTS:   /* RFC 2460 */
555                         PULLUP_TO(hlen, ulp, struct ip6_hbh);
556                         hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
557                         proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
558                         ulp = NULL;
559                         break;
560                 case IPPROTO_ROUTING:   /* RFC 2460 */
561                         PULLUP_TO(hlen, ulp, struct ip6_rthdr); 
562                         hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
563                         proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
564                         ulp = NULL;
565                         break;
566                 case IPPROTO_FRAGMENT:  /* RFC 2460 */
567                         PULLUP_TO(hlen, ulp, struct ip6_frag);
568                         hlen += sizeof (struct ip6_frag);
569                         proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
570                         offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
571                             IP6F_OFF_MASK;
572                         ulp = NULL;
573                         break;
574                 case IPPROTO_DSTOPTS:   /* RFC 2460 */
575                         PULLUP_TO(hlen, ulp, struct ip6_hbh);
576                         hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
577                         proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
578                         ulp = NULL;
579                         break;
580                 case IPPROTO_AH:        /* RFC 2402 */
581                         PULLUP_TO(hlen, ulp, struct ip6_ext);
582                         hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
583                         proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
584                         ulp = NULL;
585                         break;
586                 default:
587                         PULLUP_TO(hlen, ulp, struct ip6_ext);
588                         break;
589                 }
590         }
591
592         if (src_port == 0) {
593         receive_failed:
594                 return (ENOTSUP);
595         }
596
597 skipports:
598         dsin6->sin6_family = AF_INET6;
599         dsin6->sin6_len = sizeof(*dsin6);
600         dsin6->sin6_port = dst_port;
601         memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
602
603         ssin6->sin6_family = AF_INET6;
604         ssin6->sin6_len = sizeof(*ssin6);
605         ssin6->sin6_port = src_port;
606         memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
607         *flags |= proto_to_flags(proto);
608
609         return (0);
610 }
611
/* Clear all nine 32-bit words of an IPv6 flow key. */
#define	zero_key(key)							\
do {									\
	int zk_word;							\
									\
	for (zk_word = 0; zk_word < 9; zk_word++)			\
		key[zk_word] = 0;					\
} while (0)
624         
625 static uint32_t
626 ipv6_flow_lookup_hash(
627         struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
628             uint32_t *key, uint16_t flags)
629 {
630         uint16_t sport, dport;
631         uint8_t proto;
632         int offset = 0;
633
634         proto = flags_to_proto(flags);
635         zero_key(key);
636         sport = dport = 0;
637         if (dsin6 != NULL) {
638                 memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
639                 dport = dsin6->sin6_port;
640         }
641         if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
642                 memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
643                 sport = ssin6->sin6_port;
644         }
645         if (flags & FL_HASH_ALL) {
646                 ((uint16_t *)key)[0] = sport;
647                 ((uint16_t *)key)[1] = dport;
648         } else
649                 offset = flow_hashjitter + proto;
650
651         return (jenkins_hash32(key, 9, offset));
652 }
653
654 static struct flentry *
655 flowtable_lookup_ipv6(struct mbuf *m)
656 {
657         struct sockaddr_storage ssa, dsa;
658         struct sockaddr_in6 *dsin6, *ssin6;     
659         uint16_t flags;
660
661         dsin6 = (struct sockaddr_in6 *)&dsa;
662         ssin6 = (struct sockaddr_in6 *)&ssa;
663         bzero(dsin6, sizeof(*dsin6));
664         bzero(ssin6, sizeof(*ssin6));
665         flags = V_ip6_ft.ft_flags;
666         
667         if (ipv6_mbuf_demarshal(m, ssin6, dsin6, &flags) != 0)
668                 return (NULL);
669
670         return (flowtable_lookup_common(&V_ip6_ft, &ssa, &dsa, m, flags));
671 }
672
673 void
674 flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
675 {
676         uint32_t *hashkey = NULL;
677         struct sockaddr_in6 *sin6;
678
679         sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
680
681         sin6->sin6_family = AF_INET6;
682         sin6->sin6_len = sizeof(*sin6);
683         hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
684         memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
685         ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
686         ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
687         ro->ro_flags |= RT_NORTREF;
688 }
689 #endif /* INET6 */
690
691 static bitstr_t *
692 flowtable_mask(struct flowtable *ft)
693 {
694         bitstr_t *mask;
695
696         if (ft->ft_flags & FL_PCPU)
697                 mask = ft->ft_masks[curcpu];
698         else
699                 mask = ft->ft_masks[0];
700
701         return (mask);
702 }
703
704 static struct flentry **
705 flowtable_entry(struct flowtable *ft, uint32_t hash)
706 {
707         struct flentry **fle;
708         int index = (hash % ft->ft_size);
709
710         if (ft->ft_flags & FL_PCPU) {
711                 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
712                 fle = &ft->ft_table.pcpu[curcpu][index];
713         } else {
714                 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
715                 fle = &ft->ft_table.global[index];
716         }
717         
718         return (fle);
719 }
720
721 static int
722 flow_stale(struct flowtable *ft, struct flentry *fle)
723 {
724         time_t idle_time;
725
726         if ((fle->f_fhash == 0)
727             || ((fle->f_rt->rt_flags & RTF_HOST) &&
728                 ((fle->f_rt->rt_flags & (RTF_UP))
729                     != (RTF_UP)))
730             || (fle->f_rt->rt_ifp == NULL)
731             || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
732                 return (1);
733
734         idle_time = time_uptime - fle->f_uptime;
735
736         if ((fle->f_flags & FL_STALE) ||
737             ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
738                 && (idle_time > ft->ft_udp_idle)) ||
739             ((fle->f_flags & TH_FIN)
740                 && (idle_time > ft->ft_fin_wait_idle)) ||
741             ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
742                 && (idle_time > ft->ft_syn_idle)) ||
743             ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
744                 && (idle_time > ft->ft_tcp_idle)) ||
745             ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
746                 (fle->f_rt->rt_ifp == NULL)))
747                 return (1);
748
749         return (0);
750 }
751
752 static void
753 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
754 {
755         uint32_t *hashkey;
756         int i, nwords;
757
758         if (fle->f_flags & FL_IPV6) {
759                 nwords = 9;
760                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
761         } else {
762                 nwords = 3;
763                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
764         }
765         
766         for (i = 0; i < nwords; i++)
767                 hashkey[i] = key[i];
768 }
769
770 static int
771 flow_full(struct flowtable *ft)
772 {
773         boolean_t full;
774         int count, max;
775         
776         full = ft->ft_full;
777         count = uma_zone_get_cur(ft->ft_zone);
778         max = uma_zone_get_max(ft->ft_zone);
779
780         if (full && (count < (max - (max >> 3))))
781                 ft->ft_full = FALSE;
782         else if (!full && (count > (max - (max >> 5))))
783                 ft->ft_full = TRUE;
784         
785         if (full && !ft->ft_full) {
786                 flowclean_freq = 4*hz;
787                 if ((ft->ft_flags & FL_HASH_ALL) == 0)
788                         ft->ft_udp_idle = ft->ft_fin_wait_idle =
789                             ft->ft_syn_idle = ft->ft_tcp_idle = 5;
790                 cv_broadcast(&flowclean_c_cv);
791         } else if (!full && ft->ft_full) {
792                 flowclean_freq = 20*hz;
793                 if ((ft->ft_flags & FL_HASH_ALL) == 0)
794                         ft->ft_udp_idle = ft->ft_fin_wait_idle =
795                             ft->ft_syn_idle = ft->ft_tcp_idle = 30;
796         }
797
798         return (ft->ft_full);
799 }
800
801 static int
802 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
803     uint32_t fibnum, struct route *ro, uint16_t flags)
804 {
805         struct flentry *fle, *fletail, *newfle, **flep;
806         int depth;
807         bitstr_t *mask;
808         uint8_t proto;
809
810         newfle = uma_zalloc(ft->ft_zone, M_NOWAIT | M_ZERO);
811         if (newfle == NULL)
812                 return (ENOMEM);
813
814         newfle->f_flags |= (flags & FL_IPV6);
815         proto = flags_to_proto(flags);
816
817         FL_ENTRY_LOCK(ft, hash);
818         mask = flowtable_mask(ft);
819         flep = flowtable_entry(ft, hash);
820         fletail = fle = *flep;
821
822         if (fle == NULL) {
823                 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
824                 *flep = fle = newfle;
825                 goto skip;
826         }
827         
828         depth = 0;
829         FLOWSTAT_INC(ft, ft_collisions);
830         /*
831          * find end of list and make sure that we were not
832          * preempted by another thread handling this flow
833          */
834         while (fle != NULL) {
835                 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
836                         /*
837                          * there was either a hash collision
838                          * or we lost a race to insert
839                          */
840                         FL_ENTRY_UNLOCK(ft, hash);
841                         uma_zfree(ft->ft_zone, newfle);
842                         
843                         if (flags & FL_OVERWRITE)
844                                 goto skip;
845                         return (EEXIST);
846                 }
847                 /*
848                  * re-visit this double condition XXX
849                  */
850                 if (fletail->f_next != NULL)
851                         fletail = fle->f_next;
852
853                 depth++;
854                 fle = fle->f_next;
855         }
856
857         if (depth > ft->ft_max_depth)
858                 ft->ft_max_depth = depth;
859         fletail->f_next = newfle;
860         fle = newfle;
861 skip:
862         flowtable_set_hashkey(fle, key);
863
864         fle->f_proto = proto;
865         fle->f_rt = ro->ro_rt;
866         fle->f_lle = ro->ro_lle;
867         fle->f_fhash = hash;
868         fle->f_fibnum = fibnum;
869         fle->f_uptime = time_uptime;
870         FL_ENTRY_UNLOCK(ft, hash);
871         return (0);
872 }
873
874 static int
875 flowtable_key_equal(struct flentry *fle, uint32_t *key)
876 {
877         uint32_t *hashkey;
878         int i, nwords;
879
880         if (fle->f_flags & FL_IPV6) {
881                 nwords = 9;
882                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
883         } else {
884                 nwords = 3;
885                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
886         }
887
888         for (i = 0; i < nwords; i++)
889                 if (hashkey[i] != key[i])
890                         return (0);
891
892         return (1);
893 }
894
895 struct flentry *
896 flowtable_lookup(sa_family_t sa, struct mbuf *m)
897 {
898
899         switch (sa) {
900 #ifdef INET
901         case AF_INET:
902                 return (flowtable_lookup_ipv4(m));
903 #endif
904 #ifdef INET6
905         case AF_INET6:
906                 return (flowtable_lookup_ipv6(m));
907 #endif
908         default:
909                 panic("%s: sa %d", __func__, sa);
910         }
911 }
912
913 static struct flentry *
914 flowtable_lookup_common(struct flowtable *ft, struct sockaddr_storage *ssa,
915     struct sockaddr_storage *dsa, struct mbuf *m, int flags)
916 {
917         struct route_in6 sro6;
918         struct route sro, *ro;
919         struct flentry *fle;
920         struct rtentry *rt;
921         struct llentry *lle;
922         struct sockaddr_storage *l3addr;
923         struct ifnet *ifp;
924         uint32_t key[9], hash, fibnum;
925         uint8_t proto;
926
927         if (V_flowtable_enable == 0)
928                 return (NULL);
929
930         sro.ro_rt = sro6.ro_rt = NULL;
931         sro.ro_lle = sro6.ro_lle = NULL;
932         flags |= ft->ft_flags;
933         proto = flags_to_proto(flags);
934         fibnum = M_GETFIB(m);
935
936         switch (ssa->ss_family) {
937 #ifdef INET
938         case AF_INET: {
939                 struct sockaddr_in *ssin, *dsin;
940
941                 KASSERT(dsa->ss_family == AF_INET,
942                     ("%s: dsa family %d\n", __func__, dsa->ss_family));
943
944                 ro = &sro;
945                 memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
946                 /*
947                  * The harvested source and destination addresses
948                  * may contain port information if the packet is
949                  * from a transport protocol (e.g. TCP/UDP). The
950                  * port field must be cleared before performing
951                  * a route lookup.
952                  */
953                 ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
954                 dsin = (struct sockaddr_in *)dsa;
955                 ssin = (struct sockaddr_in *)ssa;
956                 if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
957                     (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
958                     (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
959                         return (NULL);
960
961                 hash = ipv4_flow_lookup_hash(ssin, dsin, key, flags);
962                 break;
963         }
964 #endif
965 #ifdef INET6
966         case AF_INET6: {
967                 struct sockaddr_in6 *ssin6, *dsin6;
968
969                 KASSERT(dsa->ss_family == AF_INET6,
970                     ("%s: dsa family %d\n", __func__, dsa->ss_family));
971
972                 ro = (struct route *)&sro6;
973                 memcpy(&sro6.ro_dst, dsa,
974                     sizeof(struct sockaddr_in6));
975                 ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
976                 dsin6 = (struct sockaddr_in6 *)dsa;
977                 ssin6 = (struct sockaddr_in6 *)ssa;
978
979                 flags |= FL_IPV6;
980                 hash = ipv6_flow_lookup_hash(ssin6, dsin6, key, flags);
981                 break;
982         }
983 #endif
984         default:
985                 panic("%s: ssa family %d", __func__, ssa->ss_family);
986         }
987
988         /*
989          * Ports are zero and this isn't a transmit cache
990          * - thus not a protocol for which we need to keep
991          * state
992          * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
993          */
994         if (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL))
995                 return (NULL);
996
997         FLOWSTAT_INC(ft, ft_lookups);
998         FL_ENTRY_LOCK(ft, hash);
999         if ((fle = FL_ENTRY(ft, hash)) == NULL) {
1000                 FL_ENTRY_UNLOCK(ft, hash);
1001                 goto uncached;
1002         }
1003 keycheck:       
1004         rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1005         lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1006         if ((rt != NULL)
1007             && lle != NULL
1008             && fle->f_fhash == hash
1009             && flowtable_key_equal(fle, key)
1010             && (proto == fle->f_proto)
1011             && (fibnum == fle->f_fibnum)
1012             && (rt->rt_flags & RTF_UP)
1013             && (rt->rt_ifp != NULL)
1014             && (lle->la_flags & LLE_VALID)) {
1015                 FLOWSTAT_INC(ft, ft_hits);
1016                 fle->f_uptime = time_uptime;
1017                 fle->f_flags |= flags;
1018                 FL_ENTRY_UNLOCK(ft, hash);
1019                 goto success;
1020         } else if (fle->f_next != NULL) {
1021                 fle = fle->f_next;
1022                 goto keycheck;
1023         }
1024         FL_ENTRY_UNLOCK(ft, hash);
1025 uncached:
1026         if (flags & FL_NOAUTO || flow_full(ft))
1027                 return (NULL);
1028
1029         FLOWSTAT_INC(ft, ft_misses);
1030         /*
1031          * This bit of code ends up locking the
1032          * same route 3 times (just like ip_output + ether_output)
1033          * - at lookup
1034          * - in rt_check when called by arpresolve
1035          * - dropping the refcount for the rtentry
1036          *
1037          * This could be consolidated to one if we wrote a variant
1038          * of arpresolve with an rt_check variant that expected to
1039          * receive the route locked
1040          */
1041
1042         ft->ft_rtalloc(ro, hash, fibnum);
1043         if (ro->ro_rt == NULL)
1044                 return (NULL);
1045
1046         rt = ro->ro_rt;
1047         ifp = rt->rt_ifp;
1048
1049         if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
1050                 RTFREE(rt);
1051                 return (NULL);
1052         }
1053
1054         switch (ssa->ss_family) {
1055 #ifdef INET
1056         case AF_INET:
1057                 if (rt->rt_flags & RTF_GATEWAY)
1058                         l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1059                 else
1060                         l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1061                 lle = llentry_alloc(ifp, LLTABLE(ifp), l3addr); 
1062                 break;
1063 #endif
1064 #ifdef INET6
1065         case AF_INET6: {
1066                 struct sockaddr_in6 *dsin6;
1067
1068                 dsin6 = (struct sockaddr_in6 *)dsa;                     
1069                 if (in6_localaddr(&dsin6->sin6_addr)) {
1070                         RTFREE(rt);
1071                         return (NULL);                          
1072                 }
1073
1074                 if (rt->rt_flags & RTF_GATEWAY)
1075                         l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1076                 else
1077                         l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1078                 lle = llentry_alloc(ifp, LLTABLE6(ifp), l3addr);
1079                 break;
1080         }
1081 #endif  
1082         }
1083
1084         if (lle == NULL) {
1085                 RTFREE(rt);
1086                 return (NULL);
1087         }
1088         ro->ro_lle = lle;
1089
1090         if (flowtable_insert(ft, hash, key, fibnum, ro, flags) != 0) {
1091                 RTFREE(rt);
1092                 LLE_FREE(lle);
1093                 return (NULL);
1094         }
1095
1096 success:
1097         if (fle != NULL && (m->m_flags & M_FLOWID) == 0) {
1098                 m->m_flags |= M_FLOWID;
1099                 m->m_pkthdr.flowid = fle->f_fhash;
1100         }
1101         return (fle);
1102 }
1103
1104 /*
1105  * used by the bit_alloc macro
1106  */
1107 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
1108         
1109 static void
1110 flowtable_alloc(struct flowtable *ft)
1111 {
1112
1113 #ifdef RADIX_MPATH
1114         ft->ft_rtalloc = rtalloc_mpath_fib;
1115 #else
1116         ft->ft_rtalloc = rtalloc_ign_wrapper;
1117 #endif
1118         if (ft->ft_flags & FL_PCPU) {
1119                 ft->ft_lock = flowtable_pcpu_lock;
1120                 ft->ft_unlock = flowtable_pcpu_unlock;
1121
1122                 for (int i = 0; i <= mp_maxid; i++) {
1123                         ft->ft_table.pcpu[i] =
1124                             malloc(ft->ft_size * sizeof(struct flentry *),
1125                                 M_RTABLE, M_WAITOK | M_ZERO);
1126                         ft->ft_masks[i] = bit_alloc(ft->ft_size);
1127                 }
1128         } else {
1129                 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
1130                     (fls(mp_maxid + 1) << 1));
1131                 
1132                 ft->ft_lock = flowtable_global_lock;
1133                 ft->ft_unlock = flowtable_global_unlock;
1134                 ft->ft_table.global =
1135                             malloc(ft->ft_size * sizeof(struct flentry *),
1136                                 M_RTABLE, M_WAITOK | M_ZERO);
1137                 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
1138                                 M_RTABLE, M_WAITOK | M_ZERO);
1139                 for (int i = 0; i < ft->ft_lock_count; i++)
1140                         mtx_init(&ft->ft_locks[i], "flow", NULL,
1141                             MTX_DEF | MTX_DUPOK);
1142
1143                 ft->ft_masks[0] = bit_alloc(ft->ft_size);
1144         }
1145         ft->ft_tmpmask = bit_alloc(ft->ft_size);
1146
1147         /*
1148          * In the local transmit case the table truly is
1149          * just a cache - so everything is eligible for
1150          * replacement after 5s of non-use
1151          */
1152         if (ft->ft_flags & FL_HASH_ALL) {
1153                 ft->ft_udp_idle = V_flowtable_udp_expire;
1154                 ft->ft_syn_idle = V_flowtable_syn_expire;
1155                 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
1156                 ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
1157         } else {
1158                 ft->ft_udp_idle = ft->ft_fin_wait_idle =
1159                     ft->ft_syn_idle = ft->ft_tcp_idle = 30;
1160                 
1161         }
1162 }
1163
/*
 * The rest of the code is devoted to garbage collection of expired entries.
 * It is a new addition made necessary by the switch to dynamically allocating
 * flow tables.
 *
 */
1170 static void
1171 fle_free(struct flentry *fle, struct flowtable *ft)
1172 {
1173         struct rtentry *rt;
1174         struct llentry *lle;
1175
1176         rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1177         lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1178         if (rt != NULL)
1179                 RTFREE(rt);
1180         if (lle != NULL)
1181                 LLE_FREE(lle);
1182         uma_zfree(ft->ft_zone, fle);
1183 }
1184
1185 static void
1186 flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
1187 {
1188         int curbit = 0, tmpsize;
1189         struct flentry *fle,  **flehead, *fleprev;
1190         struct flentry *flefreehead, *flefreetail, *fletmp;
1191         bitstr_t *mask, *tmpmask;
1192
1193         flefreehead = flefreetail = NULL;
1194         mask = flowtable_mask(ft);
1195         tmpmask = ft->ft_tmpmask;
1196         tmpsize = ft->ft_size;
1197         memcpy(tmpmask, mask, ft->ft_size/8);
1198         /*
1199          * XXX Note to self, bit_ffs operates at the byte level
1200          * and thus adds gratuitous overhead
1201          */
1202         bit_ffs(tmpmask, ft->ft_size, &curbit);
1203         while (curbit != -1) {
1204                 if (curbit >= ft->ft_size || curbit < -1) {
1205                         log(LOG_ALERT,
1206                             "warning: bad curbit value %d \n",
1207                             curbit);
1208                         break;
1209                 }
1210
1211                 FL_ENTRY_LOCK(ft, curbit);
1212                 flehead = flowtable_entry(ft, curbit);
1213                 fle = fleprev = *flehead;
1214
1215                 FLOWSTAT_INC(ft, ft_free_checks);
1216 #ifdef DIAGNOSTIC
1217                 if (fle == NULL && curbit > 0) {
1218                         log(LOG_ALERT,
1219                             "warning bit=%d set, but no fle found\n",
1220                             curbit);
1221                 }
1222 #endif          
1223                 while (fle != NULL) {
1224                         if (rt != NULL) {
1225                                 if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
1226                                         fleprev = fle;
1227                                         fle = fle->f_next;
1228                                         continue;
1229                                 }
1230                         } else if (!flow_stale(ft, fle)) {
1231                                 fleprev = fle;
1232                                 fle = fle->f_next;
1233                                 continue;
1234                         }
1235                         /*
1236                          * delete head of the list
1237                          */
1238                         if (fleprev == *flehead) {
1239                                 fletmp = fleprev;
1240                                 if (fle == fleprev) {
1241                                         fleprev = *flehead = fle->f_next;
1242                                 } else
1243                                         fleprev = *flehead = fle;
1244                                 fle = fle->f_next;
1245                         } else {
1246                                 /*
1247                                  * don't advance fleprev
1248                                  */
1249                                 fletmp = fle;
1250                                 fleprev->f_next = fle->f_next;
1251                                 fle = fleprev->f_next;
1252                         }
1253
1254                         if (flefreehead == NULL)
1255                                 flefreehead = flefreetail = fletmp;
1256                         else {
1257                                 flefreetail->f_next = fletmp;
1258                                 flefreetail = fletmp;
1259                         }
1260                         fletmp->f_next = NULL;
1261                 }
1262                 if (*flehead == NULL)
1263                         bit_clear(mask, curbit);
1264                 FL_ENTRY_UNLOCK(ft, curbit);
1265                 bit_clear(tmpmask, curbit);
1266                 tmpmask += (curbit / 8);
1267                 tmpsize -= (curbit / 8) * 8;
1268                 bit_ffs(tmpmask, tmpsize, &curbit);
1269         }
1270         while ((fle = flefreehead) != NULL) {
1271                 flefreehead = fle->f_next;
1272                 FLOWSTAT_INC(ft, ft_frees);
1273                 fle_free(fle, ft);
1274         }
1275 }
1276
1277 void
1278 flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
1279 {
1280         struct flowtable *ft;
1281         int i;
1282
1283         switch (sa) {
1284 #ifdef INET
1285         case AF_INET:
1286                 ft = &V_ip4_ft;
1287                 break;
1288 #endif
1289 #ifdef INET6
1290         case AF_INET6:
1291                 ft = &V_ip6_ft;
1292                 break;
1293 #endif
1294         default:
1295                 panic("%s: sa %d", __func__, sa);
1296         }
1297
1298         if (ft->ft_flags & FL_PCPU) {
1299                 CPU_FOREACH(i) {
1300                         if (smp_started == 1) {
1301                                 thread_lock(curthread);
1302                                 sched_bind(curthread, i);
1303                                 thread_unlock(curthread);
1304                         }
1305
1306                         flowtable_free_stale(ft, rt);
1307
1308                         if (smp_started == 1) {
1309                                 thread_lock(curthread);
1310                                 sched_unbind(curthread);
1311                                 thread_unlock(curthread);
1312                         }
1313                 }
1314         } else {
1315                 flowtable_free_stale(ft, rt);
1316         }
1317 }
1318
1319 static void
1320 flowtable_clean_vnet(struct flowtable *ft)
1321 {
1322
1323         if (ft->ft_flags & FL_PCPU) {
1324                 int i;
1325
1326                 CPU_FOREACH(i) {
1327                         if (smp_started == 1) {
1328                                 thread_lock(curthread);
1329                                 sched_bind(curthread, i);
1330                                 thread_unlock(curthread);
1331                         }
1332
1333                         flowtable_free_stale(ft, NULL);
1334
1335                         if (smp_started == 1) {
1336                                 thread_lock(curthread);
1337                                 sched_unbind(curthread);
1338                                 thread_unlock(curthread);
1339                         }
1340                 }
1341         } else
1342                 flowtable_free_stale(ft, NULL);
1343 }
1344
1345 static void
1346 flowtable_cleaner(void)
1347 {
1348         VNET_ITERATOR_DECL(vnet_iter);
1349         struct thread *td;
1350
1351         if (bootverbose)
1352                 log(LOG_INFO, "flowtable cleaner started\n");
1353         td = curthread;
1354         while (1) {
1355                 VNET_LIST_RLOCK();
1356                 VNET_FOREACH(vnet_iter) {
1357                         CURVNET_SET(vnet_iter);
1358 #ifdef INET
1359                         flowtable_clean_vnet(&V_ip4_ft);
1360 #endif
1361 #ifdef INET6
1362                         flowtable_clean_vnet(&V_ip6_ft);
1363 #endif
1364                         CURVNET_RESTORE();
1365                 }
1366                 VNET_LIST_RUNLOCK();
1367
1368                 /*
1369                  * The 10 second interval between cleaning checks
1370                  * is arbitrary
1371                  */
1372                 mtx_lock(&flowclean_lock);
1373                 thread_lock(td);
1374                 sched_prio(td, PPAUSE);
1375                 thread_unlock(td);
1376                 flowclean_cycles++;
1377                 cv_broadcast(&flowclean_f_cv);
1378                 cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
1379                 mtx_unlock(&flowclean_lock);
1380         }
1381 }
1382
1383 static void
1384 flowtable_flush(void *unused __unused)
1385 {
1386         uint64_t start;
1387
1388         mtx_lock(&flowclean_lock);
1389         start = flowclean_cycles;
1390         while (start == flowclean_cycles) {
1391                 cv_broadcast(&flowclean_c_cv);
1392                 cv_wait(&flowclean_f_cv, &flowclean_lock);
1393         }
1394         mtx_unlock(&flowclean_lock);
1395 }
1396
1397 static struct kproc_desc flow_kp = {
1398         "flowcleaner",
1399         flowtable_cleaner,
1400         &flowcleanerproc
1401 };
1402 SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1403
1404 static int
1405 flowtable_get_size(char *name)
1406 {
1407         int size;
1408
1409         if (TUNABLE_INT_FETCH(name, &size)) {
1410                 if (size < 256)
1411                         size = 256;
1412                 if (!powerof2(size)) {
1413                         printf("%s must be power of 2\n", name);
1414                         size = 2048;
1415                 }
1416         } else {
1417                 /*
1418                  * round up to the next power of 2
1419                  */
1420                 size = 1 << fls((1024 + maxusers * 64) - 1);
1421         }
1422
1423         return (size);
1424 }
1425
1426 static void
1427 flowtable_init(const void *unused __unused)
1428 {
1429
1430         flow_hashjitter = arc4random();
1431
1432 #ifdef INET
1433         flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
1434             NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_MAXBUCKET);
1435         uma_zone_set_max(flow_ipv4_zone, 1024 + maxusers * 64 * mp_ncpus);
1436 #endif
1437 #ifdef INET6
1438         flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
1439             NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_MAXBUCKET);
1440         uma_zone_set_max(flow_ipv6_zone, 1024 + maxusers * 64 * mp_ncpus);
1441 #endif
1442
1443         cv_init(&flowclean_c_cv, "c_flowcleanwait");
1444         cv_init(&flowclean_f_cv, "f_flowcleanwait");
1445         mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
1446         EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
1447             EVENTHANDLER_PRI_ANY);
1448         flowclean_freq = 20*hz;
1449 }
1450 SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
1451     flowtable_init, NULL);
1452
#ifdef INET
/* net.flowtable.ip4 sysctl subtree: zone limit and per-CPU statistics. */
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
    "Flowtable for IPv4");
SYSCTL_UMA_MAX(_net_flowtable_ip4, OID_AUTO, maxflows, CTLFLAG_RW,
    &flow_ipv4_zone, "Maximum number of IPv4 flows allowed");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
    ip4_ftstat, "Flowtable statistics for IPv4 "
    "(struct flowtable_stat, net/flowtable.h)");

/*
 * Per-vnet setup of the IPv4 flow table: attach the IPv4 zone and
 * statistics, size the table from the "net.flowtable.ip4.size" tunable,
 * mark it per-CPU and allocate its storage.
 */
static void
flowtable_init_vnet_v4(const void *unused __unused)
{
        struct flowtable *ft = &V_ip4_ft;

        ft->ft_zone = flow_ipv4_zone;
        ft->ft_size = flowtable_get_size("net.flowtable.ip4.size");
        ft->ft_flags = FL_PCPU;
        ft->ft_stat = VNET(ip4_ftstat);
        flowtable_alloc(ft);
}
VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v4, NULL);
#endif /* INET */
1479
#ifdef INET6
/* net.flowtable.ip6 sysctl subtree: zone limit and per-CPU statistics. */
static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
    "Flowtable for IPv6");
SYSCTL_UMA_MAX(_net_flowtable_ip6, OID_AUTO, maxflows, CTLFLAG_RW,
    &flow_ipv6_zone, "Maximum number of IPv6 flows allowed");

static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
    ip6_ftstat, "Flowtable statistics for IPv6 "
    "(struct flowtable_stat, net/flowtable.h)");

/*
 * Per-vnet setup of the IPv6 flow table: attach the IPv6 zone and
 * statistics, size the table from the "net.flowtable.ip6.size" tunable,
 * mark it per-CPU and allocate its storage.
 */
static void
flowtable_init_vnet_v6(const void *unused __unused)
{
        struct flowtable *ft = &V_ip6_ft;

        ft->ft_zone = flow_ipv6_zone;
        ft->ft_size = flowtable_get_size("net.flowtable.ip6.size");
        ft->ft_flags = FL_PCPU;
        ft->ft_stat = VNET(ip6_ftstat);
        flowtable_alloc(ft);
}
VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    flowtable_init_vnet_v6, NULL);
#endif /* INET6 */
1506
1507 #ifdef DDB
1508 static uint32_t *
1509 flowtable_get_hashkey(struct flentry *fle)
1510 {
1511         uint32_t *hashkey;
1512
1513         if (fle->f_flags & FL_IPV6)
1514                 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1515         else
1516                 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1517
1518         return (hashkey);
1519 }
1520
1521 static bitstr_t *
1522 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1523 {
1524         bitstr_t *mask;
1525
1526         if (ft->ft_flags & FL_PCPU)
1527                 mask = ft->ft_masks[cpuid];
1528         else
1529                 mask = ft->ft_masks[0];
1530
1531         return (mask);
1532 }
1533
1534 static struct flentry **
1535 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1536 {
1537         struct flentry **fle;
1538         int index = (hash % ft->ft_size);
1539
1540         if (ft->ft_flags & FL_PCPU) {
1541                 fle = &ft->ft_table.pcpu[cpuid][index];
1542         } else {
1543                 fle = &ft->ft_table.global[index];
1544         }
1545         
1546         return (fle);
1547 }
1548
1549 static void
1550 flow_show(struct flowtable *ft, struct flentry *fle)
1551 {
1552         int idle_time;
1553         int rt_valid, ifp_valid;
1554         uint16_t sport, dport;
1555         uint32_t *hashkey;
1556         char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
1557         volatile struct rtentry *rt;
1558         struct ifnet *ifp = NULL;
1559
1560         idle_time = (int)(time_uptime - fle->f_uptime);
1561         rt = fle->f_rt;
1562         rt_valid = rt != NULL;
1563         if (rt_valid)
1564                 ifp = rt->rt_ifp;
1565         ifp_valid = ifp != NULL;
1566         hashkey = flowtable_get_hashkey(fle);
1567         if (fle->f_flags & FL_IPV6)
1568                 goto skipaddr;
1569
1570         inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
1571         if (ft->ft_flags & FL_HASH_ALL) {
1572                 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);            
1573                 sport = ntohs(((uint16_t *)hashkey)[0]);
1574                 dport = ntohs(((uint16_t *)hashkey)[1]);
1575                 db_printf("%s:%d->%s:%d",
1576                     saddr, sport, daddr,
1577                     dport);
1578         } else
1579                 db_printf("%s ", daddr);
1580
1581 skipaddr:
1582         if (fle->f_flags & FL_STALE)
1583                 db_printf(" FL_STALE ");
1584         if (fle->f_flags & FL_TCP)
1585                 db_printf(" FL_TCP ");
1586         if (fle->f_flags & FL_UDP)
1587                 db_printf(" FL_UDP ");
1588         if (rt_valid) {
1589                 if (rt->rt_flags & RTF_UP)
1590                         db_printf(" RTF_UP ");
1591         }
1592         if (ifp_valid) {
1593                 if (ifp->if_flags & IFF_LOOPBACK)
1594                         db_printf(" IFF_LOOPBACK ");
1595                 if (ifp->if_flags & IFF_UP)
1596                         db_printf(" IFF_UP ");          
1597                 if (ifp->if_flags & IFF_POINTOPOINT)
1598                         db_printf(" IFF_POINTOPOINT ");         
1599         }
1600         if (fle->f_flags & FL_IPV6)
1601                 db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
1602                     hashkey[0], hashkey[1], hashkey[2],
1603                     hashkey[3], hashkey[4], hashkey[5],
1604                     hashkey[6], hashkey[7], hashkey[8]);
1605         else
1606                 db_printf("\n\tkey=%08x:%08x:%08x ",
1607                     hashkey[0], hashkey[1], hashkey[2]);
1608         db_printf("hash=%08x idle_time=%03d"
1609             "\n\tfibnum=%02d rt=%p",
1610             fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
1611         db_printf("\n");
1612 }
1613
1614 static void
1615 flowtable_show(struct flowtable *ft, int cpuid)
1616 {
1617         int curbit = 0;
1618         struct flentry *fle,  **flehead;
1619         bitstr_t *mask, *tmpmask;
1620
1621         if (cpuid != -1)
1622                 db_printf("cpu: %d\n", cpuid);
1623         mask = flowtable_mask_pcpu(ft, cpuid);
1624         tmpmask = ft->ft_tmpmask;
1625         memcpy(tmpmask, mask, ft->ft_size/8);
1626         /*
1627          * XXX Note to self, bit_ffs operates at the byte level
1628          * and thus adds gratuitous overhead
1629          */
1630         bit_ffs(tmpmask, ft->ft_size, &curbit);
1631         while (curbit != -1) {
1632                 if (curbit >= ft->ft_size || curbit < -1) {
1633                         db_printf("warning: bad curbit value %d \n",
1634                             curbit);
1635                         break;
1636                 }
1637
1638                 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1639                 fle = *flehead;
1640
1641                 while (fle != NULL) {   
1642                         flow_show(ft, fle);
1643                         fle = fle->f_next;
1644                         continue;
1645                 }
1646                 bit_clear(tmpmask, curbit);
1647                 bit_ffs(tmpmask, ft->ft_size, &curbit);
1648         }
1649 }
1650
1651 static void
1652 flowtable_show_vnet(struct flowtable *ft)
1653 {
1654
1655         if (ft->ft_flags & FL_PCPU) {
1656                 int i;
1657
1658                 CPU_FOREACH(i) {
1659                         flowtable_show(ft, i);
1660                 }
1661         } else
1662                 flowtable_show(ft, -1);
1663 }
1664
1665 DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1666 {
1667         VNET_ITERATOR_DECL(vnet_iter);
1668
1669         VNET_FOREACH(vnet_iter) {
1670                 CURVNET_SET(vnet_iter);
1671 #ifdef VIMAGE
1672                 db_printf("vnet %p\n", vnet_iter);
1673 #endif
1674 #ifdef INET
1675                 printf("IPv4:\n");
1676                 flowtable_show_vnet(&V_ip4_ft);
1677 #endif
1678 #ifdef INET6
1679                 printf("IPv6:\n");
1680                 flowtable_show_vnet(&V_ip6_ft);
1681 #endif
1682                 CURVNET_RESTORE();
1683         }
1684 }
1685 #endif