]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/ip_state.c
This commit was generated by cvs2svn to compensate for changes in r53809,
[FreeBSD/FreeBSD.git] / sys / netinet / ip_state.c
1 /*
2  * Copyright (C) 1995-1998 by Darren Reed.
3  *
4  * Redistribution and use in source and binary forms are permitted
5  * provided that this notice is preserved and due credit is given
6  * to the original author and the contributors.
7  */
8 #if !defined(lint)
9 static const char sccsid[] = "@(#)ip_state.c    1.8 6/5/96 (C) 1993-1995 Darren Reed";
10 /*static const char rcsid[] = "@(#)$Id: ip_state.c,v 2.3.2.9 1999/10/21 14:31:09 darrenr Exp $";*/
11 static const char rcsid[] = "@(#)$FreeBSD$";
12 #endif
13
14 #include <sys/errno.h>
15 #include <sys/types.h>
16 #include <sys/param.h>
17 #include <sys/file.h>
18 #if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \
19     defined(_KERNEL)
20 # include "opt_ipfilter_log.h"
21 #endif
22 #if !defined(_KERNEL) && !defined(KERNEL) && !defined(__KERNEL__)
23 # include <stdio.h>
24 # include <stdlib.h>
25 # include <string.h>
26 #else
27 # ifdef linux
28 #  include <linux/kernel.h>
29 #  include <linux/module.h>
30 # endif
31 #endif
32 #if defined(KERNEL) && (__FreeBSD_version >= 220000)
33 # include <sys/filio.h>
34 # include <sys/fcntl.h>
35 # if (__FreeBSD_version >= 300000) && !defined(IPFILTER_LKM)
36 #  include "opt_ipfilter.h"
37 # endif
38 #else
39 # include <sys/ioctl.h>
40 #endif
41 #include <sys/time.h>
42 #include <sys/uio.h>
43 #ifndef linux
44 # include <sys/protosw.h>
45 #endif
46 #include <sys/socket.h>
47 #if defined(_KERNEL) && !defined(linux)
48 # include <sys/systm.h>
49 #endif
50 #if !defined(__SVR4) && !defined(__svr4__)
51 # ifndef linux
52 #  include <sys/mbuf.h>
53 # endif
54 #else
55 # include <sys/filio.h>
56 # include <sys/byteorder.h>
57 # ifdef _KERNEL
58 #  include <sys/dditypes.h>
59 # endif
60 # include <sys/stream.h>
61 # include <sys/kmem.h>
62 #endif
63
64 #include <net/if.h>
65 #ifdef sun
66 # include <net/af.h>
67 #endif
68 #include <net/route.h>
69 #include <netinet/in.h>
70 #include <netinet/in_systm.h>
71 #include <netinet/ip.h>
72 #include <netinet/tcp.h>
73 #ifndef linux
74 # include <netinet/ip_var.h>
75 # include <netinet/tcp_fsm.h>
76 #endif
77 #include <netinet/udp.h>
78 #include <netinet/ip_icmp.h>
79 #include "netinet/ip_compat.h"
80 #include <netinet/tcpip.h>
81 #include "netinet/ip_fil.h"
82 #include "netinet/ip_nat.h"
83 #include "netinet/ip_frag.h"
84 #include "netinet/ip_proxy.h"
85 #include "netinet/ip_state.h"
86 #if (__FreeBSD_version >= 300000)
87 # include <sys/malloc.h>
88 # if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM)
89 #  include <sys/libkern.h>
90 #  include <sys/systm.h>
91 # endif
92 #endif
93
94 #ifndef MIN
95 # define        MIN(a,b)        (((a)<(b))?(a):(b))
96 #endif
97
98 #define TCP_CLOSE       (TH_FIN|TH_RST)
99
100 static ipstate_t **ips_table = NULL;
101 static int      ips_num = 0;
102 static ips_stat_t ips_stats;
103 #if     (SOLARIS || defined(__sgi)) && defined(_KERNEL)
104 extern  KRWLOCK_T       ipf_state, ipf_mutex;
105 extern  kmutex_t        ipf_rw;
106 #endif
107
108 static int fr_matchsrcdst __P((ipstate_t *, struct in_addr, struct in_addr,
109                                fr_info_t *, tcphdr_t *));
110 static frentry_t *fr_checkicmpmatchingstate __P((ip_t *, fr_info_t *));
111 static int fr_state_flush __P((int));
112 static ips_stat_t *fr_statetstats __P((void));
113 static void fr_delstate __P((ipstate_t *));
114
115
116 #define FIVE_DAYS       (2 * 5 * 86400) /* 5 days: half closed session */
117
118 #define TCP_MSL 240                     /* 2 minutes */
119 u_long  fr_tcpidletimeout = FIVE_DAYS,
120         fr_tcpclosewait = 2 * TCP_MSL,
121         fr_tcplastack = 2 * TCP_MSL,
122         fr_tcptimeout = 2 * TCP_MSL,
123         fr_tcpclosed = 1,
124         fr_udptimeout = 240,
125         fr_icmptimeout = 120;
126 int     fr_statemax = IPSTATE_MAX,
127         fr_statesize = IPSTATE_SIZE;
128 int     fr_state_doflush = 0;
129
130
131 int fr_stateinit()
132 {
133         KMALLOCS(ips_table, ipstate_t **, fr_statesize * sizeof(ipstate_t *));
134         if (ips_table != NULL)
135                 bzero((char *)ips_table, fr_statesize * sizeof(ipstate_t *));
136         else
137                 return -1;
138         return 0;
139 }
140
141
142 static ips_stat_t *fr_statetstats()
143 {
144         ips_stats.iss_active = ips_num;
145         ips_stats.iss_table = ips_table;
146         return &ips_stats;
147 }
148
149
150 /*
151  * flush state tables.  two actions currently defined:
152  * which == 0 : flush all state table entries
153  * which == 1 : flush TCP connections which have started to close but are
154  *              stuck for some reason.
155  */
156 static int fr_state_flush(which)
157 int which;
158 {
159         register int i;
160         register ipstate_t *is, **isp;
161 #if defined(_KERNEL) && !SOLARIS
162         int s;
163 #endif
164         int delete, removed = 0;
165
166         SPL_NET(s);
167         WRITE_ENTER(&ipf_state);
168         for (i = fr_statesize - 1; i >= 0; i--)
169                 for (isp = &ips_table[i]; (is = *isp); ) {
170                         delete = 0;
171
172                         switch (which)
173                         {
174                         case 0 :
175                                 delete = 1;
176                                 break;
177                         case 1 :
178                                 if ((is->is_p == IPPROTO_TCP) &&
179                                     (((is->is_state[0] <= TCPS_ESTABLISHED) &&
180                                       (is->is_state[1] > TCPS_ESTABLISHED)) ||
181                                      ((is->is_state[1] <= TCPS_ESTABLISHED) &&
182                                       (is->is_state[0] > TCPS_ESTABLISHED))))
183                                         delete = 1;
184                                 break;
185                         }
186
187                         if (delete) {
188                                 *isp = is->is_next;
189                                 if (is->is_p == IPPROTO_TCP)
190                                         ips_stats.iss_fin++;
191                                 else
192                                         ips_stats.iss_expire++;
193                                 if (ips_table[i] == NULL)
194                                         ips_stats.iss_inuse--;
195 #ifdef  IPFILTER_LOG
196                                 ipstate_log(is, ISL_FLUSH);
197 #endif
198                                 fr_delstate(is);
199                                 ips_num--;
200                                 removed++;
201                         } else
202                                 isp = &is->is_next;
203                 }
204         if (fr_state_doflush) {
205                 (void) fr_state_flush(1);
206                 fr_state_doflush = 0;
207         }
208         RWLOCK_EXIT(&ipf_state);
209         SPL_X(s);
210         return removed;
211 }
212
213
214 int fr_state_ioctl(data, cmd, mode)
215 caddr_t data;
216 #if defined(__NetBSD__) || defined(__OpenBSD__)
217 u_long cmd;
218 #else
219 int cmd;
220 #endif
221 int mode;
222 {
223         int     arg, ret, error = 0;
224
225         switch (cmd)
226         {
227         case SIOCIPFFL :
228                 IRCOPY(data, (caddr_t)&arg, sizeof(arg));
229                 if (arg == 0 || arg == 1) {
230                         ret = fr_state_flush(arg);
231                         IWCOPY((caddr_t)&ret, data, sizeof(ret));
232                 } else
233                         error = EINVAL;
234                 break;
235         case SIOCGIPST :
236                 IWCOPY((caddr_t)fr_statetstats(), data, sizeof(ips_stat_t));
237                 break;
238         case FIONREAD :
239 #ifdef  IPFILTER_LOG
240                 IWCOPY((caddr_t)&iplused[IPL_LOGSTATE], (caddr_t)data,
241                        sizeof(iplused[IPL_LOGSTATE]));
242 #endif
243                 break;
244         default :
245                 error = EINVAL;
246                 break;
247         }
248         return error;
249 }
250
251
252 /*
253  * Create a new ipstate structure and hang it off the hash table.
254  */
255 ipstate_t *fr_addstate(ip, fin, flags)
256 ip_t *ip;
257 fr_info_t *fin;
258 u_int flags;
259 {
260         register ipstate_t *is;
261         register u_int hv;
262         ipstate_t ips;
263         u_int pass;
264
265         if ((ip->ip_off & IP_OFFMASK) || (fin->fin_fi.fi_fl & FI_SHORT))
266                 return NULL;
267         if (ips_num == fr_statemax) {
268                 ips_stats.iss_max++;
269                 fr_state_doflush = 1;
270                 return NULL;
271         }
272         is = &ips;
273         bzero((char *)is, sizeof(*is));
274         ips.is_age = 1;
275         ips.is_state[0] = 0;
276         ips.is_state[1] = 0;
277         /*
278          * Copy and calculate...
279          */
280         hv = (is->is_p = ip->ip_p);
281         hv += (is->is_src.s_addr = ip->ip_src.s_addr);
282         hv += (is->is_dst.s_addr = ip->ip_dst.s_addr);
283
284         switch (ip->ip_p)
285         {
286         case IPPROTO_ICMP :
287             {
288                 struct icmp *ic = (struct icmp *)fin->fin_dp;
289
290                 switch (ic->icmp_type)
291                 {
292                 case ICMP_ECHO :
293                         is->is_icmp.ics_type = ICMP_ECHOREPLY;  /* XXX */
294                         hv += (is->is_icmp.ics_id = ic->icmp_id);
295                         hv += (is->is_icmp.ics_seq = ic->icmp_seq);
296                         break;
297                 case ICMP_TSTAMP :
298                 case ICMP_IREQ :
299                 case ICMP_MASKREQ :
300                         is->is_icmp.ics_type = ic->icmp_type + 1;
301                         break;
302                 default :
303                         return NULL;
304                 }
305                 ATOMIC_INC(ips_stats.iss_icmp);
306                 is->is_age = fr_icmptimeout;
307                 break;
308             }
309         case IPPROTO_TCP :
310             {
311                 register tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp;
312
313                 /*
314                  * The endian of the ports doesn't matter, but the ack and
315                  * sequence numbers do as we do mathematics on them later.
316                  */
317                 is->is_dport = tcp->th_dport;
318                 is->is_sport = tcp->th_sport;
319                 if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) {
320                         hv += tcp->th_dport;
321                         hv += tcp->th_sport;
322                 }
323                 if (tcp->th_seq != 0) {
324                         is->is_send = ntohl(tcp->th_seq) + ip->ip_len -
325                                       fin->fin_hlen - (tcp->th_off << 2) +
326                                       ((tcp->th_flags & TH_SYN) ? 1 : 0) +
327                                       ((tcp->th_flags & TH_FIN) ? 1 : 0);
328                         is->is_maxsend = is->is_send + 1;
329                 }
330                 is->is_dend = 0;
331                 is->is_maxswin = ntohs(tcp->th_win);
332                 if (is->is_maxswin == 0)
333                         is->is_maxswin = 1;
334                 /*
335                  * If we're creating state for a starting connection, start the
336                  * timer on it as we'll never see an error if it fails to
337                  * connect.
338                  */
339                 MUTEX_ENTER(&ipf_rw);
340                 ips_stats.iss_tcp++;
341                 fr_tcp_age(&is->is_age, is->is_state, ip, fin,
342                            tcp->th_sport == is->is_sport);
343                 MUTEX_EXIT(&ipf_rw);
344                 break;
345             }
346         case IPPROTO_UDP :
347             {
348                 register tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp;
349
350                 if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) {
351                         hv += (is->is_dport = tcp->th_dport);
352                         hv += (is->is_sport = tcp->th_sport);
353                 }
354                 ATOMIC_INC(ips_stats.iss_udp);
355                 is->is_age = fr_udptimeout;
356                 break;
357             }
358         default :
359                 return NULL;
360         }
361
362         KMALLOC(is, ipstate_t *);
363         if (is == NULL) {
364                 ATOMIC_INC(ips_stats.iss_nomem);
365                 return NULL;
366         }
367         bcopy((char *)&ips, (char *)is, sizeof(*is));
368         hv %= fr_statesize;
369         RW_UPGRADE(&ipf_mutex);
370         is->is_rule = fin->fin_fr;
371         if (is->is_rule != NULL) {
372                 is->is_rule->fr_ref++;
373                 pass = is->is_rule->fr_flags;
374         } else
375                 pass = fr_flags;
376         MUTEX_DOWNGRADE(&ipf_mutex);
377         WRITE_ENTER(&ipf_state);
378
379         is->is_rout = pass & FR_OUTQUE ? 1 : 0;
380         is->is_pass = pass;
381         is->is_pkts = 1;
382         is->is_bytes = ip->ip_len;
383         /*
384          * We want to check everything that is a property of this packet,
385          * but we don't (automatically) care about it's fragment status as
386          * this may change.
387          */
388         is->is_opt = fin->fin_fi.fi_optmsk;
389         is->is_optmsk = 0xffffffff;
390         is->is_sec = fin->fin_fi.fi_secmsk;
391         is->is_secmsk = 0xffff;
392         is->is_auth = fin->fin_fi.fi_auth;
393         is->is_authmsk = 0xffff;
394         is->is_flags = fin->fin_fi.fi_fl & FI_CMP;
395         is->is_flags |= FI_CMP << 4;
396         is->is_flags |= flags & (FI_W_DPORT|FI_W_SPORT);
397         /*
398          * add into table.
399          */
400         is->is_next = ips_table[hv];
401         ips_table[hv] = is;
402         if (is->is_next == NULL)
403                 ips_stats.iss_inuse++;
404         if (fin->fin_out) {
405                 is->is_ifpin = NULL;
406                 is->is_ifpout = fin->fin_ifp;
407         } else {
408                 is->is_ifpin = fin->fin_ifp;
409                 is->is_ifpout = NULL;
410         }
411         if (pass & FR_LOGFIRST)
412                 is->is_pass &= ~(FR_LOGFIRST|FR_LOG);
413         ATOMIC_INC(ips_num);
414 #ifdef  IPFILTER_LOG
415         ipstate_log(is, ISL_NEW);
416 #endif
417         RWLOCK_EXIT(&ipf_state);
418         fin->fin_rev = (is->is_dst.s_addr != ip->ip_dst.s_addr);
419         if (fin->fin_fi.fi_fl & FI_FRAG)
420                 ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE);
421         return is;
422 }
423
424
425
426 /*
427  * check to see if a packet with TCP headers fits within the TCP window.
428  * change timeout depending on whether new packet is a SYN-ACK returning for a
429  * SYN or a RST or FIN which indicate time to close up shop.
430  */
431 int fr_tcpstate(is, fin, ip, tcp)
432 register ipstate_t *is;
433 fr_info_t *fin;
434 ip_t *ip;
435 tcphdr_t *tcp;
436 {
437         register tcp_seq seq, ack, end;
438         register int ackskew;
439         tcpdata_t  *fdata, *tdata;
440         u_short win, maxwin;
441         int ret = 0;
442         int source;
443
444         /*
445          * Find difference between last checked packet and this packet.
446          */
447         source = (ip->ip_src.s_addr == is->is_src.s_addr);
448         fdata = &is->is_tcp.ts_data[!source];
449         tdata = &is->is_tcp.ts_data[source];
450         seq = ntohl(tcp->th_seq);
451         ack = ntohl(tcp->th_ack);
452         win = ntohs(tcp->th_win);
453         end = seq + ip->ip_len - fin->fin_hlen - (tcp->th_off << 2) +
454                ((tcp->th_flags & TH_SYN) ? 1 : 0) +
455                ((tcp->th_flags & TH_FIN) ? 1 : 0);      
456
457         if (fdata->td_end == 0) {
458                 /*
459                  * Must be a (outgoing) SYN-ACK in reply to a SYN.
460                  */
461                 fdata->td_end = end;
462                 fdata->td_maxwin = 1;
463                 fdata->td_maxend = end + 1;
464         }
465
466         if (!(tcp->th_flags & TH_ACK)) {  /* Pretend an ack was sent */
467                 ack = tdata->td_end;
468                 win = 1;
469         } else if (((tcp->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) &&
470                    (ack == 0)) {
471                 /* gross hack to get around certain broken tcp stacks */
472                 ack = tdata->td_end;
473         }
474
475         if (seq == end)
476                 seq = end = fdata->td_end;
477
478         maxwin = tdata->td_maxwin;
479         ackskew = tdata->td_end - ack;
480
481 #define SEQ_GE(a,b)     ((int)((a) - (b)) >= 0)
482 #define SEQ_GT(a,b)     ((int)((a) - (b)) > 0)
483         if ((SEQ_GE(fdata->td_maxend, end)) &&
484             (SEQ_GE(seq + maxwin, fdata->td_end - maxwin)) && 
485 /* XXX what about big packets */
486 #define MAXACKWINDOW 66000
487             (ackskew >= -MAXACKWINDOW) &&
488             (ackskew <= MAXACKWINDOW)) {
489                 /* if ackskew < 0 then this should be due to fragented
490                  * packets. There is no way to know the length of the
491                  * total packet in advance.
492                  * We do know the total length from the fragment cache though.
493                  * Note however that there might be more sessions with
494                  * exactly the same source and destination paramters in the
495                  * state cache (and source and destination is the only stuff
496                  * that is saved in the fragment cache). Note further that
497                  * some TCP connections in the state cache are hashed with
498                  * sport and dport as well which makes it not worthwhile to
499                  * look for them.
500                  * Thus, when ackskew is negative but still seems to belong
501                  * to this session, we bump up the destinations end value.
502                  */
503                 if (ackskew < 0)
504                         tdata->td_end = ack;
505
506                 /* update max window seen */
507                 if (fdata->td_maxwin < win)
508                         fdata->td_maxwin = win;
509                 if (SEQ_GT(end, fdata->td_end))
510                         fdata->td_end = end;
511                 if (SEQ_GE(ack + win, tdata->td_maxend)) {
512                         tdata->td_maxend = ack + win;
513                         if (win == 0)
514                                 tdata->td_maxend++;
515                 }
516
517                 ATOMIC_INC(ips_stats.iss_hits);
518                 is->is_pkts++;
519                 is->is_bytes += ip->ip_len;
520                 /*
521                  * Nearing end of connection, start timeout.
522                  */
523                 MUTEX_ENTER(&ipf_rw);
524                 fr_tcp_age(&is->is_age, is->is_state, ip, fin, source);
525                 MUTEX_EXIT(&ipf_rw);
526                 ret = 1;
527         }
528         return ret;
529 }
530
531
532 static int fr_matchsrcdst(is, src, dst, fin, tcp)
533 ipstate_t *is;
534 struct in_addr src, dst;
535 fr_info_t *fin;
536 tcphdr_t *tcp;
537 {
538         int ret = 0, rev, out, flags;
539         u_short sp, dp;
540         void *ifp;
541
542         rev = fin->fin_rev = (is->is_dst.s_addr != dst.s_addr);
543         ifp = fin->fin_ifp;
544         out = fin->fin_out;
545
546         if (tcp != NULL) {
547                 flags = is->is_flags;
548                 sp = tcp->th_sport;
549                 dp = tcp->th_dport;
550         } else {
551                 flags = 0;
552                 sp = 0;
553                 dp = 0;
554         }
555
556         if (rev == 0) {
557                 if (!out) {
558                         if (is->is_ifpin == ifp)
559                                 ret = 1;
560                 } else {
561                         if (is->is_ifpout == NULL || is->is_ifpout == ifp)
562                                 ret = 1;
563                 }
564         } else {
565                 if (out) {
566                         if (is->is_ifpin == ifp)
567                                 ret = 1;
568                 } else {
569                         if (is->is_ifpout == NULL || is->is_ifpout == ifp)
570                                 ret = 1;
571                 }
572         }
573         if (ret == 0)
574                 return 0;
575         ret = 0;
576
577         if (rev == 0) {
578                 if ((is->is_dst.s_addr == dst.s_addr) &&
579                     (is->is_src.s_addr == src.s_addr) &&
580                     (!tcp || ((sp == is->is_sport || flags & FI_W_SPORT) &&
581                      (dp == is->is_dport || flags & FI_W_DPORT)))) {
582                         ret = 1;
583                 }
584         } else {
585                 if ((is->is_dst.s_addr == src.s_addr) &&
586                     (is->is_src.s_addr == dst.s_addr) &&
587                     (!tcp || ((sp == is->is_dport || flags & FI_W_DPORT) &&
588                      (dp == is->is_sport || flags & FI_W_SPORT)))) {
589                         ret = 1;
590                 }
591         }
592         if (ret == 0)
593                 return 0;
594
595         /*
596          * Whether or not this should be here, is questionable, but the aim
597          * is to get this out of the main line.
598          */
599         if (tcp == NULL)
600                 flags = is->is_flags & (FI_CMP|(FI_CMP<<4));
601
602         if (((fin->fin_fi.fi_fl & (flags >> 4)) != (flags & FI_CMP)) ||
603             ((fin->fin_fi.fi_optmsk & is->is_optmsk) != is->is_opt) ||
604             ((fin->fin_fi.fi_secmsk & is->is_secmsk) != is->is_sec) ||
605             ((fin->fin_fi.fi_auth & is->is_authmsk) != is->is_auth))
606                 return 0;
607
608         if ((flags & (FI_W_SPORT|FI_W_DPORT))) {
609                 if ((flags & FI_W_SPORT) != 0) {
610                         if (rev == 0) {
611                                 is->is_sport = sp;
612                                 is->is_send = htonl(tcp->th_seq);
613                         } else {
614                                 is->is_sport = dp;
615                                 is->is_send = htonl(tcp->th_ack);
616                         }
617                         is->is_maxsend = is->is_send + 1;
618                 } else if ((flags & FI_W_DPORT) != 0) {
619                         if (rev == 0) {
620                                 is->is_dport = dp;
621                                 is->is_dend = htonl(tcp->th_ack);
622                         } else {
623                                 is->is_dport = sp;
624                                 is->is_dend = htonl(tcp->th_seq);
625                         }
626                         is->is_maxdend = is->is_dend + 1;
627                 }
628                 is->is_flags &= ~(FI_W_SPORT|FI_W_DPORT);
629         }
630
631         if (!rev) {
632                 if (out && (out == is->is_rout)) {
633                         if (!is->is_ifpout)
634                                 is->is_ifpout = ifp;
635                 } else {
636                         if (!is->is_ifpin)
637                                 is->is_ifpin = ifp;
638                 }
639         } else {
640                 if (!out && (out != is->is_rout)) {
641                         if (!is->is_ifpin)
642                                 is->is_ifpin = ifp;
643                 } else {
644                         if (!is->is_ifpout)
645                                 is->is_ifpout = ifp;
646                 }
647         }
648         return 1;
649 }
650
651 frentry_t *fr_checkicmpmatchingstate(ip, fin)
652 ip_t *ip;
653 fr_info_t *fin;
654 {
655         register struct in_addr dst, src;
656         register ipstate_t *is, **isp;
657         register u_short sport, dport;
658         register u_char pr;
659         struct icmp *ic;
660         fr_info_t ofin;
661         u_int hv, dest;
662         tcphdr_t *tcp;
663         frentry_t *fr;
664         ip_t *oip;
665         int type;
666
667         /* 
668          * Does it at least have the return (basic) IP header ? 
669          * Only a basic IP header (no options) should be with
670          * an ICMP error header.
671          */
672         if ((ip->ip_hl != 5) || (ip->ip_len < ICMPERR_MINPKTLEN))
673                 return NULL;
674         ic = (struct icmp *)((char *)ip + fin->fin_hlen);
675         type = ic->icmp_type;
676         /*
677          * If it's not an error type, then return
678          */
679         if ((type != ICMP_UNREACH) && (type != ICMP_SOURCEQUENCH) &&
680             (type != ICMP_REDIRECT) && (type != ICMP_TIMXCEED) &&
681             (type != ICMP_PARAMPROB))
682                 return NULL;
683
684         oip = (ip_t *)((char *)fin->fin_dp + ICMPERR_ICMPHLEN);
685         if (ip->ip_len < ICMPERR_MAXPKTLEN + ((oip->ip_hl - 5) << 2))
686                 return NULL;
687         if ((oip->ip_p != IPPROTO_TCP) && (oip->ip_p != IPPROTO_UDP))
688                 return NULL;
689
690         tcp = (tcphdr_t *)((char *)oip + (oip->ip_hl << 2));
691         dport = tcp->th_dport;
692         sport = tcp->th_sport;
693
694         hv = (pr = oip->ip_p);
695         hv += (src.s_addr = oip->ip_src.s_addr);
696         hv += (dst.s_addr = oip->ip_dst.s_addr);
697         hv += dport;
698         hv += sport;
699         hv %= fr_statesize;
700         /*
701          * we make an fin entry to be able to feed it to
702          * matchsrcdst note that not all fields are encessary
703          * but this is the cleanest way. Note further we fill
704          * in fin_mp such that if someone uses it we'll get
705          * a kernel panic. fr_matchsrcdst does not use this.
706          *
707          * watch out here, as ip is in host order and oip in network
708          * order. Any change we make must be undone afterwards.
709          */
710         oip->ip_len = ntohs(oip->ip_len);
711         fr_makefrip(oip->ip_hl << 2, oip, &ofin);
712         oip->ip_len = htons(oip->ip_len);
713         ofin.fin_ifp = fin->fin_ifp;
714         ofin.fin_out = !fin->fin_out;
715         ofin.fin_mp = NULL; /* if dereferenced, panic XXX */
716         READ_ENTER(&ipf_state);
717         for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_next) {
718                 /*
719                  * Only allow this icmp though if the
720                  * encapsulated packet was allowed through the
721                  * other way around. Note that the minimal amount
722                  * of info present does not allow for checking against
723                  * tcp internals such as seq and ack numbers.
724                  */
725                 if ((is->is_p == pr) &&
726                     fr_matchsrcdst(is, src, dst, &ofin, tcp)) {
727                         fr = is->is_rule;
728                         ips_stats.iss_hits++;
729                         /*
730                          * we must swap src and dst here because the icmp
731                          * comes the other way around
732                          */
733                         dest = (is->is_dst.s_addr != src.s_addr);
734                         is->is_pkts++;
735                         is->is_bytes += ip->ip_len;     
736                         /*
737                          * we deliberately do not touch the timeouts
738                          * for the accompanying state table entry.
739                          * It remains to be seen if that is correct. XXX
740                          */
741                         RWLOCK_EXIT(&ipf_state);
742                         return fr;
743                 }
744         }
745         RWLOCK_EXIT(&ipf_state);
746         return NULL;
747 }
748
749 /*
750  * Check if a packet has a registered state.
751  */
752 frentry_t *fr_checkstate(ip, fin)
753 ip_t *ip;
754 fr_info_t *fin;
755 {
756         register struct in_addr dst, src;
757         register ipstate_t *is, **isp;
758         register u_char pr;
759         u_int hv, hvm, hlen, tryagain, pass;
760         struct icmp *ic;
761         frentry_t *fr;
762         tcphdr_t *tcp;
763
764         if ((ip->ip_off & IP_OFFMASK) || (fin->fin_fi.fi_fl & FI_SHORT))
765                 return NULL;
766
767         is = NULL;
768         hlen = fin->fin_hlen;
769         tcp = (tcphdr_t *)((char *)ip + hlen);
770         ic = (struct icmp *)tcp;
771         hv = (pr = ip->ip_p);
772         hv += (src.s_addr = ip->ip_src.s_addr);
773         hv += (dst.s_addr = ip->ip_dst.s_addr);
774
775         /*
776          * Search the hash table for matching packet header info.
777          */
778         switch (ip->ip_p)
779         {
780         case IPPROTO_ICMP :
781                 hv += ic->icmp_id;
782                 hv += ic->icmp_seq;
783                 hv %= fr_statesize;
784                 READ_ENTER(&ipf_state);
785                 for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_next)
786                         if ((is->is_p == pr) &&
787                             (ic->icmp_id == is->is_icmp.ics_id) &&
788                             (ic->icmp_seq == is->is_icmp.ics_seq) &&
789                             fr_matchsrcdst(is, src, dst, fin, NULL)) {
790                                 if ((is->is_type == ICMP_ECHOREPLY) &&
791                                     (ic->icmp_type == ICMP_ECHO))
792                                         ;
793                                 else if (is->is_type != ic->icmp_type)
794                                         continue;
795                                 is->is_age = fr_icmptimeout;
796                                 break;
797                         }
798                 if (is != NULL)
799                         break;
800                 RWLOCK_EXIT(&ipf_state);
801                 /*
802                  * No matching icmp state entry. Perhaps this is a
803                  * response to another state entry.
804                  */
805                 fr = fr_checkicmpmatchingstate(ip, fin);
806                 if (fr)
807                         return fr;
808                 break;
809         case IPPROTO_TCP :
810             {
811                 register u_short dport = tcp->th_dport, sport = tcp->th_sport;
812
813                 tryagain = 0;
814 retry_tcp:
815                 hvm = hv % fr_statesize;
816                 WRITE_ENTER(&ipf_state);
817                 for (isp = &ips_table[hvm]; (is = *isp);
818                      isp = &is->is_next)
819                         if ((is->is_p == pr) &&
820                             fr_matchsrcdst(is, src, dst, fin, tcp)) {
821                                 if (fr_tcpstate(is, fin, ip, tcp)) {
822                                         break;
823 #ifndef _KERNEL
824                                         if (tcp->th_flags & TCP_CLOSE) {
825                                                 *isp = is->is_next;
826                                                 isp = &ips_table[hvm];
827                                                 if (ips_table[hvm] == NULL)
828                                                         ips_stats.iss_inuse--;
829                                                 fr_delstate(is);
830                                                 ips_num--;
831                                         }
832 #endif
833                                         break;
834                                 }
835                                 is = NULL;
836                                 break;
837                         }
838                 if (is != NULL)
839                         break;
840                 RWLOCK_EXIT(&ipf_state);
841                 hv += dport;
842                 hv += sport;
843                 if (tryagain == 0) {
844                         tryagain = 1;
845                         goto retry_tcp;
846                 }
847                 break;
848             }
849         case IPPROTO_UDP :
850             {
851                 register u_short dport = tcp->th_dport, sport = tcp->th_sport;
852
853                 tryagain = 0;
854 retry_udp:
855                 hvm = hv % fr_statesize;
856                 /*
857                  * Nothing else to match on but ports. and IP#'s
858                  */
859                 READ_ENTER(&ipf_state);
860                 for (is = ips_table[hvm]; is; is = is->is_next)
861                         if ((is->is_p == pr) &&
862                             fr_matchsrcdst(is, src, dst, fin, tcp)) {
863                                 is->is_age = fr_udptimeout;
864                                 break;
865                         }
866                 if (is != NULL)
867                         break;
868                 RWLOCK_EXIT(&ipf_state);
869                 hv += dport;
870                 hv += sport;
871                 if (tryagain == 0) {
872                         tryagain = 1;
873                         goto retry_udp;
874                 }
875                 break;
876             }
877         default :
878                 break;
879         }
880         if (is == NULL) {
881                 ATOMIC_INC(ips_stats.iss_miss);
882                 return NULL;
883         }
884         MUTEX_ENTER(&ipf_rw);
885         is->is_bytes += ip->ip_len;
886         ips_stats.iss_hits++;
887         is->is_pkts++;
888         MUTEX_EXIT(&ipf_rw);
889         fr = is->is_rule;
890         fin->fin_fr = fr;
891         pass = is->is_pass;
892         RWLOCK_EXIT(&ipf_state);
893         if (fin->fin_fi.fi_fl & FI_FRAG)
894                 ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE);
895         return fr;
896 }
897
898
899 static void fr_delstate(is)
900 ipstate_t *is;
901 {
902         frentry_t *fr;
903
904         fr = is->is_rule;
905         if (fr != NULL) {
906                 ATOMIC_DEC(fr->fr_ref);
907                 if (fr->fr_ref == 0)
908                         KFREE(fr);
909         }
910         KFREE(is);
911 }
912
913
914 /*
915  * Free memory in use by all state info. kept.
916  */
917 void fr_stateunload()
918 {
919         register int i;
920         register ipstate_t *is, **isp;
921
922         WRITE_ENTER(&ipf_state);
923         for (i = fr_statesize - 1; i >= 0; i--)
924                 for (isp = &ips_table[i]; (is = *isp); ) {
925                         *isp = is->is_next;
926                         fr_delstate(is);
927                         ips_num--;
928                 }
929         ips_stats.iss_inuse = 0;
930         ips_num = 0;
931         RWLOCK_EXIT(&ipf_state);
932         KFREES(ips_table, fr_statesize * sizeof(ipstate_t *));
933         ips_table = NULL;
934 }
935
936
937 /*
938  * Slowly expire held state for thingslike UDP and ICMP.  Timeouts are set
939  * in expectation of this being called twice per second.
940  */
941 void fr_timeoutstate()
942 {
943         register int i;
944         register ipstate_t *is, **isp;
945 #if defined(_KERNEL) && !SOLARIS
946         int s;
947 #endif
948
949         SPL_NET(s);
950         WRITE_ENTER(&ipf_state);
951         for (i = fr_statesize - 1; i >= 0; i--)
952                 for (isp = &ips_table[i]; (is = *isp); )
953                         if (is->is_age && !--is->is_age) {
954                                 *isp = is->is_next;
955                                 if (is->is_p == IPPROTO_TCP)
956                                         ips_stats.iss_fin++;
957                                 else
958                                         ips_stats.iss_expire++;
959                                 if (ips_table[i] == NULL)
960                                         ips_stats.iss_inuse--;
961 #ifdef  IPFILTER_LOG
962                                 ipstate_log(is, ISL_EXPIRE);
963 #endif
964                                 fr_delstate(is);
965                                 ips_num--;
966                         } else
967                                 isp = &is->is_next;
968         RWLOCK_EXIT(&ipf_state);
969         SPL_X(s);
970 }
971
972
973 /*
974  * Original idea freom Pradeep Krishnan for use primarily with NAT code.
975  * (pkrishna@netcom.com)
976  */
977 void fr_tcp_age(age, state, ip, fin, dir)
978 u_long *age;
979 u_char *state;
980 ip_t *ip;
981 fr_info_t *fin;
982 int dir;
983 {
984         tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp;
985         u_char flags = tcp->th_flags;
986         int dlen, ostate;
987
988         ostate = state[1 - dir];
989
990         dlen = ip->ip_len - fin->fin_hlen - (tcp->th_off << 2);
991
992         if (flags & TH_RST) {
993                 if (!(tcp->th_flags & TH_PUSH) && !dlen) {
994                         *age = fr_tcpclosed;
995                         state[dir] = TCPS_CLOSED;
996                 } else {
997                         *age = fr_tcpclosewait;
998                         state[dir] = TCPS_CLOSE_WAIT;
999                 }
1000                 return;
1001         }
1002
1003         *age = fr_tcptimeout; /* 1 min */
1004
1005         switch(state[dir])
1006         {
1007         case TCPS_CLOSED:
1008                 if ((flags & (TH_FIN|TH_SYN|TH_RST|TH_ACK)) == TH_ACK) {
1009                         state[dir] = TCPS_ESTABLISHED;
1010                         *age = fr_tcpidletimeout;
1011                 }
1012         case TCPS_FIN_WAIT_2:
1013                 if ((flags & TH_OPENING) == TH_OPENING)
1014                         state[dir] = TCPS_SYN_RECEIVED;
1015                 else if (flags & TH_SYN)
1016                         state[dir] = TCPS_SYN_SENT;
1017                 break;
1018         case TCPS_SYN_RECEIVED:
1019         case TCPS_SYN_SENT:
1020                 if ((flags & (TH_FIN|TH_ACK)) == TH_ACK) {
1021                         state[dir] = TCPS_ESTABLISHED;
1022                         *age = fr_tcpidletimeout;
1023                 } else if ((flags & (TH_FIN|TH_ACK)) == (TH_FIN|TH_ACK)) {
1024                         state[dir] = TCPS_CLOSE_WAIT;
1025                         if (!(flags & TH_PUSH) && !dlen &&
1026                             ostate > TCPS_ESTABLISHED)
1027                                 *age  = fr_tcplastack;
1028                         else
1029                                 *age  = fr_tcpclosewait;
1030                 }
1031                 break;
1032         case TCPS_ESTABLISHED:
1033                 if (flags & TH_FIN) {
1034                         state[dir] = TCPS_CLOSE_WAIT;
1035                         if (!(flags & TH_PUSH) && !dlen &&
1036                             ostate > TCPS_ESTABLISHED)
1037                                 *age  = fr_tcplastack;
1038                         else
1039                                 *age  = fr_tcpclosewait;
1040                 } else {
1041                         if (ostate < TCPS_CLOSE_WAIT)
1042                                 *age = fr_tcpidletimeout;
1043                 }
1044                 break;
1045         case TCPS_CLOSE_WAIT:
1046                 if ((flags & TH_FIN) && !(flags & TH_PUSH) && !dlen &&
1047                     ostate > TCPS_ESTABLISHED) {
1048                         *age  = fr_tcplastack;
1049                         state[dir] = TCPS_LAST_ACK;
1050                 } else
1051                         *age  = fr_tcpclosewait;
1052                 break;
1053         case TCPS_LAST_ACK:
1054                 if (flags & TH_ACK) {
1055                         state[dir] = TCPS_FIN_WAIT_2;
1056                         if (!(flags & TH_PUSH) && !dlen &&
1057                             ostate > TCPS_ESTABLISHED)
1058                                 *age  = fr_tcplastack;
1059                         else {
1060                                 *age  = fr_tcpclosewait;
1061                                 state[dir] = TCPS_CLOSE_WAIT;
1062                         }
1063                 }
1064                 break;
1065         }
1066 }
1067
1068
#ifdef  IPFILTER_LOG
/*
 * Build an ipslog record from state entry `is' and pass it to ipllog()
 * with IPL_LOGSTATE.
 *
 * is   - state entry to describe
 * type - stored in the record's isl_type field (the expiry code in this
 *        file passes ISL_EXPIRE)
 *
 * The record lives on the stack and is logged in full (sizeof(ipsl)), so
 * the protocol-dependent fields are cleared before the per-protocol
 * values are filled in; otherwise whatever happened to be on the stack
 * would be copied into the log for protocols that do not set them.
 * NOTE(review): any padding bytes inside struct ipslog are still not
 * cleared by this; zeroing the whole record would cover that - confirm
 * which zeroing primitive is available in every build environment.
 */
void ipstate_log(is, type)
struct ipstate *is;
u_int type;
{
        struct  ipslog  ipsl;
        void *items[1];
        size_t sizes[1];
        int types[1];

        ipsl.isl_type = type;
        ipsl.isl_pkts = is->is_pkts;
        ipsl.isl_bytes = is->is_bytes;
        ipsl.isl_src = is->is_src;
        ipsl.isl_dst = is->is_dst;
        ipsl.isl_p = is->is_p;
        ipsl.isl_flags = is->is_flags;

        /* Defaults for everything that only some protocols fill in. */
        ipsl.isl_state[0] = 0;
        ipsl.isl_state[1] = 0;
        ipsl.isl_ps.isl_filler[0] = 0;
        ipsl.isl_ps.isl_filler[1] = 0;

        if (ipsl.isl_p == IPPROTO_TCP || ipsl.isl_p == IPPROTO_UDP) {
                ipsl.isl_sport = is->is_sport;
                ipsl.isl_dport = is->is_dport;
                if (ipsl.isl_p == IPPROTO_TCP) {
                        ipsl.isl_state[0] = is->is_state[0];
                        ipsl.isl_state[1] = is->is_state[1];
                }
        } else if (ipsl.isl_p == IPPROTO_ICMP)
                ipsl.isl_itype = is->is_icmp.ics_type;

        items[0] = &ipsl;
        sizes[0] = sizeof(ipsl);
        types[0] = 0;

        (void) ipllog(IPL_LOGSTATE, NULL, items, sizes, types, 1);
}
#endif