4 * Copyright (C) 1995-2003 by Darren Reed.
6 * See the IPFILTER.LICENCE file for details on licencing.
8 #if defined(KERNEL) || defined(_KERNEL)
14 #include <sys/errno.h>
15 #include <sys/types.h>
16 #include <sys/param.h>
18 #if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \
20 # if (__NetBSD_Version__ < 399001400)
21 # include "opt_ipfilter_log.h"
23 # include "opt_ipfilter.h"
26 #if defined(_KERNEL) && defined(__FreeBSD_version) && \
27 (__FreeBSD_version >= 400000) && !defined(KLD_MODULE)
28 #include "opt_inet6.h"
30 #if !defined(_KERNEL) && !defined(__KERNEL__)
41 #if defined(_KERNEL) && (__FreeBSD_version >= 220000)
42 # include <sys/filio.h>
43 # include <sys/fcntl.h>
44 # if (__FreeBSD_version >= 300000) && !defined(IPFILTER_LKM)
45 # include "opt_ipfilter.h"
48 # include <sys/ioctl.h>
52 # include <sys/protosw.h>
54 #include <sys/socket.h>
56 # include <sys/systm.h>
57 # if !defined(__SVR4) && !defined(__svr4__)
58 # include <sys/mbuf.h>
61 #if defined(__SVR4) || defined(__svr4__)
62 # include <sys/filio.h>
63 # include <sys/byteorder.h>
65 # include <sys/dditypes.h>
67 # include <sys/stream.h>
68 # include <sys/kmem.h>
75 #include <net/route.h>
76 #include <netinet/in.h>
77 #include <netinet/in_systm.h>
78 #include <netinet/ip.h>
79 #include <netinet/tcp.h>
81 # include <netinet/ip_var.h>
83 #if !defined(__hpux) && !defined(linux)
84 # include <netinet/tcp_fsm.h>
86 #include <netinet/udp.h>
87 #include <netinet/ip_icmp.h>
88 #include "netinet/ip_compat.h"
89 #include <netinet/tcpip.h>
90 #include "netinet/ip_fil.h"
91 #include "netinet/ip_nat.h"
92 #include "netinet/ip_frag.h"
93 #include "netinet/ip_state.h"
94 #include "netinet/ip_proxy.h"
96 #include "netinet/ip_sync.h"
99 #include "netinet/ip_scan.h"
102 #include <netinet/icmp6.h>
104 #if (__FreeBSD_version >= 300000)
105 # include <sys/malloc.h>
106 # if defined(_KERNEL) && !defined(IPFILTER_LKM)
107 # include <sys/libkern.h>
108 # include <sys/systm.h>
111 /* END OF INCLUDES */
115 static const char sccsid[] = "@(#)ip_state.c 1.8 6/5/96 (C) 1993-2000 Darren Reed";
116 static const char rcsid[] = "@(#)$Id: ip_state.c,v 2.186.2.80 2007/10/16 09:33:23 darrenr Exp $";
119 static ipstate_t **ips_table = NULL;
120 static u_long *ips_seed = NULL;
121 static int ips_num = 0;
122 static u_long ips_last_force_flush = 0;
123 ips_stat_t ips_stats;
126 static ipstate_t *fr_checkicmp6matchingstate __P((fr_info_t *));
128 static ipstate_t *fr_matchsrcdst __P((fr_info_t *, ipstate_t *, i6addr_t *,
129 i6addr_t *, tcphdr_t *, u_32_t));
130 static ipstate_t *fr_checkicmpmatchingstate __P((fr_info_t *));
131 static int fr_state_flush __P((int, int));
132 static int fr_state_flush_entry __P((void *));
133 static ips_stat_t *fr_statetstats __P((void));
134 static int fr_delstate __P((ipstate_t *, int));
135 static int fr_state_remove __P((caddr_t));
136 static void fr_ipsmove __P((ipstate_t *, u_int));
137 static int fr_tcpstate __P((fr_info_t *, tcphdr_t *, ipstate_t *));
138 static int fr_tcpoptions __P((fr_info_t *, tcphdr_t *, tcpdata_t *));
139 static ipstate_t *fr_stclone __P((fr_info_t *, tcphdr_t *, ipstate_t *));
140 static void fr_fixinisn __P((fr_info_t *, ipstate_t *));
141 static void fr_fixoutisn __P((fr_info_t *, ipstate_t *));
142 static void fr_checknewisn __P((fr_info_t *, ipstate_t *));
143 static int fr_stateiter __P((ipftoken_t *, ipfgeniter_t *));
144 static int fr_stgettable __P((char *));
146 int fr_stputent __P((caddr_t));
147 int fr_stgetent __P((caddr_t));
149 #define ONE_DAY IPF_TTLVAL(1 * 86400) /* 1 day */
150 #define FIVE_DAYS (5 * ONE_DAY)
151 #define DOUBLE_HASH(x) (((x) + ips_seed[(x) % fr_statesize]) % fr_statesize)
153 u_long fr_tcpidletimeout = FIVE_DAYS,
154 fr_tcpclosewait = IPF_TTLVAL(2 * TCP_MSL),
155 fr_tcplastack = IPF_TTLVAL(30),
156 fr_tcptimeout = IPF_TTLVAL(2 * TCP_MSL),
157 fr_tcptimewait = IPF_TTLVAL(2 * TCP_MSL),
158 fr_tcpclosed = IPF_TTLVAL(30),
159 fr_tcphalfclosed = IPF_TTLVAL(2 * 3600), /* 2 hours */
160 fr_udptimeout = IPF_TTLVAL(120),
161 fr_udpacktimeout = IPF_TTLVAL(12),
162 fr_icmptimeout = IPF_TTLVAL(60),
163 fr_icmpacktimeout = IPF_TTLVAL(6),
164 fr_iptimeout = IPF_TTLVAL(60);
165 int fr_statemax = IPSTATE_MAX,
166 fr_statesize = IPSTATE_SIZE;
167 int fr_state_doflush = 0,
169 fr_state_maxbucket = 0,
170 fr_state_maxbucket_reset = 1,
172 ipftq_t ips_tqtqb[IPF_TCP_NSTATES],
181 int ipstate_logging = 1;
183 int ipstate_logging = 0;
185 ipstate_t *ips_list = NULL;
188 /* ------------------------------------------------------------------------ */
189 /* Function: fr_stateinit */
190 /* Returns: int - 0 == success, -1 == failure */
191 /* Parameters: Nil */
193 /* Initialise all the global variables used within the state code. */
194 /* This action also includes initialising locks. */
195 /* ------------------------------------------------------------------------ */
200 KMALLOCS(ips_table, ipstate_t **, fr_statesize * sizeof(ipstate_t *));
201 if (ips_table == NULL)
203 bzero((char *)ips_table, fr_statesize * sizeof(ipstate_t *));
205 KMALLOCS(ips_seed, u_long *, fr_statesize * sizeof(*ips_seed));
206 if (ips_seed == NULL)
208 for (i = 0; i < fr_statesize; i++) {
210 * XXX - ips_seed[X] should be a random number of sorts.
212 #if (__FreeBSD_version >= 400000)
213 ips_seed[i] = arc4random();
215 ips_seed[i] = ((u_long)ips_seed + i) * fr_statesize;
216 ips_seed[i] ^= 0xa5a55a5a;
217 ips_seed[i] *= (u_long)ips_seed;
218 ips_seed[i] ^= 0x5a5aa5a5;
219 ips_seed[i] *= fr_statemax;
223 /* fill icmp reply type table */
224 for (i = 0; i <= ICMP_MAXTYPE; i++)
225 icmpreplytype4[i] = -1;
226 icmpreplytype4[ICMP_ECHO] = ICMP_ECHOREPLY;
227 icmpreplytype4[ICMP_TSTAMP] = ICMP_TSTAMPREPLY;
228 icmpreplytype4[ICMP_IREQ] = ICMP_IREQREPLY;
229 icmpreplytype4[ICMP_MASKREQ] = ICMP_MASKREPLY;
231 /* fill icmp reply type table */
232 for (i = 0; i <= ICMP6_MAXTYPE; i++)
233 icmpreplytype6[i] = -1;
234 icmpreplytype6[ICMP6_ECHO_REQUEST] = ICMP6_ECHO_REPLY;
235 icmpreplytype6[ICMP6_MEMBERSHIP_QUERY] = ICMP6_MEMBERSHIP_REPORT;
236 icmpreplytype6[ICMP6_NI_QUERY] = ICMP6_NI_REPLY;
237 icmpreplytype6[ND_ROUTER_SOLICIT] = ND_ROUTER_ADVERT;
238 icmpreplytype6[ND_NEIGHBOR_SOLICIT] = ND_NEIGHBOR_ADVERT;
241 KMALLOCS(ips_stats.iss_bucketlen, u_long *,
242 fr_statesize * sizeof(u_long));
243 if (ips_stats.iss_bucketlen == NULL)
245 bzero((char *)ips_stats.iss_bucketlen, fr_statesize * sizeof(u_long));
247 if (fr_state_maxbucket == 0) {
248 for (i = fr_statesize; i > 0; i >>= 1)
249 fr_state_maxbucket++;
250 fr_state_maxbucket *= 2;
253 ips_stats.iss_tcptab = ips_tqtqb;
254 fr_sttab_init(ips_tqtqb);
255 ips_tqtqb[IPF_TCP_NSTATES - 1].ifq_next = &ips_udptq;
256 ips_udptq.ifq_ttl = (u_long)fr_udptimeout;
257 ips_udptq.ifq_ref = 1;
258 ips_udptq.ifq_head = NULL;
259 ips_udptq.ifq_tail = &ips_udptq.ifq_head;
260 MUTEX_INIT(&ips_udptq.ifq_lock, "ipftq udp tab");
261 ips_udptq.ifq_next = &ips_udpacktq;
262 ips_udpacktq.ifq_ttl = (u_long)fr_udpacktimeout;
263 ips_udpacktq.ifq_ref = 1;
264 ips_udpacktq.ifq_head = NULL;
265 ips_udpacktq.ifq_tail = &ips_udpacktq.ifq_head;
266 MUTEX_INIT(&ips_udpacktq.ifq_lock, "ipftq udpack tab");
267 ips_udpacktq.ifq_next = &ips_icmptq;
268 ips_icmptq.ifq_ttl = (u_long)fr_icmptimeout;
269 ips_icmptq.ifq_ref = 1;
270 ips_icmptq.ifq_head = NULL;
271 ips_icmptq.ifq_tail = &ips_icmptq.ifq_head;
272 MUTEX_INIT(&ips_icmptq.ifq_lock, "ipftq icmp tab");
273 ips_icmptq.ifq_next = &ips_icmpacktq;
274 ips_icmpacktq.ifq_ttl = (u_long)fr_icmpacktimeout;
275 ips_icmpacktq.ifq_ref = 1;
276 ips_icmpacktq.ifq_head = NULL;
277 ips_icmpacktq.ifq_tail = &ips_icmpacktq.ifq_head;
278 MUTEX_INIT(&ips_icmpacktq.ifq_lock, "ipftq icmpack tab");
279 ips_icmpacktq.ifq_next = &ips_iptq;
280 ips_iptq.ifq_ttl = (u_long)fr_iptimeout;
281 ips_iptq.ifq_ref = 1;
282 ips_iptq.ifq_head = NULL;
283 ips_iptq.ifq_tail = &ips_iptq.ifq_head;
284 MUTEX_INIT(&ips_iptq.ifq_lock, "ipftq ip tab");
285 ips_iptq.ifq_next = &ips_deletetq;
286 ips_deletetq.ifq_ttl = (u_long)1;
287 ips_deletetq.ifq_ref = 1;
288 ips_deletetq.ifq_head = NULL;
289 ips_deletetq.ifq_tail = &ips_deletetq.ifq_head;
290 MUTEX_INIT(&ips_deletetq.ifq_lock, "state delete queue");
291 ips_deletetq.ifq_next = NULL;
293 RWLOCK_INIT(&ipf_state, "ipf IP state rwlock");
294 MUTEX_INIT(&ipf_stinsert, "ipf state insert mutex");
297 ips_last_force_flush = fr_ticks;
302 /* ------------------------------------------------------------------------ */
303 /* Function: fr_stateunload */
305 /* Parameters: Nil */
307 /* Release and destroy any resources acquired or initialised so that */
308 /* IPFilter can be unloaded or re-initialised. */
309 /* ------------------------------------------------------------------------ */
310 void fr_stateunload()
312 ipftq_t *ifq, *ifqnext;
315 while ((is = ips_list) != NULL)
316 fr_delstate(is, ISL_UNLOAD);
319 * Proxy timeout queues are not cleaned here because although they
320 * exist on the state list, appr_unload is called after fr_stateunload
321 * and the proxies actually are responsible for them being created.
322 * Should the proxy timeouts have their own list? There's no real
323 * justification as this is the only complication.
325 for (ifq = ips_utqe; ifq != NULL; ifq = ifqnext) {
326 ifqnext = ifq->ifq_next;
327 if (((ifq->ifq_flags & IFQF_PROXY) == 0) &&
328 (fr_deletetimeoutqueue(ifq) == 0))
329 fr_freetimeoutqueue(ifq);
332 ips_stats.iss_inuse = 0;
335 if (fr_state_init == 1) {
336 fr_sttab_destroy(ips_tqtqb);
337 MUTEX_DESTROY(&ips_udptq.ifq_lock);
338 MUTEX_DESTROY(&ips_icmptq.ifq_lock);
339 MUTEX_DESTROY(&ips_udpacktq.ifq_lock);
340 MUTEX_DESTROY(&ips_icmpacktq.ifq_lock);
341 MUTEX_DESTROY(&ips_iptq.ifq_lock);
342 MUTEX_DESTROY(&ips_deletetq.ifq_lock);
345 if (ips_table != NULL) {
346 KFREES(ips_table, fr_statesize * sizeof(*ips_table));
350 if (ips_seed != NULL) {
351 KFREES(ips_seed, fr_statesize * sizeof(*ips_seed));
355 if (ips_stats.iss_bucketlen != NULL) {
356 KFREES(ips_stats.iss_bucketlen, fr_statesize * sizeof(u_long));
357 ips_stats.iss_bucketlen = NULL;
360 if (fr_state_maxbucket_reset == 1)
361 fr_state_maxbucket = 0;
363 if (fr_state_init == 1) {
365 RW_DESTROY(&ipf_state);
366 MUTEX_DESTROY(&ipf_stinsert);
371 /* ------------------------------------------------------------------------ */
372 /* Function: fr_statetstats */
373 /* Returns: ips_stat_t* - pointer to state stats structure */
374 /* Parameters: Nil */
376 /* Put all the current numbers and pointers into a single struct and return */
377 /* a pointer to it. */
378 /* ------------------------------------------------------------------------ */
379 static ips_stat_t *fr_statetstats()
381 ips_stats.iss_active = ips_num;
382 ips_stats.iss_statesize = fr_statesize;
383 ips_stats.iss_statemax = fr_statemax;
384 ips_stats.iss_table = ips_table;
385 ips_stats.iss_list = ips_list;
386 ips_stats.iss_ticks = fr_ticks;
390 /* ------------------------------------------------------------------------ */
391 /* Function: fr_state_remove */
392 /* Returns: int - 0 == success, != 0 == failure */
393 /* Parameters: data(I) - pointer to state structure to delete from table */
395 /* Search the state list for an entry whose protocol, IP version, source and */
396 /* destination addresses and is_ps data match the one passed and delete it. */
397 /* ------------------------------------------------------------------------ */
398 static int fr_state_remove(data)
405 error = fr_inobj(data, &st, IPFOBJ_IPSTATE);
409 WRITE_ENTER(&ipf_state);
410 for (sp = ips_list; sp; sp = sp->is_next)
411 if ((sp->is_p == st.is_p) && (sp->is_v == st.is_v) &&
412 !bcmp((caddr_t)&sp->is_src, (caddr_t)&st.is_src,
413 sizeof(st.is_src)) &&
414 !bcmp((caddr_t)&sp->is_dst, (caddr_t)&st.is_dst,
415 sizeof(st.is_dst)) &&
416 !bcmp((caddr_t)&sp->is_ps, (caddr_t)&st.is_ps,
418 fr_delstate(sp, ISL_REMOVE);
419 RWLOCK_EXIT(&ipf_state);
422 RWLOCK_EXIT(&ipf_state);
427 /* ------------------------------------------------------------------------ */
428 /* Function: fr_state_ioctl */
429 /* Returns: int - 0 == success, != 0 == failure */
430 /* Parameters: data(I) - pointer to ioctl data */
431 /* cmd(I) - ioctl command integer */
432 /* mode(I) - file mode bits used with open */
434 /* Processes an ioctl call made to operate on the IP Filter state device. */
435 /* ------------------------------------------------------------------------ */
436 int fr_state_ioctl(data, cmd, mode, uid, ctx)
442 int arg, ret, error = 0;
448 * Delete an entry from the state table.
451 error = fr_state_remove(data);
455 * Flush the state table
458 error = BCOPYIN(data, (char *)&arg, sizeof(arg));
462 WRITE_ENTER(&ipf_state);
463 ret = fr_state_flush(arg, 4);
464 RWLOCK_EXIT(&ipf_state);
465 error = BCOPYOUT((char *)&ret, data, sizeof(ret));
473 error = BCOPYIN(data, (char *)&arg, sizeof(arg));
477 WRITE_ENTER(&ipf_state);
478 ret = fr_state_flush(arg, 6);
479 RWLOCK_EXIT(&ipf_state);
480 error = BCOPYOUT((char *)&ret, data, sizeof(ret));
488 * Flush the state log.
491 if (!(mode & FWRITE))
496 tmp = ipflog_clear(IPL_LOGSTATE);
497 error = BCOPYOUT((char *)&tmp, data, sizeof(tmp));
504 * Turn logging of state information on/off.
507 if (!(mode & FWRITE))
510 error = BCOPYIN((char *)data, (char *)&ipstate_logging,
511 sizeof(ipstate_logging));
518 * Return the current state of logging.
521 error = BCOPYOUT((char *)&ipstate_logging, (char *)data,
522 sizeof(ipstate_logging));
528 * Return the number of bytes currently waiting to be read.
531 arg = iplused[IPL_LOGSTATE]; /* returned in an int */
532 error = BCOPYOUT((char *)&arg, data, sizeof(arg));
539 * Get the current state statistics.
542 error = fr_outobj(data, fr_statetstats(), IPFOBJ_STATESTAT);
546 * Lock/Unlock the state table. (Locking prevents any changes, which
547 * means no packets match).
550 if (!(mode & FWRITE)) {
553 error = fr_lock(data, &fr_state_lock);
558 * Add an entry to the current state table.
561 if (!fr_state_lock || !(mode &FWRITE)) {
565 error = fr_stputent(data);
569 * Get a state table entry.
572 if (!fr_state_lock) {
576 error = fr_stgetent(data);
580 * Return a copy of the hash table bucket lengths
583 error = BCOPYOUT(ips_stats.iss_bucketlen, data,
584 fr_statesize * sizeof(u_long));
594 error = fr_inobj(data, &iter, IPFOBJ_GENITER);
599 token = ipf_findtoken(IPFGENITER_STATE, uid, ctx);
601 error = fr_stateiter(token, &iter);
604 RWLOCK_EXIT(&ipf_tokens);
610 error = fr_stgettable(data);
614 error = BCOPYIN(data, (char *)&arg, sizeof(arg));
619 error = ipf_deltoken(arg, uid, ctx);
625 error = fr_outobj(data, ips_tqtqb, IPFOBJ_STATETQTAB);
636 /* ------------------------------------------------------------------------ */
637 /* Function: fr_stgetent */
638 /* Returns: int - 0 == success, != 0 == failure */
639 /* Parameters: data(I) - pointer to state structure to retrieve from table */
641 /* Copy out state information from the kernel to a user space process. If */
642 /* there is a filter rule associated with the state entry, copy that out */
643 /* as well. The entry to copy out is taken from the value of "ips_next" in */
644 /* the struct passed in and if not null and not found in the list of current*/
645 /* state entries, the retrieval fails. */
646 /* ------------------------------------------------------------------------ */
647 int fr_stgetent(data)
654 error = fr_inobj(data, &ips, IPFOBJ_STATESAVE);
662 if (ips.ips_next == NULL)
668 * Make sure the pointer we're copying from exists in the
669 * current list of entries. Security precaution to prevent
670 * copying of random kernel data.
672 for (is = ips_list; is; is = is->is_next)
678 ips.ips_next = isn->is_next;
679 bcopy((char *)isn, (char *)&ips.ips_is, sizeof(ips.ips_is));
680 ips.ips_rule = isn->is_rule;
681 if (isn->is_rule != NULL)
682 bcopy((char *)isn->is_rule, (char *)&ips.ips_fr,
684 error = fr_outobj(data, &ips, IPFOBJ_STATESAVE);
689 /* ------------------------------------------------------------------------ */
690 /* Function: fr_stputent */
691 /* Returns: int - 0 == success, != 0 == failure */
692 /* Parameters: data(I) - pointer to state information struct */
694 /* This function implements the SIOCSTPUT ioctl: insert a state entry into */
695 /* the state table. If the state info. includes a pointer to a filter rule */
696 /* then also add in an orphaned rule (will not show up in any "ipfstat -io" */
698 /* ------------------------------------------------------------------------ */
699 int fr_stputent(data)
708 error = fr_inobj(data, &ips, IPFOBJ_STATESAVE);
712 KMALLOC(isn, ipstate_t *);
716 bcopy((char *)&ips.ips_is, (char *)isn, sizeof(*isn));
717 bzero((char *)isn, offsetof(struct ipstate, is_pkts));
718 isn->is_sti.tqe_pnext = NULL;
719 isn->is_sti.tqe_next = NULL;
720 isn->is_sti.tqe_ifq = NULL;
721 isn->is_sti.tqe_parent = isn;
722 isn->is_ifp[0] = NULL;
723 isn->is_ifp[1] = NULL;
724 isn->is_ifp[2] = NULL;
725 isn->is_ifp[3] = NULL;
730 READ_ENTER(&ipf_state);
732 MUTEX_EXIT(&isn->is_lock);
733 RWLOCK_EXIT(&ipf_state);
737 if (isn->is_flags & SI_NEWFR) {
738 KMALLOC(fr, frentry_t *);
743 bcopy((char *)&ips.ips_fr, (char *)fr, sizeof(*fr));
744 out = fr->fr_flags & FR_OUTQUE ? 1 : 0;
746 ips.ips_is.is_rule = fr;
747 MUTEX_NUKE(&fr->fr_lock);
748 MUTEX_INIT(&fr->fr_lock, "state filter rule lock");
751 * Look up all the interface names in the rule.
753 for (i = 0; i < 4; i++) {
754 name = fr->fr_ifnames[i];
755 fr->fr_ifas[i] = fr_resolvenic(name, fr->fr_v);
756 name = isn->is_ifname[i];
757 isn->is_ifp[i] = fr_resolvenic(name, isn->is_v);
763 fr->fr_type = FR_T_NONE;
765 fr_resolvedest(&fr->fr_tifs[0], fr->fr_v);
766 fr_resolvedest(&fr->fr_tifs[1], fr->fr_v);
767 fr_resolvedest(&fr->fr_dif, fr->fr_v);
770 * send a copy back to userland of what we ended up
771 * to allow for verification.
773 error = fr_outobj(data, &ips, IPFOBJ_STATESAVE);
776 MUTEX_DESTROY(&fr->fr_lock);
780 READ_ENTER(&ipf_state);
782 MUTEX_EXIT(&isn->is_lock);
783 RWLOCK_EXIT(&ipf_state);
786 READ_ENTER(&ipf_state);
787 for (is = ips_list; is; is = is->is_next)
788 if (is->is_rule == fr) {
790 MUTEX_EXIT(&isn->is_lock);
798 RWLOCK_EXIT(&ipf_state);
800 return (isn == NULL) ? ESRCH : 0;
807 /* ------------------------------------------------------------------------ */
808 /* Function: fr_stinsert */
810 /* Parameters: is(I) - pointer to state structure */
811 /* rev(I) - flag indicating forward/reverse direction of packet */
813 /* Inserts a state structure into the hash table (for lookups) and the list */
814 /* of state entries (for enumeration). Resolves all of the interface names */
815 /* to pointers and adjusts running stats for the hash table as appropriate. */
817 /* Locking: it is assumed that some kind of lock on ipf_state is held. */
818 /* Exits with is_lock initialised and held. */
819 /* ------------------------------------------------------------------------ */
820 void fr_stinsert(is, rev)
828 MUTEX_INIT(&is->is_lock, "ipf state entry");
832 MUTEX_ENTER(&fr->fr_lock);
835 MUTEX_EXIT(&fr->fr_lock);
839 * Look up all the interface names in the state entry.
841 for (i = 0; i < 4; i++) {
842 if (is->is_ifp[i] != NULL)
844 is->is_ifp[i] = fr_resolvenic(is->is_ifname[i], is->is_v);
848 * If we could trust is_hv, then the modulus would not be needed, but
849 * when running with IPFILTER_SYNC, this stops bad values.
851 hv = is->is_hv % fr_statesize;
855 * We need to get both of these locks...the first because it is
856 * possible that once the insert is complete another packet might
857 * come along, match the entry and want to update it.
859 MUTEX_ENTER(&is->is_lock);
860 MUTEX_ENTER(&ipf_stinsert);
863 * add into list table.
865 if (ips_list != NULL)
866 ips_list->is_pnext = &is->is_next;
867 is->is_pnext = &ips_list;
868 is->is_next = ips_list;
871 if (ips_table[hv] != NULL)
872 ips_table[hv]->is_phnext = &is->is_hnext;
874 ips_stats.iss_inuse++;
875 is->is_phnext = ips_table + hv;
876 is->is_hnext = ips_table[hv];
878 ips_stats.iss_bucketlen[hv]++;
880 MUTEX_EXIT(&ipf_stinsert);
882 fr_setstatequeue(is, rev);
886 /* ------------------------------------------------------------------------ */
887 /* Function: fr_addstate */
888 /* Returns: ipstate_t* - NULL == failure, else pointer to new state */
889 /* Parameters: fin(I) - pointer to packet information */
890 /* stsave(O) - pointer to place to save pointer to created */
891 /* state structure. */
892 /* flags(I) - flags to use when creating the structure */
894 /* Creates a new IP state structure from the packet information collected. */
895 /* Inserts it into the state table and appends to the bottom of the active */
896 /* list. If the capacity of the table has reached the maximum allowed then */
897 /* the call will fail and a flush is scheduled for the next timeout call. */
899 /* NOTE: The use of stsave to point to nat_state will result in memory */
900 /* corruption. It should only be used to point to objects that will */
901 /* either outlive this (not expired) or will deref the ip_state_t */
902 /* when they are deleted. */
903 /* ------------------------------------------------------------------------ */
904 ipstate_t *fr_addstate(fin, stsave, flags)
918 (fin->fin_flx & (FI_SHORT|FI_STATE|FI_FRAGBODY|FI_BAD)))
921 if ((fin->fin_flx & FI_OOW) && !(fin->fin_tcpf & TH_SYN))
925 * If a "keep state" rule has reached the maximum number of references
926 * to it, then schedule an automatic flush in case we can clear out
927 * some "dead old wood". Note that because the lock isn't held on
928 * fr it is possible that we could overflow. The cost of overflowing
929 * is being ignored here as the number by which it can overflow is
930 * a product of the number of simultaneous threads that could be
931 * executing in here, so a limit of 100 won't result in 200, but could
932 * result in 101 or 102.
936 if ((ips_num >= fr_statemax) && (fr->fr_statemax == 0)) {
937 ATOMIC_INCL(ips_stats.iss_max);
938 fr_state_doflush = 1;
941 if ((fr->fr_statemax != 0) &&
942 (fr->fr_statecnt >= fr->fr_statemax)) {
943 ATOMIC_INCL(ips_stats.iss_maxref);
948 pass = (fr == NULL) ? 0 : fr->fr_flags;
954 bzero((char *)is, sizeof(*is));
955 is->is_die = 1 + fr_ticks;
958 * Copy and calculate...
960 hv = (is->is_p = fin->fin_fi.fi_p);
961 is->is_src = fin->fin_fi.fi_src;
963 is->is_dst = fin->fin_fi.fi_dst;
966 if (fin->fin_v == 6) {
968 * For ICMPv6, we check to see if the destination address is
969 * a multicast address. If it is, do not include it in the
970 * calculation of the hash because the correct reply will come
971 * back from a real address, not a multicast address.
973 if ((is->is_p == IPPROTO_ICMPV6) &&
974 IN6_IS_ADDR_MULTICAST(&is->is_dst.in6)) {
976 * So you can do keep state with neighbour discovery.
978 * Here we could use the address from the neighbour
979 * solicit message to put in the state structure and
980 * we could use that without a wildcard flag too...
985 hv += is->is_dst.i6[1];
986 hv += is->is_dst.i6[2];
987 hv += is->is_dst.i6[3];
989 hv += is->is_src.i6[1];
990 hv += is->is_src.i6[2];
991 hv += is->is_src.i6[3];
994 if ((fin->fin_v == 4) &&
995 (fin->fin_flx & (FI_MULTICAST|FI_BROADCAST|FI_MBCAST))) {
996 if (fin->fin_out == 0) {
997 flags |= SI_W_DADDR|SI_CLONE;
1000 flags |= SI_W_SADDR|SI_CLONE;
1008 case IPPROTO_ICMPV6 :
1011 switch (ic->icmp_type)
1013 case ICMP6_ECHO_REQUEST :
1014 is->is_icmp.ici_type = ic->icmp_type;
1015 hv += (is->is_icmp.ici_id = ic->icmp_id);
1017 case ICMP6_MEMBERSHIP_QUERY :
1018 case ND_ROUTER_SOLICIT :
1019 case ND_NEIGHBOR_SOLICIT :
1020 case ICMP6_NI_QUERY :
1021 is->is_icmp.ici_type = ic->icmp_type;
1026 ATOMIC_INCL(ips_stats.iss_icmp);
1032 switch (ic->icmp_type)
1038 is->is_icmp.ici_type = ic->icmp_type;
1039 hv += (is->is_icmp.ici_id = ic->icmp_id);
1044 ATOMIC_INCL(ips_stats.iss_icmp);
1050 is->is_gre.gs_flags = gre->gr_flags;
1051 is->is_gre.gs_ptype = gre->gr_ptype;
1052 if (GRE_REV(is->is_gre.gs_flags) == 1) {
1053 is->is_call[0] = fin->fin_data[0];
1054 is->is_call[1] = fin->fin_data[1];
1061 if (tcp->th_flags & TH_RST)
1064 * The endian of the ports doesn't matter, but the ack and
1065 * sequence numbers do as we do mathematics on them later.
1067 is->is_sport = htons(fin->fin_data[0]);
1068 is->is_dport = htons(fin->fin_data[1]);
1069 if ((flags & (SI_W_DPORT|SI_W_SPORT)) == 0) {
1075 * If this is a real packet then initialise fields in the
1076 * state information structure from the TCP header information.
1080 is->is_maxswin = ntohs(tcp->th_win);
1081 if (is->is_maxswin == 0)
1084 if ((fin->fin_flx & FI_IGNORE) == 0) {
1085 is->is_send = ntohl(tcp->th_seq) + fin->fin_dlen -
1086 (TCP_OFF(tcp) << 2) +
1087 ((tcp->th_flags & TH_SYN) ? 1 : 0) +
1088 ((tcp->th_flags & TH_FIN) ? 1 : 0);
1089 is->is_maxsend = is->is_send;
1092 * Window scale option is only present in
1093 * SYN/SYN-ACK packet.
1095 if ((tcp->th_flags & ~(TH_FIN|TH_ACK|TH_ECNALL)) ==
1097 (TCP_OFF(tcp) > (sizeof(tcphdr_t) >> 2))) {
1098 if (fr_tcpoptions(fin, tcp,
1099 &is->is_tcp.ts_data[0]) == -1) {
1100 fin->fin_flx |= FI_BAD;
1104 if ((fin->fin_out != 0) && (pass & FR_NEWISN) != 0) {
1105 fr_checknewisn(fin, is);
1106 fr_fixoutisn(fin, is);
1109 if ((tcp->th_flags & TH_OPENING) == TH_SYN)
1112 is->is_maxdwin = is->is_maxswin * 2;
1113 is->is_dend = ntohl(tcp->th_ack);
1114 is->is_maxdend = ntohl(tcp->th_ack);
1115 is->is_maxdwin *= 2;
1120 * If we're creating state for a starting connection, start the
1121 * timer on it as we'll never see an error if it fails to
1124 ATOMIC_INCL(ips_stats.iss_tcp);
1130 is->is_sport = htons(fin->fin_data[0]);
1131 is->is_dport = htons(fin->fin_data[1]);
1132 if ((flags & (SI_W_DPORT|SI_W_SPORT)) == 0) {
1133 hv += tcp->th_dport;
1134 hv += tcp->th_sport;
1136 ATOMIC_INCL(ips_stats.iss_udp);
1142 hv = DOUBLE_HASH(hv);
1145 is->is_flags = flags & IS_INHERITED;
1148 * Look for identical state.
1150 for (is = ips_table[is->is_hv % fr_statesize]; is != NULL;
1151 is = is->is_hnext) {
1152 if (bcmp(&ips.is_src, &is->is_src,
1153 offsetof(struct ipstate, is_ps) -
1154 offsetof(struct ipstate, is_src)) == 0)
1160 if (ips_stats.iss_bucketlen[hv] >= fr_state_maxbucket) {
1161 ATOMIC_INCL(ips_stats.iss_bucketfull);
1164 KMALLOC(is, ipstate_t *);
1166 ATOMIC_INCL(ips_stats.iss_nomem);
1169 bcopy((char *)&ips, (char *)is, sizeof(*is));
1171 * Do not do the modulus here, it is done in fr_stinsert().
1174 (void) strncpy(is->is_group, fr->fr_group, FR_GROUPLEN);
1175 if (fr->fr_age[0] != 0) {
1176 is->is_tqehead[0] = fr_addtimeoutqueue(&ips_utqe,
1178 is->is_sti.tqe_flags |= TQE_RULEBASED;
1180 if (fr->fr_age[1] != 0) {
1181 is->is_tqehead[1] = fr_addtimeoutqueue(&ips_utqe,
1183 is->is_sti.tqe_flags |= TQE_RULEBASED;
1186 is->is_tag = fr->fr_logtag;
1189 * The name '-' is special for network interfaces and causes
1190 * a NULL name to be present, always, allowing packets to
1191 * match it, regardless of their interface.
1193 if ((fin->fin_ifp == NULL) ||
1194 (fr->fr_ifnames[out << 1][0] == '-' &&
1195 fr->fr_ifnames[out << 1][1] == '\0')) {
1196 is->is_ifp[out << 1] = fr->fr_ifas[0];
1197 strncpy(is->is_ifname[out << 1], fr->fr_ifnames[0],
1198 sizeof(fr->fr_ifnames[0]));
1200 is->is_ifp[out << 1] = fin->fin_ifp;
1201 COPYIFNAME(is->is_v, fin->fin_ifp,
1202 is->is_ifname[out << 1]);
1205 is->is_ifp[(out << 1) + 1] = fr->fr_ifas[1];
1206 strncpy(is->is_ifname[(out << 1) + 1], fr->fr_ifnames[1],
1207 sizeof(fr->fr_ifnames[1]));
1209 is->is_ifp[(1 - out) << 1] = fr->fr_ifas[2];
1210 strncpy(is->is_ifname[((1 - out) << 1)], fr->fr_ifnames[2],
1211 sizeof(fr->fr_ifnames[2]));
1213 is->is_ifp[((1 - out) << 1) + 1] = fr->fr_ifas[3];
1214 strncpy(is->is_ifname[((1 - out) << 1) + 1], fr->fr_ifnames[3],
1215 sizeof(fr->fr_ifnames[3]));
1218 is->is_tag = FR_NOLOGTAG;
1220 if (fin->fin_ifp != NULL) {
1221 is->is_ifp[out << 1] = fin->fin_ifp;
1222 COPYIFNAME(is->is_v, fin->fin_ifp,
1223 is->is_ifname[out << 1]);
1228 * It may seem strange to set is_ref to 2, but fr_check() will call
1229 * fr_statederef() after calling fr_addstate() and the idea is to
1230 * have it exist at the end of fr_check() with is_ref == 1.
1234 is->is_pkts[0] = 0, is->is_bytes[0] = 0;
1235 is->is_pkts[1] = 0, is->is_bytes[1] = 0;
1236 is->is_pkts[2] = 0, is->is_bytes[2] = 0;
1237 is->is_pkts[3] = 0, is->is_bytes[3] = 0;
1238 if ((fin->fin_flx & FI_IGNORE) == 0) {
1239 is->is_pkts[out] = 1;
1240 is->is_bytes[out] = fin->fin_plen;
1241 is->is_flx[out][0] = fin->fin_flx & FI_CMP;
1242 is->is_flx[out][0] &= ~FI_OOW;
1245 if (pass & FR_STSTRICT)
1246 is->is_flags |= IS_STRICT;
1248 if (pass & FR_STATESYNC)
1249 is->is_flags |= IS_STATESYNC;
1252 * We want to check everything that is a property of this packet,
1253 * but we don't (automatically) care about its fragment status as
1256 is->is_v = fin->fin_v;
1257 is->is_opt[0] = fin->fin_optmsk;
1258 is->is_optmsk[0] = 0xffffffff;
1259 is->is_optmsk[1] = 0xffffffff;
1260 if (is->is_v == 6) {
1261 is->is_opt[0] &= ~0x8;
1262 is->is_optmsk[0] &= ~0x8;
1263 is->is_optmsk[1] &= ~0x8;
1266 is->is_sec = fin->fin_secmsk;
1267 is->is_secmsk = 0xffff;
1268 is->is_auth = fin->fin_auth;
1269 is->is_authmsk = 0xffff;
1270 if (flags & (SI_WILDP|SI_WILDA)) {
1271 ATOMIC_INCL(ips_stats.iss_wild);
1273 is->is_rulen = fin->fin_rule;
1276 if (pass & FR_LOGFIRST)
1277 is->is_pass &= ~(FR_LOGFIRST|FR_LOG);
1279 READ_ENTER(&ipf_state);
1281 fr_stinsert(is, fin->fin_rev);
1283 if (fin->fin_p == IPPROTO_TCP) {
1285 * If we're creating state for a starting connection, start the
1286 * timer on it as we'll never see an error if it fails to
1289 (void) fr_tcp_age(&is->is_sti, fin, ips_tqtqb, is->is_flags);
1290 MUTEX_EXIT(&is->is_lock);
1291 #ifdef IPFILTER_SCAN
1292 if ((is->is_flags & SI_CLONE) == 0)
1293 (void) ipsc_attachis(is);
1296 MUTEX_EXIT(&is->is_lock);
1298 #ifdef IPFILTER_SYNC
1299 if ((is->is_flags & IS_STATESYNC) && ((is->is_flags & SI_CLONE) == 0))
1300 is->is_sync = ipfsync_new(SMC_STATE, fin, is);
1302 if (ipstate_logging)
1303 ipstate_log(is, ISL_NEW);
1305 RWLOCK_EXIT(&ipf_state);
1306 fin->fin_state = is;
1307 fin->fin_rev = IP6_NEQ(&is->is_dst, &fin->fin_daddr);
1308 fin->fin_flx |= FI_STATE;
1309 if (fin->fin_flx & FI_FRAG)
1310 (void) fr_newfrag(fin, pass ^ FR_KEEPSTATE);
1316 /* ------------------------------------------------------------------------ */
1317 /* Function: fr_tcpoptions */
1318 /* Returns: int - 1 == packet matches state entry, 0 == it does not, */
1319 /* -1 == packet has bad TCP options data */
1320 /* Parameters: fin(I) - pointer to packet information */
1321 /* tcp(I) - pointer to TCP packet header */
1322 /* td(I) - pointer to TCP data held as part of the state */
1324 /* Look after the TCP header for any options and deal with those that are */
1325 /* present. Record details about those that we recognise. */
1326 /* ------------------------------------------------------------------------ */
1327 static int fr_tcpoptions(fin, tcp, td)
1332 int off, mlen, ol, i, len, retval;
1333 char buf[64], *s, opt;
1336 len = (TCP_OFF(tcp) << 2);
1337 if (fin->fin_dlen < len)
1339 len -= sizeof(*tcp);
1341 off = fin->fin_plen - fin->fin_dlen + sizeof(*tcp) + fin->fin_ipoff;
1344 mlen = MSGDSIZE(m) - off;
1352 COPYDATA(m, off, len, buf);
1354 for (s = buf; len > 0; ) {
1356 if (opt == TCPOPT_EOL)
1358 else if (opt == TCPOPT_NOP)
1364 if (ol < 2 || ol > len)
1368 * Extract the TCP options we are interested in out of
1369 * the header and store them in the tcpdata struct.
1373 case TCPOPT_WINDOW :
1374 if (ol == TCPOLEN_WINDOW) {
1376 if (i > TCP_WSCALE_MAX)
1380 td->td_winscale = i;
1381 td->td_winflags |= TCP_WSCALE_SEEN|
1386 case TCPOPT_MAXSEG :
1388 * So, if we wanted to set the TCP MAXSEG,
1389 * it should be done here...
1391 if (ol == TCPOLEN_MAXSEG) {
1399 case TCPOPT_SACK_PERMITTED :
1400 if (ol == TCPOLEN_SACK_PERMITTED)
1401 td->td_winflags |= TCP_SACK_PERMIT;
1414 /* ------------------------------------------------------------------------ */
1415 /* Function: fr_tcpstate */
1416 /* Returns: int - 1 == packet matches state entry, 0 == it does not */
1417 /* Parameters: fin(I) - pointer to packet information */
1418 /* tcp(I) - pointer to TCP packet header */
1419 /* is(I) - pointer to master state structure */
1421 /* Check to see if a packet with TCP headers fits within the TCP window. */
1422 /* Change timeout depending on whether new packet is a SYN-ACK returning */
1423 /* for a SYN or a RST or FIN which indicate time to close up shop. */
/* Side effects visible in the code: is_lock is taken while the state is */
/* examined, is_s0[] is recorded from SYN and SYN-ACK packets, TCP options */
/* are parsed for those packets, and FI_BAD or FI_OOW may be set in fin_flx. */
1424 /* ------------------------------------------------------------------------ */
1425 static int fr_tcpstate(fin, tcp, is)
1430 int source, ret = 0, flags;
1431 tcpdata_t *fdata, *tdata;
1433 source = !fin->fin_rev; /* 1 when fin_rev is clear, else 0 */
/* With IS_TCPFSM set, compare a forward packet's first port with is_sport. */
1434 if (((is->is_flags & IS_TCPFSM) != 0) && (source == 1) &&
1435 (ntohs(is->is_sport) != fin->fin_data[0]))
1437 fdata = &is->is_tcp.ts_data[!source]; /* passed as "forward" data below */
1438 tdata = &is->is_tcp.ts_data[source]; /* passed as "reverse" data below */
1440 MUTEX_ENTER(&is->is_lock); /* matched by the MUTEX_EXIT calls below */
1443 * If a SYN packet is received for a connection that is on the way out
1444 * but hasn't yet departed then advance this session along the way.
1446 if ((tcp->th_flags & TH_OPENING) == TH_SYN) {
1447 if ((is->is_state[0] > IPF_TCPS_ESTABLISHED) &&
1448 (is->is_state[1] > IPF_TCPS_ESTABLISHED)) {
1449 is->is_state[!source] = IPF_TCPS_CLOSED;
1450 fr_movequeue(&is->is_sti, is->is_sti.tqe_ifq,
1452 MUTEX_EXIT(&is->is_lock);
1457 ret = fr_tcpinwindow(fin, fdata, tdata, tcp, is->is_flags);
1459 #ifdef IPFILTER_SCAN
1460 if (is->is_flags & (IS_SC_CLIENT|IS_SC_SERVER)) {
1461 ipsc_packet(fin, is);
1462 if (FR_ISBLOCK(is->is_pass)) { /* scanning may have changed is_pass */
1463 MUTEX_EXIT(&is->is_lock);
1470 * Nearing end of connection, start timeout.
1472 ret = fr_tcp_age(&is->is_sti, fin, ips_tqtqb, is->is_flags);
1474 MUTEX_EXIT(&is->is_lock);
1479 * set s0's as appropriate. Use syn-ack packet as it
1480 * contains both pieces of required information.
1483 * Window scale option is only present in SYN/SYN-ACK packet.
1484 * Compare with ~TH_FIN to mask out T/TCP setups.
1486 flags = tcp->th_flags & ~(TH_FIN|TH_ECNALL); /* FIN and ECN bits are ignored for the tests below */
1487 if (flags == (TH_SYN|TH_ACK)) {
1488 is->is_s0[source] = ntohl(tcp->th_ack);
1489 is->is_s0[!source] = ntohl(tcp->th_seq) + 1;
/* A header longer than tcphdr_t means that TCP options are present. */
1490 if ((TCP_OFF(tcp) > (sizeof(tcphdr_t) >> 2))) {
1491 if (fr_tcpoptions(fin, tcp, fdata) == -1)
1492 fin->fin_flx |= FI_BAD; /* -1 is documented as bad option data */
/* Only outbound packets of states created with FR_NEWISN get a new ISN. */
1494 if ((fin->fin_out != 0) && (is->is_pass & FR_NEWISN))
1495 fr_checknewisn(fin, is);
1496 } else if (flags == TH_SYN) {
1497 is->is_s0[source] = ntohl(tcp->th_seq) + 1;
1498 if ((TCP_OFF(tcp) > (sizeof(tcphdr_t) >> 2))) {
1499 if (fr_tcpoptions(fin, tcp, fdata) == -1)
1500 fin->fin_flx |= FI_BAD;
1503 if ((fin->fin_out != 0) && (is->is_pass & FR_NEWISN))
1504 fr_checknewisn(fin, is);
1509 fin->fin_flx |= FI_OOW; /* presumably "out of window"; fr_stlookup() collects this bit */
1511 MUTEX_EXIT(&is->is_lock);
1516 /* ------------------------------------------------------------------------ */
1517 /* Function: fr_checknewisn */
1519 /* Parameters: fin(I) - pointer to packet information */
1520 /* is(I) - pointer to master state structure */
1522 /* Check to see if this TCP connection is expecting and needs a new */
1523 /* sequence number for a particular direction of the connection. */
1525 /* NOTE: This does not actually change the sequence numbers, only gets new */
/* For direction index i the function stores the sequence number offset in */
/* is_isninc[i] and a checksum adjustment in is_sumd[i], then sets IS_ISNSYN */
/* (i == 0) or IS_ISNACK (i == 1) in is_flags. */
1527 /* ------------------------------------------------------------------------ */
1528 static void fr_checknewisn(fin, is)
1532 u_32_t sumd, old, new;
/* Work is only needed if the flag for this direction is not yet set. */
1539 if (((i == 0) && !(is->is_flags & IS_ISNSYN)) ||
1540 ((i == 1) && !(is->is_flags & IS_ISNACK))) {
1541 old = ntohl(tcp->th_seq); /* sequence number carried by the packet */
1542 new = fr_newisn(fin); /* replacement initial sequence number */
1543 is->is_isninc[i] = new - old; /* offset applied by fr_fixoutisn/fr_fixinisn */
1544 CALC_SUMD(old, new, sumd);
1545 is->is_sumd[i] = (sumd & 0xffff) + (sumd >> 16); /* fold the 32-bit delta into the low 16 bits */
1547 is->is_flags |= ((i == 0) ? IS_ISNSYN : IS_ISNACK);
1552 /* ------------------------------------------------------------------------ */
1553 /* Function: fr_tcpinwindow */
1554 /* Returns: int - 1 == packet inside TCP "window", 0 == not inside, */
1555 /* 2 == packet seq number matches next expected */
1556 /* Parameters: fin(I) - pointer to packet information */
1557 /* fdata(I) - pointer to tcp state information (forward) */
1558 /* tdata(I) - pointer to tcp state information (reverse) */
1559 /* tcp(I) - pointer to TCP packet header */
/* flags(I) - state flags; IS_TCPFSM and IS_STRICT are tested here */
1561 /* Given a packet has matched addresses and ports, check to see if it is */
1562 /* within the TCP data window. In a show of generosity, allow packets that */
1563 /* are within the window space behind the current sequence # as well. */
/* Both fdata and tdata are updated (td_end, td_maxend, td_maxwin and */
/* td_winflags are assigned below), so they are not purely inputs. */
1564 /* ------------------------------------------------------------------------ */
1565 int fr_tcpinwindow(fin, fdata, tdata, tcp, flags)
1567 tcpdata_t *fdata, *tdata;
1571 tcp_seq seq, ack, end;
1572 int ackskew, tcpflags;
1577 * Find difference between last checked packet and this packet.
1579 tcpflags = tcp->th_flags;
1580 seq = ntohl(tcp->th_seq);
1581 ack = ntohl(tcp->th_ack);
1582 if (tcpflags & TH_SYN)
1583 win = ntohs(tcp->th_win); /* the window in a SYN is used unscaled */
1585 win = ntohs(tcp->th_win) << fdata->td_winscale; /* otherwise apply fdata's recorded scale */
1588 * A window of 0 produces undesirable behaviour from this function.
1593 dsize = fin->fin_dlen - (TCP_OFF(tcp) << 2) +
1594 ((tcpflags & TH_SYN) ? 1 : 0) + ((tcpflags & TH_FIN) ? 1 : 0); /* payload bytes plus one each for SYN and FIN */
1597 * if window scaling is present, the scaling is only allowed
1598 * for windows not in the first SYN packet. In that packet the
1599 * window is 65535 to specify the largest window possible
1600 * for receivers not implementing the window scale option.
1601 * Currently, we do not assume TTCP here. That means that
1602 * if we see a second packet from a host (after the initial
1603 * SYN), we can assume that the receiver of the SYN did
1604 * already send back the SYN/ACK (and thus that we know if
1605 * the receiver also does window scaling)
1607 if (!(tcpflags & TH_SYN) && (fdata->td_winflags & TCP_WSCALE_FIRST)) {
1608 fdata->td_winflags &= ~TCP_WSCALE_FIRST; /* one-shot: cleared on the first non-SYN packet */
1609 fdata->td_maxwin = win;
/* No data seen yet for this side (td_end == 0): initialise its window. */
1614 if ((fdata->td_end == 0) &&
1615 (!(flags & IS_TCPFSM) ||
1616 ((tcpflags & TH_OPENING) == TH_OPENING))) {
1618 * Must be a (outgoing) SYN-ACK in reply to a SYN.
1620 fdata->td_end = end - 1;
1621 fdata->td_maxwin = 1;
1622 fdata->td_maxend = end + win;
1625 if (!(tcpflags & TH_ACK)) { /* Pretend an ack was sent */
1626 ack = tdata->td_end;
1627 } else if (((tcpflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) &&
1629 /* gross hack to get around certain broken tcp stacks */
1630 ack = tdata->td_end;
1633 maxwin = tdata->td_maxwin;
1634 ackskew = tdata->td_end - ack; /* how far ack is from the other side's recorded end */
1637 * Strict sequencing only allows in-order delivery.
1639 if (seq != fdata->td_end) {
1640 if ((flags & IS_STRICT) != 0) {
/* Main acceptance test: end within maxend, seq not older than one */
/* window behind td_end, and ackskew within the MAXACKWINDOW bounds. */
1646 if ((SEQ_GE(fdata->td_maxend, end)) &&
1647 (SEQ_GE(seq, fdata->td_end - maxwin)) &&
1648 /* XXX what about big packets */
1649 #define MAXACKWINDOW 66000
1650 (-ackskew <= (MAXACKWINDOW)) &&
1651 ( ackskew <= (MAXACKWINDOW << fdata->td_winscale))) {
1654 * Microsoft Windows will send the next packet to the right of the
1655 * window if SACK is in use.
1657 } else if ((seq == fdata->td_maxend) && (ackskew == 0) &&
1658 (fdata->td_winflags & TCP_SACK_PERMIT) &&
1659 (tdata->td_winflags & TCP_SACK_PERMIT)) {
1662 * Sometimes a TCP RST will be generated with only the ACK field
1665 } else if ((seq == 0) && (tcpflags == (TH_RST|TH_ACK)) &&
1666 (ackskew >= -1) && (ackskew <= 1)) {
1668 } else if (!(flags & IS_TCPFSM)) {
1671 i = (fin->fin_rev << 1) + fin->fin_out; /* direction index built from fin_rev and fin_out */
1674 if (is_pkts[i]0 == 0) {
1676 * Picking up a connection in the middle, the "next"
1677 * packet seen from a direction that is new should be
1678 * accepted, even if it appears out of sequence.
1683 if (!(fdata->td_winflags &
1684 (TCP_WSCALE_SEEN|TCP_WSCALE_FIRST))) {
1686 * No TCPFSM and no window scaling, so make some
1689 if ((seq == fdata->td_maxend) && (ackskew == 0))
1691 else if (SEQ_GE(seq + maxwin, fdata->td_end - maxwin))
1696 /* TRACE(inseq, fdata, tdata, seq, end, ack, ackskew, win, maxwin) */
1699 /* if ackskew < 0 then this should be due to fragmented
1700 * packets. There is no way to know the length of the
1701 * total packet in advance.
1702 * We do know the total length from the fragment cache though.
1703 * Note however that there might be more sessions with
1704 * exactly the same source and destination parameters in the
1705 * state cache (and source and destination is the only stuff
1706 * that is saved in the fragment cache). Note further that
1707 * some TCP connections in the state cache are hashed with
1708 * sport and dport as well which makes it not worthwhile to
1710 * Thus, when ackskew is negative but still seems to belong
1711 * to this session, we bump up the destinations end value.
1714 tdata->td_end = ack;
1716 /* update max window seen */
1717 if (fdata->td_maxwin < win)
1718 fdata->td_maxwin = win;
1719 if (SEQ_GT(end, fdata->td_end))
1720 fdata->td_end = end; /* highest sequence end seen from this side */
1721 if (SEQ_GE(ack + win, tdata->td_maxend))
1722 tdata->td_maxend = ack + win; /* furthest point the other side may now send to */
1729 /* ------------------------------------------------------------------------ */
1730 /* Function: fr_stclone */
1731 /* Returns: ipstate_t* - NULL == cloning failed, */
1732 /* else pointer to new state structure */
1733 /* Parameters: fin(I) - pointer to packet information */
1734 /* tcp(I) - pointer to TCP/UDP header */
1735 /* is(I) - pointer to master state structure */
1737 /* Create a "duplicate" state table entry from the master. */
/* The copy gets its own lock and expiry time, has SI_CLONE replaced by */
/* SI_CLONED, and is inserted into the state table with fr_stinsert(). */
1738 /* ------------------------------------------------------------------------ */
1739 static ipstate_t *fr_stclone(fin, tcp, is)
/* Table is at its configured limit: count the event and request a flush. */
1747 if (ips_num == fr_statemax) {
1748 ATOMIC_INCL(ips_stats.iss_max);
1749 fr_state_doflush = 1;
1752 KMALLOC(clone, ipstate_t *);
1755 bcopy((char *)is, (char *)clone, sizeof(*clone)); /* start from a byte copy of the master */
1757 MUTEX_NUKE(&clone->is_lock); /* the copied lock contents must not be reused */
1759 clone->is_die = ONE_DAY + fr_ticks;
1760 clone->is_state[0] = 0;
1761 clone->is_state[1] = 0;
/* send = sequence number just past this packet's data, SYN and FIN. */
1762 send = ntohl(tcp->th_seq) + fin->fin_dlen - (TCP_OFF(tcp) << 2) +
1763 ((tcp->th_flags & TH_SYN) ? 1 : 0) +
1764 ((tcp->th_flags & TH_FIN) ? 1 : 0);
/* Seed the "d" fields for a reverse packet, the "s" fields otherwise; */
/* an advertised window of zero is stored as 1. */
1766 if (fin->fin_rev == 1) {
1767 clone->is_dend = send;
1768 clone->is_maxdend = send;
1770 clone->is_maxswin = 1;
1771 clone->is_maxdwin = ntohs(tcp->th_win);
1772 if (clone->is_maxdwin == 0)
1773 clone->is_maxdwin = 1;
1775 clone->is_send = send;
1776 clone->is_maxsend = send;
1778 clone->is_maxdwin = 1;
1779 clone->is_maxswin = ntohs(tcp->th_win);
1780 if (clone->is_maxswin == 0)
1781 clone->is_maxswin = 1;
1784 clone->is_flags &= ~SI_CLONE; /* the clone itself must not be cloned again */
1785 clone->is_flags |= SI_CLONED;
1786 fr_stinsert(clone, fin->fin_rev);
1788 if (clone->is_p == IPPROTO_TCP) {
1789 (void) fr_tcp_age(&clone->is_sti, fin, ips_tqtqb,
1792 MUTEX_EXIT(&clone->is_lock);
1793 #ifdef IPFILTER_SCAN
1794 (void) ipsc_attachis(is); /* NOTE(review): passes the master, not the clone - confirm this is intended */
1796 #ifdef IPFILTER_SYNC
1797 if (is->is_flags & IS_STATESYNC)
1798 clone->is_sync = ipfsync_new(SMC_STATE, fin, clone);
1804 /* ------------------------------------------------------------------------ */
1805 /* Function: fr_matchsrcdst */
1807 /* Parameters: fin(I) - pointer to packet information */
1808 /* is(I) - pointer to state structure */
1809 /* src(I) - pointer to source address */
1810 /* dst(I) - pointer to destination address */
1811 /* tcp(I) - pointer to TCP/UDP header */
/* cmask(I) - mask applied to fin_flx and to the stored flags */
/* before the two are compared */
1813 /* Match a state table entry against an IP packet. The logic below is that */
1814 /* ret gets set to one if the match succeeds, else remains 0. If it is */
1815 /* still 0 after the test, there is no match. */
/* A successful match can modify the entry: wildcard addresses and ports are */
/* filled in, is_flx/is_opt are recorded and the interface is latched. */
1816 /* ------------------------------------------------------------------------ */
1817 static ipstate_t *fr_matchsrcdst(fin, is, src, dst, tcp, cmask)
1820 i6addr_t *src, *dst;
1824 int ret = 0, rev, out, flags, flx = 0, idx;
1829 rev = IP6_NEQ(&is->is_dst, dst); /* non-zero when dst differs from the entry's destination */
1832 flags = is->is_flags;
1837 sp = htons(fin->fin_sport);
1838 dp = ntohs(fin->fin_dport); /* NOTE(review): htons/ntohs are mixed here; both convert for comparison with is_sport/is_dport */
1842 if (!(flags & SI_W_SPORT) && (sp != is->is_sport))
1844 else if (!(flags & SI_W_DPORT) && (dp != is->is_dport))
1849 idx = (out << 1) + rev; /* index into the four-entry is_ifp[]/is_ifname[] arrays */
1852 * If the interface for this 'direction' is set, make sure it matches.
1853 * An interface name that is not set matches any, as does a name of *.
1855 if ((is->is_ifp[idx] == ifp) || (is->is_ifp[idx] == NULL &&
1856 (*is->is_ifname[idx] == '\0' || *is->is_ifname[idx] == '-' ||
1857 *is->is_ifname[idx] == '*')))
1865 * Match addresses and ports.
1868 if ((IP6_EQ(&is->is_dst, dst) || (flags & SI_W_DADDR)) &&
1869 (IP6_EQ(&is->is_src, src) || (flags & SI_W_SADDR))) {
1871 if ((sp == is->is_sport || flags & SI_W_SPORT)&&
1872 (dp == is->is_dport || flags & SI_W_DPORT))
/* Same test with source and destination swapped. */
1879 if ((IP6_EQ(&is->is_dst, src) || (flags & SI_W_DADDR)) &&
1880 (IP6_EQ(&is->is_src, dst) || (flags & SI_W_SADDR))) {
1882 if ((dp == is->is_sport || flags & SI_W_SPORT)&&
1883 (sp == is->is_dport || flags & SI_W_DPORT))
1895 * Whether or not this should be here, is questionable, but the aim
1896 * is to get this out of the main line.
1899 flags = is->is_flags & ~(SI_WILDP|SI_NEWFR|SI_CLONE|SI_CLONED);
1902 * Only one of the source or destination address can be flaged as a
1903 * wildcard. Fill in the missing address, if set.
1904 * For IPv6, if the address being copied in is multicast, then
1905 * don't reset the wild flag - multicast causes it to be set in the
1908 if ((flags & (SI_W_SADDR|SI_W_DADDR))) {
1909 fr_ip_t *fi = &fin->fin_fi;
1911 if ((flags & SI_W_SADDR) != 0) {
1914 if (is->is_v == 6 &&
1915 IN6_IS_ADDR_MULTICAST(&fi->fi_src.in6))
1920 is->is_src = fi->fi_src;
1921 is->is_flags &= ~SI_W_SADDR;
1925 if (is->is_v == 6 &&
1926 IN6_IS_ADDR_MULTICAST(&fi->fi_dst.in6))
1931 is->is_src = fi->fi_dst;
1932 is->is_flags &= ~SI_W_SADDR;
1935 } else if ((flags & SI_W_DADDR) != 0) {
1938 if (is->is_v == 6 &&
1939 IN6_IS_ADDR_MULTICAST(&fi->fi_dst.in6))
1944 is->is_dst = fi->fi_dst;
1945 is->is_flags &= ~SI_W_DADDR;
1949 if (is->is_v == 6 &&
1950 IN6_IS_ADDR_MULTICAST(&fi->fi_src.in6))
1955 is->is_dst = fi->fi_src;
1956 is->is_flags &= ~SI_W_DADDR;
/* Once no wildcard flags remain, the wildcard entry counter is decremented. */
1960 if ((is->is_flags & (SI_WILDA|SI_WILDP)) == 0) {
1961 ATOMIC_DECL(ips_stats.iss_wild);
1965 flx = fin->fin_flx & cmask;
1966 cflx = is->is_flx[out][rev]; /* flags recorded earlier for this direction, 0 if none */
1969 * Match up any flags set from IP options.
1971 if ((cflx && (flx != (cflx & cmask))) ||
1972 ((fin->fin_optmsk & is->is_optmsk[rev]) != is->is_opt[rev]) ||
1973 ((fin->fin_secmsk & is->is_secmsk) != is->is_sec) ||
1974 ((fin->fin_auth & is->is_authmsk) != is->is_auth))
1978 * Only one of the source or destination port can be flagged as a
1979 * wildcard. When filling it in, fill in a copy of the matched entry
1980 * if it has the cloning flag set.
1982 if ((fin->fin_flx & FI_IGNORE) != 0) {
1987 if ((flags & (SI_W_SPORT|SI_W_DPORT))) {
1988 if ((flags & SI_CLONE) != 0) {
1991 clone = fr_stclone(fin, tcp, is);
1996 ATOMIC_DECL(ips_stats.iss_wild);
1999 if ((flags & SI_W_SPORT) != 0) {
2002 is->is_send = ntohl(tcp->th_seq);
2005 is->is_send = ntohl(tcp->th_ack);
2007 is->is_maxsend = is->is_send + 1;
2008 } else if ((flags & SI_W_DPORT) != 0) {
2011 is->is_dend = ntohl(tcp->th_ack);
2014 is->is_dend = ntohl(tcp->th_seq);
2016 is->is_maxdend = is->is_dend + 1;
2018 is->is_flags &= ~(SI_W_SPORT|SI_W_DPORT); /* ports are now fixed for this entry */
2019 if ((flags & SI_CLONED) && ipstate_logging)
2020 ipstate_log(is, ISL_CLONE);
/* First packet seen for this out/rev pair: record its flags and options. */
2025 if (is->is_flx[out][rev] == 0) {
2026 is->is_flx[out][rev] = flx;
2027 is->is_opt[rev] = fin->fin_optmsk;
2028 if (is->is_v == 6) {
2029 is->is_opt[rev] &= ~0x8;
2030 is->is_optmsk[rev] &= ~0x8;
2035 * Check if the interface name for this "direction" is set and if not,
2038 if (is->is_ifp[idx] == NULL &&
2039 (*is->is_ifname[idx] == '\0' || *is->is_ifname[idx] == '*')) {
2040 is->is_ifp[idx] = ifp;
2041 COPYIFNAME(is->is_v, ifp, is->is_ifname[idx]);
2048 /* ------------------------------------------------------------------------ */
2049 /* Function: fr_checkicmpmatchingstate */
2051 /* Parameters: fin(I) - pointer to packet information */
2053 /* If we've got an ICMP error message, using the information stored in the */
2054 /* ICMP packet, look for a matching state table entry. */
/* The packet embedded in the ICMP error is described by a temporary */
/* fr_info_t (ofin) and that description is matched with fr_matchsrcdst(). */
2056 /* If we return NULL then no lock on ipf_state is held. */
2057 /* If we return non-null then a read-lock on ipf_state is held. */
2058 /* ------------------------------------------------------------------------ */
2059 static ipstate_t *fr_checkicmpmatchingstate(fin)
2062 ipstate_t *is, **isp;
2063 u_short sport, dport;
2065 int backward, i, oi;
2077 * Does it at least have the return (basic) IP header ?
2078 * Is it an actual recognised ICMP error type?
2079 * Only a basic IP header (no options) should be with
2080 * an ICMP error header.
2082 if ((fin->fin_v != 4) || (fin->fin_hlen != sizeof(ip_t)) ||
2083 (fin->fin_plen < ICMPERR_MINPKTLEN) ||
2084 !(fin->fin_flx & FI_ICMPERR))
2087 type = ic->icmp_type;
2089 oip = (ip_t *)((char *)ic + ICMPERR_ICMPHLEN); /* embedded ("old") IP header follows the ICMP header */
2091 * Check if the at least the old IP header (with options) and
2092 * 8 bytes of payload is present.
2094 if (fin->fin_plen < ICMPERR_MAXPKTLEN + ((IP_HL(oip) - 5) << 2))
2100 len = fin->fin_dlen - ICMPERR_ICMPHLEN; /* bytes available for the embedded packet */
2101 if ((len <= 0) || ((IP_HL(oip) << 2) > len))
2105 * Is the buffer big enough for all of it ? It's the size of the IP
2106 * header claimed in the encapsulated part which is of concern. It
2107 * may be too big to be in this buffer but not so big that it's
2108 * outside the ICMP packet, leading to TCP deref's causing problems.
2109 * This is possible because we don't know how big oip_hl is when we
2110 * do the pullup early in fr_check() and thus can't guarantee it is
2118 # if defined(MENTAT)
2119 if ((char *)oip + len > (char *)m->b_wptr)
2122 if ((char *)oip + len > (char *)fin->fin_ip + m->m_len)
2127 bcopy((char *)fin, (char *)&ofin, sizeof(*fin)); /* ofin starts as a copy of fin */
2130 * in the IPv4 case we must zero the i6addr union otherwise
2131 * the IP6_EQ and IP6_NEQ macros produce the wrong results because
2132 * of the 'junk' in the unused part of the union
2134 bzero((char *)&src, sizeof(src));
2135 bzero((char *)&dst, sizeof(dst));
2138 * we make an fin entry to be able to feed it to
2139 * matchsrcdst note that not all fields are encessary
2140 * but this is the cleanest way. Note further we fill
2141 * in fin_mp such that if someone uses it we'll get
2142 * a kernel panic. fr_matchsrcdst does not use this.
2144 * watch out here, as ip is in host order and oip in network
2145 * order. Any change we make must be undone afterwards, like
2146 * oip->ip_off - it is still in network byte order so fix it.
2148 savelen = oip->ip_len;
2150 oip->ip_off = ntohs(oip->ip_off); /* temporarily host order; restored below */
2152 ofin.fin_flx = FI_NOCKSUM;
2155 ofin.fin_m = NULL; /* if dereferenced, panic XXX */
2156 ofin.fin_mp = NULL; /* if dereferenced, panic XXX */
2157 (void) fr_makefrip(IP_HL(oip) << 2, oip, &ofin);
2158 ofin.fin_ifp = fin->fin_ifp;
2159 ofin.fin_out = !fin->fin_out; /* the embedded packet travelled the opposite way */
2161 * Reset the short and bad flag here because in fr_matchsrcdst()
2162 * the flags for the current packet (fin_flx) are compared against
2163 * those for the existing session.
2165 ofin.fin_flx &= ~(FI_BAD|FI_SHORT);
2168 * Put old values of ip_len and ip_off back as we don't know
2169 * if we have to forward the packet (or process it again.
2171 oip->ip_len = savelen;
2172 oip->ip_off = htons(oip->ip_off);
2178 * an ICMP error can only be generated as a result of an
2179 * ICMP query, not as the response on an ICMP error
2181 * XXX theoretically ICMP_ECHOREP and the other reply's are
2182 * ICMP query's as well, but adding them here seems strange XXX
2184 if ((ofin.fin_flx & FI_ICMPERR) != 0)
2188 * perform a lookup of the ICMP packet in the state table
2190 icmp = (icmphdr_t *)((char *)oip + (IP_HL(oip) << 2));
/* Hash on protocol, both addresses and the ICMP id of the embedded packet. */
2191 hv = (pr = oip->ip_p);
2192 src.in4 = oip->ip_src;
2193 hv += src.in4.s_addr;
2194 dst.in4 = oip->ip_dst;
2195 hv += dst.in4.s_addr;
2196 hv += icmp->icmp_id;
2197 hv = DOUBLE_HASH(hv);
2199 READ_ENTER(&ipf_state);
2200 for (isp = &ips_table[hv]; ((is = *isp) != NULL); ) {
2201 isp = &is->is_hnext; /* advance first: "is" is reassigned below */
2202 if ((is->is_p != pr) || (is->is_v != 4))
2204 if (is->is_pass & FR_NOICMPERR)
2206 is = fr_matchsrcdst(&ofin, is, &src, &dst,
2210 * i : the index of this packet (the icmp
2212 * oi : the index of the original packet found
2213 * in the icmp header (i.e. the packet
2214 * causing this icmp)
2215 * backward : original packet was backward
2216 * compared to the state
2218 backward = IP6_NEQ(&is->is_src, &src);
2219 fin->fin_rev = !backward;
2220 i = (!backward << 1) + fin->fin_out;
2221 oi = (backward << 1) + ofin.fin_out;
/* More ICMP errors than original packets in that direction is suspicious. */
2222 if (is->is_icmppkts[i] > is->is_pkts[oi])
2224 ips_stats.iss_hits++;
2225 is->is_icmppkts[i]++;
2229 RWLOCK_EXIT(&ipf_state);
2238 tcp = (tcphdr_t *)((char *)oip + (IP_HL(oip) << 2));
2239 dport = tcp->th_dport;
2240 sport = tcp->th_sport;
2242 hv = (pr = oip->ip_p);
2243 src.in4 = oip->ip_src;
2244 hv += src.in4.s_addr;
2245 dst.in4 = oip->ip_dst;
2246 hv += dst.in4.s_addr;
2249 hv = DOUBLE_HASH(hv);
2251 READ_ENTER(&ipf_state);
2252 for (isp = &ips_table[hv]; ((is = *isp) != NULL); ) {
2253 isp = &is->is_hnext;
2255 * Only allow this icmp though if the
2256 * encapsulated packet was allowed through the
2257 * other way around. Note that the minimal amount
2258 * of info present does not allow for checking against
2259 * tcp internals such as seq and ack numbers. Only the
2260 * ports are known to be present and can be even if the
2261 * short flag is set.
2263 if ((is->is_p == pr) && (is->is_v == 4) &&
2264 (is = fr_matchsrcdst(&ofin, is, &src, &dst,
2265 tcp, FI_ICMPCMP))) {
2267 * i : the index of this packet (the icmp unreachable)
2268 * oi : the index of the original packet found in the
2269 * icmp header (i.e. the packet causing this icmp)
2270 * backward : original packet was backward compared to
2273 backward = IP6_NEQ(&is->is_src, &src);
2274 fin->fin_rev = !backward;
2275 i = (!backward << 1) + fin->fin_out;
2276 oi = (backward << 1) + ofin.fin_out;
2278 if (((is->is_pass & FR_NOICMPERR) != 0) ||
2279 (is->is_icmppkts[i] > is->is_pkts[oi]))
2281 ips_stats.iss_hits++;
2282 is->is_icmppkts[i]++;
2284 * we deliberately do not touch the timeouts
2285 * for the accompanying state table entry.
2286 * It remains to be seen if that is correct. XXX
2291 RWLOCK_EXIT(&ipf_state);
2296 /* ------------------------------------------------------------------------ */
2297 /* Function: fr_ipsmove */
2299 /* Parameters: is(I) - pointer to state table entry */
2300 /* hv(I) - new hash value for state table entry */
2301 /* Write Locks: ipf_state */
2303 /* Move a state entry from one position in the hash table to another. */
/* Each entry stores is_hnext (next in bucket) and is_phnext (address of the */
/* pointer that points at this entry), so unlinking needs no list walk. */
/* The per-bucket statistics iss_inuse and iss_bucketlen[] are kept in step. */
2304 /* ------------------------------------------------------------------------ */
2305 static void fr_ipsmove(is, hv)
2314 * Remove the hash from the old location...
2316 isp = is->is_phnext;
2318 is->is_hnext->is_phnext = isp;
2319 *isp = is->is_hnext; /* the predecessor now skips this entry */
2320 if (ips_table[hvm] == NULL)
2321 ips_stats.iss_inuse--; /* the old bucket became empty */
2322 ips_stats.iss_bucketlen[hvm]--;
2325 * ...and put the hash in the new one.
2327 hvm = DOUBLE_HASH(hv);
2329 isp = &ips_table[hvm]; /* insert at the head of the new bucket */
2331 (*isp)->is_phnext = &is->is_hnext;
2333 ips_stats.iss_inuse++;
2334 ips_stats.iss_bucketlen[hvm]++;
2335 is->is_phnext = isp;
2336 is->is_hnext = *isp;
2341 /* ------------------------------------------------------------------------ */
2342 /* Function: fr_stlookup */
2343 /* Returns: ipstate_t* - NULL == no matching state found, */
2344 /* else pointer to state information is returned */
2345 /* Parameters: fin(I) - pointer to packet information */
2346 /* tcp(I) - pointer to TCP/UDP header. */
/* ifqp(O) - may be NULL; presumably receives the timeout */
/* queue chosen for the entry (TODO confirm) */
2348 /* Search the state table for a matching entry to the packet described by */
2349 /* the contents of *fin. */
/* The hash is built from the protocol and addresses, plus further */
/* protocol-specific values. While wildcard entries exist */
/* (ips_stats.iss_wild), failed searches are retried with parts of the hash */
/* left out. */
2351 /* If we return NULL then no lock on ipf_state is held. */
2352 /* If we return non-null then a read-lock on ipf_state is held. */
2353 /* ------------------------------------------------------------------------ */
2354 ipstate_t *fr_stlookup(fin, tcp, ifqp)
2359 u_int hv, hvm, pr, v, tryagain;
2360 ipstate_t *is, **isp;
2361 u_short dport, sport;
2370 ic = (struct icmp *)tcp; /* same header pointer viewed as ICMP */
2371 hv = (pr = fin->fin_fi.fi_p);
2372 src = fin->fin_fi.fi_src;
2373 dst = fin->fin_fi.fi_dst;
2374 hv += src.in4.s_addr;
2375 hv += dst.in4.s_addr;
2377 v = fin->fin_fi.fi_v;
2380 hv += fin->fin_fi.fi_src.i6[1]; /* remaining words of the IPv6 source address */
2381 hv += fin->fin_fi.fi_src.i6[2];
2382 hv += fin->fin_fi.fi_src.i6[3];
2384 if ((fin->fin_p == IPPROTO_ICMPV6) &&
2385 IN6_IS_ADDR_MULTICAST(&fin->fin_fi.fi_dst.in6)) {
2386 hv -= dst.in4.s_addr; /* multicast destination is left out of the hash */
2388 hv += fin->fin_fi.fi_dst.i6[1];
2389 hv += fin->fin_fi.fi_dst.i6[2];
2390 hv += fin->fin_fi.fi_dst.i6[3];
2395 (fin->fin_flx & (FI_MULTICAST|FI_BROADCAST|FI_MBCAST))) {
2396 if (fin->fin_out == 0) {
2397 hv -= src.in4.s_addr;
2399 hv -= dst.in4.s_addr;
2404 * Search the hash table for matching packet header info.
2409 case IPPROTO_ICMPV6 :
2412 if ((ic->icmp_type == ICMP6_ECHO_REQUEST) ||
2413 (ic->icmp_type == ICMP6_ECHO_REPLY)) {
2417 READ_ENTER(&ipf_state);
2419 hvm = DOUBLE_HASH(hv);
2420 for (isp = &ips_table[hvm]; ((is = *isp) != NULL); ) {
2421 isp = &is->is_hnext; /* advance first: "is" is reassigned below */
2423 * If a connection is about to be deleted, no packets
2424 * are allowed to match it.
2426 if (is->is_sti.tqe_ifq == &ips_deletetq)
2429 if ((is->is_p != pr) || (is->is_v != v))
2431 is = fr_matchsrcdst(fin, is, &src, &dst, NULL, FI_CMP);
2433 fr_matchicmpqueryreply(v, &is->is_icmp,
2434 ic, fin->fin_rev)) {
2436 ifq = &ips_icmpacktq;
/* Found on a retry: restore the source address words to the hash. */
2444 if ((tryagain != 0) && !(is->is_flags & SI_W_DADDR)) {
2445 hv += fin->fin_fi.fi_src.i6[0];
2446 hv += fin->fin_fi.fi_src.i6[1];
2447 hv += fin->fin_fi.fi_src.i6[2];
2448 hv += fin->fin_fi.fi_src.i6[3];
2450 MUTEX_DOWNGRADE(&ipf_state);
2454 RWLOCK_EXIT(&ipf_state);
2457 * No matching icmp state entry. Perhaps this is a
2458 * response to another state entry.
2460 * XXX With some ICMP6 packets, the "other" address is already
2461 * in the packet, after the ICMP6 header, and this could be
2462 * used in place of the multicast address. However, taking
2463 * advantage of this requires some significant code changes
2464 * to handle the specific types where that is the case.
2466 if ((ips_stats.iss_wild != 0) && (v == 6) && (tryagain == 0) &&
2467 !IN6_IS_ADDR_MULTICAST(&fin->fin_fi.fi_src.in6)) {
2468 hv -= fin->fin_fi.fi_src.i6[0];
2469 hv -= fin->fin_fi.fi_src.i6[1];
2470 hv -= fin->fin_fi.fi_src.i6[2];
2471 hv -= fin->fin_fi.fi_src.i6[3];
2473 WRITE_ENTER(&ipf_state); /* the retry takes the write lock */
2477 is = fr_checkicmp6matchingstate(fin);
2487 hv = DOUBLE_HASH(hv);
2488 READ_ENTER(&ipf_state);
2489 for (isp = &ips_table[hv]; ((is = *isp) != NULL); ) {
2490 isp = &is->is_hnext;
2491 if ((is->is_p != pr) || (is->is_v != v))
2493 is = fr_matchsrcdst(fin, is, &src, &dst, NULL, FI_CMP);
2495 (ic->icmp_id == is->is_icmp.ici_id) &&
2496 fr_matchicmpqueryreply(v, &is->is_icmp,
2497 ic, fin->fin_rev)) {
2499 ifq = &ips_icmpacktq;
2506 RWLOCK_EXIT(&ipf_state);
2513 sport = htons(fin->fin_data[0]);
2515 dport = htons(fin->fin_data[1]);
2519 READ_ENTER(&ipf_state);
2521 hvm = DOUBLE_HASH(hv);
2522 for (isp = &ips_table[hvm]; ((is = *isp) != NULL); ) {
2523 isp = &is->is_hnext;
2524 if ((is->is_p != pr) || (is->is_v != v))
2526 fin->fin_flx &= ~FI_OOW; /* cleared per candidate; collected in oow below */
2527 is = fr_matchsrcdst(fin, is, &src, &dst, tcp, FI_CMP);
2529 if (pr == IPPROTO_TCP) {
2530 if (!fr_tcpstate(fin, tcp, is)) {
2531 oow |= fin->fin_flx & FI_OOW;
2540 !(is->is_flags & (SI_CLONE|SI_WILDP|SI_WILDA))) {
2544 MUTEX_DOWNGRADE(&ipf_state);
2548 RWLOCK_EXIT(&ipf_state);
2550 if (ips_stats.iss_wild) {
2551 if (tryagain == 0) {
2554 } else if (tryagain == 1) {
2555 hv = fin->fin_fi.fi_p;
2557 * If we try to pretend this is a reply to a
2558 * multicast/broadcast packet then we need to
2559 * exclude part of the address from the hash
2562 if (fin->fin_out == 0) {
2563 hv += src.in4.s_addr;
2565 hv += dst.in4.s_addr;
2571 if (tryagain <= 2) {
2572 WRITE_ENTER(&ipf_state);
2576 fin->fin_flx |= oow; /* report FI_OOW if any candidate set it */
2582 if (GRE_REV(gre->gr_flags) == 1) {
2589 hvm = DOUBLE_HASH(hv);
2590 READ_ENTER(&ipf_state);
2591 for (isp = &ips_table[hvm]; ((is = *isp) != NULL); ) {
2592 isp = &is->is_hnext;
2593 if ((is->is_p != pr) || (is->is_v != v))
2595 is = fr_matchsrcdst(fin, is, &src, &dst, NULL, FI_CMP);
2602 RWLOCK_EXIT(&ipf_state);
/* A rule-supplied queue for this direction overrides the protocol default. */
2608 if (((is->is_sti.tqe_flags & TQE_RULEBASED) != 0) &&
2609 (is->is_tqehead[fin->fin_rev] != NULL))
2610 ifq = is->is_tqehead[fin->fin_rev];
2611 if (ifq != NULL && ifqp != NULL)
2618 /* ------------------------------------------------------------------------ */
2619 /* Function: fr_updatestate */
2621 /* Parameters: fin(I) - pointer to packet information */
2622 /* is(I) - pointer to state table entry */
/* ifq(I) - timeout queue to move the entry to (NULL for TCP, */
/* per the comment in the body) */
2623 /* Read Locks: ipf_state */
2625 /* Updates packet and byte counters for a newly received packet. Seeds the */
2626 /* fragment cache with a new entry as required. */
/* Also sets fin_fr from the state's rule and bumps the iss_hits statistic. */
2627 /* ------------------------------------------------------------------------ */
2628 void fr_updatestate(fin, is, ifq)
2636 i = (fin->fin_rev << 1) + fin->fin_out; /* direction index for the counters */
2639 * For TCP packets, ifq == NULL. For all others, check if this new
2640 * queue is different to the last one it was on and move it if so.
2643 MUTEX_ENTER(&is->is_lock);
2644 if ((tqe->tqe_flags & TQE_RULEBASED) != 0)
2645 ifq = is->is_tqehead[fin->fin_rev]; /* a rule-based queue replaces the caller's choice */
2648 fr_movequeue(tqe, tqe->tqe_ifq, ifq);
2651 is->is_bytes[i] += fin->fin_plen;
2652 MUTEX_EXIT(&is->is_lock);
2654 #ifdef IPFILTER_SYNC
2655 if (is->is_flags & IS_STATESYNC)
2656 ipfsync_update(SMC_STATE, fin, is->is_sync);
2659 ATOMIC_INCL(ips_stats.iss_hits);
2661 fin->fin_fr = is->is_rule;
2664 * If this packet is a fragment and the rule says to track fragments,
2665 * then create a new fragment cache entry.
2668 if ((fin->fin_flx & FI_FRAG) && FR_ISPASS(pass))
2669 (void) fr_newfrag(fin, pass ^ FR_KEEPSTATE); /* the XOR toggles FR_KEEPSTATE in the flags handed on */
2673 /* ------------------------------------------------------------------------ */
2674 /* Function: fr_checkstate */
2675 /* Returns: frentry_t* - NULL == search failed, */
2676 /* else pointer to rule for matching state */
2677 /* Parameters: fin(I) - pointer to packet information */
2678 /* passp(I) - pointer to filtering result flags */
2680 /* Check if a packet is associated with an entry in the state table. */
/* On a match the state is updated through fr_updatestate(), remembered in */
/* fin_state, and FI_STATE is set in fin_flx. */
2681 /* ------------------------------------------------------------------------ */
2682 frentry_t *fr_checkstate(fin, passp)
/* Nothing to do while state is locked, the table is empty, or the packet */
/* is short, bad, a fragment body, or already matched against state. */
2692 if (fr_state_lock || (ips_list == NULL) ||
2693 (fin->fin_flx & (FI_SHORT|FI_STATE|FI_FRAGBODY|FI_BAD)))
2697 if ((fin->fin_flx & FI_TCPUDP) ||
2698 (fin->fin_fi.fi_p == IPPROTO_ICMP)
2700 || (fin->fin_fi.fi_p == IPPROTO_ICMPV6)
2708 * Search the hash table for matching packet header info.
2711 is = fin->fin_state;
2713 is = fr_stlookup(fin, tcp, &ifq);
2717 case IPPROTO_ICMPV6 :
2720 if (fin->fin_v == 6) {
2721 is = fr_checkicmp6matchingstate(fin);
2731 * No matching icmp state entry. Perhaps this is a
2732 * response to another state entry.
2734 is = fr_checkicmpmatchingstate(fin);
/* States created with FR_NEWISN have their sequence numbers translated. */
2742 if (is->is_pass & FR_NEWISN) {
2743 if (fin->fin_out == 0)
2744 fr_fixinisn(fin, is);
2745 else if (fin->fin_out == 1)
2746 fr_fixoutisn(fin, is);
2751 ifq = &ips_udpacktq;
2757 ATOMIC_INCL(ips_stats.iss_miss);
/* Inbound packets must carry a matching NAT tag when the rule names one. */
2764 if ((fin->fin_out == 0) && (fr->fr_nattag.ipt_num[0] != 0)) {
2765 if (fin->fin_nattag == NULL)
2767 if (fr_matchtag(&fr->fr_nattag, fin->fin_nattag) != 0)
2770 (void) strncpy(fin->fin_group, fr->fr_group, FR_GROUPLEN); /* NOTE(review): strncpy does not terminate if fr_group fills FR_GROUPLEN */
2771 fin->fin_icode = fr->fr_icode;
2774 fin->fin_rule = is->is_rulen;
2776 fr_updatestate(fin, is, ifq);
2778 fin->fin_state = is;
2779 is->is_touched = fr_ticks;
2780 MUTEX_ENTER(&is->is_lock);
2782 MUTEX_EXIT(&is->is_lock);
2783 RWLOCK_EXIT(&ipf_state); /* drops the lock held since the lookup returned */
2784 fin->fin_flx |= FI_STATE;
2785 if ((pass & FR_LOGFIRST) != 0)
2786 pass &= ~(FR_LOGFIRST|FR_LOG); /* with FR_LOGFIRST, state-matched packets are not logged */
2792 /* ------------------------------------------------------------------------ */
2793 /* Function: fr_fixoutisn */
2795 /* Parameters: fin(I) - pointer to packet information */
2796 /* is(I) - pointer to master state structure */
2798 /* Called only for outbound packets, adjusts the sequence number and the */
2799 /* TCP checksum to match that change. */
/* The offsets and checksum deltas used are the is_isninc[] and is_sumd[] */
/* values stored by fr_checknewisn(). */
2800 /* ------------------------------------------------------------------------ */
2801 static void fr_fixoutisn(fin, is)
2811 if ((is->is_flags & IS_ISNSYN) != 0) {
2813 seq = ntohl(tcp->th_seq);
2814 seq += is->is_isninc[0]; /* shift into the new sequence space */
2815 tcp->th_seq = htonl(seq);
2816 fix_outcksum(fin, &tcp->th_sum, is->is_sumd[0]);
2819 if ((is->is_flags & IS_ISNACK) != 0) {
2821 seq = ntohl(tcp->th_seq);
2822 seq += is->is_isninc[1];
2823 tcp->th_seq = htonl(seq);
2824 fix_outcksum(fin, &tcp->th_sum, is->is_sumd[1]);
2830 /* ------------------------------------------------------------------------ */
2831 /* Function: fr_fixinisn */
2833 /* Parameters: fin(I) - pointer to packet information */
2834 /* is(I) - pointer to master state structure */
2836 /* Called only for inbound packets, adjusts the acknowledge number and the */
2837 /* TCP checksum to match that change. */
2838 /* ------------------------------------------------------------------------ */
2839 static void fr_fixinisn(fin, is)
2849 if ((is->is_flags & IS_ISNSYN) != 0) {
/* IS_ISNSYN case: th_ack has is_isninc[0] subtracted from it; the */
/* arithmetic is done in host byte order and stored in network order. */
2851 ack = ntohl(tcp->th_ack);
2852 ack -= is->is_isninc[0];
2853 tcp->th_ack = htonl(ack);
/* Patch the TCP checksum with the matching delta, is_sumd[0]. */
2854 fix_incksum(fin, &tcp->th_sum, is->is_sumd[0]);
2857 if ((is->is_flags & IS_ISNACK) != 0) {
/* IS_ISNACK case: same adjustment using slot [1] of is_isninc/is_sumd. */
2859 ack = ntohl(tcp->th_ack);
2860 ack -= is->is_isninc[1];
2861 tcp->th_ack = htonl(ack);
2862 fix_incksum(fin, &tcp->th_sum, is->is_sumd[1]);
2868 /* ------------------------------------------------------------------------ */
2869 /* Function: fr_statesync */
2871 /* Parameters: ifp(I) - pointer to interface */
2873 /* Walk through all state entries and if an interface pointer match is */
2874 /* found then look it up again, based on its name in case the pointer has */
2875 /* changed since last time. */
2877 /* If ifp is passed in as being non-null then we are only doing updates for */
2878 /* existing, matching, uses of it. */
2879 /* ------------------------------------------------------------------------ */
2880 void fr_statesync(ifp)
/* fr_running is tested once before taking the lock ... */
2886 if (fr_running <= 0)
2889 WRITE_ENTER(&ipf_state);
/* ... and again with ipf_state write-held; this path drops the lock. */
2891 if (fr_running <= 0) {
2892 RWLOCK_EXIT(&ipf_state);
/* Visit every entry on ips_list. */
2896 for (is = ips_list; is; is = is->is_next) {
2898 * Look up all the interface names in the state entry.
2900 for (i = 0; i < 4; i++) {
/* Each entry has four is_ifp/is_ifname slots.  A slot is re-resolved */
/* from its name when ifp is NULL or equals the pointer in the slot.  */
2901 if (ifp == NULL || ifp == is->is_ifp[i])
2902 is->is_ifp[i] = fr_resolvenic(is->is_ifname[i],
2906 RWLOCK_EXIT(&ipf_state);
2910 /* ------------------------------------------------------------------------ */
2911 /* Function: fr_delstate */
2912 /* Returns: int - 0 = entry deleted, else reference count on struct */
2913 /* Parameters: is(I) - pointer to state structure to delete */
2914 /* why(I) - if not 0, log reason why it was deleted */
2915 /* Write Locks: ipf_state */
2917 /* Deletes a state entry from the enumerated list as well as the hash table */
2918 /* and timeout queue lists. Make adjustments to hash table statistics and */
2919 /* global counters as required. */
2920 /* ------------------------------------------------------------------------ */
2921 static int fr_delstate(is, why)
2927 * Since we want to delete this, remove it from the state table,
2928 * where it can be found & used, first.
2930 if (is->is_phnext != NULL) {
/* Splice the entry out of its hash chain through the back-pointer. */
2931 *is->is_phnext = is->is_hnext;
2932 if (is->is_hnext != NULL)
2933 is->is_hnext->is_phnext = is->is_phnext;
/* An emptied bucket reduces the in-use count; the per-bucket length */
/* is decremented as well.                                           */
2934 if (ips_table[is->is_hv] == NULL)
2935 ips_stats.iss_inuse--;
2936 ips_stats.iss_bucketlen[is->is_hv]--;
2938 is->is_phnext = NULL;
2939 is->is_hnext = NULL;
2943 * Because ips_stats.iss_wild is a count of entries in the state
2944 * table that have wildcard flags set, only decerement it once
2947 if (is->is_flags & (SI_WILDP|SI_WILDA)) {
/* Entries flagged SI_CLONED do not decrement iss_wild. */
2948 if (!(is->is_flags & SI_CLONED)) {
2949 ATOMIC_DECL(ips_stats.iss_wild);
2951 is->is_flags &= ~(SI_WILDP|SI_WILDA);
2955 * Next, remove it from the timeout queue it is in.
2957 if (is->is_sti.tqe_ifq != NULL)
2958 fr_deletequeueentry(&is->is_sti);
2960 if (is->is_me != NULL) {
2966 * If it is still in use by something else, do not go any further,
2967 * but note that at this point it is now an orphan. How can this
2968 * be? fr_state_flush() calls fr_delete() directly because it wants
2969 * to empty the table out and if something has a hold on a state
2970 * entry (such as ipfstat), it'll do the deref path that'll bring
2971 * us back here to do the real delete & free.
2973 MUTEX_ENTER(&is->is_lock);
/* is_ref is examined while holding is_lock. */
2974 if (is->is_ref > 1) {
2976 MUTEX_EXIT(&is->is_lock);
2979 MUTEX_EXIT(&is->is_lock);
2983 if (is->is_tqehead[0] != NULL) {
/* For each is_tqehead[] slot, fr_freetimeoutqueue() is called only */
/* when fr_deletetimeoutqueue() returns 0.                          */
2984 if (fr_deletetimeoutqueue(is->is_tqehead[0]) == 0)
2985 fr_freetimeoutqueue(is->is_tqehead[0]);
2987 if (is->is_tqehead[1] != NULL) {
2988 if (fr_deletetimeoutqueue(is->is_tqehead[1]) == 0)
2989 fr_freetimeoutqueue(is->is_tqehead[1]);
2992 #ifdef IPFILTER_SYNC
2994 ipfsync_del(is->is_sync);
2996 #ifdef IPFILTER_SCAN
2997 (void) ipsc_detachis(is);
3001 * Now remove it from the linked list of known states
3003 if (is->is_pnext != NULL) {
3004 *is->is_pnext = is->is_next;
3006 if (is->is_next != NULL)
3007 is->is_next->is_pnext = is->is_pnext;
3009 is->is_pnext = NULL;
/* A log record is written only when state logging is enabled and a */
/* non-zero reason code was supplied.                               */
3013 if (ipstate_logging != 0 && why != 0)
3014 ipstate_log(is, why);
3016 if (is->is_p == IPPROTO_TCP)
3017 ips_stats.iss_fin++;
3019 ips_stats.iss_expire++;
3021 if (is->is_rule != NULL) {
/* Drop this entry from the rule's state count and release the rule. */
3022 is->is_rule->fr_statecnt--;
3023 (void) fr_derefrule(&is->is_rule);
3026 MUTEX_DESTROY(&is->is_lock);
3034 /* ------------------------------------------------------------------------ */
3035 /* Function: fr_timeoutstate */
3037 /* Parameters: Nil */
3039 /* Slowly expire held state for things like UDP and ICMP. The algorithm */
3040 /* used here is to keep the queue sorted with the oldest things at the top */
3041 /* and the youngest at the bottom. So if the top one doesn't need to be */
3042 /* expired then neither will any under it. */
3043 /* ------------------------------------------------------------------------ */
3044 void fr_timeoutstate()
3046 ipftq_t *ifq, *ifqnext;
3047 ipftqent_t *tqe, *tqn;
3052 WRITE_ENTER(&ipf_state);
/* Pass 1: the queues chained from ips_tqtqb.  Each entry's tqe_die is  */
/* compared against fr_ticks; the successor is saved in tqn before the */
/* parent state entry is handed to fr_delstate().                      */
3053 for (ifq = ips_tqtqb; ifq != NULL; ifq = ifq->ifq_next)
3054 for (tqn = ifq->ifq_head; ((tqe = tqn) != NULL); ) {
3055 if (tqe->tqe_die > fr_ticks)
3057 tqn = tqe->tqe_next;
3058 is = tqe->tqe_parent;
3059 fr_delstate(is, ISL_EXPIRE);
/* Pass 2: the same expiry walk over the ips_utqe queues; ifq_next is */
/* read before the queue's entries are processed.                     */
3062 for (ifq = ips_utqe; ifq != NULL; ifq = ifqnext) {
3063 ifqnext = ifq->ifq_next;
3065 for (tqn = ifq->ifq_head; ((tqe = tqn) != NULL); ) {
3066 if (tqe->tqe_die > fr_ticks)
3068 tqn = tqe->tqe_next;
3069 is = tqe->tqe_parent;
3070 fr_delstate(is, ISL_EXPIRE);
/* Pass 3: free ips_utqe queues flagged IFQF_DELETE whose ifq_ref is 0. */
3074 for (ifq = ips_utqe; ifq != NULL; ifq = ifqnext) {
3075 ifqnext = ifq->ifq_next;
3077 if (((ifq->ifq_flags & IFQF_DELETE) != 0) &&
3078 (ifq->ifq_ref == 0)) {
3079 fr_freetimeoutqueue(ifq);
/* A pending flush request is serviced with fr_state_flush(2, 0) and */
/* the request flag is then cleared.                                 */
3083 if (fr_state_doflush) {
3084 (void) fr_state_flush(2, 0);
3085 fr_state_doflush = 0;
3088 RWLOCK_EXIT(&ipf_state);
3093 /* ------------------------------------------------------------------------ */
3094 /* Function: fr_state_flush */
3095 /* Returns: int - 0 == success, -1 == failure */
3096 /* Parameters: which(I) - flush action, see below; proto(I) - 0 or is_v to match */
3097 /* Write Locks: ipf_state */
3099 /* Flush state tables. Three actions currently defined: */
3100 /* which == 0 : flush all state table entries */
3101 /* which == 1 : flush TCP connections which have started to close but are */
3102 /* stuck for some reason. */
3103 /* which == 2 : flush TCP connections which have been idle for a long time, */
3104 /* starting at > 4 days idle and working back in successive half-*/
3105 /* days to at most 12 hours old. If this fails to free enough */
3106 /* slots then work backwards in half hour slots to 30 minutes. */
3107 /* If that too fails, then work backwards in 30 second intervals */
3108 /* for the last 30 minutes to at worst 30 seconds idle. */
3109 /* ------------------------------------------------------------------------ */
3110 static int fr_state_flush(which, proto)
3113 ipftq_t *ifq, *ifqnext;
3114 ipftqent_t *tqe, *tqn;
3115 ipstate_t *is, **isp;
3127 * Style 0 flush removes everything...
3129 for (isp = &ips_list; ((is = *isp) != NULL); ) {
/* A non-zero proto is compared against the entry's is_v field. */
3130 if ((proto != 0) && (is->is_v != proto)) {
3134 if (fr_delstate(is, ISL_FLUSH) == 0)
3143 * Since we're only interested in things that are closing,
3144 * we can start with the appropriate timeout queue.
3146 for (ifq = ips_tqtqb + IPF_TCPS_CLOSE_WAIT; ifq != NULL;
3147 ifq = ifq->ifq_next) {
3149 for (tqn = ifq->ifq_head; ((tqe = tqn) != NULL); ) {
/* Successor is fetched first because the entry may be deleted below. */
3150 tqn = tqe->tqe_next;
3151 is = tqe->tqe_parent;
3152 if (is->is_p != IPPROTO_TCP)
3154 if (fr_delstate(is, ISL_EXPIRE) == 0)
3160 * Also need to look through the user defined queues.
3162 for (ifq = ips_utqe; ifq != NULL; ifq = ifqnext) {
3163 ifqnext = ifq->ifq_next;
3164 for (tqn = ifq->ifq_head; ((tqe = tqn) != NULL); ) {
3165 tqn = tqe->tqe_next;
3166 is = tqe->tqe_parent;
3167 if (is->is_p != IPPROTO_TCP)
/* Deleted here only when both sides are past IPF_TCPS_ESTABLISHED. */
3170 if ((is->is_state[0] > IPF_TCPS_ESTABLISHED) &&
3171 (is->is_state[1] > IPF_TCPS_ESTABLISHED)) {
3172 if (fr_delstate(is, ISL_EXPIRE) == 0)
3183 * Args 5-11 correspond to flushing those particular states
3184 * for TCP connections.
3186 case IPF_TCPS_CLOSE_WAIT :
3187 case IPF_TCPS_FIN_WAIT_1 :
3188 case IPF_TCPS_CLOSING :
3189 case IPF_TCPS_LAST_ACK :
3190 case IPF_TCPS_FIN_WAIT_2 :
3191 case IPF_TCPS_TIME_WAIT :
3192 case IPF_TCPS_CLOSED :
/* `which` is used directly as the index into the ips_tqtqb array. */
3193 tqn = ips_tqtqb[which].ifq_head;
3194 while (tqn != NULL) {
3196 tqn = tqe->tqe_next;
3197 is = tqe->tqe_parent;
3198 if (fr_delstate(is, ISL_FLUSH) == 0)
3208 * Take a large arbitrary number to mean the number of seconds
3209 * for which which consider to be the maximum value we'll allow
3210 * the expiration to be.
3212 which = IPF_TTLVAL(which);
/* From here `which` holds IPF_TTLVAL(which) and is compared with the */
/* ticks elapsed since is_touched.                                    */
3213 for (isp = &ips_list; ((is = *isp) != NULL); ) {
3214 if ((proto == 0) || (is->is_v == proto)) {
3215 if (fr_ticks - is->is_touched > which) {
3216 if (fr_delstate(is, ISL_FLUSH) == 0) {
3233 * Asked to remove inactive entries because the table is full.
3235 if (fr_ticks - ips_last_force_flush > IPF_TTLVAL(5)) {
/* Runs only when more than IPF_TTLVAL(5) ticks have passed since */
/* ips_last_force_flush, which is then reset to fr_ticks.         */
3236 ips_last_force_flush = fr_ticks;
3237 removed = ipf_queueflush(fr_state_flush_entry, ips_tqtqb,
3246 /* ------------------------------------------------------------------------ */
3247 /* Function: fr_state_flush_entry */
3248 /* Returns: int - 0 = entry deleted, else not deleted */
3249 /* Parameters: entry(I) - pointer to state structure to delete */
3250 /* Write Locks: ipf_state */
3252 /* This function is a stepping stone between ipf_queueflush() and */
3253 /* fr_delstate(). It is used so we can provide a uniform interface via the */
3254 /* ipf_queueflush() function. */
3255 /* ------------------------------------------------------------------------ */
3256 static int fr_state_flush_entry(entry)
/* Forward to fr_delstate() with ISL_FLUSH as the reason code and */
/* hand its return value back unchanged.                          */
3259 return fr_delstate(entry, ISL_FLUSH);
3263 /* ------------------------------------------------------------------------ */
3264 /* Function: fr_tcp_age */
3265 /* Returns: int - 1 == state transition made, 0 == no change (rejected) */
3266 /* Parameters: tqe(I) - pointer to timeout queue entry information */
3267 /* fin(I) - pointer to packet information */
3268 /* tqtab(I) - TCP timeout queue table this is in */
3269 /* flags(I) - flags from state/NAT entry */
3271 /* Rewritten by Arjan de Vet <Arjan.deVet@adv.iae.nl>, 2000-07-29: */
3273 /* - (try to) base state transitions on real evidence only, */
3274 /* i.e. packets that are sent and have been received by ipfilter; */
3275 /* diagram 18.12 of TCP/IP volume 1 by W. Richard Stevens was used. */
3277 /* - deal with half-closed connections correctly; */
3279 /* - store the state of the source in state[0] such that ipfstat */
3280 /* displays the state as source/dest instead of dest/source; the calls */
3281 /* to fr_tcp_age have been changed accordingly. */
3283 /* Internal Parameters: */
3285 /* state[0] = state of source (host that initiated connection) */
3286 /* state[1] = state of dest (host that accepted the connection) */
3288 /* dir == 0 : a packet from source to dest */
3289 /* dir == 1 : a packet from dest to source */
3291 /* A typical procession for a connection is as follows: */
3293 /* +--------------+-------------------+ */
3294 /* | Side '0' | Side '1' | */
3295 /* +--------------+-------------------+ */
3296 /* | 0 -> 1 (SYN) | | */
3297 /* | | 0 -> 2 (SYN-ACK) | */
3298 /* | 1 -> 3 (ACK) | | */
3299 /* | | 2 -> 4 (ACK-PUSH) | */
3300 /* | 3 -> 4 (ACK) | | */
3302 /* | | 4 -> 6 (FIN-ACK) | */
3303 /* | 4 -> 5 (ACK) | | */
3304 /* | | 6 -> 6 (ACK-PUSH) | */
3305 /* | 5 -> 5 (ACK) | | */
3306 /* | 5 -> 8 (FIN) | | */
3307 /* | | 6 -> 10 (ACK) | */
3308 /* +--------------+-------------------+ */
3310 /* Locking: it is assumed that the parent of the tqe structure is locked. */
3311 /* ------------------------------------------------------------------------ */
3312 int fr_tcp_age(tqe, fin, tqtab, flags)
3318 int dlen, ostate, nstate, rval, dir;
3326 tcpflags = tcp->th_flags;
/* dlen: fin_dlen minus the TCP header length (TCP_OFF(tcp) << 2). */
3327 dlen = fin->fin_dlen - (TCP_OFF(tcp) << 2);
3329 if (tcpflags & TH_RST) {
/* RST with neither PUSH nor payload selects CLOSED; any other RST */
/* selects CLOSE_WAIT.                                             */
3330 if (!(tcpflags & TH_PUSH) && !dlen)
3331 nstate = IPF_TCPS_CLOSED;
3333 nstate = IPF_TCPS_CLOSE_WAIT;
/* ostate: state recorded for the opposite direction.              */
/* nstate: starts as the state recorded for this packet's direction. */
3336 ostate = tqe->tqe_state[1 - dir];
3337 nstate = tqe->tqe_state[dir];
3341 case IPF_TCPS_LISTEN: /* 0 */
3342 if ((tcpflags & TH_OPENING) == TH_OPENING) {
3344 * 'dir' received an S and sends SA in
3345 * response, LISTEN -> SYN_RECEIVED
3347 nstate = IPF_TCPS_SYN_RECEIVED;
3349 } else if ((tcpflags & TH_OPENING) == TH_SYN) {
3350 /* 'dir' sent S, LISTEN -> SYN_SENT */
3351 nstate = IPF_TCPS_SYN_SENT;
3355 * the next piece of code makes it possible to get
3356 * already established connections into the state table
3357 * after a restart or reload of the filter rules; this
3358 * does not work when a strict 'flags S keep state' is
3359 * used for tcp connections of course
3361 if (((flags & IS_TCPFSM) == 0) &&
3362 ((tcpflags & TH_ACKMASK) == TH_ACK)) {
3364 * we saw an A, guess 'dir' is in ESTABLISHED
3369 case IPF_TCPS_LISTEN :
3370 case IPF_TCPS_SYN_RECEIVED :
3371 nstate = IPF_TCPS_HALF_ESTAB;
3374 case IPF_TCPS_HALF_ESTAB :
3375 case IPF_TCPS_ESTABLISHED :
3376 nstate = IPF_TCPS_ESTABLISHED;
3384 * TODO: besides regular ACK packets we can have other
3385 * packets as well; it is yet to be determined how we
3386 * should initialize the states in those cases
3390 case IPF_TCPS_SYN_SENT: /* 1 */
3391 if ((tcpflags & ~(TH_ECN|TH_CWR)) == TH_SYN) {
3393 * A retransmitted SYN packet. We do not reset
3394 * the timeout here to fr_tcptimeout because a
3395 * connection connect timeout does not renew
3396 * after every packet that is sent. We need to
3397 * set rval so as to indicate the packet has
3398 * passed the check for its flags being valid
3399 * in the TCP FSM. Setting rval to 2 has the
3400 * result of not resetting the timeout.
3403 } else if ((tcpflags & (TH_SYN|TH_FIN|TH_ACK)) ==
3406 * we see an A from 'dir' which is in SYN_SENT
3407 * state: 'dir' sent an A in response to an SA
3408 * which it received, SYN_SENT -> ESTABLISHED
3410 nstate = IPF_TCPS_ESTABLISHED;
3412 } else if (tcpflags & TH_FIN) {
3414 * we see an F from 'dir' which is in SYN_SENT
3415 * state and wants to close its side of the
3416 * connection; SYN_SENT -> FIN_WAIT_1
3418 nstate = IPF_TCPS_FIN_WAIT_1;
3420 } else if ((tcpflags & TH_OPENING) == TH_OPENING) {
3422 * we see an SA from 'dir' which is already in
3423 * SYN_SENT state, this means we have a
3424 * simultaneous open; SYN_SENT -> SYN_RECEIVED
3426 nstate = IPF_TCPS_SYN_RECEIVED;
3431 case IPF_TCPS_SYN_RECEIVED: /* 2 */
3432 if ((tcpflags & (TH_SYN|TH_FIN|TH_ACK)) == TH_ACK) {
3434 * we see an A from 'dir' which was in
3435 * SYN_RECEIVED state so it must now be in
3436 * established state, SYN_RECEIVED ->
3439 nstate = IPF_TCPS_ESTABLISHED;
3441 } else if ((tcpflags & ~(TH_ECN|TH_CWR)) ==
3444 * We see an SA from 'dir' which is already in
3445 * SYN_RECEIVED state.
3448 } else if (tcpflags & TH_FIN) {
3450 * we see an F from 'dir' which is in
3451 * SYN_RECEIVED state and wants to close its
3452 * side of the connection; SYN_RECEIVED ->
3455 nstate = IPF_TCPS_FIN_WAIT_1;
3460 case IPF_TCPS_HALF_ESTAB: /* 3 */
3461 if (tcpflags & TH_FIN) {
3462 nstate = IPF_TCPS_FIN_WAIT_1;
3464 } else if ((tcpflags & TH_ACKMASK) == TH_ACK) {
3466 * If we've picked up a connection in mid
3467 * flight, we could be looking at a follow on
3468 * packet from the same direction as the one
3469 * that created this state. Recognise it but
3470 * do not advance the entire connection's
3475 case IPF_TCPS_LISTEN :
3476 case IPF_TCPS_SYN_SENT :
3477 case IPF_TCPS_SYN_RECEIVED :
3480 case IPF_TCPS_HALF_ESTAB :
3481 case IPF_TCPS_ESTABLISHED :
3482 nstate = IPF_TCPS_ESTABLISHED;
3491 case IPF_TCPS_ESTABLISHED: /* 4 */
3493 if (tcpflags & TH_FIN) {
3495 * 'dir' closed its side of the connection;
3496 * this gives us a half-closed connection;
3497 * ESTABLISHED -> FIN_WAIT_1
3499 if (ostate == IPF_TCPS_FIN_WAIT_1) {
/* The other side is already in FIN_WAIT_1, so this FIN selects CLOSING. */
3500 nstate = IPF_TCPS_CLOSING;
3502 nstate = IPF_TCPS_FIN_WAIT_1;
3504 } else if (tcpflags & TH_ACK) {
3506 * an ACK, should we exclude other flags here?
3508 if (ostate == IPF_TCPS_FIN_WAIT_1) {
3510 * We know the other side did an active
3511 * close, so we are ACKing the recvd
3512 * FIN packet (does the window matching
3513 * code guarantee this?) and go into
3514 * CLOSE_WAIT state; this gives us a
3515 * half-closed connection
3517 nstate = IPF_TCPS_CLOSE_WAIT;
3518 } else if (ostate < IPF_TCPS_CLOSE_WAIT) {
3520 * still a fully established
3521 * connection reset timeout
3523 nstate = IPF_TCPS_ESTABLISHED;
3528 case IPF_TCPS_CLOSE_WAIT: /* 5 */
3530 if (tcpflags & TH_FIN) {
3532 * application closed and 'dir' sent a FIN,
3533 * we're now going into LAST_ACK state
3535 nstate = IPF_TCPS_LAST_ACK;
3538 * we remain in CLOSE_WAIT because the other
3539 * side has closed already and we did not
3540 * close our side yet; reset timeout
3542 nstate = IPF_TCPS_CLOSE_WAIT;
3546 case IPF_TCPS_FIN_WAIT_1: /* 6 */
3548 if ((tcpflags & TH_ACK) &&
3549 ostate > IPF_TCPS_CLOSE_WAIT) {
3551 * if the other side is not active anymore
3552 * it has sent us a FIN packet that we are
3553 * ack'ing now with an ACK; this means both
3554 * sides have now closed the connection and
3555 * we go into TIME_WAIT
3558 * XXX: how do we know we really are ACKing
3559 * the FIN packet here? does the window code
3562 nstate = IPF_TCPS_TIME_WAIT;
3565 * we closed our side of the connection
3566 * already but the other side is still active
3567 * (ESTABLISHED/CLOSE_WAIT); continue with
3568 * this half-closed connection
3570 nstate = IPF_TCPS_FIN_WAIT_1;
3574 case IPF_TCPS_CLOSING: /* 7 */
/* An ACK without FIN while in CLOSING selects TIME_WAIT. */
3575 if ((tcpflags & (TH_FIN|TH_ACK)) == TH_ACK) {
3576 nstate = IPF_TCPS_TIME_WAIT;
3581 case IPF_TCPS_LAST_ACK: /* 8 */
3582 if (tcpflags & TH_ACK) {
3583 if ((tcpflags & TH_PUSH) || dlen)
3585 * there is still data to be delivered,
3593 * we cannot detect when we go out of LAST_ACK state to
3594 * CLOSED because that is based on the reception of ACK
3595 * packets; ipfilter can only detect that a packet
3596 * has been sent by a host
3600 case IPF_TCPS_FIN_WAIT_2: /* 9 */
3604 case IPF_TCPS_TIME_WAIT: /* 10 */
3605 /* we're in 2MSL timeout now */
3606 if (ostate == IPF_TCPS_LAST_ACK) {
3607 nstate = IPF_TCPS_CLOSED;
3612 case IPF_TCPS_CLOSED: /* 11 */
3617 #if defined(_KERNEL)
3620 "tcp %lx flags %x si %lx nstate %d ostate %d\n",
3621 (u_long)tcp, tcpflags, (u_long)tqe,
3624 printf("tcp %lx flags %x si %lx nstate %d ostate %d\n",
3625 (u_long)tcp, tcpflags, (u_long)tqe,
3636 * If rval == 2 then do not update the queue position, but treat the
3637 * packet as being ok.
3641 else if (rval == 1) {
/* rval == 1: record nstate for this direction; unless the entry is */
/* flagged TQE_RULEBASED, move it to the tqtab queue for nstate.    */
3642 tqe->tqe_state[dir] = nstate;
3643 if ((tqe->tqe_flags & TQE_RULEBASED) == 0)
3644 fr_movequeue(tqe, tqe->tqe_ifq, tqtab + nstate);
3651 /* ------------------------------------------------------------------------ */
3652 /* Function: ipstate_log */
3654 /* Parameters: is(I) - pointer to state structure */
3655 /* type(I) - type of log entry to create */
3657 /* Creates a state table log entry using the state structure and type info. */
3658 /* passed in. Log packet/byte counts, source/destination address and other */
3659 /* protocol specific information. */
3660 /* ------------------------------------------------------------------------ */
3661 void ipstate_log(is, type)
3672 * Copy information out of the ipstate_t structure and into the
3673 * structure used for logging.
3675 ipsl.isl_type = type;
/* Each of the four packet counters is the sum of is_pkts[] and */
/* is_icmppkts[]; byte counters are copied as they are.         */
3676 ipsl.isl_pkts[0] = is->is_pkts[0] + is->is_icmppkts[0];
3677 ipsl.isl_bytes[0] = is->is_bytes[0];
3678 ipsl.isl_pkts[1] = is->is_pkts[1] + is->is_icmppkts[1];
3679 ipsl.isl_bytes[1] = is->is_bytes[1];
3680 ipsl.isl_pkts[2] = is->is_pkts[2] + is->is_icmppkts[2];
3681 ipsl.isl_bytes[2] = is->is_bytes[2];
3682 ipsl.isl_pkts[3] = is->is_pkts[3] + is->is_icmppkts[3];
3683 ipsl.isl_bytes[3] = is->is_bytes[3];
3684 ipsl.isl_src = is->is_src;
3685 ipsl.isl_dst = is->is_dst;
3686 ipsl.isl_p = is->is_p;
3687 ipsl.isl_v = is->is_v;
3688 ipsl.isl_flags = is->is_flags;
3689 ipsl.isl_tag = is->is_tag;
3690 ipsl.isl_rulen = is->is_rulen;
/* NOTE(review): strncpy() does not NUL-terminate when the source fills */
/* all FR_GROUPLEN bytes - confirm isl_group is handled accordingly.    */
3691 (void) strncpy(ipsl.isl_group, is->is_group, FR_GROUPLEN);
/* Protocol specific part: ports for TCP/UDP (plus both state values */
/* for TCP), the ICMP type for ICMP and ICMPv6.                      */
3693 if (ipsl.isl_p == IPPROTO_TCP || ipsl.isl_p == IPPROTO_UDP) {
3694 ipsl.isl_sport = is->is_sport;
3695 ipsl.isl_dport = is->is_dport;
3696 if (ipsl.isl_p == IPPROTO_TCP) {
3697 ipsl.isl_state[0] = is->is_state[0];
3698 ipsl.isl_state[1] = is->is_state[1];
3700 } else if (ipsl.isl_p == IPPROTO_ICMP) {
3701 ipsl.isl_itype = is->is_icmp.ici_type;
3702 } else if (ipsl.isl_p == IPPROTO_ICMPV6) {
3703 ipsl.isl_itype = is->is_icmp.ici_type;
3705 ipsl.isl_ps.isl_filler[0] = 0;
3706 ipsl.isl_ps.isl_filler[1] = 0;
3710 sizes[0] = sizeof(ipsl);
/* A single item is passed to ipllog(); iss_logged is incremented */
/* when it returns non-zero.                                      */
3713 if (ipllog(IPL_LOGSTATE, NULL, items, sizes, types, 1)) {
3714 ATOMIC_INCL(ips_stats.iss_logged);
3716 ATOMIC_INCL(ips_stats.iss_logfail);
3723 /* ------------------------------------------------------------------------ */
3724 /* Function: fr_checkicmp6matchingstate */
3725 /* Returns: ipstate_t* - NULL == no match found, */
3726 /* else pointer to matching state entry */
3727 /* Parameters: fin(I) - pointer to packet information */
3728 /* Locks: NULL == no locks, else Read Lock on ipf_state */
3730 /* If we've got an ICMPv6 error message, using the information stored in */
3731 /* the ICMPv6 packet, look for a matching state table entry. */
3732 /* ------------------------------------------------------------------------ */
3733 static ipstate_t *fr_checkicmp6matchingstate(fin)
3736 struct icmp6_hdr *ic6, *oic;
3737 int type, backward, i;
3738 ipstate_t *is, **isp;
3739 u_short sport, dport;
3750 * Does it at least have the return (basic) IP header ?
3751 * Is it an actual recognised ICMP error type?
3752 * Only a basic IP header (no options) should be with
3753 * an ICMP error header.
3755 if ((fin->fin_v != 6) || (fin->fin_plen < ICMP6ERR_MINPKTLEN) ||
3756 !(fin->fin_flx & FI_ICMPERR))
3760 type = ic6->icmp6_type;
/* oip6: the IPv6 header located ICMPERR_ICMPHLEN bytes after the */
/* start of the ICMPv6 header.                                    */
3762 oip6 = (ip6_t *)((char *)ic6 + ICMPERR_ICMPHLEN);
3763 if (fin->fin_plen < sizeof(*oip6))
/* ofin begins as a byte copy of fin; the direction is then inverted */
/* and the fin_m/fin_mp pointers are cleared.                        */
3766 bcopy((char *)fin, (char *)&ofin, sizeof(*fin));
3768 ofin.fin_ifp = fin->fin_ifp;
3769 ofin.fin_out = !fin->fin_out;
3770 ofin.fin_m = NULL; /* if dereferenced, panic XXX */
3771 ofin.fin_mp = NULL; /* if dereferenced, panic XXX */
3774 * We make a fin entry to be able to feed it to
3775 * matchsrcdst. Note that not all fields are necessary
3776 * but this is the cleanest way. Note further we fill
3777 * in fin_mp such that if someone uses it we'll get
3778 * a kernel panic. fr_matchsrcdst does not use this.
3780 * watch out here, as ip is in host order and oip6 in network
3781 * order. Any change we make must be undone afterwards.
3783 savelen = oip6->ip6_plen;
/* ip6_plen is overwritten only for the fr_makefrip() call and is */
/* restored from savelen afterwards.                              */
3784 oip6->ip6_plen = fin->fin_dlen - ICMPERR_ICMPHLEN;
3785 ofin.fin_flx = FI_NOCKSUM;
3786 ofin.fin_ip = (ip_t *)oip6;
3787 (void) fr_makefrip(sizeof(*oip6), (ip_t *)oip6, &ofin);
3788 ofin.fin_flx &= ~(FI_BAD|FI_SHORT);
3789 oip6->ip6_plen = savelen;
3791 if (oip6->ip6_nxt == IPPROTO_ICMPV6) {
3792 oic = (struct icmp6_hdr *)(oip6 + 1);
3794 * an ICMP error can only be generated as a result of an
3795 * ICMP query, not as the response on an ICMP error
3797 * XXX theoretically ICMP_ECHOREP and the other reply's are
3798 * ICMP query's as well, but adding them here seems strange XXX
3800 if (!(oic->icmp6_type & ICMP6_INFOMSG_MASK))
3804 * perform a lookup of the ICMP packet in the state table
3806 hv = (pr = oip6->ip6_nxt);
/* Hash inputs: next-header value, the in4.s_addr view of both */
/* addresses, and the embedded ICMPv6 id and sequence number.  */
3807 src.in6 = oip6->ip6_src;
3808 hv += src.in4.s_addr;
3809 dst.in6 = oip6->ip6_dst;
3810 hv += dst.in4.s_addr;
3811 hv += oic->icmp6_id;
3812 hv += oic->icmp6_seq;
3813 hv = DOUBLE_HASH(hv);
3815 READ_ENTER(&ipf_state);
3816 for (isp = &ips_table[hv]; ((is = *isp) != NULL); ) {
3818 isp = &is->is_hnext;
3819 if ((is->is_p == pr) &&
3820 !(is->is_pass & FR_NOICMPERR) &&
3821 (oic->icmp6_id == ic->ici_id) &&
3822 (oic->icmp6_seq == ic->ici_seq) &&
3823 (is = fr_matchsrcdst(&ofin, is, &src,
3824 &dst, NULL, FI_ICMPCMP))) {
3826 * in the state table ICMP query's are stored
3827 * with the type of the corresponding ICMP
3828 * response. Correct here
3830 if (((ic->ici_type == ICMP6_ECHO_REPLY) &&
3831 (oic->icmp6_type == ICMP6_ECHO_REQUEST)) ||
3832 (ic->ici_type - 1 == oic->icmp6_type )) {
3833 ips_stats.iss_hits++;
/* backward is set when the entry's destination differs from the   */
/* embedded source; counter index i = (backward << 1) + fin_out.   */
3834 backward = IP6_NEQ(&is->is_dst, &src);
3835 fin->fin_rev = !backward;
3836 i = (backward << 1) + fin->fin_out;
3837 is->is_icmppkts[i]++;
3842 RWLOCK_EXIT(&ipf_state);
3846 hv = (pr = oip6->ip6_nxt);
3847 src.in6 = oip6->ip6_src;
3852 dst.in6 = oip6->ip6_dst;
3858 if ((oip6->ip6_nxt == IPPROTO_TCP) || (oip6->ip6_nxt == IPPROTO_UDP)) {
/* Ports are read through a tcphdr_t pointer for both TCP and UDP. */
3859 tcp = (tcphdr_t *)(oip6 + 1);
3860 dport = tcp->th_dport;
3861 sport = tcp->th_sport;
3866 hv = DOUBLE_HASH(hv);
3868 READ_ENTER(&ipf_state);
3869 for (isp = &ips_table[hv]; ((is = *isp) != NULL); ) {
3870 isp = &is->is_hnext;
3872 * Only allow this icmp though if the
3873 * encapsulated packet was allowed through the
3874 * other way around. Note that the minimal amount
3875 * of info present does not allow for checking against
3876 * tcp internals such as seq and ack numbers.
3878 if ((is->is_p != pr) || (is->is_v != 6) ||
3879 (is->is_pass & FR_NOICMPERR))
3881 is = fr_matchsrcdst(&ofin, is, &src, &dst, tcp, FI_ICMPCMP);
3883 ips_stats.iss_hits++;
3884 backward = IP6_NEQ(&is->is_dst, &src);
3885 fin->fin_rev = !backward;
3886 i = (backward << 1) + fin->fin_out;
3887 is->is_icmppkts[i]++;
3889 * we deliberately do not touch the timeouts
3890 * for the accompanying state table entry.
3891 * It remains to be seen if that is correct. XXX
3896 RWLOCK_EXIT(&ipf_state);
3902 /* ------------------------------------------------------------------------ */
3903 /* Function: fr_sttab_init */
3905 /* Parameters: tqp(I) - pointer to an array of timeout queues for TCP */
3907 /* Initialise the array of timeout queues for TCP. */
3908 /* ------------------------------------------------------------------------ */
3909 void fr_sttab_init(tqp)
3914 for (i = IPF_TCP_NSTATES - 1; i >= 0; i--) {
/* Each slot starts empty (tail points back at its own head pointer) */
/* and is chained to the next array slot through ifq_next.           */
3917 tqp[i].ifq_head = NULL;
3918 tqp[i].ifq_tail = &tqp[i].ifq_head;
3919 tqp[i].ifq_next = tqp + i + 1;
3920 MUTEX_INIT(&tqp[i].ifq_lock, "ipftq tcp tab");
/* The last slot terminates the chain. */
3922 tqp[IPF_TCP_NSTATES - 1].ifq_next = NULL;
/* Per-state time-to-live, taken from the fr_tcp* tunables. */
3923 tqp[IPF_TCPS_CLOSED].ifq_ttl = fr_tcpclosed;
3924 tqp[IPF_TCPS_LISTEN].ifq_ttl = fr_tcptimeout;
3925 tqp[IPF_TCPS_SYN_SENT].ifq_ttl = fr_tcptimeout;
3926 tqp[IPF_TCPS_SYN_RECEIVED].ifq_ttl = fr_tcptimeout;
3927 tqp[IPF_TCPS_ESTABLISHED].ifq_ttl = fr_tcpidletimeout;
3928 tqp[IPF_TCPS_CLOSE_WAIT].ifq_ttl = fr_tcphalfclosed;
3929 tqp[IPF_TCPS_FIN_WAIT_1].ifq_ttl = fr_tcphalfclosed;
3930 tqp[IPF_TCPS_CLOSING].ifq_ttl = fr_tcptimeout;
3931 tqp[IPF_TCPS_LAST_ACK].ifq_ttl = fr_tcplastack;
3932 tqp[IPF_TCPS_FIN_WAIT_2].ifq_ttl = fr_tcpclosewait;
3933 tqp[IPF_TCPS_TIME_WAIT].ifq_ttl = fr_tcptimewait;
3934 tqp[IPF_TCPS_HALF_ESTAB].ifq_ttl = fr_tcptimeout;
3938 /* ------------------------------------------------------------------------ */
3939 /* Function: fr_sttab_destroy */
3941 /* Parameters: tqp(I) - pointer to an array of timeout queues for TCP */
3943 /* Do whatever is necessary to "destroy" each of the entries in the array */
3944 /* of timeout queues for TCP. */
3945 /* ------------------------------------------------------------------------ */
3946 void fr_sttab_destroy(tqp)
/* Destroy the ifq_lock mutex of each of the IPF_TCP_NSTATES slots, */
/* walking the array from the last index down to 0.                 */
3951 for (i = IPF_TCP_NSTATES - 1; i >= 0; i--)
3952 MUTEX_DESTROY(&tqp[i].ifq_lock);
3956 /* ------------------------------------------------------------------------ */
3957 /* Function: fr_statederef */
3959 /* Parameters: isp(I) - pointer to pointer to state table entry */
3961 /* Decrement the reference counter for this state table entry and free it */
3962 /* if there are no more things using it. */
3964 /* This function is only called when cleaning up after increasing is_ref by */
3965 /* one earlier in the 'code path' so if is_ref is 1 when entering, we do */
3966 /* have an orphan, otherwise not. However there is a possible race between */
3967 /* the entry being deleted via flushing with an ioctl call (that calls the */
3968 /* delete function directly) and the tail end of packet processing so we */
3969 /* need to grab is_lock before doing the check to synchronise the two code */
3972 /* When operating in userland (ipftest), we have no timers to clear a state */
3973 /* entry. Therefore, we make a few simple tests before deleting an entry */
3974 /* outright. We compare states on each side looking for a combination of */
3975 /* TIME_WAIT (should really be FIN_WAIT_2?) and LAST_ACK. Then we factor */
3976 /* in packet direction with the interface list to make sure we don't */
3977 /* prematurely delete an entry on a final inbound packet that we're also */
3978 /* supposed to route elsewhere. */
3980 /* Internal parameters: */
3981 /* state[0] = state of source (host that initiated connection) */
3982 /* state[1] = state of dest (host that accepted the connection) */
3984 /* dir == 0 : a packet from source to dest */
3985 /* dir == 1 : a packet from dest to source */
3986 /* ------------------------------------------------------------------------ */
3987 void fr_statederef(isp)
3995 MUTEX_ENTER(&is->is_lock);
/* is_ref is examined with is_lock held. */
3996 if (is->is_ref > 1) {
3998 MUTEX_EXIT(&is->is_lock);
/* Either direction past IPF_TCPS_ESTABLISHED: delete with ISL_ORPHAN. */
/* NOTE(review): any preprocessor guard around this test is not        */
/* visible here - confirm against the full file.                       */
4000 if ((is->is_sti.tqe_state[0] > IPF_TCPS_ESTABLISHED) ||
4001 (is->is_sti.tqe_state[1] > IPF_TCPS_ESTABLISHED)) {
4002 fr_delstate(is, ISL_ORPHAN);
4007 MUTEX_EXIT(&is->is_lock);
/* This delete is performed with ipf_state write-held. */
4009 WRITE_ENTER(&ipf_state);
4010 fr_delstate(is, ISL_EXPIRE);
4011 RWLOCK_EXIT(&ipf_state);
4015 /* ------------------------------------------------------------------------ */
4016 /* Function: fr_setstatequeue */
4018 /* Parameters: is(I) - pointer to state structure */
4019 /* rev(I) - forward(0) or reverse(1) direction */
4020 /* Locks: ipf_state (read or write) */
4022 /* Put the state entry on its default queue entry, using rev as a helper in */
4023 /* determining which queue it should be placed on. */
4024 /* ------------------------------------------------------------------------ */
4025 void fr_setstatequeue(is, rev)
4029 ipftq_t *oifq, *nifq;
/* Entries flagged TQE_RULEBASED take the queue stored in is_tqehead[rev]. */
4032 if ((is->is_sti.tqe_flags & TQE_RULEBASED) != 0)
4033 nifq = is->is_tqehead[rev];
4041 case IPPROTO_ICMPV6 :
4043 nifq = &ips_icmpacktq;
4050 nifq = &ips_icmpacktq;
/* Queue picked from ips_tqtqb using the entry's state for direction rev. */
4055 nifq = ips_tqtqb + is->is_state[rev];
4060 nifq = &ips_udpacktq;
/* oifq: the queue the entry is on now, as recorded in its queue entry. */
4071 oifq = is->is_sti.tqe_ifq;
4073 * If it's currently on a timeout queue, move it from one queue to
4074 * another, else put it on the end of the newly determined queue.
4077 fr_movequeue(&is->is_sti, oifq, nifq);
4079 fr_queueappend(&is->is_sti, nifq, is);
4084 /* ------------------------------------------------------------------------ */
4085 /* Function: fr_stateiter */
4086 /* Returns: int - 0 == success, else error */
4087 /* Parameters: token(I) - pointer to ipftoken structure */
4088 /* itp(I) - pointer to ipfgeniter structure */
4090 /* This function handles the SIOCGENITER ioctl for the state tables and */
4091 /* walks through the list of entries in the state table list (ips_list.) */
4092 /* ------------------------------------------------------------------------ */
4093 static int fr_stateiter(token, itp)
4097 ipstate_t *is, *next, zero;
/* Argument checks on the iterator request: igi_data must be set,   */
/* igi_nitems at least 1 and igi_type equal to IPFGENITER_STATE.    */
4101 if (itp->igi_data == NULL)
4104 if (itp->igi_nitems < 1)
4107 if (itp->igi_type != IPFGENITER_STATE)
4110 is = token->ipt_data;
/* A stored position of (void *)-1 leads to the token being freed. */
4111 if (is == (void *)-1) {
4112 ipf_freetoken(token);
4117 dst = itp->igi_data;
4119 READ_ENTER(&ipf_state);
4126 count = itp->igi_nitems;
4130 * If we find a state entry to use, bump its
4131 * reference count so that it can be used for
4132 * is_next when we come back.
4135 MUTEX_ENTER(&next->is_lock);
4137 MUTEX_EXIT(&next->is_lock);
/* The token remembers the entry for the next call. */
4138 token->ipt_data = next;
4141 bzero(&zero, sizeof(zero));
4144 token->ipt_data = NULL;
/* ipf_state is released before the copy below. */
4146 RWLOCK_EXIT(&ipf_state);
4149 * This should arguably be via fr_outobj() so that the state
4150 * structure can (if required) be massaged going out.
4152 error = COPYOUT(next, dst, sizeof(*next));
4155 if ((count == 1) || (error != 0))
/* Advance the output cursor by one ipstate_t. */
4158 dst += sizeof(*next);
/* The read lock is taken again before following is_next. */
4161 READ_ENTER(&ipf_state);
4162 next = next->is_next;
4173 /* ------------------------------------------------------------------------ */
4174 /* Function: fr_stgettable */
4175 /* Returns: int - 0 = success, else error */
4176 /* Parameters: data(I) - pointer to ioctl data */
4178 /* This function handles ioctl requests for tables of state information. */
4179 /* At present the only table it deals with is the hash bucket statistics. */
4180 /* ------------------------------------------------------------------------ */
4181 static int fr_stgettable(data)
4187 error = fr_inobj(data, &table, IPFOBJ_GTABLE);
4191 if (table.ita_type != IPFTABLE_BUCKETS)
4194 error = COPYOUT(ips_stats.iss_bucketlen, table.ita_table,
4195 fr_statesize * sizeof(u_long));