1 /* $OpenBSD: pf_lb.c,v 1.2 2009/02/12 02:13:15 sthen Exp $ */
4 * Copyright (c) 2001 Daniel Hartmeier
5 * Copyright (c) 2002 - 2008 Henning Brauer
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * - Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials provided
17 * with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
32 * Effort sponsored in part by the Defense Advanced Research Projects
33 * Agency (DARPA) and Air Force Research Laboratory, Air Force
34 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
40 #include "opt_inet6.h"
42 #include <sys/cdefs.h>
43 __FBSDID("$FreeBSD$");
51 #define NBPFILTER DEV_BPF
57 #define NPFLOG DEV_PFLOG
63 #define NPFSYNC DEV_PFSYNC
69 #define NPFLOW DEV_PFLOW
81 #include <sys/param.h>
82 #include <sys/systm.h>
84 #include <sys/filio.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/kernel.h>
90 #include <sys/sysctl.h>
97 #include <sys/kthread.h>
101 #include <sys/rwlock.h>
107 #include <crypto/md5.h>
111 #include <net/if_types.h>
113 #include <net/route.h>
114 #include <net/radix_mpath.h>
116 #include <netinet/in.h>
117 #include <netinet/in_var.h>
118 #include <netinet/in_systm.h>
119 #include <netinet/ip.h>
120 #include <netinet/ip_var.h>
121 #include <netinet/tcp.h>
122 #include <netinet/tcp_seq.h>
123 #include <netinet/udp.h>
124 #include <netinet/ip_icmp.h>
125 #include <netinet/in_pcb.h>
126 #include <netinet/tcp_timer.h>
127 #include <netinet/tcp_var.h>
128 #include <netinet/udp_var.h>
129 #include <netinet/icmp_var.h>
130 #include <netinet/if_ether.h>
133 #include <dev/rndvar.h>
135 #include <net/pfvar.h>
136 #include <net/if_pflog.h>
137 #include <net/if_pflow.h>
140 #include <net/if_pfsync.h>
141 #endif /* NPFSYNC > 0 */
144 #include <netinet/ip6.h>
145 #include <netinet/in_pcb.h>
146 #include <netinet/icmp6.h>
147 #include <netinet6/nd6.h>
152 #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
154 #define DPFPRINTF(n, x) if (pf_status.debug >= (n)) printf x
161 void pf_hash(struct pf_addr *, struct pf_addr *,
162 struct pf_poolhashkey *, sa_family_t);
163 struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *,
164 int, int, struct pfi_kif *,
165 struct pf_addr *, u_int16_t, struct pf_addr *,
167 int pf_get_sport(sa_family_t, u_int8_t, struct pf_rule *,
168 struct pf_addr *, struct pf_addr *, u_int16_t,
169 struct pf_addr *, u_int16_t*, u_int16_t, u_int16_t,
170 struct pf_src_node **);
174 a -= b; a -= c; a ^= (c >> 13); \
175 b -= c; b -= a; b ^= (a << 8); \
176 c -= a; c -= b; c ^= (b >> 13); \
177 a -= b; a -= c; a ^= (c >> 12); \
178 b -= c; b -= a; b ^= (a << 16); \
179 c -= a; c -= b; c ^= (b >> 5); \
180 a -= b; a -= c; a ^= (c >> 3); \
181 b -= c; b -= a; b ^= (a << 10); \
182 c -= a; c -= b; c ^= (b >> 15); \
186 * hash function based on bridge_hash in if_bridge.c
189 pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
190 struct pf_poolhashkey *key, sa_family_t af)
192 u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0];
197 a += inaddr->addr32[0];
200 hash->addr32[0] = c + key->key32[2];
205 a += inaddr->addr32[0];
206 b += inaddr->addr32[2];
209 a += inaddr->addr32[1];
210 b += inaddr->addr32[3];
214 a += inaddr->addr32[2];
215 b += inaddr->addr32[1];
219 a += inaddr->addr32[3];
220 b += inaddr->addr32[0];
230 pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
231 int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport,
232 struct pf_addr *daddr, u_int16_t dport, int rs_num)
234 struct pf_rule *r, *rm = NULL;
235 struct pf_ruleset *ruleset = NULL;
240 r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
241 while (r && rm == NULL) {
242 struct pf_rule_addr *src = NULL, *dst = NULL;
243 struct pf_addr_wrap *xdst = NULL;
245 if (r->action == PF_BINAT && direction == PF_IN) {
247 if (r->rpool.cur != NULL)
248 xdst = &r->rpool.cur->addr;
255 if (pfi_kif_match(r->kif, kif) == r->ifnot)
256 r = r->skip[PF_SKIP_IFP].ptr;
257 else if (r->direction && r->direction != direction)
258 r = r->skip[PF_SKIP_DIR].ptr;
259 else if (r->af && r->af != pd->af)
260 r = r->skip[PF_SKIP_AF].ptr;
261 else if (r->proto && r->proto != pd->proto)
262 r = r->skip[PF_SKIP_PROTO].ptr;
263 else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
264 src->neg, kif, M_GETFIB(m)))
265 r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
266 PF_SKIP_DST_ADDR].ptr;
267 else if (src->port_op && !pf_match_port(src->port_op,
268 src->port[0], src->port[1], sport))
269 r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
270 PF_SKIP_DST_PORT].ptr;
271 else if (dst != NULL &&
272 PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL,
274 r = r->skip[PF_SKIP_DST_ADDR].ptr;
275 else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af,
276 0, NULL, M_GETFIB(m)))
277 r = TAILQ_NEXT(r, entries);
278 else if (dst != NULL && dst->port_op &&
279 !pf_match_port(dst->port_op, dst->port[0],
280 dst->port[1], dport))
281 r = r->skip[PF_SKIP_DST_PORT].ptr;
283 else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag))
285 else if (r->match_tag && !pf_match_tag(m, r, &tag))
287 r = TAILQ_NEXT(r, entries);
288 else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
289 IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m,
290 off, pd->hdr.tcp), r->os_fingerprint)))
291 r = TAILQ_NEXT(r, entries);
295 if (r->rtableid >= 0)
296 rtableid = r->rtableid;
297 if (r->anchor == NULL) {
300 pf_step_into_anchor(&asd, &ruleset, rs_num,
304 pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r,
308 if (pf_tag_packet(m, tag, rtableid, pd->pf_mtag))
310 if (pf_tag_packet(m, tag, rtableid))
313 if (rm != NULL && (rm->action == PF_NONAT ||
314 rm->action == PF_NORDR || rm->action == PF_NOBINAT))
320 pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r,
321 struct pf_addr *saddr, struct pf_addr *daddr, u_int16_t dport,
322 struct pf_addr *naddr, u_int16_t *nport, u_int16_t low, u_int16_t high,
323 struct pf_src_node **sn)
325 struct pf_state_key_cmp key;
326 struct pf_addr init_addr;
329 bzero(&init_addr, sizeof(init_addr));
330 if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
333 if (proto == IPPROTO_ICMP) {
341 PF_ACPY(&key.addr[1], daddr, key.af);
342 PF_ACPY(&key.addr[0], naddr, key.af);
346 * port search; start random, step;
347 * similar to the port loop in in_pcbbind
349 if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
350 proto == IPPROTO_ICMP)) {
352 if (pf_find_state_all(&key, PF_IN, NULL) == NULL)
354 } else if (low == 0 && high == 0) {
355 key.port[0] = *nport;
356 if (pf_find_state_all(&key, PF_IN, NULL) == NULL)
358 } else if (low == high) {
359 key.port[0] = htons(low);
360 if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
374 cut = htonl(arc4random()) % (1 + high - low) + low;
376 cut = arc4random_uniform(1 + high - low) + low;
378 /* low <= cut <= high */
379 for (tmp = cut; tmp <= high; ++(tmp)) {
380 key.port[0] = htons(tmp);
381 if (pf_find_state_all(&key, PF_IN, NULL) ==
385 NULL && !in_baddynamic(tmp, proto)) {
391 for (tmp = cut - 1; tmp >= low; --(tmp)) {
392 key.port[0] = htons(tmp);
393 if (pf_find_state_all(&key, PF_IN, NULL) ==
397 NULL && !in_baddynamic(tmp, proto)) {
405 switch (r->rpool.opts & PF_POOL_TYPEMASK) {
407 case PF_POOL_ROUNDROBIN:
408 if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
412 case PF_POOL_SRCHASH:
413 case PF_POOL_BITMASK:
417 } while (! PF_AEQ(&init_addr, naddr, af) );
418 return (1); /* none available */
422 pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
423 struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn)
425 unsigned char hash[16];
426 struct pf_pool *rpool = &r->rpool;
427 struct pf_addr *raddr = &rpool->cur->addr.v.a.addr;
428 struct pf_addr *rmask = &rpool->cur->addr.v.a.mask;
429 struct pf_pooladdr *acur = rpool->cur;
430 struct pf_src_node k;
432 if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR &&
433 (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
435 PF_ACPY(&k.addr, saddr, af);
436 if (r->rule_flag & PFRULE_RULESRCTRACK ||
437 r->rpool.opts & PF_POOL_STICKYADDR)
442 V_pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
443 *sn = RB_FIND(pf_src_tree, &V_tree_src_tracking, &k);
445 pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
446 *sn = RB_FIND(pf_src_tree, &tree_src_tracking, &k);
448 if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) {
449 PF_ACPY(naddr, &(*sn)->raddr, af);
451 if (V_pf_status.debug >= PF_DEBUG_MISC) {
453 if (pf_status.debug >= PF_DEBUG_MISC) {
455 printf("pf_map_addr: src tracking maps ");
456 pf_print_host(&k.addr, 0, af);
458 pf_print_host(naddr, 0, af);
465 if (rpool->cur->addr.type == PF_ADDR_NOROUTE)
467 if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
471 if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 &&
472 (rpool->opts & PF_POOL_TYPEMASK) !=
475 raddr = &rpool->cur->addr.p.dyn->pfid_addr4;
476 rmask = &rpool->cur->addr.p.dyn->pfid_mask4;
481 if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 &&
482 (rpool->opts & PF_POOL_TYPEMASK) !=
485 raddr = &rpool->cur->addr.p.dyn->pfid_addr6;
486 rmask = &rpool->cur->addr.p.dyn->pfid_mask6;
490 } else if (rpool->cur->addr.type == PF_ADDR_TABLE) {
491 if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN)
492 return (1); /* unsupported */
494 raddr = &rpool->cur->addr.v.a.addr;
495 rmask = &rpool->cur->addr.v.a.mask;
498 switch (rpool->opts & PF_POOL_TYPEMASK) {
500 PF_ACPY(naddr, raddr, af);
502 case PF_POOL_BITMASK:
503 PF_POOLMASK(naddr, raddr, rmask, saddr, af);
506 if (init_addr != NULL && PF_AZERO(init_addr, af)) {
510 rpool->counter.addr32[0] = htonl(arc4random());
515 if (rmask->addr32[3] != 0xffffffff)
516 rpool->counter.addr32[3] =
520 if (rmask->addr32[2] != 0xffffffff)
521 rpool->counter.addr32[2] =
525 if (rmask->addr32[1] != 0xffffffff)
526 rpool->counter.addr32[1] =
530 if (rmask->addr32[0] != 0xffffffff)
531 rpool->counter.addr32[0] =
536 PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
537 PF_ACPY(init_addr, naddr, af);
540 PF_AINC(&rpool->counter, af);
541 PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
544 case PF_POOL_SRCHASH:
545 pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
546 PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af);
548 case PF_POOL_ROUNDROBIN:
549 if (rpool->cur->addr.type == PF_ADDR_TABLE) {
550 if (!pfr_pool_get(rpool->cur->addr.p.tbl,
551 &rpool->tblidx, &rpool->counter,
554 } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
555 if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
556 &rpool->tblidx, &rpool->counter,
559 } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
563 if ((rpool->cur = TAILQ_NEXT(rpool->cur, entries)) == NULL)
564 rpool->cur = TAILQ_FIRST(&rpool->list);
565 if (rpool->cur->addr.type == PF_ADDR_TABLE) {
567 if (pfr_pool_get(rpool->cur->addr.p.tbl,
568 &rpool->tblidx, &rpool->counter,
569 &raddr, &rmask, af)) {
570 /* table contains no address of type 'af' */
571 if (rpool->cur != acur)
575 } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
577 if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
578 &rpool->tblidx, &rpool->counter,
579 &raddr, &rmask, af)) {
580 /* table contains no address of type 'af' */
581 if (rpool->cur != acur)
586 raddr = &rpool->cur->addr.v.a.addr;
587 rmask = &rpool->cur->addr.v.a.mask;
588 PF_ACPY(&rpool->counter, raddr, af);
592 PF_ACPY(naddr, &rpool->counter, af);
593 if (init_addr != NULL && PF_AZERO(init_addr, af))
594 PF_ACPY(init_addr, naddr, af);
595 PF_AINC(&rpool->counter, af);
599 PF_ACPY(&(*sn)->raddr, naddr, af);
602 if (V_pf_status.debug >= PF_DEBUG_MISC &&
604 if (pf_status.debug >= PF_DEBUG_MISC &&
606 (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
607 printf("pf_map_addr: selected address ");
608 pf_print_host(naddr, 0, af);
616 pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
617 struct pfi_kif *kif, struct pf_src_node **sn,
618 struct pf_state_key **skw, struct pf_state_key **sks,
619 struct pf_state_key **skp, struct pf_state_key **nkp,
620 struct pf_addr *saddr, struct pf_addr *daddr,
621 u_int16_t sport, u_int16_t dport)
623 struct pf_rule *r = NULL;
626 if (direction == PF_OUT) {
627 r = pf_match_translation(pd, m, off, direction, kif, saddr,
628 sport, daddr, dport, PF_RULESET_BINAT);
630 r = pf_match_translation(pd, m, off, direction, kif,
631 saddr, sport, daddr, dport, PF_RULESET_NAT);
633 r = pf_match_translation(pd, m, off, direction, kif, saddr,
634 sport, daddr, dport, PF_RULESET_RDR);
636 r = pf_match_translation(pd, m, off, direction, kif,
637 saddr, sport, daddr, dport, PF_RULESET_BINAT);
641 struct pf_addr *naddr;
644 if (pf_state_key_setup(pd, r, skw, sks, skp, nkp,
645 saddr, daddr, sport, dport))
648 /* XXX We only modify one side for now. */
649 naddr = &(*nkp)->addr[1];
650 nport = &(*nkp)->port[1];
658 if (pf_get_sport(pd->af, pd->proto, r, saddr,
659 daddr, dport, naddr, nport, r->rpool.proxy_port[0],
660 r->rpool.proxy_port[1], sn)) {
661 DPFPRINTF(PF_DEBUG_MISC,
662 ("pf: NAT proxy port allocation "
664 r->rpool.proxy_port[0],
665 r->rpool.proxy_port[1]));
672 if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){
676 if (r->rpool.cur->addr.p.dyn->
680 &r->rpool.cur->addr.p.dyn->
682 &r->rpool.cur->addr.p.dyn->
689 if (r->rpool.cur->addr.p.dyn->
693 &r->rpool.cur->addr.p.dyn->
695 &r->rpool.cur->addr.p.dyn->
703 &r->rpool.cur->addr.v.a.addr,
704 &r->rpool.cur->addr.v.a.mask,
708 if (r->src.addr.type == PF_ADDR_DYNIFTL) {
712 if (r->src.addr.p.dyn->
725 if (r->src.addr.p.dyn->
739 &r->src.addr.v.a.addr,
740 &r->src.addr.v.a.mask, daddr,
746 if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn))
748 if ((r->rpool.opts & PF_POOL_TYPEMASK) ==
750 PF_POOLMASK(naddr, naddr,
751 &r->rpool.cur->addr.v.a.mask, daddr,
754 if (r->rpool.proxy_port[1]) {
757 tmp_nport = ((ntohs(dport) -
758 ntohs(r->dst.port[0])) %
759 (r->rpool.proxy_port[1] -
760 r->rpool.proxy_port[0] + 1)) +
761 r->rpool.proxy_port[0];
763 /* wrap around if necessary */
764 if (tmp_nport > 65535)
766 *nport = htons((u_int16_t)tmp_nport);
767 } else if (r->rpool.proxy_port[0])
768 *nport = htons(r->rpool.proxy_port[0]);
775 * Translation was a NOP.
776 * Pretend there was no match.
778 if (!bcmp(*skp, *nkp, sizeof(struct pf_state_key_cmp))) {
780 pool_put(&V_pf_state_key_pl, *nkp);
781 pool_put(&V_pf_state_key_pl, *skp);
783 pool_put(&pf_state_key_pl, *nkp);
784 pool_put(&pf_state_key_pl, *skp);
786 *skw = *sks = *nkp = *skp = NULL;