2 * Copyright (c) 2001 Daniel Hartmeier
3 * Copyright (c) 2002 - 2008 Henning Brauer
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * - Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * - Redistributions in binary form must reproduce the above
13 * copyright notice, this list of conditions and the following
14 * disclaimer in the documentation and/or other materials provided
15 * with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
30 * Effort sponsored in part by the Defense Advanced Research Projects
31 * Agency (DARPA) and Air Force Research Laboratory, Air Force
32 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
34 * $OpenBSD: pf_lb.c,v 1.2 2009/02/12 02:13:15 sthen Exp $
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
42 #include "opt_inet6.h"
44 #include <sys/param.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
49 #include <net/pfvar.h>
50 #include <net/if_pflog.h>
/*
 * Debug printf: emits the message only when V_pf_status.debug is at least
 * level n.  x must be a complete, parenthesized printf argument list, as in
 * DPFPRINTF(PF_DEBUG_MISC, (fmt, arg)).
 * NOTE(review): this expands to a bare if statement, so an else that
 * follows a use of the macro would bind to the hidden if.
 */
52 #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
/*
 * Forward declarations of the file-local (static) helpers that are defined
 * further down in this file.
 */
54 static void pf_hash(struct pf_addr *, struct pf_addr *,
55 struct pf_poolhashkey *, sa_family_t);
56 static struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *,
57 int, int, struct pfi_kif *,
58 struct pf_addr *, u_int16_t, struct pf_addr *,
59 uint16_t, int, struct pf_anchor_stackframe *);
60 static int pf_get_sport(sa_family_t, uint8_t, struct pf_rule *,
61 struct pf_addr *, uint16_t, struct pf_addr *, uint16_t, struct pf_addr *,
62 uint16_t *, uint16_t, uint16_t, struct pf_src_node **);
66 a -= b; a -= c; a ^= (c >> 13); \
67 b -= c; b -= a; b ^= (a << 8); \
68 c -= a; c -= b; c ^= (b >> 13); \
69 a -= b; a -= c; a ^= (c >> 12); \
70 b -= c; b -= a; b ^= (a << 16); \
71 c -= a; c -= b; c ^= (b >> 5); \
72 a -= b; a -= c; a ^= (c >> 3); \
73 b -= c; b -= a; b ^= (a << 10); \
74 c -= a; c -= b; c ^= (b >> 15); \
78 * hash function based on bridge_hash in if_bridge.c
/*
 * pf_hash: compute a keyed hash of inaddr and store it in hash.  It is used
 * by the source-hash pool type in pf_map_addr.
 *
 *   inaddr - address whose 32-bit words are folded into the hash
 *   hash   - output; receives the hashed words
 *   key    - pool hash key that supplies the seed words
 *   af     - address family; presumably selects between the one-word and
 *            the four-word path below (the dispatch is not visible here)
 */
81 pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
82 struct pf_poolhashkey *key, sa_family_t af)
/* a and b start from the same fixed constant, c from the first key word. */
84 u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0];
/* First visible path: only addr32[0] is folded in and hash word 0 written. */
89 a += inaddr->addr32[0];
92 hash->addr32[0] = c + key->key32[2];
/*
 * Four-word path: four rounds, each feeding a different pairing of the
 * input words into a and b.  The mixing steps and the stores into hash
 * between the rounds are not visible in this excerpt.
 */
97 a += inaddr->addr32[0];
98 b += inaddr->addr32[2];
101 a += inaddr->addr32[1];
102 b += inaddr->addr32[3];
106 a += inaddr->addr32[2];
107 b += inaddr->addr32[1];
111 a += inaddr->addr32[3];
112 b += inaddr->addr32[0];
/*
 * pf_match_translation: walk the active rules of translation ruleset rs_num
 * looking for the first rule that matches the packet.
 *
 *   pd, m, off   - packet descriptor, mbuf and offset; used for the address
 *                  family, protocol, tag and OS fingerprint checks
 *   direction    - packet direction (compared against PF_IN and the rule)
 *   kif          - interface the packet is evaluated on
 *   saddr, sport - source address and port to match
 *   daddr, dport - destination address and port to match
 *   rs_num       - index of the ruleset to scan
 *   anchor_stack - stack handed to the anchor step-in / step-out helpers
 *
 * The statements that record the match in rm and the return statements are
 * not visible in this excerpt.
 */
121 static struct pf_rule *
122 pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
123 int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport,
124 struct pf_addr *daddr, uint16_t dport, int rs_num,
125 struct pf_anchor_stackframe *anchor_stack)
127 struct pf_rule *r, *rm = NULL;
128 struct pf_ruleset *ruleset = NULL;
/* Start at the head of the main ruleset; stop once rm has been set. */
133 r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
134 while (r && rm == NULL) {
135 struct pf_rule_addr *src = NULL, *dst = NULL;
136 struct pf_addr_wrap *xdst = NULL;
/*
 * Inbound binat: the current pool address, if there is one, becomes an
 * extra destination constraint (xdst) that is checked further down.
 */
138 if (r->action == PF_BINAT && direction == PF_IN) {
140 if (r->rpool.cur != NULL)
141 xdst = &r->rpool.cur->addr;
/*
 * Criteria chain.  Each failed test advances r, either through the skip
 * pointer for that criterion or, where no skip step applies, to the next
 * rule in the list.
 */
148 if (pfi_kif_match(r->kif, kif) == r->ifnot)
149 r = r->skip[PF_SKIP_IFP].ptr;
150 else if (r->direction && r->direction != direction)
151 r = r->skip[PF_SKIP_DIR].ptr;
152 else if (r->af && r->af != pd->af)
153 r = r->skip[PF_SKIP_AF].ptr;
154 else if (r->proto && r->proto != pd->proto)
155 r = r->skip[PF_SKIP_PROTO].ptr;
156 else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
157 src->neg, kif, M_GETFIB(m)))
158 r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
159 PF_SKIP_DST_ADDR].ptr;
160 else if (src->port_op && !pf_match_port(src->port_op,
161 src->port[0], src->port[1], sport))
162 r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
163 PF_SKIP_DST_PORT].ptr;
164 else if (dst != NULL &&
165 PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL,
167 r = r->skip[PF_SKIP_DST_ADDR].ptr;
168 else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af,
169 0, NULL, M_GETFIB(m)))
170 r = TAILQ_NEXT(r, entries);
171 else if (dst != NULL && dst->port_op &&
172 !pf_match_port(dst->port_op, dst->port[0],
173 dst->port[1], dport))
174 r = r->skip[PF_SKIP_DST_PORT].ptr;
175 else if (r->match_tag && !pf_match_tag(m, r, &tag,
176 pd->pf_mtag ? pd->pf_mtag->tag : 0))
177 r = TAILQ_NEXT(r, entries);
178 else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
179 IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m,
180 off, pd->hdr.tcp), r->os_fingerprint)))
181 r = TAILQ_NEXT(r, entries);
/* All criteria passed: remember a routing table id requested by the rule. */
185 if (r->rtableid >= 0)
186 rtableid = r->rtableid;
/*
 * A rule without an anchor is presumably recorded as the match (that
 * statement is not visible); a rule with an anchor is descended into.
 */
187 if (r->anchor == NULL) {
190 pf_step_into_anchor(anchor_stack, &asd,
191 &ruleset, rs_num, &r, NULL, NULL);
/* Leave the anchor again when its rules are exhausted. */
194 pf_step_out_of_anchor(anchor_stack, &asd, &ruleset,
195 rs_num, &r, NULL, NULL);
/* Apply the collected tag; the failure branch is not visible here. */
198 if (tag > 0 && pf_tag_packet(m, pd, tag))
/* Apply the collected routing table id to the mbuf. */
201 M_SETFIB(m, rtableid);
/*
 * A match on a no-nat, no-rdr or no-binat rule is treated specially; the
 * consequence is not visible here (presumably no translation is reported).
 */
203 if (rm != NULL && (rm->action == PF_NONAT ||
204 rm->action == PF_NORDR || rm->action == PF_NOBINAT))
/*
 * pf_get_sport: choose a translation address (naddr) and port (nport) for
 * rule r such that no existing state already uses the resulting key.
 *
 *   af, proto    - address family and protocol of the connection
 *   r            - rule whose address pool is consulted
 *   saddr, sport - original source address and port
 *   daddr, dport - destination address and port
 *   naddr        - out: selected translation address
 *   nport        - out: selected translation port
 *   low, high    - port range to search
 *   sn           - source node, passed through to pf_map_addr
 *
 * Returns 1 when nothing is available (last line).  The other return
 * statements are not visible in this excerpt.
 */
210 pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r,
211 struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr,
212 uint16_t dport, struct pf_addr *naddr, uint16_t *nport, uint16_t low,
213 uint16_t high, struct pf_src_node **sn)
215 struct pf_state_key_cmp key;
216 struct pf_addr init_addr;
/*
 * init_addr starts out zero.  pf_map_addr stores the first address it hands
 * out there, which lets the loop condition at the bottom detect that the
 * pool has wrapped around.
 */
219 bzero(&init_addr, sizeof(init_addr));
220 if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
/* ICMP gets special treatment here; the body is not visible. */
223 if (proto == IPPROTO_ICMP) {
/*
 * Build the lookup key for the candidate translation: addr[0] is the
 * destination and addr[1] the candidate address.
 */
228 bzero(&key, sizeof(key));
232 PF_ACPY(&key.addr[0], daddr, key.af);
235 PF_ACPY(&key.addr[1], naddr, key.af);
238 * port search; start random, step;
239 * similar 2 portloop in in_pcbbind
/*
 * Protocols other than TCP, UDP and ICMP, or an all-zero port range: only a
 * single state lookup is made instead of a port search.
 */
241 if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
242 proto == IPPROTO_ICMP) || (low == 0 && high == 0)) {
244 * XXX bug: icmp states don't use the id on both sides.
245 * (traceroute -I through nat)
248 if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
/* Single-port range: try exactly that port. */
252 } else if (low == high) {
253 key.port[1] = htons(low);
254 if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
/* Real range: pick a random starting port cut within the range. */
267 cut = htonl(arc4random()) % (1 + high - low) + low;
268 /* low <= cut <= high */
/*
 * Search upward from cut, then downward from cut - 1.
 * NOTE(review): low and high are uint16_t.  If tmp is also a 16-bit
 * unsigned type, tmp <= high can never become false when high is 65535 and
 * tmp >= low can never become false when low is 0.  The declaration of tmp
 * is not visible here; confirm that it is wider than 16 bits.
 */
269 for (tmp = cut; tmp <= high; ++(tmp)) {
270 key.port[1] = htons(tmp);
271 if (pf_find_state_all(&key, PF_IN, NULL) ==
277 for (tmp = cut - 1; tmp >= low; --(tmp)) {
278 key.port[1] = htons(tmp);
279 if (pf_find_state_all(&key, PF_IN, NULL) ==
/*
 * No free port with this address.  Round-robin pools ask pf_map_addr for
 * another address; the handling of the other pool types listed below is not
 * visible in this excerpt.
 */
287 switch (r->rpool.opts & PF_POOL_TYPEMASK) {
289 case PF_POOL_ROUNDROBIN:
290 if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
294 case PF_POOL_SRCHASH:
295 case PF_POOL_BITMASK:
/* Repeat until the pool hands back the first address again. */
299 } while (! PF_AEQ(&init_addr, naddr, af) );
300 return (1); /* none available */
/*
 * pf_map_addr: select the translation address naddr from the address pool
 * of rule r for a connection coming from saddr.
 *
 *   af        - address family
 *   r         - rule that owns the pool (r->rpool)
 *   saddr     - source address of the connection
 *   naddr     - out: selected address
 *   init_addr - may be NULL; when it is zero on entry it receives the first
 *               address selected, so that callers can detect wrap-around
 *   sn        - in/out: source node used for sticky-address tracking
 *
 * Returns 1 for the unsupported table / pool-type combination flagged
 * below.  The other return statements are not visible in this excerpt.
 */
304 pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
305 struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn)
307 struct pf_pool *rpool = &r->rpool;
308 struct pf_addr *raddr = NULL, *rmask = NULL;
/*
 * Sticky addresses: when no source node was passed in and the pool type is
 * not PF_POOL_NONE, look up the node for saddr and reuse the translation
 * address recorded in it if that address is non-zero.
 */
310 if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR &&
311 (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
312 *sn = pf_find_src_node(saddr, r, af, 0);
313 if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) {
314 PF_ACPY(naddr, &(*sn)->raddr, af);
315 if (V_pf_status.debug >= PF_DEBUG_MISC) {
316 printf("pf_map_addr: src tracking maps ");
317 pf_print_host(saddr, 0, af);
319 pf_print_host(naddr, 0, af);
/* Classify the current pool entry to find its address and mask. */
326 if (rpool->cur->addr.type == PF_ADDR_NOROUTE)
/*
 * Dynamic interface entry: use the IPv4 or IPv6 address and mask kept in
 * the dyn structure.  The address-count checks also depend on the pool
 * type; their right-hand side and outcome are not visible here.
 */
328 if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
332 if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 &&
333 (rpool->opts & PF_POOL_TYPEMASK) !=
336 raddr = &rpool->cur->addr.p.dyn->pfid_addr4;
337 rmask = &rpool->cur->addr.p.dyn->pfid_mask4;
342 if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 &&
343 (rpool->opts & PF_POOL_TYPEMASK) !=
346 raddr = &rpool->cur->addr.p.dyn->pfid_addr6;
347 rmask = &rpool->cur->addr.p.dyn->pfid_mask6;
/* Table entries are only usable with round-robin pools. */
351 } else if (rpool->cur->addr.type == PF_ADDR_TABLE) {
352 if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN)
353 return (1); /* unsupported */
/* Address and mask stored directly in the pool entry. */
355 raddr = &rpool->cur->addr.v.a.addr;
356 rmask = &rpool->cur->addr.v.a.mask;
/* Derive naddr according to the pool type. */
359 switch (rpool->opts & PF_POOL_TYPEMASK) {
/* Copy the pool address unchanged. */
361 PF_ACPY(naddr, raddr, af);
/* Bitmask: PF_POOLMASK combines raddr, rmask and saddr into naddr. */
363 case PF_POOL_BITMASK:
364 PF_POOLMASK(naddr, raddr, rmask, saddr, af);
/*
 * First selection for this caller (init_addr still zero): fill the counter
 * words with random values wherever the mask word is not all ones, map the
 * counter through the pool mask and remember the result in init_addr.
 */
367 if (init_addr != NULL && PF_AZERO(init_addr, af)) {
371 rpool->counter.addr32[0] = htonl(arc4random());
376 if (rmask->addr32[3] != 0xffffffff)
377 rpool->counter.addr32[3] =
381 if (rmask->addr32[2] != 0xffffffff)
382 rpool->counter.addr32[2] =
386 if (rmask->addr32[1] != 0xffffffff)
387 rpool->counter.addr32[1] =
391 if (rmask->addr32[0] != 0xffffffff)
392 rpool->counter.addr32[0] =
397 PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
398 PF_ACPY(init_addr, naddr, af);
/* Later selections: step the counter and map it through the pool mask. */
401 PF_AINC(&rpool->counter, af);
402 PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
/* Source hash: hash saddr with the pool key, then apply the pool mask. */
405 case PF_POOL_SRCHASH:
407 unsigned char hash[16];
409 pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
410 PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af);
413 case PF_POOL_ROUNDROBIN:
/* acur marks where the scan started, so a full lap can be detected. */
415 struct pf_pooladdr *acur = rpool->cur;
418 * XXXGL: in the round-robin case we need to store
419 * the round-robin machine state in the rule, thus
420 * forwarding thread needs to modify rule.
422 * This is done w/o locking, because performance is assumed
423 * more important than round-robin precision.
425 * In the simpliest case we just update the "rpool->cur"
426 * pointer. However, if pool contains tables or dynamic
427 * addresses, then "tblidx" is also used to store machine
428 * state. Since "tblidx" is int, concurrent access to it can't
429 * lead to inconsistence, only to lost of precision.
431 * Things get worse, if table contains not hosts, but
432 * prefixes. In this case counter also stores machine state,
433 * and for IPv6 address, counter can't be updated atomically.
434 * Probably, using round-robin on a table containing IPv6
435 * prefixes (or even IPv4) would cause a panic.
/*
 * First see whether the current entry can still supply an address, using
 * the table lookup for table and dynamic entries and an address match
 * against the counter otherwise.  The branch targets are not visible here.
 */
438 if (rpool->cur->addr.type == PF_ADDR_TABLE) {
439 if (!pfr_pool_get(rpool->cur->addr.p.tbl,
440 &rpool->tblidx, &rpool->counter, af))
442 } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
443 if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
444 &rpool->tblidx, &rpool->counter, af))
446 } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
/* Advance to the next pool entry, wrapping to the head of the list. */
450 if (TAILQ_NEXT(rpool->cur, entries) == NULL)
451 rpool->cur = TAILQ_FIRST(&rpool->list);
453 rpool->cur = TAILQ_NEXT(rpool->cur, entries);
454 if (rpool->cur->addr.type == PF_ADDR_TABLE) {
456 if (pfr_pool_get(rpool->cur->addr.p.tbl,
457 &rpool->tblidx, &rpool->counter, af)) {
458 /* table contains no address of type 'af' */
459 if (rpool->cur != acur)
463 } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
465 if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
466 &rpool->tblidx, &rpool->counter, af)) {
467 /* table contains no address of type 'af' */
468 if (rpool->cur != acur)
/* Plain entry: restart the counter at the address of the entry. */
473 raddr = &rpool->cur->addr.v.a.addr;
474 rmask = &rpool->cur->addr.v.a.mask;
475 PF_ACPY(&rpool->counter, raddr, af);
/*
 * Hand out the counter value, record it in init_addr on the first
 * selection, then advance the counter for the next caller.
 */
479 PF_ACPY(naddr, &rpool->counter, af);
480 if (init_addr != NULL && PF_AZERO(init_addr, af))
481 PF_ACPY(init_addr, naddr, af);
482 PF_AINC(&rpool->counter, af);
/* Record the chosen address in the source node (any guard is not visible). */
487 PF_ACPY(&(*sn)->raddr, naddr, af);
/* Optional debug output of the selected address. */
489 if (V_pf_status.debug >= PF_DEBUG_MISC &&
490 (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
491 printf("pf_map_addr: selected address ");
492 pf_print_host(naddr, 0, af);
/*
 * pf_get_translation: find the translation rule that applies to a packet
 * and build the original and the translated state key for it.
 *
 *   pd, m, off   - packet descriptor, mbuf and offset
 *   direction    - PF_OUT consults the binat and nat rulesets; any other
 *                  direction consults the rdr and binat rulesets
 *   kif          - interface the packet is evaluated on
 *   sn           - source node, handed on to the address-pool helpers
 *   skp          - out: state key built from the untranslated addresses
 *                  and ports; must point to NULL on entry
 *   nkp          - out: clone of *skp that receives the translation; must
 *                  point to NULL on entry
 *   saddr, daddr - packet source and destination address
 *   sport, dport - packet source and destination port
 *   anchor_stack - passed through to pf_match_translation
 *
 * If the translated key ends up identical to the original one, both keys
 * are released again.  The return statements are not visible in this
 * excerpt.
 */
500 pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
501 struct pfi_kif *kif, struct pf_src_node **sn,
502 struct pf_state_key **skp, struct pf_state_key **nkp,
503 struct pf_addr *saddr, struct pf_addr *daddr,
504 uint16_t sport, uint16_t dport, struct pf_anchor_stackframe *anchor_stack)
506 struct pf_rule *r = NULL;
507 struct pf_addr *naddr;
/* Both output slots must be empty on entry. */
511 KASSERT(*skp == NULL, ("*skp not NULL"));
512 KASSERT(*nkp == NULL, ("*nkp not NULL"));
/*
 * Outbound packets are matched against the binat and then the nat ruleset,
 * all others against the rdr and then the binat ruleset.  The condition
 * guarding the second lookup of each pair is not visible here.
 */
514 if (direction == PF_OUT) {
515 r = pf_match_translation(pd, m, off, direction, kif, saddr,
516 sport, daddr, dport, PF_RULESET_BINAT, anchor_stack);
518 r = pf_match_translation(pd, m, off, direction, kif,
519 saddr, sport, daddr, dport, PF_RULESET_NAT,
522 r = pf_match_translation(pd, m, off, direction, kif, saddr,
523 sport, daddr, dport, PF_RULESET_RDR, anchor_stack);
525 r = pf_match_translation(pd, m, off, direction, kif,
526 saddr, sport, daddr, dport, PF_RULESET_BINAT,
/*
 * Build the key for the untranslated connection and clone it as the basis
 * for the translated key.
 */
540 *skp = pf_state_key_setup(pd, saddr, daddr, sport, dport);
543 *nkp = pf_state_key_clone(*skp);
/*
 * Cloning failed: release the original key.  uma_zfree() takes the item
 * itself, so pass *skp (the key) and not skp (the address of the pointer
 * variable owned by the caller), matching the frees at the end of this
 * function.
 */
545 uma_zfree(V_pf_state_key_z, *skp);
550 /* XXX We only modify one side for now. */
/* naddr and nport alias the second address / port slot of the new key. */
551 naddr = &(*nkp)->addr[1];
552 nport = &(*nkp)->port[1];
/*
 * Source translation: pick an address and a proxy port from the range
 * configured in the rule; log at debug level when the allocation fails.
 */
556 if (pf_get_sport(pd->af, pd->proto, r, saddr, sport, daddr,
557 dport, naddr, nport, r->rpool.proxy_port[0],
558 r->rpool.proxy_port[1], sn)) {
559 DPFPRINTF(PF_DEBUG_MISC,
560 ("pf: NAT proxy port allocation (%u-%u) failed\n",
561 r->rpool.proxy_port[0], r->rpool.proxy_port[1]));
/*
 * Address translation from the pool entry combined with saddr, presumably
 * through PF_POOLMASK as in the r->src case below (the call lines are not
 * visible).  Dynamic-interface entries use their per-family address and
 * mask.
 */
568 if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){
572 if (r->rpool.cur->addr.p.dyn->
576 &r->rpool.cur->addr.p.dyn->
578 &r->rpool.cur->addr.p.dyn->
579 pfid_mask4, saddr, AF_INET);
584 if (r->rpool.cur->addr.p.dyn->
588 &r->rpool.cur->addr.p.dyn->
590 &r->rpool.cur->addr.p.dyn->
591 pfid_mask6, saddr, AF_INET6);
597 &r->rpool.cur->addr.v.a.addr,
598 &r->rpool.cur->addr.v.a.mask, saddr,
/*
 * Reverse mapping: the new address is derived from the source address of
 * the rule combined with daddr.  Dynamic-interface sources are checked for
 * having at least one address of the family first.
 */
602 if (r->src.addr.type == PF_ADDR_DYNIFTL) {
606 if (r->src.addr.p.dyn-> pfid_acnt4 < 1)
609 &r->src.addr.p.dyn->pfid_addr4,
610 &r->src.addr.p.dyn->pfid_mask4,
616 if (r->src.addr.p.dyn->pfid_acnt6 < 1)
619 &r->src.addr.p.dyn->pfid_addr6,
620 &r->src.addr.p.dyn->pfid_mask6,
626 PF_POOLMASK(naddr, &r->src.addr.v.a.addr,
627 &r->src.addr.v.a.mask, daddr, pd->af);
/*
 * Pool-based address selection, with an additional mask step for bitmask
 * pools.
 */
632 if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn))
634 if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK)
635 PF_POOLMASK(naddr, naddr, &r->rpool.cur->addr.v.a.mask,
/*
 * Port mapping.  With a proxy port range, dport is mapped into the range by
 * its offset from the first destination port of the rule; with a single
 * proxy port, that port is used as is.
 */
638 if (r->rpool.proxy_port[1]) {
641 tmp_nport = ((ntohs(dport) - ntohs(r->dst.port[0])) %
642 (r->rpool.proxy_port[1] - r->rpool.proxy_port[0] +
643 1)) + r->rpool.proxy_port[0];
645 /* Wrap around if necessary. */
646 if (tmp_nport > 65535)
648 *nport = htons((uint16_t)tmp_nport);
649 } else if (r->rpool.proxy_port[0])
650 *nport = htons(r->rpool.proxy_port[0]);
/* Any other rule action is a programming error. */
654 panic("%s: unknown action %u", __func__, r->action);
657 /* Return success only if translation really happened. */
658 if (bcmp(*skp, *nkp, sizeof(struct pf_state_key_cmp)))
/* Nothing changed: give both keys back to the zone. */
662 uma_zfree(V_pf_state_key_z, *nkp);
663 uma_zfree(V_pf_state_key_z, *skp);