2 * SPDX-License-Identifier: BSD-2-Clause
4 * Copyright (c) 2001 Daniel Hartmeier
5 * Copyright (c) 2002 - 2008 Henning Brauer
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * - Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials provided
17 * with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
32 * Effort sponsored in part by the Defense Advanced Research Projects
33 * Agency (DARPA) and Air Force Research Laboratory, Air Force
34 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
36 * $OpenBSD: pf_lb.c,v 1.2 2009/02/12 02:13:15 sthen Exp $
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
44 #include "opt_inet6.h"
46 #include <sys/param.h>
49 #include <sys/socket.h>
50 #include <sys/sysctl.h>
54 #include <net/pfvar.h>
55 #include <net/if_pflog.h>
/*
 * Conditional debug print: the printf is executed only when
 * V_pf_status.debug is at least level n.  x is pasted directly after
 * printf, so it must be a complete, parenthesized argument list.
 * NOTE(review): the expansion is a bare if statement with no enclosing
 * do/while, so an else that follows a use of this macro would bind to it.
 */
57 #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
/*
 * Forward declarations of the file-local helpers pf_hash(),
 * pf_match_translation() and pf_get_sport(); their definitions appear
 * further down in this file.
 */
59 static void pf_hash(struct pf_addr *, struct pf_addr *,
60 struct pf_poolhashkey *, sa_family_t);
61 static struct pf_krule *pf_match_translation(struct pf_pdesc *, struct mbuf *,
62 int, int, struct pfi_kkif *,
63 struct pf_addr *, u_int16_t, struct pf_addr *,
64 uint16_t, int, struct pf_kanchor_stackframe *);
65 static int pf_get_sport(sa_family_t, uint8_t, struct pf_krule *,
66 struct pf_addr *, uint16_t, struct pf_addr *, uint16_t, struct pf_addr *,
67 uint16_t *, uint16_t, uint16_t, struct pf_ksrc_node **);
/*
 * Body of a multi-line macro (every line ends in a backslash) that stirs
 * three values a, b and c together: nine steps, each subtracting the other
 * two values from one of them and then xor-ing in a shifted copy of a
 * neighbour.
 * NOTE(review): the #define line that names this macro is not part of this
 * excerpt; it is presumably mix(a, b, c) -- confirm against the full file.
 */
71 a -= b; a -= c; a ^= (c >> 13); \
72 b -= c; b -= a; b ^= (a << 8); \
73 c -= a; c -= b; c ^= (b >> 13); \
74 a -= b; a -= c; a ^= (c >> 12); \
75 b -= c; b -= a; b ^= (a << 16); \
76 c -= a; c -= b; c ^= (b >> 5); \
77 a -= b; a -= c; a ^= (c >> 3); \
78 b -= c; b -= a; b ^= (a << 10); \
79 c -= a; c -= b; c ^= (b >> 15); \
83 * hash function based on bridge_hash in if_bridge.c
/*
 * pf_hash: derive a hashed address from inaddr and a pool hash key.
 *
 * inaddr - input address; its 32-bit words (addr32[0..3]) are added into
 *          the running values a and b
 * hash   - output address; the visible code stores c + key->key32[2] into
 *          hash->addr32[0]
 * key    - pool hash key; key32[0] seeds c
 * af     - address family
 *
 * a and b both start from the constant 0x9e3779b9.
 * NOTE(review): the selection on af, the mixing calls between the
 * additions and the stores to the remaining output words are on lines that
 * are not part of this excerpt -- confirm against the full file.
 */
86 pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
87 struct pf_poolhashkey *key, sa_family_t af)
89 u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0];
94 a += inaddr->addr32[0];
97 hash->addr32[0] = c + key->key32[2];
/*
 * The four rounds below each feed a different pairing of the input words
 * into a and b: (0,2), (1,3), (2,1) and (3,0).
 */
102 a += inaddr->addr32[0];
103 b += inaddr->addr32[2];
106 a += inaddr->addr32[1];
107 b += inaddr->addr32[3];
111 a += inaddr->addr32[2];
112 b += inaddr->addr32[1];
116 a += inaddr->addr32[3];
117 b += inaddr->addr32[0];
/*
 * pf_match_translation: walk the active rules of translation ruleset
 * rs_num and look for the first rule that matches the packet.
 *
 * pd           - packet descriptor; af, proto, pf_mtag and hdr.tcp are read
 * m            - packet mbuf; used for M_GETFIB(), tag matching, tagging
 *                and M_SETFIB()
 * off          - offset handed to pf_osfp_fingerprint() for OS matching
 * direction    - packet direction, compared against each rule direction;
 *                PF_IN is special-cased for PF_BINAT rules
 * kif          - interface to match against the rule interface
 * saddr, sport - source address and port to match
 * daddr, dport - destination address and port to match
 * rs_num       - index into pf_main_ruleset.rules[]
 * anchor_stack - passed through to pf_step_into_anchor() and
 *                pf_step_out_of_anchor()
 *
 * The walk stops as soon as a matching rule has been stored in rm.  The
 * final test singles out the negating actions PF_NONAT, PF_NORDR and
 * PF_NOBINAT.
 * NOTE(review): several lines of this function (braces, else branches,
 * local declarations and the return statements) are not present in this
 * excerpt, so the exact return value of each path should be confirmed
 * against the complete file.
 */
126 static struct pf_krule *
127 pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
128 int direction, struct pfi_kkif *kif, struct pf_addr *saddr, u_int16_t sport,
129 struct pf_addr *daddr, uint16_t dport, int rs_num,
130 struct pf_kanchor_stackframe *anchor_stack)
132 struct pf_krule *r, *rm = NULL;
133 struct pf_kruleset *ruleset = NULL;
/* Start at the head of the active rule list of the requested ruleset. */
138 r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
139 while (r && rm == NULL) {
140 struct pf_rule_addr *src = NULL, *dst = NULL;
141 struct pf_addr_wrap *xdst = NULL;
/*
 * Inbound binat: when the pool has a current entry, its address becomes
 * xdst, which is compared against daddr further down.
 */
143 if (r->action == PF_BINAT && direction == PF_IN) {
145 if (r->rpool.cur != NULL)
146 xdst = &r->rpool.cur->addr;
/*
 * Count the rule as evaluated, then test one criterion after another.
 * A mismatch advances r, either through the r->skip[] entry for that
 * criterion or, where the code uses TAILQ_NEXT, to the next list entry.
 */
152 counter_u64_add(r->evaluations, 1);
153 if (pfi_kkif_match(r->kif, kif) == r->ifnot)
154 r = r->skip[PF_SKIP_IFP].ptr;
155 else if (r->direction && r->direction != direction)
156 r = r->skip[PF_SKIP_DIR].ptr;
157 else if (r->af && r->af != pd->af)
158 r = r->skip[PF_SKIP_AF].ptr;
159 else if (r->proto && r->proto != pd->proto)
160 r = r->skip[PF_SKIP_PROTO].ptr;
161 else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
162 src->neg, kif, M_GETFIB(m)))
163 r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
164 PF_SKIP_DST_ADDR].ptr;
165 else if (src->port_op && !pf_match_port(src->port_op,
166 src->port[0], src->port[1], sport))
167 r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
168 PF_SKIP_DST_PORT].ptr;
169 else if (dst != NULL &&
170 PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL,
172 r = r->skip[PF_SKIP_DST_ADDR].ptr;
173 else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af,
174 0, NULL, M_GETFIB(m)))
175 r = TAILQ_NEXT(r, entries);
176 else if (dst != NULL && dst->port_op &&
177 !pf_match_port(dst->port_op, dst->port[0],
178 dst->port[1], dport))
179 r = r->skip[PF_SKIP_DST_PORT].ptr;
180 else if (r->match_tag && !pf_match_tag(m, r, &tag,
181 pd->pf_mtag ? pd->pf_mtag->tag : 0))
182 r = TAILQ_NEXT(r, entries);
183 else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
184 IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m,
185 off, &pd->hdr.tcp), r->os_fingerprint)))
186 r = TAILQ_NEXT(r, entries);
/* Remember the routing table id of the rule when it is non-negative. */
190 if (r->rtableid >= 0)
191 rtableid = r->rtableid;
192 if (r->anchor == NULL) {
195 pf_step_into_anchor(anchor_stack, &asd,
196 &ruleset, rs_num, &r, NULL, NULL);
199 pf_step_out_of_anchor(anchor_stack, &asd, &ruleset,
200 rs_num, &r, NULL, NULL);
/*
 * After the walk: tag the packet when a tag was collected and apply the
 * routing table id to the mbuf.
 */
203 if (tag > 0 && pf_tag_packet(m, pd, tag))
206 M_SETFIB(m, rtableid);
208 if (rm != NULL && (rm->action == PF_NONAT ||
209 rm->action == PF_NORDR || rm->action == PF_NOBINAT))
/*
 * pf_get_sport: choose a translation address (naddr) and source port
 * (nport) for rule r such that the resulting state key is not already in
 * use.
 *
 * af, proto    - address family and protocol of the connection
 * r            - translation rule whose address pool is consulted
 * saddr, sport - original source address and port
 * daddr, dport - destination address and port
 * naddr        - out: address picked by pf_map_addr()
 * nport        - out: chosen port (the assignments are on lines that are
 *                not part of this excerpt)
 * low, high    - inclusive range of candidate ports; values are passed
 *                through htons() before being stored in the key
 * sn           - source node pointer, passed through to pf_map_addr()
 *
 * Every candidate is probed with pf_find_state_all_exists(&key, PF_IN).
 * For protocols other than TCP, UDP and ICMP, or when low and high are
 * both 0, a single existence check is made without a port search.  When
 * low == high only that port is tried.  Otherwise a random starting point
 * cut is drawn from [low, high] and the range is scanned upwards from cut
 * and then downwards below it.  When every port is taken, the pool type
 * decides whether another address is requested from pf_map_addr(); the
 * outer loop ends once naddr is equal to init_addr again.
 *
 * Returns 1 when nothing is available.  Callers in this file treat a zero
 * return as success.
 * NOTE(review): the success returns, the opening of the outer loop and
 * some declarations are on lines missing from this excerpt.
 */
215 pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_krule *r,
216 struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr,
217 uint16_t dport, struct pf_addr *naddr, uint16_t *nport, uint16_t low,
218 uint16_t high, struct pf_ksrc_node **sn)
220 struct pf_state_key_cmp key;
221 struct pf_addr init_addr;
/*
 * init_addr starts out zeroed; pf_map_addr() fills it with the first
 * address it hands out, which the loop condition at the end compares
 * against naddr.
 */
223 bzero(&init_addr, sizeof(init_addr));
224 if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
227 bzero(&key, sizeof(key));
231 PF_ACPY(&key.addr[0], daddr, key.af);
234 PF_ACPY(&key.addr[1], naddr, key.af);
237 * port search; start random, step;
238 * similar 2 portloop in in_pcbbind
240 if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
241 proto == IPPROTO_ICMP) || (low == 0 && high == 0)) {
243 * XXX bug: icmp states don't use the id on both sides.
244 * (traceroute -I through nat)
247 if (!pf_find_state_all_exists(&key, PF_IN)) {
251 } else if (low == high) {
252 key.port[1] = htons(low);
253 if (!pf_find_state_all_exists(&key, PF_IN)) {
267 cut = arc4random() % (1 + high - low) + low;
268 /* low <= cut <= high */
269 for (tmp = cut; tmp <= high && tmp <= 0xffff; ++tmp) {
270 key.port[1] = htons(tmp);
271 if (!pf_find_state_all_exists(&key, PF_IN)) {
277 for (tmp -= 1; tmp >= low && tmp <= 0xffff; --tmp) {
278 key.port[1] = htons(tmp);
279 if (!pf_find_state_all_exists(&key, PF_IN)) {
286 switch (r->rpool.opts & PF_POOL_TYPEMASK) {
288 case PF_POOL_ROUNDROBIN:
290 * pick a different source address since we're out
291 * of free port choices for the current one.
293 if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
297 case PF_POOL_SRCHASH:
298 case PF_POOL_BITMASK:
302 } while (! PF_AEQ(&init_addr, naddr, af) );
303 return (1); /* none available */
/*
 * pf_get_mape_sport: pick a source port for a rule that has MAP-E
 * parameters (r->rpool.mape) by trying pf_get_sport() on one sub-range of
 * ports after another.
 *
 * The 16-bit port is treated as three fields, from the top down:
 *   - the upper mape.offset bits hold the index i of the sub-range,
 *   - the next mape.psidlen bits hold mape.psid (psmask),
 *   - the remaining low bits (highmask) are free.
 * For each i the candidate range is [low, low | highmask] with
 * low = (i << ashift) | psmask.  The scan starts at a random index cut,
 * runs upwards to ahigh and then downwards from cut - 1 while i > 0, so
 * index 0 is never tried by the downward scan.
 *
 * All other parameters are handed to pf_get_sport() unchanged; a zero
 * return from it is tested with the ! operator.
 * NOTE(review): i and cut are uint16_t, so when cut is 0 the expression
 * cut - 1 wraps to 0xffff and the downward scan starts far above ahigh --
 * confirm whether that is intended.
 * NOTE(review): the return statements are on lines missing from this
 * excerpt.
 */
307 pf_get_mape_sport(sa_family_t af, u_int8_t proto, struct pf_krule *r,
308 struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr,
309 uint16_t dport, struct pf_addr *naddr, uint16_t *nport,
310 struct pf_ksrc_node **sn)
312 uint16_t psmask, low, highmask;
313 uint16_t i, ahigh, cut;
314 int ashift, psidshift;
/* Bit positions of the index field and of the psid field in the port. */
316 ashift = 16 - r->rpool.mape.offset;
317 psidshift = ashift - r->rpool.mape.psidlen;
/* psid limited to psidlen bits, moved into its position in the port. */
318 psmask = r->rpool.mape.psid & ((1U << r->rpool.mape.psidlen) - 1);
319 psmask = psmask << psidshift;
320 highmask = (1U << psidshift) - 1;
/* Largest index value and a random index to start from. */
322 ahigh = (1U << r->rpool.mape.offset) - 1;
323 cut = arc4random() & ahigh;
327 for (i = cut; i <= ahigh; i++) {
328 low = (i << ashift) | psmask;
329 if (!pf_get_sport(af, proto, r, saddr, sport, daddr, dport,
330 naddr, nport, low, low | highmask, sn))
333 for (i = cut - 1; i > 0; i--) {
334 low = (i << ashift) | psmask;
335 if (!pf_get_sport(af, proto, r, saddr, sport, daddr, dport,
336 naddr, nport, low, low | highmask, sn))
/*
 * pf_map_addr: select an address (naddr) from the address pool of rule r.
 *
 * af        - address family of the addresses involved
 * r         - rule whose pool (r->rpool) is used
 * saddr     - source address; input to sticky-address lookup, bitmask
 *             mapping and source hashing
 * naddr     - out: the selected address
 * init_addr - may be NULL; when non-NULL and still all-zero it receives
 *             the first address handed out (random and round-robin paths)
 * sn        - in/out source node; looked up with pf_find_src_node() for
 *             sticky-address pools, and its raddr is updated with the
 *             selection
 *
 * Outline of the visible code:
 *   1. With a source node that already carries a non-zero raddr, that
 *      address is reused.
 *   2. raddr and rmask are taken from the current pool entry, depending on
 *      its address type (dynamic interface, table, or plain address and
 *      mask).  Tables are only accepted for round-robin pools.
 *   3. The pool type (rpool->opts & PF_POOL_TYPEMASK) selects how naddr is
 *      produced: plain copy, bitmask combination with saddr, a counter
 *      seeded from arc4random(), a hash of saddr, or round-robin over the
 *      pool entries.
 *   4. The result is stored in the source node and optionally printed.
 *
 * Callers in this file treat a non-zero return as failure; the only
 * return visible here is return (1) for an unsupported table pool.
 * NOTE(review): several case labels, braces and return statements are on
 * lines missing from this excerpt -- confirm details against the full
 * file.
 */
343 pf_map_addr(sa_family_t af, struct pf_krule *r, struct pf_addr *saddr,
344 struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_ksrc_node **sn)
346 struct pf_kpool *rpool = &r->rpool;
347 struct pf_addr *raddr = NULL, *rmask = NULL;
349 /* Try to find a src_node if none was given and this
350 is a sticky-address rule. */
351 if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR &&
352 (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE)
353 *sn = pf_find_src_node(saddr, r, af, 0);
355 /* If a src_node was found or explicitly given and it has a non-zero
356 route address, use this address. A zeroed address is found if the
357 src node was created just a moment ago in pf_create_state and it
358 needs to be filled in with routing decision calculated here. */
359 if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) {
360 /* If the supplied address is the same as the current one we've
361 * been asked before, so tell the caller that there's no other
362 * address to be had. */
363 if (PF_AEQ(naddr, &(*sn)->raddr, af))
366 PF_ACPY(naddr, &(*sn)->raddr, af);
367 if (V_pf_status.debug >= PF_DEBUG_MISC) {
368 printf("pf_map_addr: src tracking maps ");
369 pf_print_host(saddr, 0, af);
371 pf_print_host(naddr, 0, af);
377 /* Find the route using chosen algorithm. Store the found route
378 in src_node if it was given or found. */
379 if (rpool->cur->addr.type == PF_ADDR_NOROUTE)
/*
 * Dynamic interface address: use the IPv4 or IPv6 address and mask of the
 * interface, after checking the matching address count.
 */
381 if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
385 if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 &&
386 (rpool->opts & PF_POOL_TYPEMASK) !=
389 raddr = &rpool->cur->addr.p.dyn->pfid_addr4;
390 rmask = &rpool->cur->addr.p.dyn->pfid_mask4;
395 if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 &&
396 (rpool->opts & PF_POOL_TYPEMASK) !=
399 raddr = &rpool->cur->addr.p.dyn->pfid_addr6;
400 rmask = &rpool->cur->addr.p.dyn->pfid_mask6;
404 } else if (rpool->cur->addr.type == PF_ADDR_TABLE) {
405 if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN)
406 return (1); /* unsupported */
408 raddr = &rpool->cur->addr.v.a.addr;
409 rmask = &rpool->cur->addr.v.a.mask;
/* Produce naddr according to the pool type. */
412 switch (rpool->opts & PF_POOL_TYPEMASK) {
414 PF_ACPY(naddr, raddr, af);
416 case PF_POOL_BITMASK:
417 PF_POOLMASK(naddr, raddr, rmask, saddr, af);
/*
 * First use (init_addr still zero): seed the pool counter from
 * arc4random().  For the per-word assignments only words whose mask is
 * not all-ones are touched.  Later calls just increment the counter.
 */
420 if (init_addr != NULL && PF_AZERO(init_addr, af)) {
424 rpool->counter.addr32[0] = htonl(arc4random());
429 if (rmask->addr32[3] != 0xffffffff)
430 rpool->counter.addr32[3] =
434 if (rmask->addr32[2] != 0xffffffff)
435 rpool->counter.addr32[2] =
439 if (rmask->addr32[1] != 0xffffffff)
440 rpool->counter.addr32[1] =
444 if (rmask->addr32[0] != 0xffffffff)
445 rpool->counter.addr32[0] =
450 PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
451 PF_ACPY(init_addr, naddr, af);
454 PF_AINC(&rpool->counter, af);
455 PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
/* Source hash: hash saddr with the pool key and mask it into the pool. */
458 case PF_POOL_SRCHASH:
460 unsigned char hash[16];
462 pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
463 PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af);
466 case PF_POOL_ROUNDROBIN:
468 struct pf_kpooladdr *acur = rpool->cur;
471 * XXXGL: in the round-robin case we need to store
472 * the round-robin machine state in the rule, thus
473 * forwarding thread needs to modify rule.
475 * This is done w/o locking, because performance is assumed
476 * more important than round-robin precision.
478 * In the simpliest case we just update the "rpool->cur"
479 * pointer. However, if pool contains tables or dynamic
480 * addresses, then "tblidx" is also used to store machine
481 * state. Since "tblidx" is int, concurrent access to it can't
482 * lead to inconsistence, only to lost of precision.
484 * Things get worse, if table contains not hosts, but
485 * prefixes. In this case counter also stores machine state,
486 * and for IPv6 address, counter can't be updated atomically.
487 * Probably, using round-robin on a table containing IPv6
488 * prefixes (or even IPv4) would cause a panic.
491 if (rpool->cur->addr.type == PF_ADDR_TABLE) {
492 if (!pfr_pool_get(rpool->cur->addr.p.tbl,
493 &rpool->tblidx, &rpool->counter, af))
495 } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
496 if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
497 &rpool->tblidx, &rpool->counter, af))
499 } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
/* Advance to the next pool entry, wrapping around to the first one. */
503 if (TAILQ_NEXT(rpool->cur, entries) == NULL)
504 rpool->cur = TAILQ_FIRST(&rpool->list);
506 rpool->cur = TAILQ_NEXT(rpool->cur, entries);
507 if (rpool->cur->addr.type == PF_ADDR_TABLE) {
509 if (pfr_pool_get(rpool->cur->addr.p.tbl,
510 &rpool->tblidx, &rpool->counter, af)) {
511 /* table contains no address of type 'af' */
512 if (rpool->cur != acur)
516 } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
518 if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
519 &rpool->tblidx, &rpool->counter, af)) {
520 /* table contains no address of type 'af' */
521 if (rpool->cur != acur)
526 raddr = &rpool->cur->addr.v.a.addr;
527 rmask = &rpool->cur->addr.v.a.mask;
528 PF_ACPY(&rpool->counter, raddr, af);
/*
 * Hand out the current counter value, record it in init_addr on first
 * use, and advance the counter for the next call.
 */
532 PF_ACPY(naddr, &rpool->counter, af);
533 if (init_addr != NULL && PF_AZERO(init_addr, af))
534 PF_ACPY(init_addr, naddr, af);
535 PF_AINC(&rpool->counter, af);
/* Record the selection in the source node. */
540 PF_ACPY(&(*sn)->raddr, naddr, af);
542 if (V_pf_status.debug >= PF_DEBUG_MISC &&
543 (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
544 printf("pf_map_addr: selected address ");
545 pf_print_host(naddr, 0, af);
/*
 * pf_get_translation: find the translation rule that applies to a packet
 * and build the original and translated state keys for it.
 *
 * pd           - packet descriptor (af and proto are read here)
 * m, off       - packet mbuf and offset, passed to pf_match_translation()
 * direction    - PF_OUT selects the binat and nat rulesets; any other
 *                value selects the rdr and binat rulesets
 * kif          - interface, passed to pf_match_translation()
 * sn           - source node pointer, passed to the address and port
 *                selection helpers
 * skp          - out: original state key from pf_state_key_setup(); must
 *                point to NULL on entry (asserted)
 * nkp          - out: clone of *skp that receives the translated address
 *                and port; must point to NULL on entry (asserted)
 * saddr, daddr - source and destination address of the packet
 * sport, dport - source and destination port of the packet
 * anchor_stack - passed to pf_match_translation()
 *
 * Only one side of the new key (addr[1] and port[1]) is modified.  The
 * work done depends on the action of the matched rule: port allocation
 * through pf_get_mape_sport() or pf_get_sport(), PF_POOLMASK() against the
 * pool or rule source address, or pf_map_addr() followed by a port
 * mapping based on rpool.proxy_port.  An unknown action panics.  At the
 * end both keys are compared with bcmp(), and the visible code frees
 * both keys through uma_zfree().
 * NOTE(review): the switch and case labels, else branches, error paths
 * and return statements are on lines missing from this excerpt, and the
 * function continues past the end of the excerpt -- confirm the return
 * contract against the full file.
 */
553 pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
554 struct pfi_kkif *kif, struct pf_ksrc_node **sn,
555 struct pf_state_key **skp, struct pf_state_key **nkp,
556 struct pf_addr *saddr, struct pf_addr *daddr,
557 uint16_t sport, uint16_t dport, struct pf_kanchor_stackframe *anchor_stack)
559 struct pf_krule *r = NULL;
560 struct pf_addr *naddr;
565 KASSERT(*skp == NULL, ("*skp not NULL"));
566 KASSERT(*nkp == NULL, ("*nkp not NULL"));
/*
 * Outbound packets consult the binat ruleset and then the nat ruleset;
 * other packets consult the rdr ruleset and then the binat ruleset.
 * NOTE(review): the second lookup in each direction is presumably only
 * done when the first one found nothing -- the guarding lines are not in
 * this excerpt.
 */
568 if (direction == PF_OUT) {
569 r = pf_match_translation(pd, m, off, direction, kif, saddr,
570 sport, daddr, dport, PF_RULESET_BINAT, anchor_stack);
572 r = pf_match_translation(pd, m, off, direction, kif,
573 saddr, sport, daddr, dport, PF_RULESET_NAT,
576 r = pf_match_translation(pd, m, off, direction, kif, saddr,
577 sport, daddr, dport, PF_RULESET_RDR, anchor_stack);
579 r = pf_match_translation(pd, m, off, direction, kif,
580 saddr, sport, daddr, dport, PF_RULESET_BINAT,
/* Build the original key and a clone that will carry the translation. */
594 *skp = pf_state_key_setup(pd, saddr, daddr, sport, dport);
597 *nkp = pf_state_key_clone(*skp);
599 uma_zfree(V_pf_state_key_z, *skp);
604 /* XXX We only modify one side for now. */
605 naddr = &(*nkp)->addr[1];
606 nport = &(*nkp)->port[1];
610 if (pd->proto == IPPROTO_ICMP) {
614 low = r->rpool.proxy_port[0];
615 high = r->rpool.proxy_port[1];
/* A non-zero mape.offset selects MAP-E port allocation. */
617 if (r->rpool.mape.offset > 0) {
618 if (pf_get_mape_sport(pd->af, pd->proto, r, saddr,
619 sport, daddr, dport, naddr, nport, sn)) {
620 DPFPRINTF(PF_DEBUG_MISC,
621 ("pf: MAP-E port allocation (%u/%u/%u)"
623 r->rpool.mape.offset,
624 r->rpool.mape.psidlen,
625 r->rpool.mape.psid));
628 } else if (pf_get_sport(pd->af, pd->proto, r, saddr, sport,
629 daddr, dport, naddr, nport, low, high, sn)) {
630 DPFPRINTF(PF_DEBUG_MISC,
631 ("pf: NAT proxy port allocation (%u-%u) failed\n",
632 r->rpool.proxy_port[0], r->rpool.proxy_port[1]));
/*
 * Address mapping through PF_POOLMASK(): first against the current pool
 * entry (dynamic interface or plain address and mask, combined with
 * saddr), then against the rule source address (combined with daddr).
 */
639 if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){
643 if (r->rpool.cur->addr.p.dyn->
647 &r->rpool.cur->addr.p.dyn->
649 &r->rpool.cur->addr.p.dyn->
650 pfid_mask4, saddr, AF_INET);
655 if (r->rpool.cur->addr.p.dyn->
659 &r->rpool.cur->addr.p.dyn->
661 &r->rpool.cur->addr.p.dyn->
662 pfid_mask6, saddr, AF_INET6);
668 &r->rpool.cur->addr.v.a.addr,
669 &r->rpool.cur->addr.v.a.mask, saddr,
673 if (r->src.addr.type == PF_ADDR_DYNIFTL) {
677 if (r->src.addr.p.dyn-> pfid_acnt4 < 1)
680 &r->src.addr.p.dyn->pfid_addr4,
681 &r->src.addr.p.dyn->pfid_mask4,
687 if (r->src.addr.p.dyn->pfid_acnt6 < 1)
690 &r->src.addr.p.dyn->pfid_addr6,
691 &r->src.addr.p.dyn->pfid_mask6,
697 PF_POOLMASK(naddr, &r->src.addr.v.a.addr,
698 &r->src.addr.v.a.mask, daddr, pd->af);
/* Address selection from the pool; init_addr is not tracked here (NULL). */
703 if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn))
705 if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK)
706 PF_POOLMASK(naddr, naddr, &r->rpool.cur->addr.v.a.mask,
/*
 * Port mapping: with an upper proxy port set, the distance of dport from
 * the first destination port of the rule is folded (modulo the size of
 * the proxy range) onto the proxy port range; with only a lower proxy
 * port set, that port is used as is.
 */
709 if (r->rpool.proxy_port[1]) {
712 tmp_nport = ((ntohs(dport) - ntohs(r->dst.port[0])) %
713 (r->rpool.proxy_port[1] - r->rpool.proxy_port[0] +
714 1)) + r->rpool.proxy_port[0];
716 /* Wrap around if necessary. */
717 if (tmp_nport > 65535)
719 *nport = htons((uint16_t)tmp_nport);
720 } else if (r->rpool.proxy_port[0])
721 *nport = htons(r->rpool.proxy_port[0]);
725 panic("%s: unknown action %u", __func__, r->action);
728 /* Return success only if translation really happened. */
729 if (bcmp(*skp, *nkp, sizeof(struct pf_state_key_cmp)))
733 uma_zfree(V_pf_state_key_z, *nkp);
734 uma_zfree(V_pf_state_key_z, *skp);