2 * Copyright (c) 2014 Yandex LLC
3 * Copyright (c) 2014 Alexander V. Chernikov
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
31 * Multi-field value support for ipfw tables.
33 * This file contains the functions necessary to convert
34 * large multi-field values into u32 indices suitable to be fed
35 * to various table algorithms. Other machinery, such as proper
36 * refcounting and resizing of internal structures, is also kept here.
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/malloc.h>
44 #include <sys/kernel.h>
47 #include <sys/rwlock.h>
48 #include <sys/rmlock.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/queue.h>
52 #include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
54 #include <netinet/in.h>
55 #include <netinet/ip_var.h> /* struct ipfw_rule_ref */
56 #include <netinet/ip_fw.h>
58 #include <netpfil/ipfw/ip_fw_private.h>
59 #include <netpfil/ipfw/ip_fw_table.h>
61 static uint32_t hash_table_value(struct namedobj_instance *ni, const void *key,
63 static int cmp_table_value(struct named_object *no, const void *key,
66 static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
67 struct sockopt_data *sd);
69 static struct ipfw_sopt_handler scodes[] = {
70 { IP_FW_TABLE_VLIST, 0, HDIR_GET, list_table_values },
73 #define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash)
77 struct named_object no;
78 struct table_value *pval; /* Pointer to real table value */
80 #define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */
83 struct ip_fw_chain *ch;
84 struct sockopt_data *sd;
85 struct table_value *pval;
/*
 * Hash callback installed on the value hash (see ipfw_table_value_init()):
 * hashes the first 56 bytes of @key via hash32_buf(), passing 0 as the
 * third argument. @ni and @kopt are not used by the visible code.
 * NOTE(review): 56 is presumably the length of the comparable prefix of
 * struct table_value (cmp_table_value() uses the same constant) - confirm
 * and consider a named constant.
 */
90 hash_table_value(struct namedobj_instance *ni, const void *key, uint32_t kopt)
93 return (hash32_buf(key, 56, 0));
/*
 * Compares the first 56 bytes of the value referenced by link object @no
 * (viewed as struct table_val_link) with @key. Returns the memcmp() result,
 * so 0 means the compared bytes are equal. @kopt is not used here.
 * NOTE(review): the length must stay in sync with hash_table_value().
 */
97 cmp_table_value(struct named_object *no, const void *key, uint32_t kopt)
100 return (memcmp(((struct table_val_link *)no)->pval, key, 56));
/*
 * Builds a masked copy of @src in @dst: @dst is zeroed first, then each
 * field is copied only when its IPFW_VTYPE_* bit is set in the mask, so
 * fields that are not selected stay 0.
 */
104 mask_table_value(struct table_value *src, struct table_value *dst,
/* Copy field f from src to dst only if any bit of b is set in mask. */
107 #define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; }
109 memset(dst, 0, sizeof(*dst));
110 _MCPY(tag, IPFW_VTYPE_TAG);
111 _MCPY(pipe, IPFW_VTYPE_PIPE);
112 _MCPY(divert, IPFW_VTYPE_DIVERT);
113 _MCPY(skipto, IPFW_VTYPE_SKIPTO);
114 _MCPY(netgraph, IPFW_VTYPE_NETGRAPH);
115 _MCPY(fib, IPFW_VTYPE_FIB);
116 _MCPY(nat, IPFW_VTYPE_NAT);
117 _MCPY(dscp, IPFW_VTYPE_DSCP);
118 _MCPY(nh4, IPFW_VTYPE_NH4);
119 _MCPY(nh6, IPFW_VTYPE_NH6);
/* zoneid is copied under the same flag as nh6 (IPFW_VTYPE_NH6). */
120 _MCPY(zoneid, IPFW_VTYPE_NH6);
/*
 * Looks up the value array and the value hash instance to operate on.
 * Callers pass @ptv / @pvi as output pointers. The visible code takes both
 * objects from the chain: ch->valuestate and CHAIN_TO_VI(ch).
 * NOTE(review): how @vshared and @tc select between shared and per-table
 * storage is not visible in this excerpt; the per-table lookup survives only
 * as the commented-out line below - confirm against the full source.
 */
125 get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared,
126 struct table_value **ptv, struct namedobj_instance **pvi)
128 struct table_value *pval;
129 struct namedobj_instance *vi;
/* Shared storage: value array and hash both hang off the chain. */
132 pval = (struct table_value *)ch->valuestate;
133 vi = CHAIN_TO_VI(ch);
137 //pval = (struct table_value *)&tc->ti.data;
147 * Update pointers to real values after @pval change.
/*
 * ipfw_objhash_foreach() callback (see resize_shared_value_storage()):
 * re-points link object @no at its slot in the value array. @arg is a
 * struct vdump_args.
 * NOTE(review): the assignment of pval is not visible in this excerpt;
 * presumably it is taken from da->pval, which the caller fills in - confirm.
 */
150 update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
152 struct vdump_args *da;
153 struct table_val_link *ptv;
154 struct table_value *pval;
156 da = (struct vdump_args *)arg;
157 ptv = (struct table_val_link *)no;
/*
 * The slot is selected by the object's kidx; both the value pointer and
 * the object's name pointer refer to that same slot.
 */
160 ptv->pval = &pval[ptv->no.kidx];
161 ptv->no.name = (char *)&pval[ptv->no.kidx];
166 * Grows value storage shared among all tables.
167 * Drops/reacquires UH locks.
168 * Notifies other running adds on @ch about the shared storage resize.
169 * Note that this function does not guarantee that free space
170 * will be available after invocation, so the caller needs
171 * to retry in a loop itself.
173 * Returns 0 in case of no errors.
/*
 * Doubles the shared value array of @ch: allocates a new array and index
 * bitmap, copies the current values over, swaps the new storage in, fixes
 * up every link object and frees the old storage.
 * NOTE(review): several lines (lock drop/reacquire, allocation flags, the
 * statements guarded by the two size checks, the return paths) are not
 * visible in this excerpt.
 */
176 resize_shared_value_storage(struct ip_fw_chain *ch)
178 struct tables_config *tcfg;
179 struct namedobj_instance *vi;
180 struct table_value *pval, *valuestate, *old_valuestate;
182 struct vdump_args da;
184 int val_size, val_size_old;
186 IPFW_UH_WLOCK_ASSERT(ch);
191 pval = (struct table_value *)ch->valuestate;
192 vi = CHAIN_TO_VI(ch);
193 tcfg = CHAIN_TO_TCFG(ch);
/* The new capacity is twice the current one. */
195 val_size = tcfg->val_size * 2;
/* Upper bound on growth; the guarded statement is not visible here. */
197 if (val_size == (1 << 30))
202 valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW,
204 ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx,
210 * Check if we still need to resize
212 if (tcfg->val_size >= val_size)
215 /* Update pointers and notify everyone we're changing @ch */
216 pval = (struct table_value *)ch->valuestate;
217 rollback_toperation_state(ch, ch);
219 /* Good. Let's merge */
/* Only the currently used tcfg->val_size entries are copied. */
220 memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size);
221 ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
224 /* Change pointers */
/* After this swap, valuestate refers to the OLD array (freed below). */
225 old_valuestate = ch->valuestate;
226 ch->valuestate = valuestate;
227 valuestate = old_valuestate;
228 ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
/* Likewise, val_size ends up holding the old size. */
230 val_size_old = tcfg->val_size;
231 tcfg->val_size = val_size;
232 val_size = val_size_old;
234 /* Update pointers to reflect resize */
235 memset(&da, 0, sizeof(da));
236 da.pval = (struct table_value *)ch->valuestate;
237 ipfw_objhash_foreach(vi, update_tvalue, &da);
/*
 * Release the old value array and the bitmap held in new_idx (presumably
 * the old bitmap left there by the swap above - confirm).
 */
240 free(valuestate, M_IPFW);
241 ipfw_objhash_bitmap_free(new_idx, new_blocks);
247 * Drops reference for table value with index @kidx, stored in @pval and
248 * @vi. Frees value if it has no references.
/*
 * Drops one reference on slot @kidx of @pval. When the count reaches 0 the
 * matching link object is looked up by index in @vi, removed from the hash
 * and its index is handed back via ipfw_objhash_free_idx().
 * NOTE(review): the statement taken when references remain and the release
 * of the link object itself are not visible in this excerpt.
 */
251 unref_table_value(struct namedobj_instance *vi, struct table_value *pval,
254 struct table_val_link *ptvl;
256 KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx));
/* Pre-decrement: the teardown below applies only when this hits 0. */
257 if (--pval[kidx].refcnt > 0)
260 /* Last reference, delete item */
261 ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx);
262 KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx));
263 ipfw_objhash_del(vi, &ptvl->no);
264 ipfw_objhash_free_idx(vi, kidx);
269 struct ip_fw_chain *ch;
270 struct table_algo *ta;
271 struct table_info *ti;
273 ipfw_obj_tentry tent;
/*
 * Per-entry callback passed to ta->foreach() by ipfw_unref_table_values().
 * @e is the algorithm-specific entry and @arg a struct flush_args. The
 * entry is exported with ta->dump_tentry() to learn its value index
 * (tent->v.kidx), and that value's reference is then dropped.
 * NOTE(review): the assignments of ch, ta and tent and the handling of
 * error are not visible in this excerpt; tent presumably points at
 * fa->tent, which is cleared before the dump - confirm.
 */
277 unref_table_value_cb(void *e, void *arg)
279 struct flush_args *fa;
280 struct ip_fw_chain *ch;
281 struct table_algo *ta;
282 ipfw_obj_tentry *tent;
285 fa = (struct flush_args *)arg;
/* Start from a clean export buffer for every entry. */
288 memset(&fa->tent, 0, sizeof(fa->tent));
290 error = ta->dump_tentry(fa->astate, fa->ti, e, tent);
296 unref_table_value(CHAIN_TO_VI(ch),
297 (struct table_value *)ch->valuestate, tent->v.kidx);
303 * Drop references for each value used in @tc.
/*
 * Walks every entry of the table described by @astate / @ti using the
 * algorithm's foreach method and runs unref_table_value_cb() on each one.
 * The caller must hold the UH write lock (asserted below).
 * NOTE(review): the lines that fill in the flush_args members are not
 * visible in this excerpt.
 */
306 ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc,
307 struct table_algo *ta, void *astate, struct table_info *ti)
309 struct flush_args fa;
311 IPFW_UH_WLOCK_ASSERT(ch);
313 memset(&fa, 0, sizeof(fa));
319 ta->foreach(astate, ti, unref_table_value_cb, &fa);
323 * Table operation state handler.
324 * Called when we are going to change something in @tc which
325 * may lead to inconsistencies in on-going table data addition.
327 * Here we rollback all already committed state (table values, currently)
328 * and set "modified" field to non-zero value to indicate
329 * that we need to restart original operation.
/*
 * Walks the ts->count entries of operation state @ts and drops the value
 * reference of each entry whose value index is non-zero. Requires the UH
 * write lock (asserted below).
 * NOTE(review): the assignments of ch and ptei, and the statement executed
 * for entries with value 0, are not visible in this excerpt.
 */
332 rollback_table_values(struct tableop_state *ts)
334 struct ip_fw_chain *ch;
335 struct table_value *pval;
336 struct tentry_info *ptei;
337 struct namedobj_instance *vi;
342 IPFW_UH_WLOCK_ASSERT(ch);
344 /* Get current table value pointer */
345 get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
347 for (i = 0; i < ts->count; i++) {
/* Index 0 is treated as "no value linked" for this entry. */
350 if (ptei->value == 0)
353 unref_table_value(vi, pval, ptei->value);
358 * Allocate new value index in either shared or per-table array.
359 * Function may drop/reacquire UH lock.
361 * Returns 0 on success.
/*
 * Allocates a value index from @vi. If the visible resize path is taken,
 * the operation-state handler is invoked, the shared storage is resized and
 * the resize result is returned so that the caller restarts. Otherwise the
 * index is checked against the algorithm's vlimit (0 disables the check,
 * as does IPFW_CTF_ATOMIC in @flags); an index at or above the limit
 * yields ENOSPC.
 * NOTE(review): the condition guarding the resize path and the final store
 * to @pvidx are not visible in this excerpt.
 */
364 alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts,
365 struct namedobj_instance *vi, uint16_t *pvidx, uint8_t flags)
370 IPFW_UH_WLOCK_ASSERT(ch);
372 error = ipfw_objhash_alloc_idx(vi, &vidx);
375 * We need to resize array. This involves
376 * lock/unlock, so we need to check "modified"
379 ts->opstate.func(ts->tc, &ts->opstate);
380 error = resize_shared_value_storage(ch);
381 return (error); /* ts->modified should be set, we will restart */
384 vlimit = ts->ta->vlimit;
385 if (vlimit != 0 && vidx >= vlimit && !(flags & IPFW_CTF_ATOMIC)) {
387 * Algorithm is not able to store given index.
388 * We have to rollback state, start using
389 * per-table value array or return error
390 * if we're already using it.
/*
 * NOTE(review): both branches below return ENOSPC and no visible line
 * releases vidx (e.g. via ipfw_objhash_free_idx()) - confirm whether the
 * freshly allocated index is leaked on this path.
 */
392 if (ts->vshared != 0) {
393 /* shared -> per-table */
394 return (ENOSPC); /* TODO: proper error */
397 /* per-table. Fail for now. */
398 return (ENOSPC); /* TODO: proper error */
406 * Drops value reference for unused values (updates, deletes, partially
407 * successful adds or rollbacks).
410 ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc,
411 struct tentry_info *tei, uint32_t count, int rollback)
414 struct tentry_info *ptei;
415 struct table_value *pval;
416 struct namedobj_instance *vi;
419 * We have two slightly different ADD cases here:
420 * either (1) we are successful / partially successful,
421 * in that case we need
422 * * to ignore the values of ADDED entries
423 * * to roll back every other value if atomicity is not
424 * required (either UPDATED, since the old value has been
425 * stored there, or some failure like EXISTS or LIMIT,
426 * or simply the "ignored" case).
428 * (2): atomic rollback of partially successful operation
429 * in that case we simply need to unref all entries.
431 * DELETE case is simpler: no atomic support there, so
432 * we simply unref all non-zero values.
436 * Get current table value pointers.
437 * XXX: Properly read vshared
439 get_value_ptrs(ch, tc, 1, &pval, &vi);
441 for (i = 0; i < count; i++) {
444 if (ptei->value == 0) {
446 * We may be deleting non-existing record.
452 if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) {
457 unref_table_value(vi, pval, ptei->value);
463 * Main function used to link values of entries going to be added,
464 * to the index. Since we may perform many UH lock drops/acquires,
465 * handle changes by checking tablestate "modified" field.
470 ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts,
474 struct namedobj_instance *vi;
475 struct table_config *tc;
476 struct tentry_info *tei, *ptei;
477 uint32_t count, vlimit;
479 struct table_val_link *ptv;
480 struct table_value tval, *pval;
483 * Stage 1: reference all existing values and
484 * save their indices.
486 IPFW_UH_WLOCK_ASSERT(ch);
487 get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
491 vlimit = ts->ta->vlimit;
496 for (i = 0; i < count; i++) {
498 ptei->value = 0; /* Ensure value is always 0 in the beginning */
499 mask_table_value(ptei->pvalue, &tval, ts->vmask);
500 ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
504 /* Deal with vlimit later */
505 if (vlimit > 0 && vlimit <= ptv->no.kidx)
508 /* Value found. Bump refcount */
510 ptei->value = ptv->no.kidx;
514 if (ts->count == found) {
515 /* We've found all values, no need to create new ones */
520 * We have added some state here, let's attach operation
521 * state to the list to be able to roll back if necessary.
523 add_toperation_state(ch, ts);
524 /* Ensure table won't disappear */
529 * Stage 2: allocate objects for non-existing values.
531 for (i = 0; i < count; i++) {
533 if (ptei->value != 0)
535 if (ptei->ptv != NULL)
537 ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW,
542 * Stage 3: allocate index numbers for new values
543 * and link them to index.
547 del_toperation_state(ch, ts);
548 if (ts->modified != 0) {
550 * In general, we should free all state/indexes here
551 * and return. However, we keep allocated state instead
552 * to ensure we achieve some progress on each restart.
557 KASSERT(pval == ch->valuestate, ("resize_storage() notify failure"));
559 /* Let's try to link values */
560 for (i = 0; i < count; i++) {
563 /* Check if record has appeared */
564 mask_table_value(ptei->pvalue, &tval, ts->vmask);
565 ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
569 ptei->value = ptv->no.kidx;
573 /* May perform UH unlock/lock */
574 error = alloc_table_vidx(ch, ts, vi, &vidx, flags);
576 ts->opstate.func(ts->tc, &ts->opstate);
579 /* value storage resize has happened, return */
580 if (ts->modified != 0)
583 /* Finally, we have allocated valid index, let's add entry */
585 ptv = (struct table_val_link *)ptei->ptv;
589 ptv->no.name = (char *)&pval[vidx];
590 ptv->pval = &pval[vidx];
591 memcpy(ptv->pval, &tval, sizeof(struct table_value));
592 pval[vidx].refcnt = 1;
593 ipfw_objhash_add(vi, &ptv->no);
600 * Compatibility function used to import data from old
601 * IP_FW_TABLE_ADD / IP_FW_TABLE_XADD opcodes.
604 ipfw_import_table_value_legacy(uint32_t value, struct table_value *v)
607 memset(v, 0, sizeof(*v));
615 v->nh4 = value; /* host format */
621 * Export data to legacy table dumps opcodes.
624 ipfw_export_table_value_legacy(struct table_value *v)
628 * TODO: provide more compatibility depending on
635 * Imports table value from current userland format.
636 * Saves value in kernel format to the same place.
/*
 * Converts the userland value at @iv to the kernel layout in place: the
 * kernel form is assembled in a zeroed local struct table_value and then
 * copied back over @iv.
 * NOTE(review): some field assignments are not visible in this excerpt.
 */
639 ipfw_import_table_value_v1(ipfw_table_value *iv)
641 struct table_value v;
/* Zero first so that fields not assigned below are 0. */
643 memset(&v, 0, sizeof(v));
646 v.divert = iv->divert;
647 v.skipto = iv->skipto;
648 v.netgraph = iv->netgraph;
655 v.zoneid = iv->zoneid;
/* Copy length is sizeof(ipfw_table_value), i.e. the destination type. */
657 memcpy(iv, &v, sizeof(ipfw_table_value));
661 * Export real table value @v to current userland format.
662 * Note that @v and @piv may point to the same memory.
/*
 * Converts kernel value @v to the userland layout and stores it at @piv.
 * The result is assembled in a zeroed local (iv) and copied out only at
 * the end, so every read of @v happens before @piv is written; this is
 * what allows @v and @piv to refer to the same memory.
 * NOTE(review): some field assignments are not visible in this excerpt.
 */
665 ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv)
669 memset(&iv, 0, sizeof(iv));
672 iv.divert = v->divert;
673 iv.skipto = v->skipto;
674 iv.netgraph = v->netgraph;
681 iv.zoneid = v->zoneid;
683 memcpy(piv, &iv, sizeof(iv));
687 * Exports real value data into ipfw_table_value structure.
688 * Utilizes "spare1" field to store kernel index.
/*
 * ipfw_objhash_foreach() callback used by list_table_values(): reserves
 * room for one value in the sockopt buffer of @arg (a struct vdump_args),
 * copies the value referenced by link object @no into it and records the
 * object's kidx in the spare1 field.
 * NOTE(review): the handling of a failed ipfw_get_sopt_space() call and
 * the return statements are not visible in this excerpt.
 */
691 dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
693 struct vdump_args *da;
694 struct table_val_link *ptv;
695 struct table_value *v;
697 da = (struct vdump_args *)arg;
698 ptv = (struct table_val_link *)no;
700 v = (struct table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v));
701 /* Out of memory, returning */
707 memcpy(v, ptv->pval, sizeof(*v));
/* Overwrites spare1 in the exported copy only, not in the stored value. */
708 v->spare1 = ptv->no.kidx;
713 * Dumps all shared/table value data
714 * Data layout (v1)(current):
715 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
716 * Reply: [ ipfw_obj_lheader ipfw_table_value x N ]
718 * Returns 0 on success
/*
 * Sockopt handler registered for IP_FW_TABLE_VLIST (see scodes[]): dumps
 * every value object of the chain's value hash into the reply buffer of
 * @sd, after an ipfw_obj_lheader. @op3 is not used by the visible code.
 * NOTE(review): the error returns for the size checks and the remaining
 * header/argument assignments are not visible in this excerpt.
 */
721 list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
722 struct sockopt_data *sd)
724 struct _ipfw_obj_lheader *olh;
725 struct namedobj_instance *vi;
726 struct vdump_args da;
727 uint32_t count, size;
729 olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
732 if (sd->valsize < olh->size)
736 vi = CHAIN_TO_VI(ch);
738 count = ipfw_objhash_count(vi);
/*
 * NOTE(review): this estimate uses sizeof(ipfw_table_value) per item while
 * dump_tvalue() reserves sizeof(struct table_value); presumably both types
 * have the same size - confirm.
 */
739 size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader);
741 /* Fill in header regardless of buffer size */
743 olh->objsize = sizeof(ipfw_table_value);
745 if (size > olh->size) {
753 * Do the actual value dump
755 memset(&da, 0, sizeof(da));
758 ipfw_objhash_foreach(vi, dump_tvalue, &da);
/*
 * Sets up value storage for chain @ch: allocates a zero-filled array of
 * VALDATA_START_SIZE values, creates the value hash with the same capacity,
 * installs its callbacks and registers the sockopt handlers in scodes[].
 * NOTE(review): the assignment of tcfg and the remaining arguments of
 * ipfw_objhash_set_funcs() are not visible in this excerpt.
 */
766 ipfw_table_value_init(struct ip_fw_chain *ch, int first)
768 struct tables_config *tcfg;
770 ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value),
771 M_IPFW, M_WAITOK | M_ZERO);
/* Hash capacity is kept equal to the number of array slots. */
775 tcfg->val_size = VALDATA_START_SIZE;
776 tcfg->valhash = ipfw_objhash_create(tcfg->val_size);
777 ipfw_objhash_set_funcs(tcfg->valhash, hash_table_value,
780 IPFW_ADD_SOPT_HANDLER(first, scodes);
784 destroy_value(struct namedobj_instance *ni, struct named_object *no,
/*
 * Tears down value storage for chain @ch: unregisters the sockopt handlers,
 * frees the value array, runs destroy_value() on every object left in the
 * value hash and finally destroys the hash itself.
 */
793 ipfw_table_value_destroy(struct ip_fw_chain *ch, int last)
796 IPFW_DEL_SOPT_HANDLER(last, scodes);
/*
 * NOTE(review): the array is freed before the link objects are visited;
 * their pval/name pointers refer into it, so destroy_value() must not
 * dereference them - confirm against its body.
 */
798 free(ch->valuestate, M_IPFW);
799 ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch);
800 ipfw_objhash_destroy(CHAIN_TO_VI(ch));