2 * Copyright (c) 2014 Yandex LLC
3 * Copyright (c) 2014 Alexander V. Chernikov
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
31 * Multi-field value support for ipfw tables.
33 * This file contains necessary functions to convert
34 * large multi-field values into u32 indices suitable to be fed
35 * to various table algorithms. Other machinery, such as proper refcounting
36 * and resizing of internal structures, is also kept here.
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/malloc.h>
44 #include <sys/kernel.h>
47 #include <sys/rwlock.h>
48 #include <sys/rmlock.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/queue.h>
52 #include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
54 #include <netinet/in.h>
55 #include <netinet/ip_var.h> /* struct ipfw_rule_ref */
56 #include <netinet/ip_fw.h>
58 #include <netpfil/ipfw/ip_fw_private.h>
59 #include <netpfil/ipfw/ip_fw_table.h>
/* Forward declarations: value hash/compare helpers and the sockopt handler. */
61 static uint32_t hash_table_value(struct namedobj_instance *ni, void *key,
63 static int cmp_table_value(struct named_object *no, void *key, uint32_t kopt);
65 static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
66 struct sockopt_data *sd);
/*
 * Sockopt handlers provided by this file. The table is handed to
 * IPFW_ADD_SOPT_HANDLER() / IPFW_DEL_SOPT_HANDLER() by the init and
 * destroy routines of this file.
 */
68 static struct ipfw_sopt_handler scodes[] = {
69 { IP_FW_TABLE_VLIST, 0, HDIR_GET, list_table_values },
/* Value index instance of @chain: the valhash member of its tables config. */
72 #define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash)
/*
 * Callbacks in this file cast the struct named_object pointer they are
 * given back to struct table_val_link, so @no stands for the whole link.
 */
76 struct named_object no;
77 struct table_value *pval; /* Pointer to real table value */
79 #define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */
/* Members of the argument handed to ipfw_objhash_foreach() callbacks. */
82 struct ip_fw_chain *ch;
83 struct sockopt_data *sd; /* Reply buffer, read by dump_tvalue() */
84 struct table_value *pval; /* Value array, set before update_tvalue() runs */
/*
 * Hash function installed on the value index (see ipfw_table_value_init()).
 * Hashes exactly 56 bytes starting at @key with hash32_buf(); @ni and @kopt
 * are not used by the visible code.
 * NOTE(review): 56 is a bare literal shared with cmp_table_value();
 * presumably the leading part of struct table_value that identifies a
 * value - confirm against the structure layout.
 */
90 hash_table_value(struct namedobj_instance *ni, void *key, uint32_t kopt)
93 return (hash32_buf(key, 56, 0));
/*
 * Compares the value linked from @no (treated as struct table_val_link)
 * with @key over the same 56 bytes that hash_table_value() hashes.
 * Returns the memcmp() result, i.e. 0 when those bytes are identical.
 * @kopt is not used by the visible code.
 */
97 cmp_table_value(struct named_object *no, void *key, uint32_t kopt)
100 return (memcmp(((struct table_val_link *)no)->pval, key, 56));
/*
 * Builds in @dst a copy of @src restricted to the fields selected by the
 * IPFW_VTYPE_* bits in the mask: @dst is zeroed first, so every field whose
 * bit is clear reads as 0.
 */
104 mask_table_value(struct table_value *src, struct table_value *dst,
/* Copy field f only when bit b is present in the mask. */
107 #define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; }
109 memset(dst, 0, sizeof(*dst));
110 _MCPY(tag, IPFW_VTYPE_TAG);
111 _MCPY(pipe, IPFW_VTYPE_PIPE);
112 _MCPY(divert, IPFW_VTYPE_DIVERT);
113 _MCPY(skipto, IPFW_VTYPE_SKIPTO);
114 _MCPY(netgraph, IPFW_VTYPE_NETGRAPH);
115 _MCPY(fib, IPFW_VTYPE_FIB);
116 _MCPY(nat, IPFW_VTYPE_NAT);
117 _MCPY(dscp, IPFW_VTYPE_DSCP);
118 _MCPY(nh4, IPFW_VTYPE_NH4);
119 _MCPY(nh6, IPFW_VTYPE_NH6);
/* zoneid has no bit of its own here: it is selected together with nh6. */
120 _MCPY(zoneid, IPFW_VTYPE_NH6);
/*
 * Looks up the value array and the value index instance to operate on.
 * The visible assignments take both from chain-wide state
 * (ch->valuestate and CHAIN_TO_VI(ch)).
 * NOTE(review): @ptv and @pvi are presumably the output pointers for these
 * two results, and @vshared / @tc presumably select shared vs per-table
 * storage; the only per-table assignment is commented out - confirm.
 */
125 get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared,
126 struct table_value **ptv, struct namedobj_instance **pvi)
128 struct table_value *pval;
129 struct namedobj_instance *vi;
132 pval = (struct table_value *)ch->valuestate;
133 vi = CHAIN_TO_VI(ch);
137 //pval = (struct table_value *)&tc->ti.data;
147 * Update pointers to real values after @pval change.
/*
 * ipfw_objhash_foreach() callback used by resize_shared_value_storage().
 * @no is a struct table_val_link and @arg a struct vdump_args; the link is
 * re-pointed at slot no.kidx of the value array.
 */
150 update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
152 struct vdump_args *da;
153 struct table_val_link *ptv;
154 struct table_value *pval;
156 da = (struct vdump_args *)arg;
157 ptv = (struct table_val_link *)no;
/* The slot index is the object's kernel index, so only the base changes. */
160 ptv->pval = &pval[ptv->no.kidx];
165 * Grows value storage shared among all tables.
166 * Drops/reacquires UH locks.
167 * Notifies other running adds on @ch shared storage resize.
168 * Note that the function does not guarantee that free space
169 * will be available after invocation, so the caller needs
170 * to retry in a loop itself.
172 * Returns 0 in case of no errors.
/*
 * Visible sequence: compute a doubled capacity, allocate a new value array
 * and index bitmap of that size, re-check whether a resize is still needed,
 * copy the old values over, swap array/bitmap/size into place, re-point
 * every value link at the new array, then free the leftover storage.
 * Must be entered with the UH write lock held (asserted).
 */
175 resize_shared_value_storage(struct ip_fw_chain *ch)
177 struct tables_config *tcfg;
178 struct namedobj_instance *vi;
179 struct table_value *pval, *valuestate, *old_valuestate;
181 struct vdump_args da;
183 int val_size, val_size_old;
185 IPFW_UH_WLOCK_ASSERT(ch);
190 pval = (struct table_value *)ch->valuestate;
191 vi = CHAIN_TO_VI(ch);
192 tcfg = CHAIN_TO_TCFG(ch);
/* Target capacity is twice the current one. */
194 val_size = tcfg->val_size * 2;
/* NOTE(review): 1 << 30 items is singled out, presumably as the growth cap - confirm. */
196 if (val_size == (1 << 30))
/* New array and bitmap are both sized for the doubled capacity. */
201 valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW,
203 ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx,
209 * Check if we still need to resize
211 if (tcfg->val_size >= val_size)
214 /* Update pointers and notify everyone we're changing @ch */
215 pval = (struct table_value *)ch->valuestate;
216 rollback_toperation_state(ch, ch);
218 /* Good. Let's merge */
/* Only the currently used tcfg->val_size entries are carried over. */
219 memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size);
220 ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
223 /* Change pointers */
/* Three-step swap: afterwards @valuestate names the OLD array. */
224 old_valuestate = ch->valuestate;
225 ch->valuestate = valuestate;
226 valuestate = old_valuestate;
227 ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
/* Same swap for the size: afterwards @val_size holds the OLD capacity. */
229 val_size_old = tcfg->val_size;
230 tcfg->val_size = val_size;
231 val_size = val_size_old;
233 /* Update pointers to reflect resize */
234 memset(&da, 0, sizeof(da));
235 da.pval = (struct table_value *)ch->valuestate;
236 ipfw_objhash_foreach(vi, update_tvalue, &da);
/* Releases whatever @valuestate / @new_idx name now: the old storage after a swap. */
239 free(valuestate, M_IPFW);
240 ipfw_objhash_bitmap_free(new_idx, new_blocks);
246 * Drops reference for table value with index @kidx, stored in @pval and
247 * @vi. Frees value if it has no references.
/*
 * @vi is the value index, @pval the value array and kidx the slot to
 * release. The refcount must be positive on entry (asserted).
 */
250 unref_table_value(struct namedobj_instance *vi, struct table_value *pval,
253 struct table_val_link *ptvl;
255 KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx));
/* Drop one reference; a non-zero remainder means the value is still in use. */
256 if (--pval[kidx].refcnt > 0)
259 /* Last reference, delete item */
260 ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx);
261 KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx));
/* Unlink the object from the index, then hand its index number back. */
262 ipfw_objhash_del(vi, &ptvl->no);
263 ipfw_objhash_free_idx(vi, kidx);
/* Members of the context that ipfw_unref_table_values() passes to unref_table_value_cb(). */
268 struct ip_fw_chain *ch;
269 struct table_algo *ta;
270 struct table_info *ti;
272 ipfw_obj_tentry tent; /* Scratch entry, zeroed by the callback on each call */
/*
 * Per-entry callback given to ta->foreach() by ipfw_unref_table_values().
 * @e is the algorithm's entry and @arg a struct flush_args. The entry is
 * dumped through ta->dump_tentry() to obtain its value index, and one
 * reference on that value is dropped in the chain's shared value storage.
 */
276 unref_table_value_cb(void *e, void *arg)
278 struct flush_args *fa;
279 struct ip_fw_chain *ch;
280 struct table_algo *ta;
281 ipfw_obj_tentry *tent;
284 fa = (struct flush_args *)arg;
/* Start from a clean scratch entry on every call. */
287 memset(&fa->tent, 0, sizeof(fa->tent));
289 error = ta->dump_tentry(fa->astate, fa->ti, e, tent);
/* tent->v.kidx is the value index recorded for this entry. */
295 unref_table_value(CHAIN_TO_VI(ch),
296 (struct table_value *)ch->valuestate, tent->v.kidx);
302 * Drop references for each value used in @tc.
/*
 * @ta, @astate and @ti describe the table algorithm instance whose entries
 * are walked. Must be called with the UH write lock held (asserted).
 */
305 ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc,
306 struct table_algo *ta, void *astate, struct table_info *ti)
308 struct flush_args fa;
310 IPFW_UH_WLOCK_ASSERT(ch);
312 memset(&fa, 0, sizeof(fa));
/* Visit every entry; unref_table_value_cb() does the per-entry unref. */
318 ta->foreach(astate, ti, unref_table_value_cb, &fa);
322 * Table operation state handler.
323 * Called when we are going to change something in @tc which
324 * may lead to inconsistencies in on-going table data addition.
326 * Here we rollback all already committed state (table values, currently)
327 * and set "modified" field to non-zero value to indicate
328 * that we need to restart original operation.
/*
 * Walks the ts->count entries of the operation and releases the value
 * reference held by each entry that has one. Requires the UH write lock
 * (asserted).
 */
331 rollback_table_values(struct tableop_state *ts)
333 struct ip_fw_chain *ch;
334 struct table_value *pval;
335 struct tentry_info *ptei;
336 struct namedobj_instance *vi;
341 IPFW_UH_WLOCK_ASSERT(ch);
343 /* Get current table value pointer */
344 get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
346 for (i = 0; i < ts->count; i++) {
/* 0 is the "no value linked yet" marker set by ipfw_link_table_values(). */
349 if (ptei->value == 0)
352 unref_table_value(vi, pval, ptei->value);
357 * Allocate new value index in either shared or per-table array.
358 * Function may drop/reacquire UH lock.
360 * Returns 0 on success.
/*
 * @vi is the index to allocate from and @pvidx the place for the result.
 * Requires the UH write lock (asserted).
 * NOTE(review): both vlimit branches below return ENOSPC, so for now an
 * index the algorithm cannot store always fails the allocation.
 */
363 alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts,
364 struct namedobj_instance *vi, uint16_t *pvidx)
369 IPFW_UH_WLOCK_ASSERT(ch);
/* Ask the index for a free slot number. */
371 error = ipfw_objhash_alloc_idx(vi, &vidx);
375 * We need to resize array. This involves
376 * lock/unlock, so we need to check "modified"
/* Run this operation's own state handler before growing the shared storage. */
379 ts->opstate.func(ts->tc, &ts->opstate);
380 error = resize_shared_value_storage(ch);
381 return (error); /* ts->modified should be set, we will restart */
/* A non-zero vlimit is an exclusive upper bound on the index for this algorithm. */
384 vlimit = ts->ta->vlimit;
385 if (vlimit != 0 && vidx >= vlimit) {
388 * Algorithm is not able to store given index.
389 * We have to rollback state, start using
390 * per-table value array or return error
391 * if we're already using it.
393 * TODO: do not rollback state if
394 * atomicity is not required.
396 if (ts->vshared != 0) {
397 /* shared -> per-table */
398 return (ENOSPC); /* TODO: proper error */
401 /* per-table. Fail for now. */
402 return (ENOSPC); /* TODO: proper error */
410 * Drops value reference for unused values (updates, deletes, partially
411 * successful adds or rollbacks).
/*
 * @tei is an array of @count entries. A non-zero @rollback disables the
 * special treatment of entries flagged TEI_FLAGS_ADDED.
 * Value pointers are always taken from the shared storage here (vshared is
 * passed as 1, see the XXX note below).
 */
414 ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc,
415 struct tentry_info *tei, uint32_t count, int rollback)
418 struct tentry_info *ptei;
419 struct table_value *pval;
420 struct namedobj_instance *vi;
423 * We have two slightly different ADD cases here:
424 * either (1) we are successful / partially successful,
425 * in that case we need
426 * * to ignore ADDED entries values
427 * * rollback every other values (either UPDATED since
428 * old value has been stored there, or some failure like
429 * EXISTS or LIMIT or simply "ignored" case.
431 * (2): atomic rollback of partially successful operation
432 * in that case we simply need to unref all entries.
434 * DELETE case is simpler: no atomic support there, so
435 * we simply unref all non-zero values.
439 * Get current table value pointers.
440 * XXX: Properly read vshared
442 get_value_ptrs(ch, tc, 1, &pval, &vi);
444 for (i = 0; i < count; i++) {
447 if (ptei->value == 0) {
450 * We may be deleting non-existing record.
/* Case (1) above: a successfully added entry outside of a rollback. */
456 if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) {
461 unref_table_value(vi, pval, ptei->value);
467 * Main function used to link values of entries going to be added,
468 * to the index. Since we may perform many UH locks drops/acquires,
469 * handle changes by checking tablestate "modified" field.
/*
 * Works in the three stages marked below: reuse values already present in
 * the index, pre-allocate link objects for the missing ones, then allocate
 * index numbers and insert the new values. Requires the UH write lock
 * (asserted).
 */
474 ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts)
477 struct namedobj_instance *vi;
478 struct table_config *tc;
479 struct tentry_info *tei, *ptei;
480 uint32_t count, vlimit;
482 struct table_val_link *ptv;
483 struct table_value tval, *pval;
486 * Stage 1: reference all existing values and
487 * save their indices.
489 IPFW_UH_WLOCK_ASSERT(ch);
490 get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
494 vlimit = ts->ta->vlimit;
499 for (i = 0; i < count; i++) {
501 ptei->value = 0; /* Ensure value is always 0 in the beginning */
/* Lookups use the value reduced to the fields selected by ts->vmask. */
502 mask_table_value(ptei->pvalue, &tval, ts->vmask);
503 ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
507 /* Deal with vlimit later */
508 if (vlimit > 0 && vlimit <= ptv->no.kidx)
511 /* Value found. Bump refcount */
513 ptei->value = ptv->no.kidx;
517 if (ts->count == found) {
518 /* We've found all values, no need to create new ones */
523 * we have added some state here, let's attach operation
524 * state ts the list ts be able ts rollback if necessary.
526 add_toperation_state(ch, ts);
527 /* Ensure table won't disappear */
532 * Stage 2: allocate objects for non-existing values.
534 for (i = 0; i < count; i++) {
/* Entries that already have a value index are tested first. */
536 if (ptei->value != 0)
/* So are entries that already carry a link object. */
538 if (ptei->ptv != NULL)
540 ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW,
545 * Stage 3: allocate index numbers for new values
546 * and link them to index.
550 del_toperation_state(ch, ts);
551 if (ts->modified != 0) {
554 * In general, we should free all state/indexes here
555 * and return. However, we keep allocated state instead
556 * to ensure we achieve some progress on each restart.
/* With ts->modified clear, the cached array pointer must still be current. */
561 KASSERT(pval == ch->valuestate, ("resize_storage() notify failure"));
563 /* Let's try to link values */
564 for (i = 0; i < count; i++) {
567 /* Check if record has appeared */
568 mask_table_value(ptei->pvalue, &tval, ts->vmask);
569 ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
573 ptei->value = ptv->no.kidx;
577 /* May perform UH unlock/lock */
578 error = alloc_table_vidx(ch, ts, vi, &vidx);
580 ts->opstate.func(ts->tc, &ts->opstate);
583 /* value storage resize has happened, return */
584 if (ts->modified != 0)
587 /* Finally, we have allocated valid index, let's add entry */
589 ptv = (struct table_val_link *)ptei->ptv;
/*
 * The object's "name" is the value slot itself, so name lookups in the
 * index operate on the raw bytes of the stored value.
 */
593 ptv->no.name = (char *)&pval[vidx];
594 ptv->pval = &pval[vidx];
595 memcpy(ptv->pval, &tval, sizeof(struct table_value));
/* Set the refcount after the copy so the memcpy() cannot overwrite it. */
596 pval[vidx].refcnt = 1;
597 ipfw_objhash_add(vi, &ptv->no);
604 * Compatibility function used to import data from old
605 * IP_FW_TABLE_ADD / IP_FW_TABLE_XADD opcodes.
/*
 * Fills the kernel value @v from the single u32 @value used by the legacy
 * opcodes. @v is zeroed first; @value is stored into nh4 without byte
 * swapping.
 */
608 ipfw_import_table_value_legacy(uint32_t value, struct table_value *v)
611 memset(v, 0, sizeof(*v));
619 v->nh4 = value; /* host format */
625 * Export data to legacy table dumps opcodes.
/*
 * Takes the kernel value @v.
 * NOTE(review): by its name this is the counterpart of
 * ipfw_import_table_value_legacy(); the TODO below indicates the legacy
 * mapping is only partial - confirm which field is exported.
 */
628 ipfw_export_table_value_legacy(struct table_value *v)
632 * TODO: provide more compatibility depending on
639 * Imports table value from current userland format.
640 * Saves value in kernel format to the same place.
/*
 * In-place conversion: fields are copied from @iv into a zeroed local
 * struct table_value, which is then copied back over *@iv.
 * NOTE(review): the copy back is sizeof(ipfw_table_value) bytes read from a
 * struct table_value, so it relies on the kernel structure being at least
 * as large as the userland one - confirm.
 */
643 ipfw_import_table_value_v1(ipfw_table_value *iv)
645 struct table_value v;
647 memset(&v, 0, sizeof(v));
650 v.divert = iv->divert;
651 v.skipto = iv->skipto;
652 v.netgraph = iv->netgraph;
659 v.zoneid = iv->zoneid;
/* Overwrite the caller's buffer with the converted value. */
661 memcpy(iv, &v, sizeof(ipfw_table_value));
665 * Export real table value @v to current userland format.
666 * Note that @v and @piv may point to the same memory.
/*
 * The result is assembled in a zeroed local @iv and copied to @piv only at
 * the end, so every field of @v is read before @piv is written.
 */
669 ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv)
673 memset(&iv, 0, sizeof(iv));
676 iv.divert = v->divert;
677 iv.skipto = v->skipto;
678 iv.netgraph = v->netgraph;
685 iv.zoneid = v->zoneid;
687 memcpy(piv, &iv, sizeof(iv));
691 * Exports real value data into ipfw_table_value structure.
692 * Utilizes "spare1" field to store kernel index.
/*
 * ipfw_objhash_foreach() callback used by list_table_values(). @no is a
 * struct table_val_link and @arg a struct vdump_args whose sd member is the
 * reply buffer.
 * NOTE(review): the record is written as struct table_value, while the
 * reply is sized in units of ipfw_table_value by list_table_values();
 * confirm both have the same size.
 */
695 dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
697 struct vdump_args *da;
698 struct table_val_link *ptv;
699 struct table_value *v;
701 da = (struct vdump_args *)arg;
702 ptv = (struct table_val_link *)no;
/* Reserve room for one record in the reply buffer. */
704 v = (struct table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v));
705 /* Out of memory, returning */
/* Copy the stored value, then put its kernel index into spare1. */
711 memcpy(v, ptv->pval, sizeof(*v));
712 v->spare1 = ptv->no.kidx;
716 * Dumps all shared/table value data
717 * Data layout (v1)(current):
718 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
719 * Reply: [ ipfw_obj_lheader ipfw_table_value x N ]
721 * Returns 0 on success
/*
 * Handler for IP_FW_TABLE_VLIST (see scodes[]). Reads the list header from
 * @sd, computes the space needed for all values in the chain's value
 * index, and dumps them through dump_tvalue().
 */
724 list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
725 struct sockopt_data *sd)
727 struct _ipfw_obj_lheader *olh;
728 struct namedobj_instance *vi;
729 struct vdump_args da;
730 uint32_t count, size;
732 olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
/* The size claimed in the header must not exceed what was passed in. */
735 if (sd->valsize < olh->size)
739 vi = CHAIN_TO_VI(ch);
/* Needed space: one header plus one ipfw_table_value per indexed value. */
741 count = ipfw_objhash_count(vi);
742 size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader);
744 /* Fill in header regardless of buffer size */
746 olh->objsize = sizeof(ipfw_table_value);
/* The full reply does not fit into the size the caller declared. */
748 if (size > olh->size) {
756 * Do the actual value dump
758 memset(&da, 0, sizeof(da));
761 ipfw_objhash_foreach(vi, dump_tvalue, &da);
/*
 * Sets up value storage for chain @ch: a zeroed array of
 * VALDATA_START_SIZE values, a value index of the same size with this
 * file's hash function installed, and the sockopt handlers in scodes[].
 * @first is passed through to IPFW_ADD_SOPT_HANDLER().
 */
769 ipfw_table_value_init(struct ip_fw_chain *ch, int first)
771 struct tables_config *tcfg;
/* M_WAITOK | M_ZERO: the initial array starts out zero-filled. */
773 ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value),
774 M_IPFW, M_WAITOK | M_ZERO);
/* Array capacity and index size are kept equal from the start. */
778 tcfg->val_size = VALDATA_START_SIZE;
779 tcfg->valhash = ipfw_objhash_create(tcfg->val_size);
780 ipfw_objhash_set_funcs(tcfg->valhash, hash_table_value,
783 IPFW_ADD_SOPT_HANDLER(first, scodes);
/*
 * Per-object callback that ipfw_table_value_destroy() runs over the value
 * index through ipfw_objhash_foreach(), with the chain as its argument.
 */
787 destroy_value(struct namedobj_instance *ni, struct named_object *no,
/*
 * Tears down what ipfw_table_value_init() created for @ch: unregisters the
 * sockopt handlers, frees the value array, runs destroy_value() on every
 * indexed object and destroys the index. @last is passed through to
 * IPFW_DEL_SOPT_HANDLER().
 */
795 ipfw_table_value_destroy(struct ip_fw_chain *ch, int last)
798 IPFW_DEL_SOPT_HANDLER(last, scodes);
/*
 * NOTE(review): the array is freed before the links are walked, and each
 * link's no.name / pval point into it (see ipfw_link_table_values()), so
 * destroy_value() must not dereference them - confirm.
 */
800 free(ch->valuestate, M_IPFW);
801 ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch);
802 ipfw_objhash_destroy(CHAIN_TO_VI(ch));