2 * Copyright (c) 2014 Yandex LLC
3 * Copyright (c) 2014 Alexander V. Chernikov
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: projects/ipfw/sys/netpfil/ipfw/ip_fw_table.c 270407 2014-08-23 12:41:39Z melifaro $");
31 * Multi-field value support for ipfw tables.
33 * This file contains the functions necessary to convert
34 * large multi-field values into u32 indices suitable to be fed
35 * to various table algorithms. Other machinery, such as proper refcounting
36 * and resizing of internal structures, is also kept here.
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/malloc.h>
44 #include <sys/kernel.h>
47 #include <sys/rwlock.h>
48 #include <sys/rmlock.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/queue.h>
52 #include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
54 #include <netinet/in.h>
55 #include <netinet/ip_var.h> /* struct ipfw_rule_ref */
56 #include <netinet/ip_fw.h>
58 #include <netpfil/ipfw/ip_fw_private.h>
59 #include <netpfil/ipfw/ip_fw_table.h>
61 static uint32_t hash_table_value(struct namedobj_instance *ni, void *key,
63 static int cmp_table_value(struct named_object *no, void *key, uint32_t kopt);
65 static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
66 struct sockopt_data *sd);
68 static struct ipfw_sopt_handler scodes[] = {
69 { IP_FW_TABLE_VLIST, 0, HDIR_GET, list_table_values },
72 #define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash)
76 struct named_object no;
77 struct table_value *pval; /* Pointer to real table value */
79 #define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */
82 struct ip_fw_chain *ch;
83 struct sockopt_data *sd;
84 struct table_value *pval;
90 hash_table_value(struct namedobj_instance *ni, void *key, uint32_t kopt)
93 return (hash32_buf(key, 56, 0));
97 cmp_table_value(struct named_object *no, void *key, uint32_t kopt)
100 return (memcmp(((struct table_val_link *)no)->pval, key, 56));
104 mask_table_value(struct table_value *src, struct table_value *dst,
107 #define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; }
109 memset(dst, 0, sizeof(*dst));
110 _MCPY(tag, IPFW_VTYPE_TAG);
111 _MCPY(pipe, IPFW_VTYPE_PIPE);
112 _MCPY(divert, IPFW_VTYPE_DIVERT);
113 _MCPY(skipto, IPFW_VTYPE_SKIPTO);
114 _MCPY(netgraph, IPFW_VTYPE_NETGRAPH);
115 _MCPY(fib, IPFW_VTYPE_FIB);
116 _MCPY(nat, IPFW_VTYPE_NAT);
117 _MCPY(dscp, IPFW_VTYPE_DSCP);
118 _MCPY(nh4, IPFW_VTYPE_NH4);
119 _MCPY(nh6, IPFW_VTYPE_NH6);
124 get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared,
125 struct table_value **ptv, struct namedobj_instance **pvi)
127 struct table_value *pval;
128 struct namedobj_instance *vi;
131 pval = (struct table_value *)ch->valuestate;
132 vi = CHAIN_TO_VI(ch);
136 //pval = (struct table_value *)&tc->ti.data;
146 * Update pointers to real values after @pval change.
149 update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
151 struct vdump_args *da;
152 struct table_val_link *ptv;
153 struct table_value *pval;
155 da = (struct vdump_args *)arg;
156 ptv = (struct table_val_link *)no;
159 ptv->pval = &pval[ptv->no.kidx];
164 * Grows value storage shared among all tables.
165 * Drops/reacquires UH locks.
166 * Notifies other running adds on @ch shared storage resize.
167 * Note that the function does not guarantee that free space
168 * will be available after invocation, so the caller needs
169 * to retry in a loop itself.
171 * Returns 0 in case of no errors.
174 resize_shared_value_storage(struct ip_fw_chain *ch)
176 struct tables_config *tcfg;
177 struct namedobj_instance *vi;
178 struct table_value *pval, *valuestate, *old_valuestate;
180 struct vdump_args da;
182 int val_size, val_size_old;
184 IPFW_UH_WLOCK_ASSERT(ch);
189 pval = (struct table_value *)ch->valuestate;
190 vi = CHAIN_TO_VI(ch);
191 tcfg = CHAIN_TO_TCFG(ch);
193 val_size = tcfg->val_size * 2;
195 if (val_size == (1 << 30))
200 valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW,
202 ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx,
208 * Check if we still need to resize
210 if (tcfg->val_size >= val_size)
213 /* Update pointers and notify everyone we're changing @ch */
214 pval = (struct table_value *)ch->valuestate;
215 rollback_toperation_state(ch, ch);
217 /* Good. Let's merge */
218 memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size);
219 ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
222 /* Change pointers */
223 old_valuestate = ch->valuestate;
224 ch->valuestate = valuestate;
225 valuestate = old_valuestate;
226 ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
228 val_size_old = tcfg->val_size;
229 tcfg->val_size = val_size;
230 val_size = val_size_old;
232 /* Update pointers to reflect resize */
233 memset(&da, 0, sizeof(da));
234 da.pval = (struct table_value *)ch->valuestate;
235 ipfw_objhash_foreach(vi, update_tvalue, &da);
238 free(valuestate, M_IPFW);
239 ipfw_objhash_bitmap_free(new_idx, new_blocks);
245 * Drops reference for table value with index @kidx, stored in @pval and
246 * @vi. Frees value if it has no references.
249 unref_table_value(struct namedobj_instance *vi, struct table_value *pval,
252 struct table_val_link *ptvl;
254 KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx));
255 if (--pval[kidx].refcnt > 0)
258 /* Last reference, delete item */
259 ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx);
260 KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx));
261 ipfw_objhash_del(vi, &ptvl->no);
262 ipfw_objhash_free_idx(vi, kidx);
267 struct ip_fw_chain *ch;
268 struct table_algo *ta;
269 struct table_info *ti;
271 ipfw_obj_tentry tent;
275 unref_table_value_cb(void *e, void *arg)
277 struct flush_args *fa;
278 struct ip_fw_chain *ch;
279 struct table_algo *ta;
280 ipfw_obj_tentry *tent;
283 fa = (struct flush_args *)arg;
286 memset(&fa->tent, 0, sizeof(fa->tent));
288 error = ta->dump_tentry(fa->astate, fa->ti, e, tent);
294 unref_table_value(CHAIN_TO_VI(ch),
295 (struct table_value *)ch->valuestate, tent->v.kidx);
301 * Drop references for each value used in @tc.
304 ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc,
305 struct table_algo *ta, void *astate, struct table_info *ti)
307 struct flush_args fa;
309 IPFW_UH_WLOCK_ASSERT(ch);
311 memset(&fa, 0, sizeof(fa));
317 ta->foreach(astate, ti, unref_table_value_cb, &fa);
321 * Table operation state handler.
322 * Called when we are going to change something in @tc which
323 * may lead to inconsistencies in on-going table data addition.
325 * Here we rollback all already committed state (table values, currently)
326 * and set "modified" field to non-zero value to indicate
327 * that we need to restart original operation.
330 rollback_table_values(struct tableop_state *ts)
332 struct ip_fw_chain *ch;
333 struct table_value *pval;
334 struct tentry_info *ptei;
335 struct namedobj_instance *vi;
340 IPFW_UH_WLOCK_ASSERT(ch);
342 /* Get current table value pointer */
343 get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
345 for (i = 0; i < ts->count; i++) {
348 if (ptei->value == 0)
351 unref_table_value(vi, pval, ptei->value);
356 * Allocate new value index in either shared or per-table array.
357 * Function may drop/reacquire UH lock.
359 * Returns 0 on success.
362 alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts,
363 struct namedobj_instance *vi, uint16_t *pvidx)
368 IPFW_UH_WLOCK_ASSERT(ch);
370 error = ipfw_objhash_alloc_idx(vi, &vidx);
374 * We need to resize array. This involves
375 * lock/unlock, so we need to check "modified"
378 ts->opstate.func(ts->tc, &ts->opstate);
379 error = resize_shared_value_storage(ch);
380 return (error); /* ts->modified should be set, we will restart */
383 vlimit = ts->ta->vlimit;
384 if (vlimit != 0 && vidx >= vlimit) {
387 * Algorithm is not able to store given index.
388 * We have to rollback state, start using
389 * per-table value array or return error
390 * if we're already using it.
392 * TODO: do not rollback state if
393 * atomicity is not required.
395 if (ts->vshared != 0) {
396 /* shared -> per-table */
397 return (ENOSPC); /* TODO: proper error */
400 /* per-table. Fail for now. */
401 return (ENOSPC); /* TODO: proper error */
409 * Drops value reference for unused values (updates, deletes, partially
410 * successful adds or rollbacks).
413 ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc,
414 struct tentry_info *tei, uint32_t count, int rollback)
417 struct tentry_info *ptei;
418 struct table_value *pval;
419 struct namedobj_instance *vi;
422 * We have two slightly different ADD cases here:
423 * either (1) we are successful / partially successful,
424 * in that case we need
425 * * to ignore ADDED entries values
426 * * rollback all other values (either UPDATED, since the
427 * old value has been stored there, or some failure like
428 * EXISTS or LIMIT, or simply the "ignored" case).
430 * (2): atomic rollback of partially successful operation
431 * in that case we simply need to unref all entries.
433 * DELETE case is simpler: no atomic support there, so
434 * we simply unref all non-zero values.
438 * Get current table value pointers.
439 * XXX: Properly read vshared
441 get_value_ptrs(ch, tc, 1, &pval, &vi);
443 for (i = 0; i < count; i++) {
446 if (ptei->value == 0) {
449 * We may be deleting non-existing record.
455 if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) {
460 unref_table_value(vi, pval, ptei->value);
466 * Main function used to link values of entries going to be added,
467 * to the index. Since we may perform many UH locks drops/acquires,
468 * handle changes by checking tablestate "modified" field.
473 ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts)
476 struct namedobj_instance *vi;
477 struct table_config *tc;
478 struct tentry_info *tei, *ptei;
479 uint32_t count, vlimit;
481 struct table_val_link *ptv;
482 struct table_value tval, *pval;
485 * Stage 1: reference all existing values and
486 * save their indices.
488 IPFW_UH_WLOCK_ASSERT(ch);
489 get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
493 vlimit = ts->ta->vlimit;
498 for (i = 0; i < count; i++) {
500 ptei->value = 0; /* Ensure value is always 0 in the beginnig */
501 mask_table_value(ptei->pvalue, &tval, ts->vmask);
502 ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
506 /* Deal with vlimit later */
507 if (vlimit > 0 && vlimit <= ptv->no.kidx)
510 /* Value found. Bump refcount */
512 ptei->value = ptv->no.kidx;
516 if (ts->count == found) {
517 /* We've found all values, no need to create new ones */
522 * We have added some state here, let's attach operation
523 * state to the list to be able to rollback if necessary.
525 add_toperation_state(ch, ts);
526 /* Ensure table won't disappear */
531 * Stage 2: allocate objects for non-existing values.
533 for (i = 0; i < count; i++) {
535 if (ptei->value != 0)
537 if (ptei->ptv != NULL)
539 ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW,
544 * Stage 3: allocate index numbers for new values
545 * and link them to index.
549 del_toperation_state(ch, ts);
550 if (ts->modified != 0) {
553 * In general, we should free all state/indexes here
554 * and return. However, we keep allocated state instead
555 * to ensure we achieve some progress on each restart.
560 KASSERT(pval == ch->tablestate, ("resize_storage() notify failure"));
562 /* Let's try to link values */
563 for (i = 0; i < count; i++) {
565 if (ptei->value != 0) {
568 * We may be here after several process restarts,
569 * so we need to update all fields that might
572 ptv = (struct table_val_link *)ptei->ptv;
573 ptv->pval = &pval[i];
577 /* Check if record has appeared */
578 mask_table_value(ptei->pvalue, &tval, ts->vmask);
579 ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
583 ptei->value = ptv->no.kidx;
587 /* May perform UH unlock/lock */
588 error = alloc_table_vidx(ch, ts, vi, &vidx);
590 ts->opstate.func(ts->tc, &ts->opstate);
593 /* value storage resize has happened, return */
594 if (ts->modified != 0)
597 /* Finally, we have allocated valid index, let's add entry */
599 ptv = (struct table_val_link *)ptei->ptv;
603 ptv->no.name = (char *)&pval[vidx];
604 ptv->pval = &pval[vidx];
605 memcpy(ptv->pval, &tval, sizeof(struct table_value));
606 pval[vidx].refcnt = 1;
607 ipfw_objhash_add(vi, &ptv->no);
614 * Compatibility function used to import data from old
615 * IP_FW_TABLE_ADD / IP_FW_TABLE_XADD opcodes.
618 ipfw_import_table_value_legacy(uint32_t value, struct table_value *v)
621 memset(v, 0, sizeof(*v));
629 v->nh4 = value; /* host format */
635 * Export data to legacy table dumps opcodes.
638 ipfw_export_table_value_legacy(struct table_value *v)
642 * TODO: provide more compatibility depending on
649 * Imports table value from current userland format.
650 * Saves value in kernel format to the same place.
653 ipfw_import_table_value_v1(ipfw_table_value *iv)
655 struct table_value v;
657 memset(&v, 0, sizeof(v));
660 v.divert = iv->divert;
661 v.skipto = iv->skipto;
662 v.netgraph = iv->netgraph;
670 memcpy(iv, &v, sizeof(ipfw_table_value));
674 * Export real table value @v to current userland format.
675 * Note that @v and @piv may point to the same memory.
678 ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv)
682 memset(&iv, 0, sizeof(iv));
685 iv.divert = v->divert;
686 iv.skipto = v->skipto;
687 iv.netgraph = v->netgraph;
695 memcpy(piv, &iv, sizeof(iv));
699 * Exports real value data into ipfw_table_value structure.
700 * Utilizes "spare1" field to store kernel index.
703 dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
705 struct vdump_args *da;
706 struct table_val_link *ptv;
707 struct table_value *v;
709 da = (struct vdump_args *)arg;
710 ptv = (struct table_val_link *)no;
712 v = (struct table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v));
713 /* Out of memory, returning */
719 memcpy(v, ptv->pval, sizeof(*v));
720 v->spare1 = ptv->no.kidx;
724 * Dumps all shared/table value data
725 * Data layout (v1)(current):
726 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
727 * Reply: [ ipfw_obj_lheader ipfw_table_value x N ]
729 * Returns 0 on success
732 list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
733 struct sockopt_data *sd)
735 struct _ipfw_obj_lheader *olh;
736 struct namedobj_instance *vi;
737 struct vdump_args da;
738 uint32_t count, size;
740 olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
743 if (sd->valsize < olh->size)
747 vi = CHAIN_TO_VI(ch);
749 count = ipfw_objhash_count(vi);
750 size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader);
752 /* Fill in header regardless of buffer size */
754 olh->objsize = sizeof(ipfw_table_value);
756 if (size > olh->size) {
764 * Do the actual value dump
766 memset(&da, 0, sizeof(da));
769 ipfw_objhash_foreach(vi, dump_tvalue, &da);
777 ipfw_table_value_init(struct ip_fw_chain *ch, int first)
779 struct tables_config *tcfg;
781 ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value),
782 M_IPFW, M_WAITOK | M_ZERO);
786 tcfg->val_size = VALDATA_START_SIZE;
787 tcfg->valhash = ipfw_objhash_create(tcfg->val_size);
788 ipfw_objhash_set_funcs(tcfg->valhash, hash_table_value,
791 IPFW_ADD_SOPT_HANDLER(first, scodes);
795 destroy_value(struct namedobj_instance *ni, struct named_object *no,
803 ipfw_table_value_destroy(struct ip_fw_chain *ch, int last)
806 IPFW_DEL_SOPT_HANDLER(last, scodes);
808 free(ch->valuestate, M_IPFW);
809 ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch);
810 ipfw_objhash_destroy(CHAIN_TO_VI(ch));