2 * Copyright (c) 2014 Yandex LLC
3 * Copyright (c) 2014 Alexander V. Chernikov
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
31 * Multi-field value support for ipfw tables.
33 * This file contains necessary functions to convert
34 * large multi-field values into u32 indices suitable to be fed
35 * to various table algorithms. Other machinery like proper refcounting,
36 * internal structures resizing are also kept here.
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/malloc.h>
44 #include <sys/kernel.h>
47 #include <sys/rwlock.h>
48 #include <sys/rmlock.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/queue.h>
52 #include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
54 #include <netinet/in.h>
55 #include <netinet/ip_var.h> /* struct ipfw_rule_ref */
56 #include <netinet/ip_fw.h>
58 #include <netpfil/ipfw/ip_fw_private.h>
59 #include <netpfil/ipfw/ip_fw_table.h>
/*
 * Forward declarations. list_table_values() has to be declared here because
 * the scodes[] table below refers to it before its definition.
 */
61 static uint32_t hash_table_value(struct namedobj_instance *ni, const void *key,
63 static int cmp_table_value(struct named_object *no, const void *key,
66 static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
67 struct sockopt_data *sd);
/*
 * Sockopt handlers provided by this file; registered and unregistered by
 * ipfw_table_value_init() / ipfw_table_value_destroy().
 */
69 static struct ipfw_sopt_handler scodes[] = {
70 { IP_FW_TABLE_VLIST, 0, HDIR_GET, list_table_values },
/* Value index hash kept in the chain's tables configuration. */
73 #define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash)
/*
 * Hash link object: the named_object stored in the value index plus a
 * pointer to the matching slot of the value array.
 * NOTE(review): presumably the members of struct table_val_link; the struct
 * header line is not visible in this view.
 */
77 struct named_object no;
78 struct table_value *pval; /* Pointer to real table value */
80 #define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */
/*
 * Argument passed to the ipfw_objhash_foreach() callbacks update_tvalue()
 * and dump_tvalue().
 * NOTE(review): presumably the members of struct vdump_args; the struct
 * header line is not visible in this view.
 */
83 struct ip_fw_chain *ch;
84 struct sockopt_data *sd;
85 struct table_value *pval;
91 hash_table_value(struct namedobj_instance *ni, const void *key, uint32_t kopt)
94 return (hash32_buf(key, 56, 0));
98 cmp_table_value(struct named_object *no, const void *key, uint32_t kopt)
101 return (memcmp(((struct table_val_link *)no)->pval, key, 56));
/*
 * Builds in @dst a copy of @src that keeps only the fields selected by the
 * value mask. @dst is zeroed first, so every field whose IPFW_VTYPE_* flag
 * is absent from the mask ends up as 0.
 */
105 mask_table_value(struct table_value *src, struct table_value *dst,
/* Copies field f from src to dst when flag b is set in mask. */
108 #define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; }
110 memset(dst, 0, sizeof(*dst));
111 _MCPY(tag, IPFW_VTYPE_TAG);
112 _MCPY(pipe, IPFW_VTYPE_PIPE);
113 _MCPY(divert, IPFW_VTYPE_DIVERT);
114 _MCPY(skipto, IPFW_VTYPE_SKIPTO);
115 _MCPY(netgraph, IPFW_VTYPE_NETGRAPH);
116 _MCPY(fib, IPFW_VTYPE_FIB);
117 _MCPY(nat, IPFW_VTYPE_NAT);
118 _MCPY(dscp, IPFW_VTYPE_DSCP);
119 _MCPY(nh4, IPFW_VTYPE_NH4);
120 _MCPY(nh6, IPFW_VTYPE_NH6);
/* zoneid is selected by the same flag as nh6, not by a flag of its own. */
121 _MCPY(zoneid, IPFW_VTYPE_NH6);
/*
 * Selects the value array and the value index hash to operate on.
 * The only source visible here is the chain-wide one: ch->valuestate and
 * CHAIN_TO_VI(ch). A per-table variant is left commented out below.
 * NOTE(review): the test of @vshared and the stores through @ptv / @pvi are
 * not visible in this view; confirm against the full file.
 */
126 get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared,
127 struct table_value **ptv, struct namedobj_instance **pvi)
129 struct table_value *pval;
130 struct namedobj_instance *vi;
133 pval = (struct table_value *)ch->valuestate;
134 vi = CHAIN_TO_VI(ch);
138 //pval = (struct table_value *)&tc->ti.data;
148 * Update pointers to real values after @pval change.
/*
 * ipfw_objhash_foreach() callback used by resize_shared_value_storage().
 * @arg is a struct vdump_args; @no is cast back to the struct table_val_link
 * it belongs to.
 */
151 update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
153 struct vdump_args *da;
154 struct table_val_link *ptv;
155 struct table_value *pval;
157 da = (struct vdump_args *)arg;
158 ptv = (struct table_val_link *)no;
/*
 * Re-point both the value pointer and the object name at the slot that has
 * this object's index (no.kidx) in the array pval.
 * NOTE(review): the assignment of pval is not visible in this view;
 * presumably it is da->pval, which the caller sets to the new array.
 */
161 ptv->pval = &pval[ptv->no.kidx];
162 ptv->no.name = (char *)&pval[ptv->no.kidx];
167 * Grows value storage shared among all tables.
168 * Drops/reacquires UH locks.
169 * Notifies other running adds on @ch shared storage resize.
170 * Note that the function does not guarantee that free space
171 * will be available after invocation, so the caller needs
172 * to retry in a loop itself.
174 * Returns 0 in case of no errors.
/*
 * Doubles the chain-wide value array together with the index bitmap of the
 * value hash, copies the old contents over and re-points every hashed link
 * object at the new array.
 * NOTE(review): several statements (lock handling, early exits, the final
 * return) are not visible in this view, so only the visible steps are
 * annotated below.
 */
177 resize_shared_value_storage(struct ip_fw_chain *ch)
179 struct tables_config *tcfg;
180 struct namedobj_instance *vi;
181 struct table_value *pval, *valuestate, *old_valuestate;
183 struct vdump_args da;
185 int val_size, val_size_old;
187 IPFW_UH_WLOCK_ASSERT(ch);
192 pval = (struct table_value *)ch->valuestate;
193 vi = CHAIN_TO_VI(ch);
194 tcfg = CHAIN_TO_TCFG(ch);
/* The new capacity is twice the current one. */
196 val_size = tcfg->val_size * 2;
/* 1 << 30 slots is the ceiling; what happens then is not visible here. */
198 if (val_size == (1 << 30))
/* Allocate the larger array and a bitmap sized for val_size indices. */
203 valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW,
205 ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx,
211 * Check if we still need to resize
/*
 * The function may have dropped the UH lock (see the comment above the
 * function), so the storage may already be at least as large as requested.
 */
213 if (tcfg->val_size >= val_size)
216 /* Update pointers and notify everyone we're changing @ch */
217 pval = (struct table_value *)ch->valuestate;
218 rollback_toperation_state(ch, ch);
220 /* Good. Let's merge */
/* Only the old number of slots (tcfg->val_size) holds data to copy. */
221 memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size);
222 ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
225 /* Change pointers */
/* Three-step swap: afterwards valuestate refers to the OLD array. */
226 old_valuestate = ch->valuestate;
227 ch->valuestate = valuestate;
228 valuestate = old_valuestate;
229 ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
/* Same swap for the sizes: afterwards val_size holds the OLD size. */
231 val_size_old = tcfg->val_size;
232 tcfg->val_size = val_size;
233 val_size = val_size_old;
235 /* Update pointers to reflect resize */
236 memset(&da, 0, sizeof(da));
237 da.pval = (struct table_value *)ch->valuestate;
238 ipfw_objhash_foreach(vi, update_tvalue, &da);
/*
 * Release whatever valuestate / new_idx / new_blocks refer to at this point;
 * after the swaps above that is the old array and, presumably, the old
 * bitmap.
 */
241 free(valuestate, M_IPFW);
242 ipfw_objhash_bitmap_free(new_idx, new_blocks);
248 * Drops reference for table value with index @kidx, stored in @pval and
249 * @vi. Frees value if it has no references.
/*
 * Releases one reference on the value stored at index kidx of @pval.
 * NOTE(review): the declaration of kidx and the statement guarded by the
 * refcount test are not visible in this view.
 */
252 unref_table_value(struct namedobj_instance *vi, struct table_value *pval,
255 struct table_val_link *ptvl;
/* Dropping a reference that was never taken is a bookkeeping bug. */
257 KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx));
/* Pre-decrement: the deletion code below is for the count reaching 0. */
258 if (--pval[kidx].refcnt > 0)
261 /* Last reference, delete item */
262 ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx);
263 KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx));
/* Unlink the link object from the hash, then release its index. */
264 ipfw_objhash_del(vi, &ptvl->no);
265 ipfw_objhash_free_idx(vi, kidx);
/*
 * Context handed to unref_table_value_cb() through ta->foreach().
 * NOTE(review): presumably the members of struct flush_args; the struct
 * header and at least one member (astate) are not visible in this view.
 */
270 struct ip_fw_chain *ch;
271 struct table_algo *ta;
272 struct table_info *ti;
/* Scratch entry; the callback zeroes it before every use. */
274 ipfw_obj_tentry tent;
/*
 * Per-entry callback for ta->foreach(): drops the value reference held by
 * table entry @e. @arg is the struct flush_args prepared by
 * ipfw_unref_table_values().
 * NOTE(review): the assignments of ch, ta and tent and the handling of
 * error are not visible in this view.
 */
278 unref_table_value_cb(void *e, void *arg)
280 struct flush_args *fa;
281 struct ip_fw_chain *ch;
282 struct table_algo *ta;
283 ipfw_obj_tentry *tent;
286 fa = (struct flush_args *)arg;
/* Start from a clean scratch entry for every record. */
289 memset(&fa->tent, 0, sizeof(fa->tent));
/* Let the table algorithm export @e so that its value index can be read. */
291 error = ta->dump_tentry(fa->astate, fa->ti, e, tent);
/* The reference is dropped in the chain-wide value array and hash. */
297 unref_table_value(CHAIN_TO_VI(ch),
298 (struct table_value *)ch->valuestate, tent->v.kidx);
304 * Drop references for each value used in @tc.
/*
 * Walks the table through ta->foreach() and lets unref_table_value_cb()
 * drop the value reference of each entry it is called for.
 * The UH write lock is asserted on entry.
 * NOTE(review): the statements filling in fa are not visible in this view.
 */
307 ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc,
308 struct table_algo *ta, void *astate, struct table_info *ti)
310 struct flush_args fa;
312 IPFW_UH_WLOCK_ASSERT(ch);
/* Zero the callback context before it is filled in. */
314 memset(&fa, 0, sizeof(fa));
320 ta->foreach(astate, ti, unref_table_value_cb, &fa);
324 * Table operation state handler.
325 * Called when we are going to change something in @tc which
326 * may lead to inconsistencies in on-going table data addition.
328 * Here we rollback all already committed state (table values, currently)
329 * and set "modified" field to non-zero value to indicate
330 * that we need to restart original operation.
/*
 * Drops the value references recorded in the entries of @ts.
 * NOTE(review): the comment above this function also says that the
 * "modified" field is set; that statement, like the assignments of ch and
 * ptei, is not visible in this view.
 */
333 rollback_table_values(struct tableop_state *ts)
335 struct ip_fw_chain *ch;
336 struct table_value *pval;
337 struct tentry_info *ptei;
338 struct namedobj_instance *vi;
343 IPFW_UH_WLOCK_ASSERT(ch);
345 /* Get current table value pointer */
346 get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
348 for (i = 0; i < ts->count; i++) {
/* value == 0 means no value index is recorded for this entry. */
351 if (ptei->value == 0)
354 unref_table_value(vi, pval, ptei->value);
359 * Allocate new value index in either shared or per-table array.
360 * Function may drop/reacquire UH lock.
362 * Returns 0 on success.
/*
 * Obtains a free value index from @vi on behalf of operation @ts.
 * When no index is available the shared storage is resized and the resize
 * result is returned; the caller is expected to restart (see the comment on
 * the return statement below).
 * NOTE(review): @pvidx is a uint16_t, while resize_shared_value_storage()
 * lets the array grow far beyond 65536 slots -- confirm that an index can
 * never be truncated. The store through @pvidx is not visible in this view.
 */
365 alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts,
366 struct namedobj_instance *vi, uint16_t *pvidx)
371 IPFW_UH_WLOCK_ASSERT(ch);
373 error = ipfw_objhash_alloc_idx(vi, &vidx);
377 * We need to resize array. This involves
378 * lock/unlock, so we need to check "modified"
/* Run the operation state handler of @ts before the resize. */
381 ts->opstate.func(ts->tc, &ts->opstate);
382 error = resize_shared_value_storage(ch);
383 return (error); /* ts->modified should be set, we will restart */
/* A vlimit of 0 means the algorithm imposes no limit on the index. */
386 vlimit = ts->ta->vlimit;
387 if (vlimit != 0 && vidx >= vlimit) {
390 * Algorithm is not able to store given index.
391 * We have to rollback state, start using
392 * per-table value array or return error
393 * if we're already using it.
395 * TODO: do not rollback state if
396 * atomicity is not required.
/* Both branches below currently fail with ENOSPC. */
398 if (ts->vshared != 0) {
399 /* shared -> per-table */
400 return (ENOSPC); /* TODO: proper error */
403 /* per-table. Fail for now. */
404 return (ENOSPC); /* TODO: proper error */
412 * Drops value reference for unused values (updates, deletes, partially
413 * successful adds or rollbacks).
/*
 * Walks @count entries and drops the value reference of those that should
 * not keep it, according to the rules spelled out in the comment below.
 * @rollback != 0 selects the "atomic rollback" case in which entries flagged
 * TEI_FLAGS_ADDED are not exempt.
 * NOTE(review): the assignment of ptei and the statements guarded by the
 * two tests in the loop are not visible in this view.
 */
416 ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc,
417 struct tentry_info *tei, uint32_t count, int rollback)
420 struct tentry_info *ptei;
421 struct table_value *pval;
422 struct namedobj_instance *vi;
425 * We have two slightly different ADD cases here:
426 * either (1) we are successful / partially successful,
427 * in that case we need
428 * * to ignore ADDED entries values
429 * * rollback every other values (either UPDATED since
430 * old value has been stored there, or some failure like
431 * EXISTS or LIMIT or simply "ignored" case.
433 * (2): atomic rollback of partially successful operation
434 * in that case we simply need to unref all entries.
436 * DELETE case is simpler: no atomic support there, so
437 * we simply unref all non-zero values.
441 * Get current table value pointers.
442 * XXX: Properly read vshared
/* vshared is hard-coded to 1 here (see the XXX above). */
444 get_value_ptrs(ch, tc, 1, &pval, &vi);
446 for (i = 0; i < count; i++) {
/* No value index recorded for this entry. */
449 if (ptei->value == 0) {
452 * We may be deleting non-existing record.
/* Case (1) above: a successfully added entry outside of a rollback. */
458 if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) {
463 unref_table_value(vi, pval, ptei->value);
469 * Main function used to link values of entries going to be added,
470 * to the index. Since we may perform many UH locks drops/acquires,
471 * handle changes by checking tablestate "modified" field.
/*
 * Resolves the value of every entry of operation @ts to an index in the
 * value array and stores that index in the entry's "value" field. Works in
 * the three stages marked below: reuse values already present in the index,
 * allocate link objects for the missing ones, then allocate indices and
 * insert the new values.
 * NOTE(review): many statements (assignments of tc/tei/ptei/count/found,
 * loop "continue"s, returns, the refcount bump) are not visible in this
 * view, so only the visible steps are annotated.
 */
476 ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts)
479 struct namedobj_instance *vi;
480 struct table_config *tc;
481 struct tentry_info *tei, *ptei;
482 uint32_t count, vlimit;
484 struct table_val_link *ptv;
485 struct table_value tval, *pval;
488 * Stage 1: reference all existing values and
489 * save their indices.
491 IPFW_UH_WLOCK_ASSERT(ch);
492 get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
496 vlimit = ts->ta->vlimit;
501 for (i = 0; i < count; i++) {
503 ptei->value = 0; /* Ensure value is always 0 in the beginning */
/* Only the fields selected by ts->vmask take part in the lookup key. */
504 mask_table_value(ptei->pvalue, &tval, ts->vmask);
505 ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
509 /* Deal with vlimit later */
/* The existing index is at or above the algorithm's limit. */
510 if (vlimit > 0 && vlimit <= ptv->no.kidx)
513 /* Value found. Bump refcount */
515 ptei->value = ptv->no.kidx;
519 if (ts->count == found) {
520 /* We've found all values, no need to create new ones */
525 * we have added some state here, let's attach operation
526 * state ts the list ts be able ts rollback if necessary.
/* Attach the operation state to the chain so that it can be rolled back. */
528 add_toperation_state(ch, ts);
529 /* Ensure table won't disappear */
534 * Stage 2: allocate objects for non-existing values.
536 for (i = 0; i < count; i++) {
/* Entry already resolved in stage 1. */
538 if (ptei->value != 0)
/* A link object may already be attached to this entry. */
540 if (ptei->ptv != NULL)
542 ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW,
547 * Stage 3: allocate index numbers for new values
548 * and link them to index.
552 del_toperation_state(ch, ts);
553 if (ts->modified != 0) {
556 * In general, we should free all state/indexes here
557 * and return. However, we keep allocated state instead
558 * to ensure we achieve some progress on each restart.
/* pval was read in stage 1; it must still be the chain's current array. */
563 KASSERT(pval == ch->valuestate, ("resize_storage() notify failure"));
565 /* Let's try to link values */
566 for (i = 0; i < count; i++) {
569 /* Check if record has appeared */
570 mask_table_value(ptei->pvalue, &tval, ts->vmask);
571 ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
575 ptei->value = ptv->no.kidx;
579 /* May perform UH unlock/lock */
580 error = alloc_table_vidx(ch, ts, vi, &vidx);
/* NOTE(review): presumably only on the error path -- guard not visible. */
582 ts->opstate.func(ts->tc, &ts->opstate);
585 /* value storage resize has happened, return */
586 if (ts->modified != 0)
589 /* Finally, we have allocated valid index, let's add entry */
591 ptv = (struct table_val_link *)ptei->ptv;
/* Object name and value pointer both refer to slot vidx of the array. */
595 ptv->no.name = (char *)&pval[vidx];
596 ptv->pval = &pval[vidx];
/* Store the masked value in the slot; it starts with one reference. */
597 memcpy(ptv->pval, &tval, sizeof(struct table_value));
598 pval[vidx].refcnt = 1;
599 ipfw_objhash_add(vi, &ptv->no);
606 * Compatibility function used to import data from old
607 * IP_FW_TABLE_ADD / IP_FW_TABLE_XADD opcodes.
/*
 * Fills @v from a single legacy 32-bit @value. @v is zeroed first.
 * NOTE(review): only the nh4 assignment is visible in this view; other
 * fields may be set by statements that are not shown.
 */
610 ipfw_import_table_value_legacy(uint32_t value, struct table_value *v)
613 memset(v, 0, sizeof(*v));
621 v->nh4 = value; /* host format */
627 * Export data to legacy table dumps opcodes.
630 ipfw_export_table_value_legacy(struct table_value *v)
634 * TODO: provide more compatibility depending on
641 * Imports table value from current userland format.
642 * Saves value in kernel format to the same place.
/*
 * Converts @iv in place: the kernel-format value is assembled field by field
 * in a zeroed local and then copied back over @iv.
 * NOTE(review): some field assignments are not visible in this view.
 */
645 ipfw_import_table_value_v1(ipfw_table_value *iv)
647 struct table_value v;
649 memset(&v, 0, sizeof(v));
652 v.divert = iv->divert;
653 v.skipto = iv->skipto;
654 v.netgraph = iv->netgraph;
661 v.zoneid = iv->zoneid;
/*
 * The copy length is the size of the userland structure.
 * NOTE(review): assumes struct table_value is at least that large -- confirm.
 */
663 memcpy(iv, &v, sizeof(ipfw_table_value));
667 * Export real table value @v to current userland format.
668 * Note that @v and @piv may point to the same memory.
/*
 * Converts kernel value @v to the userland layout and writes it to @piv.
 * The result is assembled in the zeroed local iv and copied out only at the
 * end, so @v is fully read before @piv is written.
 * NOTE(review): the declaration of iv and some field assignments are not
 * visible in this view.
 */
671 ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv)
675 memset(&iv, 0, sizeof(iv));
678 iv.divert = v->divert;
679 iv.skipto = v->skipto;
680 iv.netgraph = v->netgraph;
687 iv.zoneid = v->zoneid;
689 memcpy(piv, &iv, sizeof(iv));
693 * Exports real value data into ipfw_table_value structure.
694 * Utilizes "spare1" field to store kernel index.
/*
 * ipfw_objhash_foreach() callback used by list_table_values(): appends one
 * value to the sockopt reply. @arg is a struct vdump_args whose sd member
 * is the reply buffer.
 */
697 dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
699 struct vdump_args *da;
700 struct table_val_link *ptv;
701 struct table_value *v;
703 da = (struct vdump_args *)arg;
704 ptv = (struct table_val_link *)no;
/* Reserve room for one value in the reply buffer. */
706 v = (struct table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v));
707 /* Out of memory, returning */
/* Copy the stored value, then overwrite spare1 with its kernel index. */
713 memcpy(v, ptv->pval, sizeof(*v));
714 v->spare1 = ptv->no.kidx;
719 * Dumps all shared/table value data
720 * Data layout (v1)(current):
721 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
722 * Reply: [ ipfw_obj_lheader ipfw_table_value x N ]
724 * Returns 0 on success
/*
 * Sockopt handler for IP_FW_TABLE_VLIST: dumps every value of the chain's
 * value index into the reply buffer @sd, after an ipfw_obj_lheader.
 * NOTE(review): error returns, the remaining header assignments and the
 * filling of da are not visible in this view.
 */
727 list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
728 struct sockopt_data *sd)
730 struct _ipfw_obj_lheader *olh;
731 struct namedobj_instance *vi;
732 struct vdump_args da;
733 uint32_t count, size;
735 olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
/* The size declared in the header exceeds the data actually supplied. */
738 if (sd->valsize < olh->size)
742 vi = CHAIN_TO_VI(ch);
/* Bytes needed for a full reply: header plus one record per value. */
744 count = ipfw_objhash_count(vi);
745 size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader);
747 /* Fill in header regardless of buffer size */
749 olh->objsize = sizeof(ipfw_table_value);
/* The caller's buffer cannot hold the full dump. */
751 if (size > olh->size) {
759 * Do the actual value dump
761 memset(&da, 0, sizeof(da));
764 ipfw_objhash_foreach(vi, dump_tvalue, &da);
/*
 * Sets up value storage for chain @ch: a zero-filled array of
 * VALDATA_START_SIZE values, a value index hash of the same size using this
 * file's hash callback, and the sockopt handlers in scodes[].
 * NOTE(review): the assignment of tcfg and the remaining arguments of
 * ipfw_objhash_set_funcs() are not visible in this view.
 */
772 ipfw_table_value_init(struct ip_fw_chain *ch, int first)
774 struct tables_config *tcfg;
776 ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value),
777 M_IPFW, M_WAITOK | M_ZERO);
/* Keep the recorded capacity in step with the allocation above. */
781 tcfg->val_size = VALDATA_START_SIZE;
782 tcfg->valhash = ipfw_objhash_create(tcfg->val_size);
783 ipfw_objhash_set_funcs(tcfg->valhash, hash_table_value,
786 IPFW_ADD_SOPT_HANDLER(first, scodes);
790 destroy_value(struct namedobj_instance *ni, struct named_object *no,
/*
 * Counterpart of ipfw_table_value_init(): unregisters the sockopt handlers,
 * frees the value array, runs destroy_value() on every object of the value
 * index and finally destroys the index itself.
 */
799 ipfw_table_value_destroy(struct ip_fw_chain *ch, int last)
802 IPFW_DEL_SOPT_HANDLER(last, scodes);
/*
 * The array is freed before the objects are walked.
 * NOTE(review): that order is only safe if destroy_value() never touches
 * ch->valuestate; its body is not visible in this view.
 */
804 free(ch->valuestate, M_IPFW);
805 ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch);
806 ipfw_objhash_destroy(CHAIN_TO_VI(ch));