2 * Copyright (c) 2014 Yandex LLC
3 * Copyright (c) 2014 Alexander V. Chernikov
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
31 * Multi-field value support for ipfw tables.
33 * This file contains necessary functions to convert
34 * large multi-field values into u32 indices suitable to be fed
35 * to various table algorithms. Other machinery, such as proper refcounting
36 * and resizing of internal structures, is also kept here.
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/malloc.h>
44 #include <sys/kernel.h>
47 #include <sys/rwlock.h>
48 #include <sys/rmlock.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/queue.h>
52 #include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
55 #include <netinet/in.h>
56 #include <netinet/ip_var.h> /* struct ipfw_rule_ref */
57 #include <netinet/ip_fw.h>
59 #include <netpfil/ipfw/ip_fw_private.h>
60 #include <netpfil/ipfw/ip_fw_table.h>
/*
 * Forward declarations: value hash/compare helpers and the sockopt
 * handler that is referenced from scodes[].
 */
62 static uint32_t hash_table_value(struct namedobj_instance *ni, const void *key,
64 static int cmp_table_value(struct named_object *no, const void *key,
67 static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
68 struct sockopt_data *sd);
/*
 * Sockopt handlers provided by this file: IP_FW_TABLE_VLIST (HDIR_GET)
 * is served by list_table_values().  The array is passed to
 * IPFW_ADD_SOPT_HANDLER()/IPFW_DEL_SOPT_HANDLER() by the init/destroy
 * routines of this file.
 */
70 static struct ipfw_sopt_handler scodes[] = {
71 { IP_FW_TABLE_VLIST, 0, HDIR_GET, list_table_values },
/* Value hash instance (valhash) kept in the tables config of @chain. */
74 #define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash)
/*
 * Fields of the link object stored in the value hash (used as
 * struct table_val_link throughout this file).  Callbacks that receive
 * a struct named_object pointer cast it back to the link object.
 */
78 struct named_object no;
79 struct table_value *pval; /* Pointer to real table value */
/*
 * Initial number of elements of the value array; also used as the
 * initial size of the value hash in ipfw_table_value_init().
 */
81 #define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */
/*
 * Fields of the argument bundle (struct vdump_args) handed to the
 * hash-walk callbacks update_tvalue() and dump_tvalue() via void *arg.
 */
84 struct ip_fw_chain *ch;
85 struct sockopt_data *sd; /* Reply buffer, read by dump_tvalue() */
86 struct table_value *pval; /* Value array, set before walking with update_tvalue() */
/*
 * Hash callback for the value hash.
 *
 * Hashes the leading 56 bytes of the value that @key points to.
 * @ni and @kopt are accepted for the callback signature but not used.
 */
static uint32_t
hash_table_value(struct namedobj_instance *ni, const void *key, uint32_t kopt)
{
	uint32_t hval;

	hval = hash32_buf(key, 56, 0);
	return (hval);
}
99 cmp_table_value(struct named_object *no, const void *key, uint32_t kopt)
102 return (memcmp(((struct table_val_link *)no)->pval, key, 56));
/*
 * Builds a masked copy of @src in @dst: @dst is zeroed first and then
 * only the fields whose IPFW_VTYPE_* bit is set in the mask are copied,
 * so every unselected field reads as 0.
 * _MCPY(f, b) copies field f when bit(s) b are set in the mask.
 */
106 mask_table_value(struct table_value *src, struct table_value *dst,
109 #define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; }
111 memset(dst, 0, sizeof(*dst));
112 _MCPY(tag, IPFW_VTYPE_TAG);
113 _MCPY(pipe, IPFW_VTYPE_PIPE);
114 _MCPY(divert, IPFW_VTYPE_DIVERT);
115 _MCPY(skipto, IPFW_VTYPE_SKIPTO);
116 _MCPY(netgraph, IPFW_VTYPE_NETGRAPH);
117 _MCPY(fib, IPFW_VTYPE_FIB);
118 _MCPY(nat, IPFW_VTYPE_NAT);
119 _MCPY(dscp, IPFW_VTYPE_DSCP);
120 _MCPY(nh4, IPFW_VTYPE_NH4);
121 _MCPY(nh6, IPFW_VTYPE_NH6);
/* zoneid is copied under the same IPFW_VTYPE_NH6 bit as nh6. */
122 _MCPY(zoneid, IPFW_VTYPE_NH6);
/*
 * Looks up the value array and the value hash instance to operate on.
 * The assignments visible here take both from the chain-wide storage
 * (ch->valuestate and CHAIN_TO_VI(ch)).
 * NOTE(review): @vshared presumably selects shared vs. per-table storage
 * and the results are presumably returned through @ptv / @pvi -- confirm.
 */
127 get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared,
128 struct table_value **ptv, struct namedobj_instance **pvi)
130 struct table_value *pval;
131 struct namedobj_instance *vi;
134 pval = (struct table_value *)ch->valuestate;
135 vi = CHAIN_TO_VI(ch);
139 //pval = (struct table_value *)&tc->ti.data;
149 * Update pointers to real values after @pval change.
/*
 * Hash-walk callback (run through ipfw_objhash_foreach() by
 * resize_shared_value_storage()): re-points one link object at its slot
 * in the value array.
 * @no is the embedded named_object of a struct table_val_link,
 * @arg is a struct vdump_args.
 */
152 update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
154 struct vdump_args *da;
155 struct table_val_link *ptv;
156 struct table_value *pval;
158 da = (struct vdump_args *)arg;
159 ptv = (struct table_val_link *)no;
/* The slot is selected by the object's kidx; no.name points at the same slot. */
162 ptv->pval = &pval[ptv->no.kidx];
163 ptv->no.name = (char *)&pval[ptv->no.kidx];
168 * Grows value storage shared among all tables.
169 * Drops/reacquires UH locks.
170 * Notifies other running adds on @ch shared storage resize.
171 * Note that this function does not guarantee that free space
172 * will be available after invocation, so the caller needs
173 * to retry in a loop itself.
175 * Returns 0 in case of no errors.
/*
 * Grows the chain-wide value array to twice its current size together
 * with the index bitmap of the value hash, then re-points every link
 * object in the hash at the new array via update_tvalue().
 * Asserts that the UH write lock is held on entry.
 */
178 resize_shared_value_storage(struct ip_fw_chain *ch)
180 struct tables_config *tcfg;
181 struct namedobj_instance *vi;
182 struct table_value *pval, *valuestate, *old_valuestate;
184 struct vdump_args da;
186 int val_size, val_size_old;
188 IPFW_UH_WLOCK_ASSERT(ch);
193 pval = (struct table_value *)ch->valuestate;
194 vi = CHAIN_TO_VI(ch);
195 tcfg = CHAIN_TO_TCFG(ch);
/* The new capacity is twice the current one. */
197 val_size = tcfg->val_size * 2;
/* NOTE(review): 1 << 30 entries looks like the upper bound for the array -- confirm. */
199 if (val_size == (1 << 30))
204 valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW,
206 ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx,
212 * Check if we still need to resize
214 if (tcfg->val_size >= val_size)
217 /* Update pointers and notify everyone we're changing @ch */
218 pval = (struct table_value *)ch->valuestate;
219 rollback_toperation_state(ch, ch);
221 /* Good. Let's merge */
222 memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size);
223 ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
226 /* Change pointers */
227 old_valuestate = ch->valuestate;
228 ch->valuestate = valuestate;
229 valuestate = old_valuestate;
230 ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks);
/* Swap the sizes as well: afterwards val_size holds the previous capacity. */
232 val_size_old = tcfg->val_size;
233 tcfg->val_size = val_size;
234 val_size = val_size_old;
236 /* Update pointers to reflect resize */
237 memset(&da, 0, sizeof(da));
238 da.pval = (struct table_value *)ch->valuestate;
239 ipfw_objhash_foreach(vi, update_tvalue, &da);
/* After the pointer swap above, valuestate refers to the previous array. */
242 free(valuestate, M_IPFW);
243 ipfw_objhash_bitmap_free(new_idx, new_blocks);
249 * Drops reference for table value with index @kidx, stored in @pval and
250 * @vi. Frees value if it has no references.
/*
 * Decrements the reference count of pval[kidx].  When the count drops
 * to zero the matching link object is looked up by index in @vi,
 * removed from the hash and its index is released.
 * Both KASSERTs document invariants: the refcount must be non-zero on
 * entry and a link object must exist for a referenced index.
 */
253 unref_table_value(struct namedobj_instance *vi, struct table_value *pval,
256 struct table_val_link *ptvl;
258 KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx));
/* Drop one reference; the code below is for the last-reference case. */
259 if (--pval[kidx].refcnt > 0)
262 /* Last reference, delete item */
263 ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx);
264 KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx));
265 ipfw_objhash_del(vi, &ptvl->no);
266 ipfw_objhash_free_idx(vi, kidx);
/*
 * Fields of the argument bundle (struct flush_args) that
 * ipfw_unref_table_values() passes to unref_table_value_cb().
 */
271 struct ip_fw_chain *ch;
272 struct table_algo *ta;
273 struct table_info *ti;
275 ipfw_obj_tentry tent; /* Zeroed by the callback before each dump_tentry() call */
/*
 * Per-entry callback passed to ta->foreach() by
 * ipfw_unref_table_values().  Exports table entry @e through the
 * algorithm's dump_tentry() method and drops one reference on the value
 * index reported in tent->v.kidx.
 * @arg is a struct flush_args.
 */
279 unref_table_value_cb(void *e, void *arg)
281 struct flush_args *fa;
282 struct ip_fw_chain *ch;
283 struct table_algo *ta;
284 ipfw_obj_tentry *tent;
287 fa = (struct flush_args *)arg;
290 memset(&fa->tent, 0, sizeof(fa->tent));
292 error = ta->dump_tentry(fa->astate, fa->ti, e, tent);
/* The reference is dropped against the chain-wide value array and hash. */
298 unref_table_value(CHAIN_TO_VI(ch),
299 (struct table_value *)ch->valuestate, tent->v.kidx);
305 * Drop references for each value used in @tc.
/*
 * Walks every entry of a table with the algorithm's foreach() method and
 * lets unref_table_value_cb() release the value reference of each one.
 * Asserts that the UH write lock is held.
 * @astate and @ti are handed to ta->foreach() unchanged.
 */
308 ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc,
309 struct table_algo *ta, void *astate, struct table_info *ti)
311 struct flush_args fa;
313 IPFW_UH_WLOCK_ASSERT(ch);
315 memset(&fa, 0, sizeof(fa));
321 ta->foreach(astate, ti, unref_table_value_cb, &fa);
325 * Table operation state handler.
326 * Called when we are going to change something in @tc which
327 * may lead to inconsistencies in on-going table data addition.
329 * Here we rollback all already committed state (table values, currently)
330 * and set "modified" field to non-zero value to indicate
331 * that we need to restart original operation.
/*
 * Releases the value references already taken for the entries of the
 * operation described by @ts.  Iterates over ts->count entries and
 * calls unref_table_value() for entries whose value index is set;
 * a value index of 0 is tested for separately as "nothing to release".
 * Asserts that the UH write lock is held.
 */
334 rollback_table_values(struct tableop_state *ts)
336 struct ip_fw_chain *ch;
337 struct table_value *pval;
338 struct tentry_info *ptei;
339 struct namedobj_instance *vi;
344 IPFW_UH_WLOCK_ASSERT(ch);
346 /* Get current table value pointer */
347 get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
349 for (i = 0; i < ts->count; i++) {
352 if (ptei->value == 0)
355 unref_table_value(vi, pval, ptei->value);
360 * Allocate new value index in either shared or per-table array.
361 * Function may drop/reacquire UH lock.
363 * Returns 0 on success.
/*
 * Obtains a free value index from @vi for the operation @ts.
 * When the storage has to grow, the operation state handler is invoked,
 * resize_shared_value_storage() is called and its result is returned so
 * that the caller restarts.  An index that the table algorithm cannot
 * store (vlimit != 0 && vidx >= vlimit) yields ENOSPC in both the shared
 * and the per-table case.
 * Asserts that the UH write lock is held.
 */
366 alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts,
367 struct namedobj_instance *vi, uint16_t *pvidx)
372 IPFW_UH_WLOCK_ASSERT(ch);
374 error = ipfw_objhash_alloc_idx(vi, &vidx);
378 * We need to resize array. This involves
379 * lock/unlock, so we need to check "modified"
382 ts->opstate.func(ts->tc, &ts->opstate);
383 error = resize_shared_value_storage(ch);
384 return (error); /* ts->modified should be set, we will restart */
/* Enforce the algorithm's index limit; vlimit == 0 means no limit is checked. */
387 vlimit = ts->ta->vlimit;
388 if (vlimit != 0 && vidx >= vlimit) {
391 * Algorithm is not able to store given index.
392 * We have to rollback state, start using
393 * per-table value array or return error
394 * if we're already using it.
396 * TODO: do not rollback state if
397 * atomicity is not required.
399 if (ts->vshared != 0) {
400 /* shared -> per-table */
401 return (ENOSPC); /* TODO: proper error */
404 /* per-table. Fail for now. */
405 return (ENOSPC); /* TODO: proper error */
413 * Drops value reference for unused values (updates, deletes, partially
414 * successful adds or rollbacks).
/*
 * Releases value references that are no longer needed for the @count
 * entries in @tei.  Entries with a zero value index and, unless
 * @rollback is set, entries flagged TEI_FLAGS_ADDED are tested for
 * separately; unref_table_value() is called for the others.
 * The value pointers are always taken from the shared storage
 * (get_value_ptrs() is called with vshared == 1).
 */
417 ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc,
418 struct tentry_info *tei, uint32_t count, int rollback)
421 struct tentry_info *ptei;
422 struct table_value *pval;
423 struct namedobj_instance *vi;
426 * We have two slightly different ADD cases here:
427 * either (1) we are successful / partially successful,
428 * in that case we need
429 * * to ignore ADDED entries values
430 * * rollback every other values (either UPDATED since
431 * old value has been stored there, or some failure like
432 * EXISTS or LIMIT or simply "ignored" case.
434 * (2): atomic rollback of partially successful operation
435 * in that case we simply need to unref all entries.
437 * DELETE case is simpler: no atomic support there, so
438 * we simply unref all non-zero values.
442 * Get current table value pointers.
443 * XXX: Properly read vshared
445 get_value_ptrs(ch, tc, 1, &pval, &vi);
447 for (i = 0; i < count; i++) {
450 if (ptei->value == 0) {
453 * We may be deleting non-existing record.
459 if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) {
464 unref_table_value(vi, pval, ptei->value);
470 * Main function used to link values of entries going to be added,
471 * to the index. Since we may perform many UH locks drops/acquires,
472 * handle changes by checking tablestate "modified" field.
/*
 * Resolves the value of every entry in the operation @ts to an index in
 * the value array, in three stages:
 *   1. look up already known values by their masked contents and record
 *      their indices;
 *   2. allocate link objects for entries whose value was not found;
 *   3. allocate indices via alloc_table_vidx() and add the new values to
 *      the array and to the value hash.
 * ts->modified is checked after steps that may drop the UH lock, so the
 * caller can restart the operation.
 * Asserts that the UH write lock is held.
 */
477 ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts)
480 struct namedobj_instance *vi;
481 struct table_config *tc;
482 struct tentry_info *tei, *ptei;
483 uint32_t count, vlimit;
485 struct table_val_link *ptv;
486 struct table_value tval, *pval;
489 * Stage 1: reference all existing values and
490 * save their indices.
492 IPFW_UH_WLOCK_ASSERT(ch);
493 get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);
497 vlimit = ts->ta->vlimit;
502 for (i = 0; i < count; i++) {
504 ptei->value = 0; /* Ensure value is always 0 in the beginning */
505 mask_table_value(ptei->pvalue, &tval, ts->vmask);
506 ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
510 /* Deal with vlimit later */
511 if (vlimit > 0 && vlimit <= ptv->no.kidx)
514 /* Value found. Bump refcount */
516 ptei->value = ptv->no.kidx;
520 if (ts->count == found) {
521 /* We've found all values, no need to create new ones */
526 * we have added some state here, let's attach operation
527 * state ts the list ts be able ts rollback if necessary.
529 add_toperation_state(ch, ts);
530 /* Ensure table won't disappear */
535 * Stage 2: allocate objects for non-existing values.
537 for (i = 0; i < count; i++) {
539 if (ptei->value != 0)
541 if (ptei->ptv != NULL)
543 ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW,
548 * Stage 3: allocate index numbers for new values
549 * and link them to index.
553 del_toperation_state(ch, ts);
554 if (ts->modified != 0) {
557 * In general, we should free all state/indexes here
558 * and return. However, we keep allocated state instead
559 * to ensure we achieve some progress on each restart.
564 KASSERT(pval == ch->valuestate, ("resize_storage() notify failure"));
566 /* Let's try to link values */
567 for (i = 0; i < count; i++) {
570 /* Check if record has appeared */
571 mask_table_value(ptei->pvalue, &tval, ts->vmask);
572 ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,
576 ptei->value = ptv->no.kidx;
580 /* May perform UH unlock/lock */
581 error = alloc_table_vidx(ch, ts, vi, &vidx);
583 ts->opstate.func(ts->tc, &ts->opstate);
586 /* value storage resize has happened, return */
587 if (ts->modified != 0)
590 /* Finally, we have allocated valid index, let's add entry */
592 ptv = (struct table_val_link *)ptei->ptv;
/* The object's name points at the value slot itself; the slot starts with one reference. */
596 ptv->no.name = (char *)&pval[vidx];
597 ptv->pval = &pval[vidx];
598 memcpy(ptv->pval, &tval, sizeof(struct table_value));
599 pval[vidx].refcnt = 1;
600 ipfw_objhash_add(vi, &ptv->no);
607 * Compatibility function used to import data from old
608 * IP_FW_TABLE_ADD / IP_FW_TABLE_XADD opcodes.
/*
 * Fills @v from a legacy 32-bit table value: @v is zeroed first and
 * @value is then stored into it (nh4 receives it in host format).
 */
611 ipfw_import_table_value_legacy(uint32_t value, struct table_value *v)
614 memset(v, 0, sizeof(*v));
622 v->nh4 = value; /* host format */
628 * Export data to legacy table dumps opcodes.
/*
 * Produces the legacy representation of @v for the old table dump
 * opcodes.
 * NOTE(review): which field of @v is exported is not shown here -- confirm.
 */
631 ipfw_export_table_value_legacy(struct table_value *v)
635 * TODO: provide more compatibility depending on
642 * Imports table value from current userland format.
643 * Saves value in kernel format to the same place.
/*
 * Converts @iv in place from the userland ipfw_table_value layout to the
 * kernel struct table_value layout.  Fields are copied one by one into a
 * zeroed temporary, which is then copied back over @iv.
 */
646 ipfw_import_table_value_v1(ipfw_table_value *iv)
648 struct table_value v;
650 memset(&v, 0, sizeof(v));
653 v.divert = iv->divert;
654 v.skipto = iv->skipto;
655 v.netgraph = iv->netgraph;
662 v.zoneid = iv->zoneid;
/* NOTE(review): copies sizeof(ipfw_table_value) bytes out of v; assumes the two layouts have matching sizes -- confirm. */
664 memcpy(iv, &v, sizeof(ipfw_table_value));
668 * Export real table value @v to current userland format.
669 * Note that @v and @piv may point to the same memory.
/*
 * Converts kernel value @v to the userland ipfw_table_value layout and
 * stores the result in @piv.  The conversion goes through a zeroed local
 * copy that is written out with one final memcpy(), which is what allows
 * @v and @piv to refer to the same memory.
 */
672 ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv)
676 memset(&iv, 0, sizeof(iv));
679 iv.divert = v->divert;
680 iv.skipto = v->skipto;
681 iv.netgraph = v->netgraph;
688 iv.zoneid = v->zoneid;
690 memcpy(piv, &iv, sizeof(iv));
694 * Exports real value data into ipfw_table_value structure.
695 * Utilizes "spare1" field to store kernel index.
/*
 * Hash-walk callback used by list_table_values(): reserves room for one
 * value in the sockopt reply buffer (da->sd), copies the value linked
 * from @no into it and stores the object's kernel index in spare1.
 * @arg is a struct vdump_args.
 */
698 dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)
700 struct vdump_args *da;
701 struct table_val_link *ptv;
702 struct table_value *v;
704 da = (struct vdump_args *)arg;
705 ptv = (struct table_val_link *)no;
707 v = (struct table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v));
708 /* Out of memory, returning */
714 memcpy(v, ptv->pval, sizeof(*v));
715 v->spare1 = ptv->no.kidx;
720 * Dumps all shared/table value data
721 * Data layout (v1)(current):
722 * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
723 * Reply: [ ipfw_obj_lheader ipfw_table_value x N ]
725 * Returns 0 on success
/*
 * Sockopt handler for IP_FW_TABLE_VLIST: writes the list header followed
 * by every value found in the chain's value hash into @sd.
 * The required size is derived from the number of objects in the hash;
 * the header is filled in even when the caller's buffer (olh->size) is
 * too small for the values themselves.
 */
728 list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
729 struct sockopt_data *sd)
731 struct _ipfw_obj_lheader *olh;
732 struct namedobj_instance *vi;
733 struct vdump_args da;
734 uint32_t count, size;
736 olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
/* The size announced in the header must not exceed the supplied data. */
739 if (sd->valsize < olh->size)
743 vi = CHAIN_TO_VI(ch);
745 count = ipfw_objhash_count(vi);
/* NOTE(review): the result is stored in a uint32_t; confirm it cannot wrap for large counts. */
746 size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader);
748 /* Fill in header regardless of buffer size */
750 olh->objsize = sizeof(ipfw_table_value);
752 if (size > olh->size) {
760 * Do the actual value dump
762 memset(&da, 0, sizeof(da));
765 ipfw_objhash_foreach(vi, dump_tvalue, &da);
/*
 * Sets up value storage for chain @ch: allocates a zeroed array of
 * VALDATA_START_SIZE values (M_WAITOK), creates the value hash with the
 * same size, installs hash_table_value() on it and registers this file's
 * sockopt handlers via IPFW_ADD_SOPT_HANDLER(first, scodes).
 */
773 ipfw_table_value_init(struct ip_fw_chain *ch, int first)
775 struct tables_config *tcfg;
777 ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value),
778 M_IPFW, M_WAITOK | M_ZERO);
782 tcfg->val_size = VALDATA_START_SIZE;
783 tcfg->valhash = ipfw_objhash_create(tcfg->val_size);
784 ipfw_objhash_set_funcs(tcfg->valhash, hash_table_value,
787 IPFW_ADD_SOPT_HANDLER(first, scodes);
/*
 * Hash-walk callback that ipfw_table_value_destroy() runs over every
 * object in the value hash.
 * NOTE(review): presumably releases the link object @no -- confirm
 * against the function body.
 */
791 destroy_value(struct namedobj_instance *ni, struct named_object *no,
/*
 * Tears down value storage for chain @ch: unregisters the sockopt
 * handlers via IPFW_DEL_SOPT_HANDLER(last, scodes), frees the value
 * array, runs destroy_value() on every object of the value hash and
 * finally destroys the hash itself.
 */
800 ipfw_table_value_destroy(struct ip_fw_chain *ch, int last)
803 IPFW_DEL_SOPT_HANDLER(last, scodes);
/* NOTE(review): the array is freed before the hash walk; confirm destroy_value() never dereferences the value slots. */
805 free(ch->valuestate, M_IPFW);
806 ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch);
807 ipfw_objhash_destroy(CHAIN_TO_VI(ch));