2 * SPDX-License-Identifier: BSD-2-Clause
4 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include "opt_netlink.h"
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 #include <sys/param.h>
33 #include <sys/malloc.h>
35 #include <sys/rmlock.h>
38 #include <sys/socket.h>
39 #include <sys/socketvar.h>
40 #include <sys/syslog.h>
42 #include <netlink/netlink.h>
43 #include <netlink/netlink_ctl.h>
44 #include <netlink/netlink_linux.h>
45 #include <netlink/netlink_var.h>
47 #define DEBUG_MOD_NAME nl_writer
48 #define DEBUG_MAX_LEVEL LOG_DEBUG3
49 #include <netlink/netlink_debug.h>
50 _DECLARE_DEBUG(LOG_INFO);
53 * The goal of this file is to provide convenient message writing KPI on top of
54 * different storage methods (mbufs, uio, temporary memory chunks).
56 * The main KPI guarantee is that the (last) message always resides in a contiguous
57 * memory buffer, so one is able to update the header after writing the entire message.
59 * This guarantee comes with a side effect of potentially reallocating underlying
60 * buffer, so one needs to update the desired pointers after something is added
63 * Messaging layer contains hooks performing transparent Linux translation for the messages.
65 * There are 3 types of supported targets:
66 * * socket (adds mbufs to the socket buffer, used for message replies)
67 * * group (sends mbuf/chain to the specified groups, used for the notifications)
68 * * chain (returns mbuf chain, used in Linux message translation code)
70 * There are 3 types of storage:
71 * * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message
73 * * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs
74 * to be larger than one supported by NS_WRITER_TYPE_MBUF)
75 * * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for
76 * Linux sockets, calls translation hook prior to sending messages to the socket).
78 * Internally, KPI switches between different types of storage when memory requirements
79 * change. It happens transparently to the caller.
/*
 * Storage backend hooks.
 * nlwriter_op_init sets up storage of @size bytes for @nw.
 * nlwriter_op_write hands the accumulated storage (@buf, @buflen bytes,
 * @cnt messages) over to a target.
 */
83 typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok);
84 typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen, int cnt);
/*
 * Per-storage-type operation table entry: one init hook and one write hook
 * for each target (socket, group, chain). Entries live in nlmsg_writers[]
 * and are selected by nw->writer_type.
 */
87 nlwriter_op_init *init;
88 nlwriter_op_write *write_socket;
89 nlwriter_op_write *write_group;
90 nlwriter_op_write *write_chain;
95 * Writes message to a temporary memory buffer,
96 * flushing to the socket/group when buffer size limit is reached
/*
 * NS_WRITER_TYPE_BUF init hook.
 * Allocates @size zeroed bytes from M_NETLINK (M_WAITOK when @waitok is set,
 * M_NOWAIT otherwise) and records the buffer, its length and the malloc flag
 * in @nw. The statement taken when the allocation fails is not visible in
 * this excerpt.
 */
99 nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok)
101 int mflag = waitok ? M_WAITOK : M_NOWAIT;
102 nw->_storage = malloc(size, M_NETLINK, mflag | M_ZERO);
103 if (__predict_false(nw->_storage == NULL))
105 nw->alloc_len = size;
/* For this storage type the data pointer is the allocation itself. */
108 nw->data = nw->_storage;
109 nw->writer_type = NS_WRITER_TYPE_BUF;
/* Remembered so that later allocations for this writer use the same flag. */
110 nw->malloc_flag = mflag;
111 nw->num_messages = 0;
/*
 * NS_WRITER_TYPE_BUF socket write hook.
 * Copies @datalen bytes of the malloc-based @buf into a newly allocated mbuf
 * chain and passes it, with the message count @cnt, to nl_send_one() for the
 * socket stored in nw->arg.ptr. @buf is freed on every visible path.
 * Note that the result of m_append() is not checked here.
 */
117 nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
119 NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
/* Nothing to send: only release the staging buffer. */
120 if (__predict_false(datalen == 0)) {
121 free(buf, M_NETLINK);
125 struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR);
126 if (__predict_false(m == NULL)) {
127 /* XXX: should we set sorcverr? */
128 free(buf, M_NETLINK);
131 m_append(m, datalen, buf);
132 free(buf, M_NETLINK);
/* Propagate the writer's ignore_limit setting as an I/O flag. */
134 int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
135 return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
/*
 * NS_WRITER_TYPE_BUF group write hook.
 * Copies @datalen bytes of the malloc-based @buf into a newly allocated mbuf
 * chain and hands it to nl_send_group() for the protocol/group stored in
 * nw->arg.group. @buf is freed on every visible path. The handling of a
 * failed m_append() ("success" being false) is not visible in this excerpt.
 */
139 nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
141 NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
142 nw->arg.group.proto, nw->arg.group.id);
/* Nothing to send: only release the staging buffer. */
143 if (__predict_false(datalen == 0)) {
144 free(buf, M_NETLINK);
148 struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR);
149 if (__predict_false(m == NULL)) {
150 free(buf, M_NETLINK);
/* The data now lives in the mbuf chain, so the staging buffer can go. */
153 bool success = m_append(m, datalen, buf) != 0;
154 free(buf, M_NETLINK);
159 nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
164 nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
166 struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);
167 NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
169 if (__predict_false(datalen == 0)) {
170 free(buf, M_NETLINK);
177 m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR);
178 if (__predict_false(m == NULL)) {
179 free(buf, M_NETLINK);
184 if (__predict_false(m_append(*m0, datalen, buf) == 0)) {
185 free(buf, M_NETLINK);
193 * NS_WRITER_TYPE_MBUF
194 * Writes message to the allocated mbuf,
195 * flushing to socket/group when mbuf size limit is reached.
196 * This is the most efficient mechanism as it avoids double-copying.
198 * Allocates a single mbuf suitable to store up to @size bytes of data.
199 * If size < MHLEN (around 160 bytes), allocates mbuf with pkthdr
200 * If size <= MCLBYTES (2k), allocate a single mbuf cluster
201 * Otherwise, return NULL.
/*
 * NS_WRITER_TYPE_MBUF init hook.
 * Obtains an mbuf with a packet header via m_get2() for @size bytes and makes
 * it the writer storage: nw->_storage holds the mbuf itself, nw->data points
 * at its data area and nw->alloc_len is the trailing space reported by
 * M_TRAILINGSPACE(). The first @size bytes of the data area are zeroed.
 * The statement taken when the allocation fails is not visible here.
 */
204 nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok)
208 int mflag = waitok ? M_WAITOK : M_NOWAIT;
209 m = m_get2(size, mflag, MT_DATA, M_PKTHDR);
210 if (__predict_false(m == NULL))
/* Usable length is what the mbuf offers, not the requested size. */
212 nw->alloc_len = M_TRAILINGSPACE(m);
215 nw->_storage = (void *)m;
216 nw->data = mtod(m, void *);
217 nw->writer_type = NS_WRITER_TYPE_MBUF;
218 nw->malloc_flag = mflag;
219 nw->num_messages = 0;
221 memset(nw->data, 0, size);
222 NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p",
223 m, size, nw->alloc_len, nw->data);
/*
 * NS_WRITER_TYPE_MBUF socket write hook.
 * @buf is the storage mbuf itself. Sets its packet header length to @datalen
 * and passes it, with the message count @cnt, to nl_send_one() for the socket
 * stored in nw->arg.ptr. The body of the empty-data branch is not visible in
 * this excerpt.
 */
228 nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
230 struct mbuf *m = (struct mbuf *)buf;
231 NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
233 if (__predict_false(datalen == 0)) {
238 m->m_pkthdr.len = datalen;
/* Propagate the writer's ignore_limit setting as an I/O flag. */
240 int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
241 return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
/*
 * NS_WRITER_TYPE_MBUF group write hook.
 * @buf is the storage mbuf itself. Sets its packet header length to @datalen
 * and hands it to nl_send_group() for the protocol/group stored in
 * nw->arg.group. The body of the empty-data branch is not visible in this
 * excerpt.
 */
245 nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
247 struct mbuf *m = (struct mbuf *)buf;
248 NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
249 nw->arg.group.proto, nw->arg.group.id);
251 if (__predict_false(datalen == 0)) {
256 m->m_pkthdr.len = datalen;
258 nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
/*
 * NS_WRITER_TYPE_MBUF chain write hook.
 * @buf is the storage mbuf itself. Sets its lengths to @datalen and links it
 * after the last mbuf of the chain whose head pointer is stored in
 * nw->arg.ptr, growing the head's packet header length by @datalen.
 * The empty-data branch and the handling of a not-yet-existing chain are not
 * visible in this excerpt.
 */
263 nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
265 struct mbuf *m_new = (struct mbuf *)buf;
266 struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);
268 NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
270 if (__predict_false(datalen == 0)) {
275 m_new->m_pkthdr.len = datalen;
276 m_new->m_len = datalen;
/* Walk to the tail of the existing chain and append the new mbuf there. */
282 for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next)
284 m_last->m_next = m_new;
/* Only the chain head carries the total packet length. */
285 (*m0)->m_pkthdr.len += datalen;
292 * NS_WRITER_TYPE_LBUF
293 * Writes message to the allocated memory buffer,
294 * flushing to socket/group when mbuf size limit is reached.
295 * Calls linux handler to rewrite messages before sending to the socket.
/*
 * NS_WRITER_TYPE_LBUF init hook.
 * Performs one zeroed allocation that starts with a struct linear_buffer
 * header. The message area of @size bytes (rounded up to pointer size)
 * follows the header directly and becomes nw->data. The region after the
 * message area is described by the header (lb->base / lb->size) and spans
 * size + SCRATCH_BUFFER_SIZE bytes.
 * The statement taken when the allocation fails is not visible here.
 */
298 nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok)
300 int mflag = waitok ? M_WAITOK : M_NOWAIT;
301 size = roundup2(size, sizeof(void *));
302 int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE;
/* header + message area (size) + linear buffer region (size + scratch) */
303 char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO);
304 if (__predict_false(buf == NULL))
307 /* Fill buffer header first */
308 struct linear_buffer *lb = (struct linear_buffer *)buf;
309 lb->base = &buf[sizeof(struct linear_buffer) + size];
310 lb->size = size + SCRATCH_BUFFER_SIZE;
312 nw->alloc_len = size;
/* Message data starts right behind the linear_buffer header. */
316 nw->data = (char *)(lb + 1);
317 nw->malloc_flag = mflag;
318 nw->writer_type = NS_WRITER_TYPE_LBUF;
319 nw->num_messages = 0;
/*
 * NS_WRITER_TYPE_LBUF socket write hook.
 * @buf starts with a struct linear_buffer header followed by the message
 * data. When the linux_netlink_p hook table is set, its msgs_to_linux()
 * callback converts the data into an mbuf; @buf is then freed and the
 * resulting mbuf is passed to nl_send_one() for the socket in nw->arg.ptr.
 * A NULL mbuf (no hook table, or the callback returned NULL) is treated as a
 * failure; the statements of that branch are not visible in this excerpt.
 */
325 nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
327 struct linear_buffer *lb = (struct linear_buffer *)buf;
328 char *data = (char *)(lb + 1);
329 struct nlpcb *nlp = (struct nlpcb *)(nw->arg.ptr);
/* Nothing to send: only release the buffer. */
331 if (__predict_false(datalen == 0)) {
332 free(buf, M_NETLINK);
336 struct mbuf *m = NULL;
337 if (linux_netlink_p != NULL)
338 m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp);
339 free(buf, M_NETLINK);
341 if (__predict_false(m == NULL)) {
342 /* XXX: should we set sorcverr? */
/* Propagate the writer's ignore_limit setting as an I/O flag. */
346 int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
347 return (nl_send_one(m, nlp, cnt, io_flags));
/*
 * NS_WRITER_TYPE_LBUF group write hook.
 * Copies @datalen bytes of message data (located right after the
 * struct linear_buffer header in @buf) into a newly allocated mbuf chain and
 * hands it to nl_send_group() for the protocol/group in nw->arg.group.
 * No message translation hook is called on this path. @buf is freed on every
 * visible path; the result of m_append() is not checked here.
 */
350 /* Shouldn't be called (maybe except Linux code originating message) */
352 nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
354 struct linear_buffer *lb = (struct linear_buffer *)buf;
355 char *data = (char *)(lb + 1);
357 if (__predict_false(datalen == 0)) {
358 free(buf, M_NETLINK);
362 struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR);
363 if (__predict_false(m == NULL)) {
364 free(buf, M_NETLINK);
367 m_append(m, datalen, data);
368 free(buf, M_NETLINK);
370 nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
/*
 * Operation table indexed by writer type (see nlmsg_set_callback() and
 * nlmsg_get_buf_type(), which both index it with the type value).
 * NOTE(review): no write_chain hook is listed for NS_WRITER_TYPE_LBUF in the
 * visible lines, which would leave that member NULL - confirm that the chain
 * target is never combined with this storage type.
 */
374 static const struct nlwriter_ops nlmsg_writers[] = {
375 /* NS_WRITER_TYPE_MBUF */
377 .init = nlmsg_get_ns_mbuf,
378 .write_socket = nlmsg_write_socket_mbuf,
379 .write_group = nlmsg_write_group_mbuf,
380 .write_chain = nlmsg_write_chain_mbuf,
382 /* NS_WRITER_TYPE_BUF */
384 .init = nlmsg_get_ns_buf,
385 .write_socket = nlmsg_write_socket_buf,
386 .write_group = nlmsg_write_group_buf,
387 .write_chain = nlmsg_write_chain_buf,
389 /* NS_WRITER_TYPE_LBUF */
391 .init = nlmsg_get_ns_lbuf,
392 .write_socket = nlmsg_write_socket_lbuf,
393 .write_group = nlmsg_write_group_lbuf,
/*
 * Selects the flush callback (nw->cb) for @nw: the operation table entry is
 * chosen by nw->writer_type and the hook within it by nw->writer_target.
 * Both fields must therefore be set before calling this function.
 * An unknown target triggers a panic.
 */
398 nlmsg_set_callback(struct nl_writer *nw)
400 const struct nlwriter_ops *pops = &nlmsg_writers[nw->writer_type];
402 switch (nw->writer_target) {
403 case NS_WRITER_TARGET_SOCKET:
404 nw->cb = pops->write_socket;
406 case NS_WRITER_TARGET_GROUP:
407 nw->cb = pops->write_group;
409 case NS_WRITER_TARGET_CHAIN:
410 nw->cb = pops->write_chain;
413 panic("not implemented");
/*
 * Initializes storage of the given writer @type for @nw by calling the init
 * hook of the matching nlmsg_writers[] entry and returns its result.
 * @type is asserted (MPASS) to be within the bounds of the table.
 */
418 nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok)
420 MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0]));
421 NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type);
422 return (nlmsg_writers[type].init(nw, size, waitok));
/*
 * Picks a storage type for a buffer of @size bytes and initializes it.
 * Requests of up to MCLBYTES use NS_WRITER_TYPE_MBUF, larger ones
 * NS_WRITER_TYPE_BUF. NS_WRITER_TYPE_LBUF is presumably selected when
 * @is_linux is set; the conditions around these assignments are not visible
 * in this excerpt - verify against the full source.
 */
426 nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux)
431 if (__predict_true(size <= MCLBYTES))
432 type = NS_WRITER_TYPE_MBUF;
434 type = NS_WRITER_TYPE_BUF;
436 type = NS_WRITER_TYPE_LBUF;
437 return (nlmsg_get_buf_type(nw, size, type, waitok));
/*
 * Sets up @nw as a writer that delivers to the socket @nlp.
 * Storage of @size bytes is requested without waiting (waitok is false); the
 * Linux storage variant is requested according to nlp->nl_linux.
 * On success the target is set to NS_WRITER_TARGET_SOCKET and the matching
 * flush callback is installed.
 */
441 _nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp)
443 if (!nlmsg_get_buf(nw, size, false, nlp->nl_linux))
445 nw->arg.ptr = (void *)nlp;
446 nw->writer_target = NS_WRITER_TARGET_SOCKET;
447 nlmsg_set_callback(nw);
/*
 * Sets up @nw as a writer that delivers to the group @group_id of netlink
 * protocol @protocol. Storage of @size bytes is requested without waiting and
 * never in the Linux variant. On success the target is set to
 * NS_WRITER_TARGET_GROUP and the matching flush callback is installed.
 */
452 _nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id)
454 if (!nlmsg_get_buf(nw, size, false, false))
456 nw->arg.group.proto = protocol;
457 nw->arg.group.id = group_id;
458 nw->writer_target = NS_WRITER_TARGET_GROUP;
459 nlmsg_set_callback(nw);
/*
 * Sets up @nw as a writer that collects its output in the mbuf chain whose
 * head pointer is @pm. Storage of @size bytes is requested without waiting
 * and never in the Linux variant. On success the target is set to
 * NS_WRITER_TARGET_CHAIN and the matching flush callback is installed.
 */
464 _nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm)
466 if (!nlmsg_get_buf(nw, size, false, false))
469 nw->arg.ptr = (void *)pm;
470 nw->writer_target = NS_WRITER_TARGET_CHAIN;
471 nlmsg_set_callback(nw);
472 NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf);
/*
 * Marks @nw so that socket writes are issued with NL_IOF_IGNORE_LIMIT
 * (the socket write hooks translate nw->ignore_limit into that I/O flag).
 */
477 _nlmsg_ignore_limit(struct nl_writer *nw)
479 nw->ignore_limit = true;
/*
 * Hands the accumulated data of @nw to its flush callback.
 * If a message is still open (nw->hdr is set), the offset is first cut back
 * to the start of that message so only completed messages are sent.
 * The callback receives the storage, the data length and the message count;
 * a failure is logged. Statements between the visible lines (including the
 * return value handling) are not part of this excerpt.
 */
483 _nlmsg_flush(struct nl_writer *nw)
486 if (__predict_false(nw->hdr != NULL)) {
487 /* Last message has not been completed, skip it. */
488 int completed_len = (char *)nw->hdr - nw->data;
489 /* Send completed messages */
/* Equivalent to setting nw->offset to completed_len. */
490 nw->offset -= nw->offset - completed_len;
494 NL_LOG(LOG_DEBUG2, "OUT");
495 bool result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages);
499 NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed", nw, nw->offset, nw->cb);
506 * Flushes previous data and allocates new underlying storage
507 * sufficient for holding at least @required_len bytes.
508 * Return true on success.
/*
 * Replaces the storage of @nw with a new one able to hold @required_len
 * bytes. A temporary writer (ns_new) is set up with the new storage, the
 * target, callback arguments and ignore_limit setting of @nw are carried
 * over, and the bytes of the unfinished last message are copied to the start
 * of the new storage. Finally the temporary writer is copied over @nw.
 * Several short statements (size computation details, the flush call,
 * return statements) are not visible in this excerpt.
 */
511 _nlmsg_refill_buffer(struct nl_writer *nw, int required_len)
513 struct nl_writer ns_new = {};
514 int completed_len, new_len;
519 NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim",
520 nw->offset, nw->alloc_len, required_len);
522 /* Calculate new buffer size and allocate it */
/* Bytes belonging to finished messages: everything before the open header. */
523 completed_len = (nw->hdr != NULL) ? (char *)nw->hdr - nw->data : nw->offset;
524 if (completed_len > 0 && required_len < MCLBYTES) {
525 /* We already ran out of space, use the largest effective size */
526 new_len = max(nw->alloc_len, MCLBYTES);
528 if (nw->alloc_len < MCLBYTES)
531 new_len = nw->alloc_len * 2;
532 while (new_len < required_len)
/* Keep the allocation mode and storage flavour of the current writer. */
535 bool waitok = (nw->malloc_flag == M_WAITOK);
536 bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF);
537 if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) {
539 NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM");
542 if (nw->ignore_limit)
543 nlmsg_ignore_limit(&ns_new);
545 /* Update callback data */
546 ns_new.writer_target = nw->writer_target;
547 nlmsg_set_callback(&ns_new);
548 ns_new.arg = nw->arg;
550 /* Copy last (unfinished) header to the new storage */
551 int last_len = nw->offset - completed_len;
553 memcpy(ns_new.data, nw->hdr, last_len);
554 ns_new.hdr = (struct nlmsghdr *)ns_new.data;
555 ns_new.offset = last_len;
558 NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len);
560 /* Flush completed headers & switch to the new nw */
562 memcpy(nw, &ns_new, sizeof(struct nl_writer));
563 NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes", nw->offset, nw->alloc_len);
/*
 * Starts a new message in @nw.
 * Asserts that no message is currently open, makes sure there is room for
 * the header plus @len payload bytes (aligned with NETLINK_ALIGN), refilling
 * the buffer when needed, then writes the netlink header at the current
 * offset and advances the offset past it. nlmsg_len is preliminarily set to
 * @len; _nlmsg_end() overwrites it with the actual message length.
 * The failure return and the nw->hdr assignment are not visible here.
 */
569 _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
570 uint16_t flags, uint32_t len)
572 struct nlmsghdr *hdr;
574 MPASS(nw->hdr == NULL);
576 int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
577 if (__predict_false(nw->offset + required_len > nw->alloc_len)) {
578 if (!nlmsg_refill_buffer(nw, required_len))
/* Computed after a possible refill, which may have replaced nw->data. */
582 hdr = (struct nlmsghdr *)(&nw->data[nw->offset]);
584 hdr->nlmsg_len = len;
585 hdr->nlmsg_type = type;
586 hdr->nlmsg_flags = flags;
587 hdr->nlmsg_seq = seq;
588 hdr->nlmsg_pid = portid;
591 nw->offset += sizeof(struct nlmsghdr);
/*
 * Finishes the currently open message of @nw (asserted to exist).
 * The final nlmsg_len is computed as the distance from the message header to
 * the current write offset. The condition guarding the "ENOMEM" log line and
 * the remaining bookkeeping are not visible in this excerpt.
 */
597 _nlmsg_end(struct nl_writer *nw)
599 MPASS(nw->hdr != NULL);
602 NL_LOG(LOG_DEBUG, "ENOMEM when dumping message");
607 nw->hdr->nlmsg_len = (uint32_t)(nw->data + nw->offset - (char *)nw->hdr);
608 NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
609 nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags,
610 nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid);
/*
 * Discards the currently open message of @nw, if any, by moving the write
 * offset back to the start of that message header.
 */
617 _nlmsg_abort(struct nl_writer *nw)
619 if (nw->hdr != NULL) {
620 nw->offset = (uint32_t)((char *)nw->hdr - nw->data);
/*
 * Writes an NLMSG_ERROR reply for the request @hdr into the writer npt->nw.
 * The reply carries @error and a copy of the request: only its header when
 * the ack is "capped" (no error, or the socket has NLF_CAP_ACK set), the
 * whole message otherwise. When the socket has NLF_EXT_ACK set, the error
 * message string and error offset from @npt are appended as attributes;
 * a cookie attribute from @npt is appended whenever present.
 * Allocation failures are logged. Some short statements (declarations, the
 * branch on cap_ack, message finalization) are not visible in this excerpt.
 */
626 nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr,
627 struct nl_pstate *npt)
629 struct nlmsgerr *errmsg;
631 uint32_t flags = nlp->nl_flags;
632 struct nl_writer *nw = npt->nw;
635 payload_len = sizeof(struct nlmsgerr);
638 * The only case when we send the full message in the
639 * reply is when there is an error and NETLINK_CAP_ACK
642 cap_ack = (error == 0) || (flags & NLF_CAP_ACK);
/* Room for the request payload beyond the header already in nlmsgerr. */
644 payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr);
645 payload_len = NETLINK_ALIGN(payload_len);
647 uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0;
648 if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK)
649 nl_flags |= NLM_F_ACK_TLVS;
651 NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d",
652 hdr->nlmsg_type, hdr->nlmsg_seq);
654 if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags, payload_len))
657 errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr);
658 errmsg->error = error;
659 /* In case of error copy the whole message, else just the header */
660 memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len);
662 if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK)
663 nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg);
664 if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK)
665 nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off);
666 if (npt->cookie != NULL)
667 nlattr_add_raw(nw, npt->cookie);
672 NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u",
673 hdr->nlmsg_type, hdr->nlmsg_seq);
/*
 * Terminates a dump by adding an NLMSG_DONE message to @nw, addressed with
 * the port id and sequence number of the request @hdr. The message has an
 * int-sized payload reserved for the operation result @error, and the writer
 * is marked with suppress_ack. Failure to add the message is logged.
 * The remaining statements of this function are not visible in this excerpt.
 */
678 _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
680 if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) {
681 NL_LOG(LOG_DEBUG, "Error finalizing table dump");
684 /* Save operation result */
685 int *perror = nlmsg_reserve_object(nw, int);
686 NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error,
690 nw->suppress_ack = true;