2 * SPDX-License-Identifier: BSD-2-Clause
4 * Copyright (c) 2021 Ng Peng Nam Sean
5 * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
33 #include <sys/malloc.h>
35 #include <sys/mutex.h>
36 #include <sys/socket.h>
37 #include <sys/socketvar.h>
38 #include <sys/syslog.h>
40 #include <netlink/netlink.h>
41 #include <netlink/netlink_ctl.h>
42 #include <netlink/netlink_linux.h>
43 #include <netlink/netlink_var.h>
45 #define DEBUG_MOD_NAME nl_io
46 #define DEBUG_MAX_LEVEL LOG_DEBUG3
47 #include <netlink/netlink_debug.h>
48 _DECLARE_DEBUG(LOG_DEBUG);
51 * The logic below provides a p2p interface for receiving and
52 * sending netlink data between the kernel and userland.
/*
 * Source address used for data delivered to a socket's receive buffer:
 * a netlink sockaddr with nl_pid set to 0. nl_empty_src is the same
 * object viewed as a generic sockaddr, which is the form handed to the
 * sbappendaddr*() calls further down in this file.
 */
55 static const struct sockaddr_nl _nl_empty_src = {
56 .nl_len = sizeof(struct sockaddr_nl),
57 .nl_family = PF_NETLINK,
58 .nl_pid = 0 /* comes from the kernel */
60 static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;
/* Forward declaration: used by nl_process_received_one() before its definition. */
62 static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp);
/*
 * Appends mbuf data to the tail of queue @q and adds its byte count
 * (as reported by m_length()) to q->length.
 *
 * NOTE(review): the statements deriving `m` from the @mq argument are not
 * visible in this extract; presumably @mq is a packet chain that is walked
 * one packet at a time -- confirm against the full source.
 */
66 queue_push(struct nl_io_queue *q, struct mbuf *mq)
73 q->length += m_length(m, NULL);
74 STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
/*
 * Inserts @m at the head of queue @q and adds its byte count to q->length.
 * @m must be a single packet: the assertion below requires that it is not
 * linked to a following packet via m_nextpkt.
 */
79 queue_push_head(struct nl_io_queue *q, struct mbuf *m)
81 MPASS(m->m_nextpkt == NULL);
83 q->length += m_length(m, NULL);
84 STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt);
/*
 * Detaches the first mbuf from queue @q, if there is one, and subtracts its
 * byte count from q->length.
 *
 * NOTE(review): the return statements are not visible in this extract.
 * Callers assign the result to a struct mbuf pointer, so presumably the
 * detached mbuf is returned (and NULL when the list is empty) -- confirm.
 */
88 queue_pop(struct nl_io_queue *q)
90 if (!STAILQ_EMPTY(&q->head)) {
91 struct mbuf *m = STAILQ_FIRST(&q->head);
92 STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
94 q->length -= m_length(m, NULL);
/*
 * Returns the first mbuf of queue @q without removing it.
 * The caller in tx_check_locked() tests the result for NULL before use.
 */
102 queue_head(const struct nl_io_queue *q)
104 return (STAILQ_FIRST(&q->head));
/*
 * Reports whether queue @q holds no data. Emptiness is decided from the
 * tracked byte count (q->length), not from the list head.
 */
108 queue_empty(const struct nl_io_queue *q)
110 return (q->length == 0);
/*
 * Drains queue @q by detaching the first mbuf until the list is empty.
 *
 * NOTE(review): the statement that disposes of each detached mbuf, and any
 * reset of q->length, are not visible in this extract -- confirm against
 * the full source.
 */
114 queue_free(struct nl_io_queue *q)
116 while (!STAILQ_EMPTY(&q->head)) {
117 struct mbuf *m = STAILQ_FIRST(&q->head);
118 STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
/*
 * Enqueues nlp->nl_task on nlp->nl_taskqueue unless a run is already
 * pending. nl_task_pending is set before enqueueing and is cleared again by
 * nl_process_received_one(), so repeated calls in between only log that the
 * schedule was skipped.
 */
127 nl_schedule_taskqueue(struct nlpcb *nlp)
129 if (!nlp->nl_task_pending) {
130 nlp->nl_task_pending = true;
131 taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
132 NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
134 NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
/*
 * Accepts mbuf @m destined for the netlink socket @so.
 * When the pcb is active, the data is appended to the socket's so_snd buffer
 * and the taskqueue is scheduled to process it; otherwise the data is
 * reported as ignored.
 *
 * NOTE(review): @m is passed to m_length() for logging after it has been
 * handed to sbappend(); confirm that sbappend() cannot free or merge the
 * mbuf before that read.
 * NOTE(review): the return value and what happens to @m on the non-active
 * path are not visible in this extract.
 */
139 nl_receive_async(struct mbuf *m, struct socket *so)
141 struct nlpcb *nlp = sotonlpcb(so);
148 if ((__predict_true(nlp->nl_active))) {
149 sbappend(&so->so_snd, m, 0);
150 NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL));
151 nl_schedule_taskqueue(nlp);
153 NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket",
/*
 * Tries to move data from the internal TX overflow queue (nlp->tx_queue)
 * into the socket's so_rcv buffer via sbappendaddr_locked(); an mbuf is
 * popped from tx_queue only after it was appended successfully (non-zero
 * return). Calls sorwakeup() on the socket.
 *
 * The final return value is true when tx_queue is empty afterwards.
 *
 * NOTE(review): the locking statements, the loop around the append, and the
 * condition guarding sorwakeup() are not visible in this extract; the name
 * suggests a lock is expected to be held by the caller -- confirm.
 */
165 tx_check_locked(struct nlpcb *nlp)
167 if (queue_empty(&nlp->tx_queue))
171 * Check if something can be moved from the internal TX queue
172 * to the socket queue.
175 bool appended = false;
176 struct sockbuf *sb = &nlp->nl_socket->so_rcv;
180 struct mbuf *m = queue_head(&nlp->tx_queue);
181 if (m && sbappendaddr_locked(sb, nl_empty_src, m, NULL) != 0) {
182 /* appended successfully */
183 queue_pop(&nlp->tx_queue);
192 sorwakeup(nlp->nl_socket);
194 return (queue_empty(&nlp->tx_queue));
/*
 * Performs one processing pass for @nlp:
 *  - clears nl_task_pending so a new run can be scheduled;
 *  - asks tx_check_locked() to flush the TX overflow queue and does not
 *    handle RX while that queue is still non-empty;
 *  - if the internal rx_queue is empty, cuts everything currently available
 *    from the socket's so_snd buffer and stores it in rx_queue;
 *  - pops mbufs from rx_queue and passes each to nl_process_mbuf(); an mbuf
 *    handed back by nl_process_mbuf() is pushed onto the head of rx_queue;
 *  - logs when tx_queue.hiwat grew during the pass.
 *
 * nl_process_received() uses the return value as its loop condition.
 *
 * NOTE(review): locking, the assignments to `reschedule`, the conditions
 * around the push-back, and the return statements are not visible in this
 * extract -- confirm against the full source.
 */
198 nl_process_received_one(struct nlpcb *nlp)
200 bool reschedule = false;
203 nlp->nl_task_pending = false;
205 if (!tx_check_locked(nlp)) {
206 /* TX overflow queue still not empty, ignore RX */
211 if (queue_empty(&nlp->rx_queue)) {
213 * Grab all data we have from the socket TX queue
214 * and store it the internal queue, so it can be worked on
215 * w/o holding socket lock.
217 struct sockbuf *sb = &nlp->nl_socket->so_snd;
220 unsigned int avail = sbavail(sb);
222 NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail);
223 queue_push(&nlp->rx_queue, sbcut_locked(sb, avail));
227 /* Schedule another pass to read from the socket queue */
/* Remember the TX high-water mark so growth during this pass can be logged. */
231 int prev_hiwat = nlp->tx_queue.hiwat;
234 while (!queue_empty(&nlp->rx_queue)) {
235 struct mbuf *m = queue_pop(&nlp->rx_queue);
237 m = nl_process_mbuf(m, nlp);
239 queue_push_head(&nlp->rx_queue, m);
244 if (nlp->tx_queue.hiwat > prev_hiwat) {
245 NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);
/*
 * Taskqueue work body: calls nl_process_received_one() repeatedly for as
 * long as it returns true.
 */
253 nl_process_received(struct nlpcb *nlp)
255 NL_LOG(LOG_DEBUG3, "taskqueue called");
257 while (nl_process_received_one(nlp))
/*
 * Initializes the list heads of the internal RX and TX queues of @nlp.
 */
262 nl_init_io(struct nlpcb *nlp)
264 STAILQ_INIT(&nlp->rx_queue.head);
265 STAILQ_INIT(&nlp->tx_queue.head);
/*
 * Drains both internal queues (RX and TX) of @nlp via queue_free().
 */
269 nl_free_io(struct nlpcb *nlp)
271 queue_free(&nlp->rx_queue);
272 queue_free(&nlp->tx_queue);
276 * Called after some data have been read from the socket.
/*
 * If bytes were recorded as dropped and the socket is still attached, the
 * drop counters are snapshotted, reset to zero, and the overflow is logged
 * together with the current so_rcv byte and mbuf usage. The function also
 * calls nl_schedule_taskqueue().
 *
 * NOTE(review): locking statements and the exact placement of the
 * nl_schedule_taskqueue() call relative to the branch are not visible in
 * this extract -- confirm against the full source.
 */
279 nl_on_transmit(struct nlpcb *nlp)
283 struct socket *so = nlp->nl_socket;
284 if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
285 unsigned long dropped_bytes = nlp->nl_dropped_bytes;
286 unsigned long dropped_messages = nlp->nl_dropped_messages;
287 nlp->nl_dropped_bytes = 0;
288 nlp->nl_dropped_messages = 0;
290 struct sockbuf *sb = &so->so_rcv;
291 NLP_LOG(LOG_DEBUG, nlp,
292 "socket RX overflowed, %lu messages (%lu bytes) dropped. "
293 "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes,
294 sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
295 /* TODO: send netlink message */
298 nl_schedule_taskqueue(nlp);
/*
 * Taskqueue callback. @_arg is the struct nlpcb to service; the vnet context
 * is taken from its socket before nl_process_received() runs. @pending is
 * not used in the visible lines.
 *
 * NOTE(review): the matching vnet restore is not visible in this extract.
 */
303 nl_taskqueue_handler(void *_arg, int pending)
305 struct nlpcb *nlp = (struct nlpcb *)_arg;
307 CURVNET_SET(nlp->nl_socket->so_vnet);
308 nl_process_received(nlp);
/*
 * Stores @m on the TX overflow queue of @nlp and marks the socket as
 * TX-blocked. tx_queue.hiwat is raised to the new queue length when that
 * length exceeds the previous high-water mark.
 */
312 static __noinline void
313 queue_push_tx(struct nlpcb *nlp, struct mbuf *m)
315 queue_push(&nlp->tx_queue, m);
316 nlp->nl_tx_blocked = true;
318 if (nlp->tx_queue.length > nlp->tx_queue.hiwat)
319 nlp->tx_queue.hiwat = nlp->tx_queue.length;
323 * Tries to send @m to the socket @nlp.
325 * @m: mbuf(s) to send. Consumed in any case.
326 * @nlp: socket to send to
327 * @num_messages: number of messages in @m
328 * @io_flags: combination of NL_IOF_* flags
330 * Returns true on success.
331 * If no queue overruns happened, wakes up socket owner.
334 nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags)
/* Decode the NL_IOF_* bits of io_flags that are used below. */
336 bool untranslated = io_flags & NL_IOF_UNTRANSLATED;
337 bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT;
340 IF_DEBUG_LEVEL(LOG_DEBUG2) {
341 struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
342 NLP_LOG(LOG_DEBUG2, nlp,
343 "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X",
344 m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len,
/*
 * Linux-mode socket and data flagged as untranslated: let the Linux
 * translation provider rewrite the mbuf(s); @m is replaced by the result.
 */
348 if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) {
349 m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp);
/* NOTE(review): handling of a detached socket is not visible in this extract. */
356 if (__predict_false(nlp->nl_socket == NULL)) {
/*
 * The overflow queue already holds data: the new data is queued behind it
 * rather than appended to the socket directly.
 */
362 if (!queue_empty(&nlp->tx_queue)) {
364 queue_push_tx(nlp, m);
/*
 * Overflow queue is empty: try the socket receive buffer directly.
 * A non-zero return from sbappendaddr() is treated as success.
 * NOTE(review): the conditions selecting between queue_push_tx() and the
 * drop accounting below are not visible in this extract; presumably
 * ignore_limits decides -- confirm.
 */
373 struct socket *so = nlp->nl_socket;
374 if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) {
376 NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up");
379 queue_push_tx(nlp, m);
382 * Store dropped data so it can be reported
385 nlp->nl_dropped_bytes += m_length(m, NULL);
386 nlp->nl_dropped_messages += num_messages;
387 NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
388 (unsigned long)nlp->nl_dropped_messages, num_messages,
389 (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL));
/*
 * Validates and dispatches a single netlink message.
 *
 * @hdr: header of the message to process
 * @remaining_length: number of bytes available starting at @hdr
 * @nlp: socket the message was received on
 * @npt: parser state shared with the handler and the ack path
 *
 * The message is rejected when nlmsg_len exceeds @remaining_length or is
 * smaller than the header itself. Otherwise nlmsg_pid is overwritten with
 * the socket's port. Messages carrying NLM_F_REQUEST with a type of at
 * least NLMSG_MIN_TYPE are handed to the handler registered for the
 * socket's protocol; on Linux-mode sockets they are first converted with
 * msg_from_linux(), and a failed conversion is acked with EOPNOTSUPP when
 * NLM_F_ACK is set. After the handler runs, an ack is sent when NLM_F_ACK
 * is set or when the handler failed with an error other than EINTR, unless
 * the writer has suppress_ack set.
 *
 * nl_process_mbuf() stores the result in an `error` variable and stops
 * parsing on a non-zero value.
 *
 * NOTE(review): nlmsg_len is printed with %u in the first log call but with
 * %d in the two validation messages -- confirm the intended specifier.
 * NOTE(review): the error codes assigned on the validation paths and the
 * return statement are not visible in this extract.
 */
401 nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
402 struct nlpcb *nlp, struct nl_pstate *npt)
404 nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
407 NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
408 hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
411 if (__predict_false(hdr->nlmsg_len > remaining_length)) {
412 NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
413 hdr->nlmsg_len, remaining_length);
415 } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
416 NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
419 /* Stamp each message with sender pid */
420 hdr->nlmsg_pid = nlp->nl_port;
424 if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
425 NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
428 if (nlp->nl_linux && linux_netlink_p != NULL) {
429 struct nlmsghdr *hdr_orig = hdr;
430 hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt);
432 /* Failed to translate to kernel format. Report an error back */
435 if (hdr->nlmsg_flags & NLM_F_ACK)
436 nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt);
440 error = handler(hdr, npt);
441 NL_LOG(LOG_DEBUG2, "retcode: %d", error);
443 if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
444 if (!npt->nw->suppress_ack) {
445 NL_LOG(LOG_DEBUG3, "ack");
446 nlmsg_ack(nlp, error, hdr, npt);
/*
 * Resets parser state in @npt; the visible line clears the writer's
 * suppress_ack flag.
 *
 * NOTE(review): the other fields reset by this function are not visible in
 * this extract.
 */
454 npt_clear(struct nl_pstate *npt)
461 npt->nw->suppress_ack = false;
465 * Processes an incoming packet, which can contain multiple netlink messages
468 nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp)
470 int offset, buffer_length;
471 struct nlmsghdr *hdr;
475 NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket);
477 struct nl_writer nw = {};
478 if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) {
480 NL_LOG(LOG_DEBUG, "error allocating socket writer");
484 nlmsg_ignore_limit(&nw);
485 /* TODO: alloc this buf once for nlp */
486 int data_length = m_length(m, NULL);
487 buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE;
489 buffer_length += roundup2(data_length, 8);
490 buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO);
491 if (buffer == NULL) {
494 NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory",
498 m_copydata(m, 0, data_length, buffer);
500 struct nl_pstate npt = {
502 .lb.base = &buffer[roundup2(data_length, 8)],
503 .lb.size = buffer_length - roundup2(data_length, 8),
505 .strict = nlp->nl_flags & NLF_STRICT,
508 for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
509 hdr = (struct nlmsghdr *)&buffer[offset];
510 /* Save length prior to calling handler */
511 int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
512 NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length);
514 error = nl_receive_message(hdr, data_length - offset, nlp, &npt);
516 if (__predict_false(error != 0 || nlp->nl_tx_blocked))
519 NL_LOG(LOG_DEBUG3, "packet parsing done");
520 free(buffer, M_NETLINK);
523 if (nlp->nl_tx_blocked) {
525 nlp->nl_tx_blocked = false;