/*-
 * Copyright (c) 2016-2018 Netflix, Inc.
 * Copyright (c) 2016-2021 Mellanox Technologies.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/ethernet.h>
#include <net/bpf.h>
#include <net/vnet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/infiniband.h>
#include <net/if_lagg.h>

#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
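/*
 * Summarize one pure TCP ACK into a fixed-size struct tcp_ackent slot
 * of an M_ACKCMP mbuf: sequence and ack numbers, window, TCP flags,
 * the receive timestamp, the timestamp option values (when ts_ptr is
 * not NULL) and the IP TOS/traffic-class codepoint.
 */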
static void
build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m,
    uint32_t *ts_ptr, uint16_t iptos)
{
	/*
	 * Given a TCP ACK, summarize it down into the small TCP ACK
	 * entry.
	 */
	ae->timestamp = m->m_pkthdr.rcv_tstmp;
	ae->flags = 0;
	if (m->m_flags & M_TSTMP_LRO)
		ae->flags |= TSTMP_LRO;
	else if (m->m_flags & M_TSTMP)
		ae->flags |= TSTMP_HDWR;
	ae->seq = ntohl(th->th_seq);
	ae->ack = ntohl(th->th_ack);
	ae->flags |= tcp_get_flags(th);
	if (ts_ptr != NULL) {
		ae->ts_value = ntohl(ts_ptr[1]);
		ae->ts_echo = ntohl(ts_ptr[2]);
		ae->flags |= HAS_TSTMP;
	}
	ae->win = ntohs(th->th_win);
	ae->codepoint = iptos;
}
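/*
 * A compressible ACK carries either no TCP options at all or exactly
 * the RFC 7323 appendix-A timestamp block:
 *
 *	+--------+--------+--------+--------+
 *	| NOP(1) | NOP(1) | Kind=8 | Len=10 |
 *	+--------+--------+--------+--------+
 *	|         TSval  (ts_ptr[1])        |
 *	+-----------------------------------+
 *	|         TSecr  (ts_ptr[2])        |
 *	+-----------------------------------+
 *
 * TCP_LRO_TS_OPTION is that first 32-bit word, which is why a single
 * word compare below is enough to recognize the whole option block.
 */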
static inline bool
tcp_lro_ack_valid(struct mbuf *m, struct tcphdr *th, uint32_t **ppts, bool *other_opts)
{
	/*
	 * This function returns two bits of valuable information.
	 * a) Is what is present capable of being ack-compressed,
	 *    we can ack-compress if there is no options or just
	 *    a timestamp option, and of course the th_flags must
	 *    be correct as well.
	 * b) Are other options present such as SACK. This is
	 *    used to determine if we want to wakeup or not.
	 */
	bool ret = true;

	switch (th->th_off << 2) {
	case (sizeof(*th) + TCPOLEN_TSTAMP_APPA):
		*ppts = (uint32_t *)(th + 1);
		/* Check if we have only one timestamp option. */
		if (**ppts == TCP_LRO_TS_OPTION)
			*other_opts = false;
		else {
			*other_opts = true;
			ret = false;
		}
		break;
	case (sizeof(*th)):
		/* No options. */
		*ppts = NULL;
		*other_opts = false;
		break;
	default:
		*ppts = NULL;
		*other_opts = true;
		ret = false;
		break;
	}
	/* For ACKCMP we only accept ACK, PUSH, ECE and CWR. */
	if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) != 0)
		ret = false;
	/* If it has data on it we cannot compress it */
	if (m->m_pkthdr.lro_tcp_d_len)
		ret = false;
	/* ACK flag must be set. */
	if (!(tcp_get_flags(th) & TH_ACK))
		ret = false;
	return (ret);
}
static bool
tcp_lro_check_wake_status(struct tcpcb *tp)
{

	if (tp->t_fb->tfb_early_wake_check != NULL)
		return ((tp->t_fb->tfb_early_wake_check)(tp));
	return (false);
}
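/*
 * Emit a TCP_LOG_LRO record into the connection's black-box log when
 * BB logging is enabled.  The "frm" argument discriminates the call
 * site (values 21 through 26 in this file) so the individual events
 * can be told apart when the log is decoded.
 */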
static void
tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
    const struct lro_entry *le, const struct mbuf *m,
    int frm, int32_t tcp_data_len, uint32_t th_seq,
    uint32_t th_ack, uint16_t th_win)
{
	if (tcp_bblogging_on(tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv, btv;
		uint32_t cts;

		cts = tcp_get_usecs(&tv);
		memset(&log, 0, sizeof(union tcp_log_stackspecific));
		log.u_bbr.flex8 = frm;
		log.u_bbr.flex1 = tcp_data_len;
		if (m != NULL)
			log.u_bbr.flex2 = m->m_pkthdr.len;
		else
			log.u_bbr.flex2 = 0;
		if (le->m_head != NULL) {
			log.u_bbr.flex3 = le->m_head->m_pkthdr.lro_nsegs;
			log.u_bbr.flex4 = le->m_head->m_pkthdr.lro_tcp_d_len;
			log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
			log.u_bbr.delRate = le->m_head->m_flags;
			log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
		}
		log.u_bbr.inflight = th_seq;
		log.u_bbr.delivered = th_ack;
		log.u_bbr.timeStamp = cts;
		log.u_bbr.epoch = le->next_seq;
		log.u_bbr.lt_epoch = le->ack_seq;
		log.u_bbr.pacing_gain = th_win;
		log.u_bbr.cwnd_gain = le->window;
		log.u_bbr.lost = curcpu;
		log.u_bbr.cur_del_rate = (uintptr_t)m;
		log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
		bintime2timeval(&lc->lro_last_queue_time, &btv);
		log.u_bbr.flex6 = tcp_tv_to_usectick(&btv);
		log.u_bbr.flex7 = le->compressed;
		/* Note: reuses the slot th_win was stored in above. */
		log.u_bbr.pacing_gain = le->uncompressed;
		if (in_epoch(net_epoch_preempt))
			log.u_bbr.inhpts = 1;
		else
			log.u_bbr.inhpts = 0;
		TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
		    &tptosocket(tp)->so_snd,
		    TCP_LOG_LRO, 0, 0, &log, false, &tv);
	}
}
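/*
 * Find or allocate an mbuf that compressed ACK entries can be appended
 * to.  If the tail of the socket's inbound queue is already an M_ACKCMP
 * mbuf with room for another struct tcp_ackent, reuse it; otherwise
 * allocate a fresh one (cluster-backed once TF2_MBUF_L_ACKS is set,
 * since a cluster holds many more fixed-size ACK entries than the small
 * data area of a plain packet-header mbuf).  *new_m tells the caller
 * whether the returned mbuf is new (1) or recycled from the queue (0).
 */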
static struct mbuf *
tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le,
    struct tcpcb *tp, int32_t *new_m, bool can_append_old_cmp)
{
	struct mbuf *m;

	/* Look at the last mbuf if any in queue */
	if (can_append_old_cmp) {
		m = STAILQ_LAST(&tp->t_inqueue, mbuf, m_stailqpkt);
		if (m != NULL && (m->m_flags & M_ACKCMP) != 0) {
			if (M_TRAILINGSPACE(m) >= sizeof(struct tcp_ackent)) {
				tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0);
				*new_m = 0;
				counter_u64_add(tcp_extra_mbuf, 1);
				return (m);
			} else {
				/* Mark we ran out of space */
				tp->t_flags2 |= TF2_MBUF_L_ACKS;
			}
		}
	}
	/* Decide mbuf size. */
	tcp_lro_log(tp, lc, le, NULL, 21, 0, 0, 0, 0);
	if (tp->t_flags2 & TF2_MBUF_L_ACKS)
		m = m_getcl(M_NOWAIT, MT_DATA, M_ACKCMP | M_PKTHDR);
	else
		m = m_gethdr(M_NOWAIT, MT_DATA);

	if (__predict_false(m == NULL)) {
		counter_u64_add(tcp_would_have_but, 1);
		return (NULL);
	}
	counter_u64_add(tcp_comp_total, 1);
	m->m_pkthdr.rcvif = lc->ifp;
	m->m_flags |= M_ACKCMP;
	*new_m = 1;
	return (m);
}
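/*
 * An M_ACKCMP mbuf is simply an array of struct tcp_ackent records;
 * m_len divided by the entry size gives the index of the next free
 * slot:
 *
 *	m_data -> [tcp_ackent 0][tcp_ackent 1]...[tcp_ackent n-1]<room>
 *
 * Stacks that set TF2_MBUF_ACKCMP consume these entries instead of
 * parsing whole ACK packets.
 */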
/*
 * Do a BPF tap for either ACK_CMP packets or MBUF QUEUE type packets
 * and strip everything but the IPv4/IPv6 header.
 */
static bool
do_bpf_strip_and_compress(struct tcpcb *tp, struct lro_ctrl *lc,
    struct lro_entry *le, struct mbuf **pp, struct mbuf **cmp,
    struct mbuf **mv_to, bool *should_wake, bool bpf_req, bool lagg_bpf_req,
    struct ifnet *lagg_ifp, bool can_append_old_cmp)
{
	union {
		void *ptr;
		struct ip *ip4;
		struct ip6_hdr *ip6;
	} l3;
	struct mbuf *m;
	struct mbuf *nm;
	struct tcphdr *th;
	struct tcp_ackent *ack_ent;
	uint32_t *ts_ptr;
	int32_t n_mbuf;
	bool other_opts, can_compress;
	uint16_t lro_type;
	uint16_t iptos;
	int tcp_hdr_offset;
	int idx;

	/* Get current mbuf. */
	m = *pp;

	/* Let the BPF see the packet */
	if (__predict_false(bpf_req))
		ETHER_BPF_MTAP(lc->ifp, m);

	if (__predict_false(lagg_bpf_req))
		ETHER_BPF_MTAP(lagg_ifp, m);
	tcp_hdr_offset = m->m_pkthdr.lro_tcp_h_off;
	lro_type = le->inner.data.lro_type;
	switch (lro_type) {
	case LRO_TYPE_NONE:
		lro_type = le->outer.data.lro_type;
		switch (lro_type) {
		case LRO_TYPE_IPV4_TCP:
			tcp_hdr_offset -= sizeof(*le->outer.ip4);
			m->m_pkthdr.lro_etype = ETHERTYPE_IP;
			break;
		case LRO_TYPE_IPV6_TCP:
			tcp_hdr_offset -= sizeof(*le->outer.ip6);
			m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
			break;
		default:
			goto compressed;
		}
		break;
	case LRO_TYPE_IPV4_TCP:
		tcp_hdr_offset -= sizeof(*le->outer.ip4);
		m->m_pkthdr.lro_etype = ETHERTYPE_IP;
		break;
	case LRO_TYPE_IPV6_TCP:
		tcp_hdr_offset -= sizeof(*le->outer.ip6);
		m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
		break;
	default:
		goto compressed;
	}

	MPASS(tcp_hdr_offset >= 0);

	m_adj(m, tcp_hdr_offset);
	m->m_flags |= M_LRO_EHDRSTRP;
	m->m_flags &= ~M_ACKCMP;
	m->m_pkthdr.lro_tcp_h_off -= tcp_hdr_offset;
	th = tcp_lro_get_th(m);

	th->th_sum = 0;		/* TCP checksum is valid. */

	/* Check if ACK can be compressed */
	can_compress = tcp_lro_ack_valid(m, th, &ts_ptr, &other_opts);

	/* Now lets look at the should wake states */
	if ((other_opts == true) &&
	    ((tp->t_flags2 & TF2_DONT_SACK_QUEUE) == 0)) {
		/*
		 * If there are other options (SACK?) and the
		 * tcp endpoint has not expressly told us it does
		 * not care about SACKs, then we should wake up.
		 */
		*should_wake = true;
	} else if (*should_wake == false) {
		/* Wakeup override check if we are false here */
		*should_wake = tcp_lro_check_wake_status(tp);
	}
	/* Is the ack compressable? */
	if (can_compress == false)
		goto done;
	/* Does the TCP endpoint support ACK compression? */
	if ((tp->t_flags2 & TF2_MBUF_ACKCMP) == 0)
		goto done;

	/* Lets get the TOS/traffic class field */
	l3.ptr = mtod(m, void *);
	switch (lro_type) {
	case LRO_TYPE_IPV4_TCP:
		iptos = l3.ip4->ip_tos;
		break;
	case LRO_TYPE_IPV6_TCP:
		iptos = IPV6_TRAFFIC_CLASS(l3.ip6);
		break;
	default:
		iptos = 0;	/* Keep compiler happy. */
		break;
	}
	/* Now lets get space if we don't have some already */
	if (*cmp == NULL) {
new_one:
		nm = tcp_lro_get_last_if_ackcmp(lc, le, tp, &n_mbuf,
		    can_append_old_cmp);
		if (__predict_false(nm == NULL))
			goto done;
		*cmp = nm;
		if (n_mbuf) {
			/*
			 * Link in the new cmp ack to our in-order place,
			 * first set our cmp ack's next to where we are.
			 */
			nm->m_nextpkt = m;
			(*pp) = nm;
			/*
			 * Set it up so mv_to is advanced to our
			 * compressed ack. This way the caller can
			 * advance pp to the right place.
			 */
			*mv_to = nm;
			/*
			 * Advance it here locally as well.
			 */
			pp = &nm->m_nextpkt;
		}
	} else {
		/* We have one already we are working on */
		nm = *cmp;
		if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) {
			/* We ran out of space */
			tp->t_flags2 |= TF2_MBUF_L_ACKS;
			goto new_one;
		}
	}
	MPASS(M_TRAILINGSPACE(nm) >= sizeof(struct tcp_ackent));
	counter_u64_add(tcp_inp_lro_compressed, 1);
	le->compressed++;
	/* We can add in to the one on the tail */
	ack_ent = mtod(nm, struct tcp_ackent *);
	idx = (nm->m_len / sizeof(struct tcp_ackent));
	build_ack_entry(&ack_ent[idx], th, m, ts_ptr, iptos);

	/* Bump the size of both pkt-hdr and len */
	nm->m_len += sizeof(struct tcp_ackent);
	nm->m_pkthdr.len += sizeof(struct tcp_ackent);
compressed:
	/* Advance to next mbuf before freeing. */
	*pp = m->m_nextpkt;
	m->m_nextpkt = NULL;
	m_freem(m);
	return (true);
done:
	counter_u64_add(tcp_uncomp_total, 1);
	le->uncompressed++;
	return (false);
}
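/*
 * Splice the whole packet chain of an lro_entry onto the tail of the
 * socket's inbound queue in constant time, by pointing a temporary
 * STAILQ head at the chain and letting STAILQ_CONCAT() do the work.
 */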
static void
tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	STAILQ_HEAD(, mbuf) q = { le->m_head,
	    &STAILQ_NEXT(le->m_last_mbuf, m_stailqpkt) };
	STAILQ_CONCAT(&tp->t_inqueue, &q);
	le->m_head = NULL;
	le->m_last_mbuf = NULL;
}
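/*
 * Resolve the TCP connection an aggregated entry belongs to.  On match
 * the tcpcb is returned with its inpcb write-locked (INPLOOKUP_WLOCKPCB);
 * NULL is returned when no connection is found.
 */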
static struct tcpcb *
tcp_lro_lookup(struct ifnet *ifp, struct lro_parser *pa)
{
	struct inpcb *inp;

	CURVNET_SET(ifp->if_vnet);
	switch (pa->data.lro_type) {
#ifdef INET6
	case LRO_TYPE_IPV6_TCP:
		inp = in6_pcblookup(&V_tcbinfo,
		    &pa->data.s_addr.v6,
		    pa->data.s_port,
		    &pa->data.d_addr.v6,
		    pa->data.d_port,
		    INPLOOKUP_WLOCKPCB,
		    ifp);
		break;
#endif
#ifdef INET
	case LRO_TYPE_IPV4_TCP:
		inp = in_pcblookup(&V_tcbinfo,
		    pa->data.s_addr.v4,
		    pa->data.s_port,
		    pa->data.d_addr.v4,
		    pa->data.d_port,
		    INPLOOKUP_WLOCKPCB,
		    ifp);
		break;
#endif
	default:
		inp = NULL;
		break;
	}
	CURVNET_RESTORE();
	if (inp == NULL)
		return (NULL);
	return (intotcpcb(inp));
}
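/*
 * Hand a flushed LRO entry directly to the owning TCP connection for
 * HPTS/mbuf-queueing stacks.  The flow is: reject entries we cannot
 * handle, look up and lock the connection, BPF-tap and header-strip
 * every held packet (compressing pure ACKs along the way), splice the
 * remainder onto tp->t_inqueue, and finally wake the stack if needed.
 * Returns 0 on success, or TCP_LRO_CANNOT to make the caller fall
 * back to the regular LRO flush path.
 */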
static int
_tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
{
	struct tcpcb *tp;
	struct mbuf **pp, *cmp, *mv_to;
	struct ifnet *lagg_ifp;
	bool bpf_req, lagg_bpf_req, should_wake, can_append_old_cmp;

	/* Check if packet doesn't belong to our network interface. */
	if ((tcplro_stacks_wanting_mbufq == 0) ||
	    (le->outer.data.vlan_id != 0) ||
	    (le->inner.data.lro_type != LRO_TYPE_NONE))
		return (TCP_LRO_CANNOT);

#ifdef INET6
	/*
	 * Be proactive about unspecified IPv6 address in source. As
	 * we use all-zero to indicate unbounded/unconnected pcb,
	 * an unspecified IPv6 address can be used to confuse us.
	 *
	 * Note that packets with an unspecified IPv6 destination are
	 * already dropped in ip6_input.
	 */
	if (__predict_false(le->outer.data.lro_type == LRO_TYPE_IPV6_TCP &&
	    IN6_IS_ADDR_UNSPECIFIED(&le->outer.data.s_addr.v6)))
		return (TCP_LRO_CANNOT);

	if (__predict_false(le->inner.data.lro_type == LRO_TYPE_IPV6_TCP &&
	    IN6_IS_ADDR_UNSPECIFIED(&le->inner.data.s_addr.v6)))
		return (TCP_LRO_CANNOT);
#endif
	/* Lookup inp, if any.  Returns locked TCP inpcb. */
	tp = tcp_lro_lookup(lc->ifp,
	    (le->inner.data.lro_type == LRO_TYPE_NONE) ? &le->outer : &le->inner);
	if (tp == NULL)
		return (TCP_LRO_CANNOT);
	counter_u64_add(tcp_inp_lro_locks_taken, 1);

	/* Check if the inp is dead, Jim. */
	if (tp->t_state == TCPS_TIME_WAIT) {
		INP_WUNLOCK(tptoinpcb(tp));
		return (TCP_LRO_CANNOT);
	}
	if (tp->t_lro_cpu == HPTS_CPU_NONE && lc->lro_cpu_is_set == 1)
		tp->t_lro_cpu = lc->lro_last_cpu;
	/* Check if the transport doesn't support the needed optimizations. */
	if ((tp->t_flags2 & (TF2_SUPPORTS_MBUFQ | TF2_MBUF_ACKCMP)) == 0) {
		INP_WUNLOCK(tptoinpcb(tp));
		return (TCP_LRO_CANNOT);
	}

	if (tp->t_flags2 & TF2_MBUF_QUEUE_READY)
		should_wake = false;
	else
		should_wake = true;
	/* Check if packets should be tapped to BPF. */
	bpf_req = bpf_peers_present(lc->ifp->if_bpf);
	lagg_bpf_req = false;
	lagg_ifp = NULL;
	if (lc->ifp->if_type == IFT_IEEE8023ADLAG ||
	    lc->ifp->if_type == IFT_INFINIBANDLAG) {
		struct lagg_port *lp = lc->ifp->if_lagg;
		struct lagg_softc *sc = lp->lp_softc;

		lagg_ifp = sc->sc_ifp;
		if (lagg_ifp != NULL)
			lagg_bpf_req = bpf_peers_present(lagg_ifp->if_bpf);
	}
	/* Strip and compress all the incoming packets. */
	can_append_old_cmp = true;
	cmp = NULL;
	for (pp = &le->m_head; *pp != NULL; ) {
		mv_to = NULL;
		if (do_bpf_strip_and_compress(tp, lc, le, pp, &cmp, &mv_to,
		    &should_wake, bpf_req, lagg_bpf_req, lagg_ifp,
		    can_append_old_cmp) == false) {
			/* Advance to next mbuf. */
			pp = &(*pp)->m_nextpkt;
			/*
			 * Once we have kept an uncompressed packet we can
			 * no longer look in the pending inbound packets
			 * for a compressed ack to append to.
			 */
			can_append_old_cmp = false;
			/*
			 * We also need to stop adding to any compressed
			 * ack we were remembering. A new cmp ack will
			 * be required.
			 */
			cmp = NULL;
			tcp_lro_log(tp, lc, le, NULL, 25, 0, 0, 0, 0);
		} else if (mv_to != NULL) {
			/* We are asked to move pp up */
			pp = &mv_to->m_nextpkt;
			tcp_lro_log(tp, lc, le, NULL, 24, 0, 0, 0, 0);
		} else
			tcp_lro_log(tp, lc, le, NULL, 26, 0, 0, 0, 0);
	}
	/* Update "m_last_mbuf", if any. */
	if (pp == &le->m_head)
		le->m_last_mbuf = *pp;
	else
		le->m_last_mbuf = __containerof(pp, struct mbuf, m_nextpkt);

	/* Check if any data mbufs left. */
	if (le->m_head != NULL) {
		counter_u64_add(tcp_inp_lro_direct_queue, 1);
		tcp_lro_log(tp, lc, le, NULL, 22, 1, tp->t_flags2, 0, 1);
		tcp_queue_pkts(tp, le);
	}
	if (should_wake) {
		/* Wakeup */
		counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
		if ((*tp->t_fb->tfb_do_queued_segments)(tp, 0))
			/* TCP cb gone and unlocked. */
			return (0);
	}
	INP_WUNLOCK(tptoinpcb(tp));

	return (0);	/* Success. */
}
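/*
 * tcp_lro_flush_tcphpts is a global function pointer consulted by the
 * generic LRO flush path; the two hooks below install and clear it,
 * enabling and disabling the direct-to-connection delivery implemented
 * in this file.
 */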
void
tcp_lro_hpts_init(void)
{
	tcp_lro_flush_tcphpts = _tcp_lro_flush_tcphpts;
}

void
tcp_lro_hpts_uninit(void)
{
	atomic_store_ptr(&tcp_lro_flush_tcphpts, NULL);
}