2 * Copyright (c) 2007, Myricom Inc.
3 * Copyright (c) 2008, Intel Corporation.
4 * Copyright (c) 2012 The FreeBSD Foundation
5 * Copyright (c) 2016 Mellanox Technologies.
8 * Portions of this software were developed by Bjoern Zeeb
9 * under sponsorship from the FreeBSD Foundation.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
37 #include "opt_inet6.h"
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/kernel.h>
42 #include <sys/malloc.h>
44 #include <sys/socket.h>
47 #include <net/if_var.h>
48 #include <net/ethernet.h>
51 #include <netinet/in_systm.h>
52 #include <netinet/in.h>
53 #include <netinet/ip6.h>
54 #include <netinet/ip.h>
55 #include <netinet/ip_var.h>
56 #include <netinet/tcp.h>
57 #include <netinet/tcp_lro.h>
59 #include <netinet6/ip6_var.h>
61 #include <machine/in_cksum.h>
/* Malloc type passed to the malloc()/free() calls on the LRO control data in this file. */
63 static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
/* Build-time switch: code under "#ifdef TCP_LRO_UPDATE_CSUM" is compiled when this is defined. */
65 #define TCP_LRO_UPDATE_CSUM 1
/* Value stored into ip_sum / th_sum by tcp_lro_flush() when checksum updating is compiled out. */
66 #ifndef TCP_LRO_UPDATE_CSUM
67 #define TCP_LRO_INVALID_CSUM 0x0000
/*
 * Initialize an LRO control structure with default arguments: a NULL
 * ifnet, TCP_LRO_ENTRIES entries and an mbuf queue capacity of 0.
 * Returns whatever tcp_lro_init_args() returns.
 */
71 tcp_lro_init(struct lro_ctrl *lc)
73 return (tcp_lro_init_args(lc, NULL, TCP_LRO_ENTRIES, 0));
/*
 * Initialize an LRO control structure with caller-chosen sizes.
 *
 * lc:          control structure to set up.
 * ifp:         interface argument; NOTE(review): the line that consumes it
 *              is not visible in this listing.
 * lro_entries: number of LRO entries to allocate; stored in lc->lro_cnt.
 * lro_mbufs:   capacity of the mbuf pointer array; stored in
 *              lc->lro_mbuf_max.
 *
 * One M_LRO allocation holds the mbuf pointer array followed by the LRO
 * entries; every entry is then linked onto lc->lro_free.
 * NOTE(review): the return statements are not visible in this listing.
 */
77 tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
78 unsigned lro_entries, unsigned lro_mbufs)
/* Start with an empty mbuf queue and the default ACK-count and length limits. */
88 lc->lro_mbuf_count = 0;
89 lc->lro_mbuf_max = lro_mbufs;
90 lc->lro_cnt = lro_entries;
91 lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
92 lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
/* Both entry lists start out empty. */
94 SLIST_INIT(&lc->lro_free);
95 SLIST_INIT(&lc->lro_active);
97 /* compute size to allocate */
98 size = (lro_mbufs * sizeof(struct mbuf *)) +
99 (lro_entries * sizeof(*le));
/* M_NOWAIT makes a NULL result possible (checked below); M_ZERO requests zero-filled memory. */
100 lc->lro_mbuf_data = (struct mbuf **)
101 malloc(size, M_LRO, M_NOWAIT | M_ZERO);
103 /* check for out of memory */
104 if (lc->lro_mbuf_data == NULL) {
/* On failure the whole control structure is zeroed. */
105 memset(lc, 0, sizeof(*lc));
108 /* compute offset for LRO entries */
/* The entries begin right after the lro_mbufs pointer slots of the same allocation. */
109 le = (struct lro_entry *)
110 (lc->lro_mbuf_data + lro_mbufs);
112 /* setup linked list */
113 for (i = 0; i != lro_entries; i++)
114 SLIST_INSERT_HEAD(&lc->lro_free, le + i, next);
/*
 * Release what an LRO control structure owns: empty both entry lists,
 * free every mbuf still queued in lc->lro_mbuf_data and free the backing
 * allocation. lc->lro_mbuf_count and lc->lro_mbuf_data are reset.
 */
120 tcp_lro_free(struct lro_ctrl *lc)
122 struct lro_entry *le;
125 /* reset LRO free list */
126 SLIST_INIT(&lc->lro_free);
128 /* free active mbufs, if any */
/* NOTE(review): only the unlink step of this loop is visible; what is done with each entry's mbufs is not shown in this listing. */
129 while ((le = SLIST_FIRST(&lc->lro_active)) != NULL) {
130 SLIST_REMOVE_HEAD(&lc->lro_active, next);
134 /* free mbuf array, if any */
135 for (x = 0; x != lc->lro_mbuf_count; x++)
136 m_freem(lc->lro_mbuf_data[x]);
137 lc->lro_mbuf_count = 0;
139 /* free allocated memory, if any */
/* tcp_lro_init_args() places the entries in this same allocation, so one free() releases them as well. */
140 free(lc->lro_mbuf_data, M_LRO);
141 lc->lro_mbuf_data = NULL;
144 #ifdef TCP_LRO_UPDATE_CSUM
/*
 * Produce a 16-bit checksum value from a TCP header.
 * Side effect: th->th_sum is set to zero.
 * NOTE(review): the lines that accumulate into ch are not visible in this
 * listing; only the initialisation, one fold step and the return are.
 */
146 tcp_lro_csum_th(struct tcphdr *th)
/* Clear the checksum field in the header and start the accumulator at 0. */
151 ch = th->th_sum = 0x0000;
/* Fold the bits above bit 15 back into the low 16 bits. */
162 ch = (ch >> 16) + (ch & 0xffff);
164 return (ch & 0xffff);
/*
 * Adjust a received segment's checksum by removing the length
 * contribution and the TCP header contribution (the two "Remove ..."
 * steps below).
 *
 * le:           LRO entry; le->eh_type selects the IPv6 or IPv4 path.
 * l3hdr:        pointer to the IP header, cast per le->eh_type.
 * th:           TCP header; tcp_lro_csum_th() zeroes th->th_sum.
 * tcp_data_len: TCP payload length (its use is not visible in this listing).
 * csum:         checksum value to adjust.
 *
 * NOTE(review): the case labels, the arithmetic combining cs with csum and
 * the return statement are not visible in this listing.
 */
168 tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
169 uint16_t tcp_data_len, uint16_t csum)
176 /* Remove length from checksum. */
177 switch (le->eh_type) {
/* IPv6 path. The statement selected by the append_cnt test is not visible here. */
183 ip6 = (struct ip6_hdr *)l3hdr;
184 if (le->append_cnt == 0)
/* Pseudo-header checksum over the IPv6 header, using the payload length in host byte order. */
189 cx = ntohs(ip6->ip6_plen);
190 cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
/* IPv4 path. The statement selected by the append_cnt test is not visible here. */
200 ip4 = (struct ip *)l3hdr;
201 if (le->append_cnt == 0)
204 cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
206 cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
213 cs = 0; /* Keep compiler happy. */
219 /* Remove TCP header csum. */
/* tcp_lro_csum_th() clears th->th_sum as a side effect; tcp_lro_rx() restores it for the first packet. */
220 cs = ~tcp_lro_csum_th(th);
/* Fold the bits above bit 15 back into the low 16 bits. */
223 c = (c >> 16) + (c & 0xffff);
/*
 * Flush every active LRO entry whose le->mtime is at or before the cutoff
 * "tv - *timeout".
 * NOTE(review): the line that loads tv is not visible in this listing;
 * presumably it holds the current time - confirm.
 */
230 tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
232 struct lro_entry *le, *le_tmp;
/* Checks for an empty active list; the action taken is not visible here. */
235 if (SLIST_EMPTY(&lc->lro_active))
/* tv now holds the cutoff time. */
239 timevalsub(&tv, timeout);
/* The _SAFE iterator is used because entries are unlinked while the list is walked. */
240 SLIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
241 if (timevalcmp(&tv, &le->mtime, >=)) {
242 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
243 tcp_lro_flush(lc, le);
/*
 * Pass the packet held by LRO entry le to the interface input routine,
 * then clear the entry and put it back on lc->lro_free.
 *
 * The header rewrite below (IP length, ACK, window, timestamp, checksums)
 * starts under the "append_cnt > 0" test.
 * NOTE(review): where that conditional ends, and the case labels of the
 * switch, are not visible in this listing.
 *
 * The caller is expected to have removed le from lc->lro_active already;
 * every call site visible in this file does so first.
 */
249 tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
252 if (le->append_cnt > 0) {
/* Network-byte-order copy of p_len; written to ip6_plen and added to the TCP checksum below. */
256 p_len = htons(le->p_len);
257 switch (le->eh_type) {
/* IPv6: store the new payload length, set checksum flags, then add the Ethernet and IPv6 header sizes to p_len. */
264 ip6->ip6_plen = p_len;
265 th = (struct tcphdr *)(ip6 + 1);
266 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
268 le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
276 #ifdef TCP_LRO_UPDATE_CSUM
282 #ifdef TCP_LRO_UPDATE_CSUM
283 /* Fix IP header checksum for new length. */
289 cl = (cl >> 16) + (cl & 0xffff);
293 ip4->ip_sum = TCP_LRO_INVALID_CSUM;
/* IPv4: only the Ethernet header size is added, because tcp_lro_rx() starts IPv4 p_len with the IP header included. */
296 th = (struct tcphdr *)(ip4 + 1);
297 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
298 CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
299 le->p_len += ETHER_HDR_LEN;
304 th = NULL; /* Keep compiler happy. */
/* p_len has been extended by the header sizes above and becomes the packet header length. */
306 le->m_head->m_pkthdr.csum_data = 0xffff;
307 le->m_head->m_pkthdr.len = le->p_len;
309 /* Incorporate the latest ACK into the TCP header. */
310 th->th_ack = le->ack_seq;
311 th->th_win = le->window;
312 /* Incorporate latest timestamp into the TCP header. */
313 if (le->timestamp != 0) {
/* tsval is kept in host order by tcp_lro_rx() and converted here; tsecr is kept as received and copied unchanged. */
316 ts_ptr = (uint32_t *)(th + 1);
317 ts_ptr[1] = htonl(le->tsval);
318 ts_ptr[2] = le->tsecr;
320 #ifdef TCP_LRO_UPDATE_CSUM
321 /* Update the TCP header checksum. */
/* Add the final length and TCP header sums to the saved checksum, fold to 16 bits and store the complement. */
322 le->ulp_csum += p_len;
323 le->ulp_csum += tcp_lro_csum_th(th);
324 while (le->ulp_csum > 0xffff)
325 le->ulp_csum = (le->ulp_csum >> 16) +
326 (le->ulp_csum & 0xffff);
327 th->th_sum = (le->ulp_csum & 0xffff);
328 th->th_sum = ~th->th_sum;
330 th->th_sum = TCP_LRO_INVALID_CSUM;
/* Deliver the packet and count the first segment plus all appended ones. */
334 (*lc->ifp->if_input)(lc->ifp, le->m_head);
335 lc->lro_queued += le->append_cnt + 1;
/* Zero the entry before reuse; tcp_lro_rx() asserts ulp_csum == 0 on a new entry. */
337 bzero(le, sizeof(*le));
338 SLIST_INSERT_HEAD(&lc->lro_free, le, next);
/*
 * Comparison callback handed to qsort() by tcp_lro_flush_all().
 * ppa and ppb point at array elements of type "struct mbuf *".
 * Keys, in order: hash type, flow id, then TCP_LRO_SEQUENCE().
 * NOTE(review): the return statements and the tests between the keys are
 * not visible in this listing.
 */
342 tcp_lro_mbuf_compare_header(const void *ppa, const void *ppb)
344 const struct mbuf *ma = *((const struct mbuf * const *)ppa);
345 const struct mbuf *mb = *((const struct mbuf * const *)ppb);
/* First key: hash type. */
348 ret = M_HASHTYPE_GET(ma) - M_HASHTYPE_GET(mb);
/* Second key: flow id, tested with < and > rather than by subtraction. */
352 if (ma->m_pkthdr.flowid > mb->m_pkthdr.flowid)
354 else if (ma->m_pkthdr.flowid < mb->m_pkthdr.flowid)
/* Last key: the sequence value stored by tcp_lro_queue_mbuf() when the mbuf was queued. */
357 ret = TCP_LRO_SEQUENCE(ma) - TCP_LRO_SEQUENCE(mb);
/*
 * Process all mbufs queued in lc->lro_mbuf_data: sort them by stream, feed
 * them to tcp_lro_rx() one stream at a time, flush all active LRO entries
 * and reset the queue count to 0.
 */
363 tcp_lro_flush_all(struct lro_ctrl *lc)
365 struct lro_entry *le;
370 /* check if no mbufs to flush */
/* NOTE(review): the action taken when the queue is empty is not visible in this listing. */
371 if (__predict_false(lc->lro_mbuf_count == 0))
374 /* sort all mbufs according to stream */
375 qsort(lc->lro_mbuf_data, lc->lro_mbuf_count, sizeof(struct mbuf *),
376 &tcp_lro_mbuf_compare_header);
378 /* input data into LRO engine, stream by stream */
/* hashtype tracks the stream being processed, together with flowid (whose initialisation is not visible here). */
380 hashtype = M_HASHTYPE_NONE;
381 for (x = 0; x != lc->lro_mbuf_count; x++) {
384 mb = lc->lro_mbuf_data[x];
386 /* check for new stream */
/* A change in flow id or hash type marks the start of another stream. */
387 if (mb->m_pkthdr.flowid != flowid ||
388 M_HASHTYPE_GET(mb) != hashtype) {
389 flowid = mb->m_pkthdr.flowid;
390 hashtype = M_HASHTYPE_GET(mb);
392 /* flush active streams */
393 while ((le = SLIST_FIRST(&lc->lro_active)) != NULL) {
394 SLIST_REMOVE_HEAD(&lc->lro_active, next);
395 tcp_lro_flush(lc, le);
398 #ifdef TCP_LRO_RESET_SEQUENCE
399 /* reset sequence number */
400 TCP_LRO_SEQUENCE(mb) = 0;
402 /* add packet to LRO engine */
/* A non-zero result means tcp_lro_rx() did not take the packet, so it goes to the stack directly. */
403 if (tcp_lro_rx(lc, mb, 0) != 0) {
404 /* input packet to network layer */
405 (*lc->ifp->if_input)(lc->ifp, mb);
411 /* flush active streams */
412 while ((le = SLIST_FIRST(&lc->lro_active)) != NULL) {
413 SLIST_REMOVE_HEAD(&lc->lro_active, next);
414 tcp_lro_flush(lc, le);
/* Every queued mbuf has been consumed or passed on; mark the array empty. */
416 lc->lro_mbuf_count = 0;
/*
 * IPv6 pre-check for LRO: returns TCP_LRO_NOT_SUPPORTED unless the next
 * header is TCP, otherwise stores the TCP header address (directly after
 * the fixed IPv6 header) through th.
 * NOTE(review): the rest of the parameter list and the final return are
 * not visible in this listing.
 */
421 tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
425 /* XXX-BZ we should check the flow-label. */
427 /* XXX-BZ We do not yet support ext. hdrs. */
428 if (ip6->ip6_nxt != IPPROTO_TCP)
429 return (TCP_LRO_NOT_SUPPORTED);
431 /* Find the TCP header. */
432 *th = (struct tcphdr *)(ip6 + 1);
/*
 * IPv4 pre-check for LRO. Returns TCP_LRO_NOT_SUPPORTED when the protocol
 * is not TCP, and TCP_LRO_CANNOT when the header carries options, the
 * packet is a fragment, or the IP header checksum is bad. Otherwise the
 * TCP header address is stored through th.
 * NOTE(review): the rest of the parameter list and the final return are
 * not visible in this listing.
 */
440 tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
446 if (ip4->ip_p != IPPROTO_TCP)
447 return (TCP_LRO_NOT_SUPPORTED);
449 /* Ensure there are no options. */
/* ip_hl counts 32-bit words; shifting left by 2 converts it to bytes. */
450 if ((ip4->ip_hl << 2) != sizeof (*ip4))
451 return (TCP_LRO_CANNOT);
453 /* .. and the packet is not fragmented. */
/* Either the more-fragments flag or a non-zero fragment offset rejects the packet. */
454 if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
455 return (TCP_LRO_CANNOT);
457 /* Legacy IP has a header checksum that needs to be correct. */
/* When the mbuf is flagged CSUM_IP_CHECKED, CSUM_IP_VALID must also be set. */
458 csum_flags = m->m_pkthdr.csum_flags;
459 if (csum_flags & CSUM_IP_CHECKED) {
460 if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
462 return (TCP_LRO_CANNOT);
/* Software check: in_cksum_hdr() must yield 0. Presumably this is the else branch of the test above; the "else" line is not visible here. */
465 csum = in_cksum_hdr(ip4);
466 if (__predict_false((csum) != 0)) {
468 return (TCP_LRO_CANNOT);
472 /* Find the TCP header (we assured there are no IP options). */
473 *th = (struct tcphdr *)(ip4 + 1);
/*
 * Try to merge received frame m into an active LRO entry of lc, or start
 * a new entry for it.
 *
 * lc:   LRO control structure.
 * m:    received frame beginning with an Ethernet header.
 * csum: checksum supplied by the driver (see the "did not pass in the
 *       checksum" comment below).
 *
 * Visible failure returns are TCP_LRO_CANNOT, TCP_LRO_NOT_SUPPORTED and
 * TCP_LRO_NO_ENTRIES. tcp_lro_flush_all() treats any non-zero return as
 * "packet not taken" and hands the mbuf to if_input itself.
 * NOTE(review): many lines (case labels, #ifdef/#endif lines, the success
 * returns, several statements) are not visible in this listing.
 */
480 tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
482 struct lro_entry *le;
483 struct ether_header *eh;
485 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
488 struct ip *ip4 = NULL; /* Keep compiler happy. */
491 void *l3hdr = NULL; /* Keep compiler happy. */
494 int error, ip_len, l;
495 uint16_t eh_type, tcp_data_len;
497 /* We expect a contiguous header [eh, ip, tcp]. */
499 eh = mtod(m, struct ether_header *);
500 eh_type = ntohs(eh->ether_type);
/* IPv6 path: returns TCP_LRO_CANNOT while V_ip6_forwarding is non-zero. */
505 CURVNET_SET(lc->ifp->if_vnet);
506 if (V_ip6_forwarding != 0) {
507 /* XXX-BZ stats but changing lro_ctrl is a problem. */
509 return (TCP_LRO_CANNOT);
512 l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
513 error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
/* At this point tcp_data_len is the whole IPv6 payload length, and ip_len adds the fixed IPv6 header. */
516 tcp_data_len = ntohs(ip6->ip6_plen);
517 ip_len = sizeof(*ip6) + tcp_data_len;
/* IPv4 path: returns TCP_LRO_CANNOT while V_ipforwarding is non-zero. */
524 CURVNET_SET(lc->ifp->if_vnet);
525 if (V_ipforwarding != 0) {
526 /* XXX-BZ stats but changing lro_ctrl is a problem. */
528 return (TCP_LRO_CANNOT);
531 l3hdr = ip4 = (struct ip *)(eh + 1);
532 error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
/* At this point tcp_data_len is the IP total length minus the IPv4 header. */
535 ip_len = ntohs(ip4->ip_len);
536 tcp_data_len = ip_len - sizeof(*ip4);
540 /* XXX-BZ what happens in case of VLAN(s)? */
542 return (TCP_LRO_NOT_SUPPORTED);
546 * If the frame is padded beyond the end of the IP packet, then we must
547 * trim the extra bytes off.
549 l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
552 /* Truncated packet. */
553 return (TCP_LRO_CANNOT);
559 * Check TCP header constraints.
561 /* Ensure no bits set besides ACK or PSH. */
562 if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
563 return (TCP_LRO_CANNOT);
565 /* XXX-BZ We lose an ACK|PUSH flag concatenating multiple segments. */
566 /* XXX-BZ Ideally we'd flush on PUSH? */
569 * Check for timestamps.
570 * Since the only option we handle are timestamps, we only have to
571 * handle the simple case of aligned timestamps.
/* th_off counts 32-bit words; shifting left by 2 gives the TCP header length in bytes. */
573 l = (th->th_off << 2);
/* A non-zero l is accepted only for the NOP, NOP, TIMESTAMP layout; presumably l has been reduced to the option length on lines not visible here. */
576 ts_ptr = (uint32_t *)(th + 1);
577 if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
578 (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
579 TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))))
580 return (TCP_LRO_CANNOT);
582 /* If the driver did not pass in the checksum, set it now. */
586 seq = ntohl(th->th_seq);
588 /* Try to find a matching previous segment. */
/* An entry is compared on ethertype, TCP ports and IP addresses; the statements run on a mismatch are not visible here. */
589 SLIST_FOREACH(le, &lc->lro_active, next) {
590 if (le->eh_type != eh_type)
592 if (le->source_port != th->th_sport ||
593 le->dest_port != th->th_dport)
598 if (bcmp(&le->source_ip6, &ip6->ip6_src,
599 sizeof(struct in6_addr)) != 0 ||
600 bcmp(&le->dest_ip6, &ip6->ip6_dst,
601 sizeof(struct in6_addr)) != 0)
607 if (le->source_ip4 != ip4->ip_src.s_addr ||
608 le->dest_ip4 != ip4->ip_dst.s_addr)
614 /* Flush now if appending will result in overflow. */
615 if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
616 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
617 tcp_lro_flush(lc, le);
621 /* Try to append the new segment. */
622 if (__predict_false(seq != le->next_seq ||
623 (tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
624 /* Out of order packet or duplicate ACK. */
625 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
626 tcp_lro_flush(lc, le);
627 return (TCP_LRO_CANNOT);
/* The timestamp value is converted to host order for the comparison below. */
631 uint32_t tsval = ntohl(*(ts_ptr + 1));
632 /* Make sure timestamp values are increasing. */
633 /* XXX-BZ flip and use TSTMP_GEQ macro for this? */
634 if (__predict_false(le->tsval > tsval ||
636 return (TCP_LRO_CANNOT);
/* tsecr is stored exactly as received, with no byte swap; tcp_lro_flush() writes it back unchanged. */
638 le->tsecr = *(ts_ptr + 2);
/* Record the next expected sequence number and the newest ACK and window. */
641 le->next_seq += tcp_data_len;
642 le->ack_seq = th->th_ack;
643 le->window = th->th_win;
646 #ifdef TCP_LRO_UPDATE_CSUM
647 le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
648 tcp_data_len, ~csum);
/* Segment without payload (pure ACK). */
651 if (tcp_data_len == 0) {
654 * Flush this LRO entry, if this ACK should not
655 * be further delayed.
657 if (le->append_cnt >= lc->lro_ackcnt_lim) {
658 SLIST_REMOVE(&lc->lro_active, le, lro_entry,
660 tcp_lro_flush(lc, le);
665 le->p_len += tcp_data_len;
668 * Adjust the mbuf so that m_data points to the first byte of
669 * the ULP payload. Adjust the mbuf to avoid complications and
670 * append new segment to existing mbuf chain.
672 m_adj(m, m->m_pkthdr.len - tcp_data_len);
/* Link the trimmed mbuf behind the current tail and move the tail to its last mbuf. */
675 le->m_tail->m_next = m;
676 le->m_tail = m_last(m);
679 * If a possible next full length packet would cause an
680 * overflow, pro-actively flush now.
682 if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) {
683 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
684 tcp_lro_flush(lc, le);
/* Refresh the modification time that tcp_lro_flush_inactive() compares against. */
686 getmicrotime(&le->mtime);
691 /* Try to find an empty slot. */
692 if (SLIST_EMPTY(&lc->lro_free))
693 return (TCP_LRO_NO_ENTRIES);
695 /* Start a new segment chain. */
696 le = SLIST_FIRST(&lc->lro_free);
697 SLIST_REMOVE_HEAD(&lc->lro_free, next);
698 SLIST_INSERT_HEAD(&lc->lro_active, le, next);
699 getmicrotime(&le->mtime);
701 /* Start filling in details. */
/* IPv6 entry: p_len starts as the packet length minus the Ethernet and IPv6 headers. */
706 le->source_ip6 = ip6->ip6_src;
707 le->dest_ip6 = ip6->ip6_dst;
708 le->eh_type = eh_type;
709 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
/* IPv4 entry: p_len starts as the packet length minus the Ethernet header only, so it still includes the IPv4 header. */
715 le->source_ip4 = ip4->ip_src.s_addr;
716 le->dest_ip4 = ip4->ip_dst.s_addr;
717 le->eh_type = eh_type;
718 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
722 le->source_port = th->th_sport;
723 le->dest_port = th->th_dport;
725 le->next_seq = seq + tcp_data_len;
726 le->ack_seq = th->th_ack;
727 le->window = th->th_win;
/* tsval is stored in host order and tsecr as received, matching the append path above. */
730 le->tsval = ntohl(*(ts_ptr + 1));
731 le->tsecr = *(ts_ptr + 2);
734 #ifdef TCP_LRO_UPDATE_CSUM
736 * Do not touch the csum of the first packet. However save the
737 * "adjusted" checksum of just the source and destination addresses,
738 * the next header and the TCP payload. The length and TCP header
739 * parts may change, so we remove those from the saved checksum and
740 * re-add with final values on tcp_lro_flush() if needed.
742 KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
743 __func__, le, le->ulp_csum));
745 le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
747 th->th_sum = csum; /* Restore checksum on first packet. */
/* A new chain's tail is the last mbuf of this first packet. */
751 le->m_tail = m_last(m);
/*
 * Queue mbuf mb in lc->lro_mbuf_data for later batch processing by
 * tcp_lro_flush_all().
 *
 * A packet with no csum_flags, or arriving while IFCAP_LRO is not enabled
 * on the interface, goes to if_input directly. When the array is already
 * full it is flushed before mb is stored.
 * NOTE(review): the handling of an unusable lc (first test below) and the
 * return statements are not visible in this listing.
 */
757 tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
/* Sanity check: queueing needs an ifnet, an allocated array and a non-zero capacity. */
760 if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
761 lc->lro_mbuf_max == 0)) {
767 /* check if packet is not LRO capable */
768 if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
769 (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
773 /* input packet to network layer */
774 (*lc->ifp->if_input) (lc->ifp, mb);
778 /* check if array is full */
/* tcp_lro_flush_all() resets lro_mbuf_count to 0, which frees a slot for mb. */
779 if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max))
780 tcp_lro_flush_all(lc);
782 /* store sequence number */
/* The array index at queue time is stored; tcp_lro_mbuf_compare_header() uses it as its last sort key. */
783 TCP_LRO_SEQUENCE(mb) = lc->lro_mbuf_count;
786 lc->lro_mbuf_data[lc->lro_mbuf_count++] = mb;