2 * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
30 #include "opt_inet6.h"
32 #include <sys/param.h>
33 #include <sys/libkern.h>
36 #include <sys/mutex.h>
37 #include <sys/sysctl.h>
38 #include <sys/malloc.h>
39 #include <sys/kernel.h>
40 #include <sys/endian.h>
41 #include <sys/socket.h>
42 #include <sys/sockopt.h>
46 #include <net/if_var.h>
47 #include <net/ethernet.h>
49 #if defined(INET) || defined(INET6)
50 #include <netinet/in.h>
54 #include <netinet/ip.h>
58 #include <netinet/ip6.h>
61 #include <netinet/tcp_var.h>
/*
 * Fallback used when M_HASHTYPE_LRO_TCP is not already defined: emit a
 * build-time warning and define the value locally as 254.
 */
65 #ifndef M_HASHTYPE_LRO_TCP
67 #warning "M_HASHTYPE_LRO_TCP is not defined"
69 #define M_HASHTYPE_LRO_TCP 254
/* Sysctl node under _net_inet_tcp that holds the two tunables below. */
72 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, tlro,
73 CTLFLAG_RW, 0, "TCP turbo LRO parameters");
/* Malloc type passed to malloc() and free() in tcp_tlro_init() and tcp_tlro_free(). */
75 static MALLOC_DEFINE(M_TLRO, "TLRO", "Turbo LRO");
/* Read by tcp_tlro_get_ticks(); default is 20. */
77 static int tlro_min_rate = 20; /* Hz */
79 SYSCTL_INT(_net_inet_tcp_tlro, OID_AUTO, min_rate, CTLFLAG_RWTUN,
80 &tlro_min_rate, 0, "Minimum serving rate in Hz");
/*
 * Upper bound that tcp_tlro_combine() compares against before appending
 * another segment to an accumulated packet.
 */
82 static int tlro_max_packet = IP_MAXPACKET;
84 SYSCTL_INT(_net_inet_tcp_tlro, OID_AUTO, max_packet, CTLFLAG_RWTUN,
85 &tlro_max_packet, 0, "Maximum packet size in bytes");
/*
 * Tail of the packed wrapper type whose "value" member tcp_tlro_csum()
 * reads through a pointer.
 */
89 } __packed uint32_p_t;
/*
 * Sum the 32-bit words in the range [p, p + l / 4), converting each word
 * with le32toh() before adding it to the accumulator "cs".  Because the
 * word count is "l / 4", any trailing bytes beyond a multiple of four are
 * not read.  The last visible statement folds the bits above bit 15 back
 * into the low 16 bits of the accumulator.
 *
 * Callers in this file compare the result against 0xFFFF and also add it
 * into running checksum accumulators.
 */
92 tcp_tlro_csum(const uint32_p_t *p, size_t l)
94 const uint32_p_t *pend = p + (l / 4);
97 for (cs = 0; p != pend; p++)
98 cs += le32toh(p->value);
100 cs = (cs >> 16) + (cs & 0xffff);
/*
 * Return a pointer "off" bytes into the data area of mbuf "m".
 *
 * The m_len test covers only the first mbuf of the chain: the requested
 * range [off, off + len) must lie inside it.  The action taken when the
 * test fails is not visible here; callers in this file compare the
 * returned pointer against NULL.
 */
105 tcp_tlro_get_header(const struct mbuf *m, const u_int off,
108 if (m->m_len < (off + len))
110 return (mtod(m, char *) + off);
/*
 * Save the TCP timestamp option values of a segment into "pinfo".
 *
 * Two conditions are tested before the values are read: the TCP data
 * offset must be large enough to hold TCPOLEN_TSTAMP_APPA bytes of
 * options, and the first 32-bit option word must be exactly
 * NOP, NOP, TIMESTAMP, TCPOLEN_TIMESTAMP.  The two following words are
 * then copied verbatim (no byte-order conversion) into tcp_ts and
 * tcp_ts_reply.
 *
 * The return value is stored as one byte of the header key by
 * tcp_tlro_extract_header(); the values returned are not visible here.
 *
 * NOTE(review): the option words are read directly after the fixed TCP
 * header; nothing visible in this function verifies that they lie within
 * the first mbuf - confirm against the caller.
 */
114 tcp_tlro_info_save_timestamp(struct tlro_mbuf_data *pinfo)
116 struct tcphdr *tcp = pinfo->tcp;
119 if (tcp->th_off < ((TCPOLEN_TSTAMP_APPA + sizeof(*tcp)) >> 2))
122 ts_ptr = (uint32_t *)(tcp + 1);
123 if (*ts_ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
124 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
127 /* Save timestamps */
128 pinfo->tcp_ts = ts_ptr[1];
129 pinfo->tcp_ts_reply = ts_ptr[2];
/*
 * Write the timestamp values saved in "pinfob" into the TCP options of the
 * segment described by "pinfoa".
 *
 * The same two tests as in tcp_tlro_info_save_timestamp() are applied to
 * the header of "pinfoa" first: sufficient data offset and the
 * NOP, NOP, TIMESTAMP, TCPOLEN_TIMESTAMP leading option word.  The values
 * are stored without byte-order conversion, matching how they were saved.
 */
134 tcp_tlro_info_restore_timestamp(struct tlro_mbuf_data *pinfoa,
135 struct tlro_mbuf_data *pinfob)
137 struct tcphdr *tcp = pinfoa->tcp;
140 if (tcp->th_off < ((TCPOLEN_TSTAMP_APPA + sizeof(*tcp)) >> 2))
143 ts_ptr = (uint32_t *)(tcp + 1);
144 if (*ts_ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
145 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
148 /* Restore timestamps */
149 ts_ptr[1] = pinfob->tcp_ts;
150 ts_ptr[2] = pinfob->tcp_ts_reply;
/*
 * Parse the headers of packet "m" and fill in "pinfo".
 *
 * The Ethernet header (plus VLAN tag when the ethertype is ETHERTYPE_VLAN),
 * the IPv4 or IPv6 header and the TCP header are located via
 * tcp_tlro_get_header().  The fields that identify a flow - destination
 * MAC, ethertype, VLAN tag/proto, IP source and destination, TCP ports,
 * TCP data offset and the result of tcp_tlro_info_save_timestamp() - are
 * appended to pinfo->buf, which tcp_tlro_compare_header() and
 * tcp_tlro_combine() use as a comparison key.  The remainder of the key is
 * zero-padded up to TLRO_MAX_HEADER.
 *
 * IPv4 packets must be unfragmented, carry TCP, have no IP options and
 * carry valid IP and data checksum flags; IPv6 packets must have TCP as the
 * next header and a valid data checksum flag.
 *
 * When the mbuf is tagged M_HASHTYPE_LRO_TCP the IP length is derived from
 * m_pkthdr.len instead of the IP header; tcp_tlro_combine() sets that tag
 * when the real length no longer fits the header field.
 *
 * The last visible statement resets buf_length to 0; tcp_tlro_combine()
 * forwards entries with an empty key without combining them.
 *
 * "seq" is stored as the arrival sequence number used as the final
 * tie-breaker by tcp_tlro_compare_header().
 */
154 tcp_tlro_extract_header(struct tlro_mbuf_data *pinfo, struct mbuf *m, int seq)
156 uint8_t *phdr = (uint8_t *)pinfo->buf;
157 struct ether_header *eh;
158 struct ether_vlan_header *vlan;
170 /* Fill in information */
172 pinfo->last_tick = ticks;
173 pinfo->sequence = seq;
174 pinfo->pprev = &m_last(m)->m_next;
179 eh = tcp_tlro_get_header(m, 0, sizeof(*eh));
182 memcpy(phdr, &eh->ether_dhost, ETHER_ADDR_LEN);
183 phdr += ETHER_ADDR_LEN;
184 memcpy(phdr, &eh->ether_type, sizeof(eh->ether_type));
185 phdr += sizeof(eh->ether_type);
186 etype = ntohs(eh->ether_type);
188 if (etype == ETHERTYPE_VLAN) {
189 vlan = tcp_tlro_get_header(m, off, sizeof(*vlan));
192 memcpy(phdr, &vlan->evl_tag, sizeof(vlan->evl_tag) +
193 sizeof(vlan->evl_proto));
194 phdr += sizeof(vlan->evl_tag) + sizeof(vlan->evl_proto);
195 etype = ntohs(vlan->evl_proto);
196 off += sizeof(*vlan) - sizeof(*eh);
204 * - Fragmented packets
205 * - Packets with IPv4 options
208 ip = tcp_tlro_get_header(m, off, sizeof(*ip));
210 (ip->ip_off & htons(IP_MF | IP_OFFMASK)) != 0 ||
211 (ip->ip_p != IPPROTO_TCP) ||
212 (ip->ip_hl << 2) != sizeof(*ip))
215 /* Legacy IP has a header checksum that needs to be correct */
216 if (!(m->m_pkthdr.csum_flags & CSUM_IP_CHECKED)) {
217 /* Verify IP header */
218 if (tcp_tlro_csum((uint32_p_t *)ip, sizeof(*ip)) != 0xFFFF)
219 m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
221 m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED |
224 /* Only accept valid checksums */
225 if (!(m->m_pkthdr.csum_flags & CSUM_IP_VALID) ||
226 !(m->m_pkthdr.csum_flags & CSUM_DATA_VALID))
228 memcpy(phdr, &ip->ip_src, sizeof(ip->ip_src) +
230 phdr += sizeof(ip->ip_src) + sizeof(ip->ip_dst);
231 if (M_HASHTYPE_GET(m) == M_HASHTYPE_LRO_TCP)
232 pinfo->ip_len = m->m_pkthdr.len - off;
234 pinfo->ip_len = ntohs(ip->ip_len);
235 pinfo->ip_hdrlen = sizeof(*ip);
237 pinfo->ip_version = 4;
246 * - Packets with IPv6 options
249 ip6 = tcp_tlro_get_header(m, off, sizeof(*ip6));
250 if (ip6 == NULL || ip6->ip6_nxt != IPPROTO_TCP)
252 if (!(m->m_pkthdr.csum_flags & CSUM_DATA_VALID))
254 memcpy(phdr, &ip6->ip6_src, sizeof(struct in6_addr) +
255 sizeof(struct in6_addr));
256 phdr += sizeof(struct in6_addr) + sizeof(struct in6_addr);
257 if (M_HASHTYPE_GET(m) == M_HASHTYPE_LRO_TCP)
258 pinfo->ip_len = m->m_pkthdr.len - off;
260 pinfo->ip_len = ntohs(ip6->ip6_plen) + sizeof(*ip6);
261 pinfo->ip_hdrlen = sizeof(*ip6);
263 pinfo->ip_version = 6;
270 tcp = tcp_tlro_get_header(m, off, sizeof(*tcp));
273 memcpy(phdr, &tcp->th_sport, sizeof(tcp->th_sport) +
274 sizeof(tcp->th_dport));
275 phdr += sizeof(tcp->th_sport) +
276 sizeof(tcp->th_dport);
277 /* Store TCP header length */
278 *phdr++ = tcp->th_off;
279 if (tcp->th_off < (sizeof(*tcp) >> 2))
282 /* Compute offset to data payload */
283 pinfo->tcp_len = (tcp->th_off << 2);
284 off += pinfo->tcp_len;
286 /* Store more info */
287 pinfo->data_off = off;
290 /* Try to save timestamp, if any */
291 *phdr++ = tcp_tlro_info_save_timestamp(pinfo);
/*
 * The IP length must cover both the IP header and the TCP header.
 * Comparing against tcp_len alone lets a short IP length through, which
 * makes data_len negative below and inflates the amount trimmed from the
 * packet.
 */
293 /* Verify offset and IP/TCP length */
294 if (off > m->m_pkthdr.len ||
295 pinfo->ip_len < (pinfo->ip_hdrlen + pinfo->tcp_len))
298 /* Compute data payload length */
299 pinfo->data_len = (pinfo->ip_len - pinfo->tcp_len - pinfo->ip_hdrlen);
301 /* Trim any padded data */
302 diff = (m->m_pkthdr.len - off) - pinfo->data_len;
309 /* Compute header length */
310 pinfo->buf_length = phdr - (uint8_t *)pinfo->buf;
311 /* Zero-pad rest of buffer */
312 memset(phdr, 0, TLRO_MAX_HEADER - pinfo->buf_length);
315 pinfo->buf_length = 0;
/*
 * Compare two header keys of TLRO_MAX_HEADER bytes, 64 bits at a time.
 *
 * The per-word result is derived from relational operators so that its
 * sign is correct for every pair of values.  A plain unsigned subtraction
 * wraps around, so its sign does not define a consistent ordering, which
 * the qsort() comparator tcp_tlro_compare_header() relies on to place
 * identical keys next to each other.
 *
 * Only a consistent ordering is needed, not a numerically meaningful one,
 * so the byte order of the words is irrelevant.
 */
319 tcp_tlro_cmp64(const uint64_t *pa, const uint64_t *pb)
324 for (x = 0; x != TLRO_MAX_HEADER / 8; x++) {
326 * NOTE: Endianness does not matter in this
329 diff = (pa[x] > pb[x]) - (pa[x] < pb[x]);
/*
 * qsort() comparison callback for arrays of struct tlro_mbuf_ptr; it is
 * passed to qsort() by tcp_tlro_sort().
 *
 * The visible comparison stages, in order:
 *   1. whether "head" is NULL (an entry with a NULL head compares greater
 *      than one with a non-NULL head),
 *   2. the header key length buf_length,
 *   3. the header key contents, only when buf_length is non-zero,
 *   4. the TCP sequence number, then the TCP acknowledgement number, as
 *      differences of the host-order values,
 *   5. the arrival sequence number stored by tcp_tlro_extract_header().
 *
 * The statements between the stages are not visible here; presumably each
 * stage is reached only when the previous one compared equal - confirm.
 */
342 tcp_tlro_compare_header(const void *_ppa, const void *_ppb)
344 const struct tlro_mbuf_ptr *ppa = _ppa;
345 const struct tlro_mbuf_ptr *ppb = _ppb;
346 struct tlro_mbuf_data *pinfoa = ppa->data;
347 struct tlro_mbuf_data *pinfob = ppb->data;
350 ret = (pinfoa->head == NULL) - (pinfob->head == NULL);
354 ret = pinfoa->buf_length - pinfob->buf_length;
357 if (pinfoa->buf_length != 0) {
358 ret = tcp_tlro_cmp64(pinfoa->buf, pinfob->buf);
361 ret = ntohl(pinfoa->tcp->th_seq) - ntohl(pinfob->tcp->th_seq);
364 ret = ntohl(pinfoa->tcp->th_ack) - ntohl(pinfob->tcp->th_ack);
367 ret = pinfoa->sequence - pinfob->sequence;
/*
 * Sort the first tlro->curr entries of tlro->mbuf with qsort(), using
 * tcp_tlro_compare_header() as the ordering.
 */
376 tcp_tlro_sort(struct tlro_ctrl *tlro)
381 qsort(tlro->mbuf, tlro->curr, sizeof(struct tlro_mbuf_ptr),
382 &tcp_tlro_compare_header);
/*
 * Start from the tlro_min_rate tunable.  The rest of the computation is not
 * visible here.  The result is used by tcp_tlro_combine() as the tick limit
 * that a packet's age is compared against.
 */
386 tcp_tlro_get_ticks(void)
388 int to = tlro_min_rate;
/*
 * Walk the entries tlro->mbuf[0 .. curr) and combine consecutive TCP
 * segments that belong to the same flow.
 *
 * Index "y" marks the first entry of a group and "x" is advanced while the
 * following entries have the same header key (same buf_length and same buf
 * contents per tcp_tlro_cmp64()).  Within a group:
 *
 *  - If the key is empty (buf_length == 0), every entry of the group is
 *    handed to the interface's if_input routine.
 *  - Otherwise entries y + 1 .. x are appended to entry y for as long as
 *    both segments carry no TCP flags other than ACK and PUSH, the combined
 *    length stays within tlro_max_packet, and the next segment's sequence
 *    number equals the accumulated sequence number plus payload length.
 *    ACK number, window, timestamps and flags of the appended segment are
 *    carried over into the accumulated header and the TCP checksum is
 *    recomputed from the running sum.
 *
 * The accumulated packet is passed to if_input when "force" is non-zero,
 * when appending stopped before the end of the group (z != x), when the
 * packet is at least tcp_tlro_get_ticks() ticks old, or when it carries no
 * payload.
 *
 * When the accumulated length exceeds what the IP length field can hold,
 * the mbuf is tagged M_HASHTYPE_LRO_TCP and the field is set to
 * IP_MAXPACKET; tcp_tlro_extract_header() takes the length from
 * m_pkthdr.len for mbufs carrying that tag.
 *
 * The final loops swap entries whose head is NULL with later entries; the
 * statement that follows the inner NULL test is not visible here.
 */
399 tcp_tlro_combine(struct tlro_ctrl *tlro, int force)
401 struct tlro_mbuf_data *pinfoa;
402 struct tlro_mbuf_data *pinfob;
404 int curr_ticks = ticks;
405 int ticks_limit = tcp_tlro_get_ticks();
414 for (y = 0; y != tlro->curr;) {
417 pinfoa = tlro->mbuf[y].data;
418 for (x = y + 1; x != tlro->curr; x++) {
419 pinfob = tlro->mbuf[x].data;
420 if (pinfoa->buf_length != pinfob->buf_length ||
421 tcp_tlro_cmp64(pinfoa->buf, pinfob->buf) != 0)
424 if (pinfoa->buf_length == 0) {
425 /* Forward traffic which cannot be combined */
426 for (z = y; z != x; z++) {
427 /* Just forward packets */
428 pinfob = tlro->mbuf[z].data;
432 /* Reset info structure */
434 pinfob->buf_length = 0;
439 /* Input packet to network layer */
440 (*tlro->ifp->if_input) (tlro->ifp, m);
446 /* Compute current checksum subtracted some header parts */
447 temp = (pinfoa->ip_len - pinfoa->ip_hdrlen);
448 cs = ((temp & 0xFF) << 8) + ((temp & 0xFF00) >> 8) +
449 tcp_tlro_csum((uint32_p_t *)pinfoa->tcp, pinfoa->tcp_len);
451 /* Append all fragments into one block */
452 for (z = y + 1; z != x; z++) {
454 pinfob = tlro->mbuf[z].data;
456 /* Check for command packets */
457 if ((pinfoa->tcp->th_flags & ~(TH_ACK | TH_PUSH)) ||
458 (pinfob->tcp->th_flags & ~(TH_ACK | TH_PUSH)))
461 /* Check if there is enough space */
462 if ((pinfoa->ip_len + pinfob->data_len) > tlro_max_packet)
465 /* Try to append the new segment */
466 temp = ntohl(pinfoa->tcp->th_seq) + pinfoa->data_len;
467 if (temp != (int)ntohl(pinfob->tcp->th_seq))
470 temp = pinfob->ip_len - pinfob->ip_hdrlen;
471 cs += ((temp & 0xFF) << 8) + ((temp & 0xFF00) >> 8) +
472 tcp_tlro_csum((uint32_p_t *)pinfob->tcp, pinfob->tcp_len);
473 /* Remove fields which appear twice */
474 cs += (IPPROTO_TCP << 8);
475 if (pinfob->ip_version == 4) {
476 cs += tcp_tlro_csum((uint32_p_t *)&pinfob->ip.v4->ip_src, 4);
477 cs += tcp_tlro_csum((uint32_p_t *)&pinfob->ip.v4->ip_dst, 4);
479 cs += tcp_tlro_csum((uint32_p_t *)&pinfob->ip.v6->ip6_src, 16);
480 cs += tcp_tlro_csum((uint32_p_t *)&pinfob->ip.v6->ip6_dst, 16);
482 /* Remainder computation */
484 cs = (cs >> 16) + (cs & 0xffff);
486 /* Update window and ack sequence number */
487 pinfoa->tcp->th_ack = pinfob->tcp->th_ack;
488 pinfoa->tcp->th_win = pinfob->tcp->th_win;
490 /* Check if we should restore the timestamp */
491 tcp_tlro_info_restore_timestamp(pinfoa, pinfob);
493 /* Accumulate TCP flags */
494 pinfoa->tcp->th_flags |= pinfob->tcp->th_flags;
497 pinfoa->ip_len += pinfob->data_len;
498 pinfoa->data_len += pinfob->data_len;
500 /* Clear mbuf pointer - packet is accumulated */
503 /* Reset info structure */
505 pinfob->buf_length = 0;
507 /* Append data to mbuf [y] */
508 m_adj(m, pinfob->data_off);
509 /* Delete mbuf tags, if any */
510 m_tag_delete_chain(m, NULL);
511 /* Clear packet header flag */
512 m->m_flags &= ~M_PKTHDR;
514 /* Concat mbuf(s) to end of list */
515 pinfoa->pprev[0] = m;
517 pinfoa->pprev = &m->m_next;
518 pinfoa->head->m_pkthdr.len += pinfob->data_len;
520 /* Compute new TCP header checksum */
521 pinfoa->tcp->th_sum = 0;
523 temp = pinfoa->ip_len - pinfoa->ip_hdrlen;
525 tcp_tlro_csum((uint32_p_t *)pinfoa->tcp, pinfoa->tcp_len) +
526 ((temp & 0xFF) << 8) + ((temp & 0xFF00) >> 8);
528 /* Remainder computation */
530 cs = (cs >> 16) + (cs & 0xffff);
532 /* Update new checksum */
533 pinfoa->tcp->th_sum = ~htole16(cs);
535 /* Update IP length, if any */
536 if (pinfoa->ip_version == 4) {
537 if (pinfoa->ip_len > IP_MAXPACKET) {
538 M_HASHTYPE_SET(pinfoa->head, M_HASHTYPE_LRO_TCP);
539 pinfoa->ip.v4->ip_len = htons(IP_MAXPACKET);
541 pinfoa->ip.v4->ip_len = htons(pinfoa->ip_len);
544 if (pinfoa->ip_len > (IP_MAXPACKET + sizeof(*pinfoa->ip.v6))) {
545 M_HASHTYPE_SET(pinfoa->head, M_HASHTYPE_LRO_TCP);
546 pinfoa->ip.v6->ip6_plen = htons(IP_MAXPACKET);
548 temp = pinfoa->ip_len - sizeof(*pinfoa->ip.v6);
549 pinfoa->ip.v6->ip6_plen = htons(temp);
553 temp = curr_ticks - pinfoa->last_tick;
554 /* Check if packet should be forwarded */
555 if (force != 0 || z != x || temp >= ticks_limit ||
556 pinfoa->data_len == 0) {
558 /* Compute new IPv4 header checksum */
559 if (pinfoa->ip_version == 4) {
560 pinfoa->ip.v4->ip_sum = 0;
561 cs = tcp_tlro_csum((uint32_p_t *)pinfoa->ip.v4,
562 sizeof(*pinfoa->ip.v4));
563 pinfoa->ip.v4->ip_sum = ~htole16(cs);
568 /* Reset info structure */
570 pinfoa->buf_length = 0;
575 /* Input packet to network layer */
576 (*tlro->ifp->if_input) (tlro->ifp, m);
581 /* Cleanup all NULL heads */
582 for (y = 0; y != tlro->curr; y++) {
583 if (tlro->mbuf[y].data->head == NULL) {
584 for (z = y + 1; z != tlro->curr; z++) {
585 struct tlro_mbuf_ptr ptemp;
586 if (tlro->mbuf[z].data->head == NULL)
588 ptemp = tlro->mbuf[y];
589 tlro->mbuf[y] = tlro->mbuf[z];
590 tlro->mbuf[z] = ptemp;
/*
 * Loop for as long as at least one entry is in use and the last used entry
 * (index curr - 1) has a NULL head.  The loop body is not visible here;
 * presumably it drops that trailing entry by decrementing tlro->curr -
 * confirm.
 */
600 tcp_tlro_cleanup(struct tlro_ctrl *tlro)
602 while (tlro->curr != 0 &&
603 tlro->mbuf[tlro->curr - 1].data->head == NULL)
/*
 * Process the queued packets.  The visible calls are tcp_tlro_cleanup()
 * followed by tcp_tlro_combine(); "force" is passed through unchanged to
 * tcp_tlro_combine(), where a non-zero value makes accumulated packets get
 * forwarded regardless of their age.  Any statements before the first
 * visible call are not shown here.
 */
608 tcp_tlro_flush(struct tlro_ctrl *tlro, int force)
614 tcp_tlro_cleanup(tlro);
615 tcp_tlro_combine(tlro, force);
/*
 * Initialise a TLRO control structure.
 *
 * The structure is zeroed first.  A single allocation of type M_TLRO then
 * holds max_mbufs pointer slots (struct tlro_mbuf_ptr) followed directly
 * by max_mbufs data records (struct tlro_mbuf_data); slot x is pointed at
 * record x.  The allocation uses M_WAITOK | M_ZERO.
 *
 * A non-positive max_mbufs, a non-positive size or a NULL ifp is tested
 * for before allocating; the action taken is not visible here.
 *
 * NOTE(review): "size" is computed before max_mbufs is validated and its
 * declaration is not visible here - confirm that the multiplication cannot
 * overflow for large max_mbufs values.
 */
619 tcp_tlro_init(struct tlro_ctrl *tlro, struct ifnet *ifp,
625 /* Set zero defaults */
626 memset(tlro, 0, sizeof(*tlro));
628 /* Compute size needed for data */
629 size = (sizeof(struct tlro_mbuf_ptr) * max_mbufs) +
630 (sizeof(struct tlro_mbuf_data) * max_mbufs);
633 if (max_mbufs <= 0 || size <= 0 || ifp == NULL)
636 /* Setup tlro control structure */
637 tlro->mbuf = malloc(size, M_TLRO, M_WAITOK | M_ZERO);
638 tlro->max = max_mbufs;
641 /* Setup pointer array */
642 for (x = 0; x != tlro->max; x++) {
643 tlro->mbuf[x].data = ((struct tlro_mbuf_data *)
644 &tlro->mbuf[max_mbufs]) + x;
/*
 * Tear down a TLRO control structure.
 *
 * A structure whose mbuf array pointer is NULL is treated as not set up.
 * Otherwise all tlro->max entries are visited and their key length is
 * reset; how leftover mbufs are released is not visible here.  The array is
 * then returned with free() and the whole structure is zeroed, which leaves
 * tlro->mbuf NULL again.
 */
650 tcp_tlro_free(struct tlro_ctrl *tlro)
652 struct tlro_mbuf_data *pinfo;
656 /* Check if not setup */
657 if (tlro->mbuf == NULL)
659 /* Free MBUF array and any leftover MBUFs */
660 for (y = 0; y != tlro->max; y++) {
662 pinfo = tlro->mbuf[y].data;
666 /* Reset info structure */
668 pinfo->buf_length = 0;
672 free(tlro->mbuf, M_TLRO);
674 memset(tlro, 0, sizeof(*tlro));
/*
 * Receive entry point for one packet.
 *
 * When the first mbuf holds data and a free slot is left
 * (curr < max), the packet is parsed into the next slot by
 * tcp_tlro_extract_header(), consuming one slot and one arrival sequence
 * number.  Otherwise, if an interface is attached, the packet is passed
 * straight to its if_input routine.  What happens when neither condition
 * holds is not visible here.
 */
678 tcp_tlro_rx(struct tlro_ctrl *tlro, struct mbuf *m)
680 if (m->m_len > 0 && tlro->curr < tlro->max) {
685 tcp_tlro_extract_header(tlro->mbuf[tlro->curr++].data,
686 m, tlro->sequence++);
687 } else if (tlro->ifp != NULL) {
691 /* input packet to network layer */
692 (*tlro->ifp->if_input) (tlro->ifp, m);