2 * Copyright (c) 2007, Myricom Inc.
3 * Copyright (c) 2008, Intel Corporation.
4 * Copyright (c) 2012 The FreeBSD Foundation
7 * Portions of this software were developed by Bjoern Zeeb
8 * under sponsorship from the FreeBSD Foundation.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
36 #include "opt_inet6.h"
38 #include <sys/param.h>
39 #include <sys/systm.h>
41 #include <sys/kernel.h>
42 #include <sys/socket.h>
45 #include <net/if_var.h>
46 #include <net/ethernet.h>
49 #include <netinet/in_systm.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/ip.h>
53 #include <netinet/ip_var.h>
54 #include <netinet/tcp.h>
55 #include <netinet/tcp_lro.h>
57 #include <netinet6/ip6_var.h>
59 #include <machine/in_cksum.h>
/*
 * Compile-time knobs for the LRO code in this file.
 * NOTE(review): this listing has gaps in its embedded line numbering; the
 * #endif matching the #ifndef below is not visible here.
 */
62 #define LRO_ENTRIES 8 /* # of LRO entries per RX queue. */
/* Defined here, so the #ifdef TCP_LRO_UPDATE_CSUM sections of this file are the ones compiled. */
65 #define TCP_LRO_UPDATE_CSUM 1
66 #ifndef TCP_LRO_UPDATE_CSUM
/* Value tcp_lro_flush() stores into ip_sum and th_sum when checksums are not recomputed. */
67 #define TCP_LRO_INVALID_CSUM 0x0000
/*
 * Prepare an LRO control block for use: both the free list and the active
 * list of lc are initialised, then LRO_ENTRIES entries are allocated with
 * malloc(9) from M_DEVBUF and pushed onto the free list.
 *
 * NOTE(review): the return type, the local declarations, the remaining
 * malloc() arguments and any allocation-failure handling are not visible in
 * this listing (embedded numbering jumps 71 -> 80 and 85 -> 93) - confirm
 * against the full file.
 */
71 tcp_lro_init(struct lro_ctrl *lc)
80 SLIST_INIT(&lc->lro_free);
81 SLIST_INIT(&lc->lro_active);
84 for (i = 0; i < LRO_ENTRIES; i++) {
85 le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF,
93 SLIST_INSERT_HEAD(&lc->lro_free, le, next);
/*
 * Drain the free list of lc: entries are taken from the head one at a time
 * until the list is empty. Only lro_free is walked in the visible lines;
 * lro_active is not touched here.
 *
 * NOTE(review): what is done with each removed entry (presumably it is
 * released back to the allocator) is on a line missing from this listing -
 * confirm.
 */
100 tcp_lro_free(struct lro_ctrl *lc)
102 struct lro_entry *le;
104 while (!SLIST_EMPTY(&lc->lro_free)) {
105 le = SLIST_FIRST(&lc->lro_free);
106 SLIST_REMOVE_HEAD(&lc->lro_free, next);
111 #ifdef TCP_LRO_UPDATE_CSUM
/*
 * Compute a 16-bit checksum value for the TCP header th.
 *
 * Side effect: th->th_sum is set to zero in the header itself and is NOT
 * restored here. A caller that still needs the previous value has to put it
 * back (tcp_lro_rx() does so for the first packet of a chain).
 *
 * NOTE(review): the loop that sums the header words is not visible in this
 * listing; only the initialisation, one carry fold and the return are.
 */
113 tcp_lro_csum_th(struct tcphdr *th)
118 ch = th->th_sum = 0x0000;
/* Fold the carries above bit 15 back into the low 16 bits. */
129 ch = (ch >> 16) + (ch & 0xffff);
131 return (ch & 0xffff);
/*
 * Adjust the checksum csum of one received segment so that it can be kept in
 * le->ulp_csum while segments are merged. Going by the existing comments, two
 * contributions are taken out: the length (per L3 protocol, selected by
 * le->eh_type) and the checksum of the TCP header th.
 *
 * le->append_cnt == 0 marks the first segment of a chain and selects a
 * different computation than for later segments.
 *
 * Note that tcp_lro_csum_th() zeroes th->th_sum as a side effect.
 *
 * NOTE(review): case labels, the bodies of the append_cnt branches, the use
 * of tcp_data_len and the final return are on lines missing from this
 * listing - confirm against the full file.
 */
135 tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
136 uint16_t tcp_data_len, uint16_t csum)
143 /* Remove length from checksum. */
144 switch (le->eh_type) {
150 ip6 = (struct ip6_hdr *)l3hdr;
151 if (le->append_cnt == 0)
156 cx = ntohs(ip6->ip6_plen);
157 cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
167 ip4 = (struct ip *)l3hdr;
168 if (le->append_cnt == 0)
171 cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
173 cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
180 cs = 0; /* Keep compiler happy. */
186 /* Remove TCP header csum. */
187 cs = ~tcp_lro_csum_th(th);
/* Fold the carries above bit 15 back into the low 16 bits. */
190 c = (c >> 16) + (c & 0xffff);
/*
 * Flush active entries that have been idle for at least timeout.
 *
 * timeout is subtracted from tv, and every entry on lc->lro_active whose
 * mtime is at or before the resulting tv is unlinked from the active list
 * and handed to tcp_lro_flush(). The _SAFE iterator is used because entries
 * are removed while the list is walked.
 *
 * NOTE(review): how tv is filled in before the subtraction, and the action
 * taken when the active list is empty (presumably an early return), are on
 * lines missing from this listing - confirm.
 */
197 tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
199 struct lro_entry *le, *le_tmp;
202 if (SLIST_EMPTY(&lc->lro_active))
206 timevalsub(&tv, timeout);
207 SLIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
208 if (timevalcmp(&tv, &le->mtime, >=)) {
209 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
210 tcp_lro_flush(lc, le);
/*
 * Deliver the packet chain held by le to the interface input routine and
 * recycle the entry.
 *
 * When at least one segment was appended (append_cnt > 0) the headers of the
 * head packet are rewritten first: L3 length, checksum flags, packet header
 * length, latest ACK and window, latest timestamp, and the TCP checksum.
 * Afterwards le->m_head is passed to lc->ifp->if_input, lc->lro_queued grows
 * by append_cnt + 1, the entry is zeroed and put back on lc->lro_free.
 *
 * This function does not unlink le from lc->lro_active; every caller in this
 * file does SLIST_REMOVE() before calling it.
 *
 * NOTE(review): case labels, #else/#endif lines and several statements are
 * missing from this listing, so the exact nesting of the code below cannot
 * be read from here - confirm against the full file.
 */
216 tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
219 if (le->append_cnt > 0) {
/* p_len is kept in network byte order; it is written to ip6_plen and later added to the TCP checksum. */
223 p_len = htons(le->p_len);
224 switch (le->eh_type) {
231 ip6->ip6_plen = p_len;
232 th = (struct tcphdr *)(ip6 + 1);
233 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
/* From here on le->p_len also counts the Ethernet and IPv6 headers; it becomes m_pkthdr.len below. */
235 le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
243 #ifdef TCP_LRO_UPDATE_CSUM
249 #ifdef TCP_LRO_UPDATE_CSUM
250 /* Fix IP header checksum for new length. */
256 cl = (cl >> 16) + (cl & 0xffff);
260 ip4->ip_sum = TCP_LRO_INVALID_CSUM;
263 th = (struct tcphdr *)(ip4 + 1);
264 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
265 CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
/* Only the Ethernet header is added here, unlike the IPv6 case above. */
266 le->p_len += ETHER_HDR_LEN;
271 th = NULL; /* Keep compiler happy. */
273 le->m_head->m_pkthdr.csum_data = 0xffff;
274 le->m_head->m_pkthdr.len = le->p_len;
276 /* Incorporate the latest ACK into the TCP header. */
277 th->th_ack = le->ack_seq;
278 th->th_win = le->window;
279 /* Incorporate latest timestamp into the TCP header. */
280 if (le->timestamp != 0) {
283 ts_ptr = (uint32_t *)(th + 1);
284 ts_ptr[1] = htonl(le->tsval);
/* tsecr is saved by tcp_lro_rx() exactly as read from the packet, so no htonl() here, unlike tsval. */
285 ts_ptr[2] = le->tsecr;
287 #ifdef TCP_LRO_UPDATE_CSUM
288 /* Update the TCP header checksum. */
289 le->ulp_csum += p_len;
290 le->ulp_csum += tcp_lro_csum_th(th);
291 while (le->ulp_csum > 0xffff)
292 le->ulp_csum = (le->ulp_csum >> 16) +
293 (le->ulp_csum & 0xffff);
294 th->th_sum = (le->ulp_csum & 0xffff);
295 th->th_sum = ~th->th_sum;
297 th->th_sum = TCP_LRO_INVALID_CSUM;
/* Hand the (possibly merged) chain to the interface input routine. */
301 (*lc->ifp->if_input)(lc->ifp, le->m_head);
302 lc->lro_queued += le->append_cnt + 1;
/* Zero the entry before it goes back on the free list. */
304 bzero(le, sizeof(*le));
305 SLIST_INSERT_HEAD(&lc->lro_free, le, next);
/*
 * IPv6 pre-check for tcp_lro_rx().
 *
 * Returns TCP_LRO_NOT_SUPPORTED unless the next-header field of ip6 is TCP
 * (extension headers are therefore not handled). Otherwise the address right
 * after the fixed IPv6 header is stored through th as the TCP header.
 *
 * NOTE(review): the rest of the parameter list (including th) and the
 * success return are on lines missing from this listing; lc and m are not
 * used in the visible lines.
 */
310 tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
314 /* XXX-BZ we should check the flow-label. */
316 /* XXX-BZ We do not yet support ext. hdrs. */
317 if (ip6->ip6_nxt != IPPROTO_TCP)
318 return (TCP_LRO_NOT_SUPPORTED);
320 /* Find the TCP header. */
321 *th = (struct tcphdr *)(ip6 + 1);
/*
 * IPv4 pre-check for tcp_lro_rx().
 *
 * Returns TCP_LRO_NOT_SUPPORTED when the protocol is not TCP, and
 * TCP_LRO_CANNOT when the header carries options, the packet is a fragment
 * (IP_MF set or a non-zero fragment offset), or the IP header checksum is
 * not good. If the mbuf flags say the checksum was already checked
 * (CSUM_IP_CHECKED), CSUM_IP_VALID is required; the in_cksum_hdr() test
 * appears to cover the case where it was not checked. On success the address
 * right after the 20-byte header is stored through th as the TCP header.
 *
 * NOTE(review): the rest of the parameter list (including th), the local
 * declarations, the else/closing-brace lines and the success return are
 * missing from this listing - confirm against the full file.
 */
329 tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
335 if (ip4->ip_p != IPPROTO_TCP)
336 return (TCP_LRO_NOT_SUPPORTED);
338 /* Ensure there are no options. */
339 if ((ip4->ip_hl << 2) != sizeof (*ip4))
340 return (TCP_LRO_CANNOT);
342 /* .. and the packet is not fragmented. */
343 if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
344 return (TCP_LRO_CANNOT);
346 /* Legacy IP has a header checksum that needs to be correct. */
347 csum_flags = m->m_pkthdr.csum_flags;
348 if (csum_flags & CSUM_IP_CHECKED) {
349 if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
351 return (TCP_LRO_CANNOT);
354 csum = in_cksum_hdr(ip4);
355 if (__predict_false((csum) != 0)) {
357 return (TCP_LRO_CANNOT);
361 /* Find the TCP header (we assured there are no IP options). */
362 *th = (struct tcphdr *)(ip4 + 1);
/*
 * Try to merge the received Ethernet frame m into an LRO entry of lc.
 *
 * Outline of the visible code:
 *  1. Read the Ethernet type and run the IPv6 or IPv4 pre-check; frames are
 *     refused with TCP_LRO_CANNOT while the matching forwarding knob
 *     (V_ip6_forwarding / V_ipforwarding) is non-zero.
 *  2. Validate the frame length against the IP length, the TCP flags (only
 *     ACK and PSH may be set) and the TCP options (none, or the aligned
 *     timestamp layout).
 *  3. Search lc->lro_active for an entry with the same ether type, ports and
 *     addresses. A match is flushed instead of extended when the merged
 *     length would pass 65535, when the segment is out of order or a
 *     duplicate ACK, or (after appending) when one more if_mtu sized packet
 *     could overflow.
 *  4. Without a match, take an entry from lc->lro_free and start a new chain
 *     with this frame.
 *
 * csum is the checksum handed in by the driver. Visible failure returns are
 * TCP_LRO_CANNOT and TCP_LRO_NOT_SUPPORTED.
 *
 * NOTE(review): case labels, #ifdef INET/INET6 guards, break/continue lines,
 * several declarations (th, ts_ptr, seq) and the success returns are missing
 * from this listing, so the nesting below is partly inferred - confirm
 * against the full file.
 */
369 tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
371 struct lro_entry *le;
372 struct ether_header *eh;
374 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
377 struct ip *ip4 = NULL; /* Keep compiler happy. */
380 void *l3hdr = NULL; /* Keep compiler happy. */
383 int error, ip_len, l;
384 uint16_t eh_type, tcp_data_len;
386 /* We expect a contiguous header [eh, ip, tcp]. */
388 eh = mtod(m, struct ether_header *);
389 eh_type = ntohs(eh->ether_type);
/* IPv6 path: the L3 header starts right behind the Ethernet header. */
394 CURVNET_SET(lc->ifp->if_vnet);
395 if (V_ip6_forwarding != 0) {
396 /* XXX-BZ stats but changing lro_ctrl is a problem. */
398 return (TCP_LRO_CANNOT);
401 l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
402 error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
/* At this point tcp_data_len still includes the TCP header length. */
405 tcp_data_len = ntohs(ip6->ip6_plen);
406 ip_len = sizeof(*ip6) + tcp_data_len;
/* IPv4 path, same shape as the IPv6 path above. */
413 CURVNET_SET(lc->ifp->if_vnet);
414 if (V_ipforwarding != 0) {
415 /* XXX-BZ stats but changing lro_ctrl is a problem. */
417 return (TCP_LRO_CANNOT);
420 l3hdr = ip4 = (struct ip *)(eh + 1);
421 error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
424 ip_len = ntohs(ip4->ip_len);
425 tcp_data_len = ip_len - sizeof(*ip4);
429 /* XXX-BZ what happens in case of VLAN(s)? */
431 return (TCP_LRO_NOT_SUPPORTED);
435 * If the frame is padded beyond the end of the IP packet, then we must
436 * trim the extra bytes off.
/* l is the number of bytes in the mbuf beyond the end of the IP packet. */
438 l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
441 /* Truncated packet. */
442 return (TCP_LRO_CANNOT);
448 * Check TCP header constraints.
450 /* Ensure no bits set besides ACK or PSH. */
451 if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
452 return (TCP_LRO_CANNOT);
454 /* XXX-BZ We lose an ACK|PUSH flag concatenating multiple segments. */
455 /* XXX-BZ Ideally we'd flush on PUSH? */
458 * Check for timestamps.
459 * Since the only option we handle are timestamps, we only have to
460 * handle the simple case of aligned timestamps.
/* l is reused: TCP header length in bytes, taken from the data offset field. */
462 l = (th->th_off << 2);
465 ts_ptr = (uint32_t *)(th + 1);
/*
 * Accept either no options or exactly the NOP, NOP, TIMESTAMP layout.
 * NOTE(review): ntohl() is applied to a host-order constant here; htonl()
 * would name the intent, the byte swap itself being the same.
 */
466 if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
467 (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
468 TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))))
469 return (TCP_LRO_CANNOT);
471 /* If the driver did not pass in the checksum, set it now. */
475 seq = ntohl(th->th_seq);
477 /* Try to find a matching previous segment. */
478 SLIST_FOREACH(le, &lc->lro_active, next) {
479 if (le->eh_type != eh_type)
/* Ports are compared as stored in the header, without byte-order conversion. */
481 if (le->source_port != th->th_sport ||
482 le->dest_port != th->th_dport)
487 if (bcmp(&le->source_ip6, &ip6->ip6_src,
488 sizeof(struct in6_addr)) != 0 ||
489 bcmp(&le->dest_ip6, &ip6->ip6_dst,
490 sizeof(struct in6_addr)) != 0)
496 if (le->source_ip4 != ip4->ip_src.s_addr ||
497 le->dest_ip4 != ip4->ip_dst.s_addr)
503 /* Flush now if appending will result in overflow. */
504 if (le->p_len > (65535 - tcp_data_len)) {
505 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
506 tcp_lro_flush(lc, le);
510 /* Try to append the new segment. */
511 if (__predict_false(seq != le->next_seq ||
512 (tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
513 /* Out of order packet or duplicate ACK. */
514 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
515 tcp_lro_flush(lc, le);
516 return (TCP_LRO_CANNOT);
/* tsval is compared and stored in host byte order. */
520 uint32_t tsval = ntohl(*(ts_ptr + 1));
521 /* Make sure timestamp values are increasing. */
522 /* XXX-BZ flip and use TSTMP_GEQ macro for this? */
523 if (__predict_false(le->tsval > tsval ||
525 return (TCP_LRO_CANNOT);
/* tsecr is kept exactly as read from the packet; tcp_lro_flush() writes it back unchanged. */
527 le->tsecr = *(ts_ptr + 2);
/* Record the state of the newest segment in the entry. */
530 le->next_seq += tcp_data_len;
531 le->ack_seq = th->th_ack;
532 le->window = th->th_win;
535 #ifdef TCP_LRO_UPDATE_CSUM
536 le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
537 tcp_data_len, ~csum);
540 if (tcp_data_len == 0) {
545 le->p_len += tcp_data_len;
548 * Adjust the mbuf so that m_data points to the first byte of
549 * the ULP payload. Adjust the mbuf to avoid complications and
550 * append new segment to existing mbuf chain.
/* Strip everything in front of the payload, then link m behind the current tail. */
552 m_adj(m, m->m_pkthdr.len - tcp_data_len);
553 m->m_flags &= ~M_PKTHDR;
555 le->m_tail->m_next = m;
556 le->m_tail = m_last(m);
559 * If a possible next full length packet would cause an
560 * overflow, pro-actively flush now.
562 if (le->p_len > (65535 - lc->ifp->if_mtu)) {
563 SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
564 tcp_lro_flush(lc, le);
/* Refresh the time stamp that tcp_lro_flush_inactive() compares against. */
566 getmicrotime(&le->mtime);
571 /* Try to find an empty slot. */
572 if (SLIST_EMPTY(&lc->lro_free))
573 return (TCP_LRO_CANNOT);
575 /* Start a new segment chain. */
576 le = SLIST_FIRST(&lc->lro_free);
577 SLIST_REMOVE_HEAD(&lc->lro_free, next);
578 SLIST_INSERT_HEAD(&lc->lro_active, le, next);
579 getmicrotime(&le->mtime);
581 /* Start filling in details. */
586 le->source_ip6 = ip6->ip6_src;
587 le->dest_ip6 = ip6->ip6_dst;
588 le->eh_type = eh_type;
/* For IPv6 the initial p_len excludes the Ethernet and IPv6 headers. */
589 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
595 le->source_ip4 = ip4->ip_src.s_addr;
596 le->dest_ip4 = ip4->ip_dst.s_addr;
597 le->eh_type = eh_type;
/* For IPv4 the initial p_len excludes only the Ethernet header. */
598 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
602 le->source_port = th->th_sport;
603 le->dest_port = th->th_dport;
605 le->next_seq = seq + tcp_data_len;
606 le->ack_seq = th->th_ack;
607 le->window = th->th_win;
610 le->tsval = ntohl(*(ts_ptr + 1));
611 le->tsecr = *(ts_ptr + 2);
614 #ifdef TCP_LRO_UPDATE_CSUM
616 * Do not touch the csum of the first packet. However save the
617 * "adjusted" checksum of just the source and destination addresses,
618 * the next header and the TCP payload. The length and TCP header
619 * parts may change, so we remove those from the saved checksum and
620 * re-add with final values on tcp_lro_flush() if needed.
/* A recycled entry was zeroed by tcp_lro_flush(), so ulp_csum is expected to be 0 here. */
622 KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
623 __func__, le, le->ulp_csum));
625 le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
/* tcp_lro_csum_th(), called by the fixup, zeroed th_sum; put the received value back. */
627 th->th_sum = csum; /* Restore checksum on first packet. */
631 le->m_tail = m_last(m);