2 * Copyright (c) 2007, Myricom Inc.
3 * Copyright (c) 2008, Intel Corporation.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/endian.h>
34 #include <sys/kernel.h>
35 #include <sys/socket.h>
38 #include <net/ethernet.h>
39 #include <net/if_media.h>
41 #include <netinet/in_systm.h>
42 #include <netinet/in.h>
43 #include <netinet/ip.h>
44 #include <netinet/tcp.h>
45 #include <netinet/tcp_lro.h>
47 #include <machine/bus.h>
48 #include <machine/in_cksum.h>
/*
 * do_csum_data: produce a 16-bit checksum value for the buffer at `raw`.
 *
 * raw - buffer to checksum, typed as a sequence of 16-bit words
 * len - length argument; NOTE(review): presumably a byte count, since the
 *       callers in this file pass sizeof(struct ip) and TCP header
 *       lengths - confirm against the accumulation loop.
 *
 * Returns `csum` folded down to 16 bits.  The value is not complemented
 * here; callers XOR it with 0xffff where they need the complement.
 */
51 static uint16_t do_csum_data(uint16_t *raw, int len)
/*
 * Fold everything above bit 15 back into the low 16 bits.  The fold is
 * applied twice because the first addition can itself carry out of
 * bit 15.
 */
62 csum = (csum >> 16) + (csum & 0xffff);
63 csum = (csum >> 16) + (csum & 0xffff);
64 return (uint16_t)csum;
68 * Allocate and init the LRO data structures
/*
 * tcp_lro_init: prepare an LRO control block for use.
 *
 * cntl - control block to initialize
 *
 * Empties the free and active entry lists, clears the lro_bad_csum and
 * lro_flushed counters, and then loops LRO_ENTRIES times allocating an
 * lro_entry and pushing it onto cntl->lro_free.
 */
71 tcp_lro_init(struct lro_ctrl *cntl)
73 struct lro_entry *lro;
/* both lists start out empty */
76 SLIST_INIT(&cntl->lro_free);
77 SLIST_INIT(&cntl->lro_active);
/* reset the statistics counters */
79 cntl->lro_bad_csum = 0;
81 cntl->lro_flushed = 0;
/*
 * Pre-allocate the pool of entries.  M_NOWAIT makes the allocation
 * non-sleeping, so it may return NULL; M_ZERO requests zero-filled memory.
 */
83 for (i = 0; i < LRO_ENTRIES; i++) {
84 lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
85 M_DEVBUF, M_NOWAIT | M_ZERO);
/*
 * NOTE(review): the handling of a NULL result is not visible here -
 * confirm it is checked before the entry is linked in below.
 */
92 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
/*
 * tcp_lro_free: release the entries sitting on the free list of `cntl`.
 *
 * cntl - control block whose free list is drained
 *
 * Only cntl->lro_free is walked in the code shown; entries that are on
 * cntl->lro_active are not touched here.
 */
99 tcp_lro_free(struct lro_ctrl *cntl)
101 struct lro_entry *entry;
/* pop entries off the head of the free list until none remain */
103 while (!SLIST_EMPTY(&cntl->lro_free)) {
104 entry = SLIST_FIRST(&cntl->lro_free);
105 SLIST_REMOVE_HEAD(&cntl->lro_free, next);
/* M_DEVBUF matches the malloc type used when the entries were allocated */
106 free(entry, M_DEVBUF);
/*
 * tcp_lro_flush: finish one LRO entry and pass its packet up the stack.
 *
 * cntl - LRO control block the entry belongs to
 * lro  - entry to flush
 *
 * The header fix-ups below are guarded by lro->append_cnt, i.e. they are
 * entered only when that count is non-zero.  The mbuf chain is then given
 * to the interface input routine, cntl->lro_queued is advanced by
 * lro->append_cnt + 1, and the entry is put back on cntl->lro_free.
 *
 * NOTE(review): the end of the append_cnt block is not visible here -
 * confirm which of the statements below it actually encloses.
 */
111 tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
117 uint32_t tcplen, tcp_csum;
120 if (lro->append_cnt) {
121 /* incorporate the new len into the ip header and
122 * re-calculate the checksum */
/* lro->len includes the Ethernet header, so it is removed for ip_len */
124 ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
/* ip_sum is stored as the complement (0xffff ^ sum) of the header sum */
126 ip->ip_sum = 0xffff ^
127 do_csum_data((uint16_t*)ip,
/*
 * Mark the head mbuf with the IP-checked/valid and data-valid checksum
 * flags and a csum_data of 0xffff; presumably this lets the stack skip
 * re-verifying the merged packet - see mbuf(9) for the flag semantics.
 */
130 lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
131 CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
132 lro->m_head->m_pkthdr.csum_data = 0xffff;
/* the packet header length now covers the whole merged chain */
133 lro->m_head->m_pkthdr.len = lro->len;
135 /* incorporate the latest ack into the tcp header */
/* the TCP header is taken to start directly after the fixed IP header */
136 tcp = (struct tcphdr *) (ip + 1);
137 tcp->th_ack = lro->ack_seq;
138 tcp->th_win = lro->window;
139 /* incorporate latest timestamp into the tcp header */
140 if (lro->timestamp) {
/*
 * ts_ptr points at the 32-bit words following the TCP header; word 1 is
 * written from lro->tsval converted with htonl, word 2 from lro->tsecr
 * without conversion.
 */
141 ts_ptr = (uint32_t *)(tcp + 1);
142 ts_ptr[1] = htonl(lro->tsval);
143 ts_ptr[2] = lro->tsecr;
146 * update checksum in tcp header by re-calculating the
147 * tcp pseudoheader checksum, and adding it to the checksum
148 * of the tcp payload data
/* tcplen = merged length minus the Ethernet and IP headers */
151 tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
/* start from the stored payload checksum, add pseudo-header and TCP header */
152 tcp_csum = lro->data_csum;
153 tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
154 htons(tcplen + IPPROTO_TCP));
155 tcp_csum += do_csum_data((uint16_t*)tcp,
/* fold the carries twice, then store the complement as th_sum */
157 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
158 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
159 tcp->th_sum = 0xffff ^ tcp_csum;
/* hand the mbuf chain to the interface's if_input handler */
162 (*ifp->if_input)(cntl->ifp, lro->m_head);
/* account for the head frame plus every appended one */
163 cntl->lro_queued += lro->append_cnt + 1;
/* recycle the entry */
168 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
/*
 * tcp_lro_rx: try to coalesce a received frame into an LRO entry.
 *
 * cntl   - LRO control block holding the active and free entry lists
 * m_head - received mbuf chain; parsed below as Ethernet + IPv4 + TCP
 * csum   - checksum supplied by the caller (the comments below call it
 *          the hardware checksum); the TCP header checksum is subtracted
 *          from it to leave a checksum of the payload only
 *
 * A frame is a candidate only if it is IPv4/TCP without IP options, is
 * not fragmented, passes the IP header checksum test, has no TCP flags
 * besides ACK/PSH, and carries either no TCP options or exactly the
 * aligned timestamp option.  If an active entry with the same ports and
 * addresses exists, the segment is appended to it (or the entry is
 * flushed when the segment is out of order); otherwise a new entry is
 * started from the free list.
 *
 * NOTE(review): the statements that follow each rejection test, and the
 * return statements, are not visible here - confirm the return-value
 * convention against the callers.
 */
172 tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
174 struct ether_header *eh;
178 struct mbuf *m_nxt, *m_tail;
179 struct lro_entry *lro;
180 int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
181 int opt_bytes, trim, csum_flags;
182 uint32_t seq, tmp_csum, device_mtu;
/* the Ethernet header sits at the start of the mbuf data; IPv4 only */
185 eh = mtod(m_head, struct ether_header *);
186 if (eh->ether_type != htons(ETHERTYPE_IP))
/* the IP header follows the Ethernet header; TCP only */
188 ip = (struct ip *) (eh + 1);
189 if (ip->ip_p != IPPROTO_TCP)
192 /* ensure there are no options */
193 if ((ip->ip_hl << 2) != sizeof (*ip))
196 /* .. and the packet is not fragmented */
197 if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
200 /* verify that the IP header checksum is correct */
201 csum_flags = m_head->m_pkthdr.csum_flags;
/* if the flags say the IP checksum was already checked, rely on CSUM_IP_VALID */
202 if (csum_flags & CSUM_IP_CHECKED) {
203 if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
204 cntl->lro_bad_csum++;
/*
 * Software check: summing a correct header, ip_sum field included, gives
 * 0xffff.  NOTE(review): presumably the else branch of the test above -
 * the line in between is not visible here.
 */
208 tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
209 if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
210 cntl->lro_bad_csum++;
215 /* find the TCP header */
216 tcp = (struct tcphdr *) (ip + 1);
218 /* Get the TCP checksum if we don't have it */
222 /* ensure no bits set besides ack or psh */
223 if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
226 /* check for timestamps. Since the only options we handle are
227 timestamps, we only have to handle the simple case of
228 aligned timestamps */
/* th_off is shifted by 2 to get bytes; opt_bytes is what exceeds the fixed header */
230 opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
231 tcp_hdr_len = sizeof (*tcp) + opt_bytes;
232 ts_ptr = (uint32_t *)(tcp + 1);
/* accept only TCPOLEN_TSTAMP_APPA option bytes laid out as NOP, NOP, TIMESTAMP, length */
233 if (opt_bytes != 0) {
234 if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
235 (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
236 TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
/* payload bytes = IP total length minus the TCP and IP header lengths */
240 ip_len = ntohs(ip->ip_len);
241 tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
245 * If frame is padded beyond the end of the IP packet,
246 * then we must trim the extra bytes off the end.
/* trim = bytes in the mbuf chain beyond Ethernet header + IP packet */
248 tot_len = m_head->m_pkthdr.len;
249 trim = tot_len - (ip_len + ETHER_HDR_LEN);
252 /* truncated packet */
/* a negative count makes m_adj trim from the tail; tot_len is re-read afterwards */
255 m_adj(m_head, -trim);
256 tot_len = m_head->m_pkthdr.len;
/*
 * Walk the chain to find its last mbuf.  NOTE(review): part of this loop
 * is not visible here.
 */
260 m_tail = NULL; /* -Wuninitialized */
261 while (m_nxt != NULL) {
263 m_nxt = m_tail->m_next;
/* hlen = Ethernet + IP + TCP header bytes, i.e. everything before the payload */
266 hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
267 seq = ntohl(tcp->th_seq);
/* look for an active entry with the same port and address pairs */
269 SLIST_FOREACH(lro, &cntl->lro_active, next) {
270 if (lro->source_port == tcp->th_sport &&
271 lro->dest_port == tcp->th_dport &&
272 lro->source_ip == ip->ip_src.s_addr &&
273 lro->dest_ip == ip->ip_dst.s_addr) {
274 /* Try to append it */
276 if (__predict_false(seq != lro->next_seq)) {
277 /* out of order packet */
/* take the entry off the active list and flush what it has gathered */
278 SLIST_REMOVE(&cntl->lro_active, lro,
280 tcp_lro_flush(cntl, lro);
/* tsval is compared in host order; a zero echo-reply word is also rejected */
285 uint32_t tsval = ntohl(*(ts_ptr + 1));
286 /* make sure timestamp values are increasing */
287 if (__predict_false(lro->tsval > tsval ||
288 *(ts_ptr + 2) == 0)) {
/* tsecr is stored exactly as read from the packet, without byte-order conversion */
292 lro->tsecr = *(ts_ptr + 2);
/* advance the expected sequence number and remember the newest ack and window */
295 lro->next_seq += tcp_data_len;
296 lro->ack_seq = tcp->th_ack;
297 lro->window = tcp->th_win;
/* NOTE(review): the handling of a segment with no payload is not visible here */
299 if (tcp_data_len == 0) {
303 /* subtract off the checksum of the tcp header
304 * from the hardware checksum, and add it to the
305 * stored tcp data checksum. Byteswap the checksum
306 * if the total length so far is odd
308 tmp_csum = do_csum_data((uint16_t*)tcp,
310 csum = csum + (tmp_csum ^ 0xffff);
311 csum = (csum & 0xffff) + (csum >> 16);
312 csum = (csum & 0xffff) + (csum >> 16);
313 if (lro->len & 0x1) {
314 /* Odd number of bytes so far, flip bytes */
315 csum = ((csum << 8) | (csum >> 8)) & 0xffff;
/* accumulate into the stored payload checksum and fold the carries */
317 csum = csum + lro->data_csum;
318 csum = (csum & 0xffff) + (csum >> 16);
319 csum = (csum & 0xffff) + (csum >> 16);
320 lro->data_csum = csum;
/* the merged length grows by this segment's payload only */
322 lro->len += tcp_data_len;
324 /* adjust mbuf so that m->m_data points to
325 the first byte of the payload */
327 /* append mbuf chain */
328 lro->m_tail->m_next = m_head;
329 /* advance the last pointer */
330 lro->m_tail = m_tail;
331 /* flush packet if required */
/* flush once lro->len exceeds 65535 minus the interface MTU */
332 device_mtu = cntl->ifp->if_mtu;
333 if (lro->len > (65535 - device_mtu)) {
334 SLIST_REMOVE(&cntl->lro_active, lro,
336 tcp_lro_flush(cntl, lro);
/* no matching entry: a new one is needed, which requires a free entry */
342 if (SLIST_EMPTY(&cntl->lro_free))
345 /* start a new chain */
346 lro = SLIST_FIRST(&cntl->lro_free);
347 SLIST_REMOVE_HEAD(&cntl->lro_free, next);
348 SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
/* record the connection identity; ports and addresses are kept as found in the headers */
349 lro->source_port = tcp->th_sport;
350 lro->dest_port = tcp->th_dport;
351 lro->source_ip = ip->ip_src.s_addr;
352 lro->dest_ip = ip->ip_dst.s_addr;
353 lro->next_seq = seq + tcp_data_len;
/* the first segment's payload size is recorded as mss */
354 lro->mss = tcp_data_len;
355 lro->ack_seq = tcp->th_ack;
356 lro->window = tcp->th_win;
358 /* save the checksum of just the TCP payload by
359 * subtracting off the checksum of the TCP header from
360 * the entire hardware checksum
361 * Since IP header checksum is correct, checksum over
362 * the IP header is -0. Subtracting -0 is unnecessary.
364 tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
365 csum = csum + (tmp_csum ^ 0xffff);
366 csum = (csum & 0xffff) + (csum >> 16);
367 csum = (csum & 0xffff) + (csum >> 16);
368 lro->data_csum = csum;
371 /* record timestamp if it is present */
/* tsval is kept in host order, tsecr as read from the packet */
374 lro->tsval = ntohl(*(ts_ptr + 1));
375 lro->tsecr = *(ts_ptr + 2);
/* remember both ends of the chain so later segments can be linked onto m_tail */
378 lro->m_head = m_head;
379 lro->m_tail = m_tail;