]> CyberLeo.Net >> Repos - FreeBSD/releng/8.1.git/blob - sys/netinet/tcp_lro.c
Copy stable/8 to releng/8.1 in preparation for 8.1-RC1.
[FreeBSD/releng/8.1.git] / sys / netinet / tcp_lro.c
1 /******************************************************************************
2
3 Copyright (c) 2007, Myricom Inc.
4 Copyright (c) 2008, Intel Corporation.
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16
17  3. Neither the name of the Intel Corporation, nor the names of its
18     contributors may be used to endorse or promote products derived from
19     this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 POSSIBILITY OF SUCH DAMAGE.
32
33 $FreeBSD$ 
34 ***************************************************************************/
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/endian.h>
39 #include <sys/mbuf.h>
40 #include <sys/kernel.h>
41 #include <sys/socket.h>
42
43 #include <net/if.h>
44 #include <net/ethernet.h>
45 #include <net/if_media.h>
46
47 #include <netinet/in_systm.h>
48 #include <netinet/in.h>
49 #include <netinet/ip.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_lro.h>
52
53 #include <machine/bus.h>
54 #include <machine/in_cksum.h>
55
56
57 static uint16_t do_csum_data(uint16_t *raw, int len)
58 {
59         uint32_t csum;
60         csum = 0;
61         while (len > 0) {
62                 csum += *raw;
63                 raw++;
64                 csum += *raw;
65                 raw++;
66                 len -= 4;
67         }
68         csum = (csum >> 16) + (csum & 0xffff);
69         csum = (csum >> 16) + (csum & 0xffff);
70         return (uint16_t)csum;
71 }
72
73 /*
74  * Allocate and init the LRO data structures
75  */
76 int
77 tcp_lro_init(struct lro_ctrl *cntl)
78 {
79         struct lro_entry *lro;
80         int i, error = 0;
81
82         SLIST_INIT(&cntl->lro_free);
83         SLIST_INIT(&cntl->lro_active);
84
85         cntl->lro_bad_csum = 0;
86         cntl->lro_queued = 0;
87         cntl->lro_flushed = 0;
88
89         for (i = 0; i < LRO_ENTRIES; i++) {
90                 lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
91                     M_DEVBUF, M_NOWAIT | M_ZERO);
92                 if (lro == NULL) {
93                         if (i == 0)
94                                 error = ENOMEM;
95                         break;
96                 }
97                 cntl->lro_cnt = i;
98                 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
99         }
100
101         return (error);
102 }
103
104 void
105 tcp_lro_free(struct lro_ctrl *cntl)
106 {
107         struct lro_entry *entry;
108
109         while (!SLIST_EMPTY(&cntl->lro_free)) {
110                 entry = SLIST_FIRST(&cntl->lro_free);
111                 SLIST_REMOVE_HEAD(&cntl->lro_free, next);
112                 free(entry, M_DEVBUF);
113         }
114 }
115
116 void
117 tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
118 {
119         struct ifnet *ifp;
120         struct ip *ip;
121         struct tcphdr *tcp;
122         uint32_t *ts_ptr;
123         uint32_t tcplen, tcp_csum;
124
125
126         if (lro->append_cnt) {
127                 /* incorporate the new len into the ip header and
128                  * re-calculate the checksum */
129                 ip = lro->ip;
130                 ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
131                 ip->ip_sum = 0;
132                 ip->ip_sum = 0xffff ^ 
133                         do_csum_data((uint16_t*)ip,
134                                               sizeof (*ip));
135
136                 lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
137                         CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
138                 lro->m_head->m_pkthdr.csum_data = 0xffff;
139                 lro->m_head->m_pkthdr.len = lro->len;
140
141                 /* incorporate the latest ack into the tcp header */
142                 tcp = (struct tcphdr *) (ip + 1);
143                 tcp->th_ack = lro->ack_seq;
144                 tcp->th_win = lro->window;
145                 /* incorporate latest timestamp into the tcp header */
146                 if (lro->timestamp) {
147                         ts_ptr = (uint32_t *)(tcp + 1);
148                         ts_ptr[1] = htonl(lro->tsval);
149                         ts_ptr[2] = lro->tsecr;
150                 }
151                 /* 
152                  * update checksum in tcp header by re-calculating the
153                  * tcp pseudoheader checksum, and adding it to the checksum
154                  * of the tcp payload data 
155                  */
156                 tcp->th_sum = 0;
157                 tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
158                 tcp_csum = lro->data_csum;
159                 tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
160                                       htons(tcplen + IPPROTO_TCP));
161                 tcp_csum += do_csum_data((uint16_t*)tcp,
162                                                   tcp->th_off << 2);
163                 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
164                 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
165                 tcp->th_sum = 0xffff ^ tcp_csum;
166         }
167         ifp = cntl->ifp;
168         (*ifp->if_input)(cntl->ifp, lro->m_head);
169         cntl->lro_queued += lro->append_cnt + 1;
170         cntl->lro_flushed++;
171         lro->m_head = NULL;
172         lro->timestamp = 0;
173         lro->append_cnt = 0;
174         SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
175 }
176
177 int
178 tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
179 {
180         struct ether_header *eh;
181         struct ip *ip;
182         struct tcphdr *tcp;
183         uint32_t *ts_ptr;
184         struct mbuf *m_nxt, *m_tail;
185         struct lro_entry *lro;
186         int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
187         int opt_bytes, trim, csum_flags;
188         uint32_t seq, tmp_csum, device_mtu;
189
190
191         eh = mtod(m_head, struct ether_header *);
192         if (eh->ether_type != htons(ETHERTYPE_IP))
193                 return 1;
194         ip = (struct ip *) (eh + 1);
195         if (ip->ip_p != IPPROTO_TCP)
196                 return 1;
197         
198         /* ensure there are no options */
199         if ((ip->ip_hl << 2) != sizeof (*ip))
200                 return -1;
201
202         /* .. and the packet is not fragmented */
203         if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
204                 return -1;
205
206         /* verify that the IP header checksum is correct */
207         csum_flags = m_head->m_pkthdr.csum_flags;
208         if (csum_flags & CSUM_IP_CHECKED) {
209                 if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
210                         cntl->lro_bad_csum++;
211                         return -1;
212                 }
213         } else {
214                 tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
215                 if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
216                         cntl->lro_bad_csum++;
217                         return -1;
218                 }
219         }
220         
221         /* find the TCP header */
222         tcp = (struct tcphdr *) (ip + 1);
223
224         /* Get the TCP checksum if we dont have it */
225         if (!csum)
226                 csum = tcp->th_sum;
227
228         /* ensure no bits set besides ack or psh */
229         if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
230                 return -1;
231
232         /* check for timestamps. Since the only option we handle are
233            timestamps, we only have to handle the simple case of
234            aligned timestamps */
235
236         opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
237         tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
238         ts_ptr = (uint32_t *)(tcp + 1);
239         if (opt_bytes != 0) {
240                 if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
241                     (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
242                     TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
243                         return -1;
244         }
245
246         ip_len = ntohs(ip->ip_len);
247         tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
248         
249
250         /* 
251          * If frame is padded beyond the end of the IP packet,
252          * then we must trim the extra bytes off the end.
253          */
254         tot_len = m_head->m_pkthdr.len;
255         trim = tot_len - (ip_len + ETHER_HDR_LEN);
256         if (trim != 0) {
257                 if (trim < 0) {
258                         /* truncated packet */
259                         return -1;
260                 }
261                 m_adj(m_head, -trim);
262                 tot_len = m_head->m_pkthdr.len;
263         }
264
265         m_nxt = m_head;
266         m_tail = NULL; /* -Wuninitialized */
267         while (m_nxt != NULL) {
268                 m_tail = m_nxt;
269                 m_nxt = m_tail->m_next;
270         }
271
272         hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
273         seq = ntohl(tcp->th_seq);
274
275         SLIST_FOREACH(lro, &cntl->lro_active, next) {
276                 if (lro->source_port == tcp->th_sport && 
277                     lro->dest_port == tcp->th_dport &&
278                     lro->source_ip == ip->ip_src.s_addr && 
279                     lro->dest_ip == ip->ip_dst.s_addr) {
280                         /* Try to append it */
281
282                         if (__predict_false(seq != lro->next_seq)) {
283                                 /* out of order packet */
284                                 SLIST_REMOVE(&cntl->lro_active, lro,
285                                              lro_entry, next);
286                                 tcp_lro_flush(cntl, lro);
287                                 return -1;
288                         }
289
290                         if (opt_bytes) {
291                                 uint32_t tsval = ntohl(*(ts_ptr + 1));
292                                 /* make sure timestamp values are increasing */
293                                 if (__predict_false(lro->tsval > tsval || 
294                                              *(ts_ptr + 2) == 0)) {
295                                         return -1;
296                                 }
297                                 lro->tsval = tsval;
298                                 lro->tsecr = *(ts_ptr + 2);
299                         }
300
301                         lro->next_seq += tcp_data_len;
302                         lro->ack_seq = tcp->th_ack;
303                         lro->window = tcp->th_win;
304                         lro->append_cnt++;
305                         if (tcp_data_len == 0) {
306                                 m_freem(m_head);
307                                 return 0;
308                         }
309                         /* subtract off the checksum of the tcp header
310                          * from the hardware checksum, and add it to the
311                          * stored tcp data checksum.  Byteswap the checksum
312                          * if the total length so far is odd 
313                          */
314                         tmp_csum = do_csum_data((uint16_t*)tcp,
315                                                          tcp_hdr_len);
316                         csum = csum + (tmp_csum ^ 0xffff);
317                         csum = (csum & 0xffff) + (csum >> 16);
318                         csum = (csum & 0xffff) + (csum >> 16);
319                         if (lro->len & 0x1) {
320                                 /* Odd number of bytes so far, flip bytes */
321                                 csum = ((csum << 8) | (csum >> 8)) & 0xffff;
322                         }
323                         csum = csum + lro->data_csum;
324                         csum = (csum & 0xffff) + (csum >> 16);
325                         csum = (csum & 0xffff) + (csum >> 16);
326                         lro->data_csum = csum;
327
328                         lro->len += tcp_data_len;
329
330                         /* adjust mbuf so that m->m_data points to
331                            the first byte of the payload */
332                         m_adj(m_head, hlen);
333                         /* append mbuf chain */
334                         lro->m_tail->m_next = m_head;
335                         /* advance the last pointer */
336                         lro->m_tail = m_tail;
337                         /* flush packet if required */
338                         device_mtu = cntl->ifp->if_mtu;
339                         if (lro->len > (65535 - device_mtu)) {
340                                 SLIST_REMOVE(&cntl->lro_active, lro,
341                                              lro_entry, next);
342                                 tcp_lro_flush(cntl, lro);
343                         }
344                         return 0;
345                 }
346         }
347
348         if (SLIST_EMPTY(&cntl->lro_free))
349             return -1;
350
351         /* start a new chain */
352         lro = SLIST_FIRST(&cntl->lro_free);
353         SLIST_REMOVE_HEAD(&cntl->lro_free, next);
354         SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
355         lro->source_port = tcp->th_sport;
356         lro->dest_port = tcp->th_dport;
357         lro->source_ip = ip->ip_src.s_addr;
358         lro->dest_ip = ip->ip_dst.s_addr;
359         lro->next_seq = seq + tcp_data_len;
360         lro->mss = tcp_data_len;
361         lro->ack_seq = tcp->th_ack;
362         lro->window = tcp->th_win;
363
364         /* save the checksum of just the TCP payload by
365          * subtracting off the checksum of the TCP header from
366          * the entire hardware checksum 
367          * Since IP header checksum is correct, checksum over
368          * the IP header is -0.  Substracting -0 is unnecessary.
369          */
370         tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
371         csum = csum + (tmp_csum ^ 0xffff);
372         csum = (csum & 0xffff) + (csum >> 16);
373         csum = (csum & 0xffff) + (csum >> 16);
374         lro->data_csum = csum;
375         
376         lro->ip = ip;
377         /* record timestamp if it is present */
378         if (opt_bytes) {
379                 lro->timestamp = 1;
380                 lro->tsval = ntohl(*(ts_ptr + 1));
381                 lro->tsecr = *(ts_ptr + 2);
382         }
383         lro->len = tot_len;
384         lro->m_head = m_head;
385         lro->m_tail = m_tail;
386         return 0;
387 }