CyberLeo.Net >> Repos - FreeBSD/releng/9.0.git/blob - sys/netinet/tcp_lro.c
Copy stable/9 to releng/9.0 as part of the FreeBSD 9.0-RELEASE release
[FreeBSD/releng/9.0.git] / sys / netinet / tcp_lro.c
1 /******************************************************************************
2
3 Copyright (c) 2007, Myricom Inc.
4 Copyright (c) 2008, Intel Corporation.
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16
17  3. Neither the name of the Intel Corporation, nor the names of its
18     contributors may be used to endorse or promote products derived from
19     this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 POSSIBILITY OF SUCH DAMAGE.
32
33 $FreeBSD$ 
34 ***************************************************************************/
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/endian.h>
39 #include <sys/mbuf.h>
40 #include <sys/kernel.h>
41 #include <sys/socket.h>
42
43 #include <net/if.h>
44 #include <net/ethernet.h>
45 #include <net/if_media.h>
46
47 #include <netinet/in_systm.h>
48 #include <netinet/in.h>
49 #include <netinet/ip.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_lro.h>
52
53 #include <machine/bus.h>
54 #include <machine/in_cksum.h>
55
56
/*
 * Add up the 16-bit words of a buffer and fold the total into 16 bits.
 *
 * raw  start of the buffer, read as host-order 16-bit words
 * len  byte count; two words (4 bytes) are consumed per iteration, so a
 *      length that is not a multiple of 4 is effectively rounded up
 *
 * Returns the folded (not complemented) one's-complement style sum.
 */
static uint16_t
do_csum_data(uint16_t *raw, int len)
{
	uint32_t sum = 0;
	int left;

	for (left = len; left > 0; left -= 4) {
		sum += raw[0];
		sum += raw[1];
		raw += 2;
	}

	/* Fold the carries back in; the second pass absorbs a carry
	 * produced by the first. */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);

	return ((uint16_t)sum);
}
72
73 /*
74  * Allocate and init the LRO data structures
75  */
76 int
77 tcp_lro_init(struct lro_ctrl *cntl)
78 {
79         struct lro_entry *lro;
80         int i, error = 0;
81
82         SLIST_INIT(&cntl->lro_free);
83         SLIST_INIT(&cntl->lro_active);
84
85         cntl->lro_bad_csum = 0;
86         cntl->lro_queued = 0;
87         cntl->lro_flushed = 0;
88
89         for (i = 0; i < LRO_ENTRIES; i++) {
90                 lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
91                     M_DEVBUF, M_NOWAIT | M_ZERO);
92                 if (lro == NULL) {
93                         if (i == 0)
94                                 error = ENOMEM;
95                         break;
96                 }
97                 cntl->lro_cnt = i;
98                 SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
99         }
100
101         return (error);
102 }
103
104 void
105 tcp_lro_free(struct lro_ctrl *cntl)
106 {
107         struct lro_entry *entry;
108
109         while (!SLIST_EMPTY(&cntl->lro_free)) {
110                 entry = SLIST_FIRST(&cntl->lro_free);
111                 SLIST_REMOVE_HEAD(&cntl->lro_free, next);
112                 free(entry, M_DEVBUF);
113         }
114 }
115
116 void
117 tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
118 {
119         struct ifnet *ifp;
120         struct ip *ip;
121         struct tcphdr *tcp;
122         uint32_t *ts_ptr;
123         uint32_t tcplen, tcp_csum;
124
125
126         if (lro->append_cnt) {
127                 /* incorporate the new len into the ip header and
128                  * re-calculate the checksum */
129                 ip = lro->ip;
130                 ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
131                 ip->ip_sum = 0;
132                 ip->ip_sum = 0xffff ^ 
133                         do_csum_data((uint16_t*)ip,
134                                               sizeof (*ip));
135
136                 lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
137                         CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
138                 lro->m_head->m_pkthdr.csum_data = 0xffff;
139                 lro->m_head->m_pkthdr.len = lro->len;
140
141                 /* incorporate the latest ack into the tcp header */
142                 tcp = (struct tcphdr *) (ip + 1);
143                 tcp->th_ack = lro->ack_seq;
144                 tcp->th_win = lro->window;
145                 /* incorporate latest timestamp into the tcp header */
146                 if (lro->timestamp) {
147                         ts_ptr = (uint32_t *)(tcp + 1);
148                         ts_ptr[1] = htonl(lro->tsval);
149                         ts_ptr[2] = lro->tsecr;
150                 }
151                 /* 
152                  * update checksum in tcp header by re-calculating the
153                  * tcp pseudoheader checksum, and adding it to the checksum
154                  * of the tcp payload data 
155                  */
156                 tcp->th_sum = 0;
157                 tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
158                 tcp_csum = lro->data_csum;
159                 tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
160                                       htons(tcplen + IPPROTO_TCP));
161                 tcp_csum += do_csum_data((uint16_t*)tcp,
162                                                   tcp->th_off << 2);
163                 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
164                 tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
165                 tcp->th_sum = 0xffff ^ tcp_csum;
166         }
167         ifp = cntl->ifp;
168         (*ifp->if_input)(cntl->ifp, lro->m_head);
169         cntl->lro_queued += lro->append_cnt + 1;
170         cntl->lro_flushed++;
171         lro->m_head = NULL;
172         lro->timestamp = 0;
173         lro->append_cnt = 0;
174         SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
175 }
176
177 int
178 tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
179 {
180         struct ether_header *eh;
181         struct ip *ip;
182         struct tcphdr *tcp;
183         uint32_t *ts_ptr;
184         struct mbuf *m_nxt, *m_tail;
185         struct lro_entry *lro;
186         int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
187         int opt_bytes, trim, csum_flags;
188         uint32_t seq, tmp_csum, device_mtu;
189
190
191         eh = mtod(m_head, struct ether_header *);
192         if (eh->ether_type != htons(ETHERTYPE_IP))
193                 return 1;
194         ip = (struct ip *) (eh + 1);
195         if (ip->ip_p != IPPROTO_TCP)
196                 return 1;
197         
198         /* ensure there are no options */
199         if ((ip->ip_hl << 2) != sizeof (*ip))
200                 return -1;
201
202         /* .. and the packet is not fragmented */
203         if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
204                 return -1;
205
206         /* verify that the IP header checksum is correct */
207         csum_flags = m_head->m_pkthdr.csum_flags;
208         if (csum_flags & CSUM_IP_CHECKED) {
209                 if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
210                         cntl->lro_bad_csum++;
211                         return -1;
212                 }
213         } else {
214                 tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
215                 if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
216                         cntl->lro_bad_csum++;
217                         return -1;
218                 }
219         }
220         
221         /* find the TCP header */
222         tcp = (struct tcphdr *) (ip + 1);
223
224         /* Get the TCP checksum if we dont have it */
225         if (!csum)
226                 csum = tcp->th_sum;
227
228         /* ensure no bits set besides ack or psh */
229         if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
230                 return -1;
231
232         /* check for timestamps. Since the only option we handle are
233            timestamps, we only have to handle the simple case of
234            aligned timestamps */
235
236         opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
237         tcp_hdr_len =  sizeof (*tcp) + opt_bytes;
238         ts_ptr = (uint32_t *)(tcp + 1);
239         if (opt_bytes != 0) {
240                 if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
241                     (*ts_ptr !=  ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
242                     TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
243                         return -1;
244         }
245
246         ip_len = ntohs(ip->ip_len);
247         tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
248         
249
250         /* 
251          * If frame is padded beyond the end of the IP packet,
252          * then we must trim the extra bytes off the end.
253          */
254         tot_len = m_head->m_pkthdr.len;
255         trim = tot_len - (ip_len + ETHER_HDR_LEN);
256         if (trim != 0) {
257                 if (trim < 0) {
258                         /* truncated packet */
259                         return -1;
260                 }
261                 m_adj(m_head, -trim);
262                 tot_len = m_head->m_pkthdr.len;
263         }
264
265         m_nxt = m_head;
266         m_tail = NULL; /* -Wuninitialized */
267         while (m_nxt != NULL) {
268                 m_tail = m_nxt;
269                 m_nxt = m_tail->m_next;
270         }
271
272         hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
273         seq = ntohl(tcp->th_seq);
274
275         SLIST_FOREACH(lro, &cntl->lro_active, next) {
276                 if (lro->source_port == tcp->th_sport && 
277                     lro->dest_port == tcp->th_dport &&
278                     lro->source_ip == ip->ip_src.s_addr && 
279                     lro->dest_ip == ip->ip_dst.s_addr) {
280                         /* Flush now if appending will result in overflow. */
281                         if (lro->len > (65535 - tcp_data_len)) {
282                                 SLIST_REMOVE(&cntl->lro_active, lro,
283                                              lro_entry, next);
284                                 tcp_lro_flush(cntl, lro);
285                                 break;
286                         }
287
288                         /* Try to append it */
289
290                         if (__predict_false(seq != lro->next_seq ||
291                                     (tcp_data_len == 0 &&
292                                     lro->ack_seq == tcp->th_ack))) {
293                                 /* out of order packet or dup ack */
294                                 SLIST_REMOVE(&cntl->lro_active, lro,
295                                              lro_entry, next);
296                                 tcp_lro_flush(cntl, lro);
297                                 return -1;
298                         }
299
300                         if (opt_bytes) {
301                                 uint32_t tsval = ntohl(*(ts_ptr + 1));
302                                 /* make sure timestamp values are increasing */
303                                 if (__predict_false(lro->tsval > tsval || 
304                                              *(ts_ptr + 2) == 0)) {
305                                         return -1;
306                                 }
307                                 lro->tsval = tsval;
308                                 lro->tsecr = *(ts_ptr + 2);
309                         }
310
311                         lro->next_seq += tcp_data_len;
312                         lro->ack_seq = tcp->th_ack;
313                         lro->window = tcp->th_win;
314                         lro->append_cnt++;
315                         if (tcp_data_len == 0) {
316                                 m_freem(m_head);
317                                 return 0;
318                         }
319                         /* subtract off the checksum of the tcp header
320                          * from the hardware checksum, and add it to the
321                          * stored tcp data checksum.  Byteswap the checksum
322                          * if the total length so far is odd 
323                          */
324                         tmp_csum = do_csum_data((uint16_t*)tcp,
325                                                          tcp_hdr_len);
326                         csum = csum + (tmp_csum ^ 0xffff);
327                         csum = (csum & 0xffff) + (csum >> 16);
328                         csum = (csum & 0xffff) + (csum >> 16);
329                         if (lro->len & 0x1) {
330                                 /* Odd number of bytes so far, flip bytes */
331                                 csum = ((csum << 8) | (csum >> 8)) & 0xffff;
332                         }
333                         csum = csum + lro->data_csum;
334                         csum = (csum & 0xffff) + (csum >> 16);
335                         csum = (csum & 0xffff) + (csum >> 16);
336                         lro->data_csum = csum;
337
338                         lro->len += tcp_data_len;
339
340                         /* adjust mbuf so that m->m_data points to
341                            the first byte of the payload */
342                         m_adj(m_head, hlen);
343                         /* append mbuf chain */
344                         lro->m_tail->m_next = m_head;
345                         /* advance the last pointer */
346                         lro->m_tail = m_tail;
347                         /* flush packet if required */
348                         device_mtu = cntl->ifp->if_mtu;
349                         if (lro->len > (65535 - device_mtu)) {
350                                 SLIST_REMOVE(&cntl->lro_active, lro,
351                                              lro_entry, next);
352                                 tcp_lro_flush(cntl, lro);
353                         }
354                         return 0;
355                 }
356         }
357
358         if (SLIST_EMPTY(&cntl->lro_free))
359             return -1;
360
361         /* start a new chain */
362         lro = SLIST_FIRST(&cntl->lro_free);
363         SLIST_REMOVE_HEAD(&cntl->lro_free, next);
364         SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
365         lro->source_port = tcp->th_sport;
366         lro->dest_port = tcp->th_dport;
367         lro->source_ip = ip->ip_src.s_addr;
368         lro->dest_ip = ip->ip_dst.s_addr;
369         lro->next_seq = seq + tcp_data_len;
370         lro->mss = tcp_data_len;
371         lro->ack_seq = tcp->th_ack;
372         lro->window = tcp->th_win;
373
374         /* save the checksum of just the TCP payload by
375          * subtracting off the checksum of the TCP header from
376          * the entire hardware checksum 
377          * Since IP header checksum is correct, checksum over
378          * the IP header is -0.  Substracting -0 is unnecessary.
379          */
380         tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
381         csum = csum + (tmp_csum ^ 0xffff);
382         csum = (csum & 0xffff) + (csum >> 16);
383         csum = (csum & 0xffff) + (csum >> 16);
384         lro->data_csum = csum;
385         
386         lro->ip = ip;
387         /* record timestamp if it is present */
388         if (opt_bytes) {
389                 lro->timestamp = 1;
390                 lro->tsval = ntohl(*(ts_ptr + 1));
391                 lro->tsecr = *(ts_ptr + 2);
392         }
393         lro->len = tot_len;
394         lro->m_head = m_head;
395         lro->m_tail = m_tail;
396         return 0;
397 }