]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_lro.c
ident(1): Normalizing date format
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_lro.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2007, Myricom Inc.
5  * Copyright (c) 2008, Intel Corporation.
6  * Copyright (c) 2012 The FreeBSD Foundation
7  * Copyright (c) 2016 Mellanox Technologies.
8  * All rights reserved.
9  *
10  * Portions of this software were developed by Bjoern Zeeb
11  * under sponsorship from the FreeBSD Foundation.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37
38 #include "opt_inet.h"
39 #include "opt_inet6.h"
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/mbuf.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sockbuf.h>
49 #include <sys/sysctl.h>
50
51 #include <net/if.h>
52 #include <net/if_var.h>
53 #include <net/ethernet.h>
54 #include <net/vnet.h>
55
56 #include <netinet/in_systm.h>
57 #include <netinet/in.h>
58 #include <netinet/ip6.h>
59 #include <netinet/ip.h>
60 #include <netinet/ip_var.h>
61 #include <netinet/in_pcb.h>
62 #include <netinet6/in6_pcb.h>
63 #include <netinet/tcp.h>
64 #include <netinet/tcp_seq.h>
65 #include <netinet/tcp_lro.h>
66 #include <netinet/tcp_var.h>
67 #include <netinet/tcp_hpts.h>
68 #include <netinet/tcp_log_buf.h>
69 #include <netinet6/ip6_var.h>
70
71 #include <machine/in_cksum.h>
72
/* malloc(9) type passed to the allocation and free calls visible in this file. */
73 static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
74
/*
 * TCP_LRO_UPDATE_CSUM is defined unconditionally on the next line, so the
 * #ifndef branch that follows is compiled out and TCP_LRO_INVALID_CSUM is
 * not defined by this file.
 */
75 #define TCP_LRO_UPDATE_CSUM     1
76 #ifndef TCP_LRO_UPDATE_CSUM
77 #define TCP_LRO_INVALID_CSUM    0x0000
78 #endif
79
/* Forward declarations of file-local helpers. */
80 static void     tcp_lro_rx_done(struct lro_ctrl *lc);
81 static int      tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m,
82                     uint32_t csum, int use_hash);
83
/* Parent sysctl node: net.inet.tcp.lro. */
84 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro,  CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
85     "TCP LRO");
86
/*
 * tcplro_stacks_wanting_mbufq is adjusted atomically by
 * tcp_lro_reg_mbufq() and tcp_lro_dereg_mbufq().  The counter_u64_t
 * statistics below are exported read-only under net.inet.tcp.lro.
 */
87 static long tcplro_stacks_wanting_mbufq = 0;
88 counter_u64_t tcp_inp_lro_direct_queue;
89 counter_u64_t tcp_inp_lro_wokeup_queue;
90 counter_u64_t tcp_inp_lro_compressed;
91 counter_u64_t tcp_inp_lro_single_push;
92 counter_u64_t tcp_inp_lro_locks_taken;
93 counter_u64_t tcp_inp_lro_sack_wake;
94
/*
 * tcp_lro_entries is the entry count tcp_lro_init() passes to
 * tcp_lro_init_args(); it is exported with CTLFLAG_RDTUN.
 * hold_lock_over_compress is a read-write knob; its consumer is not in
 * this part of the file.
 */
95 static unsigned tcp_lro_entries = TCP_LRO_ENTRIES;
96 static int32_t hold_lock_over_compress = 0;
97 SYSCTL_INT(_net_inet_tcp_lro, OID_AUTO, hold_lock, CTLFLAG_RW,
98     &hold_lock_over_compress, 0,
99     "Do we hold the lock over the compress of mbufs?");
100 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
101     CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
102     "default number of LRO entries");
/* Read-only statistics counters. */
103 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
104     &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
105 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
106     &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts");
107 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD,
108     &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport");
109 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, single, CTLFLAG_RD,
110     &tcp_inp_lro_single_push, "Number of lro's sent with single segment");
111 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD,
112     &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken");
113 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, sackwakeups, CTLFLAG_RD,
114     &tcp_inp_lro_sack_wake, "Number of wakeups caused by sack/fin");
115
116 void
117 tcp_lro_reg_mbufq(void)
118 {
119         atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1);
120 }
121
122 void
123 tcp_lro_dereg_mbufq(void)
124 {
125         atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1);
126 }
127
128 static __inline void
129 tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
130     struct lro_entry *le)
131 {
132
133         LIST_INSERT_HEAD(&lc->lro_active, le, next);
134         LIST_INSERT_HEAD(bucket, le, hash_next);
135 }
136
137 static __inline void
138 tcp_lro_active_remove(struct lro_entry *le)
139 {
140
141         LIST_REMOVE(le, next);          /* active list */
142         LIST_REMOVE(le, hash_next);     /* hash bucket */
143 }
144
145 int
146 tcp_lro_init(struct lro_ctrl *lc)
147 {
148         return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
149 }
150
151 int
152 tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
153     unsigned lro_entries, unsigned lro_mbufs)
154 {
155         struct lro_entry *le;
156         size_t size;
157         unsigned i, elements;
158
159         lc->lro_bad_csum = 0;
160         lc->lro_queued = 0;
161         lc->lro_flushed = 0;
162         lc->lro_mbuf_count = 0;
163         lc->lro_mbuf_max = lro_mbufs;
164         lc->lro_cnt = lro_entries;
165         lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
166         lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
167         lc->ifp = ifp;
168         LIST_INIT(&lc->lro_free);
169         LIST_INIT(&lc->lro_active);
170
171         /* create hash table to accelerate entry lookup */
172         if (lro_entries > lro_mbufs)
173                 elements = lro_entries;
174         else
175                 elements = lro_mbufs;
176         lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz,
177             HASH_NOWAIT);
178         if (lc->lro_hash == NULL) {
179                 memset(lc, 0, sizeof(*lc));
180                 return (ENOMEM);
181         }
182
183         /* compute size to allocate */
184         size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
185             (lro_entries * sizeof(*le));
186         lc->lro_mbuf_data = (struct lro_mbuf_sort *)
187             malloc(size, M_LRO, M_NOWAIT | M_ZERO);
188
189         /* check for out of memory */
190         if (lc->lro_mbuf_data == NULL) {
191                 free(lc->lro_hash, M_LRO);
192                 memset(lc, 0, sizeof(*lc));
193                 return (ENOMEM);
194         }
195         /* compute offset for LRO entries */
196         le = (struct lro_entry *)
197             (lc->lro_mbuf_data + lro_mbufs);
198
199         /* setup linked list */
200         for (i = 0; i != lro_entries; i++)
201                 LIST_INSERT_HEAD(&lc->lro_free, le + i, next);
202
203         return (0);
204 }
205
206 static struct tcphdr *
207 tcp_lro_get_th(struct lro_entry *le, struct mbuf *m)
208 {
209         struct ether_header *eh;
210         struct tcphdr *th = NULL;
211 #ifdef INET6
212         struct ip6_hdr *ip6 = NULL;     /* Keep compiler happy. */
213 #endif
214 #ifdef INET
215         struct ip *ip4 = NULL;          /* Keep compiler happy. */
216 #endif
217
218         eh = mtod(m, struct ether_header *);
219         switch (le->eh_type) {
220 #ifdef INET6
221         case ETHERTYPE_IPV6:
222                 ip6 = (struct ip6_hdr *)(eh + 1);
223                 th = (struct tcphdr *)(ip6 + 1);
224                 break;
225 #endif
226 #ifdef INET
227         case ETHERTYPE_IP:
228                 ip4 = (struct ip *)(eh + 1);
229                 th = (struct tcphdr *)(ip4 + 1);
230                 break;
231 #endif
232         }
233         return (th);
234 }
235
236 void
237 tcp_lro_free(struct lro_ctrl *lc)
238 {
239         struct lro_entry *le;
240         unsigned x;
241
242         /* reset LRO free list */
243         LIST_INIT(&lc->lro_free);
244
245         /* free active mbufs, if any */
246         while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
247                 tcp_lro_active_remove(le);
248                 m_freem(le->m_head);
249         }
250
251         /* free hash table */
252         free(lc->lro_hash, M_LRO);
253         lc->lro_hash = NULL;
254         lc->lro_hashsz = 0;
255
256         /* free mbuf array, if any */
257         for (x = 0; x != lc->lro_mbuf_count; x++)
258                 m_freem(lc->lro_mbuf_data[x].mb);
259         lc->lro_mbuf_count = 0;
260
261         /* free allocated memory, if any */
262         free(lc->lro_mbuf_data, M_LRO);
263         lc->lro_mbuf_data = NULL;
264 }
265
266 static uint16_t
267 tcp_lro_csum_th(struct tcphdr *th)
268 {
269         uint32_t ch;
270         uint16_t *p, l;
271
272         ch = th->th_sum = 0x0000;
273         l = th->th_off;
274         p = (uint16_t *)th;
275         while (l > 0) {
276                 ch += *p;
277                 p++;
278                 ch += *p;
279                 p++;
280                 l--;
281         }
282         while (ch > 0xffff)
283                 ch = (ch >> 16) + (ch & 0xffff);
284
285         return (ch & 0xffff);
286 }
287
/*
 * Remove the L3 length/pseudo-header term and the TCP header term from
 * a 16-bit one's-complement sum and return the folded 16-bit result.
 *
 * le           - entry; eh_type selects IPv4/IPv6 handling and
 *                append_cnt selects which length term is removed.
 * l3hdr        - start of the IPv4 or IPv6 header (cast per eh_type).
 * th           - TCP header.  tcp_lro_csum_th() zeroes th->th_sum as a
 *                side effect, so a caller that still needs the field
 *                must save and restore it.
 * tcp_data_len - not referenced by this function.
 * csum         - starting sum.
 */
288 static uint16_t
289 tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
290     uint16_t tcp_data_len, uint16_t csum)
291 {
292         uint32_t c;
293         uint16_t cs;
294
295         c = csum;
296
297         /* Remove length from checksum. */
298         switch (le->eh_type) {
299 #ifdef INET6
300         case ETHERTYPE_IPV6:
301         {
302                 struct ip6_hdr *ip6;
303
304                 ip6 = (struct ip6_hdr *)l3hdr;
                 /* append_cnt == 0: use the raw payload-length field. */
305                 if (le->append_cnt == 0)
306                         cs = ip6->ip6_plen;
307                 else {
308                         uint32_t cx;
309
                         /* Otherwise use the IPv6 pseudo-header sum. */
310                         cx = ntohs(ip6->ip6_plen);
311                         cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
312                 }
313                 break;
314         }
315 #endif
316 #ifdef INET
317         case ETHERTYPE_IP:
318         {
319                 struct ip *ip4;
320
321                 ip4 = (struct ip *)l3hdr;
                 /* append_cnt == 0: use the raw total-length field. */
322                 if (le->append_cnt == 0)
323                         cs = ip4->ip_len;
324                 else {
                         /*
                          * Otherwise use the IPv4 pseudo-header sum built
                          * from (ip_len - header size) + IPPROTO_TCP and
                          * the two addresses.
                          */
325                         cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
326                             IPPROTO_TCP);
327                         cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
328                             htons(cs));
329                 }
330                 break;
331         }
332 #endif
333         default:
334                 cs = 0;         /* Keep compiler happy. */
335         }
336
         /* Adding the 16-bit complement subtracts the term. */
337         cs = ~cs;
338         c += cs;
339
340         /* Remove TCP header csum. */
341         cs = ~tcp_lro_csum_th(th);
342         c += cs;
         /* Fold carries back into the low 16 bits. */
343         while (c > 0xffff)
344                 c = (c >> 16) + (c & 0xffff);
345
346         return (c & 0xffff);
347 }
348
349 static void
350 tcp_lro_rx_done(struct lro_ctrl *lc)
351 {
352         struct lro_entry *le;
353
354         while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
355                 tcp_lro_active_remove(le);
356                 tcp_lro_flush(lc, le);
357         }
358 }
359
360 void
361 tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
362 {
363         struct lro_entry *le, *le_tmp;
364         struct timeval tv;
365
366         if (LIST_EMPTY(&lc->lro_active))
367                 return;
368
369         getmicrouptime(&tv);
370         timevalsub(&tv, timeout);
371         LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
372                 if (timevalcmp(&tv, &le->mtime, >=)) {
373                         tcp_lro_active_remove(le);
374                         tcp_lro_flush(lc, le);
375                 }
376         }
377 }
378
#ifdef INET6
/*
 * Check an IPv6 header for LRO and locate the TCP header.
 *
 * Only packets whose next header is TCP are accepted; extension headers
 * are not supported and the flow label is not examined.
 *
 * Returns 0 and stores the TCP header pointer in *th on success,
 * TCP_LRO_NOT_SUPPORTED otherwise (*th is then left untouched).
 */
static int
tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
    struct tcphdr **th)
{

	/* XXX-BZ we should check the flow-label. */

	/* XXX-BZ We do not yet support ext. hdrs. */
	if (ip6->ip6_nxt == IPPROTO_TCP) {
		/* The TCP header directly follows the fixed IPv6 header. */
		*th = (struct tcphdr *)(ip6 + 1);
		return (0);
	}

	return (TCP_LRO_NOT_SUPPORTED);
}
#endif
397
#ifdef INET
/*
 * Check an IPv4 header for LRO and locate the TCP header.
 *
 * Returns 0 and stores the TCP header pointer in *th on success.
 * Returns TCP_LRO_NOT_SUPPORTED for non-TCP packets and TCP_LRO_CANNOT
 * for packets with IP options, fragments or a bad header checksum; a
 * bad checksum also increments lc->lro_bad_csum.
 */
static int
tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
    struct tcphdr **th)
{
	uint16_t hdr_sum;
	int flags, sum_ok;

	if (ip4->ip_p != IPPROTO_TCP)
		return (TCP_LRO_NOT_SUPPORTED);

	/* No IP options allowed. */
	if ((ip4->ip_hl << 2) != sizeof (*ip4))
		return (TCP_LRO_CANNOT);

	/* No fragments either. */
	if ((ip4->ip_off & htons(IP_MF|IP_OFFMASK)) != 0)
		return (TCP_LRO_CANNOT);

	/*
	 * The header checksum must be good: use the result recorded in
	 * csum_flags when CSUM_IP_CHECKED is set, otherwise compute it.
	 */
	flags = m->m_pkthdr.csum_flags;
	if ((flags & CSUM_IP_CHECKED) != 0) {
		sum_ok = ((flags & CSUM_IP_VALID) != 0);
	} else {
		hdr_sum = in_cksum_hdr(ip4);
		sum_ok = (hdr_sum == 0);
	}
	if (__predict_false(!sum_ok)) {
		lc->lro_bad_csum++;
		return (TCP_LRO_CANNOT);
	}

	/* With no options the TCP header directly follows struct ip. */
	*th = (struct tcphdr *)(ip4 + 1);
	return (0);
}
#endif
436
437 static void
438 tcp_lro_log(struct tcpcb *tp, struct lro_ctrl *lc,
439             struct lro_entry *le, struct mbuf *m, int frm, int32_t tcp_data_len,
440             uint32_t th_seq , uint32_t th_ack, uint16_t th_win)
441 {
442         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
443                 union tcp_log_stackspecific log;
444                 struct timeval tv;
445                 uint32_t cts;
446
447                 cts = tcp_get_usecs(&tv);
448                 memset(&log, 0, sizeof(union tcp_log_stackspecific));
449                 log.u_bbr.flex8 = frm;
450                 log.u_bbr.flex1 = tcp_data_len;
451                 if (m)
452                         log.u_bbr.flex2 = m->m_pkthdr.len;
453                 else
454                         log.u_bbr.flex2 = 0;
455                 log.u_bbr.flex3 = le->append_cnt;
456                 log.u_bbr.flex4 = le->p_len;
457                 log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
458                 log.u_bbr.delRate = le->m_head->m_flags;
459                 log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
460                 log.u_bbr.flex6 = lc->lro_length_lim;
461                 log.u_bbr.flex7 = lc->lro_ackcnt_lim;
462                 log.u_bbr.inflight = th_seq;
463                 log.u_bbr.timeStamp = cts;
464                 log.u_bbr.epoch = le->next_seq;
465                 log.u_bbr.delivered = th_ack;
466                 log.u_bbr.lt_epoch = le->ack_seq;
467                 log.u_bbr.pacing_gain = th_win;
468                 log.u_bbr.cwnd_gain = le->window;
469                 log.u_bbr.cur_del_rate = (uintptr_t)m;
470                 log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
471                 log.u_bbr.pkts_out = le->mbuf_cnt;      /* Total mbufs added */
472                 log.u_bbr.applimited = le->ulp_csum;
473                 log.u_bbr.lost = le->mbuf_appended;
474                 TCP_LOG_EVENTP(tp, NULL,
475                                &tp->t_inpcb->inp_socket->so_rcv,
476                                &tp->t_inpcb->inp_socket->so_snd,
477                                TCP_LOG_LRO, 0,
478                                0, &log, false, &tv);
479         }
480 }
481
/*
 * Hand the head mbuf chain of an LRO entry to the interface input
 * routine (lc->ifp->if_input).
 *
 * When more than one segment was appended (append_cnt > 1) the headers
 * are first rewritten to describe the merged packet: L3 length, IPv4
 * header checksum, mbuf checksum flags and packet length, TCP ACK,
 * window, timestamp option and TCP checksum.
 *
 * tp/locked - logging via tcp_lro_log() is done only when tp is
 *             non-NULL and locked is non-zero.
 *
 * lc->lro_queued is increased by le->append_cnt.
 */
482 static void
483 tcp_flush_out_le(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, int locked)
484 {
485         if (le->append_cnt > 1) {
486                 struct tcphdr *th;
487                 uint16_t p_len;
488
                 /* Merged payload length in network byte order. */
489                 p_len = htons(le->p_len);
490                 switch (le->eh_type) {
491 #ifdef INET6
492                 case ETHERTYPE_IPV6:
493                 {
494                         struct ip6_hdr *ip6;
495
496                         ip6 = le->le_ip6;
497                         ip6->ip6_plen = p_len;
498                         th = (struct tcphdr *)(ip6 + 1);
499                         le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
500                             CSUM_PSEUDO_HDR;
                         /* p_len now becomes the full frame length. */
501                         le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
502                         break;
503                 }
504 #endif
505 #ifdef INET
506                 case ETHERTYPE_IP:
507                 {
508                         struct ip *ip4;
509                         uint32_t cl;
510                         uint16_t c;
511
512                         ip4 = le->le_ip4;
513                         /* Fix IP header checksum for new length. */
                         /*
                          * Incremental update: add the complement of the
                          * old length and the new length to the
                          * complemented old checksum, fold, complement.
                          */
514                         c = ~ip4->ip_sum;
515                         cl = c;
516                         c = ~ip4->ip_len;
517                         cl += c + p_len;
518                         while (cl > 0xffff)
519                                 cl = (cl >> 16) + (cl & 0xffff);
520                         c = cl;
521                         ip4->ip_sum = ~c;
522                         ip4->ip_len = p_len;
523                         th = (struct tcphdr *)(ip4 + 1);
524                         le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
525                             CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
                         /* p_len now becomes the full frame length. */
526                         le->p_len += ETHER_HDR_LEN;
527                         break;
528                 }
529 #endif
530                 default:
                         /*
                          * NOTE(review): th stays NULL here and is
                          * dereferenced below; presumably eh_type is
                          * always one of the compiled-in cases -- confirm.
                          */
531                         th = NULL;      /* Keep compiler happy. */
532                 }
533                 le->m_head->m_pkthdr.csum_data = 0xffff;
534                 le->m_head->m_pkthdr.len = le->p_len;
535
536                 /* Incorporate the latest ACK into the TCP header. */
537                 th->th_ack = le->ack_seq;
538                 th->th_win = le->window;
539                 /* Incorporate latest timestamp into the TCP header. */
540                 if (le->timestamp != 0) {
541                         uint32_t *ts_ptr;
542
                         /*
                          * Word 0 after the TCP header is the option
                          * prefix; words 1 and 2 receive tsval and tsecr.
                          * tsval is kept in host order, tsecr as received.
                          */
543                         ts_ptr = (uint32_t *)(th + 1);
544                         ts_ptr[1] = htonl(le->tsval);
545                         ts_ptr[2] = le->tsecr;
546                 }
547                 /* Update the TCP header checksum. */
                 /* Note: tcp_lro_csum_th() zeroes th->th_sum before summing. */
548                 le->ulp_csum += p_len;
549                 le->ulp_csum += tcp_lro_csum_th(th);
550                 while (le->ulp_csum > 0xffff)
551                         le->ulp_csum = (le->ulp_csum >> 16) +
552                             (le->ulp_csum & 0xffff);
553                 th->th_sum = (le->ulp_csum & 0xffff);
554                 th->th_sum = ~th->th_sum;
555                 if (tp && locked) {
556                         tcp_lro_log(tp, lc, le, NULL, 7, 0, 0, 0, 0);
557                 }
558         }
559         /*
560          * Break any chain, this is not set to NULL on the singleton
561          * case m_nextpkt points to m_head. Other case set them
562          * m_nextpkt to NULL in push_and_replace.
563          */
564         le->m_head->m_nextpkt = NULL;
565         le->m_head->m_pkthdr.lro_nsegs = le->append_cnt;
566         if (tp && locked) {
567                 tcp_lro_log(tp, lc, le, le->m_head, 8, 0, 0, 0, 0);
568         }
         /* Pass the packet to the interface input routine. */
569         (*lc->ifp->if_input)(lc->ifp, le->m_head);
570         lc->lro_queued += le->append_cnt;
571 }
572
/*
 * Re-seed an LRO entry from mbuf m, making m the new head of the entry.
 *
 * Using le->eh_type, the L3 header is re-validated through
 * tcp_lro_rx_ipv6()/tcp_lro_rx_ipv4() and the entry's header pointer,
 * addresses, ports, payload length, timestamp state, sequence/ACK/window
 * values, mbuf head/tail pointers and running checksum are taken from m.
 * append_cnt ends up as 1.  th->th_sum is saved before the checksum
 * fixup (which zeroes it) and restored afterwards.
 *
 * NOTE(review): if eh_type matches no case, or the rx check fails with
 * KASSERT compiled out, th is used uninitialized -- confirm that callers
 * only pass packets already accepted for this entry.
 */
573 static void
574 tcp_set_le_to_m(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m)
575 {
576         struct ether_header *eh;
577         void *l3hdr = NULL;             /* Keep compiler happy. */
578         struct tcphdr *th;
579 #ifdef INET6
580         struct ip6_hdr *ip6 = NULL;     /* Keep compiler happy. */
581 #endif
582 #ifdef INET
583         struct ip *ip4 = NULL;          /* Keep compiler happy. */
584 #endif
585         uint32_t *ts_ptr;
586         int error, l, ts_failed = 0;
587         uint16_t tcp_data_len;
588         uint16_t csum;
589
590         error = -1;
591         eh = mtod(m, struct ether_header *);
592         /*
593          * We must reset the other pointers since the mbuf
594          * we were pointing to is about to go away.
595          */
596         switch (le->eh_type) {
597 #ifdef INET6
598         case ETHERTYPE_IPV6:
599                 l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
600                 error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
601                 le->le_ip6 = ip6;
602                 le->source_ip6 = ip6->ip6_src;
603                 le->dest_ip6 = ip6->ip6_dst;
                 /* p_len excludes the Ethernet and IPv6 headers. */
604                 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
605                 break;
606 #endif
607 #ifdef INET
608         case ETHERTYPE_IP:
609                 l3hdr = ip4 = (struct ip *)(eh + 1);
610                 error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
611                 le->le_ip4 = ip4;
612                 le->source_ip4 = ip4->ip_src.s_addr;
613                 le->dest_ip4 = ip4->ip_dst.s_addr;
                 /* p_len excludes only the Ethernet header here. */
614                 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
615                 break;
616 #endif
617         }
618         KASSERT(error == 0, ("%s: le=%p tcp_lro_rx_xxx failed\n",
619                                     __func__, le));
         /* l = number of TCP option bytes. */
620         ts_ptr = (uint32_t *)(th + 1);
621         l = (th->th_off << 2);
622         l -= sizeof(*th);
         /*
          * Options are acceptable only if they are exactly the
          * NOP/NOP/TIMESTAMP layout of TCPOLEN_TSTAMP_APPA bytes.
          */
623         if (l != 0 &&
624             (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
625              (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
626                                TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
627                 /* We have failed to find a timestamp some other option? */
628                 ts_failed = 1;
629         }
630         if ((l != 0) && (ts_failed == 0)) {
631                 le->timestamp = 1;
                 /* tsval is kept in host order, tsecr as received. */
632                 le->tsval = ntohl(*(ts_ptr + 1));
633                 le->tsecr = *(ts_ptr + 2);
634         } else
635                 le->timestamp = 0;
636         le->source_port = th->th_sport;
637         le->dest_port = th->th_dport;
638         /* Pull out the csum */
639         tcp_data_len = m->m_pkthdr.lro_len;
640         le->next_seq = ntohl(th->th_seq) + tcp_data_len;
         /* ack_seq and window are stored as found in the header. */
641         le->ack_seq = th->th_ack;
642         le->window = th->th_win;
643         csum = th->th_sum;
644         /* Setup the data pointers */
645         le->m_head = m;
646         le->m_tail = m_last(m);
         /* append_cnt must be 0 so the fixup removes the raw length field. */
647         le->append_cnt = 0;
648         le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
649                                              ~csum);
650         le->append_cnt++;
651         th->th_sum = csum;      /* Restore checksum on first packet. */
652 }
653
654 static void
655 tcp_push_and_replace(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m, int locked)
656 {
657         /*
658          * Push up the stack the current le and replace
659          * it with m.
660          */
661         struct mbuf *msave;
662
663         /* Grab off the next and save it */
664         msave = le->m_head->m_nextpkt;
665         le->m_head->m_nextpkt = NULL;
666         /* Now push out the old le entry */
667         tcp_flush_out_le(tp, lc, le, locked);
668         /*
669          * Now to replace the data properly in the le
670          * we have to reset the tcp header and
671          * other fields.
672          */
673         tcp_set_le_to_m(lc, le, m);
674         /* Restore the next list */
675         m->m_nextpkt = msave;
676 }
677
/*
 * Merge the packets chained behind le->m_head (via m_nextpkt) into the
 * head packet where possible.
 *
 * At "again" the head is examined first: if it carries TCP options
 * other than the exact NOP/NOP/TIMESTAMP layout, or any flag besides
 * ACK/PUSH, it is flushed with tcp_push_and_replace() and the next
 * packet becomes the head.  Each following packet is then either merged
 * into the head or, when it cannot be merged (ACK-count or length limit
 * reached, unsupported options or flags, timestamp going backwards,
 * sequence mismatch, duplicate ACK), the head is flushed, that packet
 * becomes the new head and processing restarts at "again".
 *
 * Merging a pure ACK updates ack/window state and frees its mbuf;
 * merging a data packet trims it to its payload and links it onto
 * le->m_tail.
 *
 * tp/locked - logging via tcp_lro_log() is done only when tp is
 *             non-NULL and locked is non-zero.
 */
678 static void
679 tcp_lro_condense(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, int locked)
680 {
681         /*
682          * Walk through the mbuf chain we
683          * have on tap and compress/condense
684          * as required.
685          */
686         uint32_t *ts_ptr;
687         struct mbuf *m;
688         struct tcphdr *th;
689         uint16_t tcp_data_len, csum_upd;
690         int l;
691
692         /*
693          * First we must check the lead (m_head)
694          * we must make sure that it is *not*
695          * something that should be sent up
696          * right away (sack etc).
697          */
698 again:
699
700         m = le->m_head->m_nextpkt;
701         if (m == NULL) {
702                 /* Just the one left */
703                 return;
704         }
705         th = tcp_lro_get_th(le, le->m_head);
706         KASSERT(th != NULL,
707                 ("le:%p m:%p th comes back NULL?", le, le->m_head));
         /* l = number of TCP option bytes on the head packet. */
708         l = (th->th_off << 2);
709         l -= sizeof(*th);
710         ts_ptr = (uint32_t *)(th + 1);
711         if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
712                        (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
713                                          TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
714                 /*
715                  * It's not the timestamp. We can't
716                  * use this guy as the head.
717                  */
                 /* Unlink m from the chain; it becomes the new head. */
718                 le->m_head->m_nextpkt = m->m_nextpkt;
719                 tcp_push_and_replace(tp, lc, le, m, locked);
720                 goto again;
721         }
722         if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
723                 /*
724                  * Make sure that previously seen segments/ACKs are delivered
725                  * before this segment, e.g. FIN.
726                  */
727                 le->m_head->m_nextpkt = m->m_nextpkt;
728                 tcp_push_and_replace(tp, lc, le, m, locked);
729                 goto again;
730         }
731         while((m = le->m_head->m_nextpkt) != NULL) {
732                 /*
733                  * condense m into le, first
734                  * pull m out of the list.
735                  */
736                 le->m_head->m_nextpkt = m->m_nextpkt;
737                 m->m_nextpkt = NULL;
738                 /* Setup my data */
739                 tcp_data_len = m->m_pkthdr.lro_len;
740                 th = tcp_lro_get_th(le, m);
741                 KASSERT(th != NULL,
742                         ("le:%p m:%p th comes back NULL?", le, m));
743                 ts_ptr = (uint32_t *)(th + 1);
744                 l = (th->th_off << 2);
745                 l -= sizeof(*th);
746                 if (tp && locked) {
747                         tcp_lro_log(tp, lc, le, m, 1, 0, 0, 0, 0);
748                 }
                 /* Too many segments already merged into the head. */
749                 if (le->append_cnt >= lc->lro_ackcnt_lim) {
750                         if (tp && locked) {
751                                 tcp_lro_log(tp, lc, le, m, 2, 0, 0, 0, 0);
752                         }
753                         tcp_push_and_replace(tp, lc, le, m, locked);
754                         goto again;
755                 }
756                 if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
757                         /* Flush now if appending will result in overflow. */
758                         if (tp && locked) {
759                                 tcp_lro_log(tp, lc, le, m, 3, tcp_data_len, 0, 0, 0);
760                         }
761                         tcp_push_and_replace(tp, lc, le, m, locked);
762                         goto again;
763                 }
764                 if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
765                                (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
766                                                  TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
767                         /*
768                          * Maybe a sack in the new one? We need to
769                          * start all over after flushing the
770                          * current le. We will go up to the beginning
771                          * and flush it (calling the replace again possibly
772                          * or just returning).
773                          */
774                         tcp_push_and_replace(tp, lc, le, m, locked);
775                         goto again;
776                 }
                 /* Any flag other than ACK/PUSH ends the merge. */
777                 if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
778                         tcp_push_and_replace(tp, lc, le, m, locked);
779                         goto again;
780                 }
781                 if (l != 0) {
782                         uint32_t tsval = ntohl(*(ts_ptr + 1));
783                         /* Make sure timestamp values are increasing. */
784                         if (TSTMP_GT(le->tsval, tsval))  {
785                                 tcp_push_and_replace(tp, lc, le, m, locked);
786                                 goto again;
787                         }
788                         le->tsval = tsval;
789                         le->tsecr = *(ts_ptr + 2);
790                 }
791                 /* Try to append the new segment. */
792                 if (__predict_false(ntohl(th->th_seq) != le->next_seq ||
793                                     (tcp_data_len == 0 &&
794                                      le->ack_seq == th->th_ack &&
795                                      le->window == th->th_win))) {
796                         /* Out of order packet or duplicate ACK. */
797                         if (tp && locked) {
798                                 tcp_lro_log(tp, lc, le, m, 4, tcp_data_len,
799                                             ntohl(th->th_seq),
800                                             th->th_ack,
801                                             th->th_win);
802                         }
803                         tcp_push_and_replace(tp, lc, le, m, locked);
804                         goto again;
805                 }
                 /*
                  * Take ACK and window from this segment if it carries
                  * data or a newer ACK; on an equal ACK keep whichever
                  * window WIN_MAX() selects.
                  */
806                 if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
807                         le->next_seq += tcp_data_len;
808                         le->ack_seq = th->th_ack;
809                         le->window = th->th_win;
810                 } else if (th->th_ack == le->ack_seq) {
811                         le->window = WIN_MAX(le->window, th->th_win);
812                 }
                 /* Fold this packet's lro_csum into the running sum. */
813                 csum_upd = m->m_pkthdr.lro_csum;
814                 le->ulp_csum += csum_upd;
815                 if (tcp_data_len == 0) {
                         /* Pure ACK: count it, then free the mbuf. */
816                         le->append_cnt++;
817                         le->mbuf_cnt--;
818                         if (tp && locked) {
819                                 tcp_lro_log(tp, lc, le, m, 5, tcp_data_len,
820                                             ntohl(th->th_seq),
821                                             th->th_ack,
822                                             th->th_win);
823                         }
824                         m_freem(m);
825                         continue;
826                 }
827                 le->append_cnt++;
828                 le->mbuf_appended++;
829                 le->p_len += tcp_data_len;
830                 /*
831                  * Adjust the mbuf so that m_data points to the first byte of
832                  * the ULP payload.  Adjust the mbuf to avoid complications and
833                  * append new segment to existing mbuf chain.
834                  */
835                 m_adj(m, m->m_pkthdr.len - tcp_data_len);
836                 if (tp && locked) {
837                         tcp_lro_log(tp, lc, le, m, 6, tcp_data_len,
838                                             ntohl(th->th_seq),
839                                             th->th_ack,
840                                             th->th_win);
841                 }
                 /* Drop m's packet header and link it after the tail. */
842                 m_demote_pkthdr(m);
843                 le->m_tail->m_next = m;
844                 le->m_tail = m_last(m);
845         }
846 }
847
#ifdef TCPHPTS
/*
 * Hand the packet chain collected in 'le' over to the input queue of
 * 'tp'.  The chain (le->m_head .. le->m_last_mbuf, linked through
 * m_nextpkt) is appended after whatever is already queued, and the
 * entry gives up its references to the chain.
 */
static void
tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le)
{
	if (tp->t_in_pkt != NULL) {
		/* Queue is not empty: link behind the current tail. */
		tp->t_tail_pkt->m_nextpkt = le->m_head;
	} else {
		/* Queue is empty: our chain becomes the queue. */
		tp->t_in_pkt = le->m_head;
	}
	/* Either way the last packet of our chain is the new tail. */
	tp->t_tail_pkt = le->m_last_mbuf;

	/* The entry no longer owns the packets. */
	le->m_head = NULL;
	le->m_last_mbuf = NULL;
}
#endif
865
866 void
867 tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
868 {
869         struct tcpcb *tp = NULL;
870         int locked = 0;
871 #ifdef TCPHPTS
872         struct inpcb *inp = NULL;
873         int need_wakeup = 0, can_queue = 0;
874         struct epoch_tracker et;
875
876         /* Now lets lookup the inp first */
877         CURVNET_SET(lc->ifp->if_vnet);
878         /*
879          * XXXRRS Currently the common input handler for
880          * mbuf queuing cannot handle VLAN Tagged. This needs
881          * to be fixed and the or condition removed (i.e. the
882          * common code should do the right lookup for the vlan
883          * tag and anything else that the vlan_input() does).
884          */
885         if ((tcplro_stacks_wanting_mbufq == 0) || (le->m_head->m_flags & M_VLANTAG))
886                 goto skip_lookup;
887         NET_EPOCH_ENTER(et);
888         switch (le->eh_type) {
889 #ifdef INET6
890         case ETHERTYPE_IPV6:
891                 inp = in6_pcblookup(&V_tcbinfo, &le->source_ip6,
892                                     le->source_port, &le->dest_ip6,le->dest_port,
893                                     INPLOOKUP_WLOCKPCB,
894                                     lc->ifp);
895                 break;
896 #endif
897 #ifdef INET
898         case ETHERTYPE_IP:
899                 inp = in_pcblookup(&V_tcbinfo, le->le_ip4->ip_src,
900                                    le->source_port, le->le_ip4->ip_dst, le->dest_port,
901                                    INPLOOKUP_WLOCKPCB,
902                                    lc->ifp);
903                 break;
904 #endif
905         }
906         NET_EPOCH_EXIT(et);
907         if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) ||
908                     (inp->inp_flags2 & INP_FREED))) {
909                 /* We don't want this guy */
910                 INP_WUNLOCK(inp);
911                 inp = NULL;
912         }
913         if (inp && (inp->inp_flags2 & INP_SUPPORTS_MBUFQ)) {
914                 /* The transport supports mbuf queuing */
915                 can_queue = 1;
916                 if (le->need_wakeup ||
917                     ((inp->inp_in_input == 0) &&
918                      ((inp->inp_flags2 & INP_MBUF_QUEUE_READY) == 0))) {
919                         /*
920                          * Either the transport is off on a keep-alive
921                          * (it has the queue_ready flag clear and its
922                          *  not already been woken) or the entry has
923                          * some urgent thing (FIN or possibly SACK blocks).
924                          * This means we need to wake the transport up by
925                          * putting it on the input pacer.
926                          */
927                         need_wakeup = 1;
928                         if ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) &&
929                             (le->need_wakeup != 1)) {
930                                 /*
931                                  * Prohibited from a sack wakeup.
932                                  */
933                                 need_wakeup = 0;
934                         }
935                 }
936                 /* Do we need to be awoken due to lots of data or acks? */
937                 if ((le->tcp_tot_p_len >= lc->lro_length_lim) ||
938                     (le->mbuf_cnt >= lc->lro_ackcnt_lim))
939                         need_wakeup = 1;
940         }
941         if (inp) {
942                 tp = intotcpcb(inp);
943                 locked = 1;
944         } else
945                 tp = NULL;
946         if (can_queue) {
947                 counter_u64_add(tcp_inp_lro_direct_queue, 1);
948                 tcp_lro_log(tp, lc, le, NULL, 22, need_wakeup,
949                             inp->inp_flags2, inp->inp_in_input, le->need_wakeup);
950                 tcp_queue_pkts(tp, le);
951                 if (need_wakeup) {
952                         /*
953                          * We must get the guy to wakeup via
954                          * hpts.
955                          */
956                         counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
957                         if (le->need_wakeup)
958                                 counter_u64_add(tcp_inp_lro_sack_wake, 1);
959                         tcp_queue_to_input(inp);
960                 }
961         }
962         if (inp && (hold_lock_over_compress == 0)) {
963                 /* Unlock it */
964                 locked = 0;
965                 tp = NULL;
966                 counter_u64_add(tcp_inp_lro_locks_taken, 1);
967                 INP_WUNLOCK(inp);
968         }
969         if (can_queue == 0) {
970 skip_lookup:
971 #endif /* TCPHPTS */
972                 /* Old fashioned lro method */
973                 if (le->m_head != le->m_last_mbuf)  {
974                         counter_u64_add(tcp_inp_lro_compressed, 1);
975                         tcp_lro_condense(tp, lc, le, locked);
976                 } else
977                         counter_u64_add(tcp_inp_lro_single_push, 1);
978                 tcp_flush_out_le(tp, lc, le, locked);
979 #ifdef TCPHPTS
980         }
981         if (inp && locked) {
982                 counter_u64_add(tcp_inp_lro_locks_taken, 1);
983                 INP_WUNLOCK(inp);
984         }
985         CURVNET_RESTORE();
986 #endif
987         lc->lro_flushed++;
988         bzero(le, sizeof(*le));
989         LIST_INSERT_HEAD(&lc->lro_free, le, next);
990 }
991
#ifdef HAVE_INLINE_FLSLL
#define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))
#else
/*
 * Return a value with only the most significant set bit of 'x' kept.
 * Returns 0 when 'x' is 0.
 */
static inline uint64_t
tcp_lro_msb_64(uint64_t x)
{
	unsigned int shift;

	/* Smear the top set bit into every lower bit position. */
	for (shift = 1; shift < 64; shift <<= 1)
		x |= (x >> shift);

	/*
	 * 'x' is now a solid run of ones from the top set bit down to
	 * bit 0; removing the same run shifted right by one leaves just
	 * the top bit.
	 */
	return (x ^ (x >> 1));
}
#endif
1007
1008 /*
1009  * The tcp_lro_sort() routine is comparable to qsort(), except it has
1010  * a worst case complexity limit of O(MIN(N,64)*N), where N is the
1011  * number of elements to sort and 64 is the number of sequence bits
1012  * available. The algorithm is bit-slicing the 64-bit sequence number,
1013  * sorting one bit at a time from the most significant bit until the
1014  * least significant one, skipping the constant bits. This is
1015  * typically called a radix sort.
1016  */
1017 static void
1018 tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
1019 {
1020         struct lro_mbuf_sort temp;
1021         uint64_t ones;
1022         uint64_t zeros;
1023         uint32_t x;
1024         uint32_t y;
1025
1026 repeat:
1027         /* for small arrays insertion sort is faster */
1028         if (size <= 12) {
1029                 for (x = 1; x < size; x++) {
1030                         temp = parray[x];
1031                         for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
1032                                 parray[y] = parray[y - 1];
1033                         parray[y] = temp;
1034                 }
1035                 return;
1036         }
1037
1038         /* compute sequence bits which are constant */
1039         ones = 0;
1040         zeros = 0;
1041         for (x = 0; x != size; x++) {
1042                 ones |= parray[x].seq;
1043                 zeros |= ~parray[x].seq;
1044         }
1045
1046         /* compute bits which are not constant into "ones" */
1047         ones &= zeros;
1048         if (ones == 0)
1049                 return;
1050
1051         /* pick the most significant bit which is not constant */
1052         ones = tcp_lro_msb_64(ones);
1053
1054         /*
1055          * Move entries having cleared sequence bits to the beginning
1056          * of the array:
1057          */
1058         for (x = y = 0; y != size; y++) {
1059                 /* skip set bits */
1060                 if (parray[y].seq & ones)
1061                         continue;
1062                 /* swap entries */
1063                 temp = parray[x];
1064                 parray[x] = parray[y];
1065                 parray[y] = temp;
1066                 x++;
1067         }
1068
1069         KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));
1070
1071         /* sort zeros */
1072         tcp_lro_sort(parray, x);
1073
1074         /* sort ones */
1075         parray += x;
1076         size -= x;
1077         goto repeat;
1078 }
1079
1080 void
1081 tcp_lro_flush_all(struct lro_ctrl *lc)
1082 {
1083         uint64_t seq;
1084         uint64_t nseq;
1085         unsigned x;
1086
1087         /* check if no mbufs to flush */
1088         if (lc->lro_mbuf_count == 0)
1089                 goto done;
1090
1091         /* sort all mbufs according to stream */
1092         tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);
1093
1094         /* input data into LRO engine, stream by stream */
1095         seq = 0;
1096         for (x = 0; x != lc->lro_mbuf_count; x++) {
1097                 struct mbuf *mb;
1098
1099                 /* get mbuf */
1100                 mb = lc->lro_mbuf_data[x].mb;
1101
1102                 /* get sequence number, masking away the packet index */
1103                 nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);
1104
1105                 /* check for new stream */
1106                 if (seq != nseq) {
1107                         seq = nseq;
1108
1109                         /* flush active streams */
1110                         tcp_lro_rx_done(lc);
1111                 }
1112
1113                 /* add packet to LRO engine */
1114                 if (tcp_lro_rx2(lc, mb, 0, 0) != 0) {
1115                         /* input packet to network layer */
1116                         (*lc->ifp->if_input)(lc->ifp, mb);
1117                         lc->lro_queued++;
1118                         lc->lro_flushed++;
1119                 }
1120         }
1121 done:
1122         /* flush active streams */
1123         tcp_lro_rx_done(lc);
1124
1125         lc->lro_mbuf_count = 0;
1126 }
1127
/*
 * Convert the timespec 'ts' into the timeval 'tv': seconds are copied
 * and nanoseconds are reduced to microseconds (truncating).
 */
static void
lro_set_mtime(struct timeval *tv, struct timespec *ts)
{
	/* 1000 nanoseconds per microsecond */
	tv->tv_usec = ts->tv_nsec / 1000;
	tv->tv_sec = ts->tv_sec;
}
1134
1135 static int
1136 tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash)
1137 {
1138         struct lro_entry *le;
1139         struct ether_header *eh;
1140 #ifdef INET6
1141         struct ip6_hdr *ip6 = NULL;     /* Keep compiler happy. */
1142 #endif
1143 #ifdef INET
1144         struct ip *ip4 = NULL;          /* Keep compiler happy. */
1145 #endif
1146         struct tcphdr *th;
1147         void *l3hdr = NULL;             /* Keep compiler happy. */
1148         uint32_t *ts_ptr;
1149         tcp_seq seq;
1150         int error, ip_len, l;
1151         uint16_t eh_type, tcp_data_len, need_flush;
1152         struct lro_head *bucket;
1153         struct timespec arrv;
1154
1155         /* We expect a contiguous header [eh, ip, tcp]. */
1156         if ((m->m_flags & (M_TSTMP_LRO|M_TSTMP)) == 0) {
1157                 /* If no hardware or arrival stamp on the packet add arrival */
1158                 nanouptime(&arrv);
1159                 m->m_pkthdr.rcv_tstmp = (arrv.tv_sec * 1000000000) + arrv.tv_nsec;
1160                 m->m_flags |= M_TSTMP_LRO;
1161         }
1162         eh = mtod(m, struct ether_header *);
1163         eh_type = ntohs(eh->ether_type);
1164         switch (eh_type) {
1165 #ifdef INET6
1166         case ETHERTYPE_IPV6:
1167         {
1168                 CURVNET_SET(lc->ifp->if_vnet);
1169                 if (V_ip6_forwarding != 0) {
1170                         /* XXX-BZ stats but changing lro_ctrl is a problem. */
1171                         CURVNET_RESTORE();
1172                         return (TCP_LRO_CANNOT);
1173                 }
1174                 CURVNET_RESTORE();
1175                 l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
1176                 error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
1177                 if (error != 0)
1178                         return (error);
1179                 tcp_data_len = ntohs(ip6->ip6_plen);
1180                 ip_len = sizeof(*ip6) + tcp_data_len;
1181                 break;
1182         }
1183 #endif
1184 #ifdef INET
1185         case ETHERTYPE_IP:
1186         {
1187                 CURVNET_SET(lc->ifp->if_vnet);
1188                 if (V_ipforwarding != 0) {
1189                         /* XXX-BZ stats but changing lro_ctrl is a problem. */
1190                         CURVNET_RESTORE();
1191                         return (TCP_LRO_CANNOT);
1192                 }
1193                 CURVNET_RESTORE();
1194                 l3hdr = ip4 = (struct ip *)(eh + 1);
1195                 error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
1196                 if (error != 0)
1197                         return (error);
1198                 ip_len = ntohs(ip4->ip_len);
1199                 tcp_data_len = ip_len - sizeof(*ip4);
1200                 break;
1201         }
1202 #endif
1203         /* XXX-BZ what happens in case of VLAN(s)? */
1204         default:
1205                 return (TCP_LRO_NOT_SUPPORTED);
1206         }
1207
1208         /*
1209          * If the frame is padded beyond the end of the IP packet, then we must
1210          * trim the extra bytes off.
1211          */
1212         l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
1213         if (l != 0) {
1214                 if (l < 0)
1215                         /* Truncated packet. */
1216                         return (TCP_LRO_CANNOT);
1217
1218                 m_adj(m, -l);
1219         }
1220         /*
1221          * Check TCP header constraints.
1222          */
1223         if (th->th_flags & TH_SYN)
1224                 return (TCP_LRO_CANNOT);
1225         if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
1226                 need_flush = 1;
1227         else
1228                 need_flush = 0;
1229         l = (th->th_off << 2);
1230         ts_ptr = (uint32_t *)(th + 1);
1231         tcp_data_len -= l;
1232         l -= sizeof(*th);
1233         if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
1234                        (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
1235                                          TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
1236                 /*
1237                  * We have an option besides Timestamps, maybe
1238                  * it is a sack (most likely) which means we
1239                  * will probably need to wake up a sleeper (if
1240                  * the guy does queueing).
1241                  */
1242                 need_flush = 2;
1243         }
1244
1245         /* If the driver did not pass in the checksum, set it now. */
1246         if (csum == 0x0000)
1247                 csum = th->th_sum;
1248         seq = ntohl(th->th_seq);
1249         if (!use_hash) {
1250                 bucket = &lc->lro_hash[0];
1251         } else if (M_HASHTYPE_ISHASH(m)) {
1252                 bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz];
1253         } else {
1254                 uint32_t hash;
1255
1256                 switch (eh_type) {
1257 #ifdef INET
1258                 case ETHERTYPE_IP:
1259                         hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr;
1260                         break;
1261 #endif
1262 #ifdef INET6
1263                 case ETHERTYPE_IPV6:
1264                         hash = ip6->ip6_src.s6_addr32[0] +
1265                                 ip6->ip6_dst.s6_addr32[0];
1266                         hash += ip6->ip6_src.s6_addr32[1] +
1267                                 ip6->ip6_dst.s6_addr32[1];
1268                         hash += ip6->ip6_src.s6_addr32[2] +
1269                                 ip6->ip6_dst.s6_addr32[2];
1270                         hash += ip6->ip6_src.s6_addr32[3] +
1271                                 ip6->ip6_dst.s6_addr32[3];
1272                         break;
1273 #endif
1274                 default:
1275                         hash = 0;
1276                         break;
1277                 }
1278                 hash += th->th_sport + th->th_dport;
1279                 bucket = &lc->lro_hash[hash % lc->lro_hashsz];
1280         }
1281
1282         /* Try to find a matching previous segment. */
1283         LIST_FOREACH(le, bucket, hash_next) {
1284                 if (le->eh_type != eh_type)
1285                         continue;
1286                 if (le->source_port != th->th_sport ||
1287                     le->dest_port != th->th_dport)
1288                         continue;
1289                 switch (eh_type) {
1290 #ifdef INET6
1291                 case ETHERTYPE_IPV6:
1292                         if (bcmp(&le->source_ip6, &ip6->ip6_src,
1293                                  sizeof(struct in6_addr)) != 0 ||
1294                             bcmp(&le->dest_ip6, &ip6->ip6_dst,
1295                                  sizeof(struct in6_addr)) != 0)
1296                                 continue;
1297                         break;
1298 #endif
1299 #ifdef INET
1300                 case ETHERTYPE_IP:
1301                         if (le->source_ip4 != ip4->ip_src.s_addr ||
1302                             le->dest_ip4 != ip4->ip_dst.s_addr)
1303                                 continue;
1304                         break;
1305 #endif
1306                 }
1307                 if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq)) ||
1308                     (th->th_ack == le->ack_seq)) {
1309                         m->m_pkthdr.lro_len = tcp_data_len;
1310                 } else {
1311                         /* no data and old ack */
1312                         m_freem(m);
1313                         return (0);
1314                 }
1315                 if (need_flush)
1316                         le->need_wakeup = need_flush;
1317                 /* Save of the data only csum */
1318                 m->m_pkthdr.rcvif = lc->ifp;
1319                 m->m_pkthdr.lro_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th,
1320                                                       tcp_data_len, ~csum);
1321                 th->th_sum = csum;      /* Restore checksum */
1322                 /* Save off the tail I am appending too (prev) */
1323                 le->m_prev_last = le->m_last_mbuf;
1324                 /* Mark me in the last spot */
1325                 le->m_last_mbuf->m_nextpkt = m;
1326                 /* Now set the tail to me  */
1327                 le->m_last_mbuf = m;
1328                 le->mbuf_cnt++;
1329                 m->m_nextpkt = NULL;
1330                 /* Add to the total size of data */
1331                 le->tcp_tot_p_len += tcp_data_len;
1332                 lro_set_mtime(&le->mtime, &arrv);
1333                 return (0);
1334         }
1335         /* Try to find an empty slot. */
1336         if (LIST_EMPTY(&lc->lro_free))
1337                 return (TCP_LRO_NO_ENTRIES);
1338
1339         /* Start a new segment chain. */
1340         le = LIST_FIRST(&lc->lro_free);
1341         LIST_REMOVE(le, next);
1342         tcp_lro_active_insert(lc, bucket, le);
1343         lro_set_mtime(&le->mtime, &arrv);
1344
1345         /* Start filling in details. */
1346         switch (eh_type) {
1347 #ifdef INET6
1348         case ETHERTYPE_IPV6:
1349                 le->le_ip6 = ip6;
1350                 le->source_ip6 = ip6->ip6_src;
1351                 le->dest_ip6 = ip6->ip6_dst;
1352                 le->eh_type = eh_type;
1353                 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
1354                 break;
1355 #endif
1356 #ifdef INET
1357         case ETHERTYPE_IP:
1358                 le->le_ip4 = ip4;
1359                 le->source_ip4 = ip4->ip_src.s_addr;
1360                 le->dest_ip4 = ip4->ip_dst.s_addr;
1361                 le->eh_type = eh_type;
1362                 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
1363                 break;
1364 #endif
1365         }
1366         le->source_port = th->th_sport;
1367         le->dest_port = th->th_dport;
1368         le->next_seq = seq + tcp_data_len;
1369         le->ack_seq = th->th_ack;
1370         le->window = th->th_win;
1371         if (l != 0) {
1372                 le->timestamp = 1;
1373                 le->tsval = ntohl(*(ts_ptr + 1));
1374                 le->tsecr = *(ts_ptr + 2);
1375         }
1376         KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
1377                                     __func__, le, le->ulp_csum));
1378
1379         le->append_cnt = 0;
1380         le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
1381                                              ~csum);
1382         le->append_cnt++;
1383         th->th_sum = csum;      /* Restore checksum */
1384         le->m_head = m;
1385         m->m_pkthdr.rcvif = lc->ifp;
1386         le->mbuf_cnt = 1;
1387         if (need_flush)
1388                 le->need_wakeup = need_flush;
1389         else
1390                 le->need_wakeup = 0;
1391         le->m_tail = m_last(m);
1392         le->m_last_mbuf = m;
1393         m->m_nextpkt = NULL;
1394         le->m_prev_last = NULL;
1395         /*
1396          * We keep the total size here for cross checking when we may need
1397          * to flush/wakeup in the MBUF_QUEUE case.
1398          */
1399         le->tcp_tot_p_len = tcp_data_len;
1400         m->m_pkthdr.lro_len = tcp_data_len;
1401         return (0);
1402 }
1403
/*
 * Public entry point for handing a single received frame to the LRO
 * engine.  This is tcp_lro_rx2() with hash bucket selection enabled;
 * the return value is passed through unchanged.
 */
int
tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
{
	int error;

	error = tcp_lro_rx2(lc, m, csum, 1);
	return (error);
}
1410
1411 void
1412 tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
1413 {
1414         struct timespec arrv;
1415
1416         /* sanity checks */
1417         if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
1418             lc->lro_mbuf_max == 0)) {
1419                 /* packet drop */
1420                 m_freem(mb);
1421                 return;
1422         }
1423
1424         /* check if packet is not LRO capable */
1425         if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
1426             (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
1427                 /* input packet to network layer */
1428                 (*lc->ifp->if_input) (lc->ifp, mb);
1429                 return;
1430         }
1431         /* Arrival Stamp the packet */
1432
1433         if ((mb->m_flags & M_TSTMP) == 0) {
1434                 /* If no hardware or arrival stamp on the packet add arrival */
1435                 nanouptime(&arrv);
1436                 mb->m_pkthdr.rcv_tstmp = ((arrv.tv_sec * 1000000000) +
1437                                           arrv.tv_nsec);
1438                 mb->m_flags |= M_TSTMP_LRO;
1439         }
1440         /* create sequence number */
1441         lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
1442             (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
1443             (((uint64_t)mb->m_pkthdr.flowid) << 24) |
1444             ((uint64_t)lc->lro_mbuf_count);
1445
1446         /* enter mbuf */
1447         lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb;
1448
1449         /* flush if array is full */
1450         if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max))
1451                 tcp_lro_flush_all(lc);
1452 }
1453
1454 /* end */