1 /*-
2  * Copyright (c) 2016-2018 Netflix Inc.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  */
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28
29 #include "opt_inet.h"
30 #include "opt_inet6.h"
31 #include "opt_tcpdebug.h"
/**
 * Some notes about usage.
 *
 * The tcp_hpts system is designed to provide a high precision timer
 * system for tcp. Its main purpose is to provide a mechanism for
 * pacing packets out onto the wire. It can be used in two ways
 * by a given TCP stack (and those two methods can be used simultaneously).
 *
 * First, and probably the main thing Rack and BBR use it for, it can
 * be used to call tcp_output() of a transport stack at some time in the
 * future. The normal way this is done is that tcp_output() of the stack
 * schedules itself to be called again by calling
 * tcp_hpts_insert(inp, slot). The slot is the time from now that the
 * stack wants to be called, but it must be converted to tcp_hpts's notion
 * of a slot. This is done with one of the macros HPTS_MS_TO_SLOTS or
 * HPTS_USEC_TO_SLOTS. So a typical call from the tcp_output() routine
 * might look like:
 *
 * tcp_hpts_insert(inp, HPTS_USEC_TO_SLOTS(550));
 *
 * The above would schedule tcp_output() to be called in 550 microseconds.
 * Note that if using this mechanism the stack will want to add, near
 * its top, a check to prevent unwanted calls (from user land or the
 * arrival of incoming ACKs). So it would add something like:
 *
 * if (inp->inp_in_hpts)
 *    return;
 *
 * to prevent output processing until the allotted time has gone by.
 * Of course this is a bare bones example and the stack will probably
 * have more considerations than just the above.
 *
 * The tcp_hpts system will call tcp_output in one of two forms:
 * it will first check to see if the stack has defined a
 * tfb_tcp_output_wtime() function; if so, that is the routine it
 * will call. If that function is not defined, then it will call the
 * tfb_tcp_output() function. The only difference between these
 * two calls is that the former passes the time in to the function
 * so the function does not have to access the time (which tcp_hpts
 * already has). What these functions do is of course totally up
 * to the individual tcp stack.
 *
 * The second facility (actually two facilities) the tcp_hpts system
 * provides is the ability to either abort a connection (later) or
 * process input on a connection.
 * Why would you want to do this? To keep processor locality.
 *
 * So in order to use the input redirection facility the
 * stack changes its tcp_do_segment() routine to call the
 * following function instead of processing the data itself:
 *
 * tcp_queue_pkt_to_input()
 *
 * You will note that the arguments to this function look
 * a lot like tcp_do_segment()'s arguments. This function
 * will assure that the tcp_hpts system will
 * call the function tfb_tcp_hpts_do_segment() from the
 * correct CPU. Note that multiple calls can get pushed
 * into the tcp_hpts system; this will be indicated by
 * the next to last argument to tfb_tcp_hpts_do_segment()
 * (nxt_pkt). If nxt_pkt is 1 then another packet is
 * coming. If nxt_pkt is 0 then this is the last call
 * that the tcp_hpts system has available for the tcp stack.
 *
 * The other point of the input system is to be able to safely
 * drop a tcp connection without worrying about the recursive
 * locking that may be occurring on the INP_WLOCK. So if
 * a stack wants to drop a connection it calls:
 *
 *     tcp_set_inp_to_drop(inp, ETIMEDOUT)
 *
 * to schedule the tcp_hpts system to call
 *
 *    tcp_drop(tp, drop_reason)
 *
 * at a future point. This is quite handy to prevent locking
 * issues when dropping connections.
 *
 */
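
/*
 * Pulling the pieces above together, a pacing stack's output path might
 * look roughly like the sketch below. This is an illustration only; the
 * my_stack_output() name is hypothetical and not part of this file, and a
 * real stack will have more considerations than shown here.
 *
 *	static int
 *	my_stack_output(struct tcpcb *tp)
 *	{
 *		struct inpcb *inp = tp->t_inpcb;
 *
 *		INP_WLOCK_ASSERT(inp);
 *		if (inp->inp_in_hpts)
 *			return (0);
 *		... build and transmit what the pacing budget allows ...
 *		tcp_hpts_insert(inp, HPTS_USEC_TO_SLOTS(550));
 *		return (0);
 *	}
 *
 * The hpts will then call back into the stack (tfb_tcp_output_wtime() or
 * tfb_tcp_output()) roughly 550 microseconds later.
 */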
110
111 #include <sys/param.h>
112 #include <sys/bus.h>
113 #include <sys/interrupt.h>
114 #include <sys/module.h>
115 #include <sys/kernel.h>
116 #include <sys/hhook.h>
117 #include <sys/malloc.h>
118 #include <sys/mbuf.h>
119 #include <sys/proc.h>           /* for proc0 declaration */
120 #include <sys/socket.h>
121 #include <sys/socketvar.h>
122 #include <sys/sysctl.h>
123 #include <sys/systm.h>
124 #include <sys/refcount.h>
125 #include <sys/sched.h>
126 #include <sys/queue.h>
127 #include <sys/smp.h>
128 #include <sys/counter.h>
129 #include <sys/time.h>
130 #include <sys/kthread.h>
131 #include <sys/kern_prefetch.h>
132
133 #include <vm/uma.h>
134
135 #include <net/route.h>
136 #include <net/vnet.h>
137
138 #define TCPSTATES               /* for logging */
139
140 #include <netinet/in.h>
141 #include <netinet/in_kdtrace.h>
142 #include <netinet/in_pcb.h>
143 #include <netinet/ip.h>
144 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
145 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
146 #include <netinet/ip_var.h>
147 #include <netinet/ip6.h>
148 #include <netinet6/in6_pcb.h>
149 #include <netinet6/ip6_var.h>
150 #include <netinet/tcp.h>
151 #include <netinet/tcp_fsm.h>
152 #include <netinet/tcp_seq.h>
153 #include <netinet/tcp_timer.h>
154 #include <netinet/tcp_var.h>
155 #include <netinet/tcpip.h>
156 #include <netinet/cc/cc.h>
157 #include <netinet/tcp_hpts.h>
158
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif                          /* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
165
166 #include "opt_rss.h"
167
168 MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
169 #ifdef RSS
170 static int tcp_bind_threads = 1;
171 #else
172 static int tcp_bind_threads = 0;
173 #endif
174 TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
175
176 static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;
177
178 TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
179
180 static struct tcp_hptsi tcp_pace;
181
182 static void tcp_wakehpts(struct tcp_hpts_entry *p);
183 static void tcp_wakeinput(struct tcp_hpts_entry *p);
184 static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
185 static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
186 static void tcp_hpts_thread(void *ctx);
187 static void tcp_init_hptsi(void *st);
188
189 int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
190 static int32_t tcp_hpts_callout_skip_swi = 0;
191
192 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls");
193
194 #define timersub(tvp, uvp, vvp)                                         \
195         do {                                                            \
196                 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;          \
197                 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;       \
198                 if ((vvp)->tv_usec < 0) {                               \
199                         (vvp)->tv_sec--;                                \
200                         (vvp)->tv_usec += 1000000;                      \
201                 }                                                       \
202         } while (0)
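
/*
 * Usage sketch: timersub(&t_end, &t_start, &t_delta) leaves the elapsed
 * time t_end - t_start in t_delta, normalizing tv_usec into [0, 1000000).
 */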
203
204 static int32_t logging_on = 0;
205 static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
206 static int32_t tcp_hpts_precision = 120;
207
208 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
209     &tcp_hpts_precision, 120,
210     "Value for PRE() precision of callout");
211
212 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
213     &logging_on, 0,
214     "Turn on logging if compiled in");
215
216 counter_u64_t hpts_loops;
217
218 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
219     &hpts_loops, "Number of times hpts had to loop to catch up");
220
221 counter_u64_t back_tosleep;
222
223 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
224     &back_tosleep, "Number of times hpts found no tcbs");
225
226 static int32_t in_newts_every_tcb = 0;
227
228 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
229     &in_newts_every_tcb, 0,
230     "Do we have a new cts every tcb we process for input");
231 static int32_t in_ts_percision = 0;
232
233 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
234     &in_ts_percision, 0,
    "Do we use a precise timestamp for clients on input");
236 static int32_t out_newts_every_tcb = 0;
237
238 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
239     &out_newts_every_tcb, 0,
240     "Do we have a new cts every tcb we process for output");
241 static int32_t out_ts_percision = 0;
242
243 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
244     &out_ts_percision, 0,
    "Do we use a precise timestamp for every output cts");
246
247 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
248     &hpts_sleep_max, 0,
249     "The maximum time the hpts will sleep <1 - 254>");
250
251 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
252     &tcp_min_hptsi_time, 0,
253     "The minimum time the hpts must sleep before processing more slots");
254
255 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
256     &tcp_hpts_callout_skip_swi, 0,
257     "Do we have the callout call directly to the hpts?");
258
259 static void
260 __tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
261     uint32_t ticknow, int32_t line)
262 {
263         struct hpts_log *pl;
264
265         HPTS_MTX_ASSERT(hpts);
266         if (hpts->p_log == NULL)
267                 return;
268         pl = &hpts->p_log[hpts->p_log_at];
269         hpts->p_log_at++;
270         if (hpts->p_log_at >= hpts->p_logsize) {
271                 hpts->p_log_at = 0;
272                 hpts->p_log_wrapped = 1;
273         }
274         pl->inp = inp;
275         if (inp) {
276                 pl->t_paceslot = inp->inp_hptsslot;
277                 pl->t_hptsreq = inp->inp_hpts_request;
278                 pl->p_onhpts = inp->inp_in_hpts;
279                 pl->p_oninput = inp->inp_in_input;
280         } else {
281                 pl->t_paceslot = 0;
282                 pl->t_hptsreq = 0;
283                 pl->p_onhpts = 0;
284                 pl->p_oninput = 0;
285         }
286         pl->is_notempty = 1;
287         pl->event = event;
288         pl->line = line;
289         pl->cts = tcp_get_usecs(NULL);
290         pl->p_curtick = hpts->p_curtick;
291         pl->p_prevtick = hpts->p_prevtick;
292         pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
293         pl->ticknow = ticknow;
294         pl->slot_req = slot;
295         pl->p_nxt_slot = hpts->p_nxt_slot;
296         pl->p_cur_slot = hpts->p_cur_slot;
297         pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
298         pl->p_flags = (hpts->p_cpu & 0x7f);
299         pl->p_flags <<= 7;
300         pl->p_flags |= (hpts->p_num & 0x7f);
301         pl->p_flags <<= 2;
302         if (hpts->p_hpts_active) {
303                 pl->p_flags |= HPTS_HPTS_ACTIVE;
304         }
305 }
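
/*
 * For post-processing the log (a sketch, not used by the kernel itself),
 * the p_flags word packed above decodes as:
 *
 *	cpu    = (p_flags >> 9) & 0x7f;
 *	num    = (p_flags >> 2) & 0x7f;
 *	active = (p_flags & HPTS_HPTS_ACTIVE) != 0;
 *
 * i.e. bits 15-9 carry the hpts CPU, bits 8-2 carry the hpts number, and
 * the low bits carry the active flag (assuming HPTS_HPTS_ACTIVE occupies
 * the low bits, as the shifts above imply).
 */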
306
307 #define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)
308
309 static void
310 hpts_timeout_swi(void *arg)
311 {
312         struct tcp_hpts_entry *hpts;
313
314         hpts = (struct tcp_hpts_entry *)arg;
315         swi_sched(hpts->ie_cookie, 0);
316 }
317
318 static void
319 hpts_timeout_dir(void *arg)
320 {
321         tcp_hpts_thread(arg);
322 }
323
324 static inline void
325 hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
326 {
327 #ifdef INVARIANTS
328         if (mtx_owned(&hpts->p_mtx) == 0) {
329                 /* We don't own the mutex? */
330                 panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
331         }
332         if (hpts->p_cpu != inp->inp_hpts_cpu) {
333                 /* It is not the right cpu/mutex? */
334                 panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
335         }
336         if (inp->inp_in_hpts == 0) {
337                 /* We are not on the hpts? */
338                 panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
339         }
340         if (TAILQ_EMPTY(head) &&
341             (hpts->p_on_queue_cnt != 0)) {
342                 /* We should not be empty with a queue count */
343                 panic("%s hpts:%p hpts bucket empty but cnt:%d",
344                     __FUNCTION__, hpts, hpts->p_on_queue_cnt);
345         }
346 #endif
347         TAILQ_REMOVE(head, inp, inp_hpts);
348         hpts->p_on_queue_cnt--;
349         if (hpts->p_on_queue_cnt < 0) {
350                 /* Count should not go negative .. */
351 #ifdef INVARIANTS
352                 panic("Hpts goes negative inp:%p hpts:%p",
353                     inp, hpts);
354 #endif
355                 hpts->p_on_queue_cnt = 0;
356         }
357         if (clear) {
358                 inp->inp_hpts_request = 0;
359                 inp->inp_in_hpts = 0;
360         }
361 }
362
363 static inline void
364 hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
365 {
366 #ifdef INVARIANTS
367         if (mtx_owned(&hpts->p_mtx) == 0) {
368                 /* We don't own the mutex? */
369                 panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
370         }
371         if (hpts->p_cpu != inp->inp_hpts_cpu) {
372                 /* It is not the right cpu/mutex? */
373                 panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
374         }
375         if ((noref == 0) && (inp->inp_in_hpts == 1)) {
376                 /* We are already on the hpts? */
377                 panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
378         }
379 #endif
380         TAILQ_INSERT_TAIL(head, inp, inp_hpts);
381         inp->inp_in_hpts = 1;
382         hpts->p_on_queue_cnt++;
383         if (noref == 0) {
384                 in_pcbref(inp);
385         }
386 }
387
388 static inline void
389 hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
390 {
391 #ifdef INVARIANTS
392         if (mtx_owned(&hpts->p_mtx) == 0) {
393                 /* We don't own the mutex? */
394                 panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
395         }
396         if (hpts->p_cpu != inp->inp_input_cpu) {
397                 /* It is not the right cpu/mutex? */
398                 panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
399         }
400         if (inp->inp_in_input == 0) {
401                 /* We are not on the input hpts? */
402                 panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
403         }
404 #endif
405         TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
406         hpts->p_on_inqueue_cnt--;
407         if (hpts->p_on_inqueue_cnt < 0) {
408 #ifdef INVARIANTS
409                 panic("Hpts in goes negative inp:%p hpts:%p",
410                     inp, hpts);
411 #endif
412                 hpts->p_on_inqueue_cnt = 0;
413         }
414 #ifdef INVARIANTS
415         if (TAILQ_EMPTY(&hpts->p_input) &&
416             (hpts->p_on_inqueue_cnt != 0)) {
417                 /* We should not be empty with a queue count */
418                 panic("%s hpts:%p in_hpts input empty but cnt:%d",
419                     __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
420         }
421 #endif
422         if (clear)
423                 inp->inp_in_input = 0;
424 }
425
426 static inline void
427 hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
428 {
429 #ifdef INVARIANTS
430         if (mtx_owned(&hpts->p_mtx) == 0) {
431                 /* We don't own the mutex? */
432                 panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
433         }
434         if (hpts->p_cpu != inp->inp_input_cpu) {
435                 /* It is not the right cpu/mutex? */
436                 panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
437         }
438         if (inp->inp_in_input == 1) {
439                 /* We are already on the input hpts? */
440                 panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
441         }
442 #endif
443         TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
444         inp->inp_in_input = 1;
445         hpts->p_on_inqueue_cnt++;
446         in_pcbref(inp);
447 }
448
449 static int
450 sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
451 {
452         struct tcp_hpts_entry *hpts;
453         size_t sz;
454         int32_t logging_was, i;
455         int32_t error = 0;
456
        /*
         * HACK: Turn off logging so no locks are required; this really
         * needs a memory barrier :)
         */
461         logging_was = logging_on;
462         logging_on = 0;
463         if (!req->oldptr) {
464                 /* How much? */
465                 sz = 0;
466                 for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
467                         hpts = tcp_pace.rp_ent[i];
468                         if (hpts->p_log == NULL)
469                                 continue;
470                         sz += (sizeof(struct hpts_log) * hpts->p_logsize);
471                 }
472                 error = SYSCTL_OUT(req, 0, sz);
473         } else {
474                 for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
475                         hpts = tcp_pace.rp_ent[i];
476                         if (hpts->p_log == NULL)
477                                 continue;
478                         if (hpts->p_log_wrapped)
479                                 sz = (sizeof(struct hpts_log) * hpts->p_logsize);
480                         else
481                                 sz = (sizeof(struct hpts_log) * hpts->p_log_at);
482                         error = SYSCTL_OUT(req, hpts->p_log, sz);
483                 }
484         }
485         logging_on = logging_was;
486         return error;
487 }
488
489 SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
490     0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
491
492
493 static void
494 tcp_wakehpts(struct tcp_hpts_entry *hpts)
495 {
496         HPTS_MTX_ASSERT(hpts);
497         swi_sched(hpts->ie_cookie, 0);
498         if (hpts->p_hpts_active == 2) {
                /* Rare sleeping on an ENOBUF */
500                 wakeup_one(hpts);
501         }
502 }
503
504 static void
505 tcp_wakeinput(struct tcp_hpts_entry *hpts)
506 {
507         HPTS_MTX_ASSERT(hpts);
508         swi_sched(hpts->ie_cookie, 0);
509         if (hpts->p_hpts_active == 2) {
                /* Rare sleeping on an ENOBUF */
511                 wakeup_one(hpts);
512         }
513 }
514
515 struct tcp_hpts_entry *
516 tcp_cur_hpts(struct inpcb *inp)
517 {
518         int32_t hpts_num;
519         struct tcp_hpts_entry *hpts;
520
521         hpts_num = inp->inp_hpts_cpu;
522         hpts = tcp_pace.rp_ent[hpts_num];
523         return (hpts);
524 }
525
526 struct tcp_hpts_entry *
527 tcp_hpts_lock(struct inpcb *inp)
528 {
529         struct tcp_hpts_entry *hpts;
530         int32_t hpts_num;
531
532 again:
533         hpts_num = inp->inp_hpts_cpu;
534         hpts = tcp_pace.rp_ent[hpts_num];
535 #ifdef INVARIANTS
536         if (mtx_owned(&hpts->p_mtx)) {
537                 panic("Hpts:%p owns mtx prior-to lock line:%d",
538                     hpts, __LINE__);
539         }
540 #endif
541         mtx_lock(&hpts->p_mtx);
542         if (hpts_num != inp->inp_hpts_cpu) {
543                 mtx_unlock(&hpts->p_mtx);
544                 goto again;
545         }
546         return (hpts);
547 }
548
549 struct tcp_hpts_entry *
550 tcp_input_lock(struct inpcb *inp)
551 {
552         struct tcp_hpts_entry *hpts;
553         int32_t hpts_num;
554
555 again:
556         hpts_num = inp->inp_input_cpu;
557         hpts = tcp_pace.rp_ent[hpts_num];
558 #ifdef INVARIANTS
559         if (mtx_owned(&hpts->p_mtx)) {
560                 panic("Hpts:%p owns mtx prior-to lock line:%d",
561                     hpts, __LINE__);
562         }
563 #endif
564         mtx_lock(&hpts->p_mtx);
565         if (hpts_num != inp->inp_input_cpu) {
566                 mtx_unlock(&hpts->p_mtx);
567                 goto again;
568         }
569         return (hpts);
570 }
571
572 static void
573 tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
574 {
575         int32_t add_freed;
576
577         if (inp->inp_flags2 & INP_FREED) {
578                 /*
579                  * Need to play a special trick so that in_pcbrele_wlocked
580                  * does not return 1 when it really should have returned 0.
581                  */
582                 add_freed = 1;
583                 inp->inp_flags2 &= ~INP_FREED;
584         } else {
585                 add_freed = 0;
586         }
587 #ifndef INP_REF_DEBUG
588         if (in_pcbrele_wlocked(inp)) {
589                 /*
590                  * This should not happen. We have the inpcb referred to by
591                  * the main socket (why we are called) and the hpts. It
592                  * should always return 0.
593                  */
594                 panic("inpcb:%p release ret 1",
595                     inp);
596         }
597 #else
598         if (__in_pcbrele_wlocked(inp, line)) {
599                 /*
600                  * This should not happen. We have the inpcb referred to by
601                  * the main socket (why we are called) and the hpts. It
602                  * should always return 0.
603                  */
604                 panic("inpcb:%p release ret 1",
605                     inp);
606         }
607 #endif
608         if (add_freed) {
609                 inp->inp_flags2 |= INP_FREED;
610         }
611 }
612
613 static void
614 tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
615 {
616         if (inp->inp_in_hpts) {
617                 hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
618                 tcp_remove_hpts_ref(inp, hpts, line);
619         }
620 }
621
622 static void
623 tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
624 {
625         HPTS_MTX_ASSERT(hpts);
626         if (inp->inp_in_input) {
627                 hpts_sane_input_remove(hpts, inp, 1);
628                 tcp_remove_hpts_ref(inp, hpts, line);
629         }
630 }
631
632 /*
633  * Called normally with the INP_LOCKED but it
634  * does not matter, the hpts lock is the key
635  * but the lock order allows us to hold the
636  * INP lock and then get the hpts lock.
637  *
638  * Valid values in the flags are
639  * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
640  * HPTS_REMOVE_INPUT - remove from the input of the hpts.
641  * Note that you can or both values together and get two
642  * actions.
643  */
644 void
645 __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
646 {
647         struct tcp_hpts_entry *hpts;
648
649         INP_WLOCK_ASSERT(inp);
650         if (flags & HPTS_REMOVE_OUTPUT) {
651                 hpts = tcp_hpts_lock(inp);
652                 tcp_hpts_remove_locked_output(hpts, inp, flags, line);
653                 mtx_unlock(&hpts->p_mtx);
654         }
655         if (flags & HPTS_REMOVE_INPUT) {
656                 hpts = tcp_input_lock(inp);
657                 tcp_hpts_remove_locked_input(hpts, inp, flags, line);
658                 mtx_unlock(&hpts->p_mtx);
659         }
660 }
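
/*
 * For example (a sketch; tcp_hpts_remove() is assumed to be the usual
 * wrapper macro that supplies __LINE__), a stack tearing down both its
 * pacing and input state while holding the INP write lock can do both
 * removals in one call, since the flag values may be or'ed together:
 *
 *	INP_WLOCK(inp);
 *	tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT | HPTS_REMOVE_INPUT);
 */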
661
662 static inline int
663 hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
664 {
665         return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
666 }
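
/*
 * E.g. hpts_tick(hpts, 1) is the slot immediately after p_prevtick,
 * wrapping at NUM_OF_HPTSI_SLOTS; both the insert paths and tcp_hptsi()
 * below use it to derive the slot to work from.
 */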
667
668 static int
669 tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
670 {
671         int32_t need_wake = 0;
672         uint32_t ticknow = 0;
673
674         HPTS_MTX_ASSERT(hpts);
675         if (inp->inp_in_hpts == 0) {
676                 /* Ok we need to set it on the hpts in the current slot */
677                 if (hpts->p_hpts_active == 0) {
678                         /* A sleeping hpts we want in next slot to run */
679                         if (logging_on) {
680                                 tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
681                                     hpts_tick(hpts, 1));
682                         }
683                         inp->inp_hptsslot = hpts_tick(hpts, 1);
684                         inp->inp_hpts_request = 0;
685                         if (logging_on) {
686                                 tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
687                         }
688                         need_wake = 1;
689                 } else if ((void *)inp == hpts->p_inp) {
690                         /*
691                          * We can't allow you to go into the same slot we
692                          * are in. We must put you out.
693                          */
694                         inp->inp_hptsslot = hpts->p_nxt_slot;
695                 } else
696                         inp->inp_hptsslot = hpts->p_cur_slot;
697                 hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
698                 inp->inp_hpts_request = 0;
699                 if (logging_on) {
700                         tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
701                 }
702                 if (need_wake) {
703                         /*
704                          * Activate the hpts if it is sleeping and its
705                          * timeout is not 1.
706                          */
707                         if (logging_on) {
708                                 tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
709                         }
710                         hpts->p_direct_wake = 1;
711                         tcp_wakehpts(hpts);
712                 }
713         }
714         return (need_wake);
715 }
716
717 int
718 __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
719 {
720         int32_t ret;
721         struct tcp_hpts_entry *hpts;
722
723         INP_WLOCK_ASSERT(inp);
724         hpts = tcp_hpts_lock(inp);
725         ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
726         mtx_unlock(&hpts->p_mtx);
727         return (ret);
728 }
729
730 static void
731 tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
732     struct hpts_diag *diag, int32_t noref)
733 {
734         int32_t need_new_to = 0;
735         int32_t need_wakeup = 0;
736         uint32_t largest_slot;
737         uint32_t ticknow = 0;
738         uint32_t slot_calc;
739
740         HPTS_MTX_ASSERT(hpts);
741         if (diag) {
742                 memset(diag, 0, sizeof(struct hpts_diag));
743                 diag->p_hpts_active = hpts->p_hpts_active;
744                 diag->p_nxt_slot = hpts->p_nxt_slot;
745                 diag->p_cur_slot = hpts->p_cur_slot;
746                 diag->slot_req = slot;
747         }
748         if ((inp->inp_in_hpts == 0) || noref) {
749                 inp->inp_hpts_request = slot;
750                 if (slot == 0) {
751                         /* Immediate */
752                         tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
753                         return;
754                 }
755                 if (hpts->p_hpts_active) {
                        /*
                         * It's slot - 1 since nxt_slot is the next tick
                         * that will go off while the hpts is awake.
                         */
760                         if (logging_on) {
761                                 tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
762                         }
                        /*
                         * We want to make sure that we don't place an inp
                         * in the range p_cur_slot <-> p_nxt_slot. If we
                         * take from p_nxt_slot to the end, plus p_cur_slot,
                         * and then take away 2, we will know the maximum
                         * number of slots we can use.
                         */
770                         if (hpts->p_nxt_slot > hpts->p_cur_slot) {
771                                 /*
772                                  * Non-wrap case nxt_slot <-> cur_slot we
773                                  * don't want to land in. So the diff gives
774                                  * us what is taken away from the number of
775                                  * slots.
776                                  */
777                                 largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
778                         } else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
779                                 largest_slot = NUM_OF_HPTSI_SLOTS - 2;
780                         } else {
781                                 /*
782                                  * Wrap case so the diff gives us the number
783                                  * of slots that we can land in.
784                                  */
785                                 largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
786                         }
                        /*
                         * We take away two so we never have a problem; that
                         * costs only 20 usecs out of 1024000 usecs.
                         */
791                         largest_slot -= 2;
792                         if (inp->inp_hpts_request > largest_slot) {
793                                 /*
794                                  * Restrict max jump of slots and remember
795                                  * leftover
796                                  */
797                                 slot = largest_slot;
798                                 inp->inp_hpts_request -= largest_slot;
799                         } else {
800                                 /* This one will run when we hit it */
801                                 inp->inp_hpts_request = 0;
802                         }
803                         if (hpts->p_nxt_slot == hpts->p_cur_slot)
804                                 slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
805                         else
806                                 slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
807                         if (slot_calc == hpts->p_cur_slot) {
808 #ifdef INVARIANTS
809                                 /* TSNH */
810                                 panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
811                                     hpts, slot_calc, slot, largest_slot);
812 #endif
813                                 if (slot_calc)
814                                         slot_calc--;
815                                 else
816                                         slot_calc = NUM_OF_HPTSI_SLOTS - 1;
817                         }
818                         inp->inp_hptsslot = slot_calc;
819                         if (diag) {
820                                 diag->inp_hptsslot = inp->inp_hptsslot;
821                         }
822                 } else {
                        /*
                         * The hpts is sleeping, so we need to figure out
                         * where it will wake up and whether we need to
                         * reschedule its time-out.
                         */
828                         uint32_t have_slept, yet_to_sleep;
829                         uint32_t slot_now;
830                         struct timeval tv;
831
832                         ticknow = tcp_gethptstick(&tv);
833                         slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
834                         /*
835                          * The user wants to be inserted at (slot_now +
836                          * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up.
837                          */
838                         largest_slot = NUM_OF_HPTSI_SLOTS - 2;
839                         if (inp->inp_hpts_request > largest_slot) {
840                                 /* Adjust the residual in inp_hpts_request */
841                                 slot = largest_slot;
842                                 inp->inp_hpts_request -= largest_slot;
843                         } else {
844                                 /* No residual it all fits */
845                                 inp->inp_hpts_request = 0;
846                         }
847                         inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
848                         if (diag) {
849                                 diag->slot_now = slot_now;
850                                 diag->inp_hptsslot = inp->inp_hptsslot;
851                                 diag->p_on_min_sleep = hpts->p_on_min_sleep;
852                         }
853                         if (logging_on) {
854                                 tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
855                         }
856                         /* Now do we need to restart the hpts's timer? */
857                         if (TSTMP_GT(ticknow, hpts->p_curtick))
858                                 have_slept = ticknow - hpts->p_curtick;
859                         else
860                                 have_slept = 0;
861                         if (have_slept < hpts->p_hpts_sleep_time) {
862                                 /* This should be what happens */
863                                 yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
864                         } else {
865                                 /* We are over-due */
866                                 yet_to_sleep = 0;
867                                 need_wakeup = 1;
868                         }
869                         if (diag) {
870                                 diag->have_slept = have_slept;
871                                 diag->yet_to_sleep = yet_to_sleep;
872                                 diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
873                         }
874                         if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
                                /*
                                 * We need to reschedule the hpts's time-out.
                                 */
878                                 hpts->p_hpts_sleep_time = slot;
879                                 need_new_to = slot * HPTS_TICKS_PER_USEC;
880                         }
881                 }
882                 hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
883                 if (logging_on) {
884                         tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
885                 }
                /*
                 * Now, how long will the hpts sleep? If active is 1, it is
                 * up and ticking and we do nothing; otherwise we may need
                 * to reschedule its callout if need_new_to was set above.
                 */
891                 if (need_wakeup) {
892                         if (logging_on) {
893                                 tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
894                         }
895                         hpts->p_direct_wake = 1;
896                         tcp_wakehpts(hpts);
897                         if (diag) {
898                                 diag->need_new_to = 0;
899                                 diag->co_ret = 0xffff0000;
900                         }
901                 } else if (need_new_to) {
902                         int32_t co_ret;
903                         struct timeval tv;
904                         sbintime_t sb;
905
906                         tv.tv_sec = 0;
907                         tv.tv_usec = 0;
908                         while (need_new_to > HPTS_USEC_IN_SEC) {
909                                 tv.tv_sec++;
910                                 need_new_to -= HPTS_USEC_IN_SEC;
911                         }
912                         tv.tv_usec = need_new_to;
913                         sb = tvtosbt(tv);
914                         if (tcp_hpts_callout_skip_swi == 0) {
915                                 co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
916                                     hpts_timeout_swi, hpts, hpts->p_cpu,
917                                     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
918                         } else {
919                                 co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
920                                     hpts_timeout_dir, hpts,
921                                     hpts->p_cpu,
922                                     C_PREL(tcp_hpts_precision));
923                         }
924                         if (diag) {
925                                 diag->need_new_to = need_new_to;
926                                 diag->co_ret = co_ret;
927                         }
928                 }
929         } else {
930 #ifdef INVARIANTS
931                 panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
932 #endif
933         }
934 }
935
936 uint32_t
937 tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){
938         struct tcp_hpts_entry *hpts;
939         uint32_t slot_on, cts;
940         struct timeval tv;
941
942         /*
943          * We now return the next-slot the hpts will be on, beyond its
944          * current run (if up) or where it was when it stopped if it is
945          * sleeping.
946          */
947         INP_WLOCK_ASSERT(inp);
948         hpts = tcp_hpts_lock(inp);
949         if (in_ts_percision)
950                 microuptime(&tv);
951         else
952                 getmicrouptime(&tv);
953         cts = tcp_tv_to_usectick(&tv);
954         tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
955         slot_on = hpts->p_nxt_slot;
956         mtx_unlock(&hpts->p_mtx);
957         return (slot_on);
958 }
959
960 uint32_t
961 __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
962         return (tcp_hpts_insert_diag(inp, slot, line, NULL));
963 }
964
965 int
966 __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
967 {
968         int32_t retval = 0;
969
970         HPTS_MTX_ASSERT(hpts);
971         if (inp->inp_in_input == 0) {
972                 /* Ok we need to set it on the hpts in the current slot */
973                 hpts_sane_input_insert(hpts, inp, line);
974                 retval = 1;
975                 if (hpts->p_hpts_active == 0) {
976                         /*
977                          * Activate the hpts if it is sleeping.
978                          */
979                         if (logging_on) {
980                                 tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
981                         }
982                         retval = 2;
983                         hpts->p_direct_wake = 1;
984                         tcp_wakeinput(hpts);
985                 }
986         } else if (hpts->p_hpts_active == 0) {
987                 retval = 4;
988                 hpts->p_direct_wake = 1;
989                 tcp_wakeinput(hpts);
990         }
991         return (retval);
992 }
993
994 void
995 tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
996     int32_t tlen, int32_t drop_hdrlen, uint8_t iptos)
997 {
998         /* Setup packet for input first */
999         INP_WLOCK_ASSERT(tp->t_inpcb);
1000         m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
1001         m->m_pkthdr.pace_tlen = (uint16_t) tlen;
1002         m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
1003         m->m_pkthdr.pace_tos = iptos;
1004         m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0);
1005         if (tp->t_in_pkt == NULL) {
1006                 tp->t_in_pkt = m;
1007                 tp->t_tail_pkt = m;
1008         } else {
1009                 tp->t_tail_pkt->m_nextpkt = m;
1010                 tp->t_tail_pkt = m;
1011         }
1012 }
1013
1014
1015 int32_t
1016 __tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
1017     int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){
1018         struct tcp_hpts_entry *hpts;
1019         int32_t ret;
1020
1021         tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
1022         hpts = tcp_input_lock(tp->t_inpcb);
1023         ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
1024         mtx_unlock(&hpts->p_mtx);
1025         return (ret);
1026 }
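
/*
 * A rough sketch of how a stack might defer its input processing (the
 * my_stack_do_segment() name is hypothetical; only the queueing call is
 * defined in this file):
 *
 *	static void
 *	my_stack_do_segment(struct mbuf *m, struct tcphdr *th,
 *	    struct socket *so, struct tcpcb *tp, int drop_hdrlen,
 *	    int tlen, uint8_t iptos)
 *	{
 *		__tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos,
 *		    __LINE__);
 *	}
 *
 * The packet is then handed to tfb_tcp_hpts_do_segment() later, from
 * tcp_input_data() on the connection's input hpts CPU.
 */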
1027
1028 void
1029 __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
1030 {
1031         struct tcp_hpts_entry *hpts;
1032         struct tcpcb *tp;
1033
1034         tp = intotcpcb(inp);
1035         hpts = tcp_input_lock(tp->t_inpcb);
1036         if (inp->inp_in_input == 0) {
1037                 /* Ok we need to set it on the hpts in the current slot */
1038                 hpts_sane_input_insert(hpts, inp, line);
1039                 if (hpts->p_hpts_active == 0) {
1040                         /*
1041                          * Activate the hpts if it is sleeping.
1042                          */
1043                         hpts->p_direct_wake = 1;
1044                         tcp_wakeinput(hpts);
1045                 }
1046         } else if (hpts->p_hpts_active == 0) {
1047                 hpts->p_direct_wake = 1;
1048                 tcp_wakeinput(hpts);
1049         }
1050         inp->inp_hpts_drop_reas = reason;
1051         mtx_unlock(&hpts->p_mtx);
1052 }
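
/*
 * As noted in the usage comment at the top of this file, a stack that
 * wants to tear a connection down without recursing on the INP_WLOCK can
 * simply do (a sketch; a tcp_set_inp_to_drop() wrapper supplying __LINE__
 * is assumed to exist in the header):
 *
 *	__tcp_set_inp_to_drop(inp, ETIMEDOUT, __LINE__);
 *
 * and the hpts input thread will later call tcp_drop(tp, ETIMEDOUT) from
 * tcp_input_data() on the connection's input hpts CPU.
 */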
1053
1054 static uint16_t
1055 hpts_random_cpu(struct inpcb *inp){
1056         /*
         * No flow type set; distribute the load randomly.
1058          */
1059         uint16_t cpuid;
1060         uint32_t ran;
1061
1062         /*
1063          * If one has been set use it i.e. we want both in and out on the
1064          * same hpts.
1065          */
1066         if (inp->inp_input_cpu_set) {
1067                 return (inp->inp_input_cpu);
1068         } else if (inp->inp_hpts_cpu_set) {
1069                 return (inp->inp_hpts_cpu);
1070         }
1071         /* Nothing set use a random number */
1072         ran = arc4random();
1073         cpuid = (ran & 0xffff) % mp_ncpus;
1074         return (cpuid);
1075 }
1076
1077 static uint16_t
1078 hpts_cpuid(struct inpcb *inp){
1079         uint16_t cpuid;
1080
1081
1082         /*
1083          * If one has been set use it i.e. we want both in and out on the
1084          * same hpts.
1085          */
1086         if (inp->inp_input_cpu_set) {
1087                 return (inp->inp_input_cpu);
1088         } else if (inp->inp_hpts_cpu_set) {
1089                 return (inp->inp_hpts_cpu);
1090         }
1091         /* If one is set the other must be the same */
1092 #ifdef  RSS
1093         cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
1094         if (cpuid == NETISR_CPUID_NONE)
1095                 return (hpts_random_cpu(inp));
1096         else
1097                 return (cpuid);
1098 #else
1099         /*
1100          * We don't have a flowid -> cpuid mapping, so cheat and just map
1101          * unknown cpuids to curcpu.  Not the best, but apparently better
1102          * than defaulting to swi 0.
1103          */
1104         if (inp->inp_flowtype != M_HASHTYPE_NONE) {
1105                 cpuid = inp->inp_flowid % mp_ncpus;
1106                 return (cpuid);
1107         }
1108         cpuid = hpts_random_cpu(inp);
1109         return (cpuid);
1110 #endif
1111 }
1112
/*
 * Do NOT try to optimize the processing of inp's
 * by first pulling off all the inp's into a temporary
 * list (e.g. TAILQ_CONCAT). If you do that, the subtle
 * interactions of switching CPUs will kill you because of
 * problems in the linked list manipulation. Basically
 * you would switch cpus with the hpts mutex locked,
 * but then while you were processing one of the inp's,
 * some other one that you switched will get a new
 * packet on the different CPU. It will insert it
 * on the new hpts's input list. Creating a temporary
 * link in the inp will not fix it either, since
 * the other hpts will be doing the same thing and
 * you will both end up using the temporary link.
 *
 * You will die in an ASSERT for tailq corruption if you
 * run INVARIANTS, or you will die horribly without
 * INVARIANTS in some unknown way with a corrupt linked
 * list.
 */
1133 static void
1134 tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
1135 {
1136         struct mbuf *m, *n;
1137         struct tcpcb *tp;
1138         struct inpcb *inp;
1139         uint16_t drop_reason;
1140         int16_t set_cpu;
1141         uint32_t did_prefetch = 0;
1142         int32_t ti_locked = TI_UNLOCKED;
1143         struct epoch_tracker et;
1144
1145         HPTS_MTX_ASSERT(hpts);
1146         while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
1147                 HPTS_MTX_ASSERT(hpts);
1148                 hpts_sane_input_remove(hpts, inp, 0);
1149                 if (inp->inp_input_cpu_set == 0) {
1150                         set_cpu = 1;
1151                 } else {
1152                         set_cpu = 0;
1153                 }
1154                 hpts->p_inp = inp;
1155                 drop_reason = inp->inp_hpts_drop_reas;
1156                 inp->inp_in_input = 0;
1157                 mtx_unlock(&hpts->p_mtx);
1158                 CURVNET_SET(inp->inp_vnet);
1159                 if (drop_reason) {
1160                         INP_INFO_RLOCK_ET(&V_tcbinfo, et);
1161                         ti_locked = TI_RLOCKED;
1162                 } else {
1163                         ti_locked = TI_UNLOCKED;
1164                 }
1165                 INP_WLOCK(inp);
1166                 if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
1167                     (inp->inp_flags2 & INP_FREED)) {
1168 out:
1169                         hpts->p_inp = NULL;
1170                         if (ti_locked == TI_RLOCKED) {
1171                                 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
1172                         }
1173                         if (in_pcbrele_wlocked(inp) == 0) {
1174                                 INP_WUNLOCK(inp);
1175                         }
1176                         ti_locked = TI_UNLOCKED;
1177                         CURVNET_RESTORE();
1178                         mtx_lock(&hpts->p_mtx);
1179                         continue;
1180                 }
1181                 tp = intotcpcb(inp);
1182                 if ((tp == NULL) || (tp->t_inpcb == NULL)) {
1183                         goto out;
1184                 }
1185                 if (drop_reason) {
1186                         /* This tcb is being destroyed for drop_reason */
1187                         m = tp->t_in_pkt;
1188                         if (m)
1189                                 n = m->m_nextpkt;
1190                         else
1191                                 n = NULL;
1192                         tp->t_in_pkt = NULL;
1193                         while (m) {
1194                                 m_freem(m);
1195                                 m = n;
1196                                 if (m)
1197                                         n = m->m_nextpkt;
1198                         }
1199                         tp = tcp_drop(tp, drop_reason);
1200                         INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
1201                         if (tp == NULL) {
1202                                 INP_WLOCK(inp);
1203                         }
1204                         if (in_pcbrele_wlocked(inp) == 0)
1205                                 INP_WUNLOCK(inp);
1206                         CURVNET_RESTORE();
1207                         mtx_lock(&hpts->p_mtx);
1208                         continue;
1209                 }
1210                 if (set_cpu) {
                        /*
                         * Set up so that next time we will move to the
                         * right CPU. This should be a rare event. It will
                         * sometimes happen when we are the client side
                         * (usually not the server). Somehow tcp_output()
                         * gets called before tcp_do_segment() sets the
                         * initial state. This means the r_cpu and
                         * r_hpts_cpu are 0. We get on the hpts, and then
                         * tcp_input() gets called setting up the r_cpu to
                         * the correct value. The hpts goes off and sees
                         * the mismatch. We simply correct it here and the
                         * CPU will switch to the new hpts the next time
                         * the tcb gets added to the hpts (not this time) :-)
                         */
1225                         tcp_set_hpts(inp);
1226                 }
1227                 m = tp->t_in_pkt;
1228                 n = NULL;
1229                 if (m != NULL &&
1230                     (m->m_pkthdr.pace_lock == TI_RLOCKED ||
1231                     tp->t_state != TCPS_ESTABLISHED)) {
1232                         ti_locked = TI_RLOCKED;
1233                         INP_INFO_RLOCK_ET(&V_tcbinfo, et);
1234                         m = tp->t_in_pkt;
1235                 }
1236                 if (in_newts_every_tcb) {
1237                         if (in_ts_percision)
1238                                 microuptime(tv);
1239                         else
1240                                 getmicrouptime(tv);
1241                 }
1242                 if (tp->t_fb_ptr != NULL) {
1243                         kern_prefetch(tp->t_fb_ptr, &did_prefetch);
1244                         did_prefetch = 1;
1245                 }
1246                 /* Any input work to do, if so do it first */
1247                 if ((m != NULL) && (m == tp->t_in_pkt)) {
1248                         struct tcphdr *th;
1249                         int32_t tlen, drop_hdrlen, nxt_pkt;
1250                         uint8_t iptos;
1251
1252                         n = m->m_nextpkt;
1253                         tp->t_in_pkt = tp->t_tail_pkt = NULL;
1254                         while (m) {
1255                                 th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
1256                                 tlen = m->m_pkthdr.pace_tlen;
1257                                 drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
1258                                 iptos = m->m_pkthdr.pace_tos;
1259                                 m->m_nextpkt = NULL;
1260                                 if (n)
1261                                         nxt_pkt = 1;
1262                                 else
1263                                         nxt_pkt = 0;
1264                                 inp->inp_input_calls = 1;
1265                                 if (tp->t_fb->tfb_tcp_hpts_do_segment) {
1266                                         /* Use the hpts specific do_segment */
1267                                         (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
1268                                             tp, drop_hdrlen,
1269                                             tlen, iptos, nxt_pkt, tv);
1270                                 } else {
1271                                         /* Use the default do_segment */
1272                                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
1273                                             tp, drop_hdrlen,
1274                                                 tlen, iptos);
1275                                 }
1276                                 if (ti_locked == TI_RLOCKED)
1277                                         INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
                                /*
                                 * Do segment returns unlocked; we need the
                                 * lock again, but we also need some
                                 * KASSERTs here.
                                 */
1283                                 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1284                                 INP_UNLOCK_ASSERT(inp);
1285                                 m = n;
1286                                 if (m)
1287                                         n = m->m_nextpkt;
1288                                 if (m != NULL &&
1289                                     m->m_pkthdr.pace_lock == TI_RLOCKED) {
1290                                         INP_INFO_RLOCK_ET(&V_tcbinfo, et);
1291                                         ti_locked = TI_RLOCKED;
1292                                 } else
1293                                         ti_locked = TI_UNLOCKED;
1294                                 INP_WLOCK(inp);
1295                                 /*
1296                                  * Since we have an opening here we must
1297                                  * re-check if the tcb went away while we
1298                                  * were getting the lock(s).
1299                                  */
1300                                 if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
1301                                     (inp->inp_flags2 & INP_FREED)) {
1302                                         while (m) {
1303                                                 m_freem(m);
1304                                                 m = n;
1305                                                 if (m)
1306                                                         n = m->m_nextpkt;
1307                                         }
1308                                         goto out;
1309                                 }
1310                                 /*
1311                                  * Now that we hold the INP lock, check if
1312                                  * we need to upgrade our lock.
1313                                  */
1314                                 if (ti_locked == TI_UNLOCKED &&
1315                                     (tp->t_state != TCPS_ESTABLISHED)) {
1316                                         ti_locked = TI_RLOCKED;
1317                                         INP_INFO_RLOCK_ET(&V_tcbinfo, et);
1318                                 }
1319                         }       /** end while(m) */
1320                 }               /** end if ((m != NULL)  && (m == tp->t_in_pkt)) */
1321                 if (in_pcbrele_wlocked(inp) == 0)
1322                         INP_WUNLOCK(inp);
1323                 if (ti_locked == TI_RLOCKED)
1324                         INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
1325                 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1326                 INP_UNLOCK_ASSERT(inp);
1327                 ti_locked = TI_UNLOCKED;
1328                 mtx_lock(&hpts->p_mtx);
1329                 hpts->p_inp = NULL;
1330                 CURVNET_RESTORE();
1331         }
1332 }
1333
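/*
 * Estimate how many hpts ticks have elapsed since the previous pass and
 * therefore how many wheel slots need to be processed, clamped to the wheel
 * size minus two.  p_nxt_slot is advanced to where we will be once we have
 * caught up.  As an illustration, if p_curtick moved from 1000 to 1003 then
 * three ticks are returned; no advance at all means we were woken right away
 * and -1 tells the caller to skip this run.
 */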
1334 static int
1335 tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
1336 {
1337         int32_t ticks_to_run;
1338
1339         if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
1340                 ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
1341                 if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
1342                         ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
1343                 }
1344         } else {
1345                 if (hpts->p_prevtick == hpts->p_curtick) {
1346                         /* This happens when we get woken up right away */
1347                         return (-1);
1348                 }
1349                 ticks_to_run = 1;
1350         }
1351         /* Set where we will be when we catch up */
1352         hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
1353         if (hpts->p_nxt_slot == hpts->p_cur_slot) {
1354                 panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
1355                     hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
1356         }
1357         return (ticks_to_run);
1358 }
1359
1360 static void
1361 tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
1362 {
1363         struct tcpcb *tp;
1364         struct inpcb *inp = NULL, *ninp;
1365         struct timeval tv;
1366         int32_t ticks_to_run, i, error, tick_now, interum_tick;
1367         int32_t paced_cnt = 0;
1368         int32_t did_prefetch = 0;
1369         int32_t prefetch_ninp = 0;
1370         int32_t prefetch_tp = 0;
1371         uint32_t cts;
1372         int16_t set_cpu;
1373
1374         HPTS_MTX_ASSERT(hpts);
1375         hpts->p_curtick = tcp_tv_to_hptstick(ctick);
1376         cts = tcp_tv_to_usectick(ctick);
1377         memcpy(&tv, ctick, sizeof(struct timeval));
1378         hpts->p_cur_slot = hpts_tick(hpts, 1);
1379
1380         /* Figure out if we had missed ticks */
1381 again:
1382         HPTS_MTX_ASSERT(hpts);
1383         ticks_to_run = tcp_hpts_est_run(hpts);
1384         if (!TAILQ_EMPTY(&hpts->p_input)) {
1385                 tcp_input_data(hpts, &tv);
1386         }
1387 #ifdef INVARIANTS
1388         if (TAILQ_EMPTY(&hpts->p_input) &&
1389             (hpts->p_on_inqueue_cnt != 0)) {
1390                 panic("tp:%p in_hpts input empty but cnt:%d",
1391                     hpts, hpts->p_on_inqueue_cnt);
1392         }
1393 #endif
1394         HPTS_MTX_ASSERT(hpts);
1395         /* Reset the ticks to run and the time if we need to */
1396         interum_tick = tcp_gethptstick(&tv);
1397         if (interum_tick != hpts->p_curtick) {
1398                 /* Save off the new time we execute to */
1399                 *ctick = tv;
1400                 hpts->p_curtick = interum_tick;
1401                 cts = tcp_tv_to_usectick(&tv);
1402                 hpts->p_cur_slot = hpts_tick(hpts, 1);
1403                 ticks_to_run = tcp_hpts_est_run(hpts);
1404         }
1405         if (ticks_to_run == -1) {
1406                 goto no_run;
1407         }
1408         if (logging_on) {
1409                 tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
1410         }
1411         if (hpts->p_on_queue_cnt == 0) {
1412                 goto no_one;
1413         }
1414         HPTS_MTX_ASSERT(hpts);
1415         for (i = 0; i < ticks_to_run; i++) {
1416                 /*
1417                  * Calculate how far behind we are; if there are no
1418                  * extra ticks to run, we are not delayed at all.
1419                  */
1420                 hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
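                /*
                 * Illustrative arithmetic: with ticks_to_run == 3, the first
                 * pass (i == 0) reports a delay of 2 * HPTS_TICKS_PER_USEC
                 * useconds and the final pass reports 0, i.e. fully caught up.
                 */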
1421                 HPTS_MTX_ASSERT(hpts);
1422                 while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
1423                         /* For debugging */
1424                         if (logging_on) {
1425                                 tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
1426                         }
1427                         hpts->p_inp = inp;
1428                         paced_cnt++;
1429                         if (hpts->p_cur_slot != inp->inp_hptsslot) {
1430                                 panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
1431                                     hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
1432                         }
1433                         /* Now pull it */
1434                         if (inp->inp_hpts_cpu_set == 0) {
1435                                 set_cpu = 1;
1436                         } else {
1437                                 set_cpu = 0;
1438                         }
1439                         hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
1440                         if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
1441                                 /* We prefetch the next inp if possible */
1442                                 kern_prefetch(ninp, &prefetch_ninp);
1443                                 prefetch_ninp = 1;
1444                         }
1445                         if (inp->inp_hpts_request) {
1446                                 /*
1447                                  * This tcb is deferred further out in time
1448                                  * than our wheel can hold. Push it back
1449                                  * onto the wheel.
1450                                  */
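                                /*
                                 * Illustrative example: if only 100 slots
                                 * remain in this run but the tcb's outstanding
                                 * request is 500 slots, it is re-inserted onto
                                 * the wheel for that request instead of having
                                 * tcp_output() run now.
                                 */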
1451                                 int32_t remaining_slots;
1452
1453                                 remaining_slots = ticks_to_run - (i + 1);
1454                                 if (inp->inp_hpts_request > remaining_slots) {
1455                                         /*
1456                                          * Keep INVARIANTS happy by clearing
1457                                          * the flag
1458                                          */
1459                                         tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
1460                                         hpts->p_inp = NULL;
1461                                         continue;
1462                                 }
1463                                 inp->inp_hpts_request = 0;
1464                         }
1465                         /*
1466                          * We clear the hpts flag here after dealing with
1467                          * remaining slots. This way anyone looking with the
1468                          * TCB lock will see it is on the hpts until just
1469                          * before we unlock.
1470                          */
1471                         inp->inp_in_hpts = 0;
1472                         mtx_unlock(&hpts->p_mtx);
1473                         INP_WLOCK(inp);
1474                         if (in_pcbrele_wlocked(inp)) {
1475                                 mtx_lock(&hpts->p_mtx);
1476                                 if (logging_on)
1477                                         tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
1478                                 hpts->p_inp = NULL;
1479                                 continue;
1480                         }
1481                         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
1482 out_now:
1483 #ifdef INVARIANTS
1484                                 if (mtx_owned(&hpts->p_mtx)) {
1485                                         panic("Hpts:%p owns mtx prior-to lock line:%d",
1486                                             hpts, __LINE__);
1487                                 }
1488 #endif
1489                                 INP_WUNLOCK(inp);
1490                                 mtx_lock(&hpts->p_mtx);
1491                                 if (logging_on)
1492                                         tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
1493                                 hpts->p_inp = NULL;
1494                                 continue;
1495                         }
1496                         tp = intotcpcb(inp);
1497                         if ((tp == NULL) || (tp->t_inpcb == NULL)) {
1498                                 goto out_now;
1499                         }
1500                         if (set_cpu) {
1501                                 /*
1502                                  * Set up so that next time we will move to
1503                                  * the right CPU. This should be a rare
1504                                  * event. It will sometimes happen when we
1505                                  * are the client side (usually not the
1506                                  * server). Somehow tcp_output() gets called
1507                                  * before tcp_do_segment() sets up the
1508                                  * initial state. This means the r_cpu and
1509                                  * r_hpts_cpu are 0. We get on the hpts, and
1510                                  * then tcp_input() gets called setting up
1511                                  * the r_cpu to the correct value. The hpts
1512                                  * goes off and sees the mis-match. We
1513                                  * simply correct it here and the CPU will
1514                                  * switch to the new hpts the next time the
1515                                  * tcb gets added to the hpts (not this one)
1516                                  * :-)
1517                                  */
1518                                 tcp_set_hpts(inp);
1519                         }
1520                         if (out_newts_every_tcb) {
1521                                 struct timeval sv;
1522
1523                                 if (out_ts_percision)
1524                                         microuptime(&sv);
1525                                 else
1526                                         getmicrouptime(&sv);
1527                                 cts = tcp_tv_to_usectick(&sv);
1528                         }
1529                         CURVNET_SET(inp->inp_vnet);
1530                         /*
1531                          * There is a hole here: we hold a refcount on the
1532                          * inp so it will not be freed out from under us,
1533                          * but to be sure the INP stays sane we had to hold
1534                          * the p_mtx above while we pulled out the tp/inp.
1535                          * As long as fini gets the lock first we are
1536                          * assured of a sane INP we can lock and test.
1537                          */
1538 #ifdef INVARIANTS
1539                         if (mtx_owned(&hpts->p_mtx)) {
1540                                 panic("Hpts:%p owns mtx before tcp-output:%d",
1541                                     hpts, __LINE__);
1542                         }
1543 #endif
1544                         if (tp->t_fb_ptr != NULL) {
1545                                 kern_prefetch(tp->t_fb_ptr, &did_prefetch);
1546                                 did_prefetch = 1;
1547                         }
1548                         inp->inp_hpts_calls = 1;
1549                         if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
1550                                 error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
1551                         } else {
1552                                 error = tp->t_fb->tfb_tcp_output(tp);
1553                         }
1554                         if (ninp && ninp->inp_ppcb) {
1555                                 /*
1556                                  * If we have a nxt inp, see if we can
1557                                  * prefetch its ppcb. Note this may seem
1558                                  * "risky" since we have no locks (other
1559                                  * than the previous inp) and there is no
1560                                  * assurance that ninp was not pulled while
1561                                  * we were processing inp and freed. If this
1562                                  * occurred it could mean that either:
1563                                  *
1564                                  * a) Its NULL (which is fine we won't go
1565                                  * here) <or> b) Its valid (which is cool we
1566                                  * will prefetch it) <or> c) The inp got
1567                                  * freed back to the slab which was
1568                                  * reallocated. Then the piece of memory was
1569                                  * re-used and something else (not an
1570                                  * address) is in inp_ppcb. If that occurs
1571                                  * we don't crash, but take a TLB shootdown
1572                                  * performance hit (same as if it was NULL
1573                                  * and we tried to pre-fetch it).
1574                                  *
1575                                  * Considering that the likelihood of <c> is
1576                                  * quite rare we will take a risk on doing
1577                                  * this. If performance drops after testing
1578                                  * we can always take this out. NB: the
1579                                  * kern_prefetch on amd64 actually has
1580                                  * protection against a bad address now via
1581                                  * the DMAP_() tests. This will prevent the
1582                                  * TLB hit, and instead if <c> occurs just
1583                                  * cause us to load cache with a useless
1584                                  * address (to us).
1585                                  */
1586                                 kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
1587                                 prefetch_tp = 1;
1588                         }
1589                         INP_WUNLOCK(inp);
1590                         INP_UNLOCK_ASSERT(inp);
1591                         CURVNET_RESTORE();
1592 #ifdef INVARIANTS
1593                         if (mtx_owned(&hpts->p_mtx)) {
1594                                 panic("Hpts:%p owns mtx prior-to lock line:%d",
1595                                     hpts, __LINE__);
1596                         }
1597 #endif
1598                         mtx_lock(&hpts->p_mtx);
1599                         if (logging_on)
1600                                 tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
1601                         hpts->p_inp = NULL;
1602                 }
1603                 HPTS_MTX_ASSERT(hpts);
1604                 hpts->p_inp = NULL;
1605                 hpts->p_cur_slot++;
1606                 if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
1607                         hpts->p_cur_slot = 0;
1608                 }
1609         }
1610 no_one:
1611         HPTS_MTX_ASSERT(hpts);
1612         hpts->p_prevtick = hpts->p_curtick;
1613         hpts->p_delayed_by = 0;
1614         /*
1615          * Check to see if we took an excess amount of time and need to run
1616          * more ticks (if we did not hit ENOBUFS).
1617          */
1618         /* Re-run any input that may be there */
1619         (void)tcp_gethptstick(&tv);
1620         if (!TAILQ_EMPTY(&hpts->p_input)) {
1621                 tcp_input_data(hpts, &tv);
1622         }
1623 #ifdef INVARIANTS
1624         if (TAILQ_EMPTY(&hpts->p_input) &&
1625             (hpts->p_on_inqueue_cnt != 0)) {
1626                 panic("tp:%p in_hpts input empty but cnt:%d",
1627                     hpts, hpts->p_on_inqueue_cnt);
1628         }
1629 #endif
1630         tick_now = tcp_gethptstick(&tv);
1631         if (SEQ_GT(tick_now, hpts->p_prevtick)) {
1632                 struct timeval res;
1633
1634                 /* Did we really spend a full tick or more in here? */
1635                 timersub(&tv, ctick, &res);
1636                 if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
1637                         counter_u64_add(hpts_loops, 1);
1638                         if (logging_on) {
1639                                 tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
1640                         }
1641                         *ctick = res;
1642                         hpts->p_curtick = tick_now;
1643                         goto again;
1644                 }
1645         }
1646 no_run:
1647         {
1648                 uint32_t t = 0, i, fnd = 0;
1649
1650                 if (hpts->p_on_queue_cnt) {
1651
1652
1653                         /*
1654                          * Find next slot that is occupied and use that to
1655                          * be the sleep time.
1656                          */
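                        /*
                         * Illustrative example: if the first occupied slot is
                         * three slots past p_nxt_slot, the loop exits with
                         * i == 4 and the hpts sleeps four ticks, long enough
                         * for the next run to sweep up to and including that
                         * slot.
                         */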
1657                         for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
1658                                 if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
1659                                         fnd = 1;
1660                                         break;
1661                                 }
1662                                 t = (t + 1) % NUM_OF_HPTSI_SLOTS;
1663                         }
1664                         if (fnd) {
1665                                 hpts->p_hpts_sleep_time = i;
1666                         } else {
1667                                 counter_u64_add(back_tosleep, 1);
1668 #ifdef INVARIANTS
1669                                 panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
1670 #endif
1671                                 hpts->p_on_queue_cnt = 0;
1672                                 goto non_found;
1673                         }
1674                         t++;
1675                 } else {
1676                         /* No one on the wheel; sleep for all but 2 slots */
1677 non_found:
1678                         if (hpts_sleep_max == 0)
1679                                 hpts_sleep_max = 1;
1680                         hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
1681                         t = 0;
1682                 }
1683                 if (logging_on) {
1684                         tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
1685                 }
1686         }
1687 }
1688
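/*
 * Pin the connection's hpts and input processing to the CPU chosen by
 * hpts_cpuid() the first time through (and only while the tcb is not already
 * queued); once the *_cpu_set flags are set this is a no-op, so the tcb keeps
 * landing on the same per-CPU wheel.
 */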
1689 void
1690 __tcp_set_hpts(struct inpcb *inp, int32_t line)
1691 {
1692         struct tcp_hpts_entry *hpts;
1693
1694         INP_WLOCK_ASSERT(inp);
1695         hpts = tcp_hpts_lock(inp);
1696         if ((inp->inp_in_hpts == 0) &&
1697             (inp->inp_hpts_cpu_set == 0)) {
1698                 inp->inp_hpts_cpu = hpts_cpuid(inp);
1699                 inp->inp_hpts_cpu_set = 1;
1700         }
1701         mtx_unlock(&hpts->p_mtx);
1702         hpts = tcp_input_lock(inp);
1703         if ((inp->inp_input_cpu_set == 0) &&
1704             (inp->inp_in_input == 0)) {
1705                 inp->inp_input_cpu = hpts_cpuid(inp);
1706                 inp->inp_input_cpu_set = 1;
1707         }
1708         mtx_unlock(&hpts->p_mtx);
1709 }
1710
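/*
 * Report how many useconds the hpts serving this connection is currently
 * running behind its wheel (zero once it has caught up).
 */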
1711 uint16_t
1712 tcp_hpts_delayedby(struct inpcb *inp)
{
1713         return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
1714 }
1715
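/*
 * Per-CPU swi handler: determine why we woke (a direct wake from the input
 * path versus the callout firing), run one pass of tcp_hptsi(), then compute
 * the next sleep interval and re-arm the callout before going back to sleep.
 */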
1716 static void
1717 tcp_hpts_thread(void *ctx)
1718 {
1719         struct tcp_hpts_entry *hpts;
1720         struct timeval tv;
1721         sbintime_t sb;
1722
1723         hpts = (struct tcp_hpts_entry *)ctx;
1724         mtx_lock(&hpts->p_mtx);
1725         if (hpts->p_direct_wake) {
1726                 /* Signaled by input */
1727                 if (logging_on)
1728                         tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
1729                 callout_stop(&hpts->co);
1730         } else {
1731                 /* Timed out */
1732                 if (callout_pending(&hpts->co) ||
1733                     !callout_active(&hpts->co)) {
1734                         if (logging_on)
1735                                 tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
1736                         mtx_unlock(&hpts->p_mtx);
1737                         return;
1738                 }
1739                 callout_deactivate(&hpts->co);
1740                 if (logging_on)
1741                         tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
1742         }
1743         hpts->p_hpts_active = 1;
1744         (void)tcp_gethptstick(&tv);
1745         tcp_hptsi(hpts, &tv);
1746         HPTS_MTX_ASSERT(hpts);
1747         tv.tv_sec = 0;
1748         tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
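        /*
         * Illustrative numbers (assuming the 10-usec tick granularity from
         * tcp_hpts.h): a computed sleep of 100 slots becomes a 1 ms callout,
         * unless tcp_min_hptsi_time below forces a longer minimum sleep.
         */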
1749         if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
1750                 tv.tv_usec = tcp_min_hptsi_time;
1751                 hpts->p_on_min_sleep = 1;
1752         } else {
1753                 /* Clear the min sleep flag */
1754                 hpts->p_on_min_sleep = 0;
1755         }
1756         hpts->p_hpts_active = 0;
1757         sb = tvtosbt(tv);
1758         if (tcp_hpts_callout_skip_swi == 0) {
1759                 callout_reset_sbt_on(&hpts->co, sb, 0,
1760                     hpts_timeout_swi, hpts, hpts->p_cpu,
1761                     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
1762         } else {
1763                 callout_reset_sbt_on(&hpts->co, sb, 0,
1764                     hpts_timeout_dir, hpts,
1765                     hpts->p_cpu,
1766                     C_PREL(tcp_hpts_precision));
1767         }
1768         hpts->p_direct_wake = 0;
1769         mtx_unlock(&hpts->p_mtx);
1770 }
1771
1772 #undef  timersub
1773
1774 static void
1775 tcp_init_hptsi(void *st)
1776 {
1777         int32_t i, j, error, bound = 0, created = 0;
1778         size_t sz, asz;
1779         struct timeval tv;
1780         sbintime_t sb;
1781         struct tcp_hpts_entry *hpts;
1782         char unit[16];
1783         uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
1784
1785         tcp_pace.rp_proc = NULL;
1786         tcp_pace.rp_num_hptss = ncpus;
1787         hpts_loops = counter_u64_alloc(M_WAITOK);
1788         back_tosleep = counter_u64_alloc(M_WAITOK);
1789
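        /*
         * One hpts entry is created per CPU, each with its own mutex, input
         * queue and wheel of NUM_OF_HPTSI_SLOTS slot queues; the swi threads
         * started below each service the entry for their own CPU.
         */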
1790         sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
1791         tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
1792         asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
1793         for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
1794                 tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
1795                     M_TCPHPTS, M_WAITOK | M_ZERO);
1796                 tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
1797                     M_TCPHPTS, M_WAITOK);
1798                 hpts = tcp_pace.rp_ent[i];
1799                 /*
1800                  * Init all the hpts structures that are not specifically
1801                  * zero'd by the allocations, and attach them to the
1802                  * appropriate sysctl block.
1803                  */
1804                 mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
1805                     "hpts", MTX_DEF | MTX_DUPOK);
1806                 TAILQ_INIT(&hpts->p_input);
1807                 for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
1808                         TAILQ_INIT(&hpts->p_hptss[j]);
1809                 }
1810                 sysctl_ctx_init(&hpts->hpts_ctx);
1811                 sprintf(unit, "%d", i);
1812                 hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
1813                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
1814                     OID_AUTO,
1815                     unit,
1816                     CTLFLAG_RW, 0,
1817                     "");
1818                 SYSCTL_ADD_INT(&hpts->hpts_ctx,
1819                     SYSCTL_CHILDREN(hpts->hpts_root),
1820                     OID_AUTO, "in_qcnt", CTLFLAG_RD,
1821                     &hpts->p_on_inqueue_cnt, 0,
1822                     "Count TCBs awaiting input processing");
1823                 SYSCTL_ADD_INT(&hpts->hpts_ctx,
1824                     SYSCTL_CHILDREN(hpts->hpts_root),
1825                     OID_AUTO, "out_qcnt", CTLFLAG_RD,
1826                     &hpts->p_on_queue_cnt, 0,
1827                     "Count TCBs awaiting output processing");
1828                 SYSCTL_ADD_UINT(&hpts->hpts_ctx,
1829                     SYSCTL_CHILDREN(hpts->hpts_root),
1830                     OID_AUTO, "active", CTLFLAG_RD,
1831                     &hpts->p_hpts_active, 0,
1832                     "Is the hpts active");
1833                 SYSCTL_ADD_UINT(&hpts->hpts_ctx,
1834                     SYSCTL_CHILDREN(hpts->hpts_root),
1835                     OID_AUTO, "curslot", CTLFLAG_RD,
1836                     &hpts->p_cur_slot, 0,
1837                     "What the current slot is if active");
1838                 SYSCTL_ADD_UINT(&hpts->hpts_ctx,
1839                     SYSCTL_CHILDREN(hpts->hpts_root),
1840                     OID_AUTO, "curtick", CTLFLAG_RD,
1841                     &hpts->p_curtick, 0,
1842                     "What the current tick is if active");
1843                 SYSCTL_ADD_UINT(&hpts->hpts_ctx,
1844                     SYSCTL_CHILDREN(hpts->hpts_root),
1845                     OID_AUTO, "logsize", CTLFLAG_RD,
1846                     &hpts->p_logsize, 0,
1847                     "Hpts logging buffer size");
1848                 hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
1849                 hpts->p_num = i;
1850                 hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
1851                 hpts->p_prevtick -= 1;
1852                 hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
1853                 hpts->p_cpu = 0xffff;
1854                 hpts->p_nxt_slot = 1;
1855                 hpts->p_logsize = tcp_hpts_logging_size;
1856                 if (hpts->p_logsize) {
1857                         sz = (sizeof(struct hpts_log) * hpts->p_logsize);
1858                         hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
1859                 }
1860                 callout_init(&hpts->co, 1);
1861         }
1862         /*
1863          * Now let's start ithreads to handle the hptss.
1864          */
1865         CPU_FOREACH(i) {
1866                 hpts = tcp_pace.rp_ent[i];
1867                 hpts->p_cpu = i;
1868                 error = swi_add(&hpts->ie, "hpts",
1869                     tcp_hpts_thread, (void *)hpts,
1870                     SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
1871                 if (error) {
1872                         panic("Can't add hpts:%p i:%d err:%d",
1873                             hpts, i, error);
1874                 }
1875                 created++;
1876                 if (tcp_bind_threads) {
1877                         if (intr_event_bind(hpts->ie, i) == 0)
1878                                 bound++;
1879                 }
1880                 tv.tv_sec = 0;
1881                 tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
1882                 sb = tvtosbt(tv);
1883                 if (tcp_hpts_callout_skip_swi == 0) {
1884                         callout_reset_sbt_on(&hpts->co, sb, 0,
1885                             hpts_timeout_swi, hpts, hpts->p_cpu,
1886                             (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
1887                 } else {
1888                         callout_reset_sbt_on(&hpts->co, sb, 0,
1889                             hpts_timeout_dir, hpts,
1890                             hpts->p_cpu,
1891                             C_PREL(tcp_hpts_precision));
1892                 }
1893         }
1894         printf("TCP Hpts created %d swi interrupt threads and bound %d\n",
1895             created, bound);
1896         return;
1897 }
1898
1899 SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
1900 MODULE_VERSION(tcphpts, 1);