/*
 * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by John Birrell.
 * 4. Neither the name of the author nor the names of any co-contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 *
 */
#include <errno.h>
#include <poll.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <setjmp.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/syscall.h>
#include <fcntl.h>
#ifdef _THREAD_SAFE
#include <pthread.h>
#include "pthread_private.h"

/* Static function prototype definitions: */
static void
_thread_kern_poll(int wait_reqd);

static void
dequeue_signals(void);

static inline void
thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in);

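/*
 * The scheduler: save the state of the currently running thread (either
 * the signal context passed in scp, or a jump buffer set via setjmp()),
 * requeue that thread according to its state, poll file descriptors and
 * wake any threads whose timeouts have expired, then resume the highest
 * priority runnable thread by sigreturn() or longjmp().
 */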
void
_thread_kern_sched(ucontext_t * scp)
{
#ifndef __alpha__
	char           *fdata;
#endif
	pthread_t       pthread, pthread_h = NULL;
	struct itimerval itimer;
	struct timespec ts, ts1;
	struct timeval  tv, tv1;
	int             set_timer = 0;

	/*
	 * Flag the pthread kernel as executing scheduler code
	 * to avoid a scheduler signal from interrupting this
	 * execution and calling the scheduler again.
	 */
	_thread_kern_in_sched = 1;

	/* Check if this function was called from the signal handler: */
	if (scp != NULL) {
		/*
		 * Copy the signal context to the current thread's jump
		 * buffer:
		 */
		memcpy(&_thread_run->saved_sigcontext, scp, sizeof(_thread_run->saved_sigcontext));

#ifndef __alpha__
		/* Point to the floating point data in the running thread: */
		fdata = _thread_run->saved_fp;

		/* Save the floating point data: */
		__asm__("fnsave %0" : "=m"(*fdata));
#endif

		/* Flag the signal context as the last state saved: */
		_thread_run->sig_saved = 1;
	}
	/* Save the state of the current thread: */
	else if (setjmp(_thread_run->saved_jmp_buf) != 0) {
		/*
		 * This point is reached when a longjmp() is called to
		 * restore the state of a thread.
		 *
		 * This is the normal way out of the scheduler.
		 */
		_thread_kern_in_sched = 0;

		if (_sched_switch_hook != NULL) {
			/* Run the installed switch hook: */
			thread_run_switch_hook(_last_user_thread, _thread_run);
		}

		return;
	} else
		/* Flag that the jump buffer was the last state saved: */
		_thread_run->sig_saved = 0;

	/* If the currently running thread is a user thread, save it: */
	if ((_thread_run->flags & PTHREAD_FLAGS_PRIVATE) == 0)
		_last_user_thread = _thread_run;

	/*
	 * Enter a scheduling loop that finds the next thread that is
	 * ready to run. This loop completes when there are no more threads
	 * in the global list or when a thread has its state restored by
	 * either a sigreturn (if the state was saved as a sigcontext) or a
	 * longjmp (if the state was saved by a setjmp).
	 */
	while (!(TAILQ_EMPTY(&_thread_list))) {
		/* Get the current time of day: */
		gettimeofday(&tv, NULL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);

		/*
		 * Protect the scheduling queues from access by the signal
		 * handler.
		 */
		_queue_signals = 1;

		if (_thread_run != &_thread_kern_thread) {

			/*
			 * This thread no longer needs to yield the CPU.
			 */
			_thread_run->yield_on_sig_undefer = 0;

			/*
			 * Save the current time as the time that the thread
			 * became inactive:
			 */
			_thread_run->last_inactive.tv_sec = tv.tv_sec;
			_thread_run->last_inactive.tv_usec = tv.tv_usec;

			/*
			 * Place the currently running thread into the
			 * appropriate queue(s).
			 */
			switch (_thread_run->state) {
			case PS_DEAD:
				/*
				 * Dead threads are not placed in any queue:
				 */
				break;

			case PS_RUNNING:
				/*
				 * Runnable threads can't be placed in the
				 * priority queue until after waiting threads
				 * are polled (to preserve round-robin
				 * scheduling).
				 */
				if ((_thread_run->slice_usec != -1) &&
				    (_thread_run->attr.sched_policy != SCHED_FIFO)) {
					/*
					 * Accumulate the number of microseconds that
					 * this thread has run for:
					 */
					_thread_run->slice_usec +=
					    (_thread_run->last_inactive.tv_sec -
					    _thread_run->last_active.tv_sec) * 1000000 +
					    _thread_run->last_inactive.tv_usec -
					    _thread_run->last_active.tv_usec;

					/* Check for time quantum exceeded: */
					if (_thread_run->slice_usec > TIMESLICE_USEC)
						_thread_run->slice_usec = -1;
				}
				break;

			/*
			 * States which do not depend on file descriptor I/O
			 * operations or timeouts:
			 */
			case PS_DEADLOCK:
			case PS_FDLR_WAIT:
			case PS_FDLW_WAIT:
			case PS_FILE_WAIT:
			case PS_JOIN:
			case PS_MUTEX_WAIT:
			case PS_SIGSUSPEND:
			case PS_SIGTHREAD:
			case PS_SIGWAIT:
			case PS_SUSPENDED:
			case PS_WAIT_WAIT:
				/* No timeouts for these states: */
				_thread_run->wakeup_time.tv_sec = -1;
				_thread_run->wakeup_time.tv_nsec = -1;

				/* Restart the time slice: */
				_thread_run->slice_usec = -1;

				/* Insert into the waiting queue: */
				PTHREAD_WAITQ_INSERT(_thread_run);
				break;

			/* States which can timeout: */
			case PS_COND_WAIT:
			case PS_SLEEP_WAIT:
				/* Restart the time slice: */
				_thread_run->slice_usec = -1;

				/* Insert into the waiting queue: */
				PTHREAD_WAITQ_INSERT(_thread_run);
				break;

			/* States that require periodic work: */
			case PS_SPINBLOCK:
				/* No timeouts for this state: */
				_thread_run->wakeup_time.tv_sec = -1;
				_thread_run->wakeup_time.tv_nsec = -1;

				/* Increment spinblock count: */
				_spinblock_count++;

				/* fall through */
			case PS_FDR_WAIT:
			case PS_FDW_WAIT:
			case PS_POLL_WAIT:
			case PS_SELECT_WAIT:
				/* Restart the time slice: */
				_thread_run->slice_usec = -1;

				/* Insert into the waiting queue: */
				PTHREAD_WAITQ_INSERT(_thread_run);

				/* Insert into the work queue: */
				PTHREAD_WORKQ_INSERT(_thread_run);
			}
		}

		/* Unprotect the scheduling queues: */
		_queue_signals = 0;

		/*
		 * Poll file descriptors to update the state of threads
		 * waiting on file I/O where data may be available:
		 */
		_thread_kern_poll(0);

		/* Protect the scheduling queues: */
		_queue_signals = 1;
		/*
		 * Wake up threads that have timed out.  This has to be
		 * done after polling in case a thread does a poll or
		 * select with zero time.
		 */
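		/*
		 * A wakeup time of {-1, -1} means an infinite wait, and a
		 * wakeup time of {0, 0} means an immediate wakeup (see
		 * _thread_kern_set_timeout()).
		 */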
		PTHREAD_WAITQ_SETACTIVE();
		while (((pthread = TAILQ_FIRST(&_waitingq)) != NULL) &&
		    (pthread->wakeup_time.tv_sec != -1) &&
		    (((pthread->wakeup_time.tv_sec == 0) &&
		    (pthread->wakeup_time.tv_nsec == 0)) ||
		    (pthread->wakeup_time.tv_sec < ts.tv_sec) ||
		    ((pthread->wakeup_time.tv_sec == ts.tv_sec) &&
		    (pthread->wakeup_time.tv_nsec <= ts.tv_nsec)))) {
			switch (pthread->state) {
			case PS_POLL_WAIT:
			case PS_SELECT_WAIT:
				/* Return zero file descriptors ready: */
				pthread->data.poll_data->nfds = 0;
				/* fall through */
			default:
				/*
				 * Remove this thread from the waiting queue
				 * (and work queue if necessary) and place it
				 * in the ready queue.
				 */
				PTHREAD_WAITQ_CLEARACTIVE();
				if (pthread->flags & PTHREAD_FLAGS_IN_WORKQ)
					PTHREAD_WORKQ_REMOVE(pthread);
				PTHREAD_NEW_STATE(pthread, PS_RUNNING);
				PTHREAD_WAITQ_SETACTIVE();
				break;
			}
			/*
			 * Flag the timeout in the thread structure:
			 */
			pthread->timeout = 1;
		}
		PTHREAD_WAITQ_CLEARACTIVE();

		/*
		 * Check if there is a current runnable thread that isn't
		 * already in the ready queue:
		 */
		if ((_thread_run != &_thread_kern_thread) &&
		    (_thread_run->state == PS_RUNNING) &&
		    ((_thread_run->flags & PTHREAD_FLAGS_IN_PRIOQ) == 0)) {
			if (_thread_run->slice_usec == -1) {
				/*
				 * The thread exceeded its time
				 * quantum or it yielded the CPU;
				 * place it at the tail of the
				 * queue for its priority.
				 */
				PTHREAD_PRIOQ_INSERT_TAIL(_thread_run);
			} else {
				/*
				 * The thread hasn't exceeded its
				 * interval.  Place it at the head
				 * of the queue for its priority.
				 */
				PTHREAD_PRIOQ_INSERT_HEAD(_thread_run);
			}
		}

		/*
		 * Get the highest priority thread in the ready queue.
		 */
		pthread_h = PTHREAD_PRIOQ_FIRST();

		/* Check if there are no threads ready to run: */
		if (pthread_h == NULL) {
			/*
			 * Lock the pthread kernel by changing the pointer to
			 * the running thread to point to the global kernel
			 * thread structure:
			 */
			_thread_run = &_thread_kern_thread;

			/* Unprotect the scheduling queues: */
			_queue_signals = 0;

			/*
			 * There are no threads ready to run, so wait until
			 * something happens that changes this condition:
			 */
			_thread_kern_poll(1);
		} else {
			/* Remove the thread from the ready queue: */
			PTHREAD_PRIOQ_REMOVE(pthread_h);

			/* Get first thread on the waiting list: */
			pthread = TAILQ_FIRST(&_waitingq);

			/* Check to see if there is more than one thread: */
			if (pthread_h != TAILQ_FIRST(&_thread_list) ||
			    TAILQ_NEXT(pthread_h, tle) != NULL)
				set_timer = 1;
			else
				set_timer = 0;

			/* Unprotect the scheduling queues: */
			_queue_signals = 0;

			/*
			 * Check for signals queued while the scheduling
			 * queues were protected:
			 */
			while (_sigq_check_reqd != 0) {
				/* Clear before handling queued signals: */
				_sigq_check_reqd = 0;

				/* Protect the scheduling queues again: */
				_queue_signals = 1;

				dequeue_signals();

				/*
				 * Check for a higher priority thread that
				 * became runnable due to signal handling.
				 */
				if (((pthread = PTHREAD_PRIOQ_FIRST()) != NULL) &&
				    (pthread->active_priority > pthread_h->active_priority)) {
					/*
					 * Insert the lower priority thread
					 * at the head of its priority list:
					 */
					PTHREAD_PRIOQ_INSERT_HEAD(pthread_h);

					/* Remove the thread from the ready queue: */
					PTHREAD_PRIOQ_REMOVE(pthread);

					/* There's a new thread in town: */
					pthread_h = pthread;
				}

				/* Get first thread on the waiting list: */
				pthread = TAILQ_FIRST(&_waitingq);

				/*
				 * Check to see if there is more than one
				 * thread:
				 */
				if (pthread_h != TAILQ_FIRST(&_thread_list) ||
				    TAILQ_NEXT(pthread_h, tle) != NULL)
					set_timer = 1;
				else
					set_timer = 0;

				/* Unprotect the scheduling queues: */
				_queue_signals = 0;
			}

			/* Make the selected thread the current thread: */
			_thread_run = pthread_h;

			/*
			 * Save the current time as the time that the thread
			 * became active:
			 */
			_thread_run->last_active.tv_sec = tv.tv_sec;
			_thread_run->last_active.tv_usec = tv.tv_usec;

			/*
			 * Define the maximum time before a scheduling signal
			 * is required:
			 */
			itimer.it_value.tv_sec = 0;
			itimer.it_value.tv_usec = TIMESLICE_USEC;

			/*
			 * The interval timer is not reloaded when it
			 * times out. The interval time needs to be
			 * calculated every time.
			 */
			itimer.it_interval.tv_sec = 0;
			itimer.it_interval.tv_usec = 0;
			/* Check if the first waiting thread can time out: */
			if ((pthread != NULL) &&
			    (pthread->wakeup_time.tv_sec != -1)) {
				/*
				 * Calculate the time until this thread
				 * is ready, allowing for the clock
				 * resolution:
				 */
				ts1.tv_sec = pthread->wakeup_time.tv_sec
				    - ts.tv_sec;
				ts1.tv_nsec = pthread->wakeup_time.tv_nsec
				    - ts.tv_nsec + _clock_res_nsec;

				/*
				 * Check for underflow of the nanosecond field:
				 */
				while (ts1.tv_nsec < 0) {
					/*
					 * Allow for the underflow of the
					 * nanosecond field:
					 */
					ts1.tv_sec--;
					ts1.tv_nsec += 1000000000;
				}
				/*
				 * Check for overflow of the nanosecond field:
				 */
				while (ts1.tv_nsec >= 1000000000) {
					/*
					 * Allow for the overflow of the
					 * nanosecond field:
					 */
					ts1.tv_sec++;
					ts1.tv_nsec -= 1000000000;
				}
				/*
				 * Convert the timespec structure to a
				 * timeval structure:
				 */
				TIMESPEC_TO_TIMEVAL(&tv1, &ts1);

				/*
				 * Check if the thread will be ready
				 * sooner than the earliest ones found
				 * so far:
				 */
				if (timercmp(&tv1, &itimer.it_value, <)) {
					/*
					 * Update the time value:
					 */
					itimer.it_value.tv_sec = tv1.tv_sec;
					itimer.it_value.tv_usec = tv1.tv_usec;
				}
			}

			/*
			 * Check if this thread is running for the first time
			 * or running again after using its full time slice
			 * allocation:
			 */
			if (_thread_run->slice_usec == -1) {
				/* Reset the accumulated time slice period: */
				_thread_run->slice_usec = 0;
			}

			/* Check if there is more than one thread: */
			if (set_timer != 0) {
				/*
				 * Start the interval timer for the
				 * calculated time interval:
				 */
				if (setitimer(_ITIMER_SCHED_TIMER, &itimer, NULL) != 0) {
					/*
					 * Cannot initialise the timer, so
					 * abort this process:
					 */
					PANIC("Cannot set scheduling timer");
				}
			}

			/* Check if a signal context was saved: */
			if (_thread_run->sig_saved == 1) {
#ifndef __alpha__
				/*
				 * Point to the floating point data in the
				 * running thread:
				 */
				fdata = _thread_run->saved_fp;

				/* Restore the floating point state: */
				__asm__("frstor %0" : : "m"(*fdata));
#endif
				/*
				 * Do a sigreturn to restart the thread that
				 * was interrupted by a signal:
				 */
				_thread_kern_in_sched = 0;

				/*
				 * If we had a context switch, run any
				 * installed switch hooks.
				 */
				if ((_sched_switch_hook != NULL) &&
				    (_last_user_thread != _thread_run)) {
					thread_run_switch_hook(_last_user_thread,
					    _thread_run);
				}
				_thread_sys_sigreturn(&_thread_run->saved_sigcontext);
			} else {
				/*
				 * Do a longjmp to restart the thread that
				 * was context switched out (by a longjmp to
				 * a different thread):
				 */
				longjmp(_thread_run->saved_jmp_buf, 1);
			}

			/* This point should not be reached. */
			PANIC("Thread has returned from sigreturn or longjmp");
		}
	}

	/* There are no more threads, so exit this process: */
	exit(0);
}

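/*
 * Set the current thread's state, recording the caller's file name and
 * line number for debugging, and enter the scheduler.
 */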
void
_thread_kern_sched_state(enum pthread_state state, char *fname, int lineno)
{
	/*
	 * Flag the pthread kernel as executing scheduler code
	 * to avoid a scheduler signal from interrupting this
	 * execution and calling the scheduler again.
	 */
	_thread_kern_in_sched = 1;

	/*
	 * Prevent the signal handler from fiddling with this thread
	 * before its state is set and it is placed into the proper queue.
	 */
	_queue_signals = 1;

	/* Change the state of the current thread: */
	_thread_run->state = state;
	_thread_run->fname = fname;
	_thread_run->lineno = lineno;

	/* Schedule the next thread that is ready: */
	_thread_kern_sched(NULL);
	return;
}

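/*
 * The same as _thread_kern_sched_state(), except that the given spinlock
 * is not released until the new state has been set and the scheduling
 * queues are protected, so a wakeup cannot be lost between the unlock
 * and the context switch.
 */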
void
_thread_kern_sched_state_unlock(enum pthread_state state,
    spinlock_t *lock, char *fname, int lineno)
{
	/*
	 * Flag the pthread kernel as executing scheduler code
	 * to avoid a scheduler signal from interrupting this
	 * execution and calling the scheduler again.
	 */
	_thread_kern_in_sched = 1;

	/*
	 * Prevent the signal handler from fiddling with this thread
	 * before its state is set and it is placed into the proper
	 * queue(s).
	 */
	_queue_signals = 1;

	/* Change the state of the current thread: */
	_thread_run->state = state;
	_thread_run->fname = fname;
	_thread_run->lineno = lineno;

	_SPINUNLOCK(lock);

	/* Schedule the next thread that is ready: */
	_thread_kern_sched(NULL);
	return;
}

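/*
 * Poll the file descriptors that threads are waiting on, waking any
 * thread whose descriptor is ready or whose spinlock has been released.
 * If wait_reqd is non-zero, block until the next thread timeout (or
 * forever if no thread can time out); otherwise just poll and return.
 */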
static void
_thread_kern_poll(int wait_reqd)
{
	int             count = 0;
	int             i, found;
	int             kern_pipe_added = 0;
	int             nfds = 0;
	int             timeout_ms = 0;
	struct pthread  *pthread;
	struct timespec ts;
	struct timeval  tv;

	/* Check if the caller wants to wait: */
	if (wait_reqd == 0) {
		timeout_ms = 0;
	} else {
		/* Get the current time of day: */
		gettimeofday(&tv, NULL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);

		_queue_signals = 1;
		pthread = TAILQ_FIRST(&_waitingq);
		_queue_signals = 0;

		if ((pthread == NULL) || (pthread->wakeup_time.tv_sec == -1)) {
			/*
			 * Either there are no threads in the waiting queue,
			 * or there are no threads that can timeout.
			 */
			timeout_ms = INFTIM;
		} else {
			/*
			 * Calculate the time left for the next thread to
			 * timeout allowing for the clock resolution:
			 */
			timeout_ms = ((pthread->wakeup_time.tv_sec - ts.tv_sec) *
			    1000) + ((pthread->wakeup_time.tv_nsec - ts.tv_nsec +
			    _clock_res_nsec) / 1000000);
			/*
			 * Don't allow negative timeouts:
			 */
			if (timeout_ms < 0)
				timeout_ms = 0;
		}
	}

	/* Protect the scheduling queues: */
	_queue_signals = 1;

	/*
	 * Check to see if the signal queue needs to be walked to look
	 * for threads awoken by a signal while in the scheduler.
	 */
	if (_sigq_check_reqd != 0) {
		/* Reset flag before handling queued signals: */
		_sigq_check_reqd = 0;

		dequeue_signals();
	}

	/*
	 * Check for a thread that became runnable due to a signal:
	 */
	if (PTHREAD_PRIOQ_FIRST() != NULL) {
		/*
		 * Since there is at least one runnable thread,
		 * disable the wait.
		 */
		timeout_ms = 0;
	}

	/*
	 * Form the poll table:
	 */
	nfds = 0;
	if (timeout_ms != 0) {
		/* Add the kernel pipe to the poll table: */
		_thread_pfd_table[nfds].fd = _thread_kern_pipe[0];
		_thread_pfd_table[nfds].events = POLLRDNORM;
		_thread_pfd_table[nfds].revents = 0;
		nfds++;
		kern_pipe_added = 1;
	}
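
	/*
	 * The read end of the kernel pipe is polled first so that signal
	 * arrivals (written to the pipe by the signal handler) can wake
	 * the poll before any thread timeout expires.
	 */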

	PTHREAD_WAITQ_SETACTIVE();
	TAILQ_FOREACH(pthread, &_workq, qe) {
		switch (pthread->state) {
		case PS_SPINBLOCK:
			/*
			 * If the lock is available, let the thread run.
			 */
			if (pthread->data.spinlock->access_lock == 0) {
				PTHREAD_WAITQ_CLEARACTIVE();
				PTHREAD_WORKQ_REMOVE(pthread);
				PTHREAD_NEW_STATE(pthread, PS_RUNNING);
				PTHREAD_WAITQ_SETACTIVE();
				/* One less thread in a spinblock state: */
				_spinblock_count--;
				/*
				 * Since there is at least one runnable
				 * thread, disable the wait.
				 */
				timeout_ms = 0;
			}
			break;

		/* File descriptor read wait: */
		case PS_FDR_WAIT:
			/* Limit number of polled files to table size: */
			if (nfds < _thread_dtablesize) {
				_thread_pfd_table[nfds].events = POLLRDNORM;
				_thread_pfd_table[nfds].fd = pthread->data.fd.fd;
				nfds++;
			}
			break;

		/* File descriptor write wait: */
		case PS_FDW_WAIT:
			/* Limit number of polled files to table size: */
			if (nfds < _thread_dtablesize) {
				_thread_pfd_table[nfds].events = POLLWRNORM;
				_thread_pfd_table[nfds].fd = pthread->data.fd.fd;
				nfds++;
			}
			break;

		/* File descriptor poll or select wait: */
		case PS_POLL_WAIT:
		case PS_SELECT_WAIT:
			/* Limit number of polled files to table size: */
			if (pthread->data.poll_data->nfds + nfds <
			    _thread_dtablesize) {
				for (i = 0; i < pthread->data.poll_data->nfds; i++) {
					_thread_pfd_table[nfds + i].fd =
					    pthread->data.poll_data->fds[i].fd;
					_thread_pfd_table[nfds + i].events =
					    pthread->data.poll_data->fds[i].events;
				}
				nfds += pthread->data.poll_data->nfds;
			}
			break;

		/* Other states do not depend on file I/O. */
		default:
			break;
		}
	}
	PTHREAD_WAITQ_CLEARACTIVE();

	/*
	 * Wait for a file descriptor to be ready for read, write, or
	 * an exception, or a timeout to occur:
	 */
	count = _thread_sys_poll(_thread_pfd_table, nfds, timeout_ms);

	if (kern_pipe_added != 0)
		/*
		 * Remove the pthread kernel pipe file descriptor
		 * from the pollfd table:
		 */
		nfds = 1;
	else
		nfds = 0;

	/*
	 * Check if it is possible that there are bytes in the kernel
	 * read pipe waiting to be read:
	 */
	if (count < 0 || ((kern_pipe_added != 0) &&
	    (_thread_pfd_table[0].revents & POLLRDNORM))) {
		/*
		 * If the kernel read pipe was included in the
		 * count:
		 */
		if (count > 0) {
			/* Decrement the count of file descriptors: */
			count--;
		}

		if (_sigq_check_reqd != 0) {
			/* Reset flag before handling signals: */
			_sigq_check_reqd = 0;

			dequeue_signals();
		}
	}

	/*
	 * Check if any file descriptors are ready:
	 */
	if (count > 0) {
		/*
		 * Enter a loop to look for threads waiting on file
		 * descriptors that are flagged as available by the
		 * _poll syscall:
		 */
		PTHREAD_WAITQ_SETACTIVE();
		TAILQ_FOREACH(pthread, &_workq, qe) {
			switch (pthread->state) {
			case PS_SPINBLOCK:
				/*
				 * If the lock is available, let the thread run.
				 */
				if (pthread->data.spinlock->access_lock == 0) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread, PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();

					/*
					 * One less thread in a spinblock state:
					 */
					_spinblock_count--;
				}
				break;

			/* File descriptor read wait: */
			case PS_FDR_WAIT:
				if ((nfds < _thread_dtablesize) &&
				    (_thread_pfd_table[nfds].revents & POLLRDNORM)) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread, PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();
				}
				nfds++;
				break;

			/* File descriptor write wait: */
			case PS_FDW_WAIT:
				if ((nfds < _thread_dtablesize) &&
				    (_thread_pfd_table[nfds].revents & POLLWRNORM)) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread, PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();
				}
				nfds++;
				break;

			/* File descriptor poll or select wait: */
			case PS_POLL_WAIT:
			case PS_SELECT_WAIT:
				if (pthread->data.poll_data->nfds + nfds <
				    _thread_dtablesize) {
					/*
					 * Enter a loop looking for I/O
					 * readiness:
					 */
					found = 0;
					for (i = 0; i < pthread->data.poll_data->nfds; i++) {
						if (_thread_pfd_table[nfds + i].revents != 0) {
							pthread->data.poll_data->fds[i].revents =
							    _thread_pfd_table[nfds + i].revents;
							found++;
						}
					}

					/*
					 * Advance nfds past this thread's
					 * entries before its nfds count is
					 * overwritten below:
					 */
					nfds += pthread->data.poll_data->nfds;

					if (found != 0) {
						pthread->data.poll_data->nfds = found;
						PTHREAD_WAITQ_CLEARACTIVE();
						PTHREAD_WORKQ_REMOVE(pthread);
						PTHREAD_NEW_STATE(pthread, PS_RUNNING);
						PTHREAD_WAITQ_SETACTIVE();
					}
				} else
					nfds += pthread->data.poll_data->nfds;
				break;

			/* Other states do not depend on file I/O. */
			default:
				break;
			}
		}
		PTHREAD_WAITQ_CLEARACTIVE();
	} else if (_spinblock_count != 0) {
		/*
		 * Enter a loop to look for threads waiting on a spinlock
		 * that is now available.
		 */
		PTHREAD_WAITQ_SETACTIVE();
		TAILQ_FOREACH(pthread, &_workq, qe) {
			if (pthread->state == PS_SPINBLOCK) {
				/*
				 * If the lock is available, let the thread run.
				 */
				if (pthread->data.spinlock->access_lock == 0) {
					PTHREAD_WAITQ_CLEARACTIVE();
					PTHREAD_WORKQ_REMOVE(pthread);
					PTHREAD_NEW_STATE(pthread, PS_RUNNING);
					PTHREAD_WAITQ_SETACTIVE();

					/*
					 * One less thread in a spinblock state:
					 */
					_spinblock_count--;
				}
			}
		}
		PTHREAD_WAITQ_CLEARACTIVE();
	}

	/* Unprotect the scheduling queues: */
	_queue_signals = 0;

	while (_sigq_check_reqd != 0) {
		/* Handle queued signals: */
		_sigq_check_reqd = 0;

		/* Protect the scheduling queues: */
		_queue_signals = 1;

		dequeue_signals();

		/* Unprotect the scheduling queues: */
		_queue_signals = 0;
	}

	/* Nothing to return. */
	return;
}

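/*
 * Convert a relative timeout into an absolute wakeup time for the
 * running thread.  A NULL timeout means wait forever ({-1, -1}) and a
 * zero timeout means wake immediately ({0, 0}).
 */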
void
_thread_kern_set_timeout(struct timespec * timeout)
{
	struct timespec current_time;
	struct timeval  tv;

	/* Reset the timeout flag for the running thread: */
	_thread_run->timeout = 0;

	/* Check if the thread is to wait forever: */
	if (timeout == NULL) {
		/*
		 * Set the wakeup time to something that can be recognised as
		 * different to an actual time of day:
		 */
		_thread_run->wakeup_time.tv_sec = -1;
		_thread_run->wakeup_time.tv_nsec = -1;
	}
	/* Check if no waiting is required: */
	else if (timeout->tv_sec == 0 && timeout->tv_nsec == 0) {
		/* Set the wake up time to 'immediately': */
		_thread_run->wakeup_time.tv_sec = 0;
		_thread_run->wakeup_time.tv_nsec = 0;
	} else {
		/* Get the current time: */
		gettimeofday(&tv, NULL);
		TIMEVAL_TO_TIMESPEC(&tv, &current_time);

		/* Calculate the time for the current thread to wake up: */
		_thread_run->wakeup_time.tv_sec = current_time.tv_sec + timeout->tv_sec;
		_thread_run->wakeup_time.tv_nsec = current_time.tv_nsec + timeout->tv_nsec;

		/* Check if the nanosecond field needs to wrap: */
		if (_thread_run->wakeup_time.tv_nsec >= 1000000000) {
			/* Wrap the nanosecond field: */
			_thread_run->wakeup_time.tv_sec += 1;
			_thread_run->wakeup_time.tv_nsec -= 1000000000;
		}
	}
	return;
}

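/*
 * Defer signal handling for the running thread while the threads
 * library works on its internal data structures.  Deferral nests; each
 * call must be matched by a call to _thread_kern_sig_undefer().
 */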
void
_thread_kern_sig_defer(void)
{
	/* Allow signal deferral to be recursive. */
	_thread_run->sig_defer_count++;
}

void
_thread_kern_sig_undefer(void)
{
	pthread_t pthread;
	int need_resched = 0;

	/*
	 * Perform checks to yield only if we are about to undefer
	 * signals.
	 */
	if (_thread_run->sig_defer_count > 1) {
		/* Decrement the signal deferral count. */
		_thread_run->sig_defer_count--;
	} else if (_thread_run->sig_defer_count == 1) {
		/* Reenable signals: */
		_thread_run->sig_defer_count = 0;

		/*
		 * Check if there are queued signals:
		 */
		while (_sigq_check_reqd != 0) {
			/* Defer scheduling while we process queued signals: */
			_thread_run->sig_defer_count = 1;

			/* Clear the flag before checking the signal queue: */
			_sigq_check_reqd = 0;

			/* Dequeue and handle signals: */
			dequeue_signals();

			/*
			 * Unless a reschedule is already needed, check
			 * whether signal handling caused a higher priority
			 * thread to become ready.
			 */
			if ((need_resched == 0) &&
			    (((pthread = PTHREAD_PRIOQ_FIRST()) != NULL) &&
			    (pthread->active_priority > _thread_run->active_priority))) {
				need_resched = 1;
			}

			/* Reenable signals: */
			_thread_run->sig_defer_count = 0;
		}

		/* Yield the CPU if necessary: */
		if (need_resched || _thread_run->yield_on_sig_undefer != 0) {
			_thread_run->yield_on_sig_undefer = 0;
			_thread_kern_sched(NULL);
		}
	}
}

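/*
 * Drain the pthread kernel pipe and handle each queued signal number
 * read from it.
 */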
static void
dequeue_signals(void)
{
	char    bufr[128];
	int     i, num;

	/*
	 * Enter a loop to read and handle queued signals from the
	 * pthread kernel pipe:
	 */
	while (((num = _thread_sys_read(_thread_kern_pipe[0], bufr,
	    sizeof(bufr))) > 0) || (num == -1 && errno == EINTR)) {
		/*
		 * The buffer read contains one byte per signal and
		 * each byte is the signal number.
		 */
		for (i = 0; i < num; i++) {
			if ((int) bufr[i] == _SCHED_SIGNAL) {
				/*
				 * Scheduling signals shouldn't ever be
				 * queued; just ignore it for now.
				 */
			} else {
				/* Handle this signal: */
				_thread_sig_handle((int) bufr[i], NULL);
			}
		}
	}
	if ((num < 0) && (errno != EAGAIN)) {
		/*
		 * The only error we should expect is if there is
		 * no data to read.
		 */
		PANIC("Unable to read from thread kernel pipe");
	}
}

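/*
 * Run the user-installed context switch hook, passing NULL in place of
 * threads that are private to the library.
 */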
static inline void
thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in)
{
	pthread_t tid_out = thread_out;
	pthread_t tid_in = thread_in;

	if ((tid_out != NULL) &&
	    ((tid_out->flags & PTHREAD_FLAGS_PRIVATE) != 0))
		tid_out = NULL;
	if ((tid_in != NULL) &&
	    ((tid_in->flags & PTHREAD_FLAGS_PRIVATE) != 0))
		tid_in = NULL;

	if ((_sched_switch_hook != NULL) && (tid_out != tid_in)) {
		/* Run the scheduler switch hook: */
		_sched_switch_hook(tid_out, tid_in);
	}
}
#endif