// kmp_taskq.cpp -- TASKQ support for OpenMP.
//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_error.h"
#define MAX_MESSAGE 512

/* Taskq routines and global variables */

#define KMP_DEBUG_REF_CTS(x) KF_TRACE(1, x);

#define THREAD_ALLOC_FOR_TASKQ
27 static int in_parallel_context(kmp_team_t *team) {
28 return !team->t.t_serialized;
31 static void __kmp_taskq_eo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
33 int tid = __kmp_tid_from_gtid(gtid);
35 kmpc_task_queue_t *taskq;
36 kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
38 if (__kmp_env_consistency_check)
39 #if KMP_USE_DYNAMIC_LOCK
40 __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL, 0);
42 __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL);
45 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
46 KMP_MB(); /* Flush all pending memory write invalidates. */
48 /* GEH - need check here under stats to make sure */
49 /* inside task (curr_thunk[*tid_ref] != NULL) */
51 my_token = tq->tq_curr_thunk[tid]->th_tasknum;
53 taskq = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
55 KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
60 static void __kmp_taskq_xo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
62 int tid = __kmp_tid_from_gtid(gtid);
64 kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
66 if (__kmp_env_consistency_check)
67 __kmp_pop_sync(gtid, ct_ordered_in_taskq, loc_ref);
69 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
70 KMP_MB(); /* Flush all pending memory write invalidates. */
72 /* GEH - need check here under stats to make sure */
73 /* inside task (curr_thunk[tid] != NULL) */
75 my_token = tq->tq_curr_thunk[tid]->th_tasknum;
77 KMP_MB(); /* Flush all pending memory write invalidates. */
79 tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue->tq_tasknum_serving =
82 KMP_MB(); /* Flush all pending memory write invalidates. */
86 static void __kmp_taskq_check_ordered(kmp_int32 gtid, kmpc_thunk_t *thunk) {
88 kmpc_task_queue_t *taskq;
90 /* assume we are always called from an active parallel context */
92 KMP_MB(); /* Flush all pending memory write invalidates. */
94 my_token = thunk->th_tasknum;
96 taskq = thunk->th.th_shareds->sv_queue;
98 if (taskq->tq_tasknum_serving <= my_token) {
99 KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
101 taskq->tq_tasknum_serving = my_token + 1;
108 static void __kmp_dump_TQF(kmp_int32 flags) {
109 if (flags & TQF_IS_ORDERED)
110 __kmp_printf("ORDERED ");
111 if (flags & TQF_IS_LASTPRIVATE)
112 __kmp_printf("LAST_PRIV ");
113 if (flags & TQF_IS_NOWAIT)
114 __kmp_printf("NOWAIT ");
115 if (flags & TQF_HEURISTICS)
116 __kmp_printf("HEURIST ");
117 if (flags & TQF_INTERFACE_RESERVED1)
118 __kmp_printf("RESERV1 ");
119 if (flags & TQF_INTERFACE_RESERVED2)
120 __kmp_printf("RESERV2 ");
121 if (flags & TQF_INTERFACE_RESERVED3)
122 __kmp_printf("RESERV3 ");
123 if (flags & TQF_INTERFACE_RESERVED4)
124 __kmp_printf("RESERV4 ");
125 if (flags & TQF_IS_LAST_TASK)
126 __kmp_printf("LAST_TASK ");
127 if (flags & TQF_TASKQ_TASK)
128 __kmp_printf("TASKQ_TASK ");
129 if (flags & TQF_RELEASE_WORKERS)
130 __kmp_printf("RELEASE ");
131 if (flags & TQF_ALL_TASKS_QUEUED)
132 __kmp_printf("ALL_QUEUED ");
133 if (flags & TQF_PARALLEL_CONTEXT)
134 __kmp_printf("PARALLEL ");
135 if (flags & TQF_DEALLOCATED)
136 __kmp_printf("DEALLOC ");
137 if (!(flags & (TQF_INTERNAL_FLAGS | TQF_INTERFACE_FLAGS)))
138 __kmp_printf("(NONE)");
141 static void __kmp_dump_thunk(kmp_taskq_t *tq, kmpc_thunk_t *thunk,
142 kmp_int32 global_tid) {
144 int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
146 __kmp_printf("\tThunk at %p on (%d): ", thunk, global_tid);
149 for (i = 0; i < nproc; i++) {
150 if (tq->tq_curr_thunk[i] == thunk) {
151 __kmp_printf("[%i] ", i);
154 __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds);
155 __kmp_printf("th_task=%p, ", thunk->th_task);
156 __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk);
157 __kmp_printf("th_status=%d, ", thunk->th_status);
158 __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum);
159 __kmp_printf("th_flags=");
160 __kmp_dump_TQF(thunk->th_flags);
166 static void __kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num) {
169 __kmp_printf(" Thunk stack for T#%d: ", thread_num);
171 for (th = thunk; th != NULL; th = th->th_encl_thunk)
172 __kmp_printf("%p ", th);
177 static void __kmp_dump_task_queue(kmp_taskq_t *tq, kmpc_task_queue_t *queue,
178 kmp_int32 global_tid) {
181 kmpc_task_queue_t *taskq;
183 __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid);
186 int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT;
188 if (__kmp_env_consistency_check) {
189 __kmp_printf(" tq_loc : ");
193 // if (queue->tq.tq_parent != 0)
194 //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
196 //__kmp_acquire_lock(& queue->tq_link_lck, global_tid);
198 // Make sure data structures are in consistent state before querying them
199 // Seems to work without this for digital/alpha, needed for IBM/RS6000
202 __kmp_printf(" tq_parent : %p\n", queue->tq.tq_parent);
203 __kmp_printf(" tq_first_child : %p\n", queue->tq_first_child);
204 __kmp_printf(" tq_next_child : %p\n", queue->tq_next_child);
205 __kmp_printf(" tq_prev_child : %p\n", queue->tq_prev_child);
206 __kmp_printf(" tq_ref_count : %d\n", queue->tq_ref_count);
208 //__kmp_release_lock(& queue->tq_link_lck, global_tid);
210 // if (queue->tq.tq_parent != 0)
211 //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
213 //__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
214 //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
216 // Make sure data structures are in consistent state before querying them
217 // Seems to work without this for digital/alpha, needed for IBM/RS6000
221 __kmp_printf(" tq_shareds : ");
222 for (i = 0; i < ((queue == tq->tq_root) ? queue->tq_nproc : 1); i++)
223 __kmp_printf("%p ", queue->tq_shareds[i].ai_data);
227 __kmp_printf(" tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing);
228 __kmp_printf(" tq_tasknum_serving : %u\n", queue->tq_tasknum_serving);
231 __kmp_printf(" tq_queue : %p\n", queue->tq_queue);
232 __kmp_printf(" tq_thunk_space : %p\n", queue->tq_thunk_space);
233 __kmp_printf(" tq_taskq_slot : %p\n", queue->tq_taskq_slot);
235 __kmp_printf(" tq_free_thunks : ");
236 for (thunk = queue->tq_free_thunks; thunk != NULL;
237 thunk = thunk->th.th_next_free)
238 __kmp_printf("%p ", thunk);
241 __kmp_printf(" tq_nslots : %d\n", queue->tq_nslots);
242 __kmp_printf(" tq_head : %d\n", queue->tq_head);
243 __kmp_printf(" tq_tail : %d\n", queue->tq_tail);
244 __kmp_printf(" tq_nfull : %d\n", queue->tq_nfull);
245 __kmp_printf(" tq_hiwat : %d\n", queue->tq_hiwat);
246 __kmp_printf(" tq_flags : ");
247 __kmp_dump_TQF(queue->tq_flags);
251 __kmp_printf(" tq_th_thunks : ");
252 for (i = 0; i < queue->tq_nproc; i++) {
253 __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data);
259 __kmp_printf(" Queue slots:\n");
262 for (count = 0; count < queue->tq_nfull; ++count) {
263 __kmp_printf("(%d)", qs);
264 __kmp_dump_thunk(tq, queue->tq_queue[qs].qs_thunk, global_tid);
265 qs = (qs + 1) % queue->tq_nslots;
271 if (queue->tq_taskq_slot != NULL) {
272 __kmp_printf(" TaskQ slot:\n");
273 __kmp_dump_thunk(tq, CCAST(kmpc_thunk_t *, queue->tq_taskq_slot),
277 //__kmp_release_lock(& queue->tq_queue_lck, global_tid);
278 //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
282 __kmp_printf(" Taskq freelist: ");
284 //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
286 // Make sure data structures are in consistent state before querying them
287 // Seems to work without this call for digital/alpha, needed for IBM/RS6000
290 for (taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free)
291 __kmp_printf("%p ", taskq);
293 //__kmp_release_lock( & tq->tq_freelist_lck, global_tid );
295 __kmp_printf("\n\n");
298 static void __kmp_aux_dump_task_queue_tree(kmp_taskq_t *tq,
299 kmpc_task_queue_t *curr_queue,
301 kmp_int32 global_tid) {
303 int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
304 kmpc_task_queue_t *queue = curr_queue;
306 if (curr_queue == NULL)
311 for (i = 0; i < level; i++)
314 __kmp_printf("%p", curr_queue);
316 for (i = 0; i < nproc; i++) {
317 if (tq->tq_curr_thunk[i] &&
318 tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue) {
319 __kmp_printf(" [%i]", i);
325 //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid);
327 // Make sure data structures are in consistent state before querying them
328 // Seems to work without this call for digital/alpha, needed for IBM/RS6000
331 qs = curr_queue->tq_tail;
333 for (count = 0; count < curr_queue->tq_nfull; ++count) {
334 __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk);
335 qs = (qs + 1) % curr_queue->tq_nslots;
338 //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid);
342 if (curr_queue->tq_first_child) {
343 //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
345 // Make sure data structures are in consistent state before querying them
346 // Seems to work without this call for digital/alpha, needed for IBM/RS6000
349 if (curr_queue->tq_first_child) {
350 for (queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
351 queue != NULL; queue = queue->tq_next_child) {
352 __kmp_aux_dump_task_queue_tree(tq, queue, level + 1, global_tid);
356 //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
360 static void __kmp_dump_task_queue_tree(kmp_taskq_t *tq,
361 kmpc_task_queue_t *tqroot,
362 kmp_int32 global_tid) {
363 __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid);
365 __kmp_aux_dump_task_queue_tree(tq, tqroot, 0, global_tid);
371 /* New taskq storage routines that try to minimize overhead of mallocs but
372 still provide cache line alignment. */
373 static void *__kmp_taskq_allocate(size_t size, kmp_int32 global_tid) {
374 void *addr, *orig_addr;
377 KB_TRACE(5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int)size,
380 bytes = sizeof(void *) + CACHE_LINE + size;
382 #ifdef THREAD_ALLOC_FOR_TASKQ
384 (void *)__kmp_thread_malloc(__kmp_thread_from_gtid(global_tid), bytes);
386 KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", bytes));
387 orig_addr = (void *)KMP_INTERNAL_MALLOC(bytes);
388 #endif /* THREAD_ALLOC_FOR_TASKQ */
391 KMP_FATAL(OutOfHeapMemory);
395 if (((kmp_uintptr_t)addr & (CACHE_LINE - 1)) != 0) {
396 KB_TRACE(50, ("__kmp_taskq_allocate: adjust for cache alignment\n"));
397 addr = (void *)(((kmp_uintptr_t)addr + CACHE_LINE) & ~(CACHE_LINE - 1));
400 (*(void **)addr) = orig_addr;
403 ("__kmp_taskq_allocate: allocate: %p, use: %p - %p, size: %d, "
405 orig_addr, ((void **)addr) + 1,
406 ((char *)(((void **)addr) + 1)) + size - 1, (int)size, global_tid));
408 return (((void **)addr) + 1);
411 static void __kmpc_taskq_free(void *p, kmp_int32 global_tid) {
412 KB_TRACE(5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid));
414 KB_TRACE(10, ("__kmpc_taskq_free: freeing: %p, gtid: %d\n",
415 (*(((void **)p) - 1)), global_tid));
417 #ifdef THREAD_ALLOC_FOR_TASKQ
418 __kmp_thread_free(__kmp_thread_from_gtid(global_tid), *(((void **)p) - 1));
420 KMP_INTERNAL_FREE(*(((void **)p) - 1));
421 #endif /* THREAD_ALLOC_FOR_TASKQ */
424 /* Keep freed kmpc_task_queue_t on an internal freelist and recycle since
425 they're of constant size. */
427 static kmpc_task_queue_t *
428 __kmp_alloc_taskq(kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots,
429 kmp_int32 nthunks, kmp_int32 nshareds, kmp_int32 nproc,
430 size_t sizeof_thunk, size_t sizeof_shareds,
431 kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid) {
434 kmpc_task_queue_t *new_queue;
435 kmpc_aligned_shared_vars_t *shared_var_array;
436 char *shared_var_storage;
437 char *pt; /* for doing byte-adjusted address computations */
439 __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid);
441 // Make sure data structures are in consistent state before querying them
442 // Seems to work without this call for digital/alpha, needed for IBM/RS6000
445 if (tq->tq_freelist) {
446 new_queue = tq->tq_freelist;
447 tq->tq_freelist = tq->tq_freelist->tq.tq_next_free;
449 KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED);
451 new_queue->tq_flags = 0;
453 __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
455 __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
457 new_queue = (kmpc_task_queue_t *)__kmp_taskq_allocate(
458 sizeof(kmpc_task_queue_t), global_tid);
459 new_queue->tq_flags = 0;
462 /* space in the task queue for queue slots (allocate as one big chunk */
463 /* of storage including new_taskq_task space) */
466 (CACHE_LINE - (sizeof_thunk % CACHE_LINE)); /* pad to cache line size */
467 pt = (char *)__kmp_taskq_allocate(nthunks * sizeof_thunk, global_tid);
468 new_queue->tq_thunk_space = (kmpc_thunk_t *)pt;
469 *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk);
471 /* chain the allocated thunks into a freelist for this queue */
473 new_queue->tq_free_thunks = (kmpc_thunk_t *)pt;
475 for (i = 0; i < (nthunks - 2); i++) {
476 ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th.th_next_free =
477 (kmpc_thunk_t *)(pt + (i + 1) * sizeof_thunk);
479 ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th_flags = TQF_DEALLOCATED;
483 ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th.th_next_free = NULL;
485 ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th_flags =
489 /* initialize the locks */
492 __kmp_init_lock(&new_queue->tq_link_lck);
493 __kmp_init_lock(&new_queue->tq_free_thunks_lck);
494 __kmp_init_lock(&new_queue->tq_queue_lck);
497 /* now allocate the slots */
499 bytes = nslots * sizeof(kmpc_aligned_queue_slot_t);
500 new_queue->tq_queue =
501 (kmpc_aligned_queue_slot_t *)__kmp_taskq_allocate(bytes, global_tid);
503 /* space for array of pointers to shared variable structures */
504 sizeof_shareds += sizeof(kmpc_task_queue_t *);
506 (CACHE_LINE - (sizeof_shareds % CACHE_LINE)); /* pad to cache line size */
508 bytes = nshareds * sizeof(kmpc_aligned_shared_vars_t);
510 (kmpc_aligned_shared_vars_t *)__kmp_taskq_allocate(bytes, global_tid);
512 bytes = nshareds * sizeof_shareds;
513 shared_var_storage = (char *)__kmp_taskq_allocate(bytes, global_tid);
515 for (i = 0; i < nshareds; i++) {
516 shared_var_array[i].ai_data =
517 (kmpc_shared_vars_t *)(shared_var_storage + i * sizeof_shareds);
518 shared_var_array[i].ai_data->sv_queue = new_queue;
520 new_queue->tq_shareds = shared_var_array;
522 /* array for number of outstanding thunks per thread */
525 bytes = nproc * sizeof(kmpc_aligned_int32_t);
526 new_queue->tq_th_thunks =
527 (kmpc_aligned_int32_t *)__kmp_taskq_allocate(bytes, global_tid);
528 new_queue->tq_nproc = nproc;
530 for (i = 0; i < nproc; i++)
531 new_queue->tq_th_thunks[i].ai_data = 0;
537 static void __kmp_free_taskq(kmp_taskq_t *tq, kmpc_task_queue_t *p,
538 int in_parallel, kmp_int32 global_tid) {
539 __kmpc_taskq_free(p->tq_thunk_space, global_tid);
540 __kmpc_taskq_free(p->tq_queue, global_tid);
542 /* free shared var structure storage */
543 __kmpc_taskq_free(CCAST(kmpc_shared_vars_t *, p->tq_shareds[0].ai_data),
545 /* free array of pointers to shared vars storage */
546 __kmpc_taskq_free(p->tq_shareds, global_tid);
549 p->tq_first_child = NULL;
550 p->tq_next_child = NULL;
551 p->tq_prev_child = NULL;
552 p->tq_ref_count = -10;
553 p->tq_shareds = NULL;
554 p->tq_tasknum_queuing = 0;
555 p->tq_tasknum_serving = 0;
557 p->tq_thunk_space = NULL;
558 p->tq_taskq_slot = NULL;
559 p->tq_free_thunks = NULL;
569 for (i = 0; i < p->tq_nproc; i++)
570 p->tq_th_thunks[i].ai_data = 0;
572 if (__kmp_env_consistency_check)
574 KMP_DEBUG_ASSERT(p->tq_flags & TQF_DEALLOCATED);
575 p->tq_flags = TQF_DEALLOCATED;
576 #endif /* KMP_DEBUG */
579 __kmpc_taskq_free(p->tq_th_thunks, global_tid);
580 __kmp_destroy_lock(&p->tq_link_lck);
581 __kmp_destroy_lock(&p->tq_queue_lck);
582 __kmp_destroy_lock(&p->tq_free_thunks_lck);
585 p->tq_th_thunks = NULL;
586 #endif /* KMP_DEBUG */
588 // Make sure data structures are in consistent state before querying them
589 // Seems to work without this call for digital/alpha, needed for IBM/RS6000
592 __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid);
593 p->tq.tq_next_free = tq->tq_freelist;
596 __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
599 /* Once a group of thunks has been allocated for use in a particular queue,
600 these are managed via a per-queue freelist.
601 We force a check that there's always a thunk free if we need one. */
603 static kmpc_thunk_t *__kmp_alloc_thunk(kmpc_task_queue_t *queue,
604 int in_parallel, kmp_int32 global_tid) {
608 __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid);
609 // Make sure data structures are in consistent state before querying them
610 // Seems to work without this call for digital/alpha, needed for IBM/RS6000
614 fl = queue->tq_free_thunks;
616 KMP_DEBUG_ASSERT(fl != NULL);
618 queue->tq_free_thunks = fl->th.th_next_free;
622 __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid);
627 static void __kmp_free_thunk(kmpc_task_queue_t *queue, kmpc_thunk_t *p,
628 int in_parallel, kmp_int32 global_tid) {
631 p->th_encl_thunk = 0;
634 /* Also could zero pointers to private vars */
638 __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid);
639 // Make sure data structures are in consistent state before querying them
640 // Seems to work without this call for digital/alpha, needed for IBM/RS6000
644 p->th.th_next_free = queue->tq_free_thunks;
645 queue->tq_free_thunks = p;
648 p->th_flags = TQF_DEALLOCATED;
652 __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid);
655 /* returns nonzero if the queue just became full after the enqueue */
656 static kmp_int32 __kmp_enqueue_task(kmp_taskq_t *tq, kmp_int32 global_tid,
657 kmpc_task_queue_t *queue,
658 kmpc_thunk_t *thunk, int in_parallel) {
661 /* dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the
662 * master is executing then) */
664 __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
665 // Make sure data structures are in consistent state before querying them
666 // Seems to work without this call for digital/alpha, needed for IBM/RS6000
670 KMP_DEBUG_ASSERT(queue->tq_nfull < queue->tq_nslots); // check queue not full
672 queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk;
674 if (queue->tq_head >= queue->tq_nslots)
679 KMP_MB(); /* to assure that nfull is seen to increase before
680 TQF_ALL_TASKS_QUEUED is set */
682 ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE;
685 /* don't need to wait until workers are released before unlocking */
686 __kmp_release_lock(&queue->tq_queue_lck, global_tid);
688 if (tq->tq_global_flags & TQF_RELEASE_WORKERS) {
689 // If just creating the root queue, the worker threads are waiting at a
690 // join barrier until now, when there's something in the queue for them to
691 // do; release them now to do work. This should only be done when this is
692 // the first task enqueued, so reset the flag here also.
693 tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; /* no lock needed, workers
694 are still in spin mode */
695 // avoid releasing barrier twice if taskq_task switches threads
698 __kmpc_end_barrier_master(NULL, global_tid);
705 static kmpc_thunk_t *__kmp_dequeue_task(kmp_int32 global_tid,
706 kmpc_task_queue_t *queue,
709 int tid = __kmp_tid_from_gtid(global_tid);
711 KMP_DEBUG_ASSERT(queue->tq_nfull > 0); /* check queue not empty */
713 if (queue->tq.tq_parent != NULL && in_parallel) {
715 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
716 ct = ++(queue->tq_ref_count);
717 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
719 ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
722 pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk;
724 if (queue->tq_tail >= queue->tq_nslots)
728 queue->tq_th_thunks[tid].ai_data++;
730 KMP_MB(); /* necessary so ai_data increment is propagated to other threads
731 immediately (digital) */
733 KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding "
734 "thunks from queue %p\n",
735 global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue));
743 /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is
746 KMP_DEBUG_ASSERT(queue->tq_nfull >= 0);
749 KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <=
750 __KMP_TASKQ_THUNKS_PER_TH);
757 /* Find the next (non-null) task to dequeue and return it.
758 * This is never called unless in_parallel=TRUE
760 * Here are the rules for deciding which queue to take the task from:
761 * 1. Walk up the task queue tree from the current queue's parent and look
762 * on the way up (for loop, below).
763 * 2. Do a depth-first search back down the tree from the root and
764 * look (find_task_in_descendant_queue()).
766 * Here are the rules for deciding which task to take from a queue
767 * (__kmp_find_task_in_queue ()):
768 * 1. Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task
769 * must be staged to make sure we execute the last one with
770 * TQF_IS_LAST_TASK at the end of task queue execution.
771 * 2. If the queue length is below some high water mark and the taskq task
772 * is enqueued, prefer running the taskq task.
773 * 3. Otherwise, take a (normal) task from the queue.
775 * If we do all this and return pt == NULL at the bottom of this routine,
776 * this means there are no more tasks to execute (except possibly for
777 * TQF_IS_LASTPRIVATE).
780 static kmpc_thunk_t *__kmp_find_task_in_queue(kmp_int32 global_tid,
781 kmpc_task_queue_t *queue) {
782 kmpc_thunk_t *pt = NULL;
783 int tid = __kmp_tid_from_gtid(global_tid);
785 /* To prevent deadlock from tq_queue_lck if queue already deallocated */
786 if (!(queue->tq_flags & TQF_DEALLOCATED)) {
788 __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
790 /* Check again to avoid race in __kmpc_end_taskq() */
791 if (!(queue->tq_flags & TQF_DEALLOCATED)) {
792 // Make sure data structures are in consistent state before querying them
793 // Seems to work without this for digital/alpha, needed for IBM/RS6000
796 if ((queue->tq_taskq_slot != NULL) &&
797 (queue->tq_nfull <= queue->tq_hiwat)) {
798 /* if there's enough room in the queue and the dispatcher */
799 /* (taskq task) is available, schedule more tasks */
800 pt = CCAST(kmpc_thunk_t *, queue->tq_taskq_slot);
801 queue->tq_taskq_slot = NULL;
802 } else if (queue->tq_nfull == 0 ||
803 queue->tq_th_thunks[tid].ai_data >=
804 __KMP_TASKQ_THUNKS_PER_TH) {
805 /* do nothing if no thunks available or this thread can't */
806 /* run any because it already is executing too many */
808 } else if (queue->tq_nfull > 1) {
809 /* always safe to schedule a task even if TQF_IS_LASTPRIVATE */
811 pt = __kmp_dequeue_task(global_tid, queue, TRUE);
812 } else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) {
813 // one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE
814 pt = __kmp_dequeue_task(global_tid, queue, TRUE);
815 } else if (queue->tq_flags & TQF_IS_LAST_TASK) {
816 /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task() */
817 /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
818 /* instrumentation does copy-out. */
819 pt = __kmp_dequeue_task(global_tid, queue, TRUE);
821 TQF_IS_LAST_TASK; /* don't need test_then_or since already locked */
825 /* GEH - What happens here if is lastprivate, but not last task? */
826 __kmp_release_lock(&queue->tq_queue_lck, global_tid);
832 /* Walk a tree of queues starting at queue's first child and return a non-NULL
833 thunk if one can be scheduled. Must only be called when in_parallel=TRUE */
835 static kmpc_thunk_t *
836 __kmp_find_task_in_descendant_queue(kmp_int32 global_tid,
837 kmpc_task_queue_t *curr_queue) {
838 kmpc_thunk_t *pt = NULL;
839 kmpc_task_queue_t *queue = curr_queue;
841 if (curr_queue->tq_first_child != NULL) {
842 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
843 // Make sure data structures are in consistent state before querying them
844 // Seems to work without this call for digital/alpha, needed for IBM/RS6000
847 queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
849 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
853 while (queue != NULL) {
855 kmpc_task_queue_t *next;
857 ct = ++(queue->tq_ref_count);
858 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
860 ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
862 pt = __kmp_find_task_in_queue(global_tid, queue);
867 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
868 // Make sure data structures in consistent state before querying them
869 // Seems to work without this for digital/alpha, needed for IBM/RS6000
872 ct = --(queue->tq_ref_count);
873 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
874 global_tid, queue, ct));
875 KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0);
877 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
882 /* although reference count stays active during descendant walk, shouldn't
883 matter since if children still exist, reference counts aren't being
886 pt = __kmp_find_task_in_descendant_queue(global_tid, queue);
891 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
892 // Make sure data structures in consistent state before querying them
893 // Seems to work without this for digital/alpha, needed for IBM/RS6000
896 ct = --(queue->tq_ref_count);
897 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
898 global_tid, queue, ct));
899 KMP_DEBUG_ASSERT(ct >= 0);
901 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
906 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
907 // Make sure data structures in consistent state before querying them
908 // Seems to work without this for digital/alpha, needed for IBM/RS6000
911 next = queue->tq_next_child;
913 ct = --(queue->tq_ref_count);
915 ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
916 KMP_DEBUG_ASSERT(ct >= 0);
921 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
927 /* Walk up the taskq tree looking for a task to execute. If we get to the root,
928 search the tree for a descendent queue task. Must only be called when
930 static kmpc_thunk_t *
931 __kmp_find_task_in_ancestor_queue(kmp_taskq_t *tq, kmp_int32 global_tid,
932 kmpc_task_queue_t *curr_queue) {
933 kmpc_task_queue_t *queue;
938 if (curr_queue->tq.tq_parent != NULL) {
939 queue = curr_queue->tq.tq_parent;
941 while (queue != NULL) {
942 if (queue->tq.tq_parent != NULL) {
944 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
945 // Make sure data structures in consistent state before querying them
946 // Seems to work without this for digital/alpha, needed for IBM/RS6000
949 ct = ++(queue->tq_ref_count);
950 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
951 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n", __LINE__,
952 global_tid, queue, ct));
955 pt = __kmp_find_task_in_queue(global_tid, queue);
957 if (queue->tq.tq_parent != NULL) {
959 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
960 // Make sure data structures in consistent state before querying them
961 // Seems to work without this for digital/alpha, needed for IBM/RS6000
964 ct = --(queue->tq_ref_count);
965 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
966 global_tid, queue, ct));
967 KMP_DEBUG_ASSERT(ct >= 0);
969 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
975 if (queue->tq.tq_parent != NULL) {
977 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
978 // Make sure data structures in consistent state before querying them
979 // Seems to work without this for digital/alpha, needed for IBM/RS6000
982 ct = --(queue->tq_ref_count);
983 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
984 global_tid, queue, ct));
985 KMP_DEBUG_ASSERT(ct >= 0);
987 queue = queue->tq.tq_parent;
990 __kmp_release_lock(&queue->tq_link_lck, global_tid);
994 pt = __kmp_find_task_in_descendant_queue(global_tid, tq->tq_root);
999 static int __kmp_taskq_tasks_finished(kmpc_task_queue_t *queue) {
1002 /* KMP_MB(); */ /* is this really necessary? */
1004 for (i = 0; i < queue->tq_nproc; i++) {
1005 if (queue->tq_th_thunks[i].ai_data != 0)
1012 static int __kmp_taskq_has_any_children(kmpc_task_queue_t *queue) {
1013 return (queue->tq_first_child != NULL);
1016 static void __kmp_remove_queue_from_tree(kmp_taskq_t *tq, kmp_int32 global_tid,
1017 kmpc_task_queue_t *queue,
1021 kmpc_thunk_t *thunk;
1025 ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1026 KF_DUMP(50, __kmp_dump_task_queue(tq, queue, global_tid));
1028 /* sub-queue in a recursion, not the root task queue */
1029 KMP_DEBUG_ASSERT(queue->tq.tq_parent != NULL);
1032 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1033 // Make sure data structures are in consistent state before querying them
1034 // Seems to work without this call for digital/alpha, needed for IBM/RS6000
1038 KMP_DEBUG_ASSERT(queue->tq_first_child == NULL);
1040 /* unlink queue from its siblings if any at this level */
1041 if (queue->tq_prev_child != NULL)
1042 queue->tq_prev_child->tq_next_child = queue->tq_next_child;
1043 if (queue->tq_next_child != NULL)
1044 queue->tq_next_child->tq_prev_child = queue->tq_prev_child;
1045 if (queue->tq.tq_parent->tq_first_child == queue)
1046 queue->tq.tq_parent->tq_first_child = queue->tq_next_child;
1048 queue->tq_prev_child = NULL;
1049 queue->tq_next_child = NULL;
1053 ("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n",
1054 __LINE__, global_tid, queue, queue->tq_ref_count));
1056 /* wait until all other threads have stopped accessing this queue */
1057 while (queue->tq_ref_count > 1) {
1058 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1060 KMP_WAIT_YIELD((volatile kmp_uint32 *)&queue->tq_ref_count, 1, KMP_LE,
1063 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1064 // Make sure data structures are in consistent state before querying them
1065 // Seems to work without this for digital/alpha, needed for IBM/RS6000
1069 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1073 ("line %d gtid %d: Q %p freeing queue\n", __LINE__, global_tid, queue));
1076 KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED);
1077 KMP_DEBUG_ASSERT(queue->tq_nfull == 0);
1079 for (i = 0; i < queue->tq_nproc; i++) {
1080 KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1084 for (thunk = queue->tq_free_thunks; thunk != NULL;
1085 thunk = thunk->th.th_next_free)
1089 queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH));
1092 /* release storage for queue entry */
1093 __kmp_free_taskq(tq, queue, TRUE, global_tid);
1095 KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1096 KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1099 /* Starting from indicated queue, proceed downward through tree and remove all
1100 taskqs which are finished, but only go down to taskqs which have the "nowait"
1101 clause present. Assume this is only called when in_parallel=TRUE. */
/* NOTE(review): this chunk appears to be an elided extraction of the original
   file -- several source lines (continuations, closing braces) are missing
   throughout; compare against upstream kmp_taskq.cpp before editing code. */
/* Walk the immediate children of curr_queue; recurse into children marked
   TQF_IS_NOWAIT, and deallocate any child whose tasks are all queued and
   finished and which has no children of its own. */
1103 static void __kmp_find_and_remove_finished_child_taskq(
1104 kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue) {
1105 kmpc_task_queue_t *queue = curr_queue;
1107 if (curr_queue->tq_first_child != NULL) {
1108 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1109 // Make sure data structures are in consistent state before querying them
1110 // Seems to work without this call for digital/alpha, needed for IBM/RS6000
1113 queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
1114 if (queue != NULL) {
1115 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1119 while (queue != NULL) {
1120 kmpc_task_queue_t *next;
// Pin the child with a reference count bump while we examine it, so it
// cannot be freed out from under us during the walk.
1121 int ct = ++(queue->tq_ref_count);
1123 ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
1125 /* although reference count stays active during descendant walk, */
1126 /* shouldn't matter since if children still exist, reference */
1127 /* counts aren't being monitored anyway */
1129 if (queue->tq_flags & TQF_IS_NOWAIT) {
1130 __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
// A child is removable only when all its tasks have been queued AND
// dequeued AND executed, and it has no live descendants.
1132 if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) &&
1133 (queue->tq_nfull == 0) && __kmp_taskq_tasks_finished(queue) &&
1134 !__kmp_taskq_has_any_children(queue)) {
1136 /* Only remove this if we have not already marked it for deallocation.
1137 This should prevent multiple threads from trying to free this. */
// Non-blocking test: if another thread holds the queue lock it is
// already working on this queue, so we simply move on.
1139 if (__kmp_test_lock(&queue->tq_queue_lck, global_tid)) {
1140 if (!(queue->tq_flags & TQF_DEALLOCATED)) {
1141 queue->tq_flags |= TQF_DEALLOCATED;
1142 __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1144 __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE);
1146 /* Can't do any more here since can't be sure where sibling queue
1147 * is so just exit this level */
1150 __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1153 /* otherwise, just fall through and decrement reference count */
// Re-acquire the parent's link lock to safely read the sibling pointer
// and drop the reference taken above.
1157 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1158 // Make sure data structures are in consistent state before querying them
1159 // Seems to work without this for digital/alpha, needed for IBM/RS6000
1162 next = queue->tq_next_child;
1164 ct = --(queue->tq_ref_count);
1166 ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
1167 KMP_DEBUG_ASSERT(ct >= 0);
1172 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1176 /* Starting from indicated queue, proceed downward through tree and remove all
1177 taskqs, assuming all are finished and assuming NO other threads are executing
tasks in any of the queues. */
/* Recursively deallocate every descendant of 'queue' (depth-first).  Per the
   comment above, callers guarantee all tasks are finished and no other thread
   is executing, so no locking or ref-count waiting is performed here
   (in_parallel == FALSE is passed to __kmp_remove_queue_from_tree). */
1179 static void __kmp_remove_all_child_taskq(kmp_taskq_t *tq, kmp_int32 global_tid,
1180 kmpc_task_queue_t *queue) {
1181 kmpc_task_queue_t *next_child;
1183 queue = CCAST(kmpc_task_queue_t *, queue->tq_first_child);
1185 while (queue != NULL) {
// Children first, then this node; save the sibling pointer before the
// node is torn down.
1186 __kmp_remove_all_child_taskq(tq, global_tid, queue);
1188 next_child = queue->tq_next_child;
1189 queue->tq_flags |= TQF_DEALLOCATED;
1190 __kmp_remove_queue_from_tree(tq, global_tid, queue, FALSE);
/* Execute one already-dequeued thunk on the calling thread: push it on this
   thread's curr_thunk stack, invoke its task routine, then pop, recycle the
   thunk, and release the per-thread outstanding-thunk count and (for child
   queues in parallel) the queue reference taken at dequeue time. */
1195 static void __kmp_execute_task_from_queue(kmp_taskq_t *tq, ident_t *loc,
1196 kmp_int32 global_tid,
1197 kmpc_thunk_t *thunk,
1199 kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue;
1200 kmp_int32 tid = __kmp_tid_from_gtid(global_tid);
1202 KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid));
1203 KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1204 KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid));
1205 KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1207 /* For the taskq task, the curr_thunk pushes and pop pairs are set up as
1210 * happens exactly once:
1211 * 1) __kmpc_taskq : push (if returning thunk only)
1212 * 4) __kmpc_end_taskq_task : pop
1214 * optionally happens *each* time taskq task is dequeued/enqueued:
1215 * 2) __kmpc_taskq_task : pop
1216 * 3) __kmp_execute_task_from_queue : push
1218 * execution ordering: 1,(2,3)*,4
1221 if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
// Regular tasks executing from the root queue use this thread's private
// copy of the shared vars; all other queues have a single copy (index 0).
1222 kmp_int32 index = (queue == tq->tq_root) ? tid : 0;
1223 thunk->th.th_shareds =
1224 CCAST(kmpc_shared_vars_t *, queue->tq_shareds[index].ai_data);
1226 if (__kmp_env_consistency_check) {
1227 __kmp_push_workshare(global_tid,
1228 (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered
1233 if (__kmp_env_consistency_check)
1234 __kmp_push_workshare(global_tid, ct_taskq, queue->tq_loc);
// Push this thunk onto the thread's current-thunk stack for the
// duration of the user task body.
1238 thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1239 tq->tq_curr_thunk[tid] = thunk;
1241 KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1244 KF_TRACE(50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk,
1245 queue, global_tid));
1246 thunk->th_task(global_tid, thunk);
1247 KF_TRACE(50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue,
1250 if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1251 if (__kmp_env_consistency_check)
1252 __kmp_pop_workshare(global_tid,
1253 (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered
// Pop the thunk stack and return the thunk to the queue's free list.
1258 tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1259 thunk->th_encl_thunk = NULL;
1260 KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1263 if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) {
1264 __kmp_taskq_check_ordered(global_tid, thunk);
1267 __kmp_free_thunk(queue, thunk, in_parallel, global_tid);
1269 KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n",
1270 global_tid, thunk));
1271 KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1274 KMP_MB(); /* needed so thunk put on free list before outstanding thunk
1275 count is decremented */
1277 KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1);
1281 ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n",
1282 global_tid, queue->tq_th_thunks[tid].ai_data - 1, queue));
1284 queue->tq_th_thunks[tid].ai_data--;
1286 /* KMP_MB(); */ /* is MB really necessary ? */
// Drop the reference on a child queue that was presumably taken when the
// thunk was found/dequeued (root queue and serial queues are not counted).
1289 if (queue->tq.tq_parent != NULL && in_parallel) {
1291 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1292 ct = --(queue->tq_ref_count);
1293 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1295 ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
1296 KMP_DEBUG_ASSERT(ct >= 0);
1301 /* starts a taskq; creates and returns a thunk for the taskq_task */
1302 /* also, returns pointer to shared vars for this thread in "shareds" arg */
/* Compiler entry point: start a taskq construct.
 *
 * Allocates a new task queue sized for the current team, links it into the
 * taskq tree (as root, or as a child of the enclosing queue), builds the
 * thunk that will run the taskq task body, and returns that thunk.  The
 * per-thread shared-variables pointer is returned through 'shareds'.
 *
 * For the outermost parallel taskq, worker threads block in the barrier
 * below while the master builds the root queue; workers then return early
 * with just their shareds pointer (NULL-thunk path handled by the caller /
 * __kmpc_end_taskq). */
1303 kmpc_thunk_t *__kmpc_taskq(ident_t *loc, kmp_int32 global_tid,
1304 kmpc_task_t taskq_task, size_t sizeof_thunk,
1305 size_t sizeof_shareds, kmp_int32 flags,
1306 kmpc_shared_vars_t **shareds) {
1308 kmp_int32 nslots, nthunks, nshareds, nproc;
1309 kmpc_task_queue_t *new_queue, *curr_queue;
1310 kmpc_thunk_t *new_taskq_thunk;
1316 KE_TRACE(10, ("__kmpc_taskq called (%d)\n", global_tid));
1318 th = __kmp_threads[global_tid];
1319 team = th->th.th_team;
1320 tq = &team->t.t_taskq;
1321 nproc = team->t.t_nproc;
1322 tid = __kmp_tid_from_gtid(global_tid);
1324 /* find out whether this is a parallel taskq or serialized one. */
1325 in_parallel = in_parallel_context(team);
1329 /* Vector ORDERED SECTION to taskq version */
1330 th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1332 /* Vector ORDERED SECTION to taskq version */
1333 th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1337 // This shouldn't be a barrier region boundary, it will confuse the user.
1338 /* Need the boundary to be at the end taskq instead. */
1339 if (__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) {
1340 /* Creating the active root queue, and we are not the master thread. */
1341 /* The master thread below created the queue and tasks have been */
1342 /* enqueued, and the master thread released this barrier. This */
1343 /* worker thread can now proceed and execute tasks. See also the */
1344 /* TQF_RELEASE_WORKERS which is used to handle this case. */
1346 CCAST(kmpc_shared_vars_t *, tq->tq_root->tq_shareds[tid].ai_data);
1347 KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid));
1353 /* master thread only executes this code */
// (Re)allocate the per-thread curr_thunk stack array if the team grew.
1354 if (tq->tq_curr_thunk_capacity < nproc) {
1355 if (tq->tq_curr_thunk)
1356 __kmp_free(tq->tq_curr_thunk);
1358 /* only need to do this once at outer level, i.e. when tq_curr_thunk is
1360 __kmp_init_lock(&tq->tq_freelist_lck);
1364 (kmpc_thunk_t **)__kmp_allocate(nproc * sizeof(kmpc_thunk_t *));
1365 tq->tq_curr_thunk_capacity = nproc;
// Flag consumed by __kmpc_end_taskq to release workers from the barrier.
1369 tq->tq_global_flags = TQF_RELEASE_WORKERS;
1372 /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */
1373 /* on some heuristics (e.g., depth of queue nesting?). */
1374 nslots = (in_parallel) ? (2 * nproc) : 1;
1376 /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */
1377 /* jobs being executed by other threads, and one extra for taskq slot */
1378 nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1)
1381 /* Only the root taskq gets a per-thread array of shareds. */
1382 /* The rest of the taskq's only get one copy of the shared vars. */
1383 nshareds = (!tq->tq_root && in_parallel) ? nproc : 1;
1385 /* create overall queue data structure and its components that require
1387 new_queue = __kmp_alloc_taskq(tq, in_parallel, nslots, nthunks, nshareds,
1388 nproc, sizeof_thunk, sizeof_shareds,
1389 &new_taskq_thunk, global_tid);
1391 /* rest of new_queue initializations */
1392 new_queue->tq_flags = flags & TQF_INTERFACE_FLAGS;
1395 new_queue->tq_tasknum_queuing = 0;
1396 new_queue->tq_tasknum_serving = 0;
1397 new_queue->tq_flags |= TQF_PARALLEL_CONTEXT;
1400 new_queue->tq_taskq_slot = NULL;
1401 new_queue->tq_nslots = nslots;
1402 new_queue->tq_hiwat = HIGH_WATER_MARK(nslots);
1403 new_queue->tq_nfull = 0;
1404 new_queue->tq_head = 0;
1405 new_queue->tq_tail = 0;
1406 new_queue->tq_loc = loc;
1408 if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) {
1409 /* prepare to serve the first-queued task's ORDERED directive */
1410 new_queue->tq_tasknum_serving = 1;
1412 /* Vector ORDERED SECTION to taskq version */
1413 th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1415 /* Vector ORDERED SECTION to taskq version */
1416 th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1419 /* create a new thunk for the taskq_task in the new_queue */
1420 *shareds = CCAST(kmpc_shared_vars_t *, new_queue->tq_shareds[0].ai_data);
1422 new_taskq_thunk->th.th_shareds = *shareds;
1423 new_taskq_thunk->th_task = taskq_task;
1424 new_taskq_thunk->th_flags = new_queue->tq_flags | TQF_TASKQ_TASK;
1425 new_taskq_thunk->th_status = 0;
1427 KMP_DEBUG_ASSERT(new_taskq_thunk->th_flags & TQF_TASKQ_TASK);
1429 // Make sure these inits complete before threads start using this queue
1430 /* KMP_MB(); */ // (necessary?)
1432 /* insert the new task queue into the tree, but only after all fields
// Root-queue case: no parent, becomes tq_root.
1437 new_queue->tq.tq_parent = NULL;
1438 new_queue->tq_first_child = NULL;
1439 new_queue->tq_next_child = NULL;
1440 new_queue->tq_prev_child = NULL;
1441 new_queue->tq_ref_count = 1;
1442 tq->tq_root = new_queue;
// Nested case: enclosing queue comes from this thread's current thunk;
// link the new queue at the head of the parent's child list.
1444 curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
1445 new_queue->tq.tq_parent = curr_queue;
1446 new_queue->tq_first_child = NULL;
1447 new_queue->tq_prev_child = NULL;
1448 new_queue->tq_ref_count =
1449 1; /* for this the thread that built the queue */
1451 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n", __LINE__,
1452 global_tid, new_queue, new_queue->tq_ref_count));
1454 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1456 // Make sure data structures are in consistent state before querying them
1457 // Seems to work without this for digital/alpha, needed for IBM/RS6000
1460 new_queue->tq_next_child =
1461 CCAST(struct kmpc_task_queue_t *, curr_queue->tq_first_child);
1463 if (curr_queue->tq_first_child != NULL)
1464 curr_queue->tq_first_child->tq_prev_child = new_queue;
1466 curr_queue->tq_first_child = new_queue;
1468 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1471 /* set up thunk stack only after code that determines curr_queue above */
1472 new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1473 tq->tq_curr_thunk[tid] = new_taskq_thunk;
1475 KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
// Serialized path: unattached queue with no tree links.
// NOTE(review): source appears elided around here -- TODO confirm branch
// structure against upstream kmp_taskq.cpp.
1477 new_taskq_thunk->th_encl_thunk = 0;
1478 new_queue->tq.tq_parent = NULL;
1479 new_queue->tq_first_child = NULL;
1480 new_queue->tq_next_child = NULL;
1481 new_queue->tq_prev_child = NULL;
1482 new_queue->tq_ref_count = 1;
1486 KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid));
1487 KF_DUMP(150, __kmp_dump_thunk(tq, new_taskq_thunk, global_tid));
1491 ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1493 KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue,
1497 KF_DUMP(25, __kmp_dump_task_queue(tq, new_queue, global_tid));
1500 KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1502 #endif /* KMP_DEBUG */
1504 if (__kmp_env_consistency_check)
1505 __kmp_push_workshare(global_tid, ct_taskq, new_queue->tq_loc);
1507 KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid));
1509 return new_taskq_thunk;
1512 /* ends a taskq; last thread out destroys the queue */
/* Compiler entry point: end a taskq construct.
 *
 * All threads drain the queue (and steal from descendants) until every task
 * is queued, dequeued and executed.  For a NOWAIT child queue the thread may
 * return early after dropping its reference; otherwise the queue is marked
 * TQF_DEALLOCATED and removed by its allocating thread.  For the outermost
 * queue, the master destroys the remaining tree after a barrier and releases
 * the workers.  In the serialized case the (single) last task is dequeued
 * and executed here, then the unattached queue is freed. */
1514 void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
1515 kmpc_thunk_t *taskq_thunk) {
1522 kmp_int32 is_outermost;
1523 kmpc_task_queue_t *queue;
1524 kmpc_thunk_t *thunk;
1527 KE_TRACE(10, ("__kmpc_end_taskq called (%d)\n", global_tid));
1529 tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1530 nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
1532 /* For the outermost taskq only, all but one thread will have taskq_thunk ==
1534 queue = (taskq_thunk == NULL) ? tq->tq_root
1535 : taskq_thunk->th.th_shareds->sv_queue;
1537 KE_TRACE(50, ("__kmpc_end_taskq queue=%p (%d) \n", queue, global_tid));
1538 is_outermost = (queue == tq->tq_root);
1539 in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1544 /* this is just a safeguard to release the waiting threads if */
1545 /* the outermost taskq never queues a task */
1547 if (is_outermost && (KMP_MASTER_GTID(global_tid))) {
1548 if (tq->tq_global_flags & TQF_RELEASE_WORKERS) {
1549 /* no lock needed, workers are still in spin mode */
1550 tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;
1552 __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1556 /* keep dequeueing work until all tasks are queued and dequeued */
1559 /* wait until something is available to dequeue */
1560 KMP_INIT_YIELD(spins);
// Spin (with yields) while the queue is empty, the taskq task is not
// re-enqueued, there are no descendants to steal from, and generation
// has not finished.
1562 while ((queue->tq_nfull == 0) && (queue->tq_taskq_slot == NULL) &&
1563 (!__kmp_taskq_has_any_children(queue)) &&
1564 (!(queue->tq_flags & TQF_ALL_TASKS_QUEUED))) {
1565 KMP_YIELD_WHEN(TRUE, spins);
1568 /* check to see if we can execute tasks in the queue */
1569 while (((queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL)) &&
1570 (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL) {
1571 KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk,
1572 queue, global_tid));
1573 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1576 /* see if work found can be found in a descendant queue */
1577 if ((__kmp_taskq_has_any_children(queue)) &&
1578 (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
1582 ("Stole thunk: %p in descendant queue: %p while waiting in "
1584 thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1586 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1589 } while ((!(queue->tq_flags & TQF_ALL_TASKS_QUEUED)) ||
1590 (queue->tq_nfull != 0));
1592 KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue,
1595 /* wait while all tasks are not finished and more work found
1596 in descendant queues */
1598 while ((!__kmp_taskq_tasks_finished(queue)) &&
1599 (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
1602 KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in "
1604 thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1606 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1609 KF_TRACE(50, ("No work found in descendent queues or all work finished in "
1611 queue, global_tid));
1613 if (!is_outermost) {
1614 /* need to return if NOWAIT present and not outermost taskq */
1616 if (queue->tq_flags & TQF_IS_NOWAIT) {
// NOWAIT: just drop this thread's reference (under the parent's link
// lock) and return; the queue is reclaimed later by the tree walks.
1617 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1618 queue->tq_ref_count--;
1619 KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0);
1620 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1623 10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid));
1628 __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
1630 /* WAIT until all tasks are finished and no child queues exist before
1632 KMP_INIT_YIELD(spins);
1634 while (!__kmp_taskq_tasks_finished(queue) ||
1635 __kmp_taskq_has_any_children(queue)) {
1636 thunk = __kmp_find_task_in_ancestor_queue(tq, global_tid, queue);
1638 if (thunk != NULL) {
1640 ("Stole thunk: %p in ancestor queue: %p while waiting in "
1642 thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1643 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk,
1647 KMP_YIELD_WHEN(thunk == NULL, spins);
1649 __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
// Mark for deallocation under the queue lock so only one thread wins.
1652 __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1653 if (!(queue->tq_flags & TQF_DEALLOCATED)) {
1654 queue->tq_flags |= TQF_DEALLOCATED;
1656 __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1658 /* only the allocating thread can deallocate the queue */
1659 if (taskq_thunk != NULL) {
1660 __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE);
1665 ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n",
1671 // Outermost Queue: steal work from descendants until all tasks are finished
1673 KMP_INIT_YIELD(spins);
1675 while (!__kmp_taskq_tasks_finished(queue)) {
1676 thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
1678 if (thunk != NULL) {
1680 ("Stole thunk: %p in descendant queue: %p while waiting in "
1682 thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1684 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1687 KMP_YIELD_WHEN(thunk == NULL, spins);
1690 /* Need this barrier to prevent destruction of queue before threads have all
1691 * executed above code */
1692 /* This may need to be done earlier when NOWAIT is implemented for the
1693 * outermost level */
1695 if (!__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) {
1696 /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here; */
1697 /* for right now, everybody waits, and the master thread destroys the */
1698 /* remaining queues. */
1700 __kmp_remove_all_child_taskq(tq, global_tid, queue);
1702 /* Now destroy the root queue */
1703 KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n",
1704 global_tid, queue));
1705 KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1708 /* the root queue entry */
1709 KMP_DEBUG_ASSERT((queue->tq.tq_parent == NULL) &&
1710 (queue->tq_next_child == NULL));
1712 /* children must all be gone by now because of barrier above */
1713 KMP_DEBUG_ASSERT(queue->tq_first_child == NULL);
// Sanity: no thread still holds outstanding thunks for this queue.
1715 for (i = 0; i < nproc; i++) {
1716 KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
// Sanity: every thunk has been returned to the free list.
1719 for (i = 0, thunk = queue->tq_free_thunks; thunk != NULL;
1720 i++, thunk = thunk->th.th_next_free)
1723 KMP_DEBUG_ASSERT(i ==
1724 queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH));
1726 for (i = 0; i < nproc; i++) {
1727 KMP_DEBUG_ASSERT(!tq->tq_curr_thunk[i]);
1730 /* unlink the root queue entry */
1733 /* release storage for root queue entry */
1734 KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue,
1737 queue->tq_flags |= TQF_DEALLOCATED;
1738 __kmp_free_taskq(tq, queue, in_parallel, global_tid);
1740 KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1742 /* release the workers now that the data structures are up to date */
1743 __kmp_end_split_barrier(bs_plain_barrier, global_tid);
// Restore the default (non-taskq) ORDERED handlers on this thread.
1746 th = __kmp_threads[global_tid];
1748 /* Reset ORDERED SECTION to parallel version */
1749 th->th.th_dispatch->th_deo_fcn = 0;
1751 /* Reset ORDERED SECTION to parallel version */
1752 th->th.th_dispatch->th_dxo_fcn = 0;
1754 /* in serial execution context, dequeue the last task */
1755 /* and execute it, if there were any tasks encountered */
1757 if (queue->tq_nfull > 0) {
1758 KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1760 thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1762 if (queue->tq_flags & TQF_IS_LAST_TASK) {
1763 /* TQF_IS_LASTPRIVATE, one thing in queue, __kmpc_end_taskq_task() */
1764 /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
1765 /* instrumentation does copy-out. */
1767 /* no need for test_then_or call since already locked */
1768 thunk->th_flags |= TQF_IS_LAST_TASK;
1771 KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
1774 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1777 // destroy the unattached serial queue now that there is no more work to do
1778 KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n",
1779 queue, global_tid));
1780 KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1784 for (thunk = queue->tq_free_thunks; thunk != NULL;
1785 thunk = thunk->th.th_next_free)
1787 KMP_DEBUG_ASSERT(i == queue->tq_nslots + 1);
1789 /* release storage for unattached serial queue */
1791 ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid));
1793 queue->tq_flags |= TQF_DEALLOCATED;
1794 __kmp_free_taskq(tq, queue, in_parallel, global_tid);
1797 KE_TRACE(10, ("__kmpc_end_taskq return (%d)\n", global_tid));
1800 /* Enqueues a task for thunk previously created by __kmpc_task_buffer. */
1801 /* Returns nonzero if just filled up queue */
/* Compiler entry point: enqueue a regular task whose thunk was obtained from
 * __kmpc_task_buffer.  In a serialized taskq the previously buffered task is
 * executed here first so the one-slot queue never overflows.  Returns the
 * result of __kmp_enqueue_task (per the comment above: nonzero if the queue
 * just became full). */
1803 kmp_int32 __kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk) {
1805 kmpc_task_queue_t *queue;
1809 KE_TRACE(10, ("__kmpc_task called (%d)\n", global_tid));
1811 KMP_DEBUG_ASSERT(!(thunk->th_flags &
1812 TQF_TASKQ_TASK)); /* thunk->th_task is a regular task */
1814 tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1815 queue = thunk->th.th_shareds->sv_queue;
1816 in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
// Ordered taskq: assign this task its ticket in queuing order.
1818 if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED))
1819 thunk->th_tasknum = ++queue->tq_tasknum_queuing;
1821 /* For serial execution dequeue the preceding task and execute it, if one
1823 /* This cannot be the last task. That one is handled in __kmpc_end_taskq */
1825 if (!in_parallel && queue->tq_nfull > 0) {
1826 kmpc_thunk_t *prev_thunk;
1828 KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1830 prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1832 KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
1833 prev_thunk, queue));
1835 __kmp_execute_task_from_queue(tq, loc, global_tid, prev_thunk, in_parallel);
1838 /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private
1839 variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the
1840 task queue is not full and allocates a thunk (which is then passed to
1841 __kmpc_task()). So, the enqueue below should never fail due to a full
1844 KF_TRACE(100, ("After enqueueing this Task on (%d):\n", global_tid));
1845 KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1847 ret = __kmp_enqueue_task(tq, global_tid, queue, thunk, in_parallel);
1849 KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid));
1850 KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1852 KE_TRACE(10, ("__kmpc_task return (%d)\n", global_tid));
1857 /* enqueues a taskq_task for thunk previously created by __kmpc_taskq */
1858 /* this should never be called unless in a parallel context */
/* Compiler entry point: re-enqueue the taskq task (the task-generating body)
 * into the queue's dedicated tq_taskq_slot, popping it off this thread's
 * curr_thunk stack first.  Per the comment above, only valid in a parallel
 * context. */
1860 void __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk,
1862 kmpc_task_queue_t *queue;
1863 kmp_taskq_t *tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1864 int tid = __kmp_tid_from_gtid(global_tid);
1866 KE_TRACE(10, ("__kmpc_taskq_task called (%d)\n", global_tid));
1867 KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid));
1868 KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1870 queue = thunk->th.th_shareds->sv_queue;
1872 if (__kmp_env_consistency_check)
1873 __kmp_pop_workshare(global_tid, ct_taskq, loc);
1875 /* thunk->th_task is the taskq_task */
1876 KMP_DEBUG_ASSERT(thunk->th_flags & TQF_TASKQ_TASK);
1878 /* not supposed to call __kmpc_taskq_task if it's already enqueued */
1879 KMP_DEBUG_ASSERT(queue->tq_taskq_slot == NULL);
1881 /* dequeue taskq thunk from curr_thunk stack */
1882 tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1883 thunk->th_encl_thunk = NULL;
1885 KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
// 'status' records the taskq task's progress state for its next dispatch.
1887 thunk->th_status = status;
1889 // Flush thunk->th_status before taskq_task enqueued to avoid race condition
1892 /* enqueue taskq_task in thunk into special slot in queue */
1893 /* GEH - probably don't need to lock taskq slot since only one */
1894 /* thread enqueues & already a lock set at dequeue point */
1896 queue->tq_taskq_slot = thunk;
1898 KE_TRACE(10, ("__kmpc_taskq_task return (%d)\n", global_tid));
1901 /* ends a taskq_task; done generating tasks */
/* Compiler entry point: the taskq task has finished generating tasks.
 * Sets TQF_ALL_TASKS_QUEUED (atomically on x86, under the queue lock
 * elsewhere), propagates lastprivate handling via TQF_IS_LAST_TASK, and pops
 * the taskq thunk off this thread's curr_thunk stack. */
1903 void __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid,
1904 kmpc_thunk_t *thunk) {
1906 kmpc_task_queue_t *queue;
1910 KE_TRACE(10, ("__kmpc_end_taskq_task called (%d)\n", global_tid));
1912 tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1913 queue = thunk->th.th_shareds->sv_queue;
1914 in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1915 tid = __kmp_tid_from_gtid(global_tid);
1917 if (__kmp_env_consistency_check)
1918 __kmp_pop_workshare(global_tid, ct_taskq, loc);
// x86/x86_64 can OR the flag in with a single atomic; other targets take
// the queue lock instead.
1921 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1922 KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
1923 TQF_ALL_TASKS_QUEUED);
1926 __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1928 // Make sure data structures are in consistent state before querying them
1929 // Seems to work without this for digital/alpha, needed for IBM/RS6000
1932 queue->tq_flags |= TQF_ALL_TASKS_QUEUED;
1933 __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1938 if (thunk->th_flags & TQF_IS_LASTPRIVATE) {
1939 /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task in
1940 the queue if TQF_IS_LASTPRIVATE so we can positively identify that last
1941 task and run it with its TQF_IS_LAST_TASK bit turned on in th_flags.
1942 When __kmpc_end_taskq_task() is called we are done generating all the
1943 tasks, so we know the last one in the queue is the lastprivate task.
1944 Mark the queue as having gotten to this state via tq_flags &
1945 TQF_IS_LAST_TASK; when that task actually executes mark it via th_flags &
1946 TQF_IS_LAST_TASK (this th_flags bit signals the instrumented code to do
1947 copy-outs after execution). */
1949 /* No synchronization needed for serial context */
1950 queue->tq_flags |= TQF_IS_LAST_TASK;
1952 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1953 KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
1957 __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1959 // Make sure data structures in consistent state before querying them
1960 // Seems to work without this for digital/alpha, needed for IBM/RS6000
1963 queue->tq_flags |= TQF_IS_LAST_TASK;
1964 __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1967 /* to prevent race condition where last task is dequeued but */
1968 /* flag isn't visible yet (not sure about this) */
1973 /* dequeue taskq thunk from curr_thunk stack */
1975 tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1976 thunk->th_encl_thunk = NULL;
1978 KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1981 KE_TRACE(10, ("__kmpc_end_taskq_task return (%d)\n", global_tid));
1984 /* returns thunk for a regular task based on taskq_thunk */
1985 /* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK) */
1987 kmpc_thunk_t *__kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid,
1988 kmpc_thunk_t *taskq_thunk, kmpc_task_t task) {
1990 kmpc_task_queue_t *queue;
1991 kmpc_thunk_t *new_thunk;
1994 KE_TRACE(10, ("__kmpc_task_buffer called (%d)\n", global_tid));
1997 taskq_thunk->th_flags &
1998 TQF_TASKQ_TASK); /* taskq_thunk->th_task is the taskq_task */
2000 tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
2001 queue = taskq_thunk->th.th_shareds->sv_queue;
2002 in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
2004 /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private
2005 variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the
2006 task queue is not full and allocates a thunk (which is then passed to
2007 __kmpc_task()). So, we can pre-allocate a thunk here assuming it will be
2008 the next to be enqueued in __kmpc_task(). */
2010 new_thunk = __kmp_alloc_thunk(queue, in_parallel, global_tid);
2011 new_thunk->th.th_shareds =
2012 CCAST(kmpc_shared_vars_t *, queue->tq_shareds[0].ai_data);
2013 new_thunk->th_encl_thunk = NULL;
2014 new_thunk->th_task = task;
2016 /* GEH - shouldn't need to lock the read of tq_flags here */
2017 new_thunk->th_flags = queue->tq_flags & TQF_INTERFACE_FLAGS;
2019 new_thunk->th_status = 0;
2021 KMP_DEBUG_ASSERT(!(new_thunk->th_flags & TQF_TASKQ_TASK));
2023 KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid));
2024 KF_DUMP(100, __kmp_dump_thunk(tq, new_thunk, global_tid));
2026 KE_TRACE(10, ("__kmpc_task_buffer return (%d)\n", global_tid));