LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 /* forward declarations */
25 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
26  kmp_info_t *this_thr);
27 static void __kmp_alloc_task_deque(kmp_info_t *thread,
28  kmp_thread_data_t *thread_data);
29 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
30  kmp_task_team_t *task_team);
31 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
32 
33 #ifdef BUILD_TIED_TASK_STACK
34 
35 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
36 // from top to bottom
37 //
38 // gtid: global thread identifier for thread containing stack
39 // thread_data: thread data for task team thread containing stack
40 // threshold: value above which the trace statement triggers
41 // location: string identifying call site of this function (for trace)
42 static void __kmp_trace_task_stack(kmp_int32 gtid,
43  kmp_thread_data_t *thread_data,
44  int threshold, char *location) {
45  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
46  kmp_taskdata_t **stack_top = task_stack->ts_top;
47  kmp_int32 entries = task_stack->ts_entries;
48  kmp_taskdata_t *tied_task;
49 
50  KA_TRACE(
51  threshold,
52  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
53  "first_block = %p, stack_top = %p \n",
54  location, gtid, entries, task_stack->ts_first_block, stack_top));
55 
56  KMP_DEBUG_ASSERT(stack_top != NULL);
57  KMP_DEBUG_ASSERT(entries > 0);
58 
59  while (entries != 0) {
60  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
61  // fix up ts_top if we need to pop from previous block
62  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
63  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
64 
65  stack_block = stack_block->sb_prev;
66  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
67  }
68 
69  // finish bookkeeping
70  stack_top--;
71  entries--;
72 
73  tied_task = *stack_top;
74 
75  KMP_DEBUG_ASSERT(tied_task != NULL);
76  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
77 
78  KA_TRACE(threshold,
79  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
80  "stack_top=%p, tied_task=%p\n",
81  location, gtid, entries, stack_top, tied_task));
82  }
83  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
84 
85  KA_TRACE(threshold,
86  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
87  location, gtid));
88 }
89 
90 // __kmp_init_task_stack: initialize the task stack for the first time
91 // after a thread_data structure is created.
92 // It should not be necessary to do this again (assuming the stack works).
93 //
94 // gtid: global thread identifier of calling thread
95 // thread_data: thread data for task team thread containing stack
96 static void __kmp_init_task_stack(kmp_int32 gtid,
97  kmp_thread_data_t *thread_data) {
98  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
99  kmp_stack_block_t *first_block;
100 
101  // set up the first block of the stack
102  first_block = &task_stack->ts_first_block;
103  task_stack->ts_top = (kmp_taskdata_t **)first_block;
104  memset((void *)first_block, '\0',
105  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
106 
107  // initialize the stack to be empty
108  task_stack->ts_entries = TASK_STACK_EMPTY;
109  first_block->sb_next = NULL;
110  first_block->sb_prev = NULL;
111 }
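// A sketch of the data structure managed by these routines (inferred from the
// operations in this file): the suspended-tied-task stack is a doubly linked
// list of fixed-size blocks, each holding TASK_STACK_BLOCK_SIZE entries of
// kmp_taskdata_t *; the first block is embedded in the kmp_task_stack_t itself.
//
//   ts_first_block -> [ sb_block[0..SIZE-1] | sb_prev | sb_next ] <-> ...
//   ts_top     : next free slot (a push stores to *ts_top, then increments it)
//   ts_entries : number of tied tasks currently on the stack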
112 
113 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
114 //
115 // gtid: global thread identifier for calling thread
116 // thread_data: thread info for thread containing stack
117 static void __kmp_free_task_stack(kmp_int32 gtid,
118  kmp_thread_data_t *thread_data) {
119  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
120  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
121 
122  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
123  // free from the second block of the stack
124  while (stack_block != NULL) {
125  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
126 
127  stack_block->sb_next = NULL;
128  stack_block->sb_prev = NULL;
129  if (stack_block != &task_stack->ts_first_block) {
130  __kmp_thread_free(__kmp_threads[gtid],
131  stack_block); // free the block, if not the first
132  }
133  stack_block = next_block;
134  }
135  // initialize the stack to be empty
136  task_stack->ts_entries = 0;
137  task_stack->ts_top = NULL;
138 }
139 
140 // __kmp_push_task_stack: Push the tied task onto the task stack.
141 // Grow the stack if necessary by allocating another block.
142 //
143 // gtid: global thread identifier for calling thread
144 // thread: thread info for thread containing stack
145 // tied_task: the task to push on the stack
146 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
147  kmp_taskdata_t *tied_task) {
148  // GEH - need to consider what to do if tt_threads_data not allocated yet
149  kmp_thread_data_t *thread_data =
150  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
151  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
152 
153  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
154  return; // Don't push anything on stack if team or team tasks are serialized
155  }
156 
157  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
158  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
159 
160  KA_TRACE(20,
161  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
162  gtid, thread, tied_task));
163  // Store entry
164  *(task_stack->ts_top) = tied_task;
165 
166  // Do bookkeeping for next push
167  task_stack->ts_top++;
168  task_stack->ts_entries++;
169 
170  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
171  // Find beginning of this task block
172  kmp_stack_block_t *stack_block =
173  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
174 
175  // Check if we already have a block
176  if (stack_block->sb_next !=
177  NULL) { // reset ts_top to beginning of next block
178  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
179  } else { // Alloc new block and link it up
180  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
181  thread, sizeof(kmp_stack_block_t));
182 
183  task_stack->ts_top = &new_block->sb_block[0];
184  stack_block->sb_next = new_block;
185  new_block->sb_prev = stack_block;
186  new_block->sb_next = NULL;
187 
188  KA_TRACE(
189  30,
190  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
191  gtid, tied_task, new_block));
192  }
193  }
194  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
195  tied_task));
196 }
197 
198 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
199 // the task, just check to make sure it matches the ending task passed in.
200 //
201 // gtid: global thread identifier for the calling thread
202 // thread: thread info structure containing stack
203 // tied_task: the task popped off the stack
204 // ending_task: the task that is ending (should match popped task)
205 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
206  kmp_taskdata_t *ending_task) {
207  // GEH - need to consider what to do if tt_threads_data not allocated yet
208  kmp_thread_data_t *thread_data =
209  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
210  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
211  kmp_taskdata_t *tied_task;
212 
213  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
214  // Don't pop anything from stack if team or team tasks are serialized
215  return;
216  }
217 
218  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
219  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
220 
221  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
222  thread));
223 
224  // fix up ts_top if we need to pop from previous block
225  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
226  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
227 
228  stack_block = stack_block->sb_prev;
229  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
230  }
231 
232  // finish bookkeeping
233  task_stack->ts_top--;
234  task_stack->ts_entries--;
235 
236  tied_task = *(task_stack->ts_top);
237 
238  KMP_DEBUG_ASSERT(tied_task != NULL);
239  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
240  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
241 
242  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
243  tied_task));
244  return;
245 }
246 #endif /* BUILD_TIED_TASK_STACK */
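// Note: BUILD_TIED_TASK_STACK appears to be debug-only support that is left
// undefined in normal builds, so the task-stack routines above are compiled
// out by default.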
247 
248 // returns 1 if new task is allowed to execute, 0 otherwise
249 // checks Task Scheduling constraint (if requested) and
250 // mutexinoutset dependencies if any
251 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
252  const kmp_taskdata_t *tasknew,
253  const kmp_taskdata_t *taskcurr) {
254  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
255  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
256  // only descendant of all deferred tied tasks can be scheduled, checking
257  // the last one is enough, as it in turn is the descendant of all others
258  kmp_taskdata_t *current = taskcurr->td_last_tied;
259  KMP_DEBUG_ASSERT(current != NULL);
260  // check if the task is not suspended on barrier
261  if (current->td_flags.tasktype == TASK_EXPLICIT ||
262  current->td_taskwait_thread > 0) { // <= 0 on barrier
263  kmp_int32 level = current->td_level;
264  kmp_taskdata_t *parent = tasknew->td_parent;
265  while (parent != current && parent->td_level > level) {
266  // check generation up to the level of the current task
267  parent = parent->td_parent;
268  KMP_DEBUG_ASSERT(parent != NULL);
269  }
270  if (parent != current)
271  return false;
272  }
273  }
274  // Check mutexinoutset dependencies, acquire locks
275  kmp_depnode_t *node = tasknew->td_depnode;
276  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
277  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
278  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
279  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
280  continue;
281  // could not get the lock, release previous locks
282  for (int j = i - 1; j >= 0; --j)
283  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
284  return false;
285  }
286  // negative num_locks means all locks acquired successfully
287  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
288  }
289  return true;
290 }
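// Illustrative example of the Task Scheduling Constraint check above: if the
// last deferred tied task recorded in taskcurr->td_last_tied was created at
// nesting level 2, a candidate tied task passes the check only if walking its
// ancestor chain down to level 2 reaches exactly that task, i.e. the candidate
// is a descendant of every deferred tied task of the current thread.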
291 
292 // __kmp_realloc_task_deque:
293 // Re-allocates a task deque for a particular thread, copies the content from
294 // the old deque and adjusts the necessary data structures relating to the
295 // deque. This operation must be done with the deque_lock being held
296 static void __kmp_realloc_task_deque(kmp_info_t *thread,
297  kmp_thread_data_t *thread_data) {
298  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
299  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
300  kmp_int32 new_size = 2 * size;
301 
302  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
303  "%d] for thread_data %p\n",
304  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
305 
306  kmp_taskdata_t **new_deque =
307  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
308 
309  int i, j;
310  for (i = thread_data->td.td_deque_head, j = 0; j < size;
311  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
312  new_deque[j] = thread_data->td.td_deque[i];
313 
314  __kmp_free(thread_data->td.td_deque);
315 
316  thread_data->td.td_deque_head = 0;
317  thread_data->td.td_deque_tail = size;
318  thread_data->td.td_deque = new_deque;
319  thread_data->td.td_deque_size = new_size;
320 }
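// After the reallocation above the live tasks occupy slots [0, size) of the
// new deque in their original order, so head is reset to 0 and tail to size.
// TASK_DEQUE_SIZE/TASK_DEQUE_MASK are presumably derived from td_deque_size,
// which stays a power of two so that index wrapping can use a bit mask.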
321 
322 // __kmp_push_task: Add a task to the thread's deque
323 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
324  kmp_info_t *thread = __kmp_threads[gtid];
325  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
326 
327  // No need to map to the shadow gtid if this is already a hidden helper thread
328  if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) {
329  gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
330  thread = __kmp_threads[gtid];
331  }
332 
333  kmp_task_team_t *task_team = thread->th.th_task_team;
334  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
335  kmp_thread_data_t *thread_data;
336 
337  KA_TRACE(20,
338  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
339 
340  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
341  // untied task needs to increment counter so that the task structure is not
342  // freed prematurely
343  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
344  KMP_DEBUG_USE_VAR(counter);
345  KA_TRACE(
346  20,
347  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
348  gtid, counter, taskdata));
349  }
350 
351  // The first check avoids building task_team thread data if serialized
352  if (UNLIKELY(taskdata->td_flags.task_serial)) {
353  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
354  "TASK_NOT_PUSHED for task %p\n",
355  gtid, taskdata));
356  return TASK_NOT_PUSHED;
357  }
358 
359  // Now that serialized tasks have returned, we can assume that we are not in
360  // immediate exec mode
361  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
362  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
363  __kmp_enable_tasking(task_team, thread);
364  }
365  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
366  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
367 
368  // Find tasking deque specific to encountering thread
369  thread_data = &task_team->tt.tt_threads_data[tid];
370 
371  // No lock needed since only owner can allocate. If the task is hidden_helper,
372  // we don't need it either because we have initialized the deque for hidden
373  // helper thread data.
374  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
375  __kmp_alloc_task_deque(thread, thread_data);
376  }
377 
378  int locked = 0;
379  // Check if deque is full
380  if (TCR_4(thread_data->td.td_deque_ntasks) >=
381  TASK_DEQUE_SIZE(thread_data->td)) {
382  if (__kmp_enable_task_throttling &&
383  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
384  thread->th.th_current_task)) {
385  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
386  "TASK_NOT_PUSHED for task %p\n",
387  gtid, taskdata));
388  return TASK_NOT_PUSHED;
389  } else {
390  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
391  locked = 1;
392  if (TCR_4(thread_data->td.td_deque_ntasks) >=
393  TASK_DEQUE_SIZE(thread_data->td)) {
394  // expand deque to push the task which is not allowed to execute
395  __kmp_realloc_task_deque(thread, thread_data);
396  }
397  }
398  }
399  // Lock the deque for the task push operation
400  if (!locked) {
401  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
402  // Need to recheck as we can get a proxy task from thread outside of OpenMP
403  if (TCR_4(thread_data->td.td_deque_ntasks) >=
404  TASK_DEQUE_SIZE(thread_data->td)) {
405  if (__kmp_enable_task_throttling &&
406  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
407  thread->th.th_current_task)) {
408  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
409  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
410  "returning TASK_NOT_PUSHED for task %p\n",
411  gtid, taskdata));
412  return TASK_NOT_PUSHED;
413  } else {
414  // expand deque to push the task which is not allowed to execute
415  __kmp_realloc_task_deque(thread, thread_data);
416  }
417  }
418  }
419  // Must have room since no thread other than the calling thread can add tasks
420  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
421  TASK_DEQUE_SIZE(thread_data->td));
422 
423  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
424  taskdata; // Push taskdata
425  // Wrap index.
426  thread_data->td.td_deque_tail =
427  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
428  TCW_4(thread_data->td.td_deque_ntasks,
429  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
430  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
431  KMP_FSYNC_RELEASING(taskdata); // releasing child
432  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
433  "task=%p ntasks=%d head=%u tail=%u\n",
434  gtid, taskdata, thread_data->td.td_deque_ntasks,
435  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
436 
437  auto hidden_helper = taskdata->td_flags.hidden_helper;
438 
439  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
440 
441  // Signal one worker thread to execute the task
442  if (UNLIKELY(hidden_helper)) {
443  // Wake hidden helper threads up if they're sleeping
444  __kmp_hidden_helper_worker_thread_signal();
445  }
446 
447  return TASK_SUCCESSFULLY_PUSHED;
448 }
449 
450 // __kmp_pop_current_task_from_thread: set up current task from called thread
451 // when team ends
452 //
453 // this_thr: thread structure to set current_task in.
454 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
455  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
456  "this_thread=%p, curtask=%p, "
457  "curtask_parent=%p\n",
458  0, this_thr, this_thr->th.th_current_task,
459  this_thr->th.th_current_task->td_parent));
460 
461  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
462 
463  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
464  "this_thread=%p, curtask=%p, "
465  "curtask_parent=%p\n",
466  0, this_thr, this_thr->th.th_current_task,
467  this_thr->th.th_current_task->td_parent));
468 }
469 
470 // __kmp_push_current_task_to_thread: set up current task in called thread for a
471 // new team
472 //
473 // this_thr: thread structure to set up
474 // team: team for implicit task data
475 // tid: thread within team to set up
476 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
477  int tid) {
478  // The current task of the thread is the parent of the newly created implicit
479  // tasks of the new team
480  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
481  "curtask=%p "
482  "parent_task=%p\n",
483  tid, this_thr, this_thr->th.th_current_task,
484  team->t.t_implicit_task_taskdata[tid].td_parent));
485 
486  KMP_DEBUG_ASSERT(this_thr != NULL);
487 
488  if (tid == 0) {
489  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
490  team->t.t_implicit_task_taskdata[0].td_parent =
491  this_thr->th.th_current_task;
492  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
493  }
494  } else {
495  team->t.t_implicit_task_taskdata[tid].td_parent =
496  team->t.t_implicit_task_taskdata[0].td_parent;
497  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
498  }
499 
500  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
501  "curtask=%p "
502  "parent_task=%p\n",
503  tid, this_thr, this_thr->th.th_current_task,
504  team->t.t_implicit_task_taskdata[tid].td_parent));
505 }
506 
507 // __kmp_task_start: bookkeeping for a task starting execution
508 //
509 // GTID: global thread id of calling thread
510 // task: task starting execution
511 // current_task: task suspending
512 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
513  kmp_taskdata_t *current_task) {
514  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
515  kmp_info_t *thread = __kmp_threads[gtid];
516 
517  KA_TRACE(10,
518  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
519  gtid, taskdata, current_task));
520 
521  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
522 
523  // mark currently executing task as suspended
524  // TODO: GEH - make sure root team implicit task is initialized properly.
525  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
526  current_task->td_flags.executing = 0;
527 
528 // Add task to stack if tied
529 #ifdef BUILD_TIED_TASK_STACK
530  if (taskdata->td_flags.tiedness == TASK_TIED) {
531  __kmp_push_task_stack(gtid, thread, taskdata);
532  }
533 #endif /* BUILD_TIED_TASK_STACK */
534 
535  // mark starting task as executing and as current task
536  thread->th.th_current_task = taskdata;
537 
538  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
539  taskdata->td_flags.tiedness == TASK_UNTIED);
540  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
541  taskdata->td_flags.tiedness == TASK_UNTIED);
542  taskdata->td_flags.started = 1;
543  taskdata->td_flags.executing = 1;
544  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
545  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
546 
547  // GEH TODO: shouldn't we pass some sort of location identifier here?
548  // APT: yes, we will pass location here.
549  // need to store current thread state (in a thread or taskdata structure)
550  // before setting work_state, otherwise wrong state is set after end of task
551 
552  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
553 
554  return;
555 }
556 
557 #if OMPT_SUPPORT
558 //------------------------------------------------------------------------------
559 // __ompt_task_init:
560 // Initialize OMPT fields maintained by a task. This will only be called after
561 // ompt_start_tool, so we already know whether ompt is enabled or not.
562 
563 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
564  // The calls to __ompt_task_init already have the ompt_enabled condition.
565  task->ompt_task_info.task_data.value = 0;
566  task->ompt_task_info.frame.exit_frame = ompt_data_none;
567  task->ompt_task_info.frame.enter_frame = ompt_data_none;
568  task->ompt_task_info.frame.exit_frame_flags =
569  ompt_frame_runtime | ompt_frame_framepointer;
570  task->ompt_task_info.frame.enter_frame_flags =
571  ompt_frame_runtime | ompt_frame_framepointer;
572 }
573 
574 // __ompt_task_start:
575 // Build and trigger task-begin event
576 static inline void __ompt_task_start(kmp_task_t *task,
577  kmp_taskdata_t *current_task,
578  kmp_int32 gtid) {
579  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
580  ompt_task_status_t status = ompt_task_switch;
581  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
582  status = ompt_task_yield;
583  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
584  }
585  /* let OMPT know that we're about to run this task */
586  if (ompt_enabled.ompt_callback_task_schedule) {
587  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
588  &(current_task->ompt_task_info.task_data), status,
589  &(taskdata->ompt_task_info.task_data));
590  }
591  taskdata->ompt_task_info.scheduling_parent = current_task;
592 }
593 
594 // __ompt_task_finish:
595 // Build and trigger final task-schedule event
596 static inline void __ompt_task_finish(kmp_task_t *task,
597  kmp_taskdata_t *resumed_task,
598  ompt_task_status_t status) {
599  if (ompt_enabled.ompt_callback_task_schedule) {
600  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
601  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
602  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
603  status = ompt_task_cancel;
604  }
605 
606  /* let OMPT know that we're returning to the callee task */
607  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
608  &(taskdata->ompt_task_info.task_data), status,
609  (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
610  }
611 }
612 #endif
613 
614 template <bool ompt>
615 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
616  kmp_task_t *task,
617  void *frame_address,
618  void *return_address) {
619  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
620  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
621 
622  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
623  "current_task=%p\n",
624  gtid, loc_ref, taskdata, current_task));
625 
626  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
627  // untied task needs to increment counter so that the task structure is not
628  // freed prematurely
629  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
630  KMP_DEBUG_USE_VAR(counter);
631  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
632  "incremented for task %p\n",
633  gtid, counter, taskdata));
634  }
635 
636  taskdata->td_flags.task_serial =
637  1; // Execute this task immediately, not deferred.
638  __kmp_task_start(gtid, task, current_task);
639 
640 #if OMPT_SUPPORT
641  if (ompt) {
642  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
643  current_task->ompt_task_info.frame.enter_frame.ptr =
644  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
645  current_task->ompt_task_info.frame.enter_frame_flags =
646  taskdata->ompt_task_info.frame.exit_frame_flags =
647  ompt_frame_application | ompt_frame_framepointer;
648  }
649  if (ompt_enabled.ompt_callback_task_create) {
650  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
651  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
652  &(parent_info->task_data), &(parent_info->frame),
653  &(taskdata->ompt_task_info.task_data),
654  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
655  return_address);
656  }
657  __ompt_task_start(task, current_task, gtid);
658  }
659 #endif // OMPT_SUPPORT
660 
661  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
662  loc_ref, taskdata));
663 }
664 
665 #if OMPT_SUPPORT
666 OMPT_NOINLINE
667 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
668  kmp_task_t *task,
669  void *frame_address,
670  void *return_address) {
671  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
672  return_address);
673 }
674 #endif // OMPT_SUPPORT
675 
676 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
677 // execution
678 //
679 // loc_ref: source location information; points to beginning of task block.
680 // gtid: global thread number.
681 // task: task thunk for the started task.
682 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
683  kmp_task_t *task) {
684 #if OMPT_SUPPORT
685  if (UNLIKELY(ompt_enabled.enabled)) {
686  OMPT_STORE_RETURN_ADDRESS(gtid);
687  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
688  OMPT_GET_FRAME_ADDRESS(1),
689  OMPT_LOAD_RETURN_ADDRESS(gtid));
690  return;
691  }
692 #endif
693  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
694 }
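// Rough sketch of how compiler-generated code is expected to use the if0 entry
// points for an undeferred task such as "#pragma omp task if(0)" (illustrative
// only; the exact sequence is emitted by the compiler, not by this library):
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, flags, sizeof_task,
//                                         sizeof_shareds, &task_entry);
//   /* ... fill in t->shareds and the private data ... */
//   __kmpc_omp_task_begin_if0(&loc, gtid, t);
//   task_entry(gtid, t);                       // run the task body inline
//   __kmpc_omp_task_complete_if0(&loc, gtid, t);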
695 
696 #ifdef TASK_UNUSED
697 // __kmpc_omp_task_begin: report that a given task has started execution
698 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
699 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
700  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
701 
702  KA_TRACE(
703  10,
704  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
705  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
706 
707  __kmp_task_start(gtid, task, current_task);
708 
709  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
710  loc_ref, KMP_TASK_TO_TASKDATA(task)));
711  return;
712 }
713 #endif // TASK_UNUSED
714 
715 // __kmp_free_task: free the current task space and the space for shareds
716 //
717 // gtid: Global thread ID of calling thread
718 // taskdata: task to free
719 // thread: thread data structure of caller
720 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
721  kmp_info_t *thread) {
722  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
723  taskdata));
724 
725  // Check to make sure all flags and counters have the correct values
726  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
727  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
728  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
729  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
730  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
731  taskdata->td_flags.task_serial == 1);
732  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
733 
734  taskdata->td_flags.freed = 1;
735 // deallocate the taskdata and shared variable blocks associated with this task
736 #if USE_FAST_MEMORY
737  __kmp_fast_free(thread, taskdata);
738 #else /* ! USE_FAST_MEMORY */
739  __kmp_thread_free(thread, taskdata);
740 #endif
741  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
742 }
743 
744 // __kmp_free_task_and_ancestors: free the current task and ancestors without
745 // children
746 //
747 // gtid: Global thread ID of calling thread
748 // taskdata: task to free
749 // thread: thread data structure of caller
750 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
751  kmp_taskdata_t *taskdata,
752  kmp_info_t *thread) {
753  // Proxy tasks must always be allowed to free their parents
754  // because they can be run in background even in serial mode.
755  kmp_int32 team_serial =
756  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
757  !taskdata->td_flags.proxy;
758  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
759 
760  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
761  KMP_DEBUG_ASSERT(children >= 0);
762 
763  // Now, go up the ancestor tree to see if any ancestors can now be freed.
764  while (children == 0) {
765  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
766 
767  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
768  "and freeing itself\n",
769  gtid, taskdata));
770 
771  // --- Deallocate my ancestor task ---
772  __kmp_free_task(gtid, taskdata, thread);
773 
774  taskdata = parent_taskdata;
775 
776  if (team_serial)
777  return;
778  // Stop checking ancestors at implicit task instead of walking up ancestor
779  // tree to avoid premature deallocation of ancestors.
780  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
781  if (taskdata->td_dephash) { // do we need to cleanup dephash?
782  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
783  kmp_tasking_flags_t flags_old = taskdata->td_flags;
784  if (children == 0 && flags_old.complete == 1) {
785  kmp_tasking_flags_t flags_new = flags_old;
786  flags_new.complete = 0;
787  if (KMP_COMPARE_AND_STORE_ACQ32(
788  RCAST(kmp_int32 *, &taskdata->td_flags),
789  *RCAST(kmp_int32 *, &flags_old),
790  *RCAST(kmp_int32 *, &flags_new))) {
791  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
792  "dephash of implicit task %p\n",
793  gtid, taskdata));
794  // cleanup dephash of finished implicit task
795  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
796  }
797  }
798  }
799  return;
800  }
801  // Predecrement simulated by "- 1" calculation
802  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
803  KMP_DEBUG_ASSERT(children >= 0);
804  }
805 
806  KA_TRACE(
807  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
808  "not freeing it yet\n",
809  gtid, taskdata, children));
810 }
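// Reference-counting note for the walk above: td_allocated_child_tasks starts
// at 1 for an explicit task (it counts the task itself plus its allocated
// children), and the "- 1" after KMP_ATOMIC_DEC simulates a pre-decrement; a
// task is deallocated only when this count reaches zero, at which point the
// same test is retried on the parent.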
811 
812 // __kmp_task_finish: bookkeeping to do when a task finishes execution
813 //
814 // gtid: global thread ID for calling thread
815 // task: task to be finished
816 // resumed_task: task to be resumed. (may be NULL if task is serialized)
817 //
818 // template<ompt>: effectively ompt_enabled.enabled!=0
819 // the version with ompt=false is inlined, allowing the compiler to optimize
820 // away all OMPT code in this case
821 template <bool ompt>
822 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
823  kmp_taskdata_t *resumed_task) {
824  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
825  kmp_info_t *thread = __kmp_threads[gtid];
826  kmp_task_team_t *task_team =
827  thread->th.th_task_team; // might be NULL for serial teams...
828  kmp_int32 children = 0;
829 
830  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
831  "task %p\n",
832  gtid, taskdata, resumed_task));
833 
834  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
835 
836 // Pop task from stack if tied
837 #ifdef BUILD_TIED_TASK_STACK
838  if (taskdata->td_flags.tiedness == TASK_TIED) {
839  __kmp_pop_task_stack(gtid, thread, taskdata);
840  }
841 #endif /* BUILD_TIED_TASK_STACK */
842 
843  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
844  // untied task needs to check the counter so that the task structure is not
845  // freed prematurely
846  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
847  KA_TRACE(
848  20,
849  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
850  gtid, counter, taskdata));
851  if (counter > 0) {
852  // untied task is not done, to be continued possibly by other thread, do
853  // not free it now
854  if (resumed_task == NULL) {
855  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
856  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
857  // task is the parent
858  }
859  thread->th.th_current_task = resumed_task; // restore current_task
860  resumed_task->td_flags.executing = 1; // resume previous task
861  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
862  "resuming task %p\n",
863  gtid, taskdata, resumed_task));
864  return;
865  }
866  }
867 
868  // bookkeeping for resuming task:
869  // GEH - note tasking_ser => task_serial
870  KMP_DEBUG_ASSERT(
871  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
872  taskdata->td_flags.task_serial);
873  if (taskdata->td_flags.task_serial) {
874  if (resumed_task == NULL) {
875  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
876  // task is the parent
877  }
878  } else {
879  KMP_DEBUG_ASSERT(resumed_task !=
880  NULL); // verify that resumed task is passed as argument
881  }
882 
883  /* If the task's destructor thunk flag has been set, we need to invoke the
884  destructor thunk that has been generated by the compiler. The code is
885  placed here, since at this point other tasks might have been released
886  hence overlapping the destructor invocations with some other work in the
887  released tasks. The OpenMP spec is not specific on when the destructors
888  are invoked, so we should be free to choose. */
889  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
890  kmp_routine_entry_t destr_thunk = task->data1.destructors;
891  KMP_ASSERT(destr_thunk);
892  destr_thunk(gtid, task);
893  }
894 
895  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
896  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
897  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
898 
899  bool detach = false;
900  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
901  if (taskdata->td_allow_completion_event.type ==
902  KMP_EVENT_ALLOW_COMPLETION) {
903  // event hasn't been fulfilled yet. Try to detach task.
904  __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
905  if (taskdata->td_allow_completion_event.type ==
906  KMP_EVENT_ALLOW_COMPLETION) {
907  // task finished execution
908  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
909  taskdata->td_flags.executing = 0; // suspend the finishing task
910 
911 #if OMPT_SUPPORT
912  // For a detached task that is not yet completed, we report the switch back
913  // here (status ompt_task_detach); omp_fulfill_event reports completion later.
914  // Locking is necessary to avoid a race with ompt_task_late_fulfill
915  if (ompt)
916  __ompt_task_finish(task, resumed_task, ompt_task_detach);
917 #endif
918 
919  // no access to taskdata after this point!
920  // __kmp_fulfill_event might free taskdata at any time from now
921 
922  taskdata->td_flags.proxy = TASK_PROXY; // proxify!
923  detach = true;
924  }
925  __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
926  }
927  }
928 
929  if (!detach) {
930  taskdata->td_flags.complete = 1; // mark the task as completed
931 
932 #if OMPT_SUPPORT
933  // This is not a detached task, we are done here
934  if (ompt)
935  __ompt_task_finish(task, resumed_task, ompt_task_complete);
936 #endif
937 
938  // Only need to keep track of count if team parallel and tasking not
939  // serialized, or task is detachable and event has already been fulfilled
940  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
941  taskdata->td_flags.detachable == TASK_DETACHABLE ||
942  taskdata->td_flags.hidden_helper) {
943  // Predecrement simulated by "- 1" calculation
944  children =
945  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
946  KMP_DEBUG_ASSERT(children >= 0);
947  if (taskdata->td_taskgroup)
948  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
949  __kmp_release_deps(gtid, taskdata);
950  } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
951  // if we found proxy tasks there could exist a dependency chain
952  // with the proxy task as origin
953  __kmp_release_deps(gtid, taskdata);
954  }
955  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
956  // called. Otherwise, if a task is executed immediately from the
957  // release_deps code, the flag will be reset to 1 again by this same
958  // function
959  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
960  taskdata->td_flags.executing = 0; // suspend the finishing task
961  }
962 
963  KA_TRACE(
964  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
965  gtid, taskdata, children));
966 
967  // Free this task and then ancestor tasks if they have no children.
968  // Restore th_current_task first as suggested by John:
969  // johnmc: if an asynchronous inquiry peers into the runtime system
970  // it doesn't see the freed task as the current task.
971  thread->th.th_current_task = resumed_task;
972  if (!detach)
973  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
974 
975  // TODO: GEH - make sure root team implicit task is initialized properly.
976  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
977  resumed_task->td_flags.executing = 1; // resume previous task
978 
979  KA_TRACE(
980  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
981  gtid, taskdata, resumed_task));
982 
983  return;
984 }
985 
986 template <bool ompt>
987 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
988  kmp_int32 gtid,
989  kmp_task_t *task) {
990  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
991  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
992  KMP_DEBUG_ASSERT(gtid >= 0);
993  // this routine will provide task to resume
994  __kmp_task_finish<ompt>(gtid, task, NULL);
995 
996  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
997  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
998 
999 #if OMPT_SUPPORT
1000  if (ompt) {
1001  ompt_frame_t *ompt_frame;
1002  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1003  ompt_frame->enter_frame = ompt_data_none;
1004  ompt_frame->enter_frame_flags =
1005  ompt_frame_runtime | ompt_frame_framepointer;
1006  }
1007 #endif
1008 
1009  return;
1010 }
1011 
1012 #if OMPT_SUPPORT
1013 OMPT_NOINLINE
1014 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1015  kmp_task_t *task) {
1016  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1017 }
1018 #endif // OMPT_SUPPORT
1019 
1020 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1021 //
1022 // loc_ref: source location information; points to end of task block.
1023 // gtid: global thread number.
1024 // task: task thunk for the completed task.
1025 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1026  kmp_task_t *task) {
1027 #if OMPT_SUPPORT
1028  if (UNLIKELY(ompt_enabled.enabled)) {
1029  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1030  return;
1031  }
1032 #endif
1033  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1034 }
1035 
1036 #ifdef TASK_UNUSED
1037 // __kmpc_omp_task_complete: report that a task has completed execution
1038 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1039 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1040  kmp_task_t *task) {
1041  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1042  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1043 
1044  __kmp_task_finish<false>(gtid, task,
1045  NULL); // Not sure how to find task to resume
1046 
1047  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1048  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1049  return;
1050 }
1051 #endif // TASK_UNUSED
1052 
1053 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1054 // task for a given thread
1055 //
1056 // loc_ref: reference to source location of parallel region
1057 // this_thr: thread data structure corresponding to implicit task
1058 // team: team for this_thr
1059 // tid: thread id of given thread within team
1060 // set_curr_task: TRUE if need to push current task to thread
1061 // NOTE: Routine does not set up the implicit task ICVs. This is assumed to
1062 // have already been done elsewhere.
1063 // TODO: Get better loc_ref. Value passed in may be NULL
1064 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1065  kmp_team_t *team, int tid, int set_curr_task) {
1066  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1067 
1068  KF_TRACE(
1069  10,
1070  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1071  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1072 
1073  task->td_task_id = KMP_GEN_TASK_ID();
1074  task->td_team = team;
1075  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1076  // in debugger)
1077  task->td_ident = loc_ref;
1078  task->td_taskwait_ident = NULL;
1079  task->td_taskwait_counter = 0;
1080  task->td_taskwait_thread = 0;
1081 
1082  task->td_flags.tiedness = TASK_TIED;
1083  task->td_flags.tasktype = TASK_IMPLICIT;
1084  task->td_flags.proxy = TASK_FULL;
1085 
1086  // All implicit tasks are executed immediately, not deferred
1087  task->td_flags.task_serial = 1;
1088  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1089  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1090 
1091  task->td_flags.started = 1;
1092  task->td_flags.executing = 1;
1093  task->td_flags.complete = 0;
1094  task->td_flags.freed = 0;
1095 
1096  task->td_depnode = NULL;
1097  task->td_last_tied = task;
1098  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1099 
1100  if (set_curr_task) { // only do this init first time thread is created
1101  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1102  // Not used: don't need to deallocate implicit task
1103  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1104  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1105  task->td_dephash = NULL;
1106  __kmp_push_current_task_to_thread(this_thr, team, tid);
1107  } else {
1108  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1109  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1110  }
1111 
1112 #if OMPT_SUPPORT
1113  if (UNLIKELY(ompt_enabled.enabled))
1114  __ompt_task_init(task, tid);
1115 #endif
1116 
1117  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1118  team, task));
1119 }
1120 
1121 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1122 // at the end of parallel regions. Some resources are kept for reuse in the next
1123 // parallel region.
1124 //
1125 // thread: thread data structure corresponding to implicit task
1126 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1127  kmp_taskdata_t *task = thread->th.th_current_task;
1128  if (task->td_dephash) {
1129  int children;
1130  task->td_flags.complete = 1;
1131  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1132  kmp_tasking_flags_t flags_old = task->td_flags;
1133  if (children == 0 && flags_old.complete == 1) {
1134  kmp_tasking_flags_t flags_new = flags_old;
1135  flags_new.complete = 0;
1136  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1137  *RCAST(kmp_int32 *, &flags_old),
1138  *RCAST(kmp_int32 *, &flags_new))) {
1139  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1140  "dephash of implicit task %p\n",
1141  thread->th.th_info.ds.ds_gtid, task));
1142  __kmp_dephash_free_entries(thread, task->td_dephash);
1143  }
1144  }
1145  }
1146 }
1147 
1148 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1149 // when these tasks are destroyed
1150 //
1151 // thread: thread data structure corresponding to implicit task
1152 void __kmp_free_implicit_task(kmp_info_t *thread) {
1153  kmp_taskdata_t *task = thread->th.th_current_task;
1154  if (task && task->td_dephash) {
1155  __kmp_dephash_free(thread, task->td_dephash);
1156  task->td_dephash = NULL;
1157  }
1158 }
1159 
1160 // Round up a size to a multiple of val (a power of two): used to insert
1161 // padding between structures co-allocated using a single malloc() call
1162 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1163  if (size & (val - 1)) {
1164  size &= ~(val - 1);
1165  if (size <= KMP_SIZE_T_MAX - val) {
1166  size += val; // Round up if there is no overflow.
1167  }
1168  }
1169  return size;
1170 } // __kmp_round_up_to_val
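// Worked example: with val == sizeof(void *) (8 on a 64-bit target), a size of
// 30 becomes (30 & ~7) + 8 == 32, while a size that is already a multiple of 8
// (e.g. 32) is returned unchanged.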
1171 
1172 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1173 //
1174 // loc_ref: source location information
1175 // gtid: global thread number.
1176 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1177 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1178 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1179 // private vars accessed in task.
1180 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1181 // in task.
1182 // task_entry: Pointer to task code entry point generated by compiler.
1183 // returns: a pointer to the allocated kmp_task_t structure (task).
1184 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1185  kmp_tasking_flags_t *flags,
1186  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1187  kmp_routine_entry_t task_entry) {
1188  kmp_task_t *task;
1189  kmp_taskdata_t *taskdata;
1190  kmp_info_t *thread = __kmp_threads[gtid];
1191  kmp_info_t *encountering_thread = thread;
1192  kmp_team_t *team = thread->th.th_team;
1193  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1194  size_t shareds_offset;
1195 
1196  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1197  __kmp_middle_initialize();
1198 
1199  if (flags->hidden_helper) {
1200  if (__kmp_enable_hidden_helper) {
1201  if (!TCR_4(__kmp_init_hidden_helper))
1202  __kmp_hidden_helper_initialize();
1203 
1204  // For a hidden helper task encountered by a regular thread, we will push
1205  // the task to the (gtid%__kmp_hidden_helper_threads_num)-th hidden helper
1206  // thread.
1207  if (!KMP_HIDDEN_HELPER_THREAD(gtid)) {
1208  thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1209  // We don't change the parent-child relation for hidden helper task as
1210  // we need that to do per-task-region synchronization.
1211  }
1212  } else {
1213  // If the hidden helper task is not enabled, reset the flag to FALSE.
1214  // If hidden helper threads are not enabled, reset the flag to FALSE.
1215  }
1216  }
1217 
1218  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1219  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1220  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1221  sizeof_shareds, task_entry));
1222 
1223  KMP_DEBUG_ASSERT(parent_task);
1224  if (parent_task->td_flags.final) {
1225  if (flags->merged_if0) {
1226  }
1227  flags->final = 1;
1228  }
1229 
1230  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1231  // Untied task encountered causes the TSC algorithm to check entire deque of
1232  // the victim thread. If no untied task encountered, then checking the head
1233  // of the deque should be enough.
1234  KMP_CHECK_UPDATE(
1235  encountering_thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1236  }
1237 
1238  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
1239  // the tasking setup
1240  // when that happens is too late.
1241  if (UNLIKELY(flags->proxy == TASK_PROXY ||
1242  flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1243  if (flags->proxy == TASK_PROXY) {
1244  flags->tiedness = TASK_UNTIED;
1245  flags->merged_if0 = 1;
1246  }
1247  /* are we running in a sequential parallel or tskm_immediate_exec... we need
1248  tasking support enabled */
1249  if ((encountering_thread->th.th_task_team) == NULL) {
1250  /* This should only happen if the team is serialized
1251  setup a task team and propagate it to the thread */
1252  KMP_DEBUG_ASSERT(team->t.t_serialized);
1253  KA_TRACE(30,
1254  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1255  gtid));
1256  __kmp_task_team_setup(
1257  encountering_thread, team,
1258  1); // 1 indicates setup the current team regardless of nthreads
1259  encountering_thread->th.th_task_team =
1260  team->t.t_task_team[encountering_thread->th.th_task_state];
1261  }
1262  kmp_task_team_t *task_team = encountering_thread->th.th_task_team;
1263 
1264  /* tasking must be enabled now as the task might not be pushed */
1265  if (!KMP_TASKING_ENABLED(task_team)) {
1266  KA_TRACE(
1267  30,
1268  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1269  __kmp_enable_tasking(task_team, encountering_thread);
1270  kmp_int32 tid = encountering_thread->th.th_info.ds.ds_tid;
1271  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1272  // No lock needed since only owner can allocate
1273  if (thread_data->td.td_deque == NULL) {
1274  __kmp_alloc_task_deque(encountering_thread, thread_data);
1275  }
1276  }
1277 
1278  if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1279  task_team->tt.tt_found_proxy_tasks == FALSE)
1280  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1281  if (flags->hidden_helper &&
1282  task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1283  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1284  }
1285 
1286  // Calculate shared structure offset including padding after kmp_task_t struct
1287  // to align pointers in shared struct
1288  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1289  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1290 
1291  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1292  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1293  shareds_offset));
1294  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1295  sizeof_shareds));
1296 
1297  // Avoid double allocation here by combining shareds with taskdata
1298 #if USE_FAST_MEMORY
1299  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(
1300  encountering_thread, shareds_offset + sizeof_shareds);
1301 #else /* ! USE_FAST_MEMORY */
1302  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(
1303  encountering_thread, shareds_offset + sizeof_shareds);
1304 #endif /* USE_FAST_MEMORY */
1305 
1306  task = KMP_TASKDATA_TO_TASK(taskdata);
1307 
1308 // Make sure task & taskdata are aligned appropriately
1309 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1310  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1311  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1312 #else
1313  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1314  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1315 #endif
1316  if (sizeof_shareds > 0) {
1317  // Avoid double allocation here by combining shareds with taskdata
1318  task->shareds = &((char *)taskdata)[shareds_offset];
1319  // Make sure shareds struct is aligned to pointer size
1320  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1321  0);
1322  } else {
1323  task->shareds = NULL;
1324  }
1325  task->routine = task_entry;
1326  task->part_id = 0; // AC: Always start with 0 part id
1327 
1328  taskdata->td_task_id = KMP_GEN_TASK_ID();
1329  taskdata->td_team = thread->th.th_team;
1330  taskdata->td_alloc_thread = encountering_thread;
1331  taskdata->td_parent = parent_task;
1332  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1333  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1334  taskdata->td_ident = loc_ref;
1335  taskdata->td_taskwait_ident = NULL;
1336  taskdata->td_taskwait_counter = 0;
1337  taskdata->td_taskwait_thread = 0;
1338  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1339  // avoid copying icvs for proxy tasks
1340  if (flags->proxy == TASK_FULL)
1341  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1342 
1343  taskdata->td_flags = *flags;
1344  taskdata->encountering_gtid = gtid;
1345  taskdata->td_task_team = thread->th.th_task_team;
1346  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1347  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1348 
1349  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1350  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1351 
1352  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1353  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1354 
1355  // GEH - Note we serialize the task if the team is serialized to make sure
1356  // implicit parallel region tasks are not left until program termination to
1357  // execute. Also, it helps locality to execute immediately.
1358 
1359  taskdata->td_flags.task_serial =
1360  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1361  taskdata->td_flags.tasking_ser || flags->merged_if0);
1362 
1363  taskdata->td_flags.started = 0;
1364  taskdata->td_flags.executing = 0;
1365  taskdata->td_flags.complete = 0;
1366  taskdata->td_flags.freed = 0;
1367 
1368  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1369  // start at one because counts current task and children
1370  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1371  taskdata->td_taskgroup =
1372  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1373  taskdata->td_dephash = NULL;
1374  taskdata->td_depnode = NULL;
1375  if (flags->tiedness == TASK_UNTIED)
1376  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1377  else
1378  taskdata->td_last_tied = taskdata;
1379  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1380 #if OMPT_SUPPORT
1381  if (UNLIKELY(ompt_enabled.enabled))
1382  __ompt_task_init(taskdata, gtid);
1383 #endif
1384  // Only need to keep track of child task counts if team parallel and tasking
1385  // not serialized or if it is a proxy or detachable or hidden helper task
1386  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
1387  flags->hidden_helper ||
1388  !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
1389  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1390  if (parent_task->td_taskgroup)
1391  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1392  // Only need to keep track of allocated child tasks for explicit tasks since
1393  // implicit not deallocated
1394  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1395  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1396  }
1397  if (flags->hidden_helper) {
1398  taskdata->td_flags.task_serial = FALSE;
1399  // Increment the number of hidden helper tasks to be executed
1400  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1401  }
1402  }
1403 
1404  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1405  gtid, taskdata, taskdata->td_parent));
1406 
1407  return task;
1408 }
1409 
1410 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1411  kmp_int32 flags, size_t sizeof_kmp_task_t,
1412  size_t sizeof_shareds,
1413  kmp_routine_entry_t task_entry) {
1414  kmp_task_t *retval;
1415  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1416  __kmp_assert_valid_gtid(gtid);
1417  input_flags->native = FALSE;
1418  // __kmp_task_alloc() sets up all other runtime flags
1419  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1420  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1421  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1422  input_flags->proxy ? "proxy" : "",
1423  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1424  sizeof_shareds, task_entry));
1425 
1426  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1427  sizeof_shareds, task_entry);
1428 
1429  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1430 
1431  return retval;
1432 }
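// Illustrative usage sketch (not part of the runtime) of how a caller is
// expected to use __kmpc_omp_task_alloc(). The struct and function names are
// hypothetical; the flags value assumes bit 0 selects tied (1) vs. untied (0),
// matching the kmp_tasking_flags_t reinterpretation done above.
#if 0
struct my_shareds { int *result; };   // data shared with the parent
struct my_privates { int i; };        // firstprivate payload, if any

static kmp_int32 my_task_entry(kmp_int32 gtid, kmp_task_t *task) {
  struct my_shareds *sh = (struct my_shareds *)task->shareds;
  *sh->result = 42;
  return 0;
}

static void spawn_example(ident_t *loc, kmp_int32 gtid, int *result) {
  // sizeof_kmp_task_t covers the kmp_task_t header plus the privates placed
  // right after it; sizeof_shareds bytes follow in the same allocation and
  // task->shareds points at them.
  kmp_task_t *t = __kmpc_omp_task_alloc(
      loc, gtid, /*flags=*/1 /* tied */,
      sizeof(kmp_task_t) + sizeof(struct my_privates),
      sizeof(struct my_shareds), (kmp_routine_entry_t)&my_task_entry);
  ((struct my_shareds *)t->shareds)->result = result;
  __kmpc_omp_task(loc, gtid, t); // queue (or run immediately)
}
#endif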
1433 
1434 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1435  kmp_int32 flags,
1436  size_t sizeof_kmp_task_t,
1437  size_t sizeof_shareds,
1438  kmp_routine_entry_t task_entry,
1439  kmp_int64 device_id) {
1440  if (__kmp_enable_hidden_helper) {
1441  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1442  input_flags.hidden_helper = TRUE;
1443  }
1444 
1445  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1446  sizeof_shareds, task_entry);
1447 }
1448 
1462 kmp_int32
1463 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1464  kmp_task_t *new_task, kmp_int32 naffins,
1465  kmp_task_affinity_info_t *affin_list) {
1466  return 0;
1467 }
1468 
1469 // __kmp_invoke_task: invoke the specified task
1470 //
1471 // gtid: global thread ID of caller
1472 // task: the task to invoke
1473 // current_task: the task to resume after task invocation
1474 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1475  kmp_taskdata_t *current_task) {
1476  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1477  kmp_info_t *thread;
1478  int discard = 0 /* false */;
1479  KA_TRACE(
1480  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1481  gtid, taskdata, current_task));
1482  KMP_DEBUG_ASSERT(task);
1483  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1484  taskdata->td_flags.complete == 1)) {
1485  // This is a proxy task that was already completed but it needs to run
1486  // its bottom-half finish
1487  KA_TRACE(
1488  30,
1489  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1490  gtid, taskdata));
1491 
1492  __kmp_bottom_half_finish_proxy(gtid, task);
1493 
1494  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1495  "proxy task %p, resuming task %p\n",
1496  gtid, taskdata, current_task));
1497 
1498  return;
1499  }
1500 
1501 #if OMPT_SUPPORT
1502  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1503  // does not execute code.
1504  ompt_thread_info_t oldInfo;
1505  if (UNLIKELY(ompt_enabled.enabled)) {
1506  // Store the thread's state and restore it after the task
1507  thread = __kmp_threads[gtid];
1508  oldInfo = thread->th.ompt_thread_info;
1509  thread->th.ompt_thread_info.wait_id = 0;
1510  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1511  ? ompt_state_work_serial
1512  : ompt_state_work_parallel;
1513  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1514  }
1515 #endif
1516 
1517  // Decrement the counter of hidden helper tasks to be executed
1518  if (taskdata->td_flags.hidden_helper) {
1519  // Hidden helper tasks can only be executed by hidden helper threads
1520  KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1521  KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1522  }
1523 
1524  // Proxy tasks are not handled by the runtime
1525  if (taskdata->td_flags.proxy != TASK_PROXY) {
1526  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1527  }
1528 
1529  // TODO: cancel tasks if the parallel region has also been cancelled
1530  // TODO: check if this sequence can be hoisted above __kmp_task_start
1531  // if cancellation has been enabled for this run ...
1532  if (UNLIKELY(__kmp_omp_cancellation)) {
1533  thread = __kmp_threads[gtid];
1534  kmp_team_t *this_team = thread->th.th_team;
1535  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1536  if ((taskgroup && taskgroup->cancel_request) ||
1537  (this_team->t.t_cancel_request == cancel_parallel)) {
1538 #if OMPT_SUPPORT && OMPT_OPTIONAL
1539  ompt_data_t *task_data;
1540  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1541  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1542  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1543  task_data,
1544  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1545  : ompt_cancel_parallel) |
1546  ompt_cancel_discarded_task,
1547  NULL);
1548  }
1549 #endif
1550  KMP_COUNT_BLOCK(TASK_cancelled);
1551  // this task belongs to a task group and we need to cancel it
1552  discard = 1 /* true */;
1553  }
1554  }
1555 
1556  // Invoke the task routine and pass in relevant data.
1557  // Thunks generated by gcc take a different argument list.
1558  if (!discard) {
1559  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1560  taskdata->td_last_tied = current_task->td_last_tied;
1561  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1562  }
1563 #if KMP_STATS_ENABLED
1564  KMP_COUNT_BLOCK(TASK_executed);
1565  switch (KMP_GET_THREAD_STATE()) {
1566  case FORK_JOIN_BARRIER:
1567  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1568  break;
1569  case PLAIN_BARRIER:
1570  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1571  break;
1572  case TASKYIELD:
1573  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1574  break;
1575  case TASKWAIT:
1576  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1577  break;
1578  case TASKGROUP:
1579  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1580  break;
1581  default:
1582  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1583  break;
1584  }
1585 #endif // KMP_STATS_ENABLED
1586 
1587 // OMPT task begin
1588 #if OMPT_SUPPORT
1589  if (UNLIKELY(ompt_enabled.enabled))
1590  __ompt_task_start(task, current_task, gtid);
1591 #endif
1592 
1593 #if OMPD_SUPPORT
1594  if (ompd_state & OMPD_ENABLE_BP)
1595  ompd_bp_task_begin();
1596 #endif
1597 
1598 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1599  kmp_uint64 cur_time;
1600  kmp_int32 kmp_itt_count_task =
1601  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1602  current_task->td_flags.tasktype == TASK_IMPLICIT;
1603  if (kmp_itt_count_task) {
1604  thread = __kmp_threads[gtid];
1605  // Time outer level explicit task on barrier for adjusting imbalance time
1606  if (thread->th.th_bar_arrive_time)
1607  cur_time = __itt_get_timestamp();
1608  else
1609  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1610  }
1611  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1612 #endif
1613 
1614 #ifdef KMP_GOMP_COMPAT
1615  if (taskdata->td_flags.native) {
1616  ((void (*)(void *))(*(task->routine)))(task->shareds);
1617  } else
1618 #endif /* KMP_GOMP_COMPAT */
1619  {
1620  (*(task->routine))(gtid, task);
1621  }
1622  KMP_POP_PARTITIONED_TIMER();
1623 
1624 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1625  if (kmp_itt_count_task) {
1626  // Barrier imbalance - adjust arrive time with the task duration
1627  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1628  }
1629  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1630  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1631 #endif
1632  }
1633 
1634 #if OMPD_SUPPORT
1635  if (ompd_state & OMPD_ENABLE_BP)
1636  ompd_bp_task_end();
1637 #endif
1638 
1639  // Proxy tasks are not handled by the runtime
1640  if (taskdata->td_flags.proxy != TASK_PROXY) {
1641 #if OMPT_SUPPORT
1642  if (UNLIKELY(ompt_enabled.enabled)) {
1643  thread->th.ompt_thread_info = oldInfo;
1644  if (taskdata->td_flags.tiedness == TASK_TIED) {
1645  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1646  }
1647  __kmp_task_finish<true>(gtid, task, current_task);
1648  } else
1649 #endif
1650  __kmp_task_finish<false>(gtid, task, current_task);
1651  }
1652 
1653  KA_TRACE(
1654  30,
1655  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1656  gtid, taskdata, current_task));
1657  return;
1658 }
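// Illustrative sketch of the two task-entry conventions dispatched above
// (signatures are a sketch; names are hypothetical). A KMP-style entry is
// invoked as (*(task->routine))(gtid, task); a GOMP-compatible "native"
// thunk, used when td_flags.native is set, receives only the shareds pointer.
#if 0
static kmp_int32 kmp_style_entry(kmp_int32 gtid, kmp_task_t *task) {
  /* work on task->shareds / privates */
  return 0;
}
static void gomp_style_thunk(void *shareds) {
  /* work on shareds directly */
}
#endif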
1659 
1660 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1661 //
1662 // loc_ref: location of original task pragma (ignored)
1663 // gtid: Global Thread ID of encountering thread
1664 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1665 // Returns:
1666 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1667 // be resumed later.
1668 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1669 // resumed later.
1670 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1671  kmp_task_t *new_task) {
1672  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1673 
1674  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1675  loc_ref, new_taskdata));
1676 
1677 #if OMPT_SUPPORT
1678  kmp_taskdata_t *parent;
1679  if (UNLIKELY(ompt_enabled.enabled)) {
1680  parent = new_taskdata->td_parent;
1681  if (ompt_enabled.ompt_callback_task_create) {
1682  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1683  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1684  &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1685  OMPT_GET_RETURN_ADDRESS(0));
1686  }
1687  }
1688 #endif
1689 
1690  /* Should we execute the new task or queue it? For now, let's just always try
1691  to queue it. If the queue fills up, then we'll execute it. */
1692 
1693  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1694  { // Execute this task immediately
1695  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1696  new_taskdata->td_flags.task_serial = 1;
1697  __kmp_invoke_task(gtid, new_task, current_task);
1698  }
1699 
1700  KA_TRACE(
1701  10,
1702  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1703  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1704  gtid, loc_ref, new_taskdata));
1705 
1706 #if OMPT_SUPPORT
1707  if (UNLIKELY(ompt_enabled.enabled)) {
1708  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1709  }
1710 #endif
1711  return TASK_CURRENT_NOT_QUEUED;
1712 }
1713 
1714 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1715 //
1716 // gtid: Global Thread ID of encountering thread
1717 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1718 // serialize_immediate: if TRUE then if the task is executed immediately its
1719 // execution will be serialized
1720 // Returns:
1721 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1722 // be resumed later.
1723 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1724 // resumed later.
1725 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1726  bool serialize_immediate) {
1727  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1728 
1729  /* Should we execute the new task or queue it? For now, let's just always try
1730  to queue it. If the queue fills up, then we'll execute it. */
1731  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1732  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1733  { // Execute this task immediately
1734  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1735  if (serialize_immediate)
1736  new_taskdata->td_flags.task_serial = 1;
1737  __kmp_invoke_task(gtid, new_task, current_task);
1738  }
1739 
1740  return TASK_CURRENT_NOT_QUEUED;
1741 }
1742 
1743 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1744 // non-thread-switchable task from the parent thread only!
1745 //
1746 // loc_ref: location of original task pragma (ignored)
1747 // gtid: Global Thread ID of encountering thread
1748 // new_task: non-thread-switchable task thunk allocated by
1749 // __kmp_omp_task_alloc()
1750 // Returns:
1751 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1752 // be resumed later.
1753 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1754 // resumed later.
1755 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1756  kmp_task_t *new_task) {
1757  kmp_int32 res;
1758  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1759 
1760 #if KMP_DEBUG || OMPT_SUPPORT
1761  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1762 #endif
1763  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1764  new_taskdata));
1765  __kmp_assert_valid_gtid(gtid);
1766 
1767 #if OMPT_SUPPORT
1768  kmp_taskdata_t *parent = NULL;
1769  if (UNLIKELY(ompt_enabled.enabled)) {
1770  if (!new_taskdata->td_flags.started) {
1771  OMPT_STORE_RETURN_ADDRESS(gtid);
1772  parent = new_taskdata->td_parent;
1773  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1774  parent->ompt_task_info.frame.enter_frame.ptr =
1775  OMPT_GET_FRAME_ADDRESS(0);
1776  }
1777  if (ompt_enabled.ompt_callback_task_create) {
1778  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1779  &(parent->ompt_task_info.task_data),
1780  &(parent->ompt_task_info.frame),
1781  &(new_taskdata->ompt_task_info.task_data),
1782  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1783  OMPT_LOAD_RETURN_ADDRESS(gtid));
1784  }
1785  } else {
1786  // We are scheduling the continuation of an UNTIED task.
1787  // Scheduling back to the parent task.
1788  __ompt_task_finish(new_task,
1789  new_taskdata->ompt_task_info.scheduling_parent,
1790  ompt_task_switch);
1791  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1792  }
1793  }
1794 #endif
1795 
1796  res = __kmp_omp_task(gtid, new_task, true);
1797 
1798  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1799  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1800  gtid, loc_ref, new_taskdata));
1801 #if OMPT_SUPPORT
1802  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1803  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1804  }
1805 #endif
1806  return res;
1807 }
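// Illustrative sketch (names hypothetical): once a thunk `t` has been set up
// by __kmpc_omp_task_alloc(), the encountering thread hands it off here. The
// runtime either pushes it onto this thread's deque or, if the deque is full
// or the task must be serialized, runs it immediately; in the current
// implementation the return value is always TASK_CURRENT_NOT_QUEUED.
#if 0
kmp_int32 rc = __kmpc_omp_task(loc, gtid, t);
KMP_DEBUG_ASSERT(rc == TASK_CURRENT_NOT_QUEUED);
#endif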
1808 
1809 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1810 // a taskloop task with the correct OMPT return address
1811 //
1812 // loc_ref: location of original task pragma (ignored)
1813 // gtid: Global Thread ID of encountering thread
1814 // new_task: non-thread-switchable task thunk allocated by
1815 // __kmp_omp_task_alloc()
1816 // codeptr_ra: return address for OMPT callback
1817 // Returns:
1818 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1819 // be resumed later.
1820 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1821 // resumed later.
1822 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1823  kmp_task_t *new_task, void *codeptr_ra) {
1824  kmp_int32 res;
1825  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1826 
1827 #if KMP_DEBUG || OMPT_SUPPORT
1828  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1829 #endif
1830  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n",
1831  gtid, loc_ref, new_taskdata));
1832 
1833 #if OMPT_SUPPORT
1834  kmp_taskdata_t *parent = NULL;
1835  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1836  parent = new_taskdata->td_parent;
1837  if (!parent->ompt_task_info.frame.enter_frame.ptr)
1838  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1839  if (ompt_enabled.ompt_callback_task_create) {
1840  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1841  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1842  &(new_taskdata->ompt_task_info.task_data),
1843  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1844  codeptr_ra);
1845  }
1846  }
1847 #endif
1848 
1849  res = __kmp_omp_task(gtid, new_task, true);
1850 
1851  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
1852  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1853  gtid, loc_ref, new_taskdata));
1854 #if OMPT_SUPPORT
1855  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1856  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1857  }
1858 #endif
1859  return res;
1860 }
1861 
1862 template <bool ompt>
1863 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1864  void *frame_address,
1865  void *return_address) {
1866  kmp_taskdata_t *taskdata = nullptr;
1867  kmp_info_t *thread;
1868  int thread_finished = FALSE;
1869  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1870 
1871  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1872  KMP_DEBUG_ASSERT(gtid >= 0);
1873 
1874  if (__kmp_tasking_mode != tskm_immediate_exec) {
1875  thread = __kmp_threads[gtid];
1876  taskdata = thread->th.th_current_task;
1877 
1878 #if OMPT_SUPPORT && OMPT_OPTIONAL
1879  ompt_data_t *my_task_data;
1880  ompt_data_t *my_parallel_data;
1881 
1882  if (ompt) {
1883  my_task_data = &(taskdata->ompt_task_info.task_data);
1884  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
1885 
1886  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
1887 
1888  if (ompt_enabled.ompt_callback_sync_region) {
1889  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1890  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1891  my_task_data, return_address);
1892  }
1893 
1894  if (ompt_enabled.ompt_callback_sync_region_wait) {
1895  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1896  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1897  my_task_data, return_address);
1898  }
1899  }
1900 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1901 
1902 // Debugger: The taskwait is active. Store the location and the thread that
1903 // encountered the taskwait.
1904 #if USE_ITT_BUILD
1905 // Note: These values are used by ITT events as well.
1906 #endif /* USE_ITT_BUILD */
1907  taskdata->td_taskwait_counter += 1;
1908  taskdata->td_taskwait_ident = loc_ref;
1909  taskdata->td_taskwait_thread = gtid + 1;
1910 
1911 #if USE_ITT_BUILD
1912  void *itt_sync_obj = NULL;
1913 #if USE_ITT_NOTIFY
1914  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
1915 #endif /* USE_ITT_NOTIFY */
1916 #endif /* USE_ITT_BUILD */
1917 
1918  bool must_wait =
1919  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1920 
1921  must_wait = must_wait || (thread->th.th_task_team != NULL &&
1922  thread->th.th_task_team->tt.tt_found_proxy_tasks);
1923  // If hidden helper thread is encountered, we must enable wait here.
1924  must_wait =
1925  must_wait ||
1926  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
1927  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
1928 
1929  if (must_wait) {
1930  kmp_flag_32<false, false> flag(
1931  RCAST(std::atomic<kmp_uint32> *,
1932  &(taskdata->td_incomplete_child_tasks)),
1933  0U);
1934  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
1935  flag.execute_tasks(thread, gtid, FALSE,
1936  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1937  __kmp_task_stealing_constraint);
1938  }
1939  }
1940 #if USE_ITT_BUILD
1941  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
1942  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
1943 #endif /* USE_ITT_BUILD */
1944 
1945  // Debugger: The taskwait is completed. Location remains, but thread is
1946  // negated.
1947  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1948 
1949 #if OMPT_SUPPORT && OMPT_OPTIONAL
1950  if (ompt) {
1951  if (ompt_enabled.ompt_callback_sync_region_wait) {
1952  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1953  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1954  my_task_data, return_address);
1955  }
1956  if (ompt_enabled.ompt_callback_sync_region) {
1957  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1958  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1959  my_task_data, return_address);
1960  }
1961  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
1962  }
1963 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1964 
1965  }
1966 
1967  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1968  "returning TASK_CURRENT_NOT_QUEUED\n",
1969  gtid, taskdata));
1970 
1971  return TASK_CURRENT_NOT_QUEUED;
1972 }
1973 
1974 #if OMPT_SUPPORT && OMPT_OPTIONAL
1975 OMPT_NOINLINE
1976 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
1977  void *frame_address,
1978  void *return_address) {
1979  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
1980  return_address);
1981 }
1982 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1983 
1984 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1985 // complete
1986 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1987 #if OMPT_SUPPORT && OMPT_OPTIONAL
1988  if (UNLIKELY(ompt_enabled.enabled)) {
1989  OMPT_STORE_RETURN_ADDRESS(gtid);
1990  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
1991  OMPT_LOAD_RETURN_ADDRESS(gtid));
1992  }
1993 #endif
1994  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
1995 }
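// Illustrative sketch of the lowering of `#pragma omp taskwait`; the gtid is
// obtained in the usual way (e.g. via __kmpc_global_thread_num) and loc is
// the compiler-provided ident_t.
#if 0
static void taskwait_example(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_omp_taskwait(loc, gtid); // execute/steal tasks until all children
                                  // of the current task have completed
}
#endif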
1996 
1997 // __kmpc_omp_taskyield: switch to a different task
1998 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
1999  kmp_taskdata_t *taskdata = NULL;
2000  kmp_info_t *thread;
2001  int thread_finished = FALSE;
2002 
2003  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2004  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2005 
2006  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2007  gtid, loc_ref, end_part));
2008  __kmp_assert_valid_gtid(gtid);
2009 
2010  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2011  thread = __kmp_threads[gtid];
2012  taskdata = thread->th.th_current_task;
2013 // Should we model this as a task wait or not?
2014 // Debugger: The taskwait is active. Store the location and the thread that
2015 // encountered the taskwait.
2016 #if USE_ITT_BUILD
2017 // Note: These values are used by ITT events as well.
2018 #endif /* USE_ITT_BUILD */
2019  taskdata->td_taskwait_counter += 1;
2020  taskdata->td_taskwait_ident = loc_ref;
2021  taskdata->td_taskwait_thread = gtid + 1;
2022 
2023 #if USE_ITT_BUILD
2024  void *itt_sync_obj = NULL;
2025 #if USE_ITT_NOTIFY
2026  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2027 #endif /* USE_ITT_NOTIFY */
2028 #endif /* USE_ITT_BUILD */
2029  if (!taskdata->td_flags.team_serial) {
2030  kmp_task_team_t *task_team = thread->th.th_task_team;
2031  if (task_team != NULL) {
2032  if (KMP_TASKING_ENABLED(task_team)) {
2033 #if OMPT_SUPPORT
2034  if (UNLIKELY(ompt_enabled.enabled))
2035  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2036 #endif
2037  __kmp_execute_tasks_32(
2038  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2039  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2040  __kmp_task_stealing_constraint);
2041 #if OMPT_SUPPORT
2042  if (UNLIKELY(ompt_enabled.enabled))
2043  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2044 #endif
2045  }
2046  }
2047  }
2048 #if USE_ITT_BUILD
2049  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2050 #endif /* USE_ITT_BUILD */
2051 
2052  // Debugger: The taskwait is completed. Location remains, but thread is
2053  // negated.
2054  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2055  }
2056 
2057  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2058  "returning TASK_CURRENT_NOT_QUEUED\n",
2059  gtid, taskdata));
2060 
2061  return TASK_CURRENT_NOT_QUEUED;
2062 }
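// Illustrative sketch: `#pragma omp taskyield` lowers to a single call; the
// end_part argument is only traced by this implementation (compilers
// typically pass 0).
#if 0
__kmpc_omp_taskyield(loc, gtid, /*end_part=*/0);
#endif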
2063 
2064 // Task Reduction implementation
2065 //
2066 // Note: the initial implementation did not account for the possibility of
2067 // specifying omp_orig for the initializer of a UDR (user-defined reduction).
2068 // The corrected implementation takes the omp_orig object into account.
2069 // The compiler is free to use the old implementation if omp_orig is not specified.
2070 
2079 typedef struct kmp_taskred_flags {
2081  unsigned lazy_priv : 1;
2082  unsigned reserved31 : 31;
2083 } kmp_taskred_flags_t;
2084 
2088 typedef struct kmp_task_red_input {
2089  void *reduce_shar;
2090  size_t reduce_size;
2091  // three compiler-generated routines (init, fini are optional):
2092  void *reduce_init;
2093  void *reduce_fini;
2094  void *reduce_comb;
2095  kmp_taskred_flags_t flags;
2096 } kmp_task_red_input_t;
2097 
2101 typedef struct kmp_taskred_data {
2102  void *reduce_shar;
2103  size_t reduce_size;
2104  kmp_taskred_flags_t flags;
2105  void *reduce_priv;
2106  void *reduce_pend;
2107  // three compiler-generated routines (init, fini are optional):
2108  void *reduce_comb;
2109  void *reduce_init;
2110  void *reduce_fini;
2111  void *reduce_orig;
2112 } kmp_taskred_data_t;
2113 
2119 typedef struct kmp_taskred_input {
2120  void *reduce_shar;
2121  void *reduce_orig;
2122  size_t reduce_size;
2123  // three compiler-generated routines (init, fini are optional):
2124  void *reduce_init;
2125  void *reduce_fini;
2126  void *reduce_comb;
2127  kmp_taskred_flags_t flags;
2128 } kmp_taskred_input_t;
2133 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2134 template <>
2135 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2136  kmp_task_red_input_t &src) {
2137  item.reduce_orig = NULL;
2138 }
2139 template <>
2140 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2141  kmp_taskred_input_t &src) {
2142  if (src.reduce_orig != NULL) {
2143  item.reduce_orig = src.reduce_orig;
2144  } else {
2145  item.reduce_orig = src.reduce_shar;
2146  } // non-NULL reduce_orig means new interface used
2147 }
2148 
2149 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2150 template <>
2151 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2152  size_t offset) {
2153  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2154 }
2155 template <>
2156 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2157  size_t offset) {
2158  ((void (*)(void *, void *))item.reduce_init)(
2159  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2160 }
2161 
2162 template <typename T>
2163 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2164  __kmp_assert_valid_gtid(gtid);
2165  kmp_info_t *thread = __kmp_threads[gtid];
2166  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2167  kmp_uint32 nth = thread->th.th_team_nproc;
2168  kmp_taskred_data_t *arr;
2169 
2170  // check input data just in case
2171  KMP_ASSERT(tg != NULL);
2172  KMP_ASSERT(data != NULL);
2173  KMP_ASSERT(num > 0);
2174  if (nth == 1) {
2175  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2176  gtid, tg));
2177  return (void *)tg;
2178  }
2179  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2180  gtid, tg, num));
2181  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2182  thread, num * sizeof(kmp_taskred_data_t));
2183  for (int i = 0; i < num; ++i) {
2184  size_t size = data[i].reduce_size - 1;
2185  // round the size up to cache line per thread-specific item
2186  size += CACHE_LINE - size % CACHE_LINE;
2187  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2188  arr[i].reduce_shar = data[i].reduce_shar;
2189  arr[i].reduce_size = size;
2190  arr[i].flags = data[i].flags;
2191  arr[i].reduce_comb = data[i].reduce_comb;
2192  arr[i].reduce_init = data[i].reduce_init;
2193  arr[i].reduce_fini = data[i].reduce_fini;
2194  __kmp_assign_orig<T>(arr[i], data[i]);
2195  if (!arr[i].flags.lazy_priv) {
2196  // allocate cache-line aligned block and fill it with zeros
2197  arr[i].reduce_priv = __kmp_allocate(nth * size);
2198  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2199  if (arr[i].reduce_init != NULL) {
2200  // initialize all thread-specific items
2201  for (size_t j = 0; j < nth; ++j) {
2202  __kmp_call_init<T>(arr[i], j * size);
2203  }
2204  }
2205  } else {
2206  // only allocate space for pointers now,
2207  // objects will be lazily allocated/initialized if/when requested
2208  // note that __kmp_allocate zeroes the allocated memory
2209  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2210  }
2211  }
2212  tg->reduce_data = (void *)arr;
2213  tg->reduce_num_data = num;
2214  return (void *)tg;
2215 }
2216 
2231 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2232  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2233 }
2234 
2247 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2248  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2249 }
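// Illustrative sketch (not part of the runtime) of setting up one
// task-reduction item over an int through the new, omp_orig-aware interface.
// The callback and variable names are hypothetical; the init callback takes
// two parameters, matching __kmp_call_init<kmp_taskred_input_t> above.
#if 0
static void red_init(void *priv, void *orig) { *(int *)priv = 0; }
static void red_comb(void *shar, void *priv) { *(int *)shar += *(int *)priv; }

static void taskgroup_reduction_example(ident_t *loc, int *x) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_taskgroup(loc, gtid); // reduction is tied to the current taskgroup

  kmp_taskred_input_t item;
  item.reduce_shar = x;
  item.reduce_orig = x; // omp_orig for UDR initializers
  item.reduce_size = sizeof(int);
  item.reduce_init = (void *)red_init;
  item.reduce_fini = NULL; // optional
  item.reduce_comb = (void *)red_comb;
  item.flags.lazy_priv = 0; // eagerly allocate per-thread copies
  item.flags.reserved31 = 0;
  void *tg = __kmpc_taskred_init(gtid, /*num=*/1, &item);

  // ... create tasks that accumulate via
  //     __kmpc_task_reduction_get_th_data(gtid, tg, x) ...

  __kmpc_end_taskgroup(loc, gtid); // combines private copies into *x
}
#endif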
2250 
2251 // Copy task reduction data (except for shared pointers).
2252 template <typename T>
2253 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2254  kmp_taskgroup_t *tg, void *reduce_data) {
2255  kmp_taskred_data_t *arr;
2256  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2257  " from data %p\n",
2258  thr, tg, reduce_data));
2259  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2260  thr, num * sizeof(kmp_taskred_data_t));
2261  // threads will share private copies, thunk routines, sizes, flags, etc.:
2262  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2263  for (int i = 0; i < num; ++i) {
2264  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2265  }
2266  tg->reduce_data = (void *)arr;
2267  tg->reduce_num_data = num;
2268 }
2269 
2279 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2280  __kmp_assert_valid_gtid(gtid);
2281  kmp_info_t *thread = __kmp_threads[gtid];
2282  kmp_int32 nth = thread->th.th_team_nproc;
2283  if (nth == 1)
2284  return data; // nothing to do
2285 
2286  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2287  if (tg == NULL)
2288  tg = thread->th.th_current_task->td_taskgroup;
2289  KMP_ASSERT(tg != NULL);
2290  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2291  kmp_int32 num = tg->reduce_num_data;
2292  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2293 
2294  KMP_ASSERT(data != NULL);
2295  while (tg != NULL) {
2296  for (int i = 0; i < num; ++i) {
2297  if (!arr[i].flags.lazy_priv) {
2298  if (data == arr[i].reduce_shar ||
2299  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2300  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2301  } else {
2302  // check shared location first
2303  void **p_priv = (void **)(arr[i].reduce_priv);
2304  if (data == arr[i].reduce_shar)
2305  goto found;
2306  // check if we get some thread specific location as parameter
2307  for (int j = 0; j < nth; ++j)
2308  if (data == p_priv[j])
2309  goto found;
2310  continue; // not found, continue search
2311  found:
2312  if (p_priv[tid] == NULL) {
2313  // allocate thread specific object lazily
2314  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2315  if (arr[i].reduce_init != NULL) {
2316  if (arr[i].reduce_orig != NULL) { // new interface
2317  ((void (*)(void *, void *))arr[i].reduce_init)(
2318  p_priv[tid], arr[i].reduce_orig);
2319  } else { // old interface (single parameter)
2320  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2321  }
2322  }
2323  }
2324  return p_priv[tid];
2325  }
2326  }
2327  tg = tg->parent;
2328  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2329  num = tg->reduce_num_data;
2330  }
2331  KMP_ASSERT2(0, "Unknown task reduction item");
2332  return NULL; // ERROR, this line never executed
2333 }
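// Illustrative sketch of a task body that participates in the reduction set
// up in the sketch after __kmpc_taskred_init (names hypothetical). Passing
// NULL for the taskgroup means "use the current task's taskgroup".
#if 0
static kmp_int32 reducing_task_entry(kmp_int32 gtid, kmp_task_t *task) {
  int *x = *(int **)task->shareds; // shared reduction variable
  int *priv = (int *)__kmpc_task_reduction_get_th_data(gtid, NULL, x);
  *priv += 1; // accumulate into this thread's private copy
  return 0;
}
#endif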
2334 
2335 // Finalize task reduction.
2336 // Called from __kmpc_end_taskgroup()
2337 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2338  kmp_int32 nth = th->th.th_team_nproc;
2339  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2340  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2341  kmp_int32 num = tg->reduce_num_data;
2342  for (int i = 0; i < num; ++i) {
2343  void *sh_data = arr[i].reduce_shar;
2344  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2345  void (*f_comb)(void *, void *) =
2346  (void (*)(void *, void *))(arr[i].reduce_comb);
2347  if (!arr[i].flags.lazy_priv) {
2348  void *pr_data = arr[i].reduce_priv;
2349  size_t size = arr[i].reduce_size;
2350  for (int j = 0; j < nth; ++j) {
2351  void *priv_data = (char *)pr_data + j * size;
2352  f_comb(sh_data, priv_data); // combine results
2353  if (f_fini)
2354  f_fini(priv_data); // finalize if needed
2355  }
2356  } else {
2357  void **pr_data = (void **)(arr[i].reduce_priv);
2358  for (int j = 0; j < nth; ++j) {
2359  if (pr_data[j] != NULL) {
2360  f_comb(sh_data, pr_data[j]); // combine results
2361  if (f_fini)
2362  f_fini(pr_data[j]); // finalize if needed
2363  __kmp_free(pr_data[j]);
2364  }
2365  }
2366  }
2367  __kmp_free(arr[i].reduce_priv);
2368  }
2369  __kmp_thread_free(th, arr);
2370  tg->reduce_data = NULL;
2371  tg->reduce_num_data = 0;
2372 }
2373 
2374 // Clean up task reduction data for parallel or worksharing constructs;
2375 // do not touch task-private data that other threads are still working with.
2376 // Called from __kmpc_end_taskgroup()
2377 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2378  __kmp_thread_free(th, tg->reduce_data);
2379  tg->reduce_data = NULL;
2380  tg->reduce_num_data = 0;
2381 }
2382 
2383 template <typename T>
2384 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2385  int num, T *data) {
2386  __kmp_assert_valid_gtid(gtid);
2387  kmp_info_t *thr = __kmp_threads[gtid];
2388  kmp_int32 nth = thr->th.th_team_nproc;
2389  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2390  if (nth == 1) {
2391  KA_TRACE(10,
2392  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2393  gtid, thr->th.th_current_task->td_taskgroup));
2394  return (void *)thr->th.th_current_task->td_taskgroup;
2395  }
2396  kmp_team_t *team = thr->th.th_team;
2397  void *reduce_data;
2398  kmp_taskgroup_t *tg;
2399  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2400  if (reduce_data == NULL &&
2401  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2402  (void *)1)) {
2403  // single thread enters this block to initialize common reduction data
2404  KMP_DEBUG_ASSERT(reduce_data == NULL);
2405  // first initialize own data, then make a copy other threads can use
2406  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2407  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2408  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2409  // fini counters should be 0 at this point
2410  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2411  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2412  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2413  } else {
2414  while (
2415  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2416  (void *)1) { // wait for task reduction initialization
2417  KMP_CPU_PAUSE();
2418  }
2419  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2420  tg = thr->th.th_current_task->td_taskgroup;
2421  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2422  }
2423  return tg;
2424 }
2425 
2442 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2443  int num, void *data) {
2444  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2445  (kmp_task_red_input_t *)data);
2446 }
2447 
2462 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2463  void *data) {
2464  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2465  (kmp_taskred_input_t *)data);
2466 }
2467 
2476 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2477  __kmpc_end_taskgroup(loc, gtid);
2478 }
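// Illustrative sketch of how the modifier entry points bracket a construct
// that carries a task reduction; is_ws selects the worksharing slot (1)
// rather than the parallel slot (0) in the team structure. `item` is a
// kmp_taskred_input_t filled as in the sketch after __kmpc_taskred_init.
#if 0
void *tg = __kmpc_taskred_modifier_init(loc, gtid, /*is_ws=*/1,
                                        /*num=*/1, &item);
// ... worksharing loop creating in_reduction tasks ...
__kmpc_task_reduction_modifier_fini(loc, gtid, /*is_ws=*/1);
#endif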
2479 
2480 // __kmpc_taskgroup: Start a new taskgroup
2481 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2482  __kmp_assert_valid_gtid(gtid);
2483  kmp_info_t *thread = __kmp_threads[gtid];
2484  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2485  kmp_taskgroup_t *tg_new =
2486  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2487  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2488  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2489  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2490  tg_new->parent = taskdata->td_taskgroup;
2491  tg_new->reduce_data = NULL;
2492  tg_new->reduce_num_data = 0;
2493  tg_new->gomp_data = NULL;
2494  taskdata->td_taskgroup = tg_new;
2495 
2496 #if OMPT_SUPPORT && OMPT_OPTIONAL
2497  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2498  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2499  if (!codeptr)
2500  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2501  kmp_team_t *team = thread->th.th_team;
2502  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2503  // FIXME: I think this is wrong for lwt!
2504  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2505 
2506  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2507  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2508  &(my_task_data), codeptr);
2509  }
2510 #endif
2511 }
2512 
2513 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2514 // and its descendants are complete
2515 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2516  __kmp_assert_valid_gtid(gtid);
2517  kmp_info_t *thread = __kmp_threads[gtid];
2518  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2519  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2520  int thread_finished = FALSE;
2521 
2522 #if OMPT_SUPPORT && OMPT_OPTIONAL
2523  kmp_team_t *team;
2524  ompt_data_t my_task_data;
2525  ompt_data_t my_parallel_data;
2526  void *codeptr = nullptr;
2527  if (UNLIKELY(ompt_enabled.enabled)) {
2528  team = thread->th.th_team;
2529  my_task_data = taskdata->ompt_task_info.task_data;
2530  // FIXME: I think this is wrong for lwt!
2531  my_parallel_data = team->t.ompt_team_info.parallel_data;
2532  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2533  if (!codeptr)
2534  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2535  }
2536 #endif
2537 
2538  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2539  KMP_DEBUG_ASSERT(taskgroup != NULL);
2540  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2541 
2542  if (__kmp_tasking_mode != tskm_immediate_exec) {
2543  // mark the task as waiting (not at a barrier)
2544  taskdata->td_taskwait_counter += 1;
2545  taskdata->td_taskwait_ident = loc;
2546  taskdata->td_taskwait_thread = gtid + 1;
2547 #if USE_ITT_BUILD
2548  // For ITT the taskgroup wait is similar to taskwait until we need to
2549  // distinguish them
2550  void *itt_sync_obj = NULL;
2551 #if USE_ITT_NOTIFY
2552  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2553 #endif /* USE_ITT_NOTIFY */
2554 #endif /* USE_ITT_BUILD */
2555 
2556 #if OMPT_SUPPORT && OMPT_OPTIONAL
2557  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2558  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2559  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2560  &(my_task_data), codeptr);
2561  }
2562 #endif
2563 
2564  if (!taskdata->td_flags.team_serial ||
2565  (thread->th.th_task_team != NULL &&
2566  thread->th.th_task_team->tt.tt_found_proxy_tasks)) {
2567  kmp_flag_32<false, false> flag(
2568  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2569  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2570  flag.execute_tasks(thread, gtid, FALSE,
2571  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2572  __kmp_task_stealing_constraint);
2573  }
2574  }
2575  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2576 
2577 #if OMPT_SUPPORT && OMPT_OPTIONAL
2578  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2579  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2580  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2581  &(my_task_data), codeptr);
2582  }
2583 #endif
2584 
2585 #if USE_ITT_BUILD
2586  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2587  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2588 #endif /* USE_ITT_BUILD */
2589  }
2590  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2591 
2592  if (taskgroup->reduce_data != NULL &&
2593  !taskgroup->gomp_data) { // need to reduce?
2594  int cnt;
2595  void *reduce_data;
2596  kmp_team_t *t = thread->th.th_team;
2597  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2598  // check if <priv> data of the first reduction variable is shared for the team
2599  void *priv0 = arr[0].reduce_priv;
2600  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2601  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2602  // finishing task reduction on parallel
2603  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2604  if (cnt == thread->th.th_team_nproc - 1) {
2605  // we are the last thread passing __kmpc_reduction_modifier_fini()
2606  // finalize task reduction:
2607  __kmp_task_reduction_fini(thread, taskgroup);
2608  // cleanup fields in the team structure:
2609  // TODO: is relaxed store enough here (whole barrier should follow)?
2610  __kmp_thread_free(thread, reduce_data);
2611  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2612  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2613  } else {
2614  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2615  // so do not finalize reduction, just clean own copy of the data
2616  __kmp_task_reduction_clean(thread, taskgroup);
2617  }
2618  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2619  NULL &&
2620  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2621  // finishing task reduction on worksharing
2622  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2623  if (cnt == thread->th.th_team_nproc - 1) {
2624  // we are the last thread passing __kmpc_reduction_modifier_fini()
2625  __kmp_task_reduction_fini(thread, taskgroup);
2626  // cleanup fields in team structure:
2627  // TODO: is relaxed store enough here (whole barrier should follow)?
2628  __kmp_thread_free(thread, reduce_data);
2629  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2630  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2631  } else {
2632  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2633  // so do not finalize reduction, just clean own copy of the data
2634  __kmp_task_reduction_clean(thread, taskgroup);
2635  }
2636  } else {
2637  // finishing task reduction on taskgroup
2638  __kmp_task_reduction_fini(thread, taskgroup);
2639  }
2640  }
2641  // Restore parent taskgroup for the current task
2642  taskdata->td_taskgroup = taskgroup->parent;
2643  __kmp_thread_free(thread, taskgroup);
2644 
2645  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2646  gtid, taskdata));
2647 
2648 #if OMPT_SUPPORT && OMPT_OPTIONAL
2649  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2650  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2651  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2652  &(my_task_data), codeptr);
2653  }
2654 #endif
2655 }
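// Illustrative sketch: unlike `#pragma omp taskwait`, which waits only for
// the children of the current task, a taskgroup waits for all descendant
// tasks registered with it.
#if 0
__kmpc_taskgroup(loc, gtid);
// ... create tasks (including nested ones) ...
__kmpc_end_taskgroup(loc, gtid); // executes/steals tasks until the
                                 // taskgroup's count drops to zero
#endif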
2656 
2657 // __kmp_remove_my_task: remove a task from my own deque
2658 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2659  kmp_task_team_t *task_team,
2660  kmp_int32 is_constrained) {
2661  kmp_task_t *task;
2662  kmp_taskdata_t *taskdata;
2663  kmp_thread_data_t *thread_data;
2664  kmp_uint32 tail;
2665 
2666  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2667  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2668  NULL); // Caller should check this condition
2669 
2670  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2671 
2672  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2673  gtid, thread_data->td.td_deque_ntasks,
2674  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2675 
2676  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2677  KA_TRACE(10,
2678  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2679  "ntasks=%d head=%u tail=%u\n",
2680  gtid, thread_data->td.td_deque_ntasks,
2681  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2682  return NULL;
2683  }
2684 
2685  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2686 
2687  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2688  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2689  KA_TRACE(10,
2690  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2691  "ntasks=%d head=%u tail=%u\n",
2692  gtid, thread_data->td.td_deque_ntasks,
2693  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2694  return NULL;
2695  }
2696 
2697  tail = (thread_data->td.td_deque_tail - 1) &
2698  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2699  taskdata = thread_data->td.td_deque[tail];
2700 
2701  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2702  thread->th.th_current_task)) {
2703  // The TSC does not allow stealing the victim task
2704  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2705  KA_TRACE(10,
2706  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2707  "ntasks=%d head=%u tail=%u\n",
2708  gtid, thread_data->td.td_deque_ntasks,
2709  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2710  return NULL;
2711  }
2712 
2713  thread_data->td.td_deque_tail = tail;
2714  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2715 
2716  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2717 
2718  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2719  "ntasks=%d head=%u tail=%u\n",
2720  gtid, taskdata, thread_data->td.td_deque_ntasks,
2721  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2722 
2723  task = KMP_TASKDATA_TO_TASK(taskdata);
2724  return task;
2725 }
2726 
2727 // __kmp_steal_task: remove a task from another thread's deque
2728 // Assume that calling thread has already checked existence of
2729 // task_team thread_data before calling this routine.
2730 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
2731  kmp_task_team_t *task_team,
2732  std::atomic<kmp_int32> *unfinished_threads,
2733  int *thread_finished,
2734  kmp_int32 is_constrained) {
2735  kmp_task_t *task;
2736  kmp_taskdata_t *taskdata;
2737  kmp_taskdata_t *current;
2738  kmp_thread_data_t *victim_td, *threads_data;
2739  kmp_int32 target;
2740  kmp_int32 victim_tid;
2741 
2742  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2743 
2744  threads_data = task_team->tt.tt_threads_data;
2745  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
2746 
2747  victim_tid = victim_thr->th.th_info.ds.ds_tid;
2748  victim_td = &threads_data[victim_tid];
2749 
2750  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2751  "task_team=%p ntasks=%d head=%u tail=%u\n",
2752  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2753  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2754  victim_td->td.td_deque_tail));
2755 
2756  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
2757  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2758  "task_team=%p ntasks=%d head=%u tail=%u\n",
2759  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2760  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2761  victim_td->td.td_deque_tail));
2762  return NULL;
2763  }
2764 
2765  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2766 
2767  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
2768  // Check again after we acquire the lock
2769  if (ntasks == 0) {
2770  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2771  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2772  "task_team=%p ntasks=%d head=%u tail=%u\n",
2773  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2774  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2775  return NULL;
2776  }
2777 
2778  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2779  current = __kmp_threads[gtid]->th.th_current_task;
2780  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2781  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2782  // Bump head pointer and Wrap.
2783  victim_td->td.td_deque_head =
2784  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2785  } else {
2786  if (!task_team->tt.tt_untied_task_encountered) {
2787  // The TSC does not allow stealing the victim task
2788  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2789  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
2790  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2791  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2792  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2793  return NULL;
2794  }
2795  int i;
2796  // walk through victim's deque trying to steal any task
2797  target = victim_td->td.td_deque_head;
2798  taskdata = NULL;
2799  for (i = 1; i < ntasks; ++i) {
2800  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2801  taskdata = victim_td->td.td_deque[target];
2802  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2803  break; // found victim task
2804  } else {
2805  taskdata = NULL;
2806  }
2807  }
2808  if (taskdata == NULL) {
2809  // No appropriate candidate to steal found
2810  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2811  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2812  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2813  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2814  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2815  return NULL;
2816  }
2817  int prev = target;
2818  for (i = i + 1; i < ntasks; ++i) {
2819  // shift remaining tasks in the deque left by 1
2820  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2821  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
2822  prev = target;
2823  }
2824  KMP_DEBUG_ASSERT(
2825  victim_td->td.td_deque_tail ==
2826  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
2827  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
2828  }
2829  if (*thread_finished) {
2830  // We need to un-mark this victim as a finished victim. This must be done
2831  // before releasing the lock, or else other threads (starting with the
2832  // primary thread victim) might be prematurely released from the barrier!!!
2833  kmp_int32 count;
2834 
2835  count = KMP_ATOMIC_INC(unfinished_threads);
2836 
2837  KA_TRACE(
2838  20,
2839  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2840  gtid, count + 1, task_team));
2841 
2842  *thread_finished = FALSE;
2843  }
2844  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
2845 
2846  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2847 
2848  KMP_COUNT_BLOCK(TASK_stolen);
2849  KA_TRACE(10,
2850  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2851  "task_team=%p ntasks=%d head=%u tail=%u\n",
2852  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
2853  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2854 
2855  task = KMP_TASKDATA_TO_TASK(taskdata);
2856  return task;
2857 }
2858 
2859 // __kmp_execute_tasks_template: Choose and execute tasks until either the
2860 // condition is satisfied (return true) or there are none left (return false).
2861 //
2862 // final_spin is TRUE if this is the spin at the release barrier.
2863 // thread_finished indicates whether the thread is finished executing all
2864 // the tasks it has on its deque, and is at the release barrier.
2865 // spinner is the location on which to spin.
2866 // spinner == NULL means only execute a single task and return.
2867 // checker is the value to check to terminate the spin.
2868 template <class C>
2869 static inline int __kmp_execute_tasks_template(
2870  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2871  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2872  kmp_int32 is_constrained) {
2873  kmp_task_team_t *task_team = thread->th.th_task_team;
2874  kmp_thread_data_t *threads_data;
2875  kmp_task_t *task;
2876  kmp_info_t *other_thread;
2877  kmp_taskdata_t *current_task = thread->th.th_current_task;
2878  std::atomic<kmp_int32> *unfinished_threads;
2879  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
2880  tid = thread->th.th_info.ds.ds_tid;
2881 
2882  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2883  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2884 
2885  if (task_team == NULL || current_task == NULL)
2886  return FALSE;
2887 
2888  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2889  "*thread_finished=%d\n",
2890  gtid, final_spin, *thread_finished));
2891 
2892  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2893  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2894 
2895  KMP_DEBUG_ASSERT(threads_data != NULL);
2896 
2897  nthreads = task_team->tt.tt_nproc;
2898  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2899  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
2900  task_team->tt.tt_hidden_helper_task_encountered);
2901  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
2902 
2903  while (1) { // Outer loop keeps trying to find tasks in case of single thread
2904  // getting tasks from target constructs
2905  while (1) { // Inner loop to find a task and execute it
2906  task = NULL;
2907  if (use_own_tasks) { // check on own queue first
2908  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2909  }
2910  if ((task == NULL) && (nthreads > 1)) { // Steal a task
2911  int asleep = 1;
2912  use_own_tasks = 0;
2913  // Try to steal from the last place I stole from successfully.
2914  if (victim_tid == -2) { // haven't stolen anything yet
2915  victim_tid = threads_data[tid].td.td_deque_last_stolen;
2916  if (victim_tid !=
2917  -1) // if we have a last stolen from victim, get the thread
2918  other_thread = threads_data[victim_tid].td.td_thr;
2919  }
2920  if (victim_tid != -1) { // found last victim
2921  asleep = 0;
2922  } else if (!new_victim) { // no recent steals and we haven't already
2923  // used a new victim; select a random thread
2924  do { // Find a different thread to steal work from.
2925  // Pick a random thread. Initial plan was to cycle through all the
2926  // threads, and only return if we tried to steal from every thread,
2927  // and failed. Arch says that's not such a great idea.
2928  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
2929  if (victim_tid >= tid) {
2930  ++victim_tid; // Adjusts random distribution to exclude self
2931  }
2932  // Found a potential victim
2933  other_thread = threads_data[victim_tid].td.td_thr;
2934  // There is a slight chance that __kmp_enable_tasking() did not wake
2935  // up all threads waiting at the barrier. If victim is sleeping,
2936  // then wake it up. Since we were going to pay the cache miss
2937  // penalty for referencing another thread's kmp_info_t struct
2938  // anyway,
2939  // the check shouldn't cost too much performance at this point. In
2940  // extra barrier mode, tasks do not sleep at the separate tasking
2941  // barrier, so this isn't a problem.
2942  asleep = 0;
2943  if ((__kmp_tasking_mode == tskm_task_teams) &&
2944  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2945  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2946  NULL)) {
2947  asleep = 1;
2948  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2949  other_thread->th.th_sleep_loc);
2950  // A sleeping thread should not have any tasks on its queue.
2951  // There is a slight possibility that it resumes, steals a task
2952  // from another thread, which spawns more tasks, all in the time
2953  // that it takes this thread to check => don't write an assertion
2954  // that the victim's queue is empty. Try stealing from a
2955  // different thread.
2956  }
2957  } while (asleep);
2958  }
2959 
2960  if (!asleep) {
2961  // We have a victim to try to steal from
2962  task = __kmp_steal_task(other_thread, gtid, task_team,
2963  unfinished_threads, thread_finished,
2964  is_constrained);
2965  }
2966  if (task != NULL) { // set last stolen to victim
2967  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
2968  threads_data[tid].td.td_deque_last_stolen = victim_tid;
2969  // The pre-refactored code did not try more than 1 successful new
2970  // victim, unless the last one generated more local tasks;
2971  // new_victim keeps track of this.
2972  new_victim = 1;
2973  }
2974  } else { // No tasks found; unset last_stolen
2975  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2976  victim_tid = -2; // no successful victim found
2977  }
2978  }
2979 
2980  if (task == NULL)
2981  break; // break out of tasking loop
2982 
2983 // Found a task; execute it
2984 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2985  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2986  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
2987  // get the object reliably
2988  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2989  }
2990  __kmp_itt_task_starting(itt_sync_obj);
2991  }
2992 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2993  __kmp_invoke_task(gtid, task, current_task);
2994 #if USE_ITT_BUILD
2995  if (itt_sync_obj != NULL)
2996  __kmp_itt_task_finished(itt_sync_obj);
2997 #endif /* USE_ITT_BUILD */
2998  // If this thread is only partway through the barrier and the condition is
2999  // met, then return now, so that the barrier gather/release pattern can
3000  // proceed. If this thread is in the last spin loop in the barrier,
3001  // waiting to be released, we know that the termination condition will not
3002  // be satisfied, so don't waste any cycles checking it.
3003  if (flag == NULL || (!final_spin && flag->done_check())) {
3004  KA_TRACE(
3005  15,
3006  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3007  gtid));
3008  return TRUE;
3009  }
3010  if (thread->th.th_task_team == NULL) {
3011  break;
3012  }
3013  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3014  // If execution of a stolen task results in more tasks being placed on our
3015  // run queue, reset use_own_tasks
3016  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3017  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3018  "other tasks, restart\n",
3019  gtid));
3020  use_own_tasks = 1;
3021  new_victim = 0;
3022  }
3023  }
3024 
3025  // The task source has been exhausted. If in final spin loop of barrier,
3026  // check if termination condition is satisfied. The work queue may be empty
3027  // but there might be proxy tasks still executing.
3028  if (final_spin &&
3029  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3030  // First, decrement the #unfinished threads, if that has not already been
3031  // done. This decrement might be to the spin location, and result in the
3032  // termination condition being satisfied.
3033  if (!*thread_finished) {
3034  kmp_int32 count;
3035 
3036  count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
3037  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3038  "unfinished_threads to %d task_team=%p\n",
3039  gtid, count, task_team));
3040  *thread_finished = TRUE;
3041  }
3042 
3043  // It is now unsafe to reference thread->th.th_team !!!
3044  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3045  // thread to pass through the barrier, where it might reset each thread's
3046  // th.th_team field for the next parallel region. If we can steal more
3047  // work, we know that this has not happened yet.
3048  if (flag != NULL && flag->done_check()) {
3049  KA_TRACE(
3050  15,
3051  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3052  gtid));
3053  return TRUE;
3054  }
3055  }
3056 
3057  // If this thread's task team is NULL, primary thread has recognized that
3058  // there are no more tasks; bail out
3059  if (thread->th.th_task_team == NULL) {
3060  KA_TRACE(15,
3061  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3062  return FALSE;
3063  }
3064 
3065  // We could be getting tasks from target constructs; if this is the only
3066  // thread, keep trying to execute tasks from own queue
3067  if (nthreads == 1 &&
3068  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3069  use_own_tasks = 1;
3070  else {
3071  KA_TRACE(15,
3072  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3073  return FALSE;
3074  }
3075  }
3076 }
3077 
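// Thin wrappers over the common template above for the concrete flag types
// (32-bit, 64-bit, and oncore). The explicit instantiations below provide the
// definitions that other translation units (e.g. the barrier code) link
// against.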
3078 template <bool C, bool S>
3079 int __kmp_execute_tasks_32(
3080  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3081  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3082  kmp_int32 is_constrained) {
3083  return __kmp_execute_tasks_template(
3084  thread, gtid, flag, final_spin,
3085  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3086 }
3087 
3088 template <bool C, bool S>
3089 int __kmp_execute_tasks_64(
3090  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3091  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3092  kmp_int32 is_constrained) {
3093  return __kmp_execute_tasks_template(
3094  thread, gtid, flag, final_spin,
3095  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3096 }
3097 
3098 int __kmp_execute_tasks_oncore(
3099  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3100  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3101  kmp_int32 is_constrained) {
3102  return __kmp_execute_tasks_template(
3103  thread, gtid, flag, final_spin,
3104  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3105 }
3106 
3107 template int
3108 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3109  kmp_flag_32<false, false> *, int,
3110  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3111 
3112 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3113  kmp_flag_64<false, true> *,
3114  int,
3115  int *USE_ITT_BUILD_ARG(void *),
3116  kmp_int32);
3117 
3118 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3119  kmp_flag_64<true, false> *,
3120  int,
3121  int *USE_ITT_BUILD_ARG(void *),
3122  kmp_int32);
3123 
3124 // __kmp_enable_tasking: Set up the task team's threads_data and resume threads
3125 // sleeping at the barrier so they can assist in executing enqueued tasks.
3126 // The first thread in allocates the threads_data array atomically.
3127 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3128  kmp_info_t *this_thr) {
3129  kmp_thread_data_t *threads_data;
3130  int nthreads, i, is_init_thread;
3131 
3132  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3133  __kmp_gtid_from_thread(this_thr)));
3134 
3135  KMP_DEBUG_ASSERT(task_team != NULL);
3136  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3137 
3138  nthreads = task_team->tt.tt_nproc;
3139  KMP_DEBUG_ASSERT(nthreads > 0);
3140  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3141 
3142  // Allocate or increase the size of threads_data if necessary
3143  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3144 
3145  if (!is_init_thread) {
3146  // Some other thread already set up the array.
3147  KA_TRACE(
3148  20,
3149  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3150  __kmp_gtid_from_thread(this_thr)));
3151  return;
3152  }
3153  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3154  KMP_DEBUG_ASSERT(threads_data != NULL);
3155 
3156  if (__kmp_tasking_mode == tskm_task_teams &&
3157  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3158  // Release any threads sleeping at the barrier, so that they can steal
3159  // tasks and execute them. In extra barrier mode, threads do not sleep
3160  // at the separate tasking barrier, so this isn't a problem.
3161  for (i = 0; i < nthreads; i++) {
3162  volatile void *sleep_loc;
3163  kmp_info_t *thread = threads_data[i].td.td_thr;
3164 
3165  if (i == this_thr->th.th_info.ds.ds_tid) {
3166  continue;
3167  }
3168  // Since we haven't locked the thread's suspend mutex at this
3169  // point, there is a small window where a thread might be putting
3170  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3171  // To work around this, __kmp_execute_tasks_template() periodically
3172  // checks to see if other threads are sleeping (using the same random
3173  // mechanism that is used for task stealing) and awakens them if they are.
3174  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3175  NULL) {
3176  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3177  __kmp_gtid_from_thread(this_thr),
3178  __kmp_gtid_from_thread(thread)));
3179  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3180  } else {
3181  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3182  __kmp_gtid_from_thread(this_thr),
3183  __kmp_gtid_from_thread(thread)));
3184  }
3185  }
3186  }
3187 
3188  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3189  __kmp_gtid_from_thread(this_thr)));
3190 }
3191 
3192 /* // TODO: Check the comment consistency
3193  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind
3194  * of like a shadow of the kmp_team_t data struct, with a different lifetime.
3195  * After a child thread checks into a barrier and calls __kmp_release() from
3196  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3197  * longer assume that the kmp_team_t structure is intact (at any moment, the
3198  * primary thread may exit the barrier code and free the team data structure,
3199  * and return the threads to the thread pool).
3200  *
3201  * This does not work with the tasking code, as the thread is still
3202  * expected to participate in the execution of any tasks that may have been
3203  * spawned by a member of the team, and the thread still needs access to
3204  * each of the other threads in the team, so that it can steal work from them.
3205  *
3206  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3207  * counting mechanism, and is allocated by the primary thread before calling
3208  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3209  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3210  * of the kmp_task_team_t structs for consecutive barriers can overlap
3211  * (and will, unless the primary thread is the last thread to exit the barrier
3212  * release phase, which is not typical). The existence of such a struct
3213  * could also be useful outside the context of tasking.
3214  *
3215  * We currently use the existence of the threads array as an indicator that
3216  * tasks were spawned since the last barrier. If the structure is to be
3217  * useful outside the context of tasking, then this will have to change, but
3218  * not setting the field minimizes the performance impact of tasking on
3219  * barriers, when no explicit tasks were spawned (pushed, actually).
3220  */
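/* Note on lifetime management: each kmp_team_t keeps two task team pointers,
   t.t_task_team[0] and t.t_task_team[1], indexed by the thread's parity bit
   th_task_state. The parity toggles at every barrier (__kmp_task_team_sync),
   so one task team can still be drained by late threads while the primary
   thread sets up the task team for the next region (__kmp_task_team_setup). */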
3221 
3222 static kmp_task_team_t *__kmp_free_task_teams =
3223  NULL; // Free list for task_team data structures
3224 // Lock for task team data structures
3225 kmp_bootstrap_lock_t __kmp_task_team_lock =
3226  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3227 
3228 // __kmp_alloc_task_deque:
3229 // Allocates a task deque for a particular thread, and initializes the necessary
3230 // data structures relating to the deque. This only happens once per thread
3231 // per task team since task teams are recycled. No lock is needed during
3232 // allocation since each thread allocates its own deque.
3233 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3234  kmp_thread_data_t *thread_data) {
3235  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3236  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3237 
3238  // Initialize last stolen task field to "none"
3239  thread_data->td.td_deque_last_stolen = -1;
3240 
3241  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3242  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3243  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3244 
3245  KE_TRACE(
3246  10,
3247  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3248  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3249  // Allocate space for task deque, and zero the deque
3250  // Cannot use __kmp_thread_calloc() because threads not around for
3251  // kmp_reap_task_team( ).
3252  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3253  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3254  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3255 }
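// Deque sizes are kept as powers of two: the deque starts at
// INITIAL_TASK_DEQUE_SIZE and __kmp_realloc_task_deque doubles it on growth,
// so head and tail indices wrap cheaply with TASK_DEQUE_MASK instead of a
// modulo operation.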
3256 
3257 // __kmp_free_task_deque:
3258 // Deallocates a task deque for a particular thread. Happens at library
3259 // deallocation so don't need to reset all thread data fields.
3260 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3261  if (thread_data->td.td_deque != NULL) {
3262  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3263  TCW_4(thread_data->td.td_deque_ntasks, 0);
3264  __kmp_free(thread_data->td.td_deque);
3265  thread_data->td.td_deque = NULL;
3266  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3267  }
3268 
3269 #ifdef BUILD_TIED_TASK_STACK
3270  // GEH: Figure out what to do here for td_susp_tied_tasks
3271  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3272  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3273  }
3274 #endif // BUILD_TIED_TASK_STACK
3275 }
3276 
3277 // __kmp_realloc_task_threads_data:
3278 // Allocates a threads_data array for a task team, either by allocating an
3279 // initial array or enlarging an existing array. Only the first thread to get
3280 // the lock allocs or enlarges the array and re-initializes the array elements.
3281 // That thread returns "TRUE", the rest return "FALSE".
3282 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3283 // The current size is given by task_team -> tt.tt_max_threads.
3284 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3285  kmp_task_team_t *task_team) {
3286  kmp_thread_data_t **threads_data_p;
3287  kmp_int32 nthreads, maxthreads;
3288  int is_init_thread = FALSE;
3289 
3290  if (TCR_4(task_team->tt.tt_found_tasks)) {
3291  // Already reallocated and initialized.
3292  return FALSE;
3293  }
3294 
3295  threads_data_p = &task_team->tt.tt_threads_data;
3296  nthreads = task_team->tt.tt_nproc;
3297  maxthreads = task_team->tt.tt_max_threads;
3298 
3299  // All threads must lock when they encounter the first task of the implicit
3300  // task region to make sure threads_data fields are (re)initialized before
3301  // used.
3302  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3303 
3304  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3305  // first thread to enable tasking
3306  kmp_team_t *team = thread->th.th_team;
3307  int i;
3308 
3309  is_init_thread = TRUE;
3310  if (maxthreads < nthreads) {
3311 
3312  if (*threads_data_p != NULL) {
3313  kmp_thread_data_t *old_data = *threads_data_p;
3314  kmp_thread_data_t *new_data = NULL;
3315 
3316  KE_TRACE(
3317  10,
3318  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3319  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3320  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3321  // Reallocate threads_data to have more elements than current array
3322  // Cannot use __kmp_thread_realloc() because threads not around for
3323  // kmp_reap_task_team( ). Note all new array entries are initialized
3324  // to zero by __kmp_allocate().
3325  new_data = (kmp_thread_data_t *)__kmp_allocate(
3326  nthreads * sizeof(kmp_thread_data_t));
3327  // copy old data to new data
3328  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3329  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3330 
3331 #ifdef BUILD_TIED_TASK_STACK
3332  // GEH: Figure out if this is the right thing to do
3333  for (i = maxthreads; i < nthreads; i++) {
3334  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3335  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3336  }
3337 #endif // BUILD_TIED_TASK_STACK
3338  // Install the new data and free the old data
3339  (*threads_data_p) = new_data;
3340  __kmp_free(old_data);
3341  } else {
3342  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3343  "threads data for task_team %p, size = %d\n",
3344  __kmp_gtid_from_thread(thread), task_team, nthreads));
3345  // Make the initial allocate for threads_data array, and zero entries
3346  // Cannot use __kmp_thread_calloc() because threads not around for
3347  // kmp_reap_task_team( ).
3348  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3349  nthreads * sizeof(kmp_thread_data_t));
3350 #ifdef BUILD_TIED_TASK_STACK
3351  // GEH: Figure out if this is the right thing to do
3352  for (i = 0; i < nthreads; i++) {
3353  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3354  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3355  }
3356 #endif // BUILD_TIED_TASK_STACK
3357  }
3358  task_team->tt.tt_max_threads = nthreads;
3359  } else {
3360  // If array has (more than) enough elements, go ahead and use it
3361  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3362  }
3363 
3364  // initialize threads_data pointers back to thread_info structures
3365  for (i = 0; i < nthreads; i++) {
3366  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3367  thread_data->td.td_thr = team->t.t_threads[i];
3368 
3369  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3370  // The last-stolen field survives across teams / barriers, and the number
3371  // of threads may have changed. It's possible (likely?) that a new
3372  // parallel region will exhibit the same behavior as the previous region.
3373  thread_data->td.td_deque_last_stolen = -1;
3374  }
3375  }
3376 
3377  KMP_MB();
3378  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3379  }
3380 
3381  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3382  return is_init_thread;
3383 }
3384 
3385 // __kmp_free_task_threads_data:
3386 // Deallocates a threads_data array for a task team, including any attached
3387 // tasking deques. Only occurs at library shutdown.
3388 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3389  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3390  if (task_team->tt.tt_threads_data != NULL) {
3391  int i;
3392  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3393  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3394  }
3395  __kmp_free(task_team->tt.tt_threads_data);
3396  task_team->tt.tt_threads_data = NULL;
3397  }
3398  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3399 }
3400 
3401 // __kmp_allocate_task_team:
3402 // Allocates a task team associated with a specific team, taking it from
3403 // the global task team free list if possible. Also initializes data
3404 // structures.
3405 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3406  kmp_team_t *team) {
3407  kmp_task_team_t *task_team = NULL;
3408  int nthreads;
3409 
3410  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3411  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3412 
3413  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3414  // Take a task team from the task team pool
3415  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3416  if (__kmp_free_task_teams != NULL) {
3417  task_team = __kmp_free_task_teams;
3418  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3419  task_team->tt.tt_next = NULL;
3420  }
3421  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3422  }
3423 
3424  if (task_team == NULL) {
3425  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3426  "task team for team %p\n",
3427  __kmp_gtid_from_thread(thread), team));
3428  // Allocate a new task team if one is not available. Cannot use
3429  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3430  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3431  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3432 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3433  // Suppress race condition detection on synchronization flags in debug mode.
3434  // This helps to analyze library internals by eliminating false positives.
3435  __itt_suppress_mark_range(
3436  __itt_suppress_range, __itt_suppress_threading_errors,
3437  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3438  __itt_suppress_mark_range(__itt_suppress_range,
3439  __itt_suppress_threading_errors,
3440  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3441  sizeof(task_team->tt.tt_active));
3442 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3443  // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
3444  // task_team->tt.tt_threads_data = NULL;
3445  // task_team->tt.tt_max_threads = 0;
3446  // task_team->tt.tt_next = NULL;
3447  }
3448 
3449  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3450  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3451  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3452 
3453  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3454  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3455  TCW_4(task_team->tt.tt_active, TRUE);
3456 
3457  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3458  "unfinished_threads init'd to %d\n",
3459  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3460  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3461  return task_team;
3462 }
3463 
3464 // __kmp_free_task_team:
3465 // Frees the task team associated with a specific thread, and adds it
3466 // to the global task team free list.
3467 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3468  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3469  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3470 
3471  // Put task team back on free list
3472  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3473 
3474  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3475  task_team->tt.tt_next = __kmp_free_task_teams;
3476  TCW_PTR(__kmp_free_task_teams, task_team);
3477 
3478  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3479 }
3480 
3481 // __kmp_reap_task_teams:
3482 // Free all the task teams on the task team free list.
3483 // Should only be done during library shutdown.
3484 // Cannot do anything that needs a thread structure or gtid since they are
3485 // already gone.
3486 void __kmp_reap_task_teams(void) {
3487  kmp_task_team_t *task_team;
3488 
3489  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3490  // Free all task_teams on the free list
3491  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3492  while ((task_team = __kmp_free_task_teams) != NULL) {
3493  __kmp_free_task_teams = task_team->tt.tt_next;
3494  task_team->tt.tt_next = NULL;
3495 
3496  // Free threads_data if necessary
3497  if (task_team->tt.tt_threads_data != NULL) {
3498  __kmp_free_task_threads_data(task_team);
3499  }
3500  __kmp_free(task_team);
3501  }
3502  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3503  }
3504 }
3505 
3506 // __kmp_wait_to_unref_task_teams:
3507 // Some threads could still be in the fork barrier release code, possibly
3508 // trying to steal tasks. Wait for each thread to unreference its task team.
3509 void __kmp_wait_to_unref_task_teams(void) {
3510  kmp_info_t *thread;
3511  kmp_uint32 spins;
3512  int done;
3513 
3514  KMP_INIT_YIELD(spins);
3515 
3516  for (;;) {
3517  done = TRUE;
3518 
3519  // TODO: GEH - this may be wrong because some sync would be necessary
3520  // in case threads are added to the pool during the traversal. Need to
3521  // verify that the lock for the thread pool is held when calling this routine.
3522  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3523  thread = thread->th.th_next_pool) {
3524 #if KMP_OS_WINDOWS
3525  DWORD exit_val;
3526 #endif
3527  if (TCR_PTR(thread->th.th_task_team) == NULL) {
3528  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3529  __kmp_gtid_from_thread(thread)));
3530  continue;
3531  }
3532 #if KMP_OS_WINDOWS
3533  // TODO: GEH - add this check for Linux* OS / OS X* as well?
3534  if (!__kmp_is_thread_alive(thread, &exit_val)) {
3535  thread->th.th_task_team = NULL;
3536  continue;
3537  }
3538 #endif
3539 
3540  done = FALSE; // Because th_task_team pointer is not NULL for this thread
3541 
3542  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3543  "unreference task_team\n",
3544  __kmp_gtid_from_thread(thread)));
3545 
3546  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3547  volatile void *sleep_loc;
3548  // If the thread is sleeping, awaken it.
3549  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3550  NULL) {
3551  KA_TRACE(
3552  10,
3553  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3554  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3555  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3556  }
3557  }
3558  }
3559  if (done) {
3560  break;
3561  }
3562 
3563  // If oversubscribed or have waited a bit, yield.
3564  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
3565  }
3566 }
3567 
3568 // __kmp_task_team_setup: Create a task_team for the current team, but use
3569 // an already created, unused one if it already exists.
3570 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3571  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3572 
3573  // If this task_team hasn't been created yet, allocate it. It will be used in
3574  // the region after the next.
3575  // If it exists, it is the current task team and shouldn't be touched yet as
3576  // it may still be in use.
3577  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3578  (always || team->t.t_nproc > 1)) {
3579  team->t.t_task_team[this_thr->th.th_task_state] =
3580  __kmp_allocate_task_team(this_thr, team);
3581  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3582  " for team %d at parity=%d\n",
3583  __kmp_gtid_from_thread(this_thr),
3584  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3585  this_thr->th.th_task_state));
3586  }
3587 
3588  // After threads exit the release, they will call sync, and then point to this
3589  // other task_team; make sure it is allocated and properly initialized. As
3590  // threads spin in the barrier release phase, they will continue to use the
3591  // previous task_team struct (above), until they receive the signal to stop
3592  // checking for tasks (they can't safely reference the kmp_team_t struct,
3593  // which could be reallocated by the primary thread). No task teams are formed
3594  // for serialized teams.
3595  if (team->t.t_nproc > 1) {
3596  int other_team = 1 - this_thr->th.th_task_state;
3597  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3598  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3599  team->t.t_task_team[other_team] =
3600  __kmp_allocate_task_team(this_thr, team);
3601  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3602  "task_team %p for team %d at parity=%d\n",
3603  __kmp_gtid_from_thread(this_thr),
3604  team->t.t_task_team[other_team], team->t.t_id, other_team));
3605  } else { // Leave the old task team struct in place for the upcoming region;
3606  // adjust as needed
3607  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3608  if (!task_team->tt.tt_active ||
3609  team->t.t_nproc != task_team->tt.tt_nproc) {
3610  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3611  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3612  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3613  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3614  team->t.t_nproc);
3615  TCW_4(task_team->tt.tt_active, TRUE);
3616  }
3617  // if team size has changed, the first thread to enable tasking will
3618  // realloc threads_data if necessary
3619  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
3620  "%p for team %d at parity=%d\n",
3621  __kmp_gtid_from_thread(this_thr),
3622  team->t.t_task_team[other_team], team->t.t_id, other_team));
3623  }
3624  }
3625 
3626  // For regular threads, task enabling is deferred until a task is about to
3627  // be pushed to a deque. However, for the hidden helper thread, we need
3628  // it ahead of time so that some operations can be performed without race
3629  // conditions.
3630  if (this_thr == __kmp_hidden_helper_main_thread) {
3631  for (int i = 0; i < 2; ++i) {
3632  kmp_task_team_t *task_team = team->t.t_task_team[i];
3633  if (KMP_TASKING_ENABLED(task_team)) {
3634  continue;
3635  }
3636  __kmp_enable_tasking(task_team, this_thr);
3637  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
3638  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
3639  if (thread_data->td.td_deque == NULL) {
3640  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
3641  }
3642  }
3643  }
3644  }
3645 }
3646 
3647 // __kmp_task_team_sync: Propagation of task team data from team to threads
3648 // which happens just after the release phase of a team barrier. This may be
3649 // called by any thread, but only for teams with # threads > 1.
3650 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3651  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3652 
3653  // Toggle the th_task_state field, to switch which task_team this thread
3654  // refers to
3655  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
3656 
3657  // It is now safe to propagate the task team pointer from the team struct to
3658  // the current thread.
3659  TCW_PTR(this_thr->th.th_task_team,
3660  team->t.t_task_team[this_thr->th.th_task_state]);
3661  KA_TRACE(20,
3662  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3663  "%p from Team #%d (parity=%d)\n",
3664  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3665  team->t.t_id, this_thr->th.th_task_state));
3666 }
3667 
3668 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
3669 // barrier gather phase. Only called by primary thread if #threads in team > 1
3670 // or if proxy tasks were created.
3671 //
3672 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
3673 // by passing in 0 optionally as the last argument. When wait is zero, primary
3674 // thread does not wait for unfinished_threads to reach 0.
3675 void __kmp_task_team_wait(
3676  kmp_info_t *this_thr,
3677  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3678  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3679 
3680  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3681  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3682 
3683  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3684  if (wait) {
3685  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
3686  "(for unfinished_threads to reach 0) on task_team = %p\n",
3687  __kmp_gtid_from_thread(this_thr), task_team));
3688  // Worker threads may have dropped through to release phase, but could
3689  // still be executing tasks. Wait here for tasks to complete. To avoid
3690  // memory contention, only primary thread checks termination condition.
3691  kmp_flag_32<false, false> flag(
3692  RCAST(std::atomic<kmp_uint32> *,
3693  &task_team->tt.tt_unfinished_threads),
3694  0U);
3695  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3696  }
3697  // Deactivate the old task team, so that the worker threads will stop
3698  // referencing it while spinning.
3699  KA_TRACE(
3700  20,
3701  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
3702  "setting active to false, setting local and team's pointer to NULL\n",
3703  __kmp_gtid_from_thread(this_thr), task_team));
3704  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3705  task_team->tt.tt_found_proxy_tasks == TRUE);
3706  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3707  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3708  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3709  KMP_MB();
3710 
3711  TCW_PTR(this_thr->th.th_task_team, NULL);
3712  }
3713 }
3714 
3715 // __kmp_tasking_barrier:
3716 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
3717 // Internal function to execute all tasks prior to a regular barrier or a join
3718 // barrier. It is a full barrier itself, which unfortunately turns regular
3719 // barriers into double barriers and join barriers into 1 1/2 barriers.
3720 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3721  std::atomic<kmp_uint32> *spin = RCAST(
3722  std::atomic<kmp_uint32> *,
3723  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3724  int flag = FALSE;
3725  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3726 
3727 #if USE_ITT_BUILD
3728  KMP_FSYNC_SPIN_INIT(spin, NULL);
3729 #endif /* USE_ITT_BUILD */
3730  kmp_flag_32<false, false> spin_flag(spin, 0U);
3731  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3732  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3733 #if USE_ITT_BUILD
3734  // TODO: What about itt_sync_obj??
3735  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
3736 #endif /* USE_ITT_BUILD */
3737 
3738  if (TCR_4(__kmp_global.g.g_done)) {
3739  if (__kmp_global.g.g_abort)
3740  __kmp_abort_thread();
3741  break;
3742  }
3743  KMP_YIELD(TRUE);
3744  }
3745 #if USE_ITT_BUILD
3746  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
3747 #endif /* USE_ITT_BUILD */
3748 }
3749 
3750 // __kmp_give_task puts a task into a given thread's queue if:
3751 // - the queue for that thread was created
3752 // - there's space in that queue
3753 // Because of this, __kmp_push_task needs to check if there's space after
3754 // getting the lock
3755 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
3756  kmp_int32 pass) {
3757  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3758  kmp_task_team_t *task_team = taskdata->td_task_team;
3759 
3760  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
3761  taskdata, tid));
3762 
3763  // If task_team is NULL something went really bad...
3764  KMP_DEBUG_ASSERT(task_team != NULL);
3765 
3766  bool result = false;
3767  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3768 
3769  if (thread_data->td.td_deque == NULL) {
3770  // There's no queue in this thread, go find another one
3771  // We're guaranteed that at least one thread has a queue
3772  KA_TRACE(30,
3773  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3774  tid, taskdata));
3775  return result;
3776  }
3777 
3778  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3779  TASK_DEQUE_SIZE(thread_data->td)) {
3780  KA_TRACE(
3781  30,
3782  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3783  taskdata, tid));
3784 
3785  // if this deque is bigger than the pass ratio, give another
3786  // thread a chance
3787  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3788  return result;
3789 
3790  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3791  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3792  TASK_DEQUE_SIZE(thread_data->td)) {
3793  // expand deque to push the task which is not allowed to execute
3794  __kmp_realloc_task_deque(thread, thread_data);
3795  }
3796 
3797  } else {
3798 
3799  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3800 
3801  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3802  TASK_DEQUE_SIZE(thread_data->td)) {
3803  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3804  "thread %d.\n",
3805  taskdata, tid));
3806 
3807  // if this deque is bigger than the pass ratio, give another
3808  // thread a chance
3809  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3810  goto release_and_exit;
3811 
3812  __kmp_realloc_task_deque(thread, thread_data);
3813  }
3814  }
3815 
3816  // lock is held here, and there is space in the deque
3817 
3818  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3819  // Wrap index.
3820  thread_data->td.td_deque_tail =
3821  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3822  TCW_4(thread_data->td.td_deque_ntasks,
3823  TCR_4(thread_data->td.td_deque_ntasks) + 1);
3824 
3825  result = true;
3826  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3827  taskdata, tid));
3828 
3829 release_and_exit:
3830  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3831 
3832  return result;
3833 }
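// __kmp_give_task is used by the proxy-task completion path below to hand the
// bottom half of a proxy task to some thread of the owning team. The 'pass'
// argument grows (it is doubled after each full sweep over the team in
// __kmpc_proxy_task_completed_ooo), which progressively relaxes the limit on
// how large a target deque may already be before we are willing to enlarge it.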
3834 
3835 /* The finish of a proxy task is divided into two pieces:
3836  - the top half is the one that can be done from a thread outside the team
3837  - the bottom half must be run from a thread within the team
3838 
3839  In order to run the bottom half the task gets queued back into one of the
3840  threads of the team. Once the td_incomplete_child_tasks counter of the parent
3841  is decremented the threads can leave the barriers. So, the bottom half needs
3842  to be queued before the counter is decremented. The top half is therefore
3843  divided into two parts:
3844  - things that can be run before queuing the bottom half
3845  - things that must be run after queuing the bottom half
3846 
3847  This creates a second race as the bottom half can free the task before the
3848  second top half is executed. To avoid this we use the
3849  td_incomplete_child_tasks counter of the proxy task to synchronize the top
3850  and bottom halves. */
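/* Putting the pieces together, __kmpc_proxy_task_completed_ooo() below runs:
     1) first top half  - mark the task complete, leave its taskgroup, and add
                          an imaginary child so the bottom half cannot free it;
     2) __kmp_give_task - queue the bottom half into a thread of the team;
     3) second top half - release the parent's incomplete-child count and
                          remove the imaginary child;
     4) bottom half     - executed inside the team: wait for 3), release
                          dependences, then free the task and its ancestors. */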
3851 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3852  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3853  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3854  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3855  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3856 
3857  taskdata->td_flags.complete = 1; // mark the task as completed
3858 
3859  if (taskdata->td_taskgroup)
3860  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3861 
3862  // Create an imaginary child for this task so the bottom half cannot
3863  // release the task before we have completed the second top half
3864  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3865 }
3866 
3867 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3868  kmp_int32 children = 0;
3869 
3870  // Predecrement simulated by "- 1" calculation
3871  children =
3872  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3873  KMP_DEBUG_ASSERT(children >= 0);
3874 
3875  // Remove the imaginary child
3876  KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3877 }
3878 
3879 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3880  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3881  kmp_info_t *thread = __kmp_threads[gtid];
3882 
3883  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3884  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3885  1); // top half must run before bottom half
3886 
3887  // We need to wait to make sure the top half is finished
3888  // Spinning here should be ok as this should happen quickly
3889  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3890  ;
3891 
3892  __kmp_release_deps(gtid, taskdata);
3893  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3894 }
3895 
3904 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3905  KMP_DEBUG_ASSERT(ptask != NULL);
3906  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3907  KA_TRACE(
3908  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3909  gtid, taskdata));
3910  __kmp_assert_valid_gtid(gtid);
3911  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3912 
3913  __kmp_first_top_half_finish_proxy(taskdata);
3914  __kmp_second_top_half_finish_proxy(taskdata);
3915  __kmp_bottom_half_finish_proxy(gtid, ptask);
3916 
3917  KA_TRACE(10,
3918  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3919  gtid, taskdata));
3920 }
3921 
3929 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3930  KMP_DEBUG_ASSERT(ptask != NULL);
3931  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3932 
3933  KA_TRACE(
3934  10,
3935  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3936  taskdata));
3937 
3938  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3939 
3940  __kmp_first_top_half_finish_proxy(taskdata);
3941 
3942  // Enqueue task to complete bottom half completion from a thread within the
3943  // corresponding team
3944  kmp_team_t *team = taskdata->td_team;
3945  kmp_int32 nthreads = team->t.t_nproc;
3946  kmp_info_t *thread;
3947 
3948  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3949  // but we cannot use __kmp_get_random here
3950  kmp_int32 start_k = 0;
3951  kmp_int32 pass = 1;
3952  kmp_int32 k = start_k;
3953 
3954  do {
3955  // For now we're just linearly trying to find a thread
3956  thread = team->t.t_threads[k];
3957  k = (k + 1) % nthreads;
3958 
3959  // we did a full pass through all the threads
3960  if (k == start_k)
3961  pass = pass << 1;
3962 
3963  } while (!__kmp_give_task(thread, k, ptask, pass));
3964 
3965  __kmp_second_top_half_finish_proxy(taskdata);
3966 
3967  KA_TRACE(
3968  10,
3969  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3970  taskdata));
3971 }
3972 
3973 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
3974  kmp_task_t *task) {
3975  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
3976  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
3977  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
3978  td->td_allow_completion_event.ed.task = task;
3979  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
3980  }
3981  return &td->td_allow_completion_event;
3982 }
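// This entry point and __kmp_fulfill_event() below together implement the
// OpenMP 5.0 "detach" clause: the compiler calls
// __kmpc_task_allow_completion_event() when it creates a detachable task and
// exposes the returned kmp_event_t to the program as an omp_event_handle_t;
// a later omp_fulfill_event() call ends up in __kmp_fulfill_event().
// Illustrative user-level sketch (do_async_work is a placeholder):
//   omp_event_handle_t ev;
//   #pragma omp task detach(ev)
//   do_async_work(ev); // some agent later calls omp_fulfill_event(ev)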
3983 
3984 void __kmp_fulfill_event(kmp_event_t *event) {
3985  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
3986  kmp_task_t *ptask = event->ed.task;
3987  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3988  bool detached = false;
3989  int gtid = __kmp_get_gtid();
3990 
3991  // The associated task might have completed or could be completing at this
3992  // point.
3993  // We need to take the lock to avoid races
3994  __kmp_acquire_tas_lock(&event->lock, gtid);
3995  if (taskdata->td_flags.proxy == TASK_PROXY) {
3996  detached = true;
3997  } else {
3998 #if OMPT_SUPPORT
3999  // The OMPT event must occur under mutual exclusion,
4000  // otherwise the tool might access ptask after free
4001  if (UNLIKELY(ompt_enabled.enabled))
4002  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4003 #endif
4004  }
4005  event->type = KMP_EVENT_UNINITIALIZED;
4006  __kmp_release_tas_lock(&event->lock, gtid);
4007 
4008  if (detached) {
4009 #if OMPT_SUPPORT
4010  // We free ptask afterwards and know the task is finished,
4011  // so locking is not necessary
4012  if (UNLIKELY(ompt_enabled.enabled))
4013  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4014 #endif
4015  // If the task detached, complete the proxy task
4016  if (gtid >= 0) {
4017  kmp_team_t *team = taskdata->td_team;
4018  kmp_info_t *thread = __kmp_get_thread();
4019  if (thread->th.th_team == team) {
4020  __kmpc_proxy_task_completed(gtid, ptask);
4021  return;
4022  }
4023  }
4024 
4025  // fallback: the current thread is not part of the task's team
4026  __kmpc_proxy_task_completed_ooo(ptask);
4027  }
4028  }
4029 }
4030 
4031 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4032 // for taskloop
4033 //
4034 // thread: allocating thread
4035 // task_src: pointer to source task to be duplicated
4036 // returns: a pointer to the allocated kmp_task_t structure (task).
4037 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
4038  kmp_task_t *task;
4039  kmp_taskdata_t *taskdata;
4040  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4041  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4042  size_t shareds_offset;
4043  size_t task_size;
4044 
4045  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4046  task_src));
4047  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4048  TASK_FULL); // it should not be proxy task
4049  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4050  task_size = taskdata_src->td_size_alloc;
4051 
4052  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4053  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4054  task_size));
4055 #if USE_FAST_MEMORY
4056  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4057 #else
4058  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4059 #endif /* USE_FAST_MEMORY */
4060  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4061 
4062  task = KMP_TASKDATA_TO_TASK(taskdata);
4063 
4064  // Initialize new task (only specific fields not affected by memcpy)
4065  taskdata->td_task_id = KMP_GEN_TASK_ID();
4066  if (task->shareds != NULL) { // need to set up shareds pointer
4067  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4068  task->shareds = &((char *)taskdata)[shareds_offset];
4069  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4070  0);
4071  }
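  // Note: a single memcpy of td_size_alloc bytes is sufficient because the
  // kmp_taskdata_t, the kmp_task_t and the shareds block are co-allocated in
  // one chunk; only the shareds pointer needs rebasing into the new block.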
4072  taskdata->td_alloc_thread = thread;
4073  taskdata->td_parent = parent_task;
4074  // task inherits the taskgroup from the parent task
4075  taskdata->td_taskgroup = parent_task->td_taskgroup;
4076  // tied task needs to initialize the td_last_tied at creation,
4077  // untied one does this when it is scheduled for execution
4078  if (taskdata->td_flags.tiedness == TASK_TIED)
4079  taskdata->td_last_tied = taskdata;
4080 
4081  // Only need to keep track of child task counts if team parallel and tasking
4082  // not serialized
4083  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4084  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4085  if (parent_task->td_taskgroup)
4086  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4087  // Only need to keep track of allocated child tasks for explicit tasks since
4088  // implicit not deallocated
4089  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4090  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4091  }
4092 
4093  KA_TRACE(20,
4094  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4095  thread, taskdata, taskdata->td_parent));
4096 #if OMPT_SUPPORT
4097  if (UNLIKELY(ompt_enabled.enabled))
4098  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4099 #endif
4100  return task;
4101 }
4102 
4103 // Routine optionally generated by the compiler for setting the lastprivate flag
4104 // and calling needed constructors for private/firstprivate objects
4105 // (used to form taskloop tasks from pattern task)
4106 // Parameters: dest task, src task, lastprivate flag.
4107 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4108 
4109 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4110 
4111 // class to encapsulate manipulating loop bounds in a taskloop task.
4112 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4113 // the loop bound variables.
4114 class kmp_taskloop_bounds_t {
4115  kmp_task_t *task;
4116  const kmp_taskdata_t *taskdata;
4117  size_t lower_offset;
4118  size_t upper_offset;
4119 
4120 public:
4121  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4122  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4123  lower_offset((char *)lb - (char *)task),
4124  upper_offset((char *)ub - (char *)task) {
4125  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4126  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4127  }
4128  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4129  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4130  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4131  size_t get_lower_offset() const { return lower_offset; }
4132  size_t get_upper_offset() const { return upper_offset; }
4133  kmp_uint64 get_lb() const {
4134  kmp_int64 retval;
4135 #if defined(KMP_GOMP_COMPAT)
4136  // Intel task just returns the lower bound normally
4137  if (!taskdata->td_flags.native) {
4138  retval = *(kmp_int64 *)((char *)task + lower_offset);
4139  } else {
4140  // GOMP task has to take into account the sizeof(long)
4141  if (taskdata->td_size_loop_bounds == 4) {
4142  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4143  retval = (kmp_int64)*lb;
4144  } else {
4145  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4146  retval = (kmp_int64)*lb;
4147  }
4148  }
4149 #else
4150  (void)taskdata;
4151  retval = *(kmp_int64 *)((char *)task + lower_offset);
4152 #endif // defined(KMP_GOMP_COMPAT)
4153  return retval;
4154  }
4155  kmp_uint64 get_ub() const {
4156  kmp_int64 retval;
4157 #if defined(KMP_GOMP_COMPAT)
4158  // Intel task just returns the upper bound normally
4159  if (!taskdata->td_flags.native) {
4160  retval = *(kmp_int64 *)((char *)task + upper_offset);
4161  } else {
4162  // GOMP task has to take into account the sizeof(long)
4163  if (taskdata->td_size_loop_bounds == 4) {
4164  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4165  retval = (kmp_int64)*ub;
4166  } else {
4167  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4168  retval = (kmp_int64)*ub;
4169  }
4170  }
4171 #else
4172  retval = *(kmp_int64 *)((char *)task + upper_offset);
4173 #endif // defined(KMP_GOMP_COMPAT)
4174  return retval;
4175  }
4176  void set_lb(kmp_uint64 lb) {
4177 #if defined(KMP_GOMP_COMPAT)
4178  // Intel task just sets the lower bound normally
4179  if (!taskdata->td_flags.native) {
4180  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4181  } else {
4182  // GOMP task has to take into account the sizeof(long)
4183  if (taskdata->td_size_loop_bounds == 4) {
4184  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4185  *lower = (kmp_uint32)lb;
4186  } else {
4187  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4188  *lower = (kmp_uint64)lb;
4189  }
4190  }
4191 #else
4192  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4193 #endif // defined(KMP_GOMP_COMPAT)
4194  }
4195  void set_ub(kmp_uint64 ub) {
4196 #if defined(KMP_GOMP_COMPAT)
4197  // Intel task just sets the upper bound normally
4198  if (!taskdata->td_flags.native) {
4199  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4200  } else {
4201  // GOMP task has to take into account the sizeof(long)
4202  if (taskdata->td_size_loop_bounds == 4) {
4203  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4204  *upper = (kmp_uint32)ub;
4205  } else {
4206  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4207  *upper = (kmp_uint64)ub;
4208  }
4209  }
4210 #else
4211  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4212 #endif // defined(KMP_GOMP_COMPAT)
4213  }
4214 };
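// In GOMP-compat mode (td_flags.native) the loop bounds live in the first two
// elements of task->shareds and their width follows the compiler's 'long'
// (td_size_loop_bounds is 4 or 8); otherwise they sit at fixed byte offsets
// inside the kmp_task_t, captured when the bounds object is constructed.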
4215 
4216 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4217 //
4218 // loc Source location information
4219 // gtid Global thread ID
4220 // task Pattern task, exposes the loop iteration range
4221 // lb Pointer to loop lower bound in task structure
4222 // ub Pointer to loop upper bound in task structure
4223 // st Loop stride
4224 // ub_glob Global upper bound (used for lastprivate check)
4225 // num_tasks Number of tasks to execute
4226 // grainsize Number of loop iterations per task
4227 // extras Number of chunks with grainsize+1 iterations
4228 // last_chunk Reduction of grainsize for last task
4229 // tc Iterations count
4230 // task_dup Tasks duplication routine
4231 // codeptr_ra Return address for OMPT events
4232 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4233  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4234  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4235  kmp_uint64 grainsize, kmp_uint64 extras,
4236  kmp_int64 last_chunk, kmp_uint64 tc,
4237 #if OMPT_SUPPORT
4238  void *codeptr_ra,
4239 #endif
4240  void *task_dup) {
4241  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4242  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4243  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4244  // compiler provides global bounds here
4245  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4246  kmp_uint64 lower = task_bounds.get_lb();
4247  kmp_uint64 upper = task_bounds.get_ub();
4248  kmp_uint64 i;
4249  kmp_info_t *thread = __kmp_threads[gtid];
4250  kmp_taskdata_t *current_task = thread->th.th_current_task;
4251  kmp_task_t *next_task;
4252  kmp_int32 lastpriv = 0;
4253 
4254  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4255  (last_chunk < 0 ? last_chunk : extras));
4256  KMP_DEBUG_ASSERT(num_tasks > extras);
4257  KMP_DEBUG_ASSERT(num_tasks > 0);
4258  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4259  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4260  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4261  ub_glob, st, task_dup));
4262 
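  // Worked example of the chunking below: with tc = 10, num_tasks = 3,
  // grainsize = 3, extras = 1 and last_chunk = 0, the first task gets
  // grainsize + 1 = 4 iterations and the remaining two get 3 each
  // (4 + 3 + 3 = 10 = num_tasks * grainsize + extras).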
4263  // Launch num_tasks tasks, assign grainsize iterations each task
4264  for (i = 0; i < num_tasks; ++i) {
4265  kmp_uint64 chunk_minus_1;
4266  if (extras == 0) {
4267  chunk_minus_1 = grainsize - 1;
4268  } else {
4269  chunk_minus_1 = grainsize;
4270  --extras; // the first 'extras' tasks get a bigger chunk (grainsize+1)
4271  }
4272  upper = lower + st * chunk_minus_1;
4273  if (upper > *ub) {
4274  upper = *ub;
4275  }
4276  if (i == num_tasks - 1) {
4277  // schedule the last task, set lastprivate flag if needed
4278  if (st == 1) { // most common case
4279  KMP_DEBUG_ASSERT(upper == *ub);
4280  if (upper == ub_glob)
4281  lastpriv = 1;
4282  } else if (st > 0) { // positive loop stride
4283  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4284  if ((kmp_uint64)st > ub_glob - upper)
4285  lastpriv = 1;
4286  } else { // negative loop stride
4287  KMP_DEBUG_ASSERT(upper + st < *ub);
4288  if (upper - ub_glob < (kmp_uint64)(-st))
4289  lastpriv = 1;
4290  }
4291  }
4292  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4293  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4294  kmp_taskloop_bounds_t next_task_bounds =
4295  kmp_taskloop_bounds_t(next_task, task_bounds);
4296 
4297  // adjust task-specific bounds
4298  next_task_bounds.set_lb(lower);
4299  if (next_taskdata->td_flags.native) {
4300  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4301  } else {
4302  next_task_bounds.set_ub(upper);
4303  }
4304  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4305  // etc.
4306  ptask_dup(next_task, task, lastpriv);
4307  KA_TRACE(40,
4308  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4309  "upper %lld stride %lld, (offsets %p %p)\n",
4310  gtid, i, next_task, lower, upper, st,
4311  next_task_bounds.get_lower_offset(),
4312  next_task_bounds.get_upper_offset()));
4313 #if OMPT_SUPPORT
4314  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4315  codeptr_ra); // schedule new task
4316 #else
4317  __kmp_omp_task(gtid, next_task, true); // schedule new task
4318 #endif
4319  lower = upper + st; // adjust lower bound for the next iteration
4320  }
4321  // free the pattern task and exit
4322  __kmp_task_start(gtid, task, current_task); // do internal bookkeeping
4323  // do not execute the pattern task, just do internal bookkeeping
4324  __kmp_task_finish<false>(gtid, task, current_task);
4325 }
4326 
4327 // Structure to keep taskloop parameters for auxiliary task
4328 // kept in the shareds of the task structure.
4329 typedef struct __taskloop_params {
4330  kmp_task_t *task;
4331  kmp_uint64 *lb;
4332  kmp_uint64 *ub;
4333  void *task_dup;
4334  kmp_int64 st;
4335  kmp_uint64 ub_glob;
4336  kmp_uint64 num_tasks;
4337  kmp_uint64 grainsize;
4338  kmp_uint64 extras;
4339  kmp_int64 last_chunk;
4340  kmp_uint64 tc;
4341  kmp_uint64 num_t_min;
4342 #if OMPT_SUPPORT
4343  void *codeptr_ra;
4344 #endif
4345 } __taskloop_params_t;
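// Editor's note: these fields mirror the argument lists of
// __kmp_taskloop_recur()/__kmp_taskloop_linear(); packing them into the
// shareds of an auxiliary task lets __kmp_taskloop_task() below resume the
// decomposition of the scheduled half on whichever thread eventually runs it.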
4346 
4347 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4348  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4349  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4350  kmp_uint64,
4351 #if OMPT_SUPPORT
4352  void *,
4353 #endif
4354  void *);
4355 
4356 // Execute part of the taskloop submitted as a task.
4357 int __kmp_taskloop_task(int gtid, void *ptask) {
4358  __taskloop_params_t *p =
4359  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4360  kmp_task_t *task = p->task;
4361  kmp_uint64 *lb = p->lb;
4362  kmp_uint64 *ub = p->ub;
4363  void *task_dup = p->task_dup;
4364  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4365  kmp_int64 st = p->st;
4366  kmp_uint64 ub_glob = p->ub_glob;
4367  kmp_uint64 num_tasks = p->num_tasks;
4368  kmp_uint64 grainsize = p->grainsize;
4369  kmp_uint64 extras = p->extras;
4370  kmp_int64 last_chunk = p->last_chunk;
4371  kmp_uint64 tc = p->tc;
4372  kmp_uint64 num_t_min = p->num_t_min;
4373 #if OMPT_SUPPORT
4374  void *codeptr_ra = p->codeptr_ra;
4375 #endif
4376 #if KMP_DEBUG
4377  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4378  KMP_DEBUG_ASSERT(task != NULL);
4379  KA_TRACE(20,
4380  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4381  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4382  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4383  st, task_dup));
4384 #endif
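  // Editor's note: num_tasks here is the n_tsk1 half produced by
  // __kmp_taskloop_recur(), i.e. at least half of a parent count that
  // exceeded num_t_min, which is presumably why the invariant below is
  // expected to hold.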
4385  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4386  if (num_tasks > num_t_min)
4387  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4388  grainsize, extras, last_chunk, tc, num_t_min,
4389 #if OMPT_SUPPORT
4390  codeptr_ra,
4391 #endif
4392  task_dup);
4393  else
4394  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4395  grainsize, extras, last_chunk, tc,
4396 #if OMPT_SUPPORT
4397  codeptr_ra,
4398 #endif
4399  task_dup);
4400 
4401  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4402  return 0;
4403 }
4404 
4405 // Schedule part of the taskloop as a task,
4406 // execute the rest of the taskloop.
4407 //
4408 // loc Source location information
4409 // gtid Global thread ID
4410 // task Pattern task, exposes the loop iteration range
4411 // lb Pointer to loop lower bound in task structure
4412 // ub Pointer to loop upper bound in task structure
4413 // st Loop stride
4414 // ub_glob Global upper bound (used for lastprivate check)
4415 // num_tasks Number of tasks to execute
4416 // grainsize Number of loop iterations per task
4417 // extras Number of chunks with grainsize+1 iterations
4418 // last_chunk Reduction of grainsize for last task
4419  // tc Iteration count
4420  // num_t_min Threshold to launch tasks recursively
4421  // task_dup Task duplication routine
4422 // codeptr_ra Return address for OMPT events
4423 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4424  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4425  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4426  kmp_uint64 grainsize, kmp_uint64 extras,
4427  kmp_int64 last_chunk, kmp_uint64 tc,
4428  kmp_uint64 num_t_min,
4429 #if OMPT_SUPPORT
4430  void *codeptr_ra,
4431 #endif
4432  void *task_dup) {
4433  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4434  KMP_DEBUG_ASSERT(task != NULL);
4435  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4436  KA_TRACE(20,
4437  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4438  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4439  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4440  st, task_dup));
4441  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4442  kmp_uint64 lower = *lb;
4443  kmp_info_t *thread = __kmp_threads[gtid];
4444  // kmp_taskdata_t *current_task = thread->th.th_current_task;
4445  kmp_task_t *next_task;
4446  size_t lower_offset =
4447  (char *)lb - (char *)task; // remember offset of lb in the task structure
4448  size_t upper_offset =
4449  (char *)ub - (char *)task; // remember offset of ub in the task structure
4450 
4451  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4452  (last_chunk < 0 ? last_chunk : extras));
4453  KMP_DEBUG_ASSERT(num_tasks > extras);
4454  KMP_DEBUG_ASSERT(num_tasks > 0);
4455 
4456  // split the loop in two halves
4457  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4458  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
4459  kmp_uint64 gr_size0 = grainsize;
4460  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4461  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4462  if (last_chunk < 0) {
4463  ext0 = ext1 = 0;
4464  last_chunk1 = last_chunk;
4465  tc0 = grainsize * n_tsk0;
4466  tc1 = tc - tc0;
4467  } else if (n_tsk0 <= extras) {
4468  gr_size0++; // integrate extras into grainsize
4469  ext0 = 0; // no extra iters in 1st half
4470  ext1 = extras - n_tsk0; // remaining extras
4471  tc0 = gr_size0 * n_tsk0;
4472  tc1 = tc - tc0;
4473  } else { // n_tsk0 > extras
4474  ext1 = 0; // no extra iters in 2nd half
4475  ext0 = extras;
4476  tc1 = grainsize * n_tsk1;
4477  tc0 = tc - tc1;
4478  }
4479  ub0 = lower + st * (tc0 - 1);
4480  lb1 = ub0 + st;
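  // Editor's illustration (values not from the source): num_tasks = 5,
  // grainsize = 3, extras = 2, tc = 17, st = 1, lower = 0 gives n_tsk0 = 2,
  // n_tsk1 = 3; since n_tsk0 <= extras, gr_size0 = 4, ext0 = 0, ext1 = 0,
  // tc0 = 8 and tc1 = 9, so the half executed here covers [0,7] (ub0 = 7)
  // and the half scheduled as a task starts at lb1 = 8.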
4481 
4482  // create pattern task for 2nd half of the loop
4483  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4484  // adjust lower bound (upper bound is not changed) for the 2nd half
4485  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4486  if (ptask_dup != NULL) // construct firstprivates, etc.
4487  ptask_dup(next_task, task, 0);
4488  *ub = ub0; // adjust upper bound for the 1st half
4489 
4490  // create auxiliary task for 2nd half of the loop
4491  // make sure new task has same parent task as the pattern task
4492  kmp_taskdata_t *current_task = thread->th.th_current_task;
4493  thread->th.th_current_task = taskdata->td_parent;
4494  kmp_task_t *new_task =
4495  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4496  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4497  // restore current task
4498  thread->th.th_current_task = current_task;
4499  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4500  p->task = next_task;
4501  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4502  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4503  p->task_dup = task_dup;
4504  p->st = st;
4505  p->ub_glob = ub_glob;
4506  p->num_tasks = n_tsk1;
4507  p->grainsize = grainsize;
4508  p->extras = ext1;
4509  p->last_chunk = last_chunk1;
4510  p->tc = tc1;
4511  p->num_t_min = num_t_min;
4512 #if OMPT_SUPPORT
4513  p->codeptr_ra = codeptr_ra;
4514 #endif
4515 
4516 #if OMPT_SUPPORT
4517  // schedule new task with correct return address for OMPT events
4518  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4519 #else
4520  __kmp_omp_task(gtid, new_task, true); // schedule new task
4521 #endif
4522 
4523  // execute the 1st half of current subrange
4524  if (n_tsk0 > num_t_min)
4525  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4526  ext0, last_chunk0, tc0, num_t_min,
4527 #if OMPT_SUPPORT
4528  codeptr_ra,
4529 #endif
4530  task_dup);
4531  else
4532  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4533  gr_size0, ext0, last_chunk0, tc0,
4534 #if OMPT_SUPPORT
4535  codeptr_ra,
4536 #endif
4537  task_dup);
4538 
4539  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4540 }
4541 
4542 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4543  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4544  int nogroup, int sched, kmp_uint64 grainsize,
4545  int modifier, void *task_dup) {
4546  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4547  KMP_DEBUG_ASSERT(task != NULL);
4548  if (nogroup == 0) {
4549 #if OMPT_SUPPORT && OMPT_OPTIONAL
4550  OMPT_STORE_RETURN_ADDRESS(gtid);
4551 #endif
4552  __kmpc_taskgroup(loc, gtid);
4553  }
4554 
4555  // =========================================================================
4556  // calculate loop parameters
4557  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4558  kmp_uint64 tc;
4559  // compiler provides global bounds here
4560  kmp_uint64 lower = task_bounds.get_lb();
4561  kmp_uint64 upper = task_bounds.get_ub();
4562  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4563  kmp_uint64 num_tasks = 0, extras = 0;
4564  kmp_int64 last_chunk =
4565  0; // reduce grainsize of last task by last_chunk in strict mode
4566  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4567  kmp_info_t *thread = __kmp_threads[gtid];
4568  kmp_taskdata_t *current_task = thread->th.th_current_task;
4569 
4570  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4571  "grain %llu(%d, %d), dup %p\n",
4572  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
4573  task_dup));
4574 
4575  // compute trip count
4576  if (st == 1) { // most common case
4577  tc = upper - lower + 1;
4578  } else if (st < 0) {
4579  tc = (lower - upper) / (-st) + 1;
4580  } else { // st > 0
4581  tc = (upper - lower) / st + 1;
4582  }
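  // Editor's illustration: lb = 0, ub = 9, st = 3 iterates 0,3,6,9, so
  // tc = (9 - 0) / 3 + 1 = 4; the mirrored case lb = 9, ub = 0, st = -3
  // yields the same tc = (9 - 0) / 3 + 1 = 4 via the negative-stride branch.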
4583  if (tc == 0) {
4584  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
4585  // free the pattern task and exit
4586  __kmp_task_start(gtid, task, current_task);
4587  // do not execute anything for zero-trip loop
4588  __kmp_task_finish<false>(gtid, task, current_task);
4589  return;
4590  }
4591 
4592 #if OMPT_SUPPORT && OMPT_OPTIONAL
4593  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4594  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4595  if (ompt_enabled.ompt_callback_work) {
4596  ompt_callbacks.ompt_callback(ompt_callback_work)(
4597  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4598  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4599  }
4600 #endif
4601 
4602  if (num_tasks_min == 0)
4603  // TODO: can we choose a better default heuristic?
4604  num_tasks_min =
4605  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4606 
4607  // compute num_tasks/grainsize based on the input provided
4608  switch (sched) {
4609  case 0: // no schedule clause specified, we can choose the default
4610  // let's try to schedule (team_size*10) tasks
4611  grainsize = thread->th.th_team_nproc * 10;
4612  KMP_FALLTHROUGH();
4613  case 2: // num_tasks provided
4614  if (grainsize > tc) {
4615  num_tasks = tc; // too big num_tasks requested, adjust values
4616  grainsize = 1;
4617  extras = 0;
4618  } else {
4619  num_tasks = grainsize;
4620  grainsize = tc / num_tasks;
4621  extras = tc % num_tasks;
4622  }
4623  break;
4624  case 1: // grainsize provided
4625  if (grainsize > tc) {
4626  num_tasks = 1;
4627  grainsize = tc; // too big grainsize requested, adjust values
4628  extras = 0;
4629  } else {
4630  if (modifier) {
4631  num_tasks = (tc + grainsize - 1) / grainsize;
4632  last_chunk = tc - (num_tasks * grainsize);
4633  extras = 0;
4634  } else {
4635  num_tasks = tc / grainsize;
4636  // adjust grainsize for balanced distribution of iterations
4637  grainsize = tc / num_tasks;
4638  extras = tc % num_tasks;
4639  }
4640  }
4641  break;
4642  default:
4643  KMP_ASSERT2(0, "unknown scheduling of taskloop");
4644  }
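  // Editor's illustration: for tc = 100 with grainsize(7) and no "strict"
  // modifier (sched = 1, modifier = 0): num_tasks = 100 / 7 = 14, then
  // grainsize = 100 / 14 = 7 and extras = 100 % 14 = 2, i.e. two tasks run
  // 8 iterations and twelve run 7. With the strict modifier: num_tasks =
  // ceil(100 / 7) = 15 and last_chunk = 100 - 15 * 7 = -5, so every task
  // runs exactly 7 iterations except the last, which runs 7 + (-5) = 2.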
4645 
4646  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4647  (last_chunk < 0 ? last_chunk : extras));
4648  KMP_DEBUG_ASSERT(num_tasks > extras);
4649  KMP_DEBUG_ASSERT(num_tasks > 0);
4650  // =========================================================================
4651 
4652  // check the if clause value first
4653  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
4654  if (if_val == 0) { // if(0) specified, mark task as serial
4655  taskdata->td_flags.task_serial = 1;
4656  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
4657  // always start serial tasks linearly
4658  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4659  grainsize, extras, last_chunk, tc,
4660 #if OMPT_SUPPORT
4661  OMPT_GET_RETURN_ADDRESS(0),
4662 #endif
4663  task_dup);
4664  // !taskdata->td_flags.native => currently force linear spawning of tasks
4665  // for GOMP_taskloop
4666  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4667  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4668  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
4669  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
4670  last_chunk));
4671  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4672  grainsize, extras, last_chunk, tc, num_tasks_min,
4673 #if OMPT_SUPPORT
4674  OMPT_GET_RETURN_ADDRESS(0),
4675 #endif
4676  task_dup);
4677  } else {
4678  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
4679  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
4680  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
4681  last_chunk));
4682  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4683  grainsize, extras, last_chunk, tc,
4684 #if OMPT_SUPPORT
4685  OMPT_GET_RETURN_ADDRESS(0),
4686 #endif
4687  task_dup);
4688  }
4689 
4690 #if OMPT_SUPPORT && OMPT_OPTIONAL
4691  if (ompt_enabled.ompt_callback_work) {
4692  ompt_callbacks.ompt_callback(ompt_callback_work)(
4693  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
4694  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4695  }
4696 #endif
4697 
4698  if (nogroup == 0) {
4699 #if OMPT_SUPPORT && OMPT_OPTIONAL
4700  OMPT_STORE_RETURN_ADDRESS(gtid);
4701 #endif
4702  __kmpc_end_taskgroup(loc, gtid);
4703  }
4704  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
4705 }
4706 
4723 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4724  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4725  int sched, kmp_uint64 grainsize, void *task_dup) {
4726  __kmp_assert_valid_gtid(gtid);
4727  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
4728  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
4729  0, task_dup);
4730  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
4731 }
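// Editor's sketch of how this entry point is typically driven (illustrative
// only; sizeof_task_block, sizeof_shareds, task_entry, plb, pub and task_dup
// are placeholders, and the exact placement of the loop bounds inside the
// pattern task is compiler-defined and abstracted by kmp_taskloop_bounds_t):
#if 0
  // flags = 1 requests a tied task; sizes and the task entry come from the
  // compiler-generated outlined code.
  kmp_task_t *ptask = __kmpc_omp_task_alloc(loc, gtid, 1, sizeof_task_block,
                                            sizeof_shareds, task_entry);
  // plb/pub point at the lower/upper bounds stored inside ptask.
  __kmpc_taskloop(loc, gtid, ptask, /*if_val=*/1, plb, pub, /*st=*/1,
                  /*nogroup=*/0, /*sched=*/1, /*grainsize=*/7, task_dup);
#endif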
4732 
4750 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4751  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4752  int nogroup, int sched, kmp_uint64 grainsize,
4753  int modifier, void *task_dup) {
4754  __kmp_assert_valid_gtid(gtid);
4755  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
4756  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
4757  modifier, task_dup);
4758  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
4759 }