LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 /* Dynamic scheduling initialization and dispatch.
15  *
16  * NOTE: __kmp_nth is a constant inside any dispatch loop, but it may
17  * change value between parallel regions. __kmp_max_nth is the largest
18  * value __kmp_nth may take, 1 is the smallest.
19  */
20 
21 // Need to raise Win version from XP to Vista here for support of
22 // InterlockedExchange64
23 #if defined(_WIN32_WINNT) && defined(_M_IX86)
24 #undef _WIN32_WINNT
25 #define _WIN32_WINNT 0x0502
26 #endif
27 
28 #include "kmp.h"
29 #include "kmp_error.h"
30 #include "kmp_i18n.h"
31 #include "kmp_itt.h"
32 #include "kmp_stats.h"
33 #include "kmp_str.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36 #endif
37 #include "kmp_lock.h"
38 #include "kmp_dispatch.h"
39 #if KMP_USE_HIER_SCHED
40 #include "kmp_dispatch_hier.h"
41 #endif
42 
43 #if OMPT_SUPPORT
44 #include "ompt-specific.h"
45 #endif
46 
47 /* ------------------------------------------------------------------------ */
48 /* ------------------------------------------------------------------------ */
49 
50 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
51  kmp_info_t *th;
52 
53  KMP_DEBUG_ASSERT(gtid_ref);
54 
55  if (__kmp_env_consistency_check) {
56  th = __kmp_threads[*gtid_ref];
57  if (th->th.th_root->r.r_active &&
58  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
59 #if KMP_USE_DYNAMIC_LOCK
60  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
61 #else
62  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
63 #endif
64  }
65  }
66 }
67 
68 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
69  kmp_info_t *th;
70 
71  if (__kmp_env_consistency_check) {
72  th = __kmp_threads[*gtid_ref];
73  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
74  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
75  }
76  }
77 }
78 
79 // Initialize a dispatch_private_info_template<T> buffer for a particular
80 // schedule and chunk. The loop description is found in lb (lower bound),
81 // ub (upper bound), and st (stride). nproc is the number of threads relevant
82 // to the scheduling (often the number of threads in a team, but not always
83 // when hierarchical scheduling is used). tid is the id of the thread calling
84 // the function within the group of nproc threads; it has a value
85 // between 0 and nproc - 1. This is often just the thread id within a team, but
86 // that is not necessarily the case when hierarchical scheduling is used.
87 // loc is the source location of the corresponding loop.
88 // gtid is the global thread id.
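// Illustration (not part of the runtime; the exact values are an assumption
// about what a typical compiler emits): for a loop such as
//   #pragma omp for schedule(dynamic, 4)
//   for (int i = 0; i < 100; ++i) ...
// the outlined call would typically pass lb = 0, ub = 99 (inclusive, hence the
// tc = ub - lb + 1 trip-count formula below), st = 1 and chunk = 4, with nproc
// and tid describing the group of threads that executes the loop.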
89 template <typename T>
90 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
91  dispatch_private_info_template<T> *pr,
92  enum sched_type schedule, T lb, T ub,
93  typename traits_t<T>::signed_t st,
94 #if USE_ITT_BUILD
95  kmp_uint64 *cur_chunk,
96 #endif
97  typename traits_t<T>::signed_t chunk,
98  T nproc, T tid) {
99  typedef typename traits_t<T>::unsigned_t UT;
100  typedef typename traits_t<T>::floating_t DBL;
101 
102  int active;
103  T tc;
104  kmp_info_t *th;
105  kmp_team_t *team;
106 
107 #ifdef KMP_DEBUG
108  typedef typename traits_t<T>::signed_t ST;
109  {
110  char *buff;
111  // create format specifiers before the debug output
112  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
113  "pr:%%p lb:%%%s ub:%%%s st:%%%s "
114  "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
115  traits_t<T>::spec, traits_t<T>::spec,
116  traits_t<ST>::spec, traits_t<ST>::spec,
117  traits_t<T>::spec, traits_t<T>::spec);
118  KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
119  __kmp_str_free(&buff);
120  }
121 #endif
122  /* setup data */
123  th = __kmp_threads[gtid];
124  team = th->th.th_team;
125  active = !team->t.t_serialized;
126 
127 #if USE_ITT_BUILD
128  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
129  __kmp_forkjoin_frames_mode == 3 &&
130  KMP_MASTER_GTID(gtid) &&
131 #if OMP_40_ENABLED
132  th->th.th_teams_microtask == NULL &&
133 #endif
134  team->t.t_active_level == 1;
135 #endif
136 #if (KMP_STATIC_STEAL_ENABLED)
137  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
138  // AC: we now have only one implementation of stealing, so use it
139  schedule = kmp_sch_static_steal;
140  else
141 #endif
142  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
143 
144  /* Pick up the nomerge/ordered bits from the scheduling type */
145  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
146  pr->flags.nomerge = TRUE;
147  schedule =
148  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
149  } else {
150  pr->flags.nomerge = FALSE;
151  }
152  pr->type_size = traits_t<T>::type_size; // remember the size of variables
153  if (kmp_ord_lower & schedule) {
154  pr->flags.ordered = TRUE;
155  schedule =
156  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
157  } else {
158  pr->flags.ordered = FALSE;
159  }
160 
161  if (schedule == kmp_sch_static) {
162  schedule = __kmp_static;
163  } else {
164  if (schedule == kmp_sch_runtime) {
165  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
166  // not specified)
167  schedule = team->t.t_sched.r_sched_type;
168  // Detail the schedule if needed (global controls are differentiated
169  // appropriately)
170  if (schedule == kmp_sch_guided_chunked) {
171  schedule = __kmp_guided;
172  } else if (schedule == kmp_sch_static) {
173  schedule = __kmp_static;
174  }
175  // Use the chunk size specified by OMP_SCHEDULE (or default if not
176  // specified)
177  chunk = team->t.t_sched.chunk;
178 #if USE_ITT_BUILD
179  if (cur_chunk)
180  *cur_chunk = chunk;
181 #endif
182 #ifdef KMP_DEBUG
183  {
184  char *buff;
185  // create format specifiers before the debug output
186  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
187  "schedule:%%d chunk:%%%s\n",
188  traits_t<ST>::spec);
189  KD_TRACE(10, (buff, gtid, schedule, chunk));
190  __kmp_str_free(&buff);
191  }
192 #endif
193  } else {
194  if (schedule == kmp_sch_guided_chunked) {
195  schedule = __kmp_guided;
196  }
197  if (chunk <= 0) {
198  chunk = KMP_DEFAULT_CHUNK;
199  }
200  }
201 
202  if (schedule == kmp_sch_auto) {
203  // mapping and differentiation: in the __kmp_do_serial_initialize()
204  schedule = __kmp_auto;
205 #ifdef KMP_DEBUG
206  {
207  char *buff;
208  // create format specifiers before the debug output
209  buff = __kmp_str_format(
210  "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
211  "schedule:%%d chunk:%%%s\n",
212  traits_t<ST>::spec);
213  KD_TRACE(10, (buff, gtid, schedule, chunk));
214  __kmp_str_free(&buff);
215  }
216 #endif
217  }
218 
219  /* guided analytical not safe for too many threads */
220  if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
221  schedule = kmp_sch_guided_iterative_chunked;
222  KMP_WARNING(DispatchManyThreads);
223  }
224 #if OMP_45_ENABLED
225  if (schedule == kmp_sch_runtime_simd) {
226  // compiler provides simd_width in the chunk parameter
227  schedule = team->t.t_sched.r_sched_type;
228  // Detail the schedule if needed (global controls are differentiated
229  // appropriately)
230  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
231  schedule == __kmp_static) {
232  schedule = kmp_sch_static_balanced_chunked;
233  } else {
234  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
235  schedule = kmp_sch_guided_simd;
236  }
237  chunk = team->t.t_sched.chunk * chunk;
238  }
239 #if USE_ITT_BUILD
240  if (cur_chunk)
241  *cur_chunk = chunk;
242 #endif
243 #ifdef KMP_DEBUG
244  {
245  char *buff;
246  // create format specifiers before the debug output
247  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
248  " chunk:%%%s\n",
249  traits_t<ST>::spec);
250  KD_TRACE(10, (buff, gtid, schedule, chunk));
251  __kmp_str_free(&buff);
252  }
253 #endif
254  }
255 #endif // OMP_45_ENABLED
256  pr->u.p.parm1 = chunk;
257  }
258  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
259  "unknown scheduling type");
260 
261  pr->u.p.count = 0;
262 
263  if (__kmp_env_consistency_check) {
264  if (st == 0) {
265  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
266  (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
267  }
268  }
269  // compute trip count
270  if (st == 1) { // most common case
271  if (ub >= lb) {
272  tc = ub - lb + 1;
273  } else { // ub < lb
274  tc = 0; // zero-trip
275  }
276  } else if (st < 0) {
277  if (lb >= ub) {
278  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
279  // where the division needs to be unsigned regardless of the result type
280  tc = (UT)(lb - ub) / (-st) + 1;
281  } else { // lb < ub
282  tc = 0; // zero-trip
283  }
284  } else { // st > 0
285  if (ub >= lb) {
286  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
287  // where the division needs to be unsigned regardless of the result type
288  tc = (UT)(ub - lb) / st + 1;
289  } else { // ub < lb
290  tc = 0; // zero-trip
291  }
292  }
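 // Worked examples of the trip-count formulas above (illustrative only):
 //   lb = 0,  ub = 9, st = 1  ->  tc = 9 - 0 + 1        = 10
 //   lb = 0,  ub = 9, st = 3  ->  tc = (9 - 0) / 3 + 1  = 4   (i = 0, 3, 6, 9)
 //   lb = 10, ub = 1, st = -2 ->  tc = (10 - 1) / 2 + 1 = 5   (i = 10, 8, 6, 4, 2)
 // The unsigned cast matters only when the span ub - lb overflows the signed type.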
293 
294  pr->u.p.lb = lb;
295  pr->u.p.ub = ub;
296  pr->u.p.st = st;
297  pr->u.p.tc = tc;
298 
299 #if KMP_OS_WINDOWS
300  pr->u.p.last_upper = ub + st;
301 #endif /* KMP_OS_WINDOWS */
302 
303  /* NOTE: only the active parallel region(s) have active ordered sections */
304 
305  if (active) {
306  if (pr->flags.ordered) {
307  pr->ordered_bumped = 0;
308  pr->u.p.ordered_lower = 1;
309  pr->u.p.ordered_upper = 0;
310  }
311  }
312 
313  switch (schedule) {
314 #if (KMP_STATIC_STEAL_ENABLED)
315  case kmp_sch_static_steal: {
316  T ntc, init;
317 
318  KD_TRACE(100,
319  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
320  gtid));
321 
322  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
323  if (nproc > 1 && ntc >= nproc) {
324  KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
325  T id = tid;
326  T small_chunk, extras;
327 
328  small_chunk = ntc / nproc;
329  extras = ntc % nproc;
330 
331  init = id * small_chunk + (id < extras ? id : extras);
332  pr->u.p.count = init;
333  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
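 // Illustration of the initial chunk ownership above (not part of the runtime):
 // with tc = 100 and chunk = 7 there are ntc = 15 chunks; for nproc = 4 this
 // gives small_chunk = 3 and extras = 3, so threads 0..3 initially own 4, 4, 4
 // and 3 chunks respectively. E.g. tid 1 gets count = 4 and ub = 8, i.e. chunk
 // indices 4..7, and may later steal more from a neighbour.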
334 
335  pr->u.p.parm2 = lb;
336  // pr->pfields.parm3 = 0; // it's not used in static_steal
337  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
338  pr->u.p.st = st;
339  if (traits_t<T>::type_size > 4) {
340  // AC: TODO: check if 16-byte CAS available and use it to
341  // improve performance (probably wait for explicit request
342  // before spending time on this).
343  // For now use dynamically allocated per-thread lock,
344  // free memory in __kmp_dispatch_next when status==0.
345  KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
346  th->th.th_dispatch->th_steal_lock =
347  (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
348  __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
349  }
350  break;
351  } else {
352  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
353  "kmp_sch_static_balanced\n",
354  gtid));
355  schedule = kmp_sch_static_balanced;
356  /* too few iterations: fall-through to kmp_sch_static_balanced */
357  } // if
358  /* FALL-THROUGH to static balanced */
359  } // case
360 #endif
361  case kmp_sch_static_balanced: {
362  T init, limit;
363 
364  KD_TRACE(
365  100,
366  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
367  gtid));
368 
369  if (nproc > 1) {
370  T id = tid;
371 
372  if (tc < nproc) {
373  if (id < tc) {
374  init = id;
375  limit = id;
376  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
377  } else {
378  pr->u.p.count = 1; /* means no more chunks to execute */
379  pr->u.p.parm1 = FALSE;
380  break;
381  }
382  } else {
383  T small_chunk = tc / nproc;
384  T extras = tc % nproc;
385  init = id * small_chunk + (id < extras ? id : extras);
386  limit = init + small_chunk - (id < extras ? 0 : 1);
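 // E.g. (illustration only) tc = 10, nproc = 4: small_chunk = 2, extras = 2, so
 // threads 0..3 get 3, 3, 2 and 2 iterations; tid 2 gets init = 6, limit = 7.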
387  pr->u.p.parm1 = (id == nproc - 1);
388  }
389  } else {
390  if (tc > 0) {
391  init = 0;
392  limit = tc - 1;
393  pr->u.p.parm1 = TRUE;
394  } else {
395  // zero trip count
396  pr->u.p.count = 1; /* means no more chunks to execute */
397  pr->u.p.parm1 = FALSE;
398  break;
399  }
400  }
401 #if USE_ITT_BUILD
402  // Calculate chunk for metadata report
403  if (itt_need_metadata_reporting)
404  if (cur_chunk)
405  *cur_chunk = limit - init + 1;
406 #endif
407  if (st == 1) {
408  pr->u.p.lb = lb + init;
409  pr->u.p.ub = lb + limit;
410  } else {
411  // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
412  T ub_tmp = lb + limit * st;
413  pr->u.p.lb = lb + init * st;
414  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
415  // it exactly
416  if (st > 0) {
417  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
418  } else {
419  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
420  }
421  }
422  if (pr->flags.ordered) {
423  pr->u.p.ordered_lower = init;
424  pr->u.p.ordered_upper = limit;
425  }
426  break;
427  } // case
428 #if OMP_45_ENABLED
429  case kmp_sch_static_balanced_chunked: {
430  // similar to balanced, but chunk adjusted to multiple of simd width
431  T nth = nproc;
432  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
433  " -> falling-through to static_greedy\n",
434  gtid));
435  schedule = kmp_sch_static_greedy;
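 // The rounding below assumes chunk (the simd width supplied by the compiler)
 // is a power of two. Illustration: tc = 1000, nth = 8, chunk = 8 gives
 // ceil(1000 / 8) = 125, which is then rounded up to 128, so every thread is
 // handed a block whose size is a multiple of the simd width.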
436  if (nth > 1)
437  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
438  else
439  pr->u.p.parm1 = tc;
440  break;
441  } // case
442  case kmp_sch_guided_simd:
443 #endif // OMP_45_ENABLED
444  case kmp_sch_guided_iterative_chunked: {
445  KD_TRACE(
446  100,
447  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
448  " case\n",
449  gtid));
450 
451  if (nproc > 1) {
452  if ((2L * chunk + 1) * nproc >= tc) {
453  /* chunk size too large, switch to dynamic */
454  schedule = kmp_sch_dynamic_chunked;
455  } else {
456  // when remaining iters become less than parm2 - switch to dynamic
457  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
458  *(double *)&pr->u.p.parm3 =
459  guided_flt_param / nproc; // may occupy parm3 and parm4
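 // Illustration, assuming the defaults implied by the dispatch code below
 // (K = guided_int_param = 2 and guided_flt_param = 0.5): with nproc = 4 and
 // chunk = 8, parm2 = 2 * 4 * 9 = 72 and parm3 = 0.5 / 4 = 0.125, so each grab
 // takes roughly 1/8 of the remaining iterations until fewer than 72 remain,
 // at which point dispatch falls back to plain chunks of 8.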
460  }
461  } else {
462  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
463  "kmp_sch_static_greedy\n",
464  gtid));
465  schedule = kmp_sch_static_greedy;
466  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
467  KD_TRACE(
468  100,
469  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
470  gtid));
471  pr->u.p.parm1 = tc;
472  } // if
473  } // case
474  break;
475  case kmp_sch_guided_analytical_chunked: {
476  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
477  "kmp_sch_guided_analytical_chunked case\n",
478  gtid));
479 
480  if (nproc > 1) {
481  if ((2L * chunk + 1) * nproc >= tc) {
482  /* chunk size too large, switch to dynamic */
483  schedule = kmp_sch_dynamic_chunked;
484  } else {
485  /* commonly used term: (2 nproc - 1)/(2 nproc) */
486  DBL x;
487 
488 #if KMP_OS_WINDOWS && KMP_ARCH_X86
489  /* Linux* OS already has 64-bit computation by default for long double,
490  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
491  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
492  instead of the default 53-bit. Even though long double doesn't work
493  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
494  expected to impact the correctness of the algorithm, but this has not
495  been mathematically proven. */
496  // save original FPCW and set precision to 64-bit, as
497  // Windows* OS on IA-32 architecture defaults to 53-bit
498  unsigned int oldFpcw = _control87(0, 0);
499  _control87(_PC_64, _MCW_PC); // 0,0x30000
500 #endif
501  /* value used for comparison in solver for cross-over point */
502  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
503 
504  /* crossover point--chunk indexes equal to or greater than
505  this point switch to dynamic-style scheduling */
506  UT cross;
507 
508  /* commonly used term: (2 nproc - 1)/(2 nproc) */
509  x = (long double)1.0 - (long double)0.5 / nproc;
510 
511 #ifdef KMP_DEBUG
512  { // test natural alignment
513  struct _test_a {
514  char a;
515  union {
516  char b;
517  DBL d;
518  };
519  } t;
520  ptrdiff_t natural_alignment =
521  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
522  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
523  // long)natural_alignment );
524  KMP_DEBUG_ASSERT(
525  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
526  }
527 #endif // KMP_DEBUG
528 
529  /* save the term in thread private dispatch structure */
530  *(DBL *)&pr->u.p.parm3 = x;
531 
532  /* solve for the crossover point to the nearest integer i for which C_i
533  <= chunk */
534  {
535  UT left, right, mid;
536  long double p;
537 
538  /* estimate initial upper and lower bound */
539 
540  /* doesn't matter what value right is as long as it is positive, but
541  it affects performance of the solver */
542  right = 229;
543  p = __kmp_pow<UT>(x, right);
544  if (p > target) {
545  do {
546  p *= p;
547  right <<= 1;
548  } while (p > target && right < (1 << 27));
549  /* lower bound is previous (failed) estimate of upper bound */
550  left = right >> 1;
551  } else {
552  left = 0;
553  }
554 
555  /* bisection root-finding method */
556  while (left + 1 < right) {
557  mid = (left + right) / 2;
558  if (__kmp_pow<UT>(x, mid) > target) {
559  left = mid;
560  } else {
561  right = mid;
562  }
563  } // while
564  cross = right;
565  }
566  /* assert sanity of computed crossover point */
567  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
568  __kmp_pow<UT>(x, cross) <= target);
569 
570  /* save the crossover point in thread private dispatch structure */
571  pr->u.p.parm2 = cross;
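 // In other words, 'cross' is the smallest chunk index i with x^i <= target,
 // i.e. approximately the point at which the analytically shrinking guided
 // chunks reach the user-specified chunk size; from that index on,
 // __kmp_dispatch_next_algorithm switches to plain dynamic chunks of 'chunk'.
 // Rough illustration (not normative): nproc = 4, chunk = 1, tc = 1000 gives
 // x = 0.875 and target = 3 * 4 / 1000 = 0.012, so cross comes out around 34.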
572 
573 // C75803
574 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
575 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
576 #else
577 #define GUIDED_ANALYTICAL_WORKAROUND (x)
578 #endif
579  /* dynamic-style scheduling offset */
580  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
581  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
582  cross * chunk;
583 #if KMP_OS_WINDOWS && KMP_ARCH_X86
584  // restore FPCW
585  _control87(oldFpcw, _MCW_PC);
586 #endif
587  } // if
588  } else {
589  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
590  "kmp_sch_static_greedy\n",
591  gtid));
592  schedule = kmp_sch_static_greedy;
593  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
594  pr->u.p.parm1 = tc;
595  } // if
596  } // case
597  break;
598  case kmp_sch_static_greedy:
599  KD_TRACE(
600  100,
601  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
602  gtid));
603  pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
604  break;
605  case kmp_sch_static_chunked:
606  case kmp_sch_dynamic_chunked:
607  if (pr->u.p.parm1 <= 0) {
608  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
609  }
610  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
611  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
612  gtid));
613  break;
614  case kmp_sch_trapezoidal: {
615  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
616 
617  T parm1, parm2, parm3, parm4;
618  KD_TRACE(100,
619  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
620  gtid));
621 
622  parm1 = chunk;
623 
624  /* F : size of the first cycle */
625  parm2 = (tc / (2 * nproc));
626 
627  if (parm2 < 1) {
628  parm2 = 1;
629  }
630 
631  /* L : size of the last cycle. Make sure the last cycle is not larger
632  than the first cycle. */
633  if (parm1 < 1) {
634  parm1 = 1;
635  } else if (parm1 > parm2) {
636  parm1 = parm2;
637  }
638 
639  /* N : number of cycles */
640  parm3 = (parm2 + parm1);
641  parm3 = (2 * tc + parm3 - 1) / parm3;
642 
643  if (parm3 < 2) {
644  parm3 = 2;
645  }
646 
647  /* sigma : decreasing incr of the trapezoid */
648  parm4 = (parm3 - 1);
649  parm4 = (parm2 - parm1) / parm4;
650 
651  // pointless check, because parm4 >= 0 always
652  // if ( parm4 < 0 ) {
653  // parm4 = 0;
654  //}
655 
656  pr->u.p.parm1 = parm1;
657  pr->u.p.parm2 = parm2;
658  pr->u.p.parm3 = parm3;
659  pr->u.p.parm4 = parm4;
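 // Worked example (illustrative only): tc = 1000, nproc = 4, chunk = 1 gives
 // parm2 = 125 (first chunk), parm1 = 1 (minimum/last chunk), parm3 = 16
 // (number of chunks) and parm4 = 8 (per-chunk decrement), so successive
 // chunks have sizes 125, 117, 109, ... until the trip count is exhausted.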
660  } // case
661  break;
662 
663  default: {
664  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
665  KMP_HNT(GetNewerLibrary), // Hint
666  __kmp_msg_null // Variadic argument list terminator
667  );
668  } break;
669  } // switch
670  pr->schedule = schedule;
671 }
672 
673 #if KMP_USE_HIER_SCHED
674 template <typename T>
675 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
676  typename traits_t<T>::signed_t st);
677 template <>
678 inline void
679 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
680  kmp_int32 ub, kmp_int32 st) {
681  __kmp_dispatch_init_hierarchy<kmp_int32>(
682  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
683  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
684 }
685 template <>
686 inline void
687 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
688  kmp_uint32 ub, kmp_int32 st) {
689  __kmp_dispatch_init_hierarchy<kmp_uint32>(
690  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
691  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
692 }
693 template <>
694 inline void
695 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
696  kmp_int64 ub, kmp_int64 st) {
697  __kmp_dispatch_init_hierarchy<kmp_int64>(
698  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
699  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
700 }
701 template <>
702 inline void
703 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
704  kmp_uint64 ub, kmp_int64 st) {
705  __kmp_dispatch_init_hierarchy<kmp_uint64>(
706  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
707  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
708 }
709 
710 // free all the hierarchy scheduling memory associated with the team
711 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
712  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
713  for (int i = 0; i < num_disp_buff; ++i) {
714  // type does not matter here so use kmp_int32
715  auto sh =
716  reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
717  &team->t.t_disp_buffer[i]);
718  if (sh->hier) {
719  sh->hier->deallocate();
720  __kmp_free(sh->hier);
721  }
722  }
723 }
724 #endif
725 
726 // UT - unsigned flavor of T, ST - signed flavor of T,
727 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
728 template <typename T>
729 static void
730 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
731  T ub, typename traits_t<T>::signed_t st,
732  typename traits_t<T>::signed_t chunk, int push_ws) {
733  typedef typename traits_t<T>::unsigned_t UT;
734 
735  int active;
736  kmp_info_t *th;
737  kmp_team_t *team;
738  kmp_uint32 my_buffer_index;
739  dispatch_private_info_template<T> *pr;
740  dispatch_shared_info_template<T> volatile *sh;
741 
742  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
743  sizeof(dispatch_private_info));
744  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
745  sizeof(dispatch_shared_info));
746 
747  if (!TCR_4(__kmp_init_parallel))
748  __kmp_parallel_initialize();
749 
750 #if INCLUDE_SSC_MARKS
751  SSC_MARK_DISPATCH_INIT();
752 #endif
753 #ifdef KMP_DEBUG
754  typedef typename traits_t<T>::signed_t ST;
755  {
756  char *buff;
757  // create format specifiers before the debug output
758  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
759  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
760  traits_t<ST>::spec, traits_t<T>::spec,
761  traits_t<T>::spec, traits_t<ST>::spec);
762  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
763  __kmp_str_free(&buff);
764  }
765 #endif
766  /* setup data */
767  th = __kmp_threads[gtid];
768  team = th->th.th_team;
769  active = !team->t.t_serialized;
770  th->th.th_ident = loc;
771 
772  // Any half-decent optimizer will remove this test when the blocks are empty
773  // since the macros expand to nothing
774  // when statistics are disabled.
775  if (schedule == __kmp_static) {
776  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
777  } else {
778  KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
779  }
780 
781 #if KMP_USE_HIER_SCHED
782  // Initialize the scheduling hierarchy if requested via the OMP_SCHEDULE
783  // environment variable. Hierarchical scheduling does not work with ordered,
784  // so if ordered is detected, revert to the standard threaded scheduling.
785  bool ordered;
786  enum sched_type my_sched = schedule;
787  my_buffer_index = th->th.th_dispatch->th_disp_index;
788  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
789  &th->th.th_dispatch
790  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
791  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
792  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
793  my_sched =
794  (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
795  ordered = (kmp_ord_lower & my_sched);
796  if (pr->flags.use_hier) {
797  if (ordered) {
798  KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
799  "Disabling hierarchical scheduling.\n",
800  gtid));
801  pr->flags.use_hier = FALSE;
802  }
803  }
804  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
805  // Don't use hierarchical for ordered parallel loops and don't
806  // use the runtime hierarchy if one was specified in the program
807  if (!ordered && !pr->flags.use_hier)
808  __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
809  }
810 #endif // KMP_USE_HIER_SCHED
811 
812 #if USE_ITT_BUILD
813  kmp_uint64 cur_chunk = chunk;
814  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
815  __kmp_forkjoin_frames_mode == 3 &&
816  KMP_MASTER_GTID(gtid) &&
817 #if OMP_40_ENABLED
818  th->th.th_teams_microtask == NULL &&
819 #endif
820  team->t.t_active_level == 1;
821 #endif
822  if (!active) {
823  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
824  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
825  } else {
826  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
827  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
828 
829  my_buffer_index = th->th.th_dispatch->th_disp_index++;
830 
831  /* What happens when number of threads changes, need to resize buffer? */
832  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
833  &th->th.th_dispatch
834  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
835  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
836  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
837  KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
838  my_buffer_index));
839  }
840 
841  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
842 #if USE_ITT_BUILD
843  &cur_chunk,
844 #endif
845  chunk, (T)th->th.th_team_nproc,
846  (T)th->th.th_info.ds.ds_tid);
847  if (active) {
848  if (pr->flags.ordered == 0) {
849  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
850  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
851  } else {
852  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
853  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
854  }
855  }
856 
857  if (active) {
858  /* This buffer becomes free to use once sh->buffer_index reaches
859  * my_buffer_index */
860 
861  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
862  "sh->buffer_index:%d\n",
863  gtid, my_buffer_index, sh->buffer_index));
864  __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
865  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
866  // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
867  // my_buffer_index are *always* 32-bit integers.
868  KMP_MB(); /* is this necessary? */
869  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
870  "sh->buffer_index:%d\n",
871  gtid, my_buffer_index, sh->buffer_index));
872 
873  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
874  th->th.th_dispatch->th_dispatch_sh_current =
875  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
876 #if USE_ITT_BUILD
877  if (pr->flags.ordered) {
878  __kmp_itt_ordered_init(gtid);
879  }
880  // Report loop metadata
881  if (itt_need_metadata_reporting) {
882  // Only report metadata by master of active team at level 1
883  kmp_uint64 schedtype = 0;
884  switch (schedule) {
885  case kmp_sch_static_chunked:
886  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
887  break;
888  case kmp_sch_static_greedy:
889  cur_chunk = pr->u.p.parm1;
890  break;
891  case kmp_sch_dynamic_chunked:
892  schedtype = 1;
893  break;
894  case kmp_sch_guided_iterative_chunked:
895  case kmp_sch_guided_analytical_chunked:
896 #if OMP_45_ENABLED
897  case kmp_sch_guided_simd:
898 #endif
899  schedtype = 2;
900  break;
901  default:
902  // Should we put this case under "static"?
903  // case kmp_sch_static_steal:
904  schedtype = 3;
905  break;
906  }
907  __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
908  }
909 #if KMP_USE_HIER_SCHED
910  if (pr->flags.use_hier) {
911  pr->u.p.count = 0;
912  pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
913  }
914 #endif // KMP_USE_HIER_SCHED
915 #endif /* USE_ITT_BUILD */
916  }
917 
918 #ifdef KMP_DEBUG
919  {
920  char *buff;
921  // create format specifiers before the debug output
922  buff = __kmp_str_format(
923  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
924  "lb:%%%s ub:%%%s"
925  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
926  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
927  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
928  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
929  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
930  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
931  KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
932  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
933  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
934  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
935  __kmp_str_free(&buff);
936  }
937 #endif
938 #if (KMP_STATIC_STEAL_ENABLED)
939  // After a loop with some other schedule kind has run, there is no guarantee
940  // that every thread's parm3 holds the same value. Even if it did, reusing a
941  // small set of values such as 0 and 1, rather than a counter incremented over
942  // the program's lifetime, would still be unsafe. A dedicated variable is
943  // therefore required; 'static_steal_counter' serves that purpose.
944  if (schedule == kmp_sch_static_steal) {
945  // Other threads will inspect this variable when searching for a victim.
946  // It acts as a flag signaling that, from this point on, other threads may
947  // steal from this thread.
948  volatile T *p = &pr->u.p.static_steal_counter;
949  *p = *p + 1;
950  }
951 #endif // ( KMP_STATIC_STEAL_ENABLED )
952 
953 #if OMPT_SUPPORT && OMPT_OPTIONAL
954  if (ompt_enabled.ompt_callback_work) {
955  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
956  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
957  ompt_callbacks.ompt_callback(ompt_callback_work)(
958  ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
959  &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
960  }
961 #endif
962  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
963 }
964 
965 /* For ordered loops, either __kmp_dispatch_finish() should be called after
966  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
967  * every chunk of iterations. If the ordered section(s) were not executed
968  * for this iteration (or every iteration in this chunk), we need to set the
969  * ordered iteration counters so that the next thread can proceed. */
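/* Illustration (not normative): if a thread's chunk covers ordered iterations
 * 5..7 and none of them entered an ordered region, __kmp_dispatch_finish_chunk
 * waits until sh->u.s.ordered_iteration reaches 5 and then adds 3, so the
 * thread owning iteration 8 can proceed. */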
970 template <typename UT>
971 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
972  typedef typename traits_t<UT>::signed_t ST;
973  kmp_info_t *th = __kmp_threads[gtid];
974 
975  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
976  if (!th->th.th_team->t.t_serialized) {
977 
978  dispatch_private_info_template<UT> *pr =
979  reinterpret_cast<dispatch_private_info_template<UT> *>(
980  th->th.th_dispatch->th_dispatch_pr_current);
981  dispatch_shared_info_template<UT> volatile *sh =
982  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
983  th->th.th_dispatch->th_dispatch_sh_current);
984  KMP_DEBUG_ASSERT(pr);
985  KMP_DEBUG_ASSERT(sh);
986  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
987  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
988 
989  if (pr->ordered_bumped) {
990  KD_TRACE(
991  1000,
992  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
993  gtid));
994  pr->ordered_bumped = 0;
995  } else {
996  UT lower = pr->u.p.ordered_lower;
997 
998 #ifdef KMP_DEBUG
999  {
1000  char *buff;
1001  // create format specifiers before the debug output
1002  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1003  "ordered_iteration:%%%s lower:%%%s\n",
1004  traits_t<UT>::spec, traits_t<UT>::spec);
1005  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1006  __kmp_str_free(&buff);
1007  }
1008 #endif
1009 
1010  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1011  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1012  KMP_MB(); /* is this necessary? */
1013 #ifdef KMP_DEBUG
1014  {
1015  char *buff;
1016  // create format specifiers before the debug output
1017  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1018  "ordered_iteration:%%%s lower:%%%s\n",
1019  traits_t<UT>::spec, traits_t<UT>::spec);
1020  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1021  __kmp_str_free(&buff);
1022  }
1023 #endif
1024 
1025  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1026  } // if
1027  } // if
1028  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1029 }
1030 
1031 #ifdef KMP_GOMP_COMPAT
1032 
1033 template <typename UT>
1034 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1035  typedef typename traits_t<UT>::signed_t ST;
1036  kmp_info_t *th = __kmp_threads[gtid];
1037 
1038  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1039  if (!th->th.th_team->t.t_serialized) {
1040  // int cid;
1041  dispatch_private_info_template<UT> *pr =
1042  reinterpret_cast<dispatch_private_info_template<UT> *>(
1043  th->th.th_dispatch->th_dispatch_pr_current);
1044  dispatch_shared_info_template<UT> volatile *sh =
1045  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1046  th->th.th_dispatch->th_dispatch_sh_current);
1047  KMP_DEBUG_ASSERT(pr);
1048  KMP_DEBUG_ASSERT(sh);
1049  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1050  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1051 
1052  // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1053  UT lower = pr->u.p.ordered_lower;
1054  UT upper = pr->u.p.ordered_upper;
1055  UT inc = upper - lower + 1;
1056 
1057  if (pr->ordered_bumped == inc) {
1058  KD_TRACE(
1059  1000,
1060  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1061  gtid));
1062  pr->ordered_bumped = 0;
1063  } else {
1064  inc -= pr->ordered_bumped;
1065 
1066 #ifdef KMP_DEBUG
1067  {
1068  char *buff;
1069  // create format specifiers before the debug output
1070  buff = __kmp_str_format(
1071  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1072  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1073  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1074  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1075  __kmp_str_free(&buff);
1076  }
1077 #endif
1078 
1079  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1080  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1081 
1082  KMP_MB(); /* is this necessary? */
1083  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1084  "ordered_bumped to zero\n",
1085  gtid));
1086  pr->ordered_bumped = 0;
1088 #ifdef KMP_DEBUG
1089  {
1090  char *buff;
1091  // create format specifiers before the debug output
1092  buff = __kmp_str_format(
1093  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1094  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1095  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1096  traits_t<UT>::spec);
1097  KD_TRACE(1000,
1098  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1099  __kmp_str_free(&buff);
1100  }
1101 #endif
1102 
1103  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1104  }
1105  // }
1106  }
1107  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1108 }
1109 
1110 #endif /* KMP_GOMP_COMPAT */
1111 
1112 template <typename T>
1113 int __kmp_dispatch_next_algorithm(int gtid,
1114  dispatch_private_info_template<T> *pr,
1115  dispatch_shared_info_template<T> volatile *sh,
1116  kmp_int32 *p_last, T *p_lb, T *p_ub,
1117  typename traits_t<T>::signed_t *p_st, T nproc,
1118  T tid) {
1119  typedef typename traits_t<T>::unsigned_t UT;
1120  typedef typename traits_t<T>::signed_t ST;
1121  typedef typename traits_t<T>::floating_t DBL;
1122  int status = 0;
1123  kmp_int32 last = 0;
1124  T start;
1125  ST incr;
1126  UT limit, trip, init;
1127  kmp_info_t *th = __kmp_threads[gtid];
1128  kmp_team_t *team = th->th.th_team;
1129 
1130  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1131  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1132  KMP_DEBUG_ASSERT(pr);
1133  KMP_DEBUG_ASSERT(sh);
1134  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1135 #ifdef KMP_DEBUG
1136  {
1137  char *buff;
1138  // create format specifiers before the debug output
1139  buff =
1140  __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1141  "sh:%%p nproc:%%%s tid:%%%s\n",
1142  traits_t<T>::spec, traits_t<T>::spec);
1143  KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1144  __kmp_str_free(&buff);
1145  }
1146 #endif
1147 
1148  // zero trip count
1149  if (pr->u.p.tc == 0) {
1150  KD_TRACE(10,
1151  ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1152  "zero status:%d\n",
1153  gtid, status));
1154  return 0;
1155  }
1156 
1157  switch (pr->schedule) {
1158 #if (KMP_STATIC_STEAL_ENABLED)
1159  case kmp_sch_static_steal: {
1160  T chunk = pr->u.p.parm1;
1161 
1162  KD_TRACE(100,
1163  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1164  gtid));
1165 
1166  trip = pr->u.p.tc - 1;
1167 
1168  if (traits_t<T>::type_size > 4) {
1169  // use lock for 8-byte and CAS for 4-byte induction
1170  // variable. TODO (optional): check and use 16-byte CAS
1171  kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1172  KMP_DEBUG_ASSERT(lck != NULL);
1173  if (pr->u.p.count < (UT)pr->u.p.ub) {
1174  __kmp_acquire_lock(lck, gtid);
1175  // try to get own chunk of iterations
1176  init = (pr->u.p.count)++;
1177  status = (init < (UT)pr->u.p.ub);
1178  __kmp_release_lock(lck, gtid);
1179  } else {
1180  status = 0; // no own chunks
1181  }
1182  if (!status) { // try to steal
1183  kmp_info_t **other_threads = team->t.t_threads;
1184  int while_limit = nproc; // nproc attempts to find a victim
1185  int while_index = 0;
1186  // TODO: the algorithm for searching for a victim
1187  // should be cleaned up and measured
1188  while ((!status) && (while_limit != ++while_index)) {
1189  T remaining;
1190  T victimIdx = pr->u.p.parm4;
1191  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1192  dispatch_private_info_template<T> *victim =
1193  reinterpret_cast<dispatch_private_info_template<T> *>(
1194  other_threads[victimIdx]
1195  ->th.th_dispatch->th_dispatch_pr_current);
1196  while ((victim == NULL || victim == pr ||
1197  (*(volatile T *)&victim->u.p.static_steal_counter !=
1198  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1199  oldVictimIdx != victimIdx) {
1200  victimIdx = (victimIdx + 1) % nproc;
1201  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1202  other_threads[victimIdx]
1203  ->th.th_dispatch->th_dispatch_pr_current);
1204  }
1205  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1206  *(volatile T *)&pr->u.p.static_steal_counter)) {
1207  continue; // try once more (nproc attempts in total)
1208  // no victim is ready yet to participate in stealing
1209  // because all victims are still in kmp_init_dispatch
1210  }
1211  if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1212  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1213  continue; // not enough chunks to steal, goto next victim
1214  }
1215 
1216  lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1217  KMP_ASSERT(lck != NULL);
1218  __kmp_acquire_lock(lck, gtid);
1219  limit = victim->u.p.ub; // keep initial ub
1220  if (victim->u.p.count >= limit ||
1221  (remaining = limit - victim->u.p.count) < 2) {
1222  __kmp_release_lock(lck, gtid);
1223  pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1224  continue; // not enough chunks to steal
1225  }
1226  // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1227  // by 1
1228  if (remaining > 3) {
1229  // steal 1/4 of remaining
1230  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1231  init = (victim->u.p.ub -= (remaining >> 2));
1232  } else {
1233  // steal 1 chunk of 2 or 3 remaining
1234  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1235  init = (victim->u.p.ub -= 1);
1236  }
1237  __kmp_release_lock(lck, gtid);
1238 
1239  KMP_DEBUG_ASSERT(init + 1 <= limit);
1240  pr->u.p.parm4 = victimIdx; // remember victim to steal from
1241  status = 1;
1242  while_index = 0;
1243  // now update own count and ub with the stolen range, minus the init chunk handed out by this call
1244  __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1245  pr->u.p.count = init + 1;
1246  pr->u.p.ub = limit;
1247  __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1248  } // while (search for victim)
1249  } // if (try to find victim and steal)
1250  } else {
1251  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1252  typedef union {
1253  struct {
1254  UT count;
1255  T ub;
1256  } p;
1257  kmp_int64 b;
1258  } union_i4;
1259  // All operations on 'count' or 'ub' must be combined atomically
1260  // together.
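 // Packing (count, ub) into one 64-bit word lets the owner bump 'count' and a
 // thief lower 'ub' with a single CAS each, so the two fields can never be
 // observed in an inconsistent mix of old and new values.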
1261  {
1262  union_i4 vold, vnew;
1263  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1264  vnew = vold;
1265  vnew.p.count++;
1266  while (!KMP_COMPARE_AND_STORE_ACQ64(
1267  (volatile kmp_int64 *)&pr->u.p.count,
1268  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1269  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1270  KMP_CPU_PAUSE();
1271  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1272  vnew = vold;
1273  vnew.p.count++;
1274  }
1275  vnew = vold;
1276  init = vnew.p.count;
1277  status = (init < (UT)vnew.p.ub);
1278  }
1279 
1280  if (!status) {
1281  kmp_info_t **other_threads = team->t.t_threads;
1282  int while_limit = nproc; // nproc attempts to find a victim
1283  int while_index = 0;
1284 
1285  // TODO: the algorithm for searching for a victim
1286  // should be cleaned up and measured
1287  while ((!status) && (while_limit != ++while_index)) {
1288  union_i4 vold, vnew;
1289  kmp_int32 remaining;
1290  T victimIdx = pr->u.p.parm4;
1291  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1292  dispatch_private_info_template<T> *victim =
1293  reinterpret_cast<dispatch_private_info_template<T> *>(
1294  other_threads[victimIdx]
1295  ->th.th_dispatch->th_dispatch_pr_current);
1296  while ((victim == NULL || victim == pr ||
1297  (*(volatile T *)&victim->u.p.static_steal_counter !=
1298  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1299  oldVictimIdx != victimIdx) {
1300  victimIdx = (victimIdx + 1) % nproc;
1301  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1302  other_threads[victimIdx]
1303  ->th.th_dispatch->th_dispatch_pr_current);
1304  }
1305  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1306  *(volatile T *)&pr->u.p.static_steal_counter)) {
1307  continue; // try once more (nproc attempts in total)
1308  // no victim is ready yet to participate in stealing
1309  // because all victims are still in kmp_init_dispatch
1310  }
1311  pr->u.p.parm4 = victimIdx; // new victim found
1312  while (1) { // CAS loop if victim has enough chunks to steal
1313  vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1314  vnew = vold;
1315 
1316  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1317  if (vnew.p.count >= (UT)vnew.p.ub ||
1318  (remaining = vnew.p.ub - vnew.p.count) < 2) {
1319  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1320  break; // not enough chunks to steal, goto next victim
1321  }
1322  if (remaining > 3) {
1323  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1324  } else {
1325  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1326  }
1327  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1328  // TODO: Should this be acquire or release?
1329  if (KMP_COMPARE_AND_STORE_ACQ64(
1330  (volatile kmp_int64 *)&victim->u.p.count,
1331  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1332  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1333  // stealing succeeded
1334  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1335  vold.p.ub - vnew.p.ub);
1336  status = 1;
1337  while_index = 0;
1338  // now update own count and ub
1339  init = vnew.p.ub;
1340  vold.p.count = init + 1;
1341 #if KMP_ARCH_X86
1342  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1343 #else
1344  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1345 #endif
1346  break;
1347  } // if (check CAS result)
1348  KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1349  } // while (try to steal from particular victim)
1350  } // while (search for victim)
1351  } // if (try to find victim and steal)
1352  } // if (4-byte induction variable)
1353  if (!status) {
1354  *p_lb = 0;
1355  *p_ub = 0;
1356  if (p_st != NULL)
1357  *p_st = 0;
1358  } else {
1359  start = pr->u.p.parm2;
1360  init *= chunk;
1361  limit = chunk + init - 1;
1362  incr = pr->u.p.st;
1363  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1364 
1365  KMP_DEBUG_ASSERT(init <= trip);
1366  if ((last = (limit >= trip)) != 0)
1367  limit = trip;
1368  if (p_st != NULL)
1369  *p_st = incr;
1370 
1371  if (incr == 1) {
1372  *p_lb = start + init;
1373  *p_ub = start + limit;
1374  } else {
1375  *p_lb = start + init * incr;
1376  *p_ub = start + limit * incr;
1377  }
1378 
1379  if (pr->flags.ordered) {
1380  pr->u.p.ordered_lower = init;
1381  pr->u.p.ordered_upper = limit;
1382  } // if
1383  } // if
1384  break;
1385  } // case
1386 #endif // ( KMP_STATIC_STEAL_ENABLED )
1387  case kmp_sch_static_balanced: {
1388  KD_TRACE(
1389  10,
1390  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1391  gtid));
1392  /* check if thread has any iteration to do */
1393  if ((status = !pr->u.p.count) != 0) {
1394  pr->u.p.count = 1;
1395  *p_lb = pr->u.p.lb;
1396  *p_ub = pr->u.p.ub;
1397  last = pr->u.p.parm1;
1398  if (p_st != NULL)
1399  *p_st = pr->u.p.st;
1400  } else { /* no iterations to do */
1401  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1402  }
1403  } // case
1404  break;
1405  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1406  merged here */
1407  case kmp_sch_static_chunked: {
1408  T parm1;
1409 
1410  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1411  "kmp_sch_static_[affinity|chunked] case\n",
1412  gtid));
1413  parm1 = pr->u.p.parm1;
1414 
1415  trip = pr->u.p.tc - 1;
1416  init = parm1 * (pr->u.p.count + tid);
1417 
1418  if ((status = (init <= trip)) != 0) {
1419  start = pr->u.p.lb;
1420  incr = pr->u.p.st;
1421  limit = parm1 + init - 1;
1422 
1423  if ((last = (limit >= trip)) != 0)
1424  limit = trip;
1425 
1426  if (p_st != NULL)
1427  *p_st = incr;
1428 
1429  pr->u.p.count += nproc;
1430 
1431  if (incr == 1) {
1432  *p_lb = start + init;
1433  *p_ub = start + limit;
1434  } else {
1435  *p_lb = start + init * incr;
1436  *p_ub = start + limit * incr;
1437  }
1438 
1439  if (pr->flags.ordered) {
1440  pr->u.p.ordered_lower = init;
1441  pr->u.p.ordered_upper = limit;
1442  } // if
1443  } // if
1444  } // case
1445  break;
1446 
1447  case kmp_sch_dynamic_chunked: {
1448  T chunk = pr->u.p.parm1;
1449 
1450  KD_TRACE(
1451  100,
1452  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1453  gtid));
1454 
1455  init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1456  trip = pr->u.p.tc - 1;
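 // Illustration only: with chunk = 4 and tc = 10, successive calls atomically
 // obtain chunk indices 0, 1, 2, ... and hand out iteration ranges 0-3, 4-7
 // and 8-9; later indices fail the init <= trip test below and return no work.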
1457 
1458  if ((status = (init <= trip)) == 0) {
1459  *p_lb = 0;
1460  *p_ub = 0;
1461  if (p_st != NULL)
1462  *p_st = 0;
1463  } else {
1464  start = pr->u.p.lb;
1465  limit = chunk + init - 1;
1466  incr = pr->u.p.st;
1467 
1468  if ((last = (limit >= trip)) != 0)
1469  limit = trip;
1470 
1471  if (p_st != NULL)
1472  *p_st = incr;
1473 
1474  if (incr == 1) {
1475  *p_lb = start + init;
1476  *p_ub = start + limit;
1477  } else {
1478  *p_lb = start + init * incr;
1479  *p_ub = start + limit * incr;
1480  }
1481 
1482  if (pr->flags.ordered) {
1483  pr->u.p.ordered_lower = init;
1484  pr->u.p.ordered_upper = limit;
1485  } // if
1486  } // if
1487  } // case
1488  break;
1489 
1490  case kmp_sch_guided_iterative_chunked: {
1491  T chunkspec = pr->u.p.parm1;
1492  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1493  "iterative case\n",
1494  gtid));
1495  trip = pr->u.p.tc;
1496  // Start atomic part of calculations
1497  while (1) {
1498  ST remaining; // signed, because can be < 0
1499  init = sh->u.s.iteration; // shared value
1500  remaining = trip - init;
1501  if (remaining <= 0) { // AC: need to compare with 0 first
1502  // nothing to do, don't try atomic op
1503  status = 0;
1504  break;
1505  }
1506  if ((T)remaining <
1507  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1508  // use dynamic-style schedule
1509  // atomically increment iterations, get old value
1510  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1511  (ST)chunkspec);
1512  remaining = trip - init;
1513  if (remaining <= 0) {
1514  status = 0; // all iterations got by other threads
1515  } else {
1516  // got some iterations to work on
1517  status = 1;
1518  if ((T)remaining > chunkspec) {
1519  limit = init + chunkspec - 1;
1520  } else {
1521  last = 1; // the last chunk
1522  limit = init + remaining - 1;
1523  } // if
1524  } // if
1525  break;
1526  } // if
1527  limit = init +
1528  (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1529  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1530  (ST)init, (ST)limit)) {
1531  // CAS was successful, chunk obtained
1532  status = 1;
1533  --limit;
1534  break;
1535  } // if
1536  } // while
1537  if (status != 0) {
1538  start = pr->u.p.lb;
1539  incr = pr->u.p.st;
1540  if (p_st != NULL)
1541  *p_st = incr;
1542  *p_lb = start + init * incr;
1543  *p_ub = start + limit * incr;
1544  if (pr->flags.ordered) {
1545  pr->u.p.ordered_lower = init;
1546  pr->u.p.ordered_upper = limit;
1547  } // if
1548  } else {
1549  *p_lb = 0;
1550  *p_ub = 0;
1551  if (p_st != NULL)
1552  *p_st = 0;
1553  } // if
1554  } // case
1555  break;
1556 
1557 #if OMP_45_ENABLED
1558  case kmp_sch_guided_simd: {
1559  // same as iterative but curr-chunk adjusted to be multiple of given
1560  // chunk
1561  T chunk = pr->u.p.parm1;
1562  KD_TRACE(100,
1563  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1564  gtid));
1565  trip = pr->u.p.tc;
1566  // Start atomic part of calculations
1567  while (1) {
1568  ST remaining; // signed, because can be < 0
1569  init = sh->u.s.iteration; // shared value
1570  remaining = trip - init;
1571  if (remaining <= 0) { // AC: need to compare with 0 first
1572  status = 0; // nothing to do, don't try atomic op
1573  break;
1574  }
1575  KMP_DEBUG_ASSERT(init % chunk == 0);
1576  // compare with K*nproc*(chunk+1), K=2 by default
1577  if ((T)remaining < pr->u.p.parm2) {
1578  // use dynamic-style schedule
1579  // atomically increment iterations, get old value
1580  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1581  (ST)chunk);
1582  remaining = trip - init;
1583  if (remaining <= 0) {
1584  status = 0; // all iterations got by other threads
1585  } else {
1586  // got some iterations to work on
1587  status = 1;
1588  if ((T)remaining > chunk) {
1589  limit = init + chunk - 1;
1590  } else {
1591  last = 1; // the last chunk
1592  limit = init + remaining - 1;
1593  } // if
1594  } // if
1595  break;
1596  } // if
1597  // divide by K*nproc
1598  UT span = remaining * (*(double *)&pr->u.p.parm3);
1599  UT rem = span % chunk;
1600  if (rem) // adjust so that span%chunk == 0
1601  span += chunk - rem;
1602  limit = init + span;
1603  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1604  (ST)init, (ST)limit)) {
1605  // CAS was successful, chunk obtained
1606  status = 1;
1607  --limit;
1608  break;
1609  } // if
1610  } // while
1611  if (status != 0) {
1612  start = pr->u.p.lb;
1613  incr = pr->u.p.st;
1614  if (p_st != NULL)
1615  *p_st = incr;
1616  *p_lb = start + init * incr;
1617  *p_ub = start + limit * incr;
1618  if (pr->flags.ordered) {
1619  pr->u.p.ordered_lower = init;
1620  pr->u.p.ordered_upper = limit;
1621  } // if
1622  } else {
1623  *p_lb = 0;
1624  *p_ub = 0;
1625  if (p_st != NULL)
1626  *p_st = 0;
1627  } // if
1628  } // case
1629  break;
1630 #endif // OMP_45_ENABLED
1631 
1632  case kmp_sch_guided_analytical_chunked: {
1633  T chunkspec = pr->u.p.parm1;
1634  UT chunkIdx;
1635 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1636  /* for storing original FPCW value for Windows* OS on
1637  IA-32 architecture 8-byte version */
1638  unsigned int oldFpcw;
1639  unsigned int fpcwSet = 0;
1640 #endif
1641  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1642  "kmp_sch_guided_analytical_chunked case\n",
1643  gtid));
1644 
1645  trip = pr->u.p.tc;
1646 
1647  KMP_DEBUG_ASSERT(nproc > 1);
1648  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1649 
1650  while (1) { /* this while loop is a safeguard against unexpected zero
1651  chunk sizes */
1652  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1653  if (chunkIdx >= (UT)pr->u.p.parm2) {
1654  --trip;
1655  /* use dynamic-style scheduling */
1656  init = chunkIdx * chunkspec + pr->u.p.count;
1657  /* need to verify init > 0 in case of overflow in the above
1658  * calculation */
1659  if ((status = (init > 0 && init <= trip)) != 0) {
1660  limit = init + chunkspec - 1;
1661 
1662  if ((last = (limit >= trip)) != 0)
1663  limit = trip;
1664  }
1665  break;
1666  } else {
1667 /* use exponential-style scheduling */
1668 /* The following check is to workaround the lack of long double precision on
1669  Windows* OS.
1670  This check works around the possible effect that init != 0 for chunkIdx == 0.
1671  */
1672 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1673  /* If we haven't already done so, save original
1674  FPCW and set precision to 64-bit, as Windows* OS
1675  on IA-32 architecture defaults to 53-bit */
1676  if (!fpcwSet) {
1677  oldFpcw = _control87(0, 0);
1678  _control87(_PC_64, _MCW_PC);
1679  fpcwSet = 0x30000;
1680  }
1681 #endif
1682  if (chunkIdx) {
1683  init = __kmp_dispatch_guided_remaining<T>(
1684  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1685  KMP_DEBUG_ASSERT(init);
1686  init = trip - init;
1687  } else
1688  init = 0;
1689  limit = trip - __kmp_dispatch_guided_remaining<T>(
1690  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1691  KMP_ASSERT(init <= limit);
1692  if (init < limit) {
1693  KMP_DEBUG_ASSERT(limit <= trip);
1694  --limit;
1695  status = 1;
1696  break;
1697  } // if
1698  } // if
1699  } // while (1)
1700 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1701  /* restore FPCW if necessary
1702  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1703  */
1704  if (fpcwSet && (oldFpcw & fpcwSet))
1705  _control87(oldFpcw, _MCW_PC);
1706 #endif
1707  if (status != 0) {
1708  start = pr->u.p.lb;
1709  incr = pr->u.p.st;
1710  if (p_st != NULL)
1711  *p_st = incr;
1712  *p_lb = start + init * incr;
1713  *p_ub = start + limit * incr;
1714  if (pr->flags.ordered) {
1715  pr->u.p.ordered_lower = init;
1716  pr->u.p.ordered_upper = limit;
1717  }
1718  } else {
1719  *p_lb = 0;
1720  *p_ub = 0;
1721  if (p_st != NULL)
1722  *p_st = 0;
1723  }
1724  } // case
1725  break;
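// Summary of the analytical case above: sh->u.s.iteration acts as a global
// chunk counter. While the counter stays below parm2, chunk boundaries come
// from __kmp_dispatch_guided_remaining() and shrink roughly geometrically
// (the "exponential-style" branch); once the counter reaches parm2, the code
// falls back to plain dynamic chunks of size chunkspec (the "dynamic-style"
// branch). The FPCW save/restore only matters on Windows* OS for IA-32, where
// the default 53-bit precision would perturb the floating-point computation.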
1726 
1727  case kmp_sch_trapezoidal: {
1728  UT index;
1729  T parm2 = pr->u.p.parm2;
1730  T parm3 = pr->u.p.parm3;
1731  T parm4 = pr->u.p.parm4;
1732  KD_TRACE(100,
1733  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1734  gtid));
1735 
1736  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1737 
1738  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1739  trip = pr->u.p.tc - 1;
1740 
1741  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1742  *p_lb = 0;
1743  *p_ub = 0;
1744  if (p_st != NULL)
1745  *p_st = 0;
1746  } else {
1747  start = pr->u.p.lb;
1748  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1749  incr = pr->u.p.st;
1750 
1751  if ((last = (limit >= trip)) != 0)
1752  limit = trip;
1753 
1754  if (p_st != NULL)
1755  *p_st = incr;
1756 
1757  if (incr == 1) {
1758  *p_lb = start + init;
1759  *p_ub = start + limit;
1760  } else {
1761  *p_lb = start + init * incr;
1762  *p_ub = start + limit * incr;
1763  }
1764 
1765  if (pr->flags.ordered) {
1766  pr->u.p.ordered_lower = init;
1767  pr->u.p.ordered_upper = limit;
1768  } // if
1769  } // if
1770  } // case
1771  break;
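// Worked example for the trapezoidal case, with assumed parameters
// (parm2 = 10 = first chunk size, parm4 = 2 = per-chunk decrement): chunk
// sizes form the arithmetic sequence 10, 8, 6, 4, ..., so for index = 3,
// init = 3 * (2*10 - 2*2) / 2 = 24 (the 10 + 8 + 6 iterations already handed
// out) and limit = 4 * (2*10 - 3*2) / 2 - 1 = 27, i.e. this call claims the
// four iterations 24..27.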
1772  default: {
1773  status = 0; // to avoid complaints on uninitialized variable use
1774  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1775  KMP_HNT(GetNewerLibrary), // Hint
1776  __kmp_msg_null // Variadic argument list terminator
1777  );
1778  } break;
1779  } // switch
1780  if (p_last)
1781  *p_last = last;
1782 #ifdef KMP_DEBUG
1783  if (pr->flags.ordered) {
1784  char *buff;
1785  // create format specifiers before the debug output
1786  buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1787  "ordered_lower:%%%s ordered_upper:%%%s\n",
1788  traits_t<UT>::spec, traits_t<UT>::spec);
1789  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1790  __kmp_str_free(&buff);
1791  }
1792  {
1793  char *buff;
1794  // create format specifiers before the debug output
1795  buff = __kmp_str_format(
1796  "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1797  "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1798  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1799  KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1800  __kmp_str_free(&buff);
1801  }
1802 #endif
1803  return status;
1804 }
1805 
1806 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1807  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1808  is not called. */
1809 #if OMPT_SUPPORT && OMPT_OPTIONAL
1810 #define OMPT_LOOP_END \
1811  if (status == 0) { \
1812  if (ompt_enabled.ompt_callback_work) { \
1813  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1814  ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1815  ompt_callbacks.ompt_callback(ompt_callback_work)( \
1816  ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1817  &(task_info->task_data), 0, codeptr); \
1818  } \
1819  }
1820 // TODO: implement count
1821 #else
1822 #define OMPT_LOOP_END // no-op
1823 #endif
1824 
1825 #if KMP_STATS_ENABLED
1826 #define KMP_STATS_LOOP_END \
1827  { \
1828  kmp_int64 u, l, t, i; \
1829  l = (kmp_int64)(*p_lb); \
1830  u = (kmp_int64)(*p_ub); \
1831  i = (kmp_int64)(pr->u.p.st); \
1832  if (status == 0) { \
1833  t = 0; \
1834  KMP_POP_PARTITIONED_TIMER(); \
1835  } else if (i == 1) { \
1836  if (u >= l) \
1837  t = u - l + 1; \
1838  else \
1839  t = 0; \
1840  } else if (i < 0) { \
1841  if (l >= u) \
1842  t = (l - u) / (-i) + 1; \
1843  else \
1844  t = 0; \
1845  } else { \
1846  if (u >= l) \
1847  t = (u - l) / i + 1; \
1848  else \
1849  t = 0; \
1850  } \
1851  KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1852  }
1853 #else
1854 #define KMP_STATS_LOOP_END /* Nothing */
1855 #endif
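// Example of the iteration count recorded by KMP_STATS_LOOP_END (assumed
// chunk bounds): for l = 0, u = 9, i = 2 it records t = (9 - 0) / 2 + 1 = 5,
// matching the iterations 0, 2, 4, 6, 8; for a negative stride such as
// l = 10, u = 1, i = -3 it records t = (10 - 1) / 3 + 1 = 4 (10, 7, 4, 1).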
1856 
1857 template <typename T>
1858 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1859  T *p_lb, T *p_ub,
1860  typename traits_t<T>::signed_t *p_st
1861 #if OMPT_SUPPORT && OMPT_OPTIONAL
1862  ,
1863  void *codeptr
1864 #endif
1865  ) {
1866 
1867  typedef typename traits_t<T>::unsigned_t UT;
1868  typedef typename traits_t<T>::signed_t ST;
1869  // This is potentially slightly misleading: schedule(runtime) will appear here
1870  // even if the actual runtime schedule is static. (Which points out a
1871  // disadvantage of schedule(runtime): even when static scheduling is used it
1872  // costs more than a compile-time choice of static scheduling would.)
1873  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1874 
1875  int status;
1876  dispatch_private_info_template<T> *pr;
1877  kmp_info_t *th = __kmp_threads[gtid];
1878  kmp_team_t *team = th->th.th_team;
1879 
1880  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1881  KD_TRACE(
1882  1000,
1883  ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1884  gtid, p_lb, p_ub, p_st, p_last));
1885 
1886  if (team->t.t_serialized) {
1887  /* NOTE: serialize this dispatch because we are not at the active level */
1888  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1889  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1890  KMP_DEBUG_ASSERT(pr);
1891 
1892  if ((status = (pr->u.p.tc != 0)) == 0) {
1893  *p_lb = 0;
1894  *p_ub = 0;
1895  // if ( p_last != NULL )
1896  // *p_last = 0;
1897  if (p_st != NULL)
1898  *p_st = 0;
1899  if (__kmp_env_consistency_check) {
1900  if (pr->pushed_ws != ct_none) {
1901  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1902  }
1903  }
1904  } else if (pr->flags.nomerge) {
1905  kmp_int32 last;
1906  T start;
1907  UT limit, trip, init;
1908  ST incr;
1909  T chunk = pr->u.p.parm1;
1910 
1911  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1912  gtid));
1913 
1914  init = chunk * pr->u.p.count++;
1915  trip = pr->u.p.tc - 1;
1916 
1917  if ((status = (init <= trip)) == 0) {
1918  *p_lb = 0;
1919  *p_ub = 0;
1920  // if ( p_last != NULL )
1921  // *p_last = 0;
1922  if (p_st != NULL)
1923  *p_st = 0;
1924  if (__kmp_env_consistency_check) {
1925  if (pr->pushed_ws != ct_none) {
1926  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1927  }
1928  }
1929  } else {
1930  start = pr->u.p.lb;
1931  limit = chunk + init - 1;
1932  incr = pr->u.p.st;
1933 
1934  if ((last = (limit >= trip)) != 0) {
1935  limit = trip;
1936 #if KMP_OS_WINDOWS
1937  pr->u.p.last_upper = pr->u.p.ub;
1938 #endif /* KMP_OS_WINDOWS */
1939  }
1940  if (p_last != NULL)
1941  *p_last = last;
1942  if (p_st != NULL)
1943  *p_st = incr;
1944  if (incr == 1) {
1945  *p_lb = start + init;
1946  *p_ub = start + limit;
1947  } else {
1948  *p_lb = start + init * incr;
1949  *p_ub = start + limit * incr;
1950  }
1951 
1952  if (pr->flags.ordered) {
1953  pr->u.p.ordered_lower = init;
1954  pr->u.p.ordered_upper = limit;
1955 #ifdef KMP_DEBUG
1956  {
1957  char *buff;
1958  // create format specifiers before the debug output
1959  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1960  "ordered_lower:%%%s ordered_upper:%%%s\n",
1961  traits_t<UT>::spec, traits_t<UT>::spec);
1962  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1963  pr->u.p.ordered_upper));
1964  __kmp_str_free(&buff);
1965  }
1966 #endif
1967  } // if
1968  } // if
1969  } else {
1970  pr->u.p.tc = 0;
1971  *p_lb = pr->u.p.lb;
1972  *p_ub = pr->u.p.ub;
1973 #if KMP_OS_WINDOWS
1974  pr->u.p.last_upper = *p_ub;
1975 #endif /* KMP_OS_WINDOWS */
1976  if (p_last != NULL)
1977  *p_last = TRUE;
1978  if (p_st != NULL)
1979  *p_st = pr->u.p.st;
1980  } // if
1981 #ifdef KMP_DEBUG
1982  {
1983  char *buff;
1984  // create format specifiers before the debug output
1985  buff = __kmp_str_format(
1986  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1987  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1988  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1989  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
1990  (p_last ? *p_last : 0), status));
1990  __kmp_str_free(&buff);
1991  }
1992 #endif
1993 #if INCLUDE_SSC_MARKS
1994  SSC_MARK_DISPATCH_NEXT();
1995 #endif
1996  OMPT_LOOP_END;
1997  KMP_STATS_LOOP_END;
1998  return status;
1999  } else {
2000  kmp_int32 last = 0;
2001  dispatch_shared_info_template<T> volatile *sh;
2002 
2003  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2004  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2005 
2006  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2007  th->th.th_dispatch->th_dispatch_pr_current);
2008  KMP_DEBUG_ASSERT(pr);
2009  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2010  th->th.th_dispatch->th_dispatch_sh_current);
2011  KMP_DEBUG_ASSERT(sh);
2012 
2013 #if KMP_USE_HIER_SCHED
2014  if (pr->flags.use_hier)
2015  status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2016  else
2017 #endif // KMP_USE_HIER_SCHED
2018  status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2019  p_st, th->th.th_team_nproc,
2020  th->th.th_info.ds.ds_tid);
2021  // status == 0: no more iterations to execute
2022  if (status == 0) {
2023  UT num_done;
2024 
2025  num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2026 #ifdef KMP_DEBUG
2027  {
2028  char *buff;
2029  // create format specifiers before the debug output
2030  buff = __kmp_str_format(
2031  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2032  traits_t<UT>::spec);
2033  KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2034  __kmp_str_free(&buff);
2035  }
2036 #endif
2037 
2038 #if KMP_USE_HIER_SCHED
2039  pr->flags.use_hier = FALSE;
2040 #endif
2041  if ((ST)num_done == th->th.th_team_nproc - 1) {
2042 #if (KMP_STATIC_STEAL_ENABLED)
2043  if (pr->schedule == kmp_sch_static_steal &&
2044  traits_t<T>::type_size > 4) {
2045  int i;
2046  kmp_info_t **other_threads = team->t.t_threads;
2047  // loop complete, safe to destroy locks used for stealing
2048  for (i = 0; i < th->th.th_team_nproc; ++i) {
2049  kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2050  KMP_ASSERT(lck != NULL);
2051  __kmp_destroy_lock(lck);
2052  __kmp_free(lck);
2053  other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2054  }
2055  }
2056 #endif
2057  /* NOTE: release this buffer to be reused */
2058 
2059  KMP_MB(); /* Flush all pending memory write invalidates. */
2060 
2061  sh->u.s.num_done = 0;
2062  sh->u.s.iteration = 0;
2063 
2064  /* TODO replace with general release procedure? */
2065  if (pr->flags.ordered) {
2066  sh->u.s.ordered_iteration = 0;
2067  }
2068 
2069  KMP_MB(); /* Flush all pending memory write invalidates. */
2070 
2071  sh->buffer_index += __kmp_dispatch_num_buffers;
2072  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2073  gtid, sh->buffer_index));
2074 
2075  KMP_MB(); /* Flush all pending memory write invalidates. */
2076 
2077  } // if
2078  if (__kmp_env_consistency_check) {
2079  if (pr->pushed_ws != ct_none) {
2080  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2081  }
2082  }
2083 
2084  th->th.th_dispatch->th_deo_fcn = NULL;
2085  th->th.th_dispatch->th_dxo_fcn = NULL;
2086  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2087  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2088  } // if (status == 0)
2089 #if KMP_OS_WINDOWS
2090  else if (last) {
2091  pr->u.p.last_upper = pr->u.p.ub;
2092  }
2093 #endif /* KMP_OS_WINDOWS */
2094  if (p_last != NULL && status != 0)
2095  *p_last = last;
2096  } // if
2097 
2098 #ifdef KMP_DEBUG
2099  {
2100  char *buff;
2101  // create format specifiers before the debug output
2102  buff = __kmp_str_format(
2103  "__kmp_dispatch_next: T#%%d normal case: "
2104  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2105  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2106  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2107  (p_last ? *p_last : 0), status));
2108  __kmp_str_free(&buff);
2109  }
2110 #endif
2111 #if INCLUDE_SSC_MARKS
2112  SSC_MARK_DISPATCH_NEXT();
2113 #endif
2114  OMPT_LOOP_END;
2115  KMP_STATS_LOOP_END;
2116  return status;
2117 }
2118 
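// __kmp_dist_get_bounds() narrows [*plower, *pupper] to the subrange owned by
// the calling team of a teams construct: the global trip count is split across
// nteams either one iteration per team (trip_count <= nteams), in near-equal
// chunks with the remainder spread over the first teams
// (kmp_sch_static_balanced), or in equal-sized greedy chunks
// (kmp_sch_static_greedy). *plastiter is set for the team that receives the
// final iteration.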
2119 template <typename T>
2120 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2121  kmp_int32 *plastiter, T *plower, T *pupper,
2122  typename traits_t<T>::signed_t incr) {
2123  typedef typename traits_t<T>::unsigned_t UT;
2124  kmp_uint32 team_id;
2125  kmp_uint32 nteams;
2126  UT trip_count;
2127  kmp_team_t *team;
2128  kmp_info_t *th;
2129 
2130  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2131  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2132 #ifdef KMP_DEBUG
2133  typedef typename traits_t<T>::signed_t ST;
2134  {
2135  char *buff;
2136  // create format specifiers before the debug output
2137  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2138  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2139  traits_t<T>::spec, traits_t<T>::spec,
2140  traits_t<ST>::spec, traits_t<T>::spec);
2141  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2142  __kmp_str_free(&buff);
2143  }
2144 #endif
2145 
2146  if (__kmp_env_consistency_check) {
2147  if (incr == 0) {
2148  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2149  loc);
2150  }
2151  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2152  // The loop is illegal.
2153  // Some zero-trip loops maintained by compiler, e.g.:
2154  // for(i=10;i<0;++i) // lower >= upper - run-time check
2155  // for(i=0;i>10;--i) // lower <= upper - run-time check
2156  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2157  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2158  // Compiler does not check the following illegal loops:
2159  // for(i=0;i<10;i+=incr) // where incr<0
2160  // for(i=10;i>0;i-=incr) // where incr<0
2161  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2162  }
2163  }
2164  th = __kmp_threads[gtid];
2165  team = th->th.th_team;
2166 #if OMP_40_ENABLED
2167  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2168  nteams = th->th.th_teams_size.nteams;
2169 #endif
2170  team_id = team->t.t_master_tid;
2171  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2172 
2173  // compute global trip count
2174  if (incr == 1) {
2175  trip_count = *pupper - *plower + 1;
2176  } else if (incr == -1) {
2177  trip_count = *plower - *pupper + 1;
2178  } else if (incr > 0) {
2179  // upper-lower can exceed the limit of signed type
2180  trip_count = (UT)(*pupper - *plower) / incr + 1;
2181  } else {
2182  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2183  }
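  // Example (assumed bounds): *plower = 0, *pupper = 9, incr = 2 gives
  // trip_count = (9 - 0) / 2 + 1 = 5; *plower = 10, *pupper = 1, incr = -3
  // gives trip_count = (10 - 1) / 3 + 1 = 4. The unsigned cast keeps the
  // division well defined when the span exceeds the signed range.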
2184 
2185  if (trip_count <= nteams) {
2186  KMP_DEBUG_ASSERT(
2187  __kmp_static == kmp_sch_static_greedy ||
2188  __kmp_static ==
2189  kmp_sch_static_balanced); // Unknown static scheduling type.
2190  // only some teams get single iteration, others get nothing
2191  if (team_id < trip_count) {
2192  *pupper = *plower = *plower + team_id * incr;
2193  } else {
2194  *plower = *pupper + incr; // zero-trip loop
2195  }
2196  if (plastiter != NULL)
2197  *plastiter = (team_id == trip_count - 1);
2198  } else {
2199  if (__kmp_static == kmp_sch_static_balanced) {
2200  UT chunk = trip_count / nteams;
2201  UT extras = trip_count % nteams;
2202  *plower +=
2203  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2204  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2205  if (plastiter != NULL)
2206  *plastiter = (team_id == nteams - 1);
2207  } else {
2208  T chunk_inc_count =
2209  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2210  T upper = *pupper;
2211  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2212  // Unknown static scheduling type.
2213  *plower += team_id * chunk_inc_count;
2214  *pupper = *plower + chunk_inc_count - incr;
2215  // Check/correct bounds if needed
2216  if (incr > 0) {
2217  if (*pupper < *plower)
2218  *pupper = traits_t<T>::max_value;
2219  if (plastiter != NULL)
2220  *plastiter = *plower <= upper && *pupper > upper - incr;
2221  if (*pupper > upper)
2222  *pupper = upper; // tracker C73258
2223  } else {
2224  if (*pupper > *plower)
2225  *pupper = traits_t<T>::min_value;
2226  if (plastiter != NULL)
2227  *plastiter = *plower >= upper && *pupper < upper - incr;
2228  if (*pupper < upper)
2229  *pupper = upper; // tracker C73258
2230  }
2231  }
2232  }
2233 }
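// Example of the balanced split above (assumed values): trip_count = 10,
// nteams = 3, incr = 1 gives chunk = 3, extras = 1, so team 0 gets 4
// iterations (offsets 0..3) and teams 1 and 2 get 3 each (4..6 and 7..9);
// only team 2 sees *plastiter set.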
2234 
2235 //-----------------------------------------------------------------------------
2236 // Dispatch routines
2237 // Transfer call to template< type T >
2238 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2239 // T lb, T ub, ST st, ST chunk )
2240 extern "C" {
2241 
2258 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2259  enum sched_type schedule, kmp_int32 lb,
2260  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2261  KMP_DEBUG_ASSERT(__kmp_init_serial);
2262 #if OMPT_SUPPORT && OMPT_OPTIONAL
2263  OMPT_STORE_RETURN_ADDRESS(gtid);
2264 #endif
2265  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2266 }
2270 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2271  enum sched_type schedule, kmp_uint32 lb,
2272  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2273  KMP_DEBUG_ASSERT(__kmp_init_serial);
2274 #if OMPT_SUPPORT && OMPT_OPTIONAL
2275  OMPT_STORE_RETURN_ADDRESS(gtid);
2276 #endif
2277  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2278 }
2279 
2283 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2284  enum sched_type schedule, kmp_int64 lb,
2285  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2286  KMP_DEBUG_ASSERT(__kmp_init_serial);
2287 #if OMPT_SUPPORT && OMPT_OPTIONAL
2288  OMPT_STORE_RETURN_ADDRESS(gtid);
2289 #endif
2290  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2291 }
2292 
2296 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2297  enum sched_type schedule, kmp_uint64 lb,
2298  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2299  KMP_DEBUG_ASSERT(__kmp_init_serial);
2300 #if OMPT_SUPPORT && OMPT_OPTIONAL
2301  OMPT_STORE_RETURN_ADDRESS(gtid);
2302 #endif
2303  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2304 }
2305 
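// The __kmpc_dist_dispatch_init_* entry points below are used for loops inside
// a teams construct: they first shrink [lb, ub] to the calling team's portion
// via __kmp_dist_get_bounds() and then run the regular dispatch initialization
// on that subrange.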
2315 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2316  enum sched_type schedule, kmp_int32 *p_last,
2317  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2318  kmp_int32 chunk) {
2319  KMP_DEBUG_ASSERT(__kmp_init_serial);
2320 #if OMPT_SUPPORT && OMPT_OPTIONAL
2321  OMPT_STORE_RETURN_ADDRESS(gtid);
2322 #endif
2323  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2324  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2325 }
2326 
2327 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2328  enum sched_type schedule, kmp_int32 *p_last,
2329  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2330  kmp_int32 chunk) {
2331  KMP_DEBUG_ASSERT(__kmp_init_serial);
2332 #if OMPT_SUPPORT && OMPT_OPTIONAL
2333  OMPT_STORE_RETURN_ADDRESS(gtid);
2334 #endif
2335  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2336  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2337 }
2338 
2339 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2340  enum sched_type schedule, kmp_int32 *p_last,
2341  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2342  kmp_int64 chunk) {
2343  KMP_DEBUG_ASSERT(__kmp_init_serial);
2344 #if OMPT_SUPPORT && OMPT_OPTIONAL
2345  OMPT_STORE_RETURN_ADDRESS(gtid);
2346 #endif
2347  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2348  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2349 }
2350 
2351 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2352  enum sched_type schedule, kmp_int32 *p_last,
2353  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2354  kmp_int64 chunk) {
2355  KMP_DEBUG_ASSERT(__kmp_init_serial);
2356 #if OMPT_SUPPORT && OMPT_OPTIONAL
2357  OMPT_STORE_RETURN_ADDRESS(gtid);
2358 #endif
2359  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2360  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2361 }
2362 
2376 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2377  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2378 #if OMPT_SUPPORT && OMPT_OPTIONAL
2379  OMPT_STORE_RETURN_ADDRESS(gtid);
2380 #endif
2381  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2382 #if OMPT_SUPPORT && OMPT_OPTIONAL
2383  ,
2384  OMPT_LOAD_RETURN_ADDRESS(gtid)
2385 #endif
2386  );
2387 }
2388 
2392 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2393  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2394  kmp_int32 *p_st) {
2395 #if OMPT_SUPPORT && OMPT_OPTIONAL
2396  OMPT_STORE_RETURN_ADDRESS(gtid);
2397 #endif
2398  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2399 #if OMPT_SUPPORT && OMPT_OPTIONAL
2400  ,
2401  OMPT_LOAD_RETURN_ADDRESS(gtid)
2402 #endif
2403  );
2404 }
2405 
2409 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2410  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2411 #if OMPT_SUPPORT && OMPT_OPTIONAL
2412  OMPT_STORE_RETURN_ADDRESS(gtid);
2413 #endif
2414  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2415 #if OMPT_SUPPORT && OMPT_OPTIONAL
2416  ,
2417  OMPT_LOAD_RETURN_ADDRESS(gtid)
2418 #endif
2419  );
2420 }
2421 
2425 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2426  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2427  kmp_int64 *p_st) {
2428 #if OMPT_SUPPORT && OMPT_OPTIONAL
2429  OMPT_STORE_RETURN_ADDRESS(gtid);
2430 #endif
2431  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2432 #if OMPT_SUPPORT && OMPT_OPTIONAL
2433  ,
2434  OMPT_LOAD_RETURN_ADDRESS(gtid)
2435 #endif
2436  );
2437 }
2438 
2445 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2446  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2447 }
2448 
2452 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2453  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2454 }
2455 
2459 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2460  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2461 }
2462 
2466 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2467  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2468 }
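// Sketch of the expected caller pattern for the entry points above (names in
// the body are placeholders, not part of this file): a compiler lowering
//   #pragma omp for schedule(dynamic, chunk)
// over the iterations 0 .. N-1 would emit, per thread, roughly
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, chunk);
//   kmp_int32 lb, ub, st, last;
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st)
//       body(i); // placeholder loop body
//   }
// with __kmpc_dispatch_fini_4() additionally called at the end of each chunk
// when the loop is ordered.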
2471 //-----------------------------------------------------------------------------
2472 // Non-template routines from kmp_dispatch.cpp used in other sources
2473 
2474 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2475  return value == checker;
2476 }
2477 
2478 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2479  return value != checker;
2480 }
2481 
2482 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2483  return value < checker;
2484 }
2485 
2486 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2487  return value >= checker;
2488 }
2489 
2490 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2491  return value <= checker;
2492 }
2493 
2494 kmp_uint32
2495 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2496  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2497  void *obj // Higher-level synchronization object, or NULL.
2498  ) {
2499  // note: we may not belong to a team at this point
2500  volatile kmp_uint32 *spin = spinner;
2501  kmp_uint32 check = checker;
2502  kmp_uint32 spins;
2503  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2504  kmp_uint32 r;
2505 
2506  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2507  KMP_INIT_YIELD(spins);
2508  // main wait spin loop
2509  while (!f(r = TCR_4(*spin), check)) {
2510  KMP_FSYNC_SPIN_PREPARE(obj);
2511  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2512  split. It causes problems with infinite recursion because of exit lock */
2513  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2514  __kmp_abort_thread(); */
2515 
2516  /* if we have waited a bit, or are oversubscribed, yield */
2517  /* pause is in the following code */
2518  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2519  KMP_YIELD_SPIN(spins);
2520  }
2521  KMP_FSYNC_SPIN_ACQUIRED(obj);
2522  return r;
2523 }
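// Hypothetical use of the helper above: __kmp_wait_yield_4(&flag, 1,
// __kmp_eq_4, NULL) spins (yielding after a while or under oversubscription)
// until the 32-bit location flag holds 1, then returns the observed value.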
2524 
2525 void __kmp_wait_yield_4_ptr(
2526  void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2527  void *obj // Higher-level synchronization object, or NULL.
2528  ) {
2529  // note: we may not belong to a team at this point
2530  void *spin = spinner;
2531  kmp_uint32 check = checker;
2532  kmp_uint32 spins;
2533  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2534 
2535  KMP_FSYNC_SPIN_INIT(obj, spin);
2536  KMP_INIT_YIELD(spins);
2537  // main wait spin loop
2538  while (!f(spin, check)) {
2539  KMP_FSYNC_SPIN_PREPARE(obj);
2540  /* if we have waited a bit, or are oversubscribed, yield */
2541  /* pause is in the following code */
2542  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2543  KMP_YIELD_SPIN(spins);
2544  }
2545  KMP_FSYNC_SPIN_ACQUIRED(obj);
2546 }
2547 
2548 } // extern "C"
2549 
2550 #ifdef KMP_GOMP_COMPAT
2551 
2552 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2553  enum sched_type schedule, kmp_int32 lb,
2554  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2555  int push_ws) {
2556  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2557  push_ws);
2558 }
2559 
2560 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2561  enum sched_type schedule, kmp_uint32 lb,
2562  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2563  int push_ws) {
2564  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2565  push_ws);
2566 }
2567 
2568 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2569  enum sched_type schedule, kmp_int64 lb,
2570  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2571  int push_ws) {
2572  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2573  push_ws);
2574 }
2575 
2576 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2577  enum sched_type schedule, kmp_uint64 lb,
2578  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2579  int push_ws) {
2580  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2581  push_ws);
2582 }
2583 
2584 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2585  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2586 }
2587 
2588 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2589  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2590 }
2591 
2592 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2593  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2594 }
2595 
2596 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2597  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2598 }
2599 
2600 #endif /* KMP_GOMP_COMPAT */
2601 
2602 /* ------------------------------------------------------------------------ */