LLVM OpenMP* Runtime Library
kmp_dispatch.h
/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
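
// Illustrative usage sketch (comment only, not a definition in this header):
// a caller first seeds the per-thread descriptor with
// __kmp_dispatch_init_algorithm(), then claims chunks until
// __kmp_dispatch_next_algorithm() returns 0. The names pr/sh/nproc/tid below
// are assumed locals, not symbols defined here:
//
//   kmp_int32 lb, ub, st, last;
//   while (__kmp_dispatch_next_algorithm<kmp_int32>(gtid, pr, sh, &last, &lb,
//                                                   &ub, &st, nproc, tid)) {
//     for (kmp_int32 i = lb; i <= ub; i += st) { /* loop body */
//     }
//   }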

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
  // a) parm3 is properly aligned and
  // b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise the size of the structure is not
  // correct in our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  kmp_uint32 ordered_bumped;
  // dummy to retain the structure size after making ordered_bumped scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};
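
// Explanatory note (added for clarity): the union with
// dispatch_private_info64_t keeps this template the same size for 32-bit and
// 64-bit instantiations, so the runtime can reinterpret_cast the type-erased
// th_dispatch_pr_current buffer to either instantiation, as
// __kmp_dispatch_deo()/__kmp_dispatch_dxo() below do.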

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}
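
// Illustrative sketch (hypothetical helper, not defined by the runtime):
// compare_and_swap() returns non-zero iff *p held c and was replaced by s, so
// lock-free updates follow the usual read/retry pattern:
//
//   template <typename T> static void atomic_store_min(volatile T *p, T v) {
//     T old = *p;
//     while (old > v && !compare_and_swap<T>(p, old, v))
//       old = *p; // lost the race: re-read and retry
//   }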

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
    Spin wait loop that first does pause, then yield.
    Waits until the predicate returns non-zero when called with *spinner and
    check. Does NOT put threads to sleep.
    Arguments:
        UT is unsigned 4- or 8-byte type
        spinner - memory location to check value
        checker - value against which *spinner is compared (>, <, ==, etc.)
        pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
        obj -- is higher-level synchronization object to report to ittnotify.
        It is used to report locks consistently. For example, if a lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
        should report the same address, not the address of the low-level
        spinner.
#endif // USE_ITT_BUILD
    TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
                           kmp_uint32 (*pred)(UT, UT)
                               USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    // if we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield; pause is in the following code
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
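
// Example use (taken from __kmp_dispatch_deo() below): block until the shared
// ordered_iteration counter reaches this thread's ordered lower bound:
//
//   __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
//                        __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));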

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}
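
// Explanatory note (added for clarity): __kmp_dispatch_deo() and
// __kmp_dispatch_dxo() form the ordered-section handshake. deo spins until
// sh->u.s.ordered_iteration reaches the chunk's ordered_lower bound; dxo then
// releases the next thread in iteration order by atomically incrementing that
// same counter via test_then_inc().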

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
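
// Worked example (for illustration): __kmp_pow(x, 5) scans y = 5 = 0b101 from
// the low bit: s *= x (bit 0), x is squared twice, then s *= x^4 (bit 2),
// returning x^5 after O(log y) multiplications.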

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx).
   __forceinline seems to be broken here: if this function is __forceinline'd,
   the behavior is wrong (one of the unit tests,
   sch_guided_analytical_basic.cpp, fails). */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we workaround that in the caller code,
     by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack
     of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
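
// Worked example (values assumed for illustration): with tc = 1000 total
// iterations, base = 0.875 and idx = 8 chunks already handed out,
// 1000 * 0.875^8 = 343.6..., so the function returns the rounded-up value 344.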

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution is flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.
// trip / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
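
// Worked example (values assumed for illustration): with the default n = 2,
// nproc = 4 and chunk = 7, each grab takes roughly
// remaining * p3 = remaining / 8 iterations, and the scheme switches to plain
// dynamic scheduling once remaining <= p2 = 2 * 4 * (7 + 1) = 64.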
#endif // KMP_DISPATCH_H