Intel® OpenMP* Runtime Library
 All Classes Functions Variables Typedefs Enumerations Enumerator Groups Pages
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  * $Revision: 42489 $
4  * $Date: 2013-07-08 11:00:09 -0500 (Mon, 08 Jul 2013) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2013 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 /*
38  * Dynamic scheduling initialization and dispatch.
39  *
40  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
41  * it may change values between parallel regions. __kmp_max_nth
42  * is the largest value __kmp_nth may take, 1 is the smallest.
43  *
44  */
45 
46 /* ------------------------------------------------------------------------ */
47 /* ------------------------------------------------------------------------ */
48 
49 #include "kmp.h"
50 #include "kmp_i18n.h"
51 #include "kmp_itt.h"
52 #include "kmp_str.h"
53 #include "kmp_error.h"
54 #if KMP_OS_WINDOWS && KMP_ARCH_X86
55  #include <float.h>
56 #endif
57 
58 /* ------------------------------------------------------------------------ */
59 /* ------------------------------------------------------------------------ */
60 
61 #ifdef KMP_STATIC_STEAL_ENABLED
62 
63  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
64  template< typename T >
65  struct dispatch_private_infoXX_template {
66  typedef typename traits_t< T >::unsigned_t UT;
67  typedef typename traits_t< T >::signed_t ST;
68  UT count; // unsigned
69  T ub;
70  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
71  T lb;
72  ST st; // signed
73  UT tc; // unsigned
74  T static_steal_counter; // for static_steal only; maybe better to put after ub
75 
76  /* parm[1-4] are used in different ways by different scheduling algorithms */
77 
78  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
79  // a) parm3 is properly aligned and
80  // b) all parm1-4 are in the same cache line.
81  // Because of parm1-4 are used together, performance seems to be better
82  // if they are in the same line (not measured though).
83 
84  struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
85  T parm1;
86  T parm2;
87  T parm3;
88  T parm4;
89  };
90 
91  UT ordered_lower; // unsigned
92  UT ordered_upper; // unsigned
93  #if KMP_OS_WINDOWS
94  T last_upper;
95  #endif /* KMP_OS_WINDOWS */
96  };
97 
98 #else /* KMP_STATIC_STEAL_ENABLED */
99 
100  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
101  template< typename T >
102  struct dispatch_private_infoXX_template {
103  typedef typename traits_t< T >::unsigned_t UT;
104  typedef typename traits_t< T >::signed_t ST;
105  T lb;
106  T ub;
107  ST st; // signed
108  UT tc; // unsigned
109 
110  T parm1;
111  T parm2;
112  T parm3;
113  T parm4;
114 
115  UT count; // unsigned
116 
117  UT ordered_lower; // unsigned
118  UT ordered_upper; // unsigned
119  #if KMP_OS_WINDOWS
120  T last_upper;
121  #endif /* KMP_OS_WINDOWS */
122  };
123 
124 #endif /* KMP_STATIC_STEAL_ENABLED */
125 
126 // replaces dispatch_private_info structure and dispatch_private_info_t type
127 template< typename T >
128 struct KMP_ALIGN_CACHE dispatch_private_info_template {
129  // duplicate alignment here, otherwise size of structure is not correct in our compiler
130  union KMP_ALIGN_CACHE private_info_tmpl {
131  dispatch_private_infoXX_template< T > p;
132  dispatch_private_info64_t p64;
133  } u;
134  enum sched_type schedule; /* scheduling algorithm */
135  kmp_uint32 ordered; /* ordered clause specified */
136  kmp_uint32 ordered_bumped;
137  kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order
138  dispatch_private_info * next; /* stack of buffers for nest of serial regions */
139  kmp_uint32 nomerge; /* don't merge iters if serialized */
140  kmp_uint32 type_size;
141  enum cons_type pushed_ws;
142 };
143 
144 
145 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
146 template< typename UT >
147 struct dispatch_shared_infoXX_template {
148  /* chunk index under dynamic, number of idle threads under static-steal;
149  iteration index otherwise */
150  volatile UT iteration;
151  volatile UT num_done;
152  volatile UT ordered_iteration;
153  UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
154 };
155 
156 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
157 template< typename UT >
158 struct dispatch_shared_info_template {
159  // we need union here to keep the structure size
160  union shared_info_tmpl {
161  dispatch_shared_infoXX_template< UT > s;
162  dispatch_shared_info64_t s64;
163  } u;
164  volatile kmp_uint32 buffer_index;
165 };
166 
167 /* ------------------------------------------------------------------------ */
168 /* ------------------------------------------------------------------------ */
169 
170 static void
171 __kmp_static_delay( int arg )
172 {
173  /* Work around weird code-gen bug that causes assert to trip */
174  #if KMP_ARCH_X86_64 && KMP_OS_LINUX
175  #else
176  KMP_ASSERT( arg >= 0 );
177  #endif
178 }
179 
180 static void
181 __kmp_static_yield( int arg )
182 {
183  __kmp_yield( arg );
184 }
185 
186 #undef USE_TEST_LOCKS
187 
188 // test_then_add template (general template should NOT be used)
189 template< typename T >
190 static __forceinline T
191 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
192 
193 template<>
194 __forceinline kmp_int32
195 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
196 {
197  kmp_int32 r;
198  r = KMP_TEST_THEN_ADD32( p, d );
199  return r;
200 }
201 
202 template<>
203 __forceinline kmp_int64
204 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
205 {
206  kmp_int64 r;
207  r = KMP_TEST_THEN_ADD64( p, d );
208  return r;
209 }
210 
211 // test_then_inc_acq template (general template should NOT be used)
212 template< typename T >
213 static __forceinline T
214 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
215 
216 template<>
217 __forceinline kmp_int32
218 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
219 {
220  kmp_int32 r;
221  r = KMP_TEST_THEN_INC_ACQ32( p );
222  return r;
223 }
224 
225 template<>
226 __forceinline kmp_int64
227 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
228 {
229  kmp_int64 r;
230  r = KMP_TEST_THEN_INC_ACQ64( p );
231  return r;
232 }
233 
234 // test_then_inc template (general template should NOT be used)
235 template< typename T >
236 static __forceinline T
237 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
238 
239 template<>
240 __forceinline kmp_int32
241 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
242 {
243  kmp_int32 r;
244  r = KMP_TEST_THEN_INC32( p );
245  return r;
246 }
247 
248 template<>
249 __forceinline kmp_int64
250 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
251 {
252  kmp_int64 r;
253  r = KMP_TEST_THEN_INC64( p );
254  return r;
255 }
256 
257 // compare_and_swap template (general template should NOT be used)
258 template< typename T >
259 static __forceinline kmp_int32
260 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
261 
262 template<>
263 __forceinline kmp_int32
264 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
265 {
266  return KMP_COMPARE_AND_STORE_REL32( p, c, s );
267 }
268 
269 template<>
270 __forceinline kmp_int32
271 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
272 {
273  return KMP_COMPARE_AND_STORE_REL64( p, c, s );
274 }
275 
276 /*
277  Spin wait loop that first does pause, then yield.
278  Waits until function returns non-zero when called with *spinner and check.
279  Does NOT put threads to sleep.
280 #if USE_ITT_BUILD
281  Arguments:
282  obj -- is higher-level syncronization object to report to ittnotify. It is used to report
283  locks consistently. For example, if lock is acquired immediately, its address is
284  reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired
285  immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same
286  address, not an address of low-level spinner.
287 #endif // USE_ITT_BUILD
288 */
289 template< typename UT >
290 // ToDo: make inline function (move to header file for icl)
291 static UT // unsigned 4- or 8-byte type
292 __kmp_wait_yield( volatile UT * spinner,
293  UT checker,
294  kmp_uint32 (* pred)( UT, UT )
295  USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
296  )
297 {
298  // note: we may not belong to a team at this point
299  register volatile UT * spin = spinner;
300  register UT check = checker;
301  register kmp_uint32 spins;
302  register kmp_uint32 (*f) ( UT, UT ) = pred;
303  register UT r;
304 
305  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
306  KMP_INIT_YIELD( spins );
307  // main wait spin loop
308 #if USE_ITT_BUILD && defined (USE_ITT) && KMP_OS_WINDOWS
309  //&& ( ___kmp_size_type > 4 )
310  // ITT + Windows* OS --> volatile
311  while(!f(r = *(volatile UT *)spin, check))
312 #else
313  while(!f(r = *spin, check))
314 #endif
315  {
316  KMP_FSYNC_SPIN_PREPARE( obj );
317  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
318  It causes problems with infinite recursion because of exit lock */
319  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
320  __kmp_abort_thread(); */
321 
322  __kmp_static_delay(TRUE);
323 
324  // if we are oversubscribed,
325  // or have waited a bit (and KMP_LIBRARY=throughput, then yield
326  // pause is in the following code
327  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
328  KMP_YIELD_SPIN( spins );
329  }
330  KMP_FSYNC_SPIN_ACQUIRED( obj );
331  return r;
332 }
333 
334 template< typename UT >
335 static kmp_uint32 __kmp_eq( UT value, UT checker) {
336  return value == checker;
337 }
338 
339 template< typename UT >
340 static kmp_uint32 __kmp_neq( UT value, UT checker) {
341  return value != checker;
342 }
343 
344 template< typename UT >
345 static kmp_uint32 __kmp_lt( UT value, UT checker) {
346  return value < checker;
347 }
348 
349 template< typename UT >
350 static kmp_uint32 __kmp_ge( UT value, UT checker) {
351  return value >= checker;
352 }
353 
354 template< typename UT >
355 static kmp_uint32 __kmp_le( UT value, UT checker) {
356  return value <= checker;
357 }
358 
359 
360 /* ------------------------------------------------------------------------ */
361 /* ------------------------------------------------------------------------ */
362 
363 static void
364 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
365 {
366  kmp_info_t *th;
367 
368  KMP_DEBUG_ASSERT( gtid_ref );
369 
370  if ( __kmp_env_consistency_check ) {
371  th = __kmp_threads[*gtid_ref];
372  if ( th -> th.th_root -> r.r_active
373  && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
374  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
375  }
376  }
377 }
378 
379 template< typename UT >
380 static void
381 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
382 {
383  typedef typename traits_t< UT >::signed_t ST;
384  dispatch_private_info_template< UT > * pr;
385 
386  int gtid = *gtid_ref;
387 // int cid = *cid_ref;
388  kmp_info_t *th = __kmp_threads[ gtid ];
389  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
390 
391  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
392  if ( __kmp_env_consistency_check ) {
393  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
394  ( th -> th.th_dispatch -> th_dispatch_pr_current );
395  if ( pr -> pushed_ws != ct_none ) {
396  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
397  }
398  }
399 
400  if ( ! th -> th.th_team -> t.t_serialized ) {
401  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
402  ( th -> th.th_dispatch -> th_dispatch_sh_current );
403  UT lower;
404 
405  if ( ! __kmp_env_consistency_check ) {
406  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
407  ( th -> th.th_dispatch -> th_dispatch_pr_current );
408  }
409  lower = pr->u.p.ordered_lower;
410 
411  #if ! defined( KMP_GOMP_COMPAT )
412  if ( __kmp_env_consistency_check ) {
413  if ( pr->ordered_bumped ) {
414  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
415  __kmp_error_construct2(
416  kmp_i18n_msg_CnsMultipleNesting,
417  ct_ordered_in_pdo, loc_ref,
418  & p->stack_data[ p->w_top ]
419  );
420  }
421  }
422  #endif /* !defined(KMP_GOMP_COMPAT) */
423 
424  KMP_MB();
425  #ifdef KMP_DEBUG
426  {
427  const char * buff;
428  // create format specifiers before the debug output
429  buff = __kmp_str_format(
430  "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
431  traits_t< UT >::spec, traits_t< UT >::spec );
432  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
433  __kmp_str_free( &buff );
434  }
435  #endif
436 
437  __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
438  USE_ITT_BUILD_ARG( NULL )
439  );
440  KMP_MB(); /* is this necessary? */
441  #ifdef KMP_DEBUG
442  {
443  const char * buff;
444  // create format specifiers before the debug output
445  buff = __kmp_str_format(
446  "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
447  traits_t< UT >::spec, traits_t< UT >::spec );
448  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
449  __kmp_str_free( &buff );
450  }
451  #endif
452  }
453  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
454 }
455 
456 static void
457 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
458 {
459  kmp_info_t *th;
460 
461  if ( __kmp_env_consistency_check ) {
462  th = __kmp_threads[*gtid_ref];
463  if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
464  __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
465  }
466  }
467 }
468 
469 template< typename UT >
470 static void
471 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
472 {
473  typedef typename traits_t< UT >::signed_t ST;
474  dispatch_private_info_template< UT > * pr;
475 
476  int gtid = *gtid_ref;
477 // int cid = *cid_ref;
478  kmp_info_t *th = __kmp_threads[ gtid ];
479  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
480 
481  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
482  if ( __kmp_env_consistency_check ) {
483  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
484  ( th -> th.th_dispatch -> th_dispatch_pr_current );
485  if ( pr -> pushed_ws != ct_none ) {
486  __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
487  }
488  }
489 
490  if ( ! th -> th.th_team -> t.t_serialized ) {
491  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
492  ( th -> th.th_dispatch -> th_dispatch_sh_current );
493 
494  if ( ! __kmp_env_consistency_check ) {
495  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
496  ( th -> th.th_dispatch -> th_dispatch_pr_current );
497  }
498 
499  KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
500  #if ! defined( KMP_GOMP_COMPAT )
501  if ( __kmp_env_consistency_check ) {
502  if ( pr->ordered_bumped != 0 ) {
503  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
504  /* How to test it? - OM */
505  __kmp_error_construct2(
506  kmp_i18n_msg_CnsMultipleNesting,
507  ct_ordered_in_pdo, loc_ref,
508  & p->stack_data[ p->w_top ]
509  );
510  }
511  }
512  #endif /* !defined(KMP_GOMP_COMPAT) */
513 
514  KMP_MB(); /* Flush all pending memory write invalidates. */
515 
516  pr->ordered_bumped += 1;
517 
518  KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
519  gtid, pr->ordered_bumped ) );
520 
521  KMP_MB(); /* Flush all pending memory write invalidates. */
522 
523  /* TODO use general release procedure? */
524  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
525 
526  KMP_MB(); /* Flush all pending memory write invalidates. */
527  }
528  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
529 }
530 
531 /* Computes and returns x to the power of y, where y must a non-negative integer */
532 template< typename UT >
533 static __forceinline long double
534 __kmp_pow(long double x, UT y) {
535  long double s=1.0L;
536 
537  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
538  //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
539  while(y) {
540  if ( y & 1 )
541  s *= x;
542  x *= x;
543  y >>= 1;
544  }
545  return s;
546 }
547 
548 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
549  (the total number of unassigned iterations in chunks with index greater than or equal to idx).
550  __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
551  (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
552 */
553 template< typename T >
554 static __inline typename traits_t< T >::unsigned_t
555 __kmp_dispatch_guided_remaining(
556  T tc,
557  typename traits_t< T >::floating_t base,
558  typename traits_t< T >::unsigned_t idx
559 ) {
560  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
561  least for ICL 8.1, long double arithmetic may not really have
562  long double precision, even with /Qlong_double. Currently, we
563  workaround that in the caller code, by manipulating the FPCW for
564  Windows* OS on IA-32 architecture. The lack of precision is not
565  expected to be a correctness issue, though.
566  */
567  typedef typename traits_t< T >::unsigned_t UT;
568 
569  long double x = tc * __kmp_pow< UT >(base, idx);
570  UT r = (UT) x;
571  if ( x == r )
572  return r;
573  return r + 1;
574 }
575 
// Tuning parameters of the guided-iterative algorithm:
//     p2 = n * nproc * ( chunk + 1 )   -- point of switching to dynamic
//     p3 = 1 / ( n * nproc )           -- remaining iterations multiplier
// n is 2 by default. With n = 3, for example, the chunk distribution becomes flatter.
// With n = 1 the first chunk is the same as for the static schedule, e.g. trip / nproc.
static int    guided_int_param = 2;
static double guided_flt_param = 0.5;   // = 1.0 / guided_int_param;
583 
584 // UT - unsigned flavor of T, ST - signed flavor of T,
585 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
586 template< typename T >
587 static void
588 __kmp_dispatch_init(
589  ident_t * loc,
590  int gtid,
591  enum sched_type schedule,
592  T lb,
593  T ub,
594  typename traits_t< T >::signed_t st,
595  typename traits_t< T >::signed_t chunk,
596  int push_ws
597 ) {
598  typedef typename traits_t< T >::unsigned_t UT;
599  typedef typename traits_t< T >::signed_t ST;
600  typedef typename traits_t< T >::floating_t DBL;
601  static const int ___kmp_size_type = sizeof( UT );
602 
603  int active;
604  T tc;
605  kmp_info_t * th;
606  kmp_team_t * team;
607  kmp_uint32 my_buffer_index;
608  dispatch_private_info_template< T > * pr;
609  dispatch_shared_info_template< UT > volatile * sh;
610 
611  KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
612  KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
613 
614  if ( ! TCR_4( __kmp_init_parallel ) )
615  __kmp_parallel_initialize();
616 
617  #ifdef KMP_DEBUG
618  {
619  const char * buff;
620  // create format specifiers before the debug output
621  buff = __kmp_str_format(
622  "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623  traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624  KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625  __kmp_str_free( &buff );
626  }
627  #endif
628  /* setup data */
629  th = __kmp_threads[ gtid ];
630  team = th -> th.th_team;
631  active = ! team -> t.t_serialized;
632  th->th.th_ident = loc;
633 
634  if ( ! active ) {
635  pr = reinterpret_cast< dispatch_private_info_template< T >* >
636  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
637  } else {
638  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
639  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
640 
641  my_buffer_index = th->th.th_dispatch->th_disp_index ++;
642 
643  /* What happens when number of threads changes, need to resize buffer? */
644  pr = reinterpret_cast< dispatch_private_info_template< T > * >
645  ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
646  sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
647  ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
648  }
649 
650  /* Pick up the nomerge/ordered bits from the scheduling type */
651  if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
652  pr->nomerge = TRUE;
653  schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
654  } else {
655  pr->nomerge = FALSE;
656  }
657  pr->type_size = ___kmp_size_type; // remember the size of variables
658  if ( kmp_ord_lower & schedule ) {
659  pr->ordered = TRUE;
660  schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
661  } else {
662  pr->ordered = FALSE;
663  }
664  if ( schedule == kmp_sch_static ) {
665  schedule = __kmp_static;
666  } else {
667  if ( schedule == kmp_sch_runtime ) {
668  #if OMP_30_ENABLED
669  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
670  schedule = team -> t.t_sched.r_sched_type;
671  // Detail the schedule if needed (global controls are differentiated appropriately)
672  if ( schedule == kmp_sch_guided_chunked ) {
673  schedule = __kmp_guided;
674  } else if ( schedule == kmp_sch_static ) {
675  schedule = __kmp_static;
676  }
677  // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
678  chunk = team -> t.t_sched.chunk;
679  #else
680  kmp_r_sched_t r_sched = __kmp_get_schedule_global();
681  // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default
682  schedule = r_sched.r_sched_type;
683  chunk = r_sched.chunk;
684  #endif
685 
686  #ifdef KMP_DEBUG
687  {
688  const char * buff;
689  // create format specifiers before the debug output
690  buff = __kmp_str_format(
691  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
692  traits_t< ST >::spec );
693  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
694  __kmp_str_free( &buff );
695  }
696  #endif
697  } else {
698  if ( schedule == kmp_sch_guided_chunked ) {
699  schedule = __kmp_guided;
700  }
701  if ( chunk <= 0 ) {
702  chunk = KMP_DEFAULT_CHUNK;
703  }
704  }
705 
706  #if OMP_30_ENABLED
707  if ( schedule == kmp_sch_auto ) {
708  // mapping and differentiation: in the __kmp_do_serial_initialize()
709  schedule = __kmp_auto;
710  #ifdef KMP_DEBUG
711  {
712  const char * buff;
713  // create format specifiers before the debug output
714  buff = __kmp_str_format(
715  "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
716  traits_t< ST >::spec );
717  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
718  __kmp_str_free( &buff );
719  }
720  #endif
721  }
722  #endif // OMP_30_ENABLED
723 
724  /* guided analytical not safe for too many threads */
725  if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
726  schedule = kmp_sch_guided_iterative_chunked;
727  KMP_WARNING( DispatchManyThreads );
728  }
729  pr->u.p.parm1 = chunk;
730  }
731  KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
732  "unknown scheduling type" );
733 
734  pr->u.p.count = 0;
735 
736  if ( __kmp_env_consistency_check ) {
737  if ( st == 0 ) {
738  __kmp_error_construct(
739  kmp_i18n_msg_CnsLoopIncrZeroProhibited,
740  ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
741  );
742  }
743  }
744 
745  tc = ( ub - lb + st );
746  if ( st != 1 ) {
747  if ( st < 0 ) {
748  if ( lb < ub ) {
749  tc = 0; // zero-trip
750  } else { // lb >= ub
751  tc = (ST)tc / st; // convert to signed division
752  }
753  } else { // st > 0
754  if ( ub < lb ) {
755  tc = 0; // zero-trip
756  } else { // lb >= ub
757  tc /= st;
758  }
759  }
760  } else if ( ub < lb ) { // st == 1
761  tc = 0; // zero-trip
762  }
763 
764  pr->u.p.lb = lb;
765  pr->u.p.ub = ub;
766  pr->u.p.st = st;
767  pr->u.p.tc = tc;
768 
769  #if KMP_OS_WINDOWS
770  pr->u.p.last_upper = ub + st;
771  #endif /* KMP_OS_WINDOWS */
772 
773  /* NOTE: only the active parallel region(s) has active ordered sections */
774 
775  if ( active ) {
776  if ( pr->ordered == 0 ) {
777  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
778  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
779  } else {
780  pr->ordered_bumped = 0;
781 
782  pr->u.p.ordered_lower = 1;
783  pr->u.p.ordered_upper = 0;
784 
785  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
786  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
787  }
788  }
789 
790  if ( __kmp_env_consistency_check ) {
791  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
792  if ( push_ws ) {
793  __kmp_push_workshare( gtid, ws, loc );
794  pr->pushed_ws = ws;
795  } else {
796  __kmp_check_workshare( gtid, ws, loc );
797  pr->pushed_ws = ct_none;
798  }
799  }
800 
801  switch ( schedule ) {
802  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
804  {
805  T nproc = team->t.t_nproc;
806  T ntc, init;
807 
808  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
809 
810  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
811  if ( nproc > 1 && ntc >= nproc ) {
812  T id = __kmp_tid_from_gtid(gtid);
813  T small_chunk, extras;
814 
815  small_chunk = ntc / nproc;
816  extras = ntc % nproc;
817 
818  init = id * small_chunk + ( id < extras ? id : extras );
819  pr->u.p.count = init;
820  pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
821 
822  pr->u.p.parm2 = lb;
823  //pr->pfields.parm3 = 0; // it's not used in static_steal
824  pr->u.p.parm4 = id;
825  pr->u.p.st = st;
826  break;
827  } else {
828  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
829  gtid ) );
830  schedule = kmp_sch_static_balanced;
831  /* too few iterations: fall-through to kmp_sch_static_balanced */
832  } // if
833  /* FALL-THROUGH to static balanced */
834  } // case
835  #endif
836  case kmp_sch_static_balanced:
837  {
838  T nproc = team->t.t_nproc;
839  T init, limit;
840 
841  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
842  gtid ) );
843 
844  if ( nproc > 1 ) {
845  T id = __kmp_tid_from_gtid(gtid);
846 
847  if ( tc < nproc ) {
848  if ( id < tc ) {
849  init = id;
850  limit = id;
851  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
852  } else {
853  pr->u.p.count = 1; /* means no more chunks to execute */
854  pr->u.p.parm1 = FALSE;
855  break;
856  }
857  } else {
858  T small_chunk = tc / nproc;
859  T extras = tc % nproc;
860  init = id * small_chunk + (id < extras ? id : extras);
861  limit = init + small_chunk - (id < extras ? 0 : 1);
862  pr->u.p.parm1 = (id == nproc - 1);
863  }
864  } else {
865  if ( tc > 0 ) {
866  init = 0;
867  limit = tc - 1;
868  pr->u.p.parm1 = TRUE;
869  } else {
870  // zero trip count
871  pr->u.p.count = 1; /* means no more chunks to execute */
872  pr->u.p.parm1 = FALSE;
873  break;
874  }
875  }
876  if ( st == 1 ) {
877  pr->u.p.lb = lb + init;
878  pr->u.p.ub = lb + limit;
879  } else {
880  T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
881  pr->u.p.lb = lb + init * st;
882  // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
883  if ( st > 0 ) {
884  pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
885  } else {
886  pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
887  }
888  }
889  if ( pr->ordered ) {
890  pr->u.p.ordered_lower = init;
891  pr->u.p.ordered_upper = limit;
892  }
893  break;
894  } // case
895  case kmp_sch_guided_iterative_chunked :
896  {
897  int nproc = team->t.t_nproc;
898  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
899 
900  if ( nproc > 1 ) {
901  if ( (2UL * chunk + 1 ) * nproc >= tc ) {
902  /* chunk size too large, switch to dynamic */
903  schedule = kmp_sch_dynamic_chunked;
904  } else {
905  // when remaining iters become less than parm2 - switch to dynamic
906  pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
907  *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
908  }
909  } else {
910  KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
911  schedule = kmp_sch_static_greedy;
912  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
913  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
914  pr->u.p.parm1 = tc;
915  } // if
916  } // case
917  break;
918  case kmp_sch_guided_analytical_chunked:
919  {
920  int nproc = team->t.t_nproc;
921  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
922 
923  if ( nproc > 1 ) {
924  if ( (2UL * chunk + 1 ) * nproc >= tc ) {
925  /* chunk size too large, switch to dynamic */
926  schedule = kmp_sch_dynamic_chunked;
927  } else {
928  /* commonly used term: (2 nproc - 1)/(2 nproc) */
929  DBL x;
930 
931  #if KMP_OS_WINDOWS && KMP_ARCH_X86
932  /* Linux* OS already has 64-bit computation by default for
933  long double, and on Windows* OS on Intel(R) 64,
934  /Qlong_double doesn't work. On Windows* OS
935  on IA-32 architecture, we need to set precision to
936  64-bit instead of the default 53-bit. Even though long
937  double doesn't work on Windows* OS on Intel(R) 64, the
938  resulting lack of precision is not expected to impact
939  the correctness of the algorithm, but this has not been
940  mathematically proven.
941  */
942  // save original FPCW and set precision to 64-bit, as
943  // Windows* OS on IA-32 architecture defaults to 53-bit
944  unsigned int oldFpcw = _control87(0,0x30000);
945  #endif
946  /* value used for comparison in solver for cross-over point */
947  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
948 
949  /* crossover point--chunk indexes equal to or greater than
950  this point switch to dynamic-style scheduling */
951  UT cross;
952 
953  /* commonly used term: (2 nproc - 1)/(2 nproc) */
954  x = (long double)1.0 - (long double)0.5 / nproc;
955 
956  #ifdef KMP_DEBUG
957  { // test natural alignment
958  struct _test_a {
959  char a;
960  union {
961  char b;
962  DBL d;
963  };
964  } t;
965  ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
966  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
967  KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
968  }
969  #endif // KMP_DEBUG
970 
971  /* save the term in thread private dispatch structure */
972  *(DBL*)&pr->u.p.parm3 = x;
973 
974  /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
975  {
976  UT left, right, mid;
977  long double p;
978 
979  /* estimate initial upper and lower bound */
980 
981  /* doesn't matter what value right is as long as it is positive, but
982  it affects performance of the solver
983  */
984  right = 229;
985  p = __kmp_pow< UT >(x,right);
986  if ( p > target ) {
987  do{
988  p *= p;
989  right <<= 1;
990  } while(p>target && right < (1<<27));
991  left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
992  } else {
993  left = 0;
994  }
995 
996  /* bisection root-finding method */
997  while ( left + 1 < right ) {
998  mid = (left + right) / 2;
999  if ( __kmp_pow< UT >(x,mid) > target ) {
1000  left = mid;
1001  } else {
1002  right = mid;
1003  }
1004  } // while
1005  cross = right;
1006  }
1007  /* assert sanity of computed crossover point */
1008  KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1009 
1010  /* save the crossover point in thread private dispatch structure */
1011  pr->u.p.parm2 = cross;
1012 
1013  // C75803
1014  #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1015  #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1016  #else
1017  #define GUIDED_ANALYTICAL_WORKAROUND (x)
1018  #endif
1019  /* dynamic-style scheduling offset */
1020  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1021  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1022  // restore FPCW
1023  _control87(oldFpcw,0x30000);
1024  #endif
1025  } // if
1026  } else {
1027  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1028  gtid ) );
1029  schedule = kmp_sch_static_greedy;
1030  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1031  pr->u.p.parm1 = tc;
1032  } // if
1033  } // case
1034  break;
1035  case kmp_sch_static_greedy:
1036  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1037  pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1038  ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1039  tc;
1040  break;
1041  case kmp_sch_static_chunked :
1042  case kmp_sch_dynamic_chunked :
1043  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1044  break;
1045  case kmp_sch_trapezoidal :
1046  {
1047  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1048 
1049  T parm1, parm2, parm3, parm4;
1050  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1051 
1052  parm1 = chunk;
1053 
1054  /* F : size of the first cycle */
1055  parm2 = ( tc / (2 * team->t.t_nproc) );
1056 
1057  if ( parm2 < 1 ) {
1058  parm2 = 1;
1059  }
1060 
1061  /* L : size of the last cycle. Make sure the last cycle
1062  * is not larger than the first cycle.
1063  */
1064  if ( parm1 < 1 ) {
1065  parm1 = 1;
1066  } else if ( parm1 > parm2 ) {
1067  parm1 = parm2;
1068  }
1069 
1070  /* N : number of cycles */
1071  parm3 = ( parm2 + parm1 );
1072  parm3 = ( 2 * tc + parm3 - 1) / parm3;
1073 
1074  if ( parm3 < 2 ) {
1075  parm3 = 2;
1076  }
1077 
1078  /* sigma : decreasing incr of the trapezoid */
1079  parm4 = ( parm3 - 1 );
1080  parm4 = ( parm2 - parm1 ) / parm4;
1081 
1082  // pointless check, because parm4 >= 0 always
1083  //if ( parm4 < 0 ) {
1084  // parm4 = 0;
1085  //}
1086 
1087  pr->u.p.parm1 = parm1;
1088  pr->u.p.parm2 = parm2;
1089  pr->u.p.parm3 = parm3;
1090  pr->u.p.parm4 = parm4;
1091  } // case
1092  break;
1093 
1094  default:
1095  {
1096  __kmp_msg(
1097  kmp_ms_fatal, // Severity
1098  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1099  KMP_HNT( GetNewerLibrary ), // Hint
1100  __kmp_msg_null // Variadic argument list terminator
1101  );
1102  }
1103  break;
1104  } // switch
1105  pr->schedule = schedule;
1106  if ( active ) {
1107  /* The name of this buffer should be my_buffer_index when it's free to use it */
1108 
1109  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1110  gtid, my_buffer_index, sh->buffer_index) );
1111  __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1112  USE_ITT_BUILD_ARG( NULL )
1113  );
1114  // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1115  // *always* 32-bit integers.
1116  KMP_MB(); /* is this necessary? */
1117  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1118  gtid, my_buffer_index, sh->buffer_index) );
1119 
1120  th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1121  th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1122 #if USE_ITT_BUILD
1123  if ( pr->ordered ) {
1124  __kmp_itt_ordered_init( gtid );
1125  }; // if
1126 #endif /* USE_ITT_BUILD */
1127  }; // if
1128  #ifdef KMP_DEBUG
1129  {
1130  const char * buff;
1131  // create format specifiers before the debug output
1132  buff = __kmp_str_format(
1133  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1134  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1135  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1136  traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1137  traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1138  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1139  traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1140  KD_TRACE(10, ( buff,
1141  gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1142  pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1143  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1144  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1145  __kmp_str_free( &buff );
1146  }
1147  #endif
1148  #if ( KMP_STATIC_STEAL_ENABLED )
1149  if ( ___kmp_size_type < 8 ) {
1150  // It cannot be guaranteed that after execution of a loop with some other schedule kind
1151  // all the parm3 variables will contain the same value.
1152  // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1
1153  // rather than program life-time increment.
1154  // So the dedicated variable is required. The 'static_steal_counter' is used.
1155  if( schedule == kmp_sch_static_steal ) {
1156  // Other threads will inspect this variable when searching for a victim.
1157  // This is a flag showing that other threads may steal from this thread since then.
1158  volatile T * p = &pr->u.p.static_steal_counter;
1159  *p = *p + 1;
1160  }
1161  }
1162  #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
1163 }
1164 
1165 /*
1166  * For ordered loops, either __kmp_dispatch_finish() should be called after
1167  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1168  * every chunk of iterations. If the ordered section(s) were not executed
1169  * for this iteration (or every iteration in this chunk), we need to set the
1170  * ordered iteration counters so that the next thread can proceed.
1171  */
1172 template< typename UT >
1173 static void
1174 __kmp_dispatch_finish( int gtid, ident_t *loc )
1175 {
1176  typedef typename traits_t< UT >::signed_t ST;
1177  kmp_info_t *th = __kmp_threads[ gtid ];
1178 
1179  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1180  if ( ! th -> th.th_team -> t.t_serialized ) {
1181 
1182  dispatch_private_info_template< UT > * pr =
1183  reinterpret_cast< dispatch_private_info_template< UT >* >
1184  ( th->th.th_dispatch->th_dispatch_pr_current );
1185  dispatch_shared_info_template< UT > volatile * sh =
1186  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1187  ( th->th.th_dispatch->th_dispatch_sh_current );
1188  KMP_DEBUG_ASSERT( pr );
1189  KMP_DEBUG_ASSERT( sh );
1190  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1191  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1192 
1193  if ( pr->ordered_bumped ) {
1194  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1195  gtid ) );
1196  pr->ordered_bumped = 0;
1197  } else {
1198  UT lower = pr->u.p.ordered_lower;
1199 
1200  #ifdef KMP_DEBUG
1201  {
1202  const char * buff;
1203  // create format specifiers before the debug output
1204  buff = __kmp_str_format(
1205  "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1206  traits_t< UT >::spec, traits_t< UT >::spec );
1207  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1208  __kmp_str_free( &buff );
1209  }
1210  #endif
1211 
1212  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1213  USE_ITT_BUILD_ARG(NULL)
1214  );
1215  KMP_MB(); /* is this necessary? */
1216  #ifdef KMP_DEBUG
1217  {
1218  const char * buff;
1219  // create format specifiers before the debug output
1220  buff = __kmp_str_format(
1221  "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1222  traits_t< UT >::spec, traits_t< UT >::spec );
1223  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1224  __kmp_str_free( &buff );
1225  }
1226  #endif
1227 
1228  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1229  } // if
1230  } // if
1231  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1232 }
1233 
1234 #ifdef KMP_GOMP_COMPAT
1235 
1236 template< typename UT >
1237 static void
1238 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1239 {
1240  typedef typename traits_t< UT >::signed_t ST;
1241  kmp_info_t *th = __kmp_threads[ gtid ];
1242 
1243  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1244  if ( ! th -> th.th_team -> t.t_serialized ) {
1245 // int cid;
1246  dispatch_private_info_template< UT > * pr =
1247  reinterpret_cast< dispatch_private_info_template< UT >* >
1248  ( th->th.th_dispatch->th_dispatch_pr_current );
1249  dispatch_shared_info_template< UT > volatile * sh =
1250  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1251  ( th->th.th_dispatch->th_dispatch_sh_current );
1252  KMP_DEBUG_ASSERT( pr );
1253  KMP_DEBUG_ASSERT( sh );
1254  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1255  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1256 
1257 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1258  UT lower = pr->u.p.ordered_lower;
1259  UT upper = pr->u.p.ordered_upper;
1260  UT inc = upper - lower + 1;
1261 
1262  if ( pr->ordered_bumped == inc ) {
1263  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1264  gtid ) );
1265  pr->ordered_bumped = 0;
1266  } else {
1267  inc -= pr->ordered_bumped;
1268 
1269  #ifdef KMP_DEBUG
1270  {
1271  const char * buff;
1272  // create format specifiers before the debug output
1273  buff = __kmp_str_format(
1274  "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1275  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1276  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1277  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1278  __kmp_str_free( &buff );
1279  }
1280  #endif
1281 
1282  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1283  USE_ITT_BUILD_ARG(NULL)
1284  );
1285 
1286  KMP_MB(); /* is this necessary? */
1287  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1288  gtid ) );
1289  pr->ordered_bumped = 0;
1291  #ifdef KMP_DEBUG
1292  {
1293  const char * buff;
1294  // create format specifiers before the debug output
1295  buff = __kmp_str_format(
1296  "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1297  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1298  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1299  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1300  __kmp_str_free( &buff );
1301  }
1302  #endif
1303 
1304  test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1305  }
1306 // }
1307  }
1308  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1309 }
1310 
1311 #endif /* KMP_GOMP_COMPAT */
1312 
1313 template< typename T >
1314 static int
1315 __kmp_dispatch_next(
1316  ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1317 ) {
1318 
1319  typedef typename traits_t< T >::unsigned_t UT;
1320  typedef typename traits_t< T >::signed_t ST;
1321  typedef typename traits_t< T >::floating_t DBL;
1322  static const int ___kmp_size_type = sizeof( UT );
1323 
1324  int status;
1325  dispatch_private_info_template< T > * pr;
1326  kmp_info_t * th = __kmp_threads[ gtid ];
1327  kmp_team_t * team = th -> th.th_team;
1328 
1329  #ifdef KMP_DEBUG
1330  {
1331  const char * buff;
1332  // create format specifiers before the debug output
1333  buff = __kmp_str_format(
1334  "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1335  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1336  KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1337  __kmp_str_free( &buff );
1338  }
1339  #endif
1340 
1341  if ( team -> t.t_serialized ) {
1342  /* NOTE: serialize this dispatch becase we are not at the active level */
1343  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1344  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1345  KMP_DEBUG_ASSERT( pr );
1346 
1347  if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1348  *p_lb = 0;
1349  *p_ub = 0;
1350  if ( p_st != 0 ) {
1351  *p_st = 0;
1352  }
1353  if ( __kmp_env_consistency_check ) {
1354  if ( pr->pushed_ws != ct_none ) {
1355  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1356  }
1357  }
1358  } else if ( pr->nomerge ) {
1359  kmp_int32 last;
1360  T start;
1361  UT limit, trip, init;
1362  ST incr;
1363  T chunk = pr->u.p.parm1;
1364 
1365  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1366 
1367  init = chunk * pr->u.p.count++;
1368  trip = pr->u.p.tc - 1;
1369 
1370  if ( (status = (init <= trip)) == 0 ) {
1371  *p_lb = 0;
1372  *p_ub = 0;
1373  if ( p_st != 0 ) *p_st = 0;
1374  if ( __kmp_env_consistency_check ) {
1375  if ( pr->pushed_ws != ct_none ) {
1376  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1377  }
1378  }
1379  } else {
1380  start = pr->u.p.lb;
1381  limit = chunk + init - 1;
1382  incr = pr->u.p.st;
1383 
1384  if ( (last = (limit >= trip)) != 0 ) {
1385  limit = trip;
1386  #if KMP_OS_WINDOWS
1387  pr->u.p.last_upper = pr->u.p.ub;
1388  #endif /* KMP_OS_WINDOWS */
1389  }
1390  if ( p_last ) {
1391  *p_last = last;
1392  }
1393  if ( p_st != 0 ) {
1394  *p_st = incr;
1395  }
1396  if ( incr == 1 ) {
1397  *p_lb = start + init;
1398  *p_ub = start + limit;
1399  } else {
1400  *p_lb = start + init * incr;
1401  *p_ub = start + limit * incr;
1402  }
1403 
1404  if ( pr->ordered ) {
1405  pr->u.p.ordered_lower = init;
1406  pr->u.p.ordered_upper = limit;
1407  #ifdef KMP_DEBUG
1408  {
1409  const char * buff;
1410  // create format specifiers before the debug output
1411  buff = __kmp_str_format(
1412  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1413  traits_t< UT >::spec, traits_t< UT >::spec );
1414  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1415  __kmp_str_free( &buff );
1416  }
1417  #endif
1418  } // if
1419  } // if
1420  } else {
1421  pr->u.p.tc = 0;
1422 
1423  *p_lb = pr->u.p.lb;
1424  *p_ub = pr->u.p.ub;
1425  #if KMP_OS_WINDOWS
1426  pr->u.p.last_upper = *p_ub;
1427  #endif /* KMP_OS_WINDOWS */
1428 
1429  if ( p_st != 0 ) {
1430  *p_st = pr->u.p.st;
1431  }
1432  if ( p_last ) {
1433  *p_last = TRUE;
1434  }
1435  } // if
1436  #ifdef KMP_DEBUG
1437  {
1438  const char * buff;
1439  // create format specifiers before the debug output
1440  buff = __kmp_str_format(
1441  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1442  "p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
1443  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1444  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, status) );
1445  __kmp_str_free( &buff );
1446  }
1447  #endif
1448  return status;
1449  } else {
1450  kmp_int32 last = 0;
1451  dispatch_shared_info_template< UT > *sh;
1452  T start;
1453  ST incr;
1454  UT limit, trip, init;
1455 
1456  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1457  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1458 
1459  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1460  ( th->th.th_dispatch->th_dispatch_pr_current );
1461  KMP_DEBUG_ASSERT( pr );
1462  sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1463  ( th->th.th_dispatch->th_dispatch_sh_current );
1464  KMP_DEBUG_ASSERT( sh );
1465 
1466  if ( pr->u.p.tc == 0 ) {
1467  // zero trip count
1468  status = 0;
1469  } else {
1470  switch (pr->schedule) {
1471  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1472  case kmp_sch_static_steal:
1473  {
1474  T chunk = pr->u.p.parm1;
1475 
1476  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1477 
1478  trip = pr->u.p.tc - 1;
1479 
1480  if ( ___kmp_size_type > 4 ) {
1481  // Other threads do not look into the data of this thread,
1482  // so it's not necessary to make volatile casting.
1483  init = ( pr->u.p.count )++;
1484  status = ( init < pr->u.p.ub );
1485  } else {
1486  typedef union {
1487  struct {
1488  UT count;
1489  T ub;
1490  } p;
1491  kmp_int64 b;
1492  } union_i4;
1493  // All operations on 'count' or 'ub' must be combined atomically together.
1494  // stealing implemented only for 4-byte indexes
1495  {
1496  union_i4 vold, vnew;
1497  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1498  vnew = vold;
1499  vnew.p.count++;
1500  while( ! KMP_COMPARE_AND_STORE_ACQ64(
1501  ( volatile kmp_int64* )&pr->u.p.count,
1502  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1503  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1504  KMP_CPU_PAUSE();
1505  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1506  vnew = vold;
1507  vnew.p.count++;
1508  }
1509  vnew = vold;
1510  init = vnew.p.count;
1511  status = ( init < vnew.p.ub ) ;
1512  }
1513 
1514  if( !status ) {
1515  kmp_info_t **other_threads = team->t.t_threads;
1516  int while_limit = 10;
1517  int while_index = 0;
1518 
1519  // TODO: algorithm of searching for a victim
1520  // should be cleaned up and measured
1521  while ( ( !status ) && ( while_limit != ++while_index ) ) {
1522  union_i4 vold, vnew;
1523  kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1524  T victimIdx = pr->u.p.parm4;
1525  T oldVictimIdx = victimIdx;
1526  dispatch_private_info_template< T > * victim;
1527 
1528  do {
1529  if( !victimIdx ) {
1530  victimIdx = team->t.t_nproc - 1;
1531  } else {
1532  --victimIdx;
1533  }
1534  victim = reinterpret_cast< dispatch_private_info_template< T >* >
1535  ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1536  } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1537  // TODO: think about a proper place of this test
1538  if ( ( !victim ) ||
1539  ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1540  (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1541  // TODO: delay would be nice
1542  continue;
1543  // the victim is not ready yet to participate in stealing
1544  // because the victim is still in kmp_init_dispatch
1545  }
1546  if ( oldVictimIdx == victimIdx ) {
1547  break;
1548  }
1549  pr->u.p.parm4 = victimIdx;
1550 
1551  while( 1 ) {
1552  vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1553  vnew = vold;
1554 
1555  KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * chunk <= trip );
1556  if ( vnew.p.count >= vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1557  break;
1558  }
1559  vnew.p.ub -= (remaining >> 2);
1560  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * chunk <= trip);
1561  #pragma warning( push )
1562  // disable warning on pointless comparison of unsigned with 0
1563  #pragma warning( disable: 186 )
1564  KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1565  #pragma warning( pop )
1566  // TODO: Should this be acquire or release?
1567  if ( KMP_COMPARE_AND_STORE_ACQ64(
1568  ( volatile kmp_int64 * )&victim->u.p.count,
1569  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1570  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1571  status = 1;
1572  while_index = 0;
1573  // now update own count and ub
1574  #if KMP_ARCH_X86
1575  // stealing executed on non-KMP_ARCH_X86 only
1576  // Atomic 64-bit write on ia32 is
1577  // unavailable, so we do this in steps.
1578  // This code is not tested.
1579  init = vold.p.count;
1580  pr->u.p.ub = 0;
1581  pr->u.p.count = init + 1;
1582  pr->u.p.ub = vnew.p.count;
1583  #else
1584  init = vnew.p.ub;
1585  vold.p.count = init + 1;
1586  // TODO: is it safe and enough?
1587  *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1588  #endif // KMP_ARCH_X86
1589  break;
1590  } // if
1591  KMP_CPU_PAUSE();
1592  } // while (1)
1593  } // while
1594  } // if
1595  } // if
1596  if ( !status ) {
1597  *p_lb = 0;
1598  *p_ub = 0;
1599  if ( p_st != 0 ) *p_st = 0;
1600  } else {
1601  start = pr->u.p.parm2;
1602  init *= chunk;
1603  limit = chunk + init - 1;
1604  incr = pr->u.p.st;
1605 
1606  KMP_DEBUG_ASSERT(init <= trip);
1607  if ( (last = (limit >= trip)) != 0 )
1608  limit = trip;
1609  if ( p_last ) {
1610  *p_last = last;
1611  }
1612  if ( p_st != 0 ) *p_st = incr;
1613 
1614  if ( incr == 1 ) {
1615  *p_lb = start + init;
1616  *p_ub = start + limit;
1617  } else {
1618  *p_lb = start + init * incr;
1619  *p_ub = start + limit * incr;
1620  }
1621 
1622  if ( pr->ordered ) {
1623  pr->u.p.ordered_lower = init;
1624  pr->u.p.ordered_upper = limit;
1625  #ifdef KMP_DEBUG
1626  {
1627  const char * buff;
1628  // create format specifiers before the debug output
1629  buff = __kmp_str_format(
1630  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1631  traits_t< UT >::spec, traits_t< UT >::spec );
1632  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1633  __kmp_str_free( &buff );
1634  }
1635  #endif
1636  } // if
1637  } // if
1638  break;
1639  } // case
1640  #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1641  case kmp_sch_static_balanced:
1642  {
1643  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1644  if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1645  pr->u.p.count = 1;
1646  *p_lb = pr->u.p.lb;
1647  *p_ub = pr->u.p.ub;
1648  last = pr->u.p.parm1;
1649  if ( p_last ) {
1650  *p_last = last;
1651  }
1652  if ( p_st )
1653  *p_st = pr->u.p.st;
1654  } else { /* no iterations to do */
1655  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1656  }
1657  if ( pr->ordered ) {
1658  #ifdef KMP_DEBUG
1659  {
1660  const char * buff;
1661  // create format specifiers before the debug output
1662  buff = __kmp_str_format(
1663  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1664  traits_t< UT >::spec, traits_t< UT >::spec );
1665  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1666  __kmp_str_free( &buff );
1667  }
1668  #endif
1669  } // if
1670  } // case
1671  break;
1672  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1673  case kmp_sch_static_chunked:
1674  {
1675  T parm1;
1676 
1677  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1678  gtid ) );
1679  parm1 = pr->u.p.parm1;
1680 
1681  trip = pr->u.p.tc - 1;
1682  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1683 
1684  if ( (status = (init <= trip)) != 0 ) {
1685  start = pr->u.p.lb;
1686  incr = pr->u.p.st;
1687  limit = parm1 + init - 1;
1688 
1689  if ( (last = (limit >= trip)) != 0 )
1690  limit = trip;
1691 
1692  if ( p_last ) {
1693  *p_last = last;
1694  }
1695  if ( p_st != 0 ) *p_st = incr;
1696 
1697  pr->u.p.count += team->t.t_nproc;
1698 
1699  if ( incr == 1 ) {
1700  *p_lb = start + init;
1701  *p_ub = start + limit;
1702  }
1703  else {
1704  *p_lb = start + init * incr;
1705  *p_ub = start + limit * incr;
1706  }
1707 
1708  if ( pr->ordered ) {
1709  pr->u.p.ordered_lower = init;
1710  pr->u.p.ordered_upper = limit;
1711  #ifdef KMP_DEBUG
1712  {
1713  const char * buff;
1714  // create format specifiers before the debug output
1715  buff = __kmp_str_format(
1716  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1717  traits_t< UT >::spec, traits_t< UT >::spec );
1718  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1719  __kmp_str_free( &buff );
1720  }
1721  #endif
1722  } // if
1723  } // if
1724  } // case
1725  break;
1726 
1727  case kmp_sch_dynamic_chunked:
1728  {
1729  T chunk = pr->u.p.parm1;
1730 
1731  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1732  gtid ) );
1733 
1734  init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1735  trip = pr->u.p.tc - 1;
1736 
1737  if ( (status = (init <= trip)) == 0 ) {
1738  *p_lb = 0;
1739  *p_ub = 0;
1740  if ( p_st != 0 ) *p_st = 0;
1741  } else {
1742  start = pr->u.p.lb;
1743  limit = chunk + init - 1;
1744  incr = pr->u.p.st;
1745 
1746  if ( (last = (limit >= trip)) != 0 )
1747  limit = trip;
1748  if ( p_last ) {
1749  *p_last = last;
1750  }
1751  if ( p_st != 0 ) *p_st = incr;
1752 
1753  if ( incr == 1 ) {
1754  *p_lb = start + init;
1755  *p_ub = start + limit;
1756  } else {
1757  *p_lb = start + init * incr;
1758  *p_ub = start + limit * incr;
1759  }
1760 
1761  if ( pr->ordered ) {
1762  pr->u.p.ordered_lower = init;
1763  pr->u.p.ordered_upper = limit;
1764  #ifdef KMP_DEBUG
1765  {
1766  const char * buff;
1767  // create format specifiers before the debug output
1768  buff = __kmp_str_format(
1769  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1770  traits_t< UT >::spec, traits_t< UT >::spec );
1771  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1772  __kmp_str_free( &buff );
1773  }
1774  #endif
1775  } // if
1776  } // if
1777  } // case
1778  break;
1779 
1780  case kmp_sch_guided_iterative_chunked:
1781  {
1782  T chunkspec = pr->u.p.parm1;
1783  KD_TRACE(100,
1784  ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1785  trip = pr->u.p.tc;
1786  // Start atomic part of calculations
1787  while(1) {
1788  ST remaining; // signed, because can be < 0
1789  init = sh->u.s.iteration; // shared value
1790  remaining = trip - init;
1791  if ( remaining <= 0 ) { // AC: need to compare with 0 first
1792  // nothing to do, don't try atomic op
1793  status = 0;
1794  break;
1795  }
1796  if ( remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1797  // use dynamic-style shcedule
1798  // atomically inrement iterations, get old value
1799  init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1800  remaining = trip - init;
1801  if (remaining <= 0) {
1802  status = 0; // all iterations got by other threads
1803  } else {
1804  // got some iterations to work on
1805  status = 1;
1806  if ( remaining > chunkspec ) {
1807  limit = init + chunkspec - 1;
1808  } else {
1809  last = 1; // the last chunk
1810  limit = init + remaining - 1;
1811  } // if
1812  } // if
1813  break;
1814  } // if
1815  limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1816  if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1817  // CAS was successful, chunk obtained
1818  status = 1;
1819  --limit;
1820  break;
1821  } // if
1822  } // while
1823  if ( status != 0 ) {
1824  start = pr->u.p.lb;
1825  incr = pr->u.p.st;
1826  if ( p_st != NULL )
1827  *p_st = incr;
1828  if ( p_last != NULL )
1829  *p_last = last;
1830  *p_lb = start + init * incr;
1831  *p_ub = start + limit * incr;
1832  if ( pr->ordered ) {
1833  pr->u.p.ordered_lower = init;
1834  pr->u.p.ordered_upper = limit;
1835  #ifdef KMP_DEBUG
1836  {
1837  const char * buff;
1838  // create format specifiers before the debug output
1839  buff = __kmp_str_format(
1840  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1841  traits_t< UT >::spec, traits_t< UT >::spec );
1842  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1843  __kmp_str_free( &buff );
1844  }
1845  #endif
1846  } // if
1847  } else {
1848  *p_lb = 0;
1849  *p_ub = 0;
1850  if ( p_st != NULL )
1851  *p_st = 0;
1852  } // if
1853  } // case
1854  break;
1855 
1856  case kmp_sch_guided_analytical_chunked:
1857  {
1858  T chunkspec = pr->u.p.parm1;
1859  UT chunkIdx;
1860  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1861  /* for storing original FPCW value for Windows* OS on
1862  IA-32 architecture 8-byte version */
1863  unsigned int oldFpcw;
1864  int fpcwSet = 0;
1865  #endif
1866  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1867  gtid ) );
1868 
1869  trip = pr->u.p.tc;
1870 
1871  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1872  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * team->t.t_nproc < trip);
1873 
1874  while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1875  chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1876  if ( chunkIdx >= pr->u.p.parm2 ) {
1877  --trip;
1878  /* use dynamic-style scheduling */
1879  init = chunkIdx * chunkspec + pr->u.p.count;
1880  /* need to verify init > 0 in case of overflow in the above calculation */
1881  if ( (status = (init > 0 && init <= trip)) != 0 ) {
1882  limit = init + chunkspec -1;
1883 
1884  if ( (last = (limit >= trip)) != 0 )
1885  limit = trip;
1886  }
1887  break;
1888  } else {
1889  /* use exponential-style scheduling */
1890  /* The following check is to workaround the lack of long double precision on Windows* OS.
1891  This check works around the possible effect that init != 0 for chunkIdx == 0.
1892  */
1893  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1894  /* If we haven't already done so, save original
1895  FPCW and set precision to 64-bit, as Windows* OS
1896  on IA-32 architecture defaults to 53-bit */
1897  if ( !fpcwSet ) {
1898  oldFpcw = _control87(0,0x30000);
1899  fpcwSet = 0x30000;
1900  }
1901  #endif
1902  if ( chunkIdx ) {
1903  init = __kmp_dispatch_guided_remaining< T >(
1904  trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1905  KMP_DEBUG_ASSERT(init);
1906  init = trip - init;
1907  } else
1908  init = 0;
1909  limit = trip - __kmp_dispatch_guided_remaining< T >(
1910  trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1911  KMP_ASSERT(init <= limit);
1912  if ( init < limit ) {
1913  KMP_DEBUG_ASSERT(limit <= trip);
1914  --limit;
1915  status = 1;
1916  break;
1917  } // if
1918  } // if
1919  } // while (1)
1920  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1921  /* restore FPCW if necessary */
1922  if ( oldFpcw & fpcwSet != 0 )
1923  _control87(oldFpcw,0x30000);
1924  #endif
1925  if ( status != 0 ) {
1926  start = pr->u.p.lb;
1927  incr = pr->u.p.st;
1928  if ( p_st != NULL )
1929  *p_st = incr;
1930  if ( p_last != NULL )
1931  *p_last = last;
1932  *p_lb = start + init * incr;
1933  *p_ub = start + limit * incr;
1934  if ( pr->ordered ) {
1935  pr->u.p.ordered_lower = init;
1936  pr->u.p.ordered_upper = limit;
1937  #ifdef KMP_DEBUG
1938  {
1939  const char * buff;
1940  // create format specifiers before the debug output
1941  buff = __kmp_str_format(
1942  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1943  traits_t< UT >::spec, traits_t< UT >::spec );
1944  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1945  __kmp_str_free( &buff );
1946  }
1947  #endif
1948  }
1949  } else {
1950  *p_lb = 0;
1951  *p_ub = 0;
1952  if ( p_st != NULL )
1953  *p_st = 0;
1954  }
1955  } // case
1956  break;
1957 
1958  case kmp_sch_trapezoidal:
1959  {
1960  UT index;
1961  T parm2 = pr->u.p.parm2;
1962  T parm3 = pr->u.p.parm3;
1963  T parm4 = pr->u.p.parm4;
1964  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1965  gtid ) );
1966 
1967  index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1968 
1969  init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1970  trip = pr->u.p.tc - 1;
1971 
1972  if ( (status = (index < parm3 && init <= trip)) == 0 ) {
1973  *p_lb = 0;
1974  *p_ub = 0;
1975  if ( p_st != 0 ) *p_st = 0;
1976  } else {
1977  start = pr->u.p.lb;
1978  limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1979  incr = pr->u.p.st;
1980 
1981  if ( (last = (limit >= trip)) != 0 )
1982  limit = trip;
1983 
1984  if ( p_last != 0 ) {
1985  *p_last = last;
1986  }
1987  if ( p_st != 0 ) *p_st = incr;
1988 
1989  if ( incr == 1 ) {
1990  *p_lb = start + init;
1991  *p_ub = start + limit;
1992  } else {
1993  *p_lb = start + init * incr;
1994  *p_ub = start + limit * incr;
1995  }
1996 
1997  if ( pr->ordered ) {
1998  pr->u.p.ordered_lower = init;
1999  pr->u.p.ordered_upper = limit;
2000  #ifdef KMP_DEBUG
2001  {
2002  const char * buff;
2003  // create format specifiers before the debug output
2004  buff = __kmp_str_format(
2005  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2006  traits_t< UT >::spec, traits_t< UT >::spec );
2007  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2008  __kmp_str_free( &buff );
2009  }
2010  #endif
2011  } // if
2012  } // if
2013  } // case
2014  break;
2015  } // switch
2016  } // if tc == 0;
2017 
2018  if ( status == 0 ) {
2019  UT num_done;
2020 
2021  num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2022  #ifdef KMP_DEBUG
2023  {
2024  const char * buff;
2025  // create format specifiers before the debug output
2026  buff = __kmp_str_format(
2027  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2028  traits_t< UT >::spec );
2029  KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2030  __kmp_str_free( &buff );
2031  }
2032  #endif
2033 
2034  if ( num_done == team->t.t_nproc-1 ) {
2035  /* NOTE: release this buffer to be reused */
2036 
2037  KMP_MB(); /* Flush all pending memory write invalidates. */
2038 
2039  sh->u.s.num_done = 0;
2040  sh->u.s.iteration = 0;
2041 
2042  /* TODO replace with general release procedure? */
2043  if ( pr->ordered ) {
2044  sh->u.s.ordered_iteration = 0;
2045  }
2046 
2047  KMP_MB(); /* Flush all pending memory write invalidates. */
2048 
2049  sh -> buffer_index += KMP_MAX_DISP_BUF;
2050  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2051  gtid, sh->buffer_index) );
2052 
2053  KMP_MB(); /* Flush all pending memory write invalidates. */
2054 
2055  } // if
2056  if ( __kmp_env_consistency_check ) {
2057  if ( pr->pushed_ws != ct_none ) {
2058  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2059  }
2060  }
2061 
2062  th -> th.th_dispatch -> th_deo_fcn = NULL;
2063  th -> th.th_dispatch -> th_dxo_fcn = NULL;
2064  th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2065  th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2066  } // if (status == 0)
2067 #if KMP_OS_WINDOWS
2068  else if ( last ) {
2069  pr->u.p.last_upper = pr->u.p.ub;
2070  }
2071 #endif /* KMP_OS_WINDOWS */
2072  } // if
2073 
2074  #ifdef KMP_DEBUG
2075  {
2076  const char * buff;
2077  // create format specifiers before the debug output
2078  buff = __kmp_str_format(
2079  "__kmp_dispatch_next: T#%%d normal case: " \
2080  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2081  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2082  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2083  __kmp_str_free( &buff );
2084  }
2085  #endif
2086  return status;
2087 }
2088 
2089 //-----------------------------------------------------------------------------------------
2090 // Dispatch routines
2091 // Transfer call to template< type T >
2092 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2093 // T lb, T ub, ST st, ST chunk )
2094 extern "C" {
2095 
2111 void
2112 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2113  kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2114 {
2115  KMP_DEBUG_ASSERT( __kmp_init_serial );
2116  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2117 }
2121 void
2122 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2123  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2124 {
2125  KMP_DEBUG_ASSERT( __kmp_init_serial );
2126  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2127 }
2128 
2132 void
2133 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2134  kmp_int64 lb, kmp_int64 ub,
2135  kmp_int64 st, kmp_int64 chunk )
2136 {
2137  KMP_DEBUG_ASSERT( __kmp_init_serial );
2138  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2139 }
2140 
2144 void
2145 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2146  kmp_uint64 lb, kmp_uint64 ub,
2147  kmp_int64 st, kmp_int64 chunk )
2148 {
2149  KMP_DEBUG_ASSERT( __kmp_init_serial );
2150  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2151 }
2152 
2165 int
2166 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2167  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2168 {
2169  return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2170 }
2171 
2175 int
2176 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2177  kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2178 {
2179  return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2180 }
2181 
2185 int
2186 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2187  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2188 {
2189  return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2190 }
2191 
2195 int
2196 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2197  kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2198 {
2199  return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2200 }
2201 
2208 void
2209 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2210 {
2211  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2212 }
2213 
2217 void
2218 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2219 {
2220  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2221 }
2222 
2226 void
2227 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2228 {
2229  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2230 }
2231 
2235 void
2236 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2237 {
2238  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2239 }
2242 //-----------------------------------------------------------------------------------------
2243 //Non-template routines from kmp_dispatch.c used in other sources
2244 
2245 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2246  return value == checker;
2247 }
2248 
2249 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2250  return value != checker;
2251 }
2252 
2253 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2254  return value < checker;
2255 }
2256 
2257 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2258  return value >= checker;
2259 }
2260 
2261 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2262  return value <= checker;
2263 }
2264 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2265  return value == checker;
2266 }
2267 
2268 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2269  return value != checker;
2270 }
2271 
2272 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2273  return value < checker;
2274 }
2275 
2276 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2277  return value >= checker;
2278 }
2279 
2280 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2281  return value <= checker;
2282 }
2283 
2284 kmp_uint32
2285 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2286  kmp_uint32 checker,
2287  kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2288  , void * obj // Higher-level synchronization object, or NULL.
2289  )
2290 {
2291  // note: we may not belong to a team at this point
2292  register volatile kmp_uint32 * spin = spinner;
2293  register kmp_uint32 check = checker;
2294  register kmp_uint32 spins;
2295  register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2296  register kmp_uint32 r;
2297 
2298  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2299  KMP_INIT_YIELD( spins );
2300  // main wait spin loop
2301  while(!f(r = TCR_4(*spin), check)) {
2302  KMP_FSYNC_SPIN_PREPARE( obj );
2303  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2304  It causes problems with infinite recursion because of exit lock */
2305  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2306  __kmp_abort_thread(); */
2307 
2308  __kmp_static_delay(TRUE);
2309 
2310  /* if we have waited a bit, or are oversubscribed, yield */
2311  /* pause is in the following code */
2312  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2313  KMP_YIELD_SPIN( spins );
2314  }
2315  KMP_FSYNC_SPIN_ACQUIRED( obj );
2316  return r;
2317 }
2318 
2319 kmp_uint64
2320 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2321  kmp_uint64 checker,
2322  kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2323  , void * obj // Higher-level synchronization object, or NULL.
2324  )
2325 {
2326  // note: we may not belong to a team at this point
2327  register volatile kmp_uint64 * spin = spinner;
2328  register kmp_uint64 check = checker;
2329  register kmp_uint32 spins;
2330  register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2331  register kmp_uint64 r;
2332 
2333  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2334  KMP_INIT_YIELD( spins );
2335  // main wait spin loop
2336 #if USE_ITT_BUILD && defined( USE_ITT ) && KMP_OS_WINDOWS
2337  // ITT + Windows* OS --> volatile
2338  while(!f(r = *(volatile kmp_uint64 *)spin, check))
2339 #else
2340  while(!f(r = *spin, check))
2341 #endif
2342  {
2343  KMP_FSYNC_SPIN_PREPARE( obj );
2344  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2345  It causes problems with infinite recursion because of exit lock */
2346  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2347  __kmp_abort_thread(); */
2348 
2349  __kmp_static_delay(TRUE);
2350 
2351  // if we are oversubscribed,
2352  // or have waited a bit (and KMP_LIBARRY=throughput, then yield
2353  // pause is in the following code
2354  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2355  KMP_YIELD_SPIN( spins );
2356  }
2357  KMP_FSYNC_SPIN_ACQUIRED( obj );
2358  return r;
2359 }
2360 
2361 } // extern "C"
2362 
2363 #ifdef KMP_GOMP_COMPAT
2364 
2365 void
2366 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2367  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2368  kmp_int32 chunk, int push_ws )
2369 {
2370  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2371  push_ws );
2372 }
2373 
2374 void
2375 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2376  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2377  kmp_int32 chunk, int push_ws )
2378 {
2379  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2380  push_ws );
2381 }
2382 
2383 void
2384 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2385  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2386  kmp_int64 chunk, int push_ws )
2387 {
2388  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2389  push_ws );
2390 }
2391 
2392 void
2393 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2394  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2395  kmp_int64 chunk, int push_ws )
2396 {
2397  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2398  push_ws );
2399 }
2400 
2401 void
2402 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2403 {
2404  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2405 }
2406 
2407 void
2408 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2409 {
2410  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2411 }
2412 
2413 void
2414 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2415 {
2416  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2417 }
2418 
2419 void
2420 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2421 {
2422  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2423 }
2424 
2425 #endif /* KMP_GOMP_COMPAT */
2426 
2427 /* ------------------------------------------------------------------------ */
2428 /* ------------------------------------------------------------------------ */
2429