Intel® OpenMP* Runtime Library
 All Classes Functions Variables Typedefs Enumerations Enumerator Groups Pages
kmp_runtime.c
1 /*
2  * kmp_runtime.c -- KPTS runtime support library
3  * $Revision: 42510 $
4  * $Date: 2013-07-12 05:20:11 -0500 (Fri, 12 Jul 2013) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2013 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "kmp.h"
38 #include "kmp_atomic.h"
39 #include "kmp_wrapper_getpid.h"
40 #include "kmp_environment.h"
41 #include "kmp_itt.h"
42 #include "kmp_str.h"
43 #include "kmp_settings.h"
44 #include "kmp_i18n.h"
45 #include "kmp_io.h"
46 #include "kmp_error.h"
47 
48 /* these are temporary issues to be dealt with */
/* KMP_USE_PRCTL selects, inside __kmp_print_storage_map_gtid(), the more
   elaborate listing that collates adjacent pages by host node; it is kept at 0
   because of the prctl hanging bug mentioned at that call site. */
49 #define KMP_USE_PRCTL 0
/* KMP_USE_POOLED_ALLOC is not referenced in the visible part of this file.
   NOTE(review): presumably toggles a pooled allocator -- confirm before changing. */
50 #define KMP_USE_POOLED_ALLOC 0
51 
52 #if KMP_OS_WINDOWS
53 #include <process.h>
54 #endif
55 
56 
57 #if defined(KMP_GOMP_COMPAT)
58 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
59 #endif /* defined(KMP_GOMP_COMPAT) */
60 
61 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
62 #if OMP_30_ENABLED
63  "3.1 (201107)";
64 #else
65  "2.5 (200505)";
66 #endif
67 
#ifdef KMP_DEBUG

/* Extra version banner strings that exist only in debug builds. */

char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";

/* KMP_PERF_V19 must be either KMP_ON or KMP_OFF; anything else is a build error. */
#if KMP_PERF_V19 == KMP_ON
char const __kmp_version_perf_v19[] = KMP_VERSION_PREFIX "perf v19: " "on";
#elif KMP_PERF_V19 == KMP_OFF
char const __kmp_version_perf_v19[] = KMP_VERSION_PREFIX "perf v19: " "off";
#else
    #error "Must specify KMP_PERF_V19 option"
#endif

/* KMP_PERF_V106 must be either KMP_ON or KMP_OFF; anything else is a build error. */
#if KMP_PERF_V106 == KMP_ON
char const __kmp_version_perf_v106[] = KMP_VERSION_PREFIX "perf v106: " "on";
#elif KMP_PERF_V106 == KMP_OFF
char const __kmp_version_perf_v106[] = KMP_VERSION_PREFIX "perf v106: " "off";
#else
    #error "Must specify KMP_PERF_V106 option"
#endif

#endif /* KMP_DEBUG */
91 
92 
93 
94 /* ------------------------------------------------------------------------ */
95 /* ------------------------------------------------------------------------ */
96 
/* Global kmp_info_t object with external linkage.
   NOTE(review): presumably the descriptor of the monitor thread (the one
   reported under KMP_GTID_MONITOR); its setup is not visible in this part of
   the file -- confirm. */
97 kmp_info_t __kmp_monitor;
98 
99 /* ------------------------------------------------------------------------ */
100 /* ------------------------------------------------------------------------ */
101 
102 /* Forward declarations */
103 
/* The only declaration in this group with external linkage. */
104 void __kmp_cleanup( void );
105 
106 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
/* The parameter list of __kmp_initialize_team depends on OMP_30_ENABLED:
   with it, the settings arrive as one kmp_internal_control_t plus a source
   location; without it, they are passed as six separate int values. */
107 static void __kmp_initialize_team(
108  kmp_team_t * team,
109  int new_nproc,
110  #if OMP_30_ENABLED
111  kmp_internal_control_t * new_icvs,
112  ident_t * loc
113  #else
114  int new_set_nproc, int new_set_dynamic, int new_set_nested,
115  int new_set_blocktime, int new_bt_intervals, int new_bt_set
116  #endif // OMP_30_ENABLED
117 );
118 static void __kmp_partition_places( kmp_team_t *team );
/* Called by __kmp_get_global_thread_id_reg() under __kmp_initz_lock when the
   library has not been serially initialized yet. */
119 static void __kmp_do_serial_initialize( void );
120 
121 
/* Declared only when USE_LOAD_BALANCE is defined. */
122 #ifdef USE_LOAD_BALANCE
123 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
124 #endif
125 
126 static int __kmp_expand_threads(int nWish, int nNeed);
127 static int __kmp_unregister_root_other_thread( int gtid );
128 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
129 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
/* File-local pointer, initially NULL (a definition, not a forward declaration).
   NOTE(review): presumably an insertion hint into the thread pool list; its
   users are not visible in this part of the file -- confirm. */
130 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
131 
132 /* ------------------------------------------------------------------------ */
133 /* ------------------------------------------------------------------------ */
134 
135 /* Calculate the identifier of the current thread */
136 /* fast (and somewhat portable) way to get unique */
137 /* identifier of executing thread. */
138 /* returns KMP_GTID_DNE if we haven't been assigned a gtid */
139 
140 int
141 __kmp_get_global_thread_id( )
142 {
143  int i;
144  kmp_info_t **other_threads;
145  size_t stack_data;
146  char *stack_addr;
147  size_t stack_size;
148  char *stack_base;
149 
150  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
151  __kmp_nth, __kmp_all_nth ));
152 
153  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
154  parallel region, made it return KMP_GTID_DNE to force serial_initialize by
155  caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
156  __kmp_init_gtid for this to work. */
157 
158  if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
159 
160 #ifdef KMP_TDATA_GTID
161  if ( TCR_4(__kmp_gtid_mode) >= 3) {
162  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
163  return __kmp_gtid;
164  }
165 #endif
166  if ( TCR_4(__kmp_gtid_mode) >= 2) {
167  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
168  return __kmp_gtid_get_specific();
169  }
170  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
171 
172  stack_addr = (char*) & stack_data;
173  other_threads = __kmp_threads;
174 
175  /*
176  ATT: The code below is a source of potential bugs due to unsynchronized access to
177  __kmp_threads array. For example:
178  1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
179  2. Current thread is suspended by OS.
180  3. Another thread unregisters and finishes (debug versions of free() may fill memory
181  with something like 0xEF).
182  4. Current thread is resumed.
183  5. Current thread reads junk from *thr.
184  TODO: Fix it.
185  --ln
186  */
187 
188  for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
189 
190  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
191  if( !thr ) continue;
192 
193  stack_size = (size_t)TCR_PTR(thr -> th.th_info.ds.ds_stacksize);
194  stack_base = (char *)TCR_PTR(thr -> th.th_info.ds.ds_stackbase);
195 
196  /* stack grows down -- search through all of the active threads */
197 
198  if( stack_addr <= stack_base ) {
199  size_t stack_diff = stack_base - stack_addr;
200 
201  if( stack_diff <= stack_size ) {
202  /* The only way we can be closer than the allocated */
203  /* stack size is if we are running on this thread. */
204  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
205  return i;
206  }
207  }
208  }
209 
210  /* get specific to try and determine our gtid */
211  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
212  "thread, using TLS\n" ));
213  i = __kmp_gtid_get_specific();
214 
215  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
216 
217  /* if we havn't been assigned a gtid, then return code */
218  if( i<0 ) return i;
219 
220  /* dynamically updated stack window for uber threads to avoid get_specific call */
221  if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
222  KMP_FATAL( StackOverflow, i );
223  }
224 
225  stack_base = (char *) other_threads[i] -> th.th_info.ds.ds_stackbase;
226  if( stack_addr > stack_base ) {
227  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
228  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
229  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
230  } else {
231  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
232  }
233 
234  /* Reprint stack bounds for ubermaster since they have been refined */
235  if ( __kmp_storage_map ) {
236  char *stack_end = (char *) other_threads[i] -> th.th_info.ds.ds_stackbase;
237  char *stack_beg = stack_end - other_threads[i] -> th.th_info.ds.ds_stacksize;
238  __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
239  other_threads[i] -> th.th_info.ds.ds_stacksize,
240  "th_%d stack (refinement)", i );
241  }
242  return i;
243 }
244 
245 int
246 __kmp_get_global_thread_id_reg( )
247 {
248  int gtid;
249 
250  if ( !__kmp_init_serial ) {
251  gtid = KMP_GTID_DNE;
252  } else
253 #ifdef KMP_TDATA_GTID
254  if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
255  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
256  gtid = __kmp_gtid;
257  } else
258 #endif
259  if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
260  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
261  gtid = __kmp_gtid_get_specific();
262  } else {
263  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
264  gtid = __kmp_get_global_thread_id();
265  }
266 
267  /* we must be a new uber master sibling thread */
268  if( gtid == KMP_GTID_DNE ) {
269  KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
270  "Registering a new gtid.\n" ));
271  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
272  if( !__kmp_init_serial ) {
273  __kmp_do_serial_initialize();
274  gtid = __kmp_gtid_get_specific();
275  } else {
276  gtid = __kmp_register_root(FALSE);
277  }
278  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
279  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
280  }
281 
282  KMP_DEBUG_ASSERT( gtid >=0 );
283 
284  return gtid;
285 }
286 
287 /* caller must hold forkjoin_lock */
288 void
289 __kmp_check_stack_overlap( kmp_info_t *th )
290 {
291  int f;
292  char *stack_beg = NULL;
293  char *stack_end = NULL;
294  int gtid;
295 
296  KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
297  if ( __kmp_storage_map ) {
298  stack_end = (char *) th -> th.th_info.ds.ds_stackbase;
299  stack_beg = stack_end - th -> th.th_info.ds.ds_stacksize;
300 
301  gtid = __kmp_gtid_from_thread( th );
302 
303  if (gtid == KMP_GTID_MONITOR) {
304  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
305  "th_%s stack (%s)", "mon",
306  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
307  } else {
308  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
309  "th_%d stack (%s)", gtid,
310  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
311  }
312  }
313 
314  /* No point in checking ubermaster threads since they use refinement and cannot overlap */
315  if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid = __kmp_gtid_from_thread( th )))
316  {
317  KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
318  if ( stack_beg == NULL ) {
319  stack_end = (char *) th -> th.th_info.ds.ds_stackbase;
320  stack_beg = stack_end - th -> th.th_info.ds.ds_stacksize;
321  }
322 
323  for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
324  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
325 
326  if( f_th && f_th != th ) {
327  char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
328  char *other_stack_beg = other_stack_end -
329  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
330  if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
331  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
332 
333  /* Print the other stack values before the abort */
334  if ( __kmp_storage_map )
335  __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
336  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
337  "th_%d stack (overlapped)",
338  __kmp_gtid_from_thread( f_th ) );
339 
340  __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
341  }
342  }
343  }
344  }
345  KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
346 }
347 
348 
349 /* ------------------------------------------------------------------------ */
350 
/*
 * __kmp_static_delay: debug-only sanity check on `arg`.
 * In debug builds it is a function that asserts on its argument; in all other
 * builds it is a macro that expands to nothing.
 */
#ifdef KMP_DEBUG

static void
__kmp_static_delay( int arg )
{
    /* Work around weird code-gen bug that causes assert to trip */
# if KMP_ARCH_X86_64 && KMP_OS_LINUX
    KMP_ASSERT( arg != 0 );
# else
    KMP_ASSERT( arg >= 0 );
# endif
}

#else /* ! KMP_DEBUG */

# define __kmp_static_delay( arg )     /* nothing to do */

#endif /* KMP_DEBUG */
366 
/* File-local wrapper that forwards `arg` unchanged to __kmp_yield(). */
static void __kmp_static_yield( int arg )
{
    __kmp_yield( arg );
}
372 
373 /*
374  * Spin wait loop that first does pause, then yield, then sleep.
375  * Wait until spinner is equal to checker to exit.
376  *
377  * A thread that calls __kmp_wait_sleep must make certain that another thread
378  * calls __kmp_release to wake it back up to prevent deadlocks!
 *
 * Parameters:
 *   this_thr     - descriptor of the waiting thread; its gtid, cached blocktime
 *                  values, task team and pool flags are read (and the
 *                  th_active_in_pool flag is updated) here.
 *   spinner      - location that is polled.
 *   checker      - value that *spinner must reach for the wait to end.
 *   final_spin   - forwarded unchanged to __kmp_execute_tasks().
 *   itt_sync_obj - forwarded unchanged to __kmp_execute_tasks(); the parameter
 *                  is wrapped in USE_ITT_BUILD_ARG, so it presumably exists in
 *                  ITT builds only (confirm against the macro definition).
 *
 * The loop is also left, without the condition being met, as soon as
 * __kmp_global.g.g_done is observed.
379  */
380 
381 void
382 __kmp_wait_sleep( kmp_info_t *this_thr,
383  volatile kmp_uint *spinner,
384  kmp_uint checker,
385  int final_spin
386  USE_ITT_BUILD_ARG (void * itt_sync_obj)
387 )
388 {
389  /* note: we may not belong to a team at this point */
390  register volatile kmp_uint *spin = spinner;
391  register kmp_uint check = checker;
392  register kmp_uint32 spins;
393  register int hibernate;
 /* NOTE(review): th_tid is declared but never used in this function. */
394  int th_gtid, th_tid;
395 #if OMP_30_ENABLED
396  int flag = FALSE;
397 #endif /* OMP_30_ENABLED */
398 
399  KMP_FSYNC_SPIN_INIT( spin, NULL );
 /* Fast path: the awaited value is already there, so no spinning is needed. */
400  if( TCR_4(*spin) == check ) {
401  KMP_FSYNC_SPIN_ACQUIRED( spin );
402  return;
403  }
404 
405  th_gtid = this_thr->th.th_info.ds.ds_gtid;
406 
407  KA_TRACE( 20, ("__kmp_wait_sleep: T#%d waiting for spin(%p) == %d\n",
408  th_gtid,
409  spin, check ) );
410 
411  /* setup for waiting */
412  KMP_INIT_YIELD( spins );
413 
 /* Compute `hibernate`, the value of the global time counter after which the
    thread stops spinning and suspends. It is left unset when the blocktime
    is "infinite"; in that case the loop below `continue`s before it ever
    reaches the comparison against `hibernate`. */
414  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
415  //
416  // The worker threads cannot rely on the team struct existing at this
417  // point. Use the bt values cached in the thread struct instead.
418  //
419  #ifdef KMP_ADJUST_BLOCKTIME
420  if ( __kmp_zero_bt && ! this_thr->th.th_team_bt_set ) {
421  /* force immediate suspend if not set by user and more threads than available procs */
422  hibernate = 0;
423  } else {
424  hibernate = this_thr->th.th_team_bt_intervals;
425  }
426  #else
427  hibernate = this_thr->th.th_team_bt_intervals;
428  #endif /* KMP_ADJUST_BLOCKTIME */
 /* An interval count of 0 becomes -1, so the deadline computed below lies
    just before the current counter value and the very first deadline check
    in the loop falls through to the suspend call. */
429  if ( hibernate == 0 ) {
430  hibernate = -1;
431  }
432  hibernate += TCR_4( __kmp_global.g.g_time.dt.t_value );
433 
434  KF_TRACE( 20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n",
435  th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate,
436  hibernate - __kmp_global.g.g_time.dt.t_value ));
437  }
438 
439  KMP_MB();
440 
441  /* main wait spin loop */
442  while( TCR_4(*spin) != check ) {
443  int in_pool;
444 
445 #if OMP_30_ENABLED
446  //
447  // If the task team is NULL, it means one of three things:
448  // 1) A newly-created thread is first being released by
449  // __kmp_fork_barrier(), and its task team has not been set up
450  // yet.
451  // 2) All tasks have been executed to completion, this thread has
452  // decremented the task team's ref ct and possibly deallocated
453  // it, and should no longer reference it.
454  // 3) Tasking is off for this region. This could be because we
455  // are in a serialized region (perhaps the outer one), or else
456  // tasking was manually disabled (KMP_TASKING=0).
457  //
458  kmp_task_team_t * task_team = NULL;
459  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
460  task_team = this_thr->th.th_task_team;
461  if ( task_team != NULL ) {
 /* An inactive task team is dropped via __kmp_unref_task_team(); an active
    one with tasking enabled gets a chance to run tasks while waiting. */
462  if ( ! TCR_SYNC_4( task_team->tt.tt_active ) ) {
463  KMP_DEBUG_ASSERT( ! KMP_MASTER_TID( this_thr->th.th_info.ds.ds_tid ) );
464  __kmp_unref_task_team( task_team, this_thr );
465  } else if ( KMP_TASKING_ENABLED( task_team, this_thr->th.th_task_state ) ) {
466  __kmp_execute_tasks( this_thr, th_gtid, spin, check, final_spin, &flag
467  USE_ITT_BUILD_ARG( itt_sync_obj )
468  );
469  }
470  }; // if
471  }; // if
472 #endif /* OMP_30_ENABLED */
473 
474  KMP_FSYNC_SPIN_PREPARE( spin );
 /* Global "done" flag observed: stop waiting. If an abort is also flagged,
    the thread is parked in __kmp_abort_thread() first. */
475  if( TCR_4(__kmp_global.g.g_done) ) {
476  if( __kmp_global.g.g_abort )
477  __kmp_abort_thread( );
478  break;
479  }
480 
481  __kmp_static_delay( 1 );
482 
483  /* if we are oversubscribed,
484  or have waited a bit (and KMP_LIBRARY=throughput), then yield */
485  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
486  // TODO: Should it be number of cores instead of thread contexts? Like:
487  // KMP_YIELD( TCR_4(__kmp_nth) > __kmp_ncores );
488  // Need performance improvement data to make the change...
489  KMP_YIELD_SPIN( spins );
490 
491  //
492  // Check if this thread was transferred from a team
493  // to the thread pool (or vice-versa) while spinning.
494  //
 // th_active_in_pool records whether this thread is currently counted in
 // __kmp_thread_pool_active_nth; the counter is adjusted whenever that
 // flag disagrees with th_in_pool.
495  in_pool = !!TCR_4(this_thr->th.th_in_pool);
496  if ( in_pool != !!this_thr->th.th_active_in_pool ) {
497  if ( in_pool ) {
498  //
499  // recently transferred from team to pool
500  //
501  KMP_TEST_THEN_INC32(
502  (kmp_int32 *) &__kmp_thread_pool_active_nth );
503  this_thr->th.th_active_in_pool = TRUE;
504 
505  //
506  // Here, we cannot assert that
507  //
508  // KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth)
509  // <= __kmp_thread_pool_nth );
510  //
511  // __kmp_thread_pool_nth is inc/dec'd by the master thread
512  // while the fork/join lock is held, whereas
513  // __kmp_thread_pool_active_nth is inc/dec'd asynchronously
514  // by the workers. The two can get out of sync for brief
515  // periods of time.
516  //
517  }
518  else {
519  //
520  // recently transferred from pool to team
521  //
522  KMP_TEST_THEN_DEC32(
523  (kmp_int32 *) &__kmp_thread_pool_active_nth );
524  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
525  this_thr->th.th_active_in_pool = FALSE;
526  }
527  }
528 
529 #if OMP_30_ENABLED
530  // Don't suspend if there is a likelihood of new tasks being spawned.
531  if ( ( task_team != NULL ) && TCR_4(task_team->tt.tt_found_tasks) ) {
532  continue;
533  }
534 #endif /* OMP_30_ENABLED */
535 
536  /* Don't suspend if KMP_BLOCKTIME is set to "infinite" */
537  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
538  continue;
539  }
540 
541  /* if we have waited a bit more, fall asleep */
 /* i.e. keep spinning until the global time counter has passed the
    deadline computed before the loop. */
542  if( TCR_4( __kmp_global.g.g_time.dt.t_value ) <= hibernate ) {
543  continue;
544  }
545 
546  KF_TRACE( 50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid ) );
547 
 /* Deadline passed: hand over to __kmp_suspend(). Once it returns, the
    loop condition re-checks *spin. */
548  __kmp_suspend( th_gtid, spin, check );
549 
550  if( TCR_4( __kmp_global.g.g_done ) && __kmp_global.g.g_abort ) {
551  __kmp_abort_thread( );
552  }
553 
554  /* TODO */
555  /* if thread is done with work and timesout, disband/free */
556  }
557 
558  KMP_FSYNC_SPIN_ACQUIRED( spin );
559 }
560 
561 
562 /*
563  * Release the thread specified by target_thr from waiting by setting the location
564  * specified by spin and resume the thread if indicated by the sleep parameter.
565  *
566  * A thread that calls __kmp_wait_sleep must call this function to wake up the
567  * potentially sleeping thread and prevent deadlocks!
568  */
569 
570 void
571 __kmp_release( kmp_info_t *target_thr, volatile kmp_uint *spin,
572  enum kmp_mem_fence_type fetchadd_fence )
573 {
574  kmp_uint old_spin;
575  #ifdef KMP_DEBUG
576  int target_gtid = target_thr->th.th_info.ds.ds_gtid;
577  int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
578  #endif
579 
580  KF_TRACE( 20, ( "__kmp_release: T#%d releasing T#%d spin(%p) fence_type(%d)\n",
581  gtid, target_gtid, spin, fetchadd_fence ));
582 
583  KMP_DEBUG_ASSERT( spin );
584 
585  KMP_DEBUG_ASSERT( fetchadd_fence == kmp_acquire_fence ||
586  fetchadd_fence == kmp_release_fence );
587 
588  KMP_FSYNC_RELEASING( spin );
589 
590  old_spin = ( fetchadd_fence == kmp_acquire_fence )
591  ? KMP_TEST_THEN_ADD4_ACQ32( (volatile kmp_int32 *) spin )
592  : KMP_TEST_THEN_ADD4_32( (volatile kmp_int32 *) spin );
593 
594  KF_TRACE( 100, ( "__kmp_release: T#%d old spin(%p)=%d, set new spin=%d\n",
595  gtid, spin, old_spin, *spin ) );
596 
597  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
598  /* Only need to check sleep stuff if infinite block time not set */
599  if ( old_spin & KMP_BARRIER_SLEEP_STATE ) {
600  #ifndef KMP_DEBUG
601  int target_gtid = target_thr->th.th_info.ds.ds_gtid;
602  #endif
603  /* wake up thread if needed */
604  KF_TRACE( 50, ( "__kmp_release: T#%d waking up thread T#%d since sleep spin(%p) set\n",
605  gtid, target_gtid, spin ));
606  __kmp_resume( target_gtid, spin );
607  } else {
608  KF_TRACE( 50, ( "__kmp_release: T#%d don't wake up thread T#%d since sleep spin(%p) not set\n",
609  gtid, target_gtid, spin ));
610  }
611  }
612 }
613 
614 /* ------------------------------------------------------------------------ */
615 
616 void
617 __kmp_infinite_loop( void )
618 {
619  static int done = FALSE;
620 
621  while (! done) {
622  KMP_YIELD( 1 );
623  }
624 }
625 
626 #define MAX_MESSAGE 512
627 
628 void
629 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
630  char buffer[MAX_MESSAGE];
631  int node;
632  va_list ap;
633 
634  va_start( ap, format);
635  sprintf( buffer, "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
636  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
637  __kmp_vprintf( kmp_err, buffer, ap );
638 #if KMP_PRINT_DATA_PLACEMENT
639  if(gtid >= 0) {
640  if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
641  if( __kmp_storage_map_verbose ) {
642  node = __kmp_get_host_node(p1);
643  if(node < 0) /* doesn't work, so don't try this next time */
644  __kmp_storage_map_verbose = FALSE;
645  else {
646  char *last;
647  int lastNode;
648  int localProc = __kmp_get_cpu_from_gtid(gtid);
649 
650  p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
651  p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
652  if(localProc >= 0)
653  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
654  else
655  __kmp_printf_no_lock(" GTID %d\n", gtid);
656 # if KMP_USE_PRCTL
657 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
658  do {
659  last = p1;
660  lastNode = node;
661  /* This loop collates adjacent pages with the same host node. */
662  do {
663  (char*)p1 += PAGE_SIZE;
664  } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
665  __kmp_printf_no_lock(" %p-%p memNode %d\n", last,
666  (char*)p1 - 1, lastNode);
667  } while(p1 <= p2);
668 # else
669  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
670  (char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
671  if(p1 < p2) {
672  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
673  (char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
674  }
675 # endif
676  }
677  }
678  } else
679  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
680  }
681 #endif /* KMP_PRINT_DATA_PLACEMENT */
682  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
683 }
684 
685 void
686 __kmp_warn( char const * format, ... )
687 {
688  char buffer[MAX_MESSAGE];
689  va_list ap;
690 
691  if ( __kmp_generate_warnings == kmp_warnings_off ) {
692  return;
693  }
694 
695  va_start( ap, format );
696 
697  snprintf( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
698  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
699  __kmp_vprintf( kmp_err, buffer, ap );
700  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
701 
702  va_end( ap );
703 }
704 
705 void
706 __kmp_abort_process()
707 {
708 
709  // Later threads may stall here, but that's ok because abort() will kill them.
710  __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
711 
712  if ( __kmp_debug_buf ) {
713  __kmp_dump_debug_buffer();
714  }; // if
715 
716  if ( KMP_OS_WINDOWS ) {
717  // Let other threads know of abnormal termination and prevent deadlock
718  // if abort happened during library initialization or shutdown
719  __kmp_global.g.g_abort = SIGABRT;
720 
721  /*
722  On Windows* OS by default abort() causes pop-up error box, which stalls nightly testing.
723  Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
724  works well, but this function is not available in VS7 (this is not problem for DLL, but
725  it is a problem for static OpenMP RTL). SetErrorMode (and so, timelimit utility) does
726  not help, at least in some versions of MS C RTL.
727 
728  It seems following sequence is the only way to simulate abort() and avoid pop-up error
729  box.
730  */
731  raise( SIGABRT );
732  _exit( 3 ); // Just in case, if signal ignored, exit anyway.
733  } else {
734  abort();
735  }; // if
736 
737  __kmp_infinite_loop();
738  __kmp_release_bootstrap_lock( & __kmp_exit_lock );
739 
740 } // __kmp_abort_process
741 
/*
 * __kmp_abort_thread: park the calling thread by entering
 * __kmp_infinite_loop().
 *
 * TODO: Eliminate the g_abort global variable and this function.
 * In case of abort just call abort(), it will kill all the threads.
 */
void __kmp_abort_thread( void )
{
    __kmp_infinite_loop();
} // __kmp_abort_thread
749 
750 /* ------------------------------------------------------------------------ */
751 
752 /*
753  * Print out the storage map for the major kmp_info_t thread data structures
754  * that are allocated together.
755  */
756 
757 static void
758 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
759 {
760  __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
761 
762  __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
763  "th_%d.th_info", gtid );
764 
765  __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
766  "th_%d.th_local", gtid );
767 
768  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
769  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
770 
771  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
772  &thr->th.th_bar[bs_plain_barrier+1],
773  sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
774 
775  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
776  &thr->th.th_bar[bs_forkjoin_barrier+1],
777  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
778 
779  #if KMP_FAST_REDUCTION_BARRIER
780  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
781  &thr->th.th_bar[bs_reduction_barrier+1],
782  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
783  #endif // KMP_FAST_REDUCTION_BARRIER
784 }
785 
786 /*
787  * Print out the storage map for the major kmp_team_t team data structures
788  * that are allocated together.
789  */
790 
791 static void
792 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
793 {
794  int num_disp_buff = team->t.t_max_nproc > 1 ? KMP_MAX_DISP_BUF : 2;
795  __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
796  header, team_id );
797 
798  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
799  sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
800 
801 
802  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
803  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
804 
805  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
806  sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
807 
808  #if KMP_FAST_REDUCTION_BARRIER
809  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
810  sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
811  #endif // KMP_FAST_REDUCTION_BARRIER
812 
813  __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
814  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
815 
816  __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
817  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
818 
819  __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
820  sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
821  header, team_id );
822 
823  /*
824  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nproc[0], &team->t.t_set_nproc[num_thr],
825  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
826 
827  __kmp_print_storage_map_gtid( -1, &team->t.t_set_dynamic[0], &team->t.t_set_dynamic[num_thr],
828  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
829 
830  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nested[0], &team->t.t_set_nested[num_thr],
831  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
832 
833  __kmp_print_storage_map_gtid( -1, &team->t.t_set_blocktime[0], &team->t.t_set_blocktime[num_thr],
834  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
835 
836  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_intervals[0], &team->t.t_set_bt_intervals[num_thr],
837  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
838 
839  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_set[0], &team->t.t_set_bt_set[num_thr],
840  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
841 
842 #if OMP_30_ENABLED
843  //__kmp_print_storage_map_gtid( -1, &team->t.t_set_max_active_levels[0], &team->t.t_set_max_active_levels[num_thr],
844  // sizeof(int) * num_thr, "%s_%d.t_set_max_active_levels", header, team_id );
845 
846  __kmp_print_storage_map_gtid( -1, &team->t.t_set_sched[0], &team->t.t_set_sched[num_thr],
847  sizeof(kmp_r_sched_t) * num_thr, "%s_%d.t_set_sched", header, team_id );
848 #endif // OMP_30_ENABLED
849 #if OMP_40_ENABLED
850  __kmp_print_storage_map_gtid( -1, &team->t.t_set_proc_bind[0], &team->t.t_set_proc_bind[num_thr],
851  sizeof(kmp_proc_bind_t) * num_thr, "%s_%d.t_set_proc_bind", header, team_id );
852 #endif
853  */
854 
855  __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
856  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
857 }
858 
/* Allocator hooks.  All three are empty functions in this file. */

static void
__kmp_init_allocator()
{
}

static void
__kmp_fini_allocator()
{
}

static void
__kmp_fini_allocator_thread()
{
}
862 
863 /* ------------------------------------------------------------------------ */
864 
865 #ifdef GUIDEDLL_EXPORTS
866 # if KMP_OS_WINDOWS
867 
868 
869 static void
870 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
871  // TODO: Change to __kmp_break_bootstrap_lock().
872  __kmp_init_bootstrap_lock( lck ); // make the lock released
873 }
874 
875 static void
876 __kmp_reset_locks_on_process_detach( int gtid_req ) {
877  int i;
878  int thread_count;
879 
880  // PROCESS_DETACH is expected to be called by a thread
881  // that executes ProcessExit() or FreeLibrary().
882  // OS terminates other threads (except the one calling ProcessExit or FreeLibrary).
883  // So, it might be safe to access the __kmp_threads[] without taking the forkjoin_lock.
884  // However, in fact, some threads can be still alive here, although being about to be terminated.
885  // The threads in the array with ds_thread==0 are most suspicious.
886  // Actually, it can be not safe to access the __kmp_threads[].
887 
888  // TODO: does it make sense to check __kmp_roots[] ?
889 
890  // Let's check that there are no other alive threads registered with the OMP lib.
891  while( 1 ) {
892  thread_count = 0;
893  for( i = 0; i < __kmp_threads_capacity; ++i ) {
894  if( !__kmp_threads ) continue;
895  kmp_info_t* th = __kmp_threads[ i ];
896  if( th == NULL ) continue;
897  int gtid = th->th.th_info.ds.ds_gtid;
898  if( gtid == gtid_req ) continue;
899  if( gtid < 0 ) continue;
900  DWORD exit_val;
901  int alive = __kmp_is_thread_alive( th, &exit_val );
902  if( alive ) {
903  ++thread_count;
904  }
905  }
906  if( thread_count == 0 ) break; // success
907  }
908 
909  // Assume that I'm alone.
910 
911  // Now it might be probably safe to check and reset locks.
912  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
913  __kmp_reset_lock( &__kmp_forkjoin_lock );
914  #ifdef KMP_DEBUG
915  __kmp_reset_lock( &__kmp_stdio_lock );
916  #endif // KMP_DEBUG
917 
918 
919 }
920 
921 BOOL WINAPI
922 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
923  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
924 
925  switch( fdwReason ) {
926 
927  case DLL_PROCESS_ATTACH:
928  KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
929 
930  return TRUE;
931 
932  case DLL_PROCESS_DETACH:
933  KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
934  __kmp_gtid_get_specific() ));
935 
936  if( lpReserved != NULL )
937  {
938  // lpReserved is used for telling the difference:
939  // lpReserved == NULL when FreeLibrary() was called,
940  // lpReserved != NULL when the process terminates.
941  // When FreeLibrary() is called, worker threads remain alive.
942  // So they will release the forkjoin lock by themselves.
943  // When the process terminates, worker threads disappear triggering
944  // the problem of unreleased forkjoin lock as described below.
945 
946  // A worker thread can take the forkjoin lock
947  // in __kmp_suspend()->__kmp_rml_decrease_load_before_sleep().
948  // The problem comes up if that worker thread becomes dead
949  // before it releases the forkjoin lock.
950  // The forkjoin lock remains taken, while the thread
951  // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
952  // will try to take the forkjoin lock and will always fail,
953  // so that the application will never finish [normally].
954  // This scenario is possible if __kmpc_end() has not been executed.
955  // It looks like it's not a corner case, but common cases:
956  // - the main function was compiled by an alternative compiler;
957  // - the main function was compiled by icl but without /Qopenmp (application with plugins);
958  // - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP.
959  // - alive foreign thread prevented __kmpc_end from doing cleanup.
960 
961  // This is a hack to work around the problem.
962  // TODO: !!! to figure out something better.
963  __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
964  }
965 
966  __kmp_internal_end_library( __kmp_gtid_get_specific() );
967 
968  return TRUE;
969 
970  case DLL_THREAD_ATTACH:
971  KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
972 
973  /* if we wanted to register new siblings all the time here call
974  * __kmp_get_gtid(); */
975  return TRUE;
976 
977  case DLL_THREAD_DETACH:
978  KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
979  __kmp_gtid_get_specific() ));
980 
981  __kmp_internal_end_thread( __kmp_gtid_get_specific() );
982  return TRUE;
983  }
984 
985  return TRUE;
986 }
987 
988 # endif /* KMP_OS_WINDOWS */
989 #endif /* GUIDEDLL_EXPORTS */
990 
991 
992 /* ------------------------------------------------------------------------ */
993 
994 /* Change the library type to "status" and return the old type */
995 /* called from within initialization routines where __kmp_initz_lock is held */
996 int
997 __kmp_change_library( int status )
998 {
999  int old_status;
1000 
1001  old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
1002 
1003  if (status) {
1004  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
1005  }
1006  else {
1007  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
1008  }
1009 
1010  return old_status; // return previous setting of whether KMP_LIBRARY=throughput
1011 }
1012 
1013 /* ------------------------------------------------------------------------ */
1014 /* ------------------------------------------------------------------------ */
1015 
1016 /* __kmp_parallel_deo --
1017  * Wait until it's our turn.
1018  */
1019 void
1020 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
1021 {
1022  int gtid = *gtid_ref;
1023 #ifdef BUILD_PARALLEL_ORDERED
1024  kmp_team_t *team = __kmp_team_from_gtid( gtid );
1025 #endif /* BUILD_PARALLEL_ORDERED */
1026 
1027  if( __kmp_env_consistency_check ) {
1028  if( __kmp_threads[gtid] -> th.th_root -> r.r_active )
1029  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
1030  }
1031 #ifdef BUILD_PARALLEL_ORDERED
1032  if( !team -> t.t_serialized ) {
1033  kmp_uint32 spins;
1034 
1035  KMP_MB();
1036  KMP_WAIT_YIELD(&team -> t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
1037  KMP_MB();
1038  }
1039 #endif /* BUILD_PARALLEL_ORDERED */
1040 }
1041 
1042 /* __kmp_parallel_dxo --
1043  * Signal the next task.
1044  */
1045 
1046 void
1047 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
1048 {
1049  int gtid = *gtid_ref;
1050 #ifdef BUILD_PARALLEL_ORDERED
1051  int tid = __kmp_tid_from_gtid( gtid );
1052  kmp_team_t *team = __kmp_team_from_gtid( gtid );
1053 #endif /* BUILD_PARALLEL_ORDERED */
1054 
1055  if( __kmp_env_consistency_check ) {
1056  if( __kmp_threads[gtid] -> th.th_root -> r.r_active )
1057  __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
1058  }
1059 #ifdef BUILD_PARALLEL_ORDERED
1060  if ( ! team -> t.t_serialized ) {
1061  KMP_MB(); /* Flush all pending memory write invalidates. */
1062 
1063  /* use the tid of the next thread in this team */
1064  /* TODO repleace with general release procedure */
1065  team -> t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
1066 
1067  KMP_MB(); /* Flush all pending memory write invalidates. */
1068  }
1069 #endif /* BUILD_PARALLEL_ORDERED */
1070 }
1071 
1072 /* ------------------------------------------------------------------------ */
1073 /* ------------------------------------------------------------------------ */
1074 
1075 /* ------------------------------------------------------------------------ */
1076 /* ------------------------------------------------------------------------ */
1077 
1078 /* The BARRIER for a SINGLE process section is always explicit */
1079 
1080 int
1081 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
1082 {
1083  int status;
1084  kmp_info_t *th;
1085  kmp_team_t *team;
1086 
1087  if( ! TCR_4(__kmp_init_parallel) )
1088  __kmp_parallel_initialize();
1089 
1090  th = __kmp_threads[ gtid ];
1091  team = th -> th.th_team;
1092  status = 0;
1093 
1094  th->th.th_ident = id_ref;
1095 
1096  if ( team -> t.t_serialized ) {
1097  status = 1;
1098  } else {
1099  kmp_int32 old_this = th->th.th_local.this_construct;
1100 
1101  ++th->th.th_local.this_construct;
1102  /* try to set team count to thread count--success means thread got the
1103  single block
1104  */
1105  /* TODO: Should this be acquire or release? */
1106  status = KMP_COMPARE_AND_STORE_ACQ32(&team -> t.t_construct, old_this,
1107  th->th.th_local.this_construct);
1108  }
1109 
1110  if( __kmp_env_consistency_check ) {
1111  if (status && push_ws) {
1112  __kmp_push_workshare( gtid, ct_psingle, id_ref );
1113  } else {
1114  __kmp_check_workshare( gtid, ct_psingle, id_ref );
1115  }
1116  }
1117 #if USE_ITT_BUILD
1118  if ( status ) {
1119  __kmp_itt_single_start( gtid );
1120  }
1121 #endif /* USE_ITT_BUILD */
1122  return status;
1123 }
1124 
1125 void
1126 __kmp_exit_single( int gtid )
1127 {
1128 #if USE_ITT_BUILD
1129  __kmp_itt_single_end( gtid );
1130 #endif /* USE_ITT_BUILD */
1131  if( __kmp_env_consistency_check )
1132  __kmp_pop_workshare( gtid, ct_psingle, NULL );
1133 }
1134 
1135 
1136 /* ------------------------------------------------------------------------ */
1137 /* ------------------------------------------------------------------------ */
1138 
1139 static void
1140 __kmp_linear_barrier_gather( enum barrier_type bt,
1141  kmp_info_t *this_thr,
1142  int gtid,
1143  int tid,
1144  void (*reduce)(void *, void *)
1145  USE_ITT_BUILD_ARG(void * itt_sync_obj)
1146  )
1147 {
1148  register kmp_team_t *team = this_thr -> th.th_team;
1149  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1150  register kmp_info_t **other_threads = team -> t.t_threads;
1151 
1152  KA_TRACE( 20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1153  gtid, team->t.t_id, tid, bt ) );
1154 
1155  KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] );
1156 
1157  /*
1158  * We now perform a linear reduction to signal that all
1159  * of the threads have arrived.
1160  *
1161  * Collect all the worker team member threads.
1162  */
1163  if ( ! KMP_MASTER_TID( tid )) {
1164 
1165  KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
1166  "arrived(%p): %u => %u\n",
1167  gtid, team->t.t_id, tid,
1168  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1169  &thr_bar -> b_arrived, thr_bar -> b_arrived,
1170  thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP
1171  ) );
1172 
1173  /* mark arrival to master thread */
1174  //
1175  // After performing this write, a worker thread may not assume that
1176  // the team is valid any more - it could be deallocated by the master
1177  // thread at any time.
1178  //
1179  __kmp_release( other_threads[0], &thr_bar -> b_arrived, kmp_release_fence );
1180 
1181  } else {
1182  register kmp_balign_team_t *team_bar = & team -> t.t_bar[ bt ];
1183  register int nproc = this_thr -> th.th_team_nproc;
1184  register int i;
1185  register kmp_uint new_state;
1186 
1187  /* Don't have to worry about sleep bit here or atomic since team setting */
1188  new_state = team_bar -> b_arrived + KMP_BARRIER_STATE_BUMP;
1189 
1190  /* Collect all the worker team member threads. */
1191  for (i = 1; i < nproc; i++) {
1192 #if KMP_CACHE_MANAGE
1193  /* prefetch next thread's arrived count */
1194  if ( i+1 < nproc )
1195  KMP_CACHE_PREFETCH( &other_threads[ i+1 ] -> th.th_bar[ bt ].bb.b_arrived );
1196 #endif /* KMP_CACHE_MANAGE */
1197  KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
1198  "arrived(%p) == %u\n",
1199  gtid, team->t.t_id, tid,
1200  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1201  &other_threads[i] -> th.th_bar[ bt ].bb.b_arrived,
1202  new_state ) );
1203 
1204  /* wait for worker thread to arrive */
1205  __kmp_wait_sleep( this_thr,
1206  & other_threads[ i ] -> th.th_bar[ bt ].bb.b_arrived,
1207  new_state, FALSE
1208  USE_ITT_BUILD_ARG( itt_sync_obj )
1209  );
1210 
1211  if (reduce) {
1212 
1213  KA_TRACE( 100, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
1214  gtid, team->t.t_id, tid,
1215  __kmp_gtid_from_tid( i, team ), team->t.t_id, i ) );
1216 
1217  (*reduce)( this_thr -> th.th_local.reduce_data,
1218  other_threads[ i ] -> th.th_local.reduce_data );
1219 
1220  }
1221 
1222  }
1223 
1224  /* Don't have to worry about sleep bit here or atomic since team setting */
1225  team_bar -> b_arrived = new_state;
1226  KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
1227  "arrived(%p) = %u\n",
1228  gtid, team->t.t_id, tid, team->t.t_id,
1229  &team_bar -> b_arrived, new_state ) );
1230  }
1231 
1232  KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1233  gtid, team->t.t_id, tid, bt ) );
1234 }
1235 
1236 
1237 static void
1238 __kmp_tree_barrier_gather( enum barrier_type bt,
1239  kmp_info_t *this_thr,
1240  int gtid,
1241  int tid,
1242  void (*reduce) (void *, void *)
1243  USE_ITT_BUILD_ARG( void * itt_sync_obj )
1244  )
1245 {
1246  register kmp_team_t *team = this_thr -> th.th_team;
1247  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1248  register kmp_info_t **other_threads = team -> t.t_threads;
1249  register kmp_uint32 nproc = this_thr -> th.th_team_nproc;
1250  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[ bt ];
1251  register kmp_uint32 branch_factor = 1 << branch_bits ;
1252  register kmp_uint32 child;
1253  register kmp_int32 child_tid;
1254  register kmp_uint new_state;
1255 
1256  KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1257  gtid, team->t.t_id, tid, bt ) );
1258 
1259  KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] );
1260 
1261  /*
1262  * We now perform a tree gather to wait until all
1263  * of the threads have arrived, and reduce any required data
1264  * as we go.
1265  */
1266 
1267  child_tid = (tid << branch_bits) + 1;
1268 
1269  if ( child_tid < nproc ) {
1270 
1271  /* parent threads wait for all their children to arrive */
1272  new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP;
1273  child = 1;
1274 
1275  do {
1276  register kmp_info_t *child_thr = other_threads[ child_tid ];
1277  register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
1278 #if KMP_CACHE_MANAGE
1279  /* prefetch next thread's arrived count */
1280  if ( child+1 <= branch_factor && child_tid+1 < nproc )
1281  KMP_CACHE_PREFETCH( &other_threads[ child_tid+1 ] -> th.th_bar[ bt ].bb.b_arrived );
1282 #endif /* KMP_CACHE_MANAGE */
1283  KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
1284  "arrived(%p) == %u\n",
1285  gtid, team->t.t_id, tid,
1286  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, child_tid,
1287  &child_bar -> b_arrived, new_state ) );
1288 
1289  /* wait for child to arrive */
1290  __kmp_wait_sleep( this_thr, &child_bar -> b_arrived, new_state, FALSE
1291  USE_ITT_BUILD_ARG( itt_sync_obj)
1292  );
1293 
1294  if (reduce) {
1295 
1296  KA_TRACE( 100, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
1297  gtid, team->t.t_id, tid,
1298  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
1299  child_tid ) );
1300 
1301  (*reduce)( this_thr -> th.th_local.reduce_data,
1302  child_thr -> th.th_local.reduce_data );
1303 
1304  }
1305 
1306  child++;
1307  child_tid++;
1308  }
1309  while ( child <= branch_factor && child_tid < nproc );
1310  }
1311 
1312  if ( !KMP_MASTER_TID(tid) ) {
1313  /* worker threads */
1314  register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
1315 
1316  KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1317  "arrived(%p): %u => %u\n",
1318  gtid, team->t.t_id, tid,
1319  __kmp_gtid_from_tid( parent_tid, team ), team->t.t_id, parent_tid,
1320  &thr_bar -> b_arrived, thr_bar -> b_arrived,
1321  thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP
1322  ) );
1323 
1324  /* mark arrival to parent thread */
1325  //
1326  // After performing this write, a worker thread may not assume that
1327  // the team is valid any more - it could be deallocated by the master
1328  // thread at any time.
1329  //
1330  __kmp_release( other_threads[parent_tid], &thr_bar -> b_arrived, kmp_release_fence );
1331 
1332  } else {
1333  /* Need to update the team arrived pointer if we are the master thread */
1334 
1335  if ( nproc > 1 )
1336  /* New value was already computed in above loop */
1337  team -> t.t_bar[ bt ].b_arrived = new_state;
1338  else
1339  team -> t.t_bar[ bt ].b_arrived += KMP_BARRIER_STATE_BUMP;
1340 
1341  KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
1342  gtid, team->t.t_id, tid, team->t.t_id,
1343  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived ) );
1344  }
1345 
1346  KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1347  gtid, team->t.t_id, tid, bt ) );
1348 }
1349 
1350 
1351 static void
1352 __kmp_hyper_barrier_gather( enum barrier_type bt,
1353  kmp_info_t *this_thr,
1354  int gtid,
1355  int tid,
1356  void (*reduce) (void *, void *)
1357  USE_ITT_BUILD_ARG (void * itt_sync_obj)
1358  )
1359 {
1360  register kmp_team_t *team = this_thr -> th.th_team;
1361  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1362  register kmp_info_t **other_threads = team -> t.t_threads;
1363  register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
1364  register kmp_uint32 num_threads = this_thr -> th.th_team_nproc;
1365  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[ bt ];
1366  register kmp_uint32 branch_factor = 1 << branch_bits ;
1367  register kmp_uint32 offset;
1368  register kmp_uint32 level;
1369 
1370  KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1371  gtid, team->t.t_id, tid, bt ) );
1372 
1373  KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] );
1374 
1375  /*
1376  * We now perform a hypercube-embedded tree gather to wait until all
1377  * of the threads have arrived, and reduce any required data
1378  * as we go.
1379  */
1380 
1381  for ( level=0, offset =1;
1382  offset < num_threads;
1383  level += branch_bits, offset <<= branch_bits )
1384  {
1385  register kmp_uint32 child;
1386  register kmp_int32 child_tid;
1387 
1388  if ( ((tid >> level) & (branch_factor - 1)) != 0 ) {
1389  register kmp_int32 parent_tid = tid & ~( (1 << (level + branch_bits)) -1 );
1390 
1391  KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1392  "arrived(%p): %u => %u\n",
1393  gtid, team->t.t_id, tid,
1394  __kmp_gtid_from_tid( parent_tid, team ), team->t.t_id, parent_tid,
1395  &thr_bar -> b_arrived, thr_bar -> b_arrived,
1396  thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP
1397  ) );
1398 
1399  /* mark arrival to parent thread */
1400  //
1401  // After performing this write (in the last iteration of the
1402  // enclosing for loop), a worker thread may not assume that the
1403  // team is valid any more - it could be deallocated by the master
1404  // thread at any time.
1405  //
1406  __kmp_release( other_threads[parent_tid], &thr_bar -> b_arrived, kmp_release_fence );
1407  break;
1408  }
1409 
1410  /* parent threads wait for children to arrive */
1411 
1412  for ( child = 1, child_tid = tid + (1 << level);
1413  child < branch_factor && child_tid < num_threads;
1414  child++, child_tid += (1 << level) )
1415  {
1416  register kmp_info_t *child_thr = other_threads[ child_tid ];
1417  register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
1418 #if KMP_CACHE_MANAGE
1419  register kmp_uint32 next_child_tid = child_tid + (1 << level);
1420  /* prefetch next thread's arrived count */
1421  if ( child+1 < branch_factor && next_child_tid < num_threads )
1422  KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ] -> th.th_bar[ bt ].bb.b_arrived );
1423 #endif /* KMP_CACHE_MANAGE */
1424  /* Only read this arrived flag once per thread that needs it */
1425  if (new_state == KMP_BARRIER_UNUSED_STATE)
1426  new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP;
1427 
1428  KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
1429  "arrived(%p) == %u\n",
1430  gtid, team->t.t_id, tid,
1431  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, child_tid,
1432  &child_bar -> b_arrived, new_state ) );
1433 
1434  /* wait for child to arrive */
1435  __kmp_wait_sleep( this_thr, &child_bar -> b_arrived, new_state, FALSE
1436  USE_ITT_BUILD_ARG (itt_sync_obj)
1437  );
1438 
1439  if (reduce) {
1440 
1441  KA_TRACE( 100, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
1442  gtid, team->t.t_id, tid,
1443  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
1444  child_tid ) );
1445 
1446  (*reduce)( this_thr -> th.th_local.reduce_data,
1447  child_thr -> th.th_local.reduce_data );
1448 
1449  }
1450  }
1451  }
1452 
1453 
1454  if ( KMP_MASTER_TID(tid) ) {
1455  /* Need to update the team arrived pointer if we are the master thread */
1456 
1457  if (new_state == KMP_BARRIER_UNUSED_STATE)
1458  team -> t.t_bar[ bt ].b_arrived += KMP_BARRIER_STATE_BUMP;
1459  else
1460  team -> t.t_bar[ bt ].b_arrived = new_state;
1461 
1462  KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
1463  gtid, team->t.t_id, tid, team->t.t_id,
1464  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived ) );
1465  }
1466 
1467  KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1468  gtid, team->t.t_id, tid, bt ) );
1469 
1470 }
1471 
1472 static void
1473 __kmp_linear_barrier_release( enum barrier_type bt,
1474  kmp_info_t *this_thr,
1475  int gtid,
1476  int tid,
1477  int propagate_icvs
1478  USE_ITT_BUILD_ARG(void * itt_sync_obj)
1479  )
1480 {
1481  register kmp_bstate_t *thr_bar = &this_thr -> th.th_bar[ bt ].bb;
1482  register kmp_team_t *team;
1483 
1484  if (KMP_MASTER_TID( tid )) {
1485  register int i;
1486  register kmp_uint32 nproc = this_thr -> th.th_team_nproc;
1487  register kmp_info_t **other_threads;
1488 
1489  team = __kmp_threads[ gtid ]-> th.th_team;
1490  KMP_DEBUG_ASSERT( team != NULL );
1491  other_threads = team -> t.t_threads;
1492 
1493  KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
1494  gtid, team->t.t_id, tid, bt ) );
1495 
1496  /* release all of the worker threads */
1497  if (nproc > 1) {
1498  for (i = 1; i < nproc; i++) {
1499 #if KMP_CACHE_MANAGE
1500  /* prefetch next thread's go flag */
1501  if( i+1 < nproc )
1502  KMP_CACHE_PREFETCH( &other_threads[ i+1 ]-> th.th_bar[ bt ].bb.b_go );
1503 #endif /* KMP_CACHE_MANAGE */
1504 
1505 #if KMP_BARRIER_ICV_PUSH
1506  if ( propagate_icvs ) {
1507  __kmp_init_implicit_task( team->t.t_ident,
1508  team->t.t_threads[i], team, i, FALSE );
1509  copy_icvs( &team->t.t_implicit_task_taskdata[i].td_icvs,
1510  &team->t.t_implicit_task_taskdata[0].td_icvs );
1511  }
1512 #endif // KMP_BARRIER_ICV_PUSH
1513 
1514  KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
1515  "go(%p): %u => %u\n",
1516  gtid, team->t.t_id, tid,
1517  other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
1518  &other_threads[i]->th.th_bar[bt].bb.b_go,
1519  other_threads[i]->th.th_bar[bt].bb.b_go,
1520  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP
1521  ) );
1522 
1523  __kmp_release( other_threads[ i ],
1524  &other_threads[ i ]-> th.th_bar[ bt ].bb.b_go, kmp_acquire_fence );
1525  }
1526  }
1527  } else {
1528  /* Wait for the MASTER thread to release us */
1529 
1530  KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
1531  gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) );
1532 
1533  __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE
1534  USE_ITT_BUILD_ARG(itt_sync_obj)
1535  );
1536 
1537 #if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY
1538  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1539  // we are on a fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
1540  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 );
1541  // cancel wait on previous parallel region...
1542  __kmp_itt_task_starting( itt_sync_obj );
1543 
1544  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1545  return;
1546 
1547  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1548  if ( itt_sync_obj != NULL )
1549  __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier
1550 
1551  } else
1552 #endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */
1553  //
1554  // early exit for reaping threads releasing forkjoin barrier
1555  //
1556  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1557  return;
1558 
1559  //
1560  // The worker thread may now assume that the team is valid.
1561  //
1562 #if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY
1563  // libguide only code (cannot use *itt_task* routines)
1564  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1565  // we are on a fork barrier where we could not get the object reliably
1566  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1567  __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls...
1568  }
1569 #endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */
1570  #ifdef KMP_DEBUG
1571  tid = __kmp_tid_from_gtid( gtid );
1572  team = __kmp_threads[ gtid ]-> th.th_team;
1573  #endif
1574  KMP_DEBUG_ASSERT( team != NULL );
1575 
1576  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1577  KA_TRACE( 20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1578  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) );
1579 
1580  KMP_MB(); /* Flush all pending memory write invalidates. */
1581  }
1582 
1583  KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1584  gtid, team->t.t_id, tid, bt ) );
1585 }
1586 
1587 
1588 static void
1589 __kmp_tree_barrier_release( enum barrier_type bt,
1590  kmp_info_t *this_thr,
1591  int gtid,
1592  int tid,
1593  int propagate_icvs
1594  USE_ITT_BUILD_ARG(void * itt_sync_obj)
1595  )
1596 {
1597  /* handle fork barrier workers who aren't part of a team yet */
1598  register kmp_team_t *team;
1599  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1600  register kmp_uint32 nproc;
1601  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
1602  register kmp_uint32 branch_factor = 1 << branch_bits ;
1603  register kmp_uint32 child;
1604  register kmp_int32 child_tid;
1605 
1606  /*
1607  * We now perform a tree release for all
1608  * of the threads that have been gathered
1609  */
1610 
1611  if ( ! KMP_MASTER_TID( tid )) {
1612  /* worker threads */
1613 
1614  KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
1615  gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) );
1616 
1617  /* wait for parent thread to release us */
1618  __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE
1619  USE_ITT_BUILD_ARG(itt_sync_obj)
1620  );
1621 
1622 #if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY
1623  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1624  // we are on a fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
1625  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 );
1626  // cancel wait on previous parallel region...
1627  __kmp_itt_task_starting( itt_sync_obj );
1628 
1629  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1630  return;
1631 
1632  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1633  if ( itt_sync_obj != NULL )
1634  __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier
1635 
1636  } else
1637 #endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */
1638  //
1639  // early exit for reaping threads releasing forkjoin barrier
1640  //
1641  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1642  return;
1643 
1644  //
1645  // The worker thread may now assume that the team is valid.
1646  //
1647 #if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY
1648  // libguide only code (cannot use *itt_task* routines)
1649  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1650  // we are on a fork barrier where we could not get the object reliably
1651  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1652  __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls...
1653  }
1654 #endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */
1655  team = __kmp_threads[ gtid ]-> th.th_team;
1656  KMP_DEBUG_ASSERT( team != NULL );
1657  tid = __kmp_tid_from_gtid( gtid );
1658 
1659  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1660  KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1661  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) );
1662 
1663  KMP_MB(); /* Flush all pending memory write invalidates. */
1664 
1665  } else {
1666  team = __kmp_threads[ gtid ]-> th.th_team;
1667  KMP_DEBUG_ASSERT( team != NULL );
1668 
1669  KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
1670  gtid, team->t.t_id, tid, bt ) );
1671  }
1672 
1673  nproc = this_thr -> th.th_team_nproc;
1674  child_tid = ( tid << branch_bits ) + 1;
1675 
1676  if ( child_tid < nproc ) {
1677  register kmp_info_t **other_threads = team -> t.t_threads;
1678  child = 1;
1679  /* parent threads release all their children */
1680 
1681  do {
1682  register kmp_info_t *child_thr = other_threads[ child_tid ];
1683  register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
1684 #if KMP_CACHE_MANAGE
1685  /* prefetch next thread's go count */
1686  if ( child+1 <= branch_factor && child_tid+1 < nproc )
1687  KMP_CACHE_PREFETCH( &other_threads[ child_tid+1 ] -> th.th_bar[ bt ].bb.b_go );
1688 #endif /* KMP_CACHE_MANAGE */
1689 
1690 #if KMP_BARRIER_ICV_PUSH
1691  if ( propagate_icvs ) {
1692  __kmp_init_implicit_task( team->t.t_ident,
1693  team->t.t_threads[child_tid], team, child_tid, FALSE );
1694  copy_icvs( &team->t.t_implicit_task_taskdata[child_tid].td_icvs,
1695  &team->t.t_implicit_task_taskdata[0].td_icvs );
1696  }
1697 #endif // KMP_BARRIER_ICV_PUSH
1698 
1699  KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1700  "go(%p): %u => %u\n",
1701  gtid, team->t.t_id, tid,
1702  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
1703  child_tid, &child_bar -> b_go, child_bar -> b_go,
1704  child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) );
1705 
1706  /* release child from barrier */
1707  __kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence );
1708 
1709  child++;
1710  child_tid++;
1711  }
1712  while ( child <= branch_factor && child_tid < nproc );
1713  }
1714 
1715  KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1716  gtid, team->t.t_id, tid, bt ) );
1717 }
1718 
1719 /* The reverse versions seem to beat the forward versions overall */
1720 #define KMP_REVERSE_HYPER_BAR
1721 #ifdef KMP_REVERSE_HYPER_BAR
1722 static void
1723 __kmp_hyper_barrier_release( enum barrier_type bt,
1724  kmp_info_t *this_thr,
1725  int gtid,
1726  int tid,
1727  int propagate_icvs
1728  USE_ITT_BUILD_ARG(void * itt_sync_obj)
1729  )
1730 {
1731  /* handle fork barrier workers who aren't part of a team yet */
1732  register kmp_team_t *team;
1733  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1734  register kmp_info_t **other_threads;
1735  register kmp_uint32 num_threads;
1736  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
1737  register kmp_uint32 branch_factor = 1 << branch_bits;
1738  register kmp_uint32 child;
1739  register kmp_int32 child_tid;
1740  register kmp_uint32 offset;
1741  register kmp_uint32 level;
1742 
1743  /*
1744  * We now perform a hypercube-embedded tree release for all
1745  * of the threads that have been gathered, but in the exact
1746  * reverse order from the corresponding gather (for load balance).
1747  */
1748 
1749  if ( ! KMP_MASTER_TID( tid )) {
1750  /* worker threads */
1751 
1752  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
1753  gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) );
1754 
1755  /* wait for parent thread to release us */
1756  __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE
1757  USE_ITT_BUILD_ARG( itt_sync_obj )
1758  );
1759 
1760 #if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY
1761  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1762  // we are on a fork barrier where we could not get the object reliably
1763  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 );
1764  // cancel wait on previous parallel region...
1765  __kmp_itt_task_starting( itt_sync_obj );
1766 
1767  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1768  return;
1769 
1770  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1771  if ( itt_sync_obj != NULL )
1772  __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier
1773 
1774  } else
1775 #endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */
1776  //
1777  // early exit for reaping threads releasing forkjoin barrier
1778  //
1779  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1780  return;
1781 
1782  //
1783  // The worker thread may now assume that the team is valid.
1784  //
1785 #if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY
1786  // libguide only code (cannot use *itt_task* routines)
1787  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1788  // we are on a fork barrier where we could not get the object reliably
1789  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1790  __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls...
1791  }
1792 #endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */
1793  team = __kmp_threads[ gtid ]-> th.th_team;
1794  KMP_DEBUG_ASSERT( team != NULL );
1795  tid = __kmp_tid_from_gtid( gtid );
1796 
1797  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1798  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1799  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) );
1800 
1801  KMP_MB(); /* Flush all pending memory write invalidates. */
1802 
1803  } else { /* KMP_MASTER_TID(tid) */
1804  team = __kmp_threads[ gtid ]-> th.th_team;
1805  KMP_DEBUG_ASSERT( team != NULL );
1806 
1807  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
1808  gtid, team->t.t_id, tid, bt ) );
1809  }
1810 
1811  num_threads = this_thr -> th.th_team_nproc;
1812  other_threads = team -> t.t_threads;
1813 
1814  /* count up to correct level for parent */
1815  for ( level = 0, offset = 1;
1816  offset < num_threads && (((tid >> level) & (branch_factor-1)) == 0);
1817  level += branch_bits, offset <<= branch_bits );
1818 
1819  /* now go down from there */
1820  for ( level -= branch_bits, offset >>= branch_bits;
1821  offset != 0;
1822  level -= branch_bits, offset >>= branch_bits )
1823  {
1824  register kmp_uint32 child;
1825  register kmp_int32 child_tid;
1826 
1827  /* Now go in reverse order through the children, highest to lowest.
1828  Initial setting of child is conservative here. */
1829  child = num_threads >> ((level==0)?level:level-1);
1830  for ( child = (child < branch_factor-1) ? child : branch_factor-1,
1831  child_tid = tid + (child << level);
1832  child >= 1;
1833  child--, child_tid -= (1 << level) )
1834  {
1835 
1836  if ( child_tid >= num_threads ) continue; /* child doesn't exist so keep going */
1837  else {
1838  register kmp_info_t *child_thr = other_threads[ child_tid ];
1839  register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
1840 #if KMP_CACHE_MANAGE
1841  register kmp_uint32 next_child_tid = child_tid - (1 << level);
1842  /* prefetch next thread's go count */
1843  if ( child-1 >= 1 && next_child_tid < num_threads )
1844  KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ]->th.th_bar[ bt ].bb.b_go );
1845 #endif /* KMP_CACHE_MANAGE */
1846 
1847 #if KMP_BARRIER_ICV_PUSH
1848  if ( propagate_icvs ) {
1849  KMP_DEBUG_ASSERT( team != NULL );
1850  __kmp_init_implicit_task( team->t.t_ident,
1851  team->t.t_threads[child_tid], team, child_tid, FALSE );
1852  copy_icvs( &team->t.t_implicit_task_taskdata[child_tid].td_icvs,
1853  &team->t.t_implicit_task_taskdata[0].td_icvs );
1854  }
1855 #endif // KMP_BARRIER_ICV_PUSH
1856 
1857  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1858  "go(%p): %u => %u\n",
1859  gtid, team->t.t_id, tid,
1860  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
1861  child_tid, &child_bar -> b_go, child_bar -> b_go,
1862  child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) );
1863 
1864  /* release child from barrier */
1865  __kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence );
1866  }
1867  }
1868  }
1869 
1870  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1871  gtid, team->t.t_id, tid, bt ) );
1872 }
1873 
1874 #else /* !KMP_REVERSE_HYPER_BAR */
1875 
1876 static void
1877 __kmp_hyper_barrier_release( enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, int propagate_icvs )
1878 {
1879  /* handle fork barrier workers who aren't part of a team yet */
1880  register kmp_team_t *team;
1881  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1882  register kmp_info_t **other_threads;
1883  register kmp_uint32 num_threads;
1884  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
1885  register kmp_uint32 branch_factor = 1 << branch_bits;
1886  register kmp_uint32 child;
1887  register kmp_int32 child_tid;
1888  register kmp_uint32 offset;
1889  register kmp_uint32 level;
1890 
1891  /*
1892  * We now perform a hypercube-embedded tree release for all
1893  * of the threads that have been gathered, but in the same order
1894  * as the gather.
1895  */
1896 
1897  if ( ! KMP_MASTER_TID( tid )) {
1898  /* worker threads */
1899 
1900  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
1901  gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) );
1902 
1903  /* wait for parent thread to release us */
1904  __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE, NULL );
1905 
1906 #if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY
1907  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1908  // we are on a fork barrier where we could not get the object reliably
1909  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 );
1910  // cancel wait on previous parallel region...
1911  __kmp_itt_task_starting( itt_sync_obj );
1912 
1913  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1914  return;
1915 
1916  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1917  if ( itt_sync_obj != NULL )
1918  __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier
1919 
1920  } else
1921 #endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */
1922  //
1923  // early exit for reaping threads releasing forkjoin barrier
1924  //
1925  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1926  return;
1927 
1928  //
1929  // The worker thread may now assume that the team is valid.
1930  //
1931 #if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY
1932  // libguide only code (cannot use *itt_task* routines)
1933  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1934  // we are on a fork barrier where we could not get the object reliably
1935  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1936  __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls...
1937  }
1938 #endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */
1939  team = __kmp_threads[ gtid ]-> th.th_team;
1940  KMP_DEBUG_ASSERT( team != NULL );
1941  tid = __kmp_tid_from_gtid( gtid );
1942 
1943  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1944  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1945  gtid, ( team != NULL ) ? team->t.t_id : -1, tid,
1946  &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) );
1947 
1948  KMP_MB(); /* Flush all pending memory write invalidates. */
1949 
1950  } else { /* KMP_MASTER_TID(tid) */
1951  team = __kmp_threads[ gtid ]-> th.th_team;
1952  KMP_DEBUG_ASSERT( team != NULL );
1953 
1954  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) enter for barrier type %d\n",
1955  gtid, team->t.t_id, tid, bt ) );
1956  }
1957 
1958  /* Now set up team parameters since workers have been released */
1959  if ( team == NULL ) {
1960  /* handle fork barrier workers who are now part of a team */
1961  tid = __kmp_tid_from_gtid( gtid );
1962  team = __kmp_threads[ gtid ]-> th.th_team;
1963  }
1964  num_threads = this_thr -> th.th_team_nproc;
1965  other_threads = team -> t.t_threads;
1966 
1967  /* Go down the tree, level by level */
1968  for ( level = 0, offset = 1;
1969  offset < num_threads;
1970  level += branch_bits, offset <<= branch_bits )
1971  {
1972  register kmp_uint32 child;
1973  register kmp_int32 child_tid;
1974 
1975  if (((tid >> level) & (branch_factor - 1)) != 0)
1976  /* No need to go any lower than this, since this is the level
1977  parent would be notified */
1978  break;
1979 
1980  /* iterate through children on this level of the tree */
1981  for ( child = 1, child_tid = tid + (1 << level);
1982  child < branch_factor && child_tid < num_threads;
1983  child++, child_tid += (1 << level) )
1984  {
1985  register kmp_info_t *child_thr = other_threads[ child_tid ];
1986  register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
1987 #if KMP_CACHE_MANAGE
1988  {
1989  register kmp_uint32 next_child_tid = child_tid + (1 << level);
1990  /* prefetch next thread's go count */
1991  if ( child+1 < branch_factor && next_child_tid < num_threads )
1992  KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ]->th.th_bar[ bt ].bb.b_go );
1993  }
1994 #endif /* KMP_CACHE_MANAGE */
1995 
1996 #if KMP_BARRIER_ICV_PUSH
1997  if ( propagate_icvs ) {
1998  KMP_DEBUG_ASSERT( team != NULL );
1999  __kmp_init_implicit_task( team->t.t_ident,
2000  team->t.t_threads[child_tid], team, child_tid, FALSE );
2001  copy_icvs( &team->t.t_implicit_task_taskdata[child_tid].td_icvs,
2002  &team->t.t_implicit_task_taskdata[0].td_icvs );
2003  }
2004 #endif // KMP_BARRIER_ICV_PUSH
2005 
2006  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) releasing "
2007  "T#%d(%d:%d) go(%p): %u => %u\n",
2008  gtid, team->t.t_id, tid,
2009  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
2010  child_tid, &child_bar -> b_go, child_bar -> b_go,
2011  child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) );
2012 
2013  /* release child from barrier */
2014  __kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence );
2015  }
2016  }
2017 
2018  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
2019  gtid, team->t.t_id, tid, bt ) );
2020 }
2021 #endif /* KMP_REVERSE_HYPER_BAR */
2022 
2023 
2024 /*
2025  * Internal function to do a barrier.
2026  * If is_split is true, do a split barrier, otherwise, do a plain barrier
2027  * If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
2028  * Returns 0 if master thread, 1 if worker thread.
2029  */
2030 int
2031 __kmp_barrier( enum barrier_type bt, int gtid, int is_split,
2032  size_t reduce_size, void *reduce_data, void (*reduce)(void *, void *) )
2033 {
2034  register int tid = __kmp_tid_from_gtid( gtid );
2035  register kmp_info_t *this_thr = __kmp_threads[ gtid ];
2036  register kmp_team_t *team = this_thr -> th.th_team;
2037  register int status = 0;
2038 
2039  KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) has arrived\n",
2040  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid) ) );
2041 
2042  if ( ! team->t.t_serialized ) {
2043 #if USE_ITT_BUILD
2044  // This value will be used in itt notify events below.
2045  void * itt_sync_obj = NULL;
2046  #if USE_ITT_NOTIFY
2047  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
2048  itt_sync_obj = __kmp_itt_barrier_object( gtid, bt, 1 );
2049  #endif
2050 #endif /* USE_ITT_BUILD */
2051  #if OMP_30_ENABLED
2052  if ( __kmp_tasking_mode == tskm_extra_barrier ) {
2053  __kmp_tasking_barrier( team, this_thr, gtid );
2054  KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
2055  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid) ) );
2056  }
2057  #endif /* OMP_30_ENABLED */
2058 
2059  //
2060  // Copy the blocktime info to the thread, where __kmp_wait_sleep()
2061  // can access it when the team struct is not guaranteed to exist.
2062  //
2063  // See the note about the corresponding code in __kmp_join_barrier()
2064  // being performance-critical.
2065  //
2066  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
2067  #if OMP_30_ENABLED
2068  this_thr -> th.th_team_bt_intervals = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2069  this_thr -> th.th_team_bt_set = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2070  #else
2071  this_thr -> th.th_team_bt_intervals = team -> t.t_set_bt_intervals[tid];
2072  this_thr -> th.th_team_bt_set= team -> t.t_set_bt_set[tid];
2073  #endif // OMP_30_ENABLED
2074  }
2075 
2076 #if USE_ITT_BUILD
2077  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
2078  __kmp_itt_barrier_starting( gtid, itt_sync_obj );
2079 #endif /* USE_ITT_BUILD */
2080 
2081  if ( reduce != NULL ) {
2082  //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
2083  this_thr -> th.th_local.reduce_data = reduce_data;
2084  }
2085  if ( __kmp_barrier_gather_pattern[ bt ] == bp_linear_bar || __kmp_barrier_gather_branch_bits[ bt ] == 0 ) {
2086  __kmp_linear_barrier_gather( bt, this_thr, gtid, tid, reduce
2087  USE_ITT_BUILD_ARG( itt_sync_obj )
2088  );
2089  } else if ( __kmp_barrier_gather_pattern[ bt ] == bp_tree_bar ) {
2090  __kmp_tree_barrier_gather( bt, this_thr, gtid, tid, reduce
2091  USE_ITT_BUILD_ARG( itt_sync_obj )
2092  );
2093  } else {
2094  __kmp_hyper_barrier_gather( bt, this_thr, gtid, tid, reduce
2095  USE_ITT_BUILD_ARG( itt_sync_obj )
2096  );
2097  }; // if
2098 
2099 #if USE_ITT_BUILD
2100  // TODO: In case of split reduction barrier, master thread may send aquired event early,
2101  // before the final summation into the shared variable is done (final summation can be a
2102  // long operation for array reductions).
2103  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
2104  __kmp_itt_barrier_middle( gtid, itt_sync_obj );
2105 #endif /* USE_ITT_BUILD */
2106 
2107  KMP_MB();
2108 
2109  if ( KMP_MASTER_TID( tid ) ) {
2110  status = 0;
2111 
2112  #if OMP_30_ENABLED
2113  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2114  __kmp_task_team_wait( this_thr, team
2115  USE_ITT_BUILD_ARG( itt_sync_obj )
2116  );
2117  __kmp_task_team_setup( this_thr, team );
2118  }
2119  #endif /* OMP_30_ENABLED */
2120 
2121  } else {
2122  status = 1;
2123  }
2124  if ( status == 1 || ! is_split ) {
2125  if ( __kmp_barrier_release_pattern[ bt ] == bp_linear_bar || __kmp_barrier_release_branch_bits[ bt ] == 0 ) {
2126  __kmp_linear_barrier_release( bt, this_thr, gtid, tid, FALSE
2127  USE_ITT_BUILD_ARG( itt_sync_obj )
2128  );
2129  } else if ( __kmp_barrier_release_pattern[ bt ] == bp_tree_bar ) {
2130  __kmp_tree_barrier_release( bt, this_thr, gtid, tid, FALSE
2131  USE_ITT_BUILD_ARG( itt_sync_obj )
2132  );
2133  } else {
2134  __kmp_hyper_barrier_release( bt, this_thr, gtid, tid, FALSE
2135  USE_ITT_BUILD_ARG( itt_sync_obj )
2136  );
2137  }
2138  #if OMP_30_ENABLED
2139  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2140  __kmp_task_team_sync( this_thr, team );
2141  }
2142  #endif /* OMP_30_ENABLED */
2143  }
2144 
2145 #if USE_ITT_BUILD
2146  // GEH: TODO: Move this under if-condition above and also include in __kmp_end_split_barrier().
2147  // This will more accurately represent the actual release time of the threads for split barriers.
2148  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
2149  __kmp_itt_barrier_finished( gtid, itt_sync_obj );
2150 #endif /* USE_ITT_BUILD */
2151 
2152  } else { // Team is serialized.
2153 
2154  status = 0;
2155 
2156  #if OMP_30_ENABLED
2157  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2158  //
2159  // The task team should be NULL for serialized code.
2160  // (tasks will be executed immediately).
2161  //
2162  KMP_DEBUG_ASSERT( team->t.t_task_team == NULL );
2163  KMP_DEBUG_ASSERT( this_thr->th.th_task_team == NULL );
2164  }
2165  #endif /* OMP_30_ENABLED */
2166  }
2167 
2168  KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2169  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid),
2170  status ) );
2171  return status;
2172 }
2173 
2174 
2175 void
2176 __kmp_end_split_barrier( enum barrier_type bt, int gtid )
2177 {
2178  int tid = __kmp_tid_from_gtid( gtid );
2179  kmp_info_t *this_thr = __kmp_threads[ gtid ];
2180  kmp_team_t *team = this_thr -> th.th_team;
2181 
2182  if( ! team -> t.t_serialized ) {
2183  if( KMP_MASTER_GTID( gtid ) ) {
2184  if ( __kmp_barrier_release_pattern[ bt ] == bp_linear_bar || __kmp_barrier_release_branch_bits[ bt ] == 0 ) {
2185  __kmp_linear_barrier_release( bt, this_thr, gtid, tid, FALSE
2186 #if USE_ITT_BUILD
2187  , NULL
2188 #endif /* USE_ITT_BUILD */
2189  );
2190  } else if ( __kmp_barrier_release_pattern[ bt ] == bp_tree_bar ) {
2191  __kmp_tree_barrier_release( bt, this_thr, gtid, tid, FALSE
2192 #if USE_ITT_BUILD
2193  , NULL
2194 #endif /* USE_ITT_BUILD */
2195  );
2196  } else {
2197  __kmp_hyper_barrier_release( bt, this_thr, gtid, tid, FALSE
2198 #if USE_ITT_BUILD
2199  , NULL
2200 #endif /* USE_ITT_BUILD */
2201  );
2202  }; // if
2203  #if OMP_30_ENABLED
2204  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2205  __kmp_task_team_sync( this_thr, team );
2206  }; // if
2207  #endif /* OMP_30_ENABLED */
2208  }
2209  }
2210 }
2211 
2212 /* ------------------------------------------------------------------------ */
2213 /* ------------------------------------------------------------------------ */
2214 
2215 /*
2216  * determine if we can go parallel or must use a serialized parallel region and
2217  * how many threads we can use
2218  * set_nproc is the number of threads requested for the team
2219  * returns 0 if we should serialize or only use one thread,
2220  * otherwise the number of threads to use
2221  * The forkjoin lock is held by the caller.
2222  */
2223 static int
2224 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
2225  int master_tid, int set_nthreads
2226 #if OMP_40_ENABLED
2227  , int enter_teams
2228 #endif /* OMP_40_ENABLED */
2229 )
2230 {
2231  int capacity;
2232  int new_nthreads;
2233  int use_rml_to_adjust_nth;
2234  KMP_DEBUG_ASSERT( __kmp_init_serial );
2235  KMP_DEBUG_ASSERT( root && parent_team );
2236 
2237  //
2238  // Initial check to see if we should use a serialized team.
2239  //
2240  if ( set_nthreads == 1 ) {
2241  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d reserving 1 thread; requested %d threads\n",
2242  __kmp_get_gtid(), set_nthreads ));
2243  return 1;
2244  }
2245  if ( ( !get__nested_2(parent_team,master_tid) && (root->r.r_in_parallel
2246 #if OMP_40_ENABLED
2247  && !enter_teams
2248 #endif /* OMP_40_ENABLED */
2249  ) ) || ( __kmp_library == library_serial ) ) {
2250  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team; requested %d threads\n",
2251  __kmp_get_gtid(), set_nthreads ));
2252  return 1;
2253  }
2254 
2255  //
2256  // If dyn-var is set, dynamically adjust the number of desired threads,
2257  // according to the method specified by dynamic_mode.
2258  //
2259  new_nthreads = set_nthreads;
2260  use_rml_to_adjust_nth = FALSE;
2261  if ( ! get__dynamic_2( parent_team, master_tid ) ) {
2262  ;
2263  }
2264 #ifdef USE_LOAD_BALANCE
2265  else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
2266  new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
2267  if ( new_nthreads == 1 ) {
2268  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
2269  master_tid ));
2270  return 1;
2271  }
2272  if ( new_nthreads < set_nthreads ) {
2273  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
2274  master_tid, new_nthreads ));
2275  }
2276  }
2277 #endif /* USE_LOAD_BALANCE */
2278  else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
2279  new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
2280  : root->r.r_hot_team->t.t_nproc);
2281  if ( new_nthreads <= 1 ) {
2282  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
2283  master_tid ));
2284  return 1;
2285  }
2286  if ( new_nthreads < set_nthreads ) {
2287  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
2288  master_tid, new_nthreads ));
2289  }
2290  else {
2291  new_nthreads = set_nthreads;
2292  }
2293  }
2294  else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
2295  if ( set_nthreads > 2 ) {
2296  new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
2297  new_nthreads = ( new_nthreads % set_nthreads ) + 1;
2298  if ( new_nthreads == 1 ) {
2299  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
2300  master_tid ));
2301  return 1;
2302  }
2303  if ( new_nthreads < set_nthreads ) {
2304  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
2305  master_tid, new_nthreads ));
2306  }
2307  }
2308  }
2309  else {
2310  KMP_ASSERT( 0 );
2311  }
2312 
2313  //
2314  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
2315  //
2316  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
2317  root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
2318  int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
2319  root->r.r_hot_team->t.t_nproc );
2320  if ( tl_nthreads <= 0 ) {
2321  tl_nthreads = 1;
2322  }
2323 
2324  //
2325  // If dyn-var is false, emit a 1-time warning.
2326  //
2327  if ( ! get__dynamic_2( parent_team, master_tid )
2328  && ( ! __kmp_reserve_warn ) ) {
2329  __kmp_reserve_warn = 1;
2330  __kmp_msg(
2331  kmp_ms_warning,
2332  KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
2333  KMP_HNT( Unset_ALL_THREADS ),
2334  __kmp_msg_null
2335  );
2336  }
2337  if ( tl_nthreads == 1 ) {
2338  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
2339  master_tid ));
2340  return 1;
2341  }
2342  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
2343  master_tid, tl_nthreads ));
2344  new_nthreads = tl_nthreads;
2345  }
2346 
2347 
2348  //
2349  // Check if the threads array is large enough, or needs expanding.
2350  //
2351  // See comment in __kmp_register_root() about the adjustment if
2352  // __kmp_threads[0] == NULL.
2353  //
2354  capacity = __kmp_threads_capacity;
2355  if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
2356  --capacity;
2357  }
2358  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
2359  root->r.r_hot_team->t.t_nproc ) > capacity ) {
2360  //
2361  // Expand the threads array.
2362  //
2363  int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
2364  root->r.r_hot_team->t.t_nproc ) - capacity;
2365  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
2366  if ( slotsAdded < slotsRequired ) {
2367  //
2368  // The threads array was not expanded enough.
2369  //
2370  new_nthreads -= ( slotsRequired - slotsAdded );
2371  KMP_ASSERT( new_nthreads >= 1 );
2372 
2373  //
2374  // If dyn-var is false, emit a 1-time warning.
2375  //
2376  if ( ! get__dynamic_2( parent_team, master_tid )
2377  && ( ! __kmp_reserve_warn ) ) {
2378  __kmp_reserve_warn = 1;
2379  if ( __kmp_tp_cached ) {
2380  __kmp_msg(
2381  kmp_ms_warning,
2382  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
2383  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
2384  KMP_HNT( PossibleSystemLimitOnThreads ),
2385  __kmp_msg_null
2386  );
2387  }
2388  else {
2389  __kmp_msg(
2390  kmp_ms_warning,
2391  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
2392  KMP_HNT( SystemLimitOnThreads ),
2393  __kmp_msg_null
2394  );
2395  }
2396  }
2397  }
2398  }
2399 
2400  if ( new_nthreads == 1 ) {
2401  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
2402  __kmp_get_gtid(), set_nthreads ) );
2403  return 1;
2404  }
2405 
2406  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
2407  __kmp_get_gtid(), new_nthreads, set_nthreads ));
2408  return new_nthreads;
2409 }
2410 
2411 /* ------------------------------------------------------------------------ */
2412 /* ------------------------------------------------------------------------ */
2413 
2414 /* allocate threads from the thread pool and assign them to the new team */
2415 /* we are assured that there are enough threads available, because we
2416  * checked on that earlier within critical section forkjoin */
2417 
2418 static void
2419 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
2420  kmp_info_t *master_th, int master_gtid )
2421 {
2422  int i;
2423 
2424  KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
2425  KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
2426  KMP_MB();
2427 
2428  /* first, let's setup the master thread */
2429  master_th -> th.th_info .ds.ds_tid = 0;
2430  master_th -> th.th_team = team;
2431  master_th -> th.th_team_nproc = team -> t.t_nproc;
2432  master_th -> th.th_team_master = master_th;
2433  master_th -> th.th_team_serialized = FALSE;
2434  master_th -> th.th_dispatch = & team -> t.t_dispatch[ 0 ];
2435 
2436  /* make sure we are not the optimized hot team */
2437  if ( team != root->r.r_hot_team ) {
2438 
2439  /* install the master thread */
2440  team -> t.t_threads[ 0 ] = master_th;
2441  __kmp_initialize_info( master_th, team, 0, master_gtid );
2442 
2443  /* now, install the worker threads */
2444  for ( i=1 ; i < team->t.t_nproc ; i++ ) {
2445 
2446  /* fork or reallocate a new thread and install it in team */
2447  team -> t.t_threads[ i ] = __kmp_allocate_thread( root, team, i );
2448  KMP_DEBUG_ASSERT( team->t.t_threads[i] );
2449  KMP_DEBUG_ASSERT( team->t.t_threads[i]->th.th_team == team );
2450  /* align team and thread arrived states */
2451  KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%u, plain=%u\n",
2452  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
2453  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
2454  team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
2455  team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
2456 
2457  { // Initialize threads' barrier data.
2458  int b;
2459  kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
2460  for ( b = 0; b < bs_last_barrier; ++ b ) {
2461  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
2462  }; // for b
2463  }
2464  }
2465 
2466 #if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)
2467  __kmp_partition_places( team );
2468 #endif
2469 
2470  }
2471 
2472  KMP_MB();
2473 }
2474 
2475 static void
2476 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
2477 
2478 /* most of the work for a fork */
2479 /* return true if we really went parallel, false if serialized */
2480 int
2481 __kmp_fork_call(
2482  ident_t * loc,
2483  int gtid,
2484  int exec_master, // 0 - GNU native code, master doesn't invoke microtask
2485  // 1 - Intel code, master invokes microtask
2486  // 2 - MS native code, use special invoker
2487  kmp_int32 argc,
2488  microtask_t microtask,
2489  launch_t invoker,
2490 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
2491 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
2492  va_list * ap
2493 #else
2494  va_list ap
2495 #endif
2496  )
2497 {
2498  void **argv;
2499  int i;
2500  int master_tid;
2501  int master_this_cons;
2502  int master_last_cons;
2503  kmp_team_t *team;
2504  kmp_team_t *parent_team;
2505  kmp_info_t *master_th;
2506  kmp_root_t *root;
2507  int nthreads;
2508  int master_active;
2509  int master_set_numthreads;
2510  int level;
2511 #if OMP_40_ENABLED
2512  int teams_level;
2513 #endif
2514 
2515  KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
2516 
2517  /* initialize if needed */
2518  KMP_DEBUG_ASSERT( __kmp_init_serial );
2519  if( ! TCR_4(__kmp_init_parallel) )
2520  __kmp_parallel_initialize();
2521 
2522  /* setup current data */
2523  master_th = __kmp_threads[ gtid ];
2524  parent_team = master_th -> th.th_team;
2525  master_tid = master_th -> th.th_info.ds.ds_tid;
2526  master_this_cons = master_th -> th.th_local.this_construct;
2527  master_last_cons = master_th -> th.th_local.last_construct;
2528  root = master_th -> th.th_root;
2529  master_active = root -> r.r_active;
2530  master_set_numthreads = master_th -> th.th_set_nproc;
2531 #if OMP_30_ENABLED
2532  // Nested level will be an index in the nested nthreads array
2533  level = parent_team->t.t_level;
2534 #endif // OMP_30_ENABLED
2535 #if OMP_40_ENABLED
2536  teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
2537 #endif
2538 
2539 
2540  master_th->th.th_ident = loc;
2541 
2542 #if OMP_40_ENABLED
2543  if ( master_th->th.th_team_microtask &&
2544  ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
2545  // AC: This is start of parallel that is nested inside teams construct.
2546  // The team is actual (hot), all workers are ready at the fork barrier.
2547  // No lock needed to initialize the team a bit, then free workers.
2548  parent_team->t.t_ident = loc;
2549  parent_team->t.t_argc = argc;
2550  argv = (void**)parent_team->t.t_argv;
2551  for( i=argc-1; i >= 0; --i )
2552 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
2553 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
2554  *argv++ = va_arg( *ap, void * );
2555 #else
2556  *argv++ = va_arg( ap, void * );
2557 #endif
2558  /* Increment our nested depth levels, but not increase the serialization */
2559  if ( parent_team == master_th->th.th_serial_team ) {
2560  // AC: we are in serialized parallel
2561  __kmpc_serialized_parallel(loc, gtid);
2562  KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
2563  parent_team->t.t_serialized--; // AC: need this in order enquiry functions
2564  // work correctly, will restore at join time
2565  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
2566  return TRUE;
2567  }
2568  parent_team->t.t_pkfn = microtask;
2569  parent_team->t.t_invoke = invoker;
2570  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
2571  parent_team->t.t_active_level ++;
2572  parent_team->t.t_level ++;
2573 
2574 #if USE_ITT_BUILD
2575  if ( ( __itt_frame_begin_v3_ptr && __kmp_forkjoin_frames ) || KMP_ITT_DEBUG )
2576  __kmp_itt_region_forking( gtid );
2577 #endif /* USE_ITT_BUILD */
2578 
2579  KF_TRACE( 10, ( "__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
2580 
2581 
2582  __kmp_internal_fork( loc, gtid, parent_team );
2583  KF_TRACE( 10, ( "__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
2584 
2585  /* Invoke microtask for MASTER thread */
2586  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2587  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
2588 
2589  if (! parent_team->t.t_invoke( gtid )) {
2590  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2591  }
2592  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2593  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
2594  KMP_MB(); /* Flush all pending memory write invalidates. */
2595 
2596  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2597 
2598  return TRUE;
2599  }
2600 #endif /* OMP_40_ENABLED */
2601 
2602 #if OMP_30_ENABLED && KMP_DEBUG
2603  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2604  KMP_DEBUG_ASSERT( master_th->th.th_task_team == parent_team->t.t_task_team );
2605  }
2606 #endif // OMP_30_ENABLED
2607 
2608  /* determine how many new threads we can use */
2609  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2610 
2611 #if OMP_30_ENABLED
2612  if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
2613  nthreads = 1;
2614  }
2615  else
2616 #endif // OMP_30_ENABLED
2617 
2618  {
2619  nthreads = master_set_numthreads ?
2620  master_set_numthreads : get__nproc_2( parent_team, master_tid );
2621  nthreads = __kmp_reserve_threads( root, parent_team, master_tid, nthreads
2622 #if OMP_40_ENABLED
2623  // AC: If we execute teams from parallel region (on host), then teams
2624  // should be created but each can only have 1 thread if nesting is disabled.
2625  // If teams called from serial region, then teams and their threads
2626  // should be created regardless of the nesting setting.
2627  ,( ( ap == NULL && teams_level == 0 ) ||
2628  ( ap && teams_level > 0 && teams_level == level ) )
2629 #endif /* OMP_40_ENABLED */
2630  );
2631  }
2632  KMP_DEBUG_ASSERT( nthreads > 0 );
2633 
2634  /* If we temporarily changed the set number of threads then restore it now */
2635  master_th -> th.th_set_nproc = 0;
2636 
2637 
2638  /* create a serialized parallel region? */
2639  if ( nthreads == 1 ) {
2640  /* josh todo: hypothetical question: what do we do for OS X*? */
2641 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
2642  void * args[ argc ];
2643 #else
2644  void * * args = (void**) alloca( argc * sizeof( void * ) );
2645 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 ) */
2646 
2647  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2648  KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
2649 
2650  __kmpc_serialized_parallel(loc, gtid);
2651 
2652  if ( exec_master == 0 ) {
2653  // we were called from GNU native code
2654  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
2655  return FALSE;
2656  } else if ( exec_master == 1 ) {
2657  /* TODO this sucks, use the compiler itself to pass args! :) */
2658  master_th -> th.th_serial_team -> t.t_ident = loc;
2659 #if OMP_40_ENABLED
2660  if ( !ap ) {
2661  // revert change made in __kmpc_serialized_parallel()
2662  master_th -> th.th_serial_team -> t.t_level--;
2663  // Get args from parent team for teams construct
2664  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
2665  } else if ( microtask == (microtask_t)__kmp_teams_master ) {
2666  KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
2667  team = master_th->th.th_team;
2668  //team->t.t_pkfn = microtask;
2669  team->t.t_invoke = invoker;
2670  __kmp_alloc_argv_entries( argc, team, TRUE );
2671  team->t.t_argc = argc;
2672  argv = (void**) team->t.t_argv;
2673  if ( ap ) {
2674  for( i=argc-1; i >= 0; --i )
2675  /* TODO: revert workaround for Intel(R) 64 tracker #96 */
2676  #if KMP_ARCH_X86_64 && KMP_OS_LINUX
2677  *argv++ = va_arg( *ap, void * );
2678  #else
2679  *argv++ = va_arg( ap, void * );
2680  #endif
2681  } else {
2682  for( i=0; i < argc; ++i )
2683  // Get args from parent team for teams construct
2684  argv[i] = parent_team->t.t_argv[i];
2685  }
2686  // AC: revert change made in __kmpc_serialized_parallel()
2687  // because initial code in teams should have level=0
2688  team->t.t_level--;
2689  // AC: call special invoker for outer "parallel" of the teams construct
2690  invoker(gtid);
2691  } else {
2692 #endif /* OMP_40_ENABLED */
2693  argv = args;
2694  for( i=argc-1; i >= 0; --i )
2695  /* TODO: revert workaround for Intel(R) 64 tracker #96 */
2696  #if KMP_ARCH_X86_64 && KMP_OS_LINUX
2697  *argv++ = va_arg( *ap, void * );
2698  #else
2699  *argv++ = va_arg( ap, void * );
2700  #endif
2701  KMP_MB();
2702  __kmp_invoke_microtask( microtask, gtid, 0, argc, args );
2703 #if OMP_40_ENABLED
2704  }
2705 #endif /* OMP_40_ENABLED */
2706  }
2707  else {
2708  KMP_ASSERT2( exec_master <= 1, "__kmp_fork_call: unknown parameter exec_master" );
2709  }
2710 
2711  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
2712 
2713  KMP_MB();
2714  return FALSE;
2715  }
2716 
2717 #if OMP_30_ENABLED
2718  // GEH: only modify the executing flag in the case when not serialized
2719  // serialized case is handled in kmpc_serialized_parallel
2720  KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
2721  parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
2722  master_th->th.th_current_task->td_icvs.max_active_levels ) );
2723  // TODO: GEH - cannot do this assertion because root thread not set up as executing
2724  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2725  master_th->th.th_current_task->td_flags.executing = 0;
2726 #endif
2727 
2728 #if OMP_40_ENABLED
2729  if ( !master_th->th.th_team_microtask || level > teams_level )
2730 #endif /* OMP_40_ENABLED */
2731  {
2732  /* Increment our nested depth level */
2733  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
2734  }
2735 
2736 #if OMP_30_ENABLED
2737  //
2738  // See if we need to make a copy of the ICVs.
2739  //
2740  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2741  if ( ( level + 1 < __kmp_nested_nth.used ) &&
2742  ( __kmp_nested_nth.nth[level + 1] != nthreads_icv ) ) {
2743  nthreads_icv = __kmp_nested_nth.nth[level + 1];
2744  }
2745  else {
2746  nthreads_icv = 0; // don't update
2747  }
2748 
2749 #if OMP_40_ENABLED
2750  //
2751  // Figure out the proc_bind_policy for the new team.
2752  //
2753  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2754  kmp_proc_bind_t proc_bind_icv; // proc_bind_default means don't update
2755 
2756  if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
2757  proc_bind = proc_bind_false;
2758  proc_bind_icv = proc_bind_default;
2759  }
2760  else {
2761  proc_bind_icv = master_th->th.th_current_task->td_icvs.proc_bind;
2762  if ( proc_bind == proc_bind_default ) {
2763  //
2764  // No proc_bind clause was specified, so use the current value
2765  // of proc-bind-var for this parallel region.
2766  //
2767  proc_bind = proc_bind_icv;
2768  }
2769  else {
2770  //
2771  // The proc_bind policy was specified explicitly on the parallel
2772  // clause. This overrides the proc-bind-var for this parallel
2773  // region, but does not change proc-bind-var.
2774  //
2775  }
2776 
2777  //
2778  // Figure the value of proc-bind-var for the child threads.
2779  //
2780  if ( ( level + 1 < __kmp_nested_proc_bind.used )
2781  && ( __kmp_nested_proc_bind.bind_types[level + 1] != proc_bind_icv ) ) {
2782  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2783  }
2784  else {
2785  proc_bind_icv = proc_bind_default;
2786  }
2787  }
2788 
2789  //
2790  // Reset for next parallel region
2791  //
2792  master_th->th.th_set_proc_bind = proc_bind_default;
2793 #endif /* OMP_40_ENABLED */
2794 
2795  if ( ( nthreads_icv > 0 )
2796 #if OMP_40_ENABLED
2797  || ( proc_bind_icv != proc_bind_default )
2798 #endif /* OMP_40_ENABLED */
2799  )
2800  {
2801  kmp_internal_control_t new_icvs;
2802  copy_icvs( & new_icvs, & master_th->th.th_current_task->td_icvs );
2803  new_icvs.next = NULL;
2804 
2805  if ( nthreads_icv > 0 ) {
2806  new_icvs.nproc = nthreads_icv;
2807  }
2808 
2809 #if OMP_40_ENABLED
2810  if ( proc_bind_icv != proc_bind_default ) {
2811  new_icvs.proc_bind = proc_bind_icv;
2812  }
2813 #endif /* OMP_40_ENABLED */
2814 
2815  /* allocate a new parallel team */
2816  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
2817  team = __kmp_allocate_team(root, nthreads, nthreads,
2818 #if OMP_40_ENABLED
2819  proc_bind,
2820 #endif
2821  &new_icvs, argc );
2822  } else
2823 #endif /* OMP_30_ENABLED */
2824  {
2825  /* allocate a new parallel team */
2826  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
2827  team = __kmp_allocate_team(root, nthreads, nthreads,
2828 #if OMP_40_ENABLED
2829  proc_bind,
2830 #endif
2831 #if OMP_30_ENABLED
2832  &master_th->th.th_current_task->td_icvs,
2833 #else
2834  parent_team->t.t_set_nproc[master_tid],
2835  parent_team->t.t_set_dynamic[master_tid],
2836  parent_team->t.t_set_nested[master_tid],
2837  parent_team->t.t_set_blocktime[master_tid],
2838  parent_team->t.t_set_bt_intervals[master_tid],
2839  parent_team->t.t_set_bt_set[master_tid],
2840 #endif // OMP_30_ENABLED
2841  argc );
2842  }
2843 
2844  KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n",
2845  team ) );
2846 
2847  /* setup the new team */
2848  team->t.t_master_tid = master_tid;
2849  team->t.t_master_this_cons = master_this_cons;
2850  team->t.t_master_last_cons = master_last_cons;
2851 
2852  team->t.t_parent = parent_team;
2853  TCW_SYNC_PTR(team->t.t_pkfn, microtask);
2854  team->t.t_invoke = invoker; /* TODO move this to root, maybe */
2855  team->t.t_ident = loc;
2856 #if OMP_30_ENABLED
2857  // TODO: parent_team->t.t_level == INT_MAX ???
2858 #if OMP_40_ENABLED
2859  if ( !master_th->th.th_team_microtask || level > teams_level ) {
2860 #endif /* OMP_40_ENABLED */
2861  team->t.t_level = parent_team->t.t_level + 1;
2862  team->t.t_active_level = parent_team->t.t_active_level + 1;
2863 #if OMP_40_ENABLED
2864  } else {
2865  // AC: Do not increase parallel level at start of the teams construct
2866  team->t.t_level = parent_team->t.t_level;
2867  team->t.t_active_level = parent_team->t.t_active_level;
2868  }
2869 #endif /* OMP_40_ENABLED */
2870  team->t.t_sched = get__sched_2( parent_team, master_tid ); // set master's schedule as new run-time schedule
2871 
2872 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
2873  if ( __kmp_inherit_fp_control ) {
2874  __kmp_store_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
2875  __kmp_store_mxcsr( &team->t.t_mxcsr );
2876  team->t.t_mxcsr &= KMP_X86_MXCSR_MASK;
2877  team->t.t_fp_control_saved = TRUE;
2878  }
2879  else {
2880  team->t.t_fp_control_saved = FALSE;
2881  }
2882 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2883 
2884  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2885  //
2886  // Set the master thread's task team to the team's task team.
2887  // Unless this is the hot team, it should be NULL.
2888  //
2889  KMP_DEBUG_ASSERT( master_th->th.th_task_team == parent_team->t.t_task_team );
2890  KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2891  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2892  parent_team, team->t.t_task_team, team ) );
2893  master_th->th.th_task_team = team->t.t_task_team;
2894  KMP_DEBUG_ASSERT( ( master_th->th.th_task_team == NULL ) || ( team == root->r.r_hot_team ) ) ;
2895  }
2896 #endif // OMP_30_ENABLED
2897 
2898  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2899  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2900  KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2901  ( team->t.t_master_tid == 0 &&
2902  ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2903  KMP_MB();
2904 
2905  /* now, setup the arguments */
2906  argv = (void**) team -> t.t_argv;
2907 #if OMP_40_ENABLED
2908  if ( ap ) {
2909 #endif /* OMP_40_ENABLED */
2910  for( i=argc-1; i >= 0; --i )
2911 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
2912 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
2913  *argv++ = va_arg( *ap, void * );
2914 #else
2915  *argv++ = va_arg( ap, void * );
2916 #endif
2917 #if OMP_40_ENABLED
2918  } else {
2919  for( i=0; i < argc; ++i )
2920  // Get args from parent team for teams construct
2921  argv[i] = team->t.t_parent->t.t_argv[i];
2922  }
2923 #endif /* OMP_40_ENABLED */
2924 
2925  /* now actually fork the threads */
2926 
2927  team->t.t_master_active = master_active;
2928  if (!root -> r.r_active) /* Only do the assignment if it makes a difference to prevent cache ping-pong */
2929  root -> r.r_active = TRUE;
2930 
2931  __kmp_fork_team_threads( root, team, master_th, gtid );
2932 
2933 
2934  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2935 
2936 
2937 #if USE_ITT_BUILD
2938  if ( ( __itt_frame_begin_v3_ptr && __kmp_forkjoin_frames ) || KMP_ITT_DEBUG )
2939  __kmp_itt_region_forking( gtid );
2940 #endif /* USE_ITT_BUILD */
2941 
2942  /* now go on and do the work */
2943  KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
2944  KMP_MB();
2945 
2946  KF_TRACE( 10, ( "__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", root, team, master_th, gtid ) );
2947 
2948 #if USE_ITT_BUILD
2949  if ( __itt_stack_caller_create_ptr ) {
2950  team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
2951  }
2952 #endif /* USE_ITT_BUILD */
2953 
2954 #if OMP_40_ENABLED
2955  if ( ap ) // AC: skip __kmp_internal_fork at teams construct, let only master threads execute
2956 #endif /* OMP_40_ENABLED */
2957  {
2958  __kmp_internal_fork( loc, gtid, team );
2959  KF_TRACE( 10, ( "__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n", root, team, master_th, gtid ) );
2960  }
2961 
2962  if (! exec_master) {
2963  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2964  return TRUE;
2965  }
2966 
2967  /* Invoke microtask for MASTER thread */
2968  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2969  gtid, team->t.t_id, team->t.t_pkfn ) );
2970 
2971  if (! team->t.t_invoke( gtid )) {
2972  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2973  }
2974  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2975  gtid, team->t.t_id, team->t.t_pkfn ) );
2976  KMP_MB(); /* Flush all pending memory write invalidates. */
2977 
2978  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2979 
2980  return TRUE;
2981 }
2982 
2983 
/*
 * __kmp_join_call -- master-thread side of leaving a parallel region.
 *
 * loc         source location descriptor; stored in master_th->th.th_ident and
 *             forwarded to the join / end-serialized helpers.
 * gtid        global thread id of the caller; used to index __kmp_threads.
 * exit_teams  (OMP_40_ENABLED only) when non-zero, __kmp_internal_join is not
 *             called and the "parallel inside teams" early return is not taken.
 *
 * Three exits are visible in this function:
 *   1. the current team is serialized: adjust t_level / t_serialized for the
 *      teams construct (if any) and delegate to __kmpc_end_serialized_parallel;
 *   2. (OMP 4.0) a parallel region one level inside a teams construct: only the
 *      nesting counters are decremented and the team is left in place;
 *   3. otherwise: the team is freed and the master thread's bookkeeping is
 *      switched back to the parent team while holding __kmp_forkjoin_lock.
 */
2984 void
2985 __kmp_join_call(ident_t *loc, int gtid
2986 #if OMP_40_ENABLED
2987  , int exit_teams
2988 #endif /* OMP_40_ENABLED */
2989 )
2990 {
2991  kmp_team_t *team;
2992  kmp_team_t *parent_team;
2993  kmp_info_t *master_th;
2994  kmp_root_t *root;
2995  int master_active;
// NOTE(review): `i` is declared but never referenced in this function.
2996  int i;
2997 
2998  KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2999 
3000  /* setup current data */
3001  master_th = __kmp_threads[ gtid ];
3002  root = master_th -> th.th_root;
3003  team = master_th -> th.th_team;
3004  parent_team = team->t.t_parent;
3005 
3006  master_th->th.th_ident = loc;
3007 
3008 #if OMP_30_ENABLED && KMP_DEBUG
// Debug-only consistency check: the thread and the team being joined must
// refer to the same task team.
3009  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3010  KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
3011  __kmp_gtid_from_thread( master_th ), team,
3012  team -> t.t_task_team, master_th->th.th_task_team) );
3013  KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team );
3014  }
3015 #endif // OMP_30_ENABLED
3016 
// Exit 1: serialized region. No team is freed here; after the teams-related
// counter adjustments everything is left to __kmpc_end_serialized_parallel.
3017  if( team->t.t_serialized ) {
3018 #if OMP_40_ENABLED
3019  if ( master_th->th.th_team_microtask ) {
3020  // We are in teams construct
3021  int level = team->t.t_level;
3022  int tlevel = master_th->th.th_teams_level;
3023  if ( level == tlevel ) {
3024  // AC: we haven't incremented it earlier at start of teams construct,
3025  // so do it here - at the end of teams construct
3026  team->t.t_level++;
3027  } else if ( level == tlevel + 1 ) {
3028  // AC: we are exiting parallel inside teams, need to increment serialization
3029  // in order to restore it in the next call to __kmpc_end_serialized_parallel
3030  team->t.t_serialized++;
3031  }
3032  }
3033 #endif /* OMP_40_ENABLED */
3034  __kmpc_end_serialized_parallel( loc, gtid );
3035  return;
3036  }
3037 
// Read t_master_active now; it is written back to root->r.r_active further
// down, just before the team is handed to __kmp_free_team.
3038  master_active = team->t.t_master_active;
3039 
3040 #if OMP_40_ENABLED
3041  if (!exit_teams)
3042 #endif /* OMP_40_ENABLED */
3043  {
3044  // AC: No barrier for internal teams at exit from teams construct.
3045  // But there is barrier for external team (league).
3046  __kmp_internal_join( loc, gtid, team );
3047  }
3048  KMP_MB();
3049 
3050 #if USE_ITT_BUILD
3051  if ( __itt_stack_caller_create_ptr ) {
3052  __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
3053  }
3054 
3055  if ( ( __itt_frame_end_v3_ptr && __kmp_forkjoin_frames ) || KMP_ITT_DEBUG )
3056  __kmp_itt_region_joined( gtid );
3057 #endif /* USE_ITT_BUILD */
3058 
3059 #if OMP_40_ENABLED
// Exit 2: a parallel region exactly one level below the teams construct whose
// microtask is not __kmp_teams_master. Returns without taking the fork/join
// lock and without freeing the team.
3060  if ( master_th->th.th_team_microtask &&
3061  !exit_teams &&
3062  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
3063  team->t.t_level == master_th->th.th_teams_level + 1 ) {
3064  // AC: We need to leave the team structure intact at the end
3065  // of parallel inside the teams construct, so that at the next
3066  // parallel same (hot) team works, only adjust nesting levels
3067 
3068  /* Decrement our nested depth level */
3069  team->t.t_level --;
3070  team->t.t_active_level --;
3071  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
3072  return;
3073  }
3074 #endif /* OMP_40_ENABLED */
// Exit 3 (general case) starts here.
3075  /* do cleanup and restore the parent team */
3076  master_th -> th.th_info .ds.ds_tid = team -> t.t_master_tid;
3077  master_th -> th.th_local.this_construct = team -> t.t_master_this_cons;
3078  master_th -> th.th_local.last_construct = team -> t.t_master_last_cons;
3079 
3080  master_th -> th.th_dispatch =
3081  & parent_team -> t.t_dispatch[ team -> t.t_master_tid ];
3082 
3083  /* jc: The following lock has instructions with REL and ACQ semantics,
3084  separating the parallel user code called in this parallel region
3085  from the serial user code called after this function returns.
3086  */
3087  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3088 
3089 #if OMP_40_ENABLED
// Same condition as the one guarding the matching increment of r_in_parallel
// in __kmp_fork_call.
3090  if ( !master_th->th.th_team_microtask || team->t.t_level > master_th->th.th_teams_level )
3091 #endif /* OMP_40_ENABLED */
3092  {
3093  /* Decrement our nested depth level */
3094  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
3095  }
3096  KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
3097 
3098  #if OMP_30_ENABLED
// NOTE(review): the T# argument of this trace is the literal 0, not gtid.
3099  KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
3100  0, master_th, team ) );
3101  __kmp_pop_current_task_from_thread( master_th );
3102  #endif // OMP_30_ENABLED
3103 
3104 #if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)
3105  //
3106  // Restore master thread's partition.
3107  //
3108  master_th -> th.th_first_place = team -> t.t_first_place;
3109  master_th -> th.th_last_place = team -> t.t_last_place;
3110 #endif /* OMP_40_ENABLED */
3111 
3112 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Reload the x87 control word and MXCSR from the values stored in the team,
// but only if the team actually recorded them (t_fp_control_saved).
3113  if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
3114  __kmp_clear_x87_fpu_status_word();
3115  __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
3116  __kmp_load_mxcsr( &team->t.t_mxcsr );
3117  }
3118 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3119 
// Write r_active only when the value actually changes.
3120  if ( root -> r.r_active != master_active )
3121  root -> r.r_active = master_active;
3122 
3123  __kmp_free_team( root, team ); /* this will free worker threads */
3124 
3125  /* this race was fun to find. make sure the following is in the critical
3126  * region otherwise assertions may fail occasionally since the old team
3127  * may be reallocated and the hierarchy appears inconsistent. it is
3128  * actually safe to run and won't cause any bugs, but will cause those
3129  * assertion failures. it's only one deref&assign so might as well put this
3130  * in the critical region */
3131  master_th -> th.th_team = parent_team;
3132  master_th -> th.th_team_nproc = parent_team -> t.t_nproc;
3133  master_th -> th.th_team_master = parent_team -> t.t_threads[0];
3134  master_th -> th.th_team_serialized = parent_team -> t.t_serialized;
3135 
// If the parent is a serialized team that is neither this thread's current
// serial team nor the root team, the current serial team is freed and the
// parent becomes th_serial_team.
3136  /* restore serialized team, if need be */
3137  if( parent_team -> t.t_serialized &&
3138  parent_team != master_th->th.th_serial_team &&
3139  parent_team != root->r.r_root_team ) {
3140  __kmp_free_team( root, master_th -> th.th_serial_team );
3141  master_th -> th.th_serial_team = parent_team;
3142  }
3143 
3144 #if OMP_30_ENABLED
3145  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3146  //
3147  // Copy the task team from the new child / old parent team
3148  // to the thread. If non-NULL, copy the state flag also.
3149  //
3150  if ( ( master_th -> th.th_task_team = parent_team -> t.t_task_team ) != NULL ) {
3151  master_th -> th.th_task_state = master_th -> th.th_task_team -> tt.tt_state;
3152  }
3153  KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
3154  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
3155  parent_team ) );
3156  }
3157 #endif /* OMP_30_ENABLED */
3158 
3159  #if OMP_30_ENABLED
// Counterpart of the "executing = 0" store done in __kmp_fork_call for the
// non-serialized case.
3160  // TODO: GEH - cannot do this assertion because root thread not set up as executing
3161  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
3162  master_th->th.th_current_task->td_flags.executing = 1;
3163  #endif // OMP_30_ENABLED
3164 
3165  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3166 
3167  KMP_MB();
3168  KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
3169 }
3170 
3171 /* ------------------------------------------------------------------------ */
3172 /* ------------------------------------------------------------------------ */
3173 
3174 /* Check whether we should push an internal control record onto the
3175  serial team stack. If so, do it. */
3176 void
3177 __kmp_save_internal_controls ( kmp_info_t * thread )
3178 {
3179 
3180  if ( thread -> th.th_team != thread -> th.th_serial_team ) {
3181  return;
3182  }
3183  if (thread -> th.th_team -> t.t_serialized > 1) {
3184  int push = 0;
3185 
3186  if (thread -> th.th_team -> t.t_control_stack_top == NULL) {
3187  push = 1;
3188  } else {
3189  if ( thread -> th.th_team -> t.t_control_stack_top -> serial_nesting_level !=
3190  thread -> th.th_team -> t.t_serialized ) {
3191  push = 1;
3192  }
3193  }
3194  if (push) { /* push a record on the serial team's stack */
3195  kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
3196 
3197 #if OMP_30_ENABLED
3198  copy_icvs( control, & thread->th.th_current_task->td_icvs );
3199 #else
3200  control->nproc = thread->th.th_team->t.t_set_nproc[0];
3201  control->dynamic = thread->th.th_team->t.t_set_dynamic[0];
3202  control->nested = thread->th.th_team->t.t_set_nested[0];
3203  control->blocktime = thread->th.th_team->t.t_set_blocktime[0];
3204  control->bt_intervals = thread->th.th_team->t.t_set_bt_intervals[0];
3205  control->bt_set = thread->th.th_team->t.t_set_bt_set[0];
3206 #endif // OMP_30_ENABLED
3207 
3208  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
3209 
3210  control->next = thread -> th.th_team -> t.t_control_stack_top;
3211  thread -> th.th_team -> t.t_control_stack_top = control;
3212  }
3213  }
3214 }
3215 
3216 /* Changes set_nproc */
3217 void
3218 __kmp_set_num_threads( int new_nth, int gtid )
3219 {
3220  kmp_info_t *thread;
3221  kmp_root_t *root;
3222 
3223  KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
3224  KMP_DEBUG_ASSERT( __kmp_init_serial );
3225 
3226  if (new_nth < 1)
3227  new_nth = 1;
3228  else if (new_nth > __kmp_max_nth)
3229  new_nth = __kmp_max_nth;
3230 
3231  thread = __kmp_threads[gtid];
3232 
3233  __kmp_save_internal_controls( thread );
3234 
3235  set__nproc( thread, new_nth );
3236 
3237  //
3238  // If this omp_set_num_threads() call will cause the hot team size to be
3239  // reduced (in the absence of a num_threads clause), then reduce it now,
3240  // rather than waiting for the next parallel region.
3241  //
3242  root = thread->th.th_root;
3243  if ( __kmp_init_parallel && ( ! root->r.r_active )
3244  && ( root->r.r_hot_team->t.t_nproc > new_nth ) ) {
3245  kmp_team_t *hot_team = root->r.r_hot_team;
3246  int f;
3247 
3248  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3249 
3250 
3251 #if OMP_30_ENABLED
3252  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3253  kmp_task_team_t *task_team = hot_team->t.t_task_team;
3254  if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
3255  //
3256  // Signal the worker threads (esp. the extra ones) to stop
3257  // looking for tasks while spin waiting. The task teams
3258  // are reference counted and will be deallocated by the
3259  // last worker thread.
3260  //
3261  KMP_DEBUG_ASSERT( hot_team->t.t_nproc > 1 );
3262  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
3263  KMP_MB();
3264 
3265  KA_TRACE( 20, ( "__kmp_set_num_threads: setting task_team %p to NULL\n",
3266  &hot_team->t.t_task_team ) );
3267  hot_team->t.t_task_team = NULL;
3268  }
3269  else {
3270  KMP_DEBUG_ASSERT( task_team == NULL );
3271  }
3272  }
3273 #endif // OMP_30_ENABLED
3274 
3275  //
3276  // Release the extra threads we don't need any more.
3277  //
3278  for ( f = new_nth; f < hot_team->t.t_nproc; f++ ) {
3279  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
3280  __kmp_free_thread( hot_team->t.t_threads[f] );
3281  hot_team->t.t_threads[f] = NULL;
3282  }
3283  hot_team->t.t_nproc = new_nth;
3284 
3285 
3286  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3287 
3288  //
3289  // Update the t_nproc field in the threads that are still active.
3290  //
3291  for( f=0 ; f < new_nth; f++ ) {
3292  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
3293  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
3294  }
3295 #if KMP_MIC
3296  // Special flag in case omp_set_num_threads() call
3297  hot_team -> t.t_size_changed = -1;
3298 #endif
3299  }
3300 
3301 }
3302 
3303 #if OMP_30_ENABLED
3304 /* Changes max_active_levels */
3305 void
3306 __kmp_set_max_active_levels( int gtid, int max_active_levels )
3307 {
3308  kmp_info_t *thread;
3309 
3310  KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
3311  KMP_DEBUG_ASSERT( __kmp_init_serial );
3312 
3313  // validate max_active_levels
3314  if( max_active_levels < 0 ) {
3315  KMP_WARNING( ActiveLevelsNegative, max_active_levels );
3316  // We ignore this call if the user has specified a negative value.
3317  // The current setting won't be changed. The last valid setting will be used.
3318  // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
3319  KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
3320  return;
3321  }
3322  if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
3323  // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
3324  // We allow a zero value. (implementation defined behavior)
3325  } else {
3326  KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT );
3327  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
3328  // Current upper limit is MAX_INT. (implementation defined behavior)
3329  // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
3330  // Actually, the flow should never get here until we use MAX_INT limit.
3331  }
3332  KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
3333 
3334  thread = __kmp_threads[ gtid ];
3335 
3336  __kmp_save_internal_controls( thread );
3337 
3338  set__max_active_levels( thread, max_active_levels );
3339 
3340 }
3341 
3342 /* Gets max_active_levels */
3343 int
3344 __kmp_get_max_active_levels( int gtid )
3345 {
3346  kmp_info_t *thread;
3347 
3348  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
3349  KMP_DEBUG_ASSERT( __kmp_init_serial );
3350 
3351  thread = __kmp_threads[ gtid ];
3352  KMP_DEBUG_ASSERT( thread -> th.th_current_task );
3353  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
3354  gtid, thread -> th.th_current_task, thread -> th.th_current_task -> td_icvs.max_active_levels ) );
3355  return thread -> th.th_current_task -> td_icvs.max_active_levels;
3356 }
3357 
3358 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
3359 void
3360 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
3361 {
3362  kmp_info_t *thread;
3363 // kmp_team_t *team;
3364 
3365  KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
3366  KMP_DEBUG_ASSERT( __kmp_init_serial );
3367 
3368  // Check if the kind parameter is valid, correct if needed.
3369  // Valid parameters should fit in one of two intervals - standard or extended:
3370  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
3371  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
3372  if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
3373  ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
3374  {
3375  // TODO: Hint needs attention in case we change the default schedule.
3376  __kmp_msg(
3377  kmp_ms_warning,
3378  KMP_MSG( ScheduleKindOutOfRange, kind ),
3379  KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
3380  __kmp_msg_null
3381  );
3382  kind = kmp_sched_default;
3383  chunk = 0; // ignore chunk value in case of bad kind
3384  }
3385 
3386  thread = __kmp_threads[ gtid ];
3387 
3388  __kmp_save_internal_controls( thread );
3389 
3390  if ( kind < kmp_sched_upper_std ) {
3391  if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
3392  // differ static chunked vs. unchunked:
3393  // chunk should be invalid to indicate unchunked schedule (which is the default)
3394  thread -> th.th_current_task -> td_icvs.sched.r_sched_type = kmp_sch_static;
3395  } else {
3396  thread -> th.th_current_task -> td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
3397  }
3398  } else {
3399  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
3400  thread -> th.th_current_task -> td_icvs.sched.r_sched_type =
3401  __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
3402  }
3403  if ( kind == kmp_sched_auto ) {
3404  // ignore parameter chunk for schedule auto
3405  thread -> th.th_current_task -> td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
3406  } else {
3407  thread -> th.th_current_task -> td_icvs.sched.chunk = chunk;
3408  }
3409 }
3410 
3411 /* Gets def_sched_var ICV values */
3412 void
3413 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
3414 {
3415  kmp_info_t *thread;
3416  enum sched_type th_type;
3417  int i;
3418 
3419  KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
3420  KMP_DEBUG_ASSERT( __kmp_init_serial );
3421 
3422  thread = __kmp_threads[ gtid ];
3423 
3424  //th_type = thread -> th.th_team -> t.t_set_sched[ thread->th.th_info.ds.ds_tid ].r_sched_type;
3425  th_type = thread -> th.th_current_task -> td_icvs.sched.r_sched_type;
3426 
3427  switch ( th_type ) {
3428  case kmp_sch_static:
3429  case kmp_sch_static_greedy:
3430  case kmp_sch_static_balanced:
3431  *kind = kmp_sched_static;
3432  *chunk = 0; // chunk was not set, try to show this fact via zero value
3433  return;
3434  case kmp_sch_static_chunked:
3435  *kind = kmp_sched_static;
3436  break;
3437  case kmp_sch_dynamic_chunked:
3438  *kind = kmp_sched_dynamic;
3439  break;
3441  case kmp_sch_guided_iterative_chunked:
3442  case kmp_sch_guided_analytical_chunked:
3443  *kind = kmp_sched_guided;
3444  break;
3445  case kmp_sch_auto:
3446  *kind = kmp_sched_auto;
3447  break;
3448  case kmp_sch_trapezoidal:
3449  *kind = kmp_sched_trapezoidal;
3450  break;
3451 /*
3452  case kmp_sch_static_steal:
3453  *kind = kmp_sched_static_steal;
3454  break;
3455 */
3456  default:
3457  KMP_FATAL( UnknownSchedulingType, th_type );
3458  }
3459 
3460  //*chunk = thread -> th.th_team -> t.t_set_sched[ thread->th.th_info.ds.ds_tid ].chunk;
3461  *chunk = thread -> th.th_current_task -> td_icvs.sched.chunk;
3462 }
3463 
3464 int
3465 __kmp_get_ancestor_thread_num( int gtid, int level ) {
3466 
3467  int ii, dd;
3468  kmp_team_t *team;
3469  kmp_info_t *thr;
3470 
3471  KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
3472  KMP_DEBUG_ASSERT( __kmp_init_serial );
3473 
3474  // validate level
3475  if( level == 0 ) return 0;
3476  if( level < 0 ) return -1;
3477  thr = __kmp_threads[ gtid ];
3478  team = thr->th.th_team;
3479  ii = team -> t.t_level;
3480  if( level > ii ) return -1;
3481 
3482 #if OMP_40_ENABLED
3483  if( thr->th.th_team_microtask ) {
3484  // AC: we are in teams region where multiple nested teams have same level
3485  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3486  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
3487  KMP_DEBUG_ASSERT( ii >= tlevel );
3488  // AC: As we need to pass by the teams league, we need to artificially increase ii
3489  if ( ii == tlevel ) {
3490  ii += 2; // three teams have same level
3491  } else {
3492  ii ++; // two teams have same level
3493  }
3494  }
3495  }
3496 #endif
3497 
3498  if( ii == level ) return __kmp_tid_from_gtid( gtid );
3499 
3500  dd = team -> t.t_serialized;
3501  level++;
3502  while( ii > level )
3503  {
3504  for( dd = team -> t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
3505  {
3506  }
3507  if( ( team -> t.t_serialized ) && ( !dd ) ) {
3508  team = team->t.t_parent;
3509  continue;
3510  }
3511  if( ii > level ) {
3512  team = team->t.t_parent;
3513  dd = team -> t.t_serialized;
3514  ii--;
3515  }
3516  }
3517 
3518  return ( dd > 1 ) ? ( 0 ) : ( team -> t.t_master_tid );
3519 }
3520 
3521 int
3522 __kmp_get_team_size( int gtid, int level ) {
3523 
3524  int ii, dd;
3525  kmp_team_t *team;
3526  kmp_info_t *thr;
3527 
3528  KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
3529  KMP_DEBUG_ASSERT( __kmp_init_serial );
3530 
3531  // validate level
3532  if( level == 0 ) return 1;
3533  if( level < 0 ) return -1;
3534  thr = __kmp_threads[ gtid ];
3535  team = thr->th.th_team;
3536  ii = team -> t.t_level;
3537  if( level > ii ) return -1;
3538 
3539 #if OMP_40_ENABLED
3540  if( thr->th.th_team_microtask ) {
3541  // AC: we are in teams region where multiple nested teams have same level
3542  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3543  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
3544  KMP_DEBUG_ASSERT( ii >= tlevel );
3545  // AC: As we need to pass by the teams league, we need to artificially increase ii
3546  if ( ii == tlevel ) {
3547  ii += 2; // three teams have same level
3548  } else {
3549  ii ++; // two teams have same level
3550  }
3551  }
3552  }
3553 #endif
3554 
3555  while( ii > level )
3556  {
3557  for( dd = team -> t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
3558  {
3559  }
3560  if( team -> t.t_serialized && ( !dd ) ) {
3561  team = team->t.t_parent;
3562  continue;
3563  }
3564  if( ii > level ) {
3565  team = team->t.t_parent;
3566  ii--;
3567  }
3568  }
3569 
3570  return team -> t.t_nproc;
3571 }
3572 
3573 #endif // OMP_30_ENABLED
3574 
3575 kmp_r_sched_t
3576 __kmp_get_schedule_global() {
3577 // This routine created because pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
3578 // may be changed by kmp_set_defaults independently. So one can get the updated schedule here.
3579 
3580  kmp_r_sched_t r_sched;
3581 
3582  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
3583  // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
3584  // and thus have different run-time schedules in different roots (even in OMP 2.5)
3585  if ( __kmp_sched == kmp_sch_static ) {
3586  r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
3587  } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
3588  r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
3589  } else {
3590  r_sched.r_sched_type = __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3591  }
3592 
3593  if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was not ever set)
3594  r_sched.chunk = KMP_DEFAULT_CHUNK;
3595  } else {
3596  r_sched.chunk = __kmp_chunk;
3597  }
3598 
3599  return r_sched;
3600 }
3601 
3602 /* ------------------------------------------------------------------------ */
3603 /* ------------------------------------------------------------------------ */
3604 
3605 
3606 /*
3607  * Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3608  * at least argc number of *t_argv entries for the requested team.
3609  */
3610 static void
3611 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
3612 {
3613 
3614  KMP_DEBUG_ASSERT( team );
3615  if( !realloc || argc > team -> t.t_max_argc ) {
3616 
3617  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
3618  team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
3619 #if (KMP_PERF_V106 == KMP_ON)
3620  /* if previously allocated heap space for args, free them */
3621  if ( realloc && team -> t.t_argv != &team -> t.t_inline_argv[0] )
3622  __kmp_free( (void *) team -> t.t_argv );
3623 
3624  if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
3625  /* use unused space in the cache line for arguments */
3626  team -> t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3627  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
3628  team->t.t_id, team->t.t_max_argc ));
3629  team -> t.t_argv = &team -> t.t_inline_argv[0];
3630  if ( __kmp_storage_map ) {
3631  __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
3632  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3633  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
3634  "team_%d.t_inline_argv",
3635  team->t.t_id );
3636  }
3637  } else {
3638  /* allocate space for arguments in the heap */
3639  team -> t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
3640  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
3641  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
3642  team->t.t_id, team->t.t_max_argc ));
3643  team -> t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
3644  if ( __kmp_storage_map ) {
3645  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
3646  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
3647  team->t.t_id );
3648  }
3649  }
3650 #else /* KMP_PERF_V106 == KMP_OFF */
3651  if ( realloc )
3652  __kmp_free( (void*) team -> t.t_argv );
3653  team -> t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
3654  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
3655  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
3656  team->t.t_id, team->t.t_max_argc ));
3657  team -> t.t_argv = __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
3658  if ( __kmp_storage_map ) {
3659  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
3660  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv", team->t.t_id );
3661  }
3662 #endif /* KMP_PERF_V106 */
3663 
3664  }
3665 }
3666 
3667 static void
3668 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
3669 {
3670  int i;
3671  int num_disp_buff = max_nth > 1 ? KMP_MAX_DISP_BUF : 2;
3672 #if KMP_USE_POOLED_ALLOC
3673  char *ptr = __kmp_allocate(max_nth *
3674  ( sizeof(kmp_info_t*) + sizeof(dispatch_shared_info_t)*2
3675  + sizeof(kmp_disp_t) + sizeof(int)*6
3676 # if OMP_30_ENABLED
3677  //+ sizeof(int)
3678  + sizeof(kmp_r_sched_t)
3679  + sizeof(kmp_taskdata_t)
3680 # endif // OMP_30_ENABLED
3681  ) );
3682 
3683  team -> t.t_threads = (kmp_info_t**) ptr; ptr += sizeof(kmp_info_t*) * max_nth;
3684  team -> t.t_disp_buffer = (dispatch_shared_info_t*) ptr;
3685  ptr += sizeof(dispatch_shared_info_t) * num_disp_buff;
3686  team -> t.t_dispatch = (kmp_disp_t*) ptr; ptr += sizeof(kmp_disp_t) * max_nth;
3687  team -> t.t_set_nproc = (int*) ptr; ptr += sizeof(int) * max_nth;
3688  team -> t.t_set_dynamic = (int*) ptr; ptr += sizeof(int) * max_nth;
3689  team -> t.t_set_nested = (int*) ptr; ptr += sizeof(int) * max_nth;
3690  team -> t.t_set_blocktime = (int*) ptr; ptr += sizeof(int) * max_nth;
3691  team -> t.t_set_bt_intervals = (int*) ptr; ptr += sizeof(int) * max_nth;
3692  team -> t.t_set_bt_set = (int*) ptr;
3693 # if OMP_30_ENABLED
3694  ptr += sizeof(int) * max_nth;
3695  //team -> t.t_set_max_active_levels = (int*) ptr; ptr += sizeof(int) * max_nth;
3696  team -> t.t_set_sched = (kmp_r_sched_t*) ptr;
3697  ptr += sizeof(kmp_r_sched_t) * max_nth;
3698  team -> t.t_implicit_task_taskdata = (kmp_taskdata_t*) ptr;
3699  ptr += sizeof(kmp_taskdata_t) * max_nth;
3700 # endif // OMP_30_ENABLED
3701 #else
3702 
3703  team -> t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
3704  team -> t.t_disp_buffer = (dispatch_shared_info_t*)
3705  __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
3706  team -> t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
3707  #if OMP_30_ENABLED
3708  //team -> t.t_set_max_active_levels = (int*) __kmp_allocate( sizeof(int) * max_nth );
3709  //team -> t.t_set_sched = (kmp_r_sched_t*) __kmp_allocate( sizeof(kmp_r_sched_t) * max_nth );
3710  team -> t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
3711  #else
3712  team -> t.t_set_nproc = (int*) __kmp_allocate( sizeof(int) * max_nth );
3713  team -> t.t_set_dynamic = (int*) __kmp_allocate( sizeof(int) * max_nth );
3714  team -> t.t_set_nested = (int*) __kmp_allocate( sizeof(int) * max_nth );
3715  team -> t.t_set_blocktime = (int*) __kmp_allocate( sizeof(int) * max_nth );
3716  team -> t.t_set_bt_intervals = (int*) __kmp_allocate( sizeof(int) * max_nth );
3717  team -> t.t_set_bt_set = (int*) __kmp_allocate( sizeof(int) * max_nth );
3718 # endif // OMP_30_ENABLED
3719 #endif
3720  team->t.t_max_nproc = max_nth;
3721 
3722  /* setup dispatch buffers */
3723  for(i = 0 ; i < num_disp_buff; ++i)
3724  team -> t.t_disp_buffer[i].buffer_index = i;
3725 }
3726 
3727 static void
3728 __kmp_free_team_arrays(kmp_team_t *team) {
3729  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3730  int i;
3731  for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
3732  if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
3733  __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
3734  team->t.t_dispatch[ i ].th_disp_buffer = NULL;
3735  }; // if
3736  }; // for
3737  __kmp_free(team->t.t_threads);
3738  #if !KMP_USE_POOLED_ALLOC
3739  __kmp_free(team->t.t_disp_buffer);
3740  __kmp_free(team->t.t_dispatch);
3741  #if OMP_30_ENABLED
3742  //__kmp_free(team->t.t_set_max_active_levels);
3743  //__kmp_free(team->t.t_set_sched);
3744  __kmp_free(team->t.t_implicit_task_taskdata);
3745  #else
3746  __kmp_free(team->t.t_set_nproc);
3747  __kmp_free(team->t.t_set_dynamic);
3748  __kmp_free(team->t.t_set_nested);
3749  __kmp_free(team->t.t_set_blocktime);
3750  __kmp_free(team->t.t_set_bt_intervals);
3751  __kmp_free(team->t.t_set_bt_set);
3752  # endif // OMP_30_ENABLED
3753  #endif
3754  team->t.t_threads = NULL;
3755  team->t.t_disp_buffer = NULL;
3756  team->t.t_dispatch = NULL;
3757 #if OMP_30_ENABLED
3758  //team->t.t_set_sched = 0;
3759  //team->t.t_set_max_active_levels = 0;
3760  team->t.t_implicit_task_taskdata = 0;
3761 #else
3762  team->t.t_set_nproc = 0;
3763  team->t.t_set_dynamic = 0;
3764  team->t.t_set_nested = 0;
3765  team->t.t_set_blocktime = 0;
3766  team->t.t_set_bt_intervals = 0;
3767  team->t.t_set_bt_set = 0;
3768 #endif // OMP_30_ENABLED
3769 }
3770 
3771 static void
3772 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3773  kmp_info_t **oldThreads = team->t.t_threads;
3774 
3775  #if !KMP_USE_POOLED_ALLOC
3776  __kmp_free(team->t.t_disp_buffer);
3777  __kmp_free(team->t.t_dispatch);
3778  #if OMP_30_ENABLED
3779  //__kmp_free(team->t.t_set_max_active_levels);
3780  //__kmp_free(team->t.t_set_sched);
3781  __kmp_free(team->t.t_implicit_task_taskdata);
3782  #else
3783  __kmp_free(team->t.t_set_nproc);
3784  __kmp_free(team->t.t_set_dynamic);
3785  __kmp_free(team->t.t_set_nested);
3786  __kmp_free(team->t.t_set_blocktime);
3787  __kmp_free(team->t.t_set_bt_intervals);
3788  __kmp_free(team->t.t_set_bt_set);
3789  # endif // OMP_30_ENABLED
3790  #endif
3791  __kmp_allocate_team_arrays(team, max_nth);
3792 
3793  memcpy(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3794 
3795  __kmp_free(oldThreads);
3796 }
3797 
3798 static kmp_internal_control_t
3799 __kmp_get_global_icvs( void ) {
3800 
3801 #if OMP_30_ENABLED
3802  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3803 #endif /* OMP_30_ENABLED */
3804 
3805 #if OMP_40_ENABLED
3806  KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3807 #endif /* OMP_40_ENABLED */
3808 
3809  kmp_internal_control_t g_icvs = {
3810  0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3811  __kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread)
3812  __kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread)
3813  __kmp_dflt_team_nth,
3814  //int nproc; //internal control for # of threads for next parallel region (per thread)
3815  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3816  __kmp_dflt_blocktime, //int blocktime; //internal control for blocktime
3817  __kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals
3818  __kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set
3819 #if OMP_30_ENABLED
3820  __kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels
3821  r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair
3822 #endif /* OMP_30_ENABLED */
3823 #if OMP_40_ENABLED
3824  __kmp_nested_proc_bind.bind_types[0],
3825 #endif /* OMP_40_ENABLED */
3826  NULL //struct kmp_internal_control *next;
3827  };
3828 
3829  return g_icvs;
3830 }
3831 
3832 static kmp_internal_control_t
3833 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3834 
3835  #if OMP_30_ENABLED
3836  kmp_internal_control_t gx_icvs;
3837  gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serial like in save_inter_controls
3838  copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3839  gx_icvs.next = NULL;
3840  #else
3841  kmp_internal_control_t gx_icvs =
3842  {
3843  0,
3844  team->t.t_set_nested[0],
3845  team->t.t_set_dynamic[0],
3846  team->t.t_set_nproc[0],
3847  team->t.t_set_blocktime[0],
3848  team->t.t_set_bt_intervals[0],
3849  team->t.t_set_bt_set[0],
3850  NULL //struct kmp_internal_control *next;
3851  };
3852  #endif // OMP_30_ENABLED
3853 
3854  return gx_icvs;
3855 }
3856 
3857 static void
3858 __kmp_initialize_root( kmp_root_t *root )
3859 {
3860  int f;
3861  kmp_team_t *root_team;
3862  kmp_team_t *hot_team;
3863  size_t disp_size, dispatch_size, bar_size;
3864  int hot_team_max_nth;
3865 #if OMP_30_ENABLED
3866  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3867  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3868 #endif // OMP_30_ENABLED
3869  KMP_DEBUG_ASSERT( root );
3870  KMP_ASSERT( ! root->r.r_begin );
3871 
3872  /* setup the root state structure */
3873  __kmp_init_lock( &root->r.r_begin_lock );
3874  root -> r.r_begin = FALSE;
3875  root -> r.r_active = FALSE;
3876  root -> r.r_in_parallel = 0;
3877  root -> r.r_blocktime = __kmp_dflt_blocktime;
3878  root -> r.r_nested = __kmp_dflt_nested;
3879 
3880  /* setup the root team for this task */
3881  /* allocate the root team structure */
3882  KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
3883  root_team =
3884  __kmp_allocate_team(
3885  root,
3886  1, // new_nproc
3887  1, // max_nproc
3888 #if OMP_40_ENABLED
3889  __kmp_nested_proc_bind.bind_types[0],
3890 #endif
3891 #if OMP_30_ENABLED
3892  &r_icvs,
3893 #else
3894  __kmp_dflt_team_nth_ub, // num_treads
3895  __kmp_global.g.g_dynamic, // dynamic
3896  __kmp_dflt_nested, // nested
3897  __kmp_dflt_blocktime, // blocktime
3898  __kmp_bt_intervals, // bt_intervals
3899  __kmp_env_blocktime, // bt_set
3900 #endif // OMP_30_ENABLED
3901  0 // argc
3902  );
3903 
3904  KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
3905 
3906  root -> r.r_root_team = root_team;
3907  root_team -> t.t_control_stack_top = NULL;
3908 
3909  /* initialize root team */
3910  root_team -> t.t_threads[0] = NULL;
3911  root_team -> t.t_nproc = 1;
3912  root_team -> t.t_serialized = 1;
3913 #if OMP_30_ENABLED
3914  // TODO???: root_team -> t.t_max_active_levels = __kmp_dflt_max_active_levels;
3915  root_team -> t.t_sched.r_sched_type = r_sched.r_sched_type;
3916  root_team -> t.t_sched.chunk = r_sched.chunk;
3917 #endif // OMP_30_ENABLED
3918  KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3919  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3920 
3921  /* setup the hot team for this task */
3922  /* allocate the hot team structure */
3923  KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
3924  hot_team =
3925  __kmp_allocate_team(
3926  root,
3927  1, // new_nproc
3928  __kmp_dflt_team_nth_ub * 2, // max_nproc
3929 #if OMP_40_ENABLED
3930  __kmp_nested_proc_bind.bind_types[0],
3931 #endif
3932 #if OMP_30_ENABLED
3933  &r_icvs,
3934 #else
3935  __kmp_dflt_team_nth_ub, // num_treads
3936  __kmp_global.g.g_dynamic, // dynamic
3937  __kmp_dflt_nested, // nested
3938  __kmp_dflt_blocktime, // blocktime
3939  __kmp_bt_intervals, // bt_intervals
3940  __kmp_env_blocktime, // bt_set
3941 #endif // OMP_30_ENABLED
3942  0 // argc
3943  );
3944  KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
3945 
3946  root -> r.r_hot_team = hot_team;
3947  root_team -> t.t_control_stack_top = NULL;
3948 
3949  /* first-time initialization */
3950  hot_team -> t.t_parent = root_team;
3951 
3952  /* initialize hot team */
3953  hot_team_max_nth = hot_team->t.t_max_nproc;
3954  for ( f = 0; f < hot_team_max_nth; ++ f ) {
3955  hot_team -> t.t_threads[ f ] = NULL;
3956  }; // for
3957  hot_team -> t.t_nproc = 1;
3958 #if OMP_30_ENABLED
3959  // TODO???: hot_team -> t.t_max_active_levels = __kmp_dflt_max_active_levels;
3960  hot_team -> t.t_sched.r_sched_type = r_sched.r_sched_type;
3961  hot_team -> t.t_sched.chunk = r_sched.chunk;
3962 #endif // OMP_30_ENABLED
3963 #if KMP_MIC
3964  hot_team -> t.t_size_changed = 0;
3965 #endif
3966 
3967 }
3968 
3969 #ifdef KMP_DEBUG
3970 
3971 
3972 typedef struct kmp_team_list_item {
3973  kmp_team_p const * entry;
3974  struct kmp_team_list_item * next;
3975 } kmp_team_list_item_t;
3976 typedef kmp_team_list_item_t * kmp_team_list_t;
3977 
3978 
3979 static void
3980 __kmp_print_structure_team_accum( // Add team to list of teams.
3981  kmp_team_list_t list, // List of teams.
3982  kmp_team_p const * team // Team to add.
3983 ) {
3984 
3985  // List must terminate with item where both entry and next are NULL.
3986  // Team is added to the list only once.
3987  // List is sorted in ascending order by team id.
3988  // Team id is *not* a key.
3989 
3990  kmp_team_list_t l;
3991 
3992  KMP_DEBUG_ASSERT( list != NULL );
3993  if ( team == NULL ) {
3994  return;
3995  }; // if
3996 
3997  __kmp_print_structure_team_accum( list, team->t.t_parent );
3998  __kmp_print_structure_team_accum( list, team->t.t_next_pool );
3999 
4000  // Search list for the team.
4001  l = list;
4002  while ( l->next != NULL && l->entry != team ) {
4003  l = l->next;
4004  }; // while
4005  if ( l->next != NULL ) {
4006  return; // Team has been added before, exit.
4007  }; // if
4008 
4009  // Team is not found. Search list again for insertion point.
4010  l = list;
4011  while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
4012  l = l->next;
4013  }; // while
4014 
4015  // Insert team.
4016  {
4017  kmp_team_list_item_t * item =
4018  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
4019  * item = * l;
4020  l->entry = team;
4021  l->next = item;
4022  }
4023 
4024 }
4025 
4026 static void
4027 __kmp_print_structure_team(
4028  char const * title,
4029  kmp_team_p const * team
4030 
4031 ) {
4032  __kmp_printf( "%s", title );
4033  if ( team != NULL ) {
4034  __kmp_printf( "%2x %p\n", team->t.t_id, team );
4035  } else {
4036  __kmp_printf( " - (nil)\n" );
4037  }; // if
4038 }
4039 
4040 static void
4041 __kmp_print_structure_thread(
4042  char const * title,
4043  kmp_info_p const * thread
4044 
4045 ) {
4046  __kmp_printf( "%s", title );
4047  if ( thread != NULL ) {
4048  __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
4049  } else {
4050  __kmp_printf( " - (nil)\n" );
4051  }; // if
4052 }
4053 
4054 static void
4055 __kmp_print_structure(
4056  void
4057 ) {
4058 
4059  kmp_team_list_t list;
4060 
4061  // Initialize list of teams.
4062  list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
4063  list->entry = NULL;
4064  list->next = NULL;
4065 
4066  __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
4067  {
4068  int gtid;
4069  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
4070  __kmp_printf( "%2d", gtid );
4071  if ( __kmp_threads != NULL ) {
4072  __kmp_printf( " %p", __kmp_threads[ gtid ] );
4073  }; // if
4074  if ( __kmp_root != NULL ) {
4075  __kmp_printf( " %p", __kmp_root[ gtid ] );
4076  }; // if
4077  __kmp_printf( "\n" );
4078  }; // for gtid
4079  }
4080 
4081  // Print out __kmp_threads array.
4082  __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
4083  if ( __kmp_threads != NULL ) {
4084  int gtid;
4085  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
4086  kmp_info_t const * thread = __kmp_threads[ gtid ];
4087  if ( thread != NULL ) {
4088  __kmp_printf( "GTID %2d %p:\n", gtid, thread );
4089  __kmp_printf( " Our Root: %p\n", thread->th.th_root );
4090  __kmp_print_structure_team( " Our Team: ", thread->th.th_team );
4091  __kmp_print_structure_team( " Serial Team: ", thread->th.th_serial_team );
4092  __kmp_printf( " Threads: %2d\n", thread->th.th_team_nproc );
4093  __kmp_print_structure_thread( " Master: ", thread->th.th_team_master );
4094  __kmp_printf( " Serialized?: %2d\n", thread->th.th_team_serialized );
4095  __kmp_printf( " Set NProc: %2d\n", thread->th.th_set_nproc );
4096 #if OMP_40_ENABLED
4097  __kmp_printf( " Set Proc Bind: %2d\n", thread->th.th_set_proc_bind );
4098 #endif
4099  __kmp_print_structure_thread( " Next in pool: ", thread->th.th_next_pool );
4100  __kmp_printf( "\n" );
4101  __kmp_print_structure_team_accum( list, thread->th.th_team );
4102  __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
4103  }; // if
4104  }; // for gtid
4105  } else {
4106  __kmp_printf( "Threads array is not allocated.\n" );
4107  }; // if
4108 
4109  // Print out __kmp_root array.
4110  __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
4111  if ( __kmp_root != NULL ) {
4112  int gtid;
4113  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
4114  kmp_root_t const * root = __kmp_root[ gtid ];
4115  if ( root != NULL ) {
4116  __kmp_printf( "GTID %2d %p:\n", gtid, root );
4117  __kmp_print_structure_team( " Root Team: ", root->r.r_root_team );
4118  __kmp_print_structure_team( " Hot Team: ", root->r.r_hot_team );
4119  __kmp_print_structure_thread( " Uber Thread: ", root->r.r_uber_thread );
4120  __kmp_printf( " Active?: %2d\n", root->r.r_active );
4121  __kmp_printf( " Nested?: %2d\n", root->r.r_nested );
4122  __kmp_printf( " In Parallel: %2d\n", root->r.r_in_parallel );
4123  __kmp_printf( "\n" );
4124  __kmp_print_structure_team_accum( list, root->r.r_root_team );
4125  __kmp_print_structure_team_accum( list, root->r.r_hot_team );
4126  }; // if
4127  }; // for gtid
4128  } else {
4129  __kmp_printf( "Ubers array is not allocated.\n" );
4130  }; // if
4131 
4132  __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
4133  while ( list->next != NULL ) {
4134  kmp_team_p const * team = list->entry;
4135  int i;
4136  __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
4137  __kmp_print_structure_team( " Parent Team: ", team->t.t_parent );
4138  __kmp_printf( " Master TID: %2d\n", team->t.t_master_tid );
4139  __kmp_printf( " Max threads: %2d\n", team->t.t_max_nproc );
4140  __kmp_printf( " Levels of serial: %2d\n", team->t.t_serialized );
4141  __kmp_printf( " Number threads: %2d\n", team->t.t_nproc );
4142  for ( i = 0; i < team->t.t_nproc; ++ i ) {
4143  __kmp_printf( " Thread %2d: ", i );
4144  __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
4145  }; // for i
4146  __kmp_print_structure_team( " Next in pool: ", team->t.t_next_pool );
4147  __kmp_printf( "\n" );
4148  list = list->next;
4149  }; // while
4150 
4151  // Print out __kmp_thread_pool and __kmp_team_pool.
4152  __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
4153  __kmp_print_structure_thread( "Thread pool: ", (kmp_info_t *)__kmp_thread_pool );
4154  __kmp_print_structure_team( "Team pool: ", (kmp_team_t *)__kmp_team_pool );
4155  __kmp_printf( "\n" );
4156 
4157  // Free team list.
4158  while ( list != NULL ) {
4159  kmp_team_list_item_t * item = list;
4160  list = list->next;
4161  KMP_INTERNAL_FREE( item );
4162  }; // while
4163 
4164 }
4165 
4166 #endif
4167 
4168 
4169 //---------------------------------------------------------------------------
4170 // Stuff for per-thread fast random number generator
// Table of primes: 64 multipliers for the per-thread random number generator.
// __kmp_init_random picks one entry per thread, indexed by the thread's tid
// modulo the table size.

static const unsigned __kmp_primes[] = {
    0x9e3779b1u, 0xffe6cc59u, 0x2109f6ddu, 0x43977ab5u,
    0xba5703f5u, 0xb495a877u, 0xe1626741u, 0x79695e6bu,
    0xbc98c09fu, 0xd5bee2b3u, 0x287488f9u, 0x3af18231u,
    0x9677cd4du, 0xbe3a6929u, 0xadc6a877u, 0xdcf0674bu,
    0xbe4d6fe9u, 0x5f15e201u, 0x99afc3fdu, 0xf3f16801u,
    0xe222cfffu, 0x24ba5fdbu, 0x0620452du, 0x79f149e3u,
    0xc8b93f49u, 0x972702cdu, 0xb07dd827u, 0x6c97d5edu,
    0x085a3d61u, 0x46eb5ea7u, 0x3d9910edu, 0x2e687b5bu,
    0x29609227u, 0x6eb081f1u, 0x0954c4e1u, 0x9d114db9u,
    0x542acfa9u, 0xb3e6bd7bu, 0x0742d917u, 0xe9f3ffa7u,
    0x54581edbu, 0xf2480f45u, 0x0bb9288fu, 0xef1affc7u,
    0x85fa0ca7u, 0x3ccc14dbu, 0xe6baf34bu, 0x343377f7u,
    0x5ca19031u, 0xe6d9293bu, 0xf0a9f391u, 0x5d2e980bu,
    0xfc411073u, 0xc3749363u, 0xb892d829u, 0x3549366bu,
    0x629750adu, 0xb98294e5u, 0x892d9483u, 0xc235baf3u,
    0x3d2402a3u, 0x6bdef3c9u, 0xbec333cdu, 0x40c9520fu
};
4191 
4192 //---------------------------------------------------------------------------
4193 // __kmp_get_random: Get a random number using a linear congruential method.
4194 
4195 unsigned short
4196 __kmp_get_random( kmp_info_t * thread )
4197 {
4198  unsigned x = thread -> th.th_x;
4199  unsigned short r = x>>16;
4200 
4201  thread -> th.th_x = x*thread->th.th_a+1;
4202 
4203  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
4204  thread->th.th_info.ds.ds_tid, r) );
4205 
4206  return r;
4207 }
4208 //--------------------------------------------------------
4209 // __kmp_init_random: Initialize a random number generator
4210 
4211 void
4212 __kmp_init_random( kmp_info_t * thread )
4213 {
4214  unsigned seed = thread->th.th_info.ds.ds_tid;
4215 
4216  thread -> th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
4217  thread -> th.th_x = (seed+1)*thread->th.th_a+1;
4218  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread -> th.th_a) );
4219 }
4220 
4221 
4222 #if KMP_OS_WINDOWS
4223 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
4224 static int
4225 __kmp_reclaim_dead_roots(void) {
4226  int i, r = 0;
4227 
4228  for(i = 0; i < __kmp_threads_capacity; ++i) {
4229  if( KMP_UBER_GTID( i ) &&
4230  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
4231  !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
4232  r += __kmp_unregister_root_other_thread(i);
4233  }
4234  }
4235  return r;
4236 }
4237 #endif
4238 
4239 /*
4240  This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
4241  free entries generated.
4242 
4243  For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
4244  already dead.
4245 
4246  On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
4247  update to __kmp_threads_capacity. Array capacity is increased by doubling with clipping to
4248  __kmp_tp_capacity, if threadprivate cache array has been created.
4249  Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
4250 
4251  After any dead root reclamation, if the clipping value allows array expansion to result in the generation
4252  of a total of nWish free slots, the function does that expansion. If not, but the clipping value allows
4253  array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
4254  Otherwise, nothing is done beyond the possible initial root thread reclamation. However, if nNeed is zero,
4255  a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
4256  as many free slots as possible up to nWish.
4257 
4258  If any argument is negative, the behavior is undefined.
4259 */
4260 static int
4261 __kmp_expand_threads(int nWish, int nNeed) {
4262  int added = 0;
4263  int old_tp_cached;
4264  int __kmp_actual_max_nth;
4265 
4266  if(nNeed > nWish) /* normalize the arguments */
4267  nWish = nNeed;
4268 #if KMP_OS_WINDOWS && !defined GUIDEDLL_EXPORTS
4269 /* only for Windows static library */
4270  /* reclaim array entries for root threads that are already dead */
4271  added = __kmp_reclaim_dead_roots();
4272 
4273  if(nNeed) {
4274  nNeed -= added;
4275  if(nNeed < 0)
4276  nNeed = 0;
4277  }
4278  if(nWish) {
4279  nWish -= added;
4280  if(nWish < 0)
4281  nWish = 0;
4282  }
4283 #endif
4284  if(nWish <= 0)
4285  return added;
4286 
4287  while(1) {
4288  int nTarget;
4289  int minimumRequiredCapacity;
4290  int newCapacity;
4291  kmp_info_t **newThreads;
4292  kmp_root_t **newRoot;
4293 
4294  //
4295  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
4296  // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
4297  // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
4298  // become > __kmp_max_nth in one of two ways:
4299  //
4300  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
4301  // may not be resused by another thread, so we may need to increase
4302  // __kmp_threads_capacity to __kmp_max_threads + 1.
4303  //
4304  // 2) New foreign root(s) are encountered. We always register new
4305  // foreign roots. This may cause a smaller # of threads to be
4306  // allocated at subsequent parallel regions, but the worker threads
4307  // hang around (and eventually go to sleep) and need slots in the
4308  // __kmp_threads[] array.
4309  //
4310  // Anyway, that is the reason for moving the check to see if
4311  // __kmp_max_threads was exceeded into __kmp_reseerve_threads()
4312  // instead of having it performed here. -BB
4313  //
4314  old_tp_cached = __kmp_tp_cached;
4315  __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
4316  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
4317 
4318  /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
4319  nTarget = nWish;
4320  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
4321  /* can't fulfil nWish, so try nNeed */
4322  if(nNeed) {
4323  nTarget = nNeed;
4324  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
4325  /* possible expansion too small -- give up */
4326  break;
4327  }
4328  } else {
4329  /* best-effort */
4330  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
4331  if(!nTarget) {
4332  /* can expand at all -- give up */
4333  break;
4334  }
4335  }
4336  }
4337  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
4338 
4339  newCapacity = __kmp_threads_capacity;
4340  do{
4341  newCapacity =
4342  newCapacity <= (__kmp_actual_max_nth >> 1) ?
4343  (newCapacity << 1) :
4344  __kmp_actual_max_nth;
4345  } while(newCapacity < minimumRequiredCapacity);
4346  newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
4347  newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
4348  memcpy(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
4349  memcpy(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
4350  memset(newThreads + __kmp_threads_capacity, 0,
4351  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
4352  memset(newRoot + __kmp_threads_capacity, 0,
4353  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
4354 
4355  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
4356  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
4357  while we were allocating the expanded array, and our new capacity is larger than the threadprivate
4358  cache capacity, so we should deallocate the expanded arrays and try again. This is the first check
4359  of a double-check pair.
4360  */
4361  __kmp_free(newThreads);
4362  continue; /* start over and try again */
4363  }
4364  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
4365  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
4366  /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
4367  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
4368  __kmp_free(newThreads);
4369  continue; /* start over and try again */
4370  } else {
4371  /* success */
4372  // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be investigated.
4373  //
4374  *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
4375  *(kmp_root_t**volatile*)&__kmp_root = newRoot;
4376  added += newCapacity - __kmp_threads_capacity;
4377  *(volatile int*)&__kmp_threads_capacity = newCapacity;
4378  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
4379  break; /* succeded, so we can exit the loop */
4380  }
4381  }
4382  return added;
4383 }
4384 
4385 /* register the current thread as a root thread and obtain our gtid */
4386 /* we must have the __kmp_initz_lock held at this point */
4387 /* Argument TRUE only if are the thread that calls from __kmp_do_serial_initialize() */
4388 int
4389 __kmp_register_root( int initial_thread )
4390 {
4391  kmp_info_t *root_thread;
4392  kmp_root_t *root;
4393  int gtid;
4394  int capacity;
4395  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
4396  KA_TRACE( 20, ("__kmp_register_root: entered\n"));
4397  KMP_MB();
4398 
4399 
4400  /*
4401  2007-03-02:
4402 
4403  If initial thread did not invoke OpenMP RTL yet, and this thread is not an initial one,
4404  "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
4405  return false (that means there is at least one empty slot in __kmp_threads array), but it
4406  is possible the only free slot is #0, which is reserved for initial thread and so cannot be
4407  used for this one. Following code workarounds this bug.
4408 
4409  However, right solution seems to be not reserving slot #0 for initial thread because:
4410  (1) there is no magic in slot #0,
4411  (2) we cannot detect initial thread reliably (the first thread which does serial
4412  initialization may be not a real initial thread).
4413  */
4414  capacity = __kmp_threads_capacity;
4415  if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
4416  -- capacity;
4417  }; // if
4418 
4419  /* see if there are too many threads */
4420  if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
4421  if ( __kmp_tp_cached ) {
4422  __kmp_msg(
4423  kmp_ms_fatal,
4424  KMP_MSG( CantRegisterNewThread ),
4425  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
4426  KMP_HNT( PossibleSystemLimitOnThreads ),
4427  __kmp_msg_null
4428  );
4429  }
4430  else {
4431  __kmp_msg(
4432  kmp_ms_fatal,
4433  KMP_MSG( CantRegisterNewThread ),
4434  KMP_HNT( SystemLimitOnThreads ),
4435  __kmp_msg_null
4436  );
4437  }
4438  }; // if
4439 
4440  /* find an available thread slot */
4441  /* Don't reassign the zero slot since we need that to only be used by initial
4442  thread */
4443  for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ );
4444  KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
4445  KMP_ASSERT( gtid < __kmp_threads_capacity );
4446 
4447  /* update global accounting */
4448  __kmp_all_nth ++;
4449  TCW_4(__kmp_nth, __kmp_nth + 1);
4450 
4451  //
4452  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4453  // for low numbers of procs, and method #2 (keyed API call) for higher
4454  // numbers of procs.
4455  //
4456  if ( __kmp_adjust_gtid_mode ) {
4457  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4458  if ( TCR_4(__kmp_gtid_mode) != 2) {
4459  TCW_4(__kmp_gtid_mode, 2);
4460  }
4461  }
4462  else {
4463  if (TCR_4(__kmp_gtid_mode) != 1 ) {
4464  TCW_4(__kmp_gtid_mode, 1);
4465  }
4466  }
4467  }
4468 
4469 #ifdef KMP_ADJUST_BLOCKTIME
4470  /* Adjust blocktime to zero if necessary */
4471  /* Middle initialization might not have ocurred yet */
4472  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4473  if ( __kmp_nth > __kmp_avail_proc ) {
4474  __kmp_zero_bt = TRUE;
4475  }
4476  }
4477 #endif /* KMP_ADJUST_BLOCKTIME */
4478 
4479  /* setup this new hierarchy */
4480  if( ! ( root = __kmp_root[gtid] )) {
4481  root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
4482  KMP_DEBUG_ASSERT( ! root->r.r_root_team );
4483  }
4484 
4485  __kmp_initialize_root( root );
4486 
4487  /* setup new root thread structure */
4488  if( root -> r.r_uber_thread ) {
4489  root_thread = root -> r.r_uber_thread;
4490  } else {
4491  root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4492  if ( __kmp_storage_map ) {
4493  __kmp_print_thread_storage_map( root_thread, gtid );
4494  }
4495  root_thread -> th.th_info .ds.ds_gtid = gtid;
4496  root_thread -> th.th_root = root;
4497  if( __kmp_env_consistency_check ) {
4498  root_thread -> th.th_cons = __kmp_allocate_cons_stack( gtid );
4499  }
4500  #if USE_FAST_MEMORY
4501  __kmp_initialize_fast_memory( root_thread );
4502  #endif /* USE_FAST_MEMORY */
4503 
4504  #if KMP_USE_BGET
4505  KMP_DEBUG_ASSERT( root_thread -> th.th_local.bget_data == NULL );
4506  __kmp_initialize_bget( root_thread );
4507  #endif
4508  __kmp_init_random( root_thread ); // Initialize random number generator
4509  }
4510 
4511  /* setup the serial team held in reserve by the root thread */
4512  if( ! root_thread -> th.th_serial_team ) {
4513  #if OMP_30_ENABLED
4514  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
4515  #endif // OMP_30_ENABLED
4516  KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
4517  root_thread -> th.th_serial_team = __kmp_allocate_team( root, 1, 1,
4518 #if OMP_40_ENABLED
4519  proc_bind_default,
4520 #endif
4521 #if OMP_30_ENABLED
4522  &r_icvs,
4523 #else
4524  __kmp_dflt_team_nth_ub,
4525  __kmp_global.g.g_dynamic,
4526  __kmp_dflt_nested,
4527  __kmp_dflt_blocktime,
4528  __kmp_bt_intervals,
4529  __kmp_env_blocktime,
4530 #endif // OMP_30_ENABLED
4531  0 );
4532  }
4533  KMP_ASSERT( root_thread -> th.th_serial_team );
4534  KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
4535  root_thread -> th.th_serial_team ) );
4536 
4537  /* drop root_thread into place */
4538  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
4539 
4540  root -> r.r_root_team -> t.t_threads[0] = root_thread;
4541  root -> r.r_hot_team -> t.t_threads[0] = root_thread;
4542  root_thread -> th.th_serial_team -> t.t_threads[0] = root_thread;
4543  root -> r.r_uber_thread = root_thread;
4544 
4545  /* initialize the thread, get it ready to go */
4546  __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
4547 
4548  /* prepare the master thread for get_gtid() */
4549  __kmp_gtid_set_specific( gtid );
4550  #ifdef KMP_TDATA_GTID
4551  __kmp_gtid = gtid;
4552  #endif
4553  __kmp_create_worker( gtid, root_thread, __kmp_stksize );
4554  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
4555  TCW_4(__kmp_init_gtid, TRUE);
4556 
4557  KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
4558  gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
4559  root -> r.r_hot_team -> t.t_id, 0, KMP_INIT_BARRIER_STATE,
4560  KMP_INIT_BARRIER_STATE ) );
4561  { // Initialize barrier data.
4562  int b;
4563  for ( b = 0; b < bs_last_barrier; ++ b ) {
4564  root_thread->th.th_bar[ b ].bb.b_arrived = KMP_INIT_BARRIER_STATE;
4565  }; // for
4566  }
4567  KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
4568 
4569 
4570 #if KMP_OS_WINDOWS || KMP_OS_LINUX
4571  if ( TCR_4(__kmp_init_middle) ) {
4572  __kmp_affinity_set_init_mask( gtid, TRUE );
4573  }
4574 #endif /* KMP_OS_WINDOWS || KMP_OS_LINUX */
4575 
4576  __kmp_root_counter ++;
4577 
4578  KMP_MB();
4579  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
4580 
4581  return gtid;
4582 }
4583 
4584 /* Resets a root thread and clear its root and hot teams.
4585  Returns the number of __kmp_threads entries directly and indirectly freed.
4586 */
4587 static int
4588 __kmp_reset_root(int gtid, kmp_root_t *root)
4589 {
4590  kmp_team_t * root_team = root->r.r_root_team;
4591  kmp_team_t * hot_team = root->r.r_hot_team;
4592  int n = hot_team->t.t_nproc;
4593  int i;
4594 
4595  KMP_DEBUG_ASSERT( ! root->r.r_active );
4596 
4597  root->r.r_root_team = NULL;
4598  root->r.r_hot_team = NULL;
4599  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before call
4600  // to __kmp_free_team().
4601  __kmp_free_team( root, root_team );
4602  __kmp_free_team( root, hot_team );
4603 
4604 #if OMP_30_ENABLED
4605  //
4606  // Before we can reap the thread, we need to make certain that all
4607  // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
4608  //
4609  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4610  __kmp_wait_to_unref_task_teams();
4611  }
4612 #endif /* OMP_30_ENABLED */
4613 
4614  #if KMP_OS_WINDOWS
4615  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4616  KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
4617  (LPVOID)&(root->r.r_uber_thread->th),
4618  root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
4619  __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
4620  #endif /* KMP_OS_WINDOWS */
4621 
4622  TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4623  __kmp_reap_thread( root->r.r_uber_thread, 1 );
4624 
4625  // We canot put root thread to __kmp_thread_pool, so we have to reap it istead of freeing.
4626  root->r.r_uber_thread = NULL;
4627  /* mark root as no longer in use */
4628  root -> r.r_begin = FALSE;
4629 
4630  return n;
4631 }
4632 
4633 void
4634 __kmp_unregister_root_current_thread( int gtid )
4635 {
4636  kmp_root_t *root = __kmp_root[gtid];
4637 
4638  KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
4639  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
4640  KMP_ASSERT( KMP_UBER_GTID( gtid ));
4641  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
4642  KMP_ASSERT( root->r.r_active == FALSE );
4643 
4644  /* this lock should be ok, since unregister_root_current_thread is never called during
4645  * and abort, only during a normal close. furthermore, if you have the
4646  * forkjoin lock, you should never try to get the initz lock */
4647 
4648  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
4649 
4650  KMP_MB();
4651 
4652  __kmp_reset_root(gtid, root);
4653 
4654  /* free up this thread slot */
4655  __kmp_gtid_set_specific( KMP_GTID_DNE );
4656 #ifdef KMP_TDATA_GTID
4657  __kmp_gtid = KMP_GTID_DNE;
4658 #endif
4659 
4660  KMP_MB();
4661  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
4662 
4663  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
4664 }
4665 
4666 /* __kmp_forkjoin_lock must be already held
4667  Unregisters a root thread that is not the current thread. Returns the number of
4668  __kmp_threads entries freed as a result.
4669  */
4670 static int
4671 __kmp_unregister_root_other_thread( int gtid )
4672 {
4673  kmp_root_t *root = __kmp_root[gtid];
4674  int r;
4675 
4676  KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
4677  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
4678  KMP_ASSERT( KMP_UBER_GTID( gtid ));
4679  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
4680  KMP_ASSERT( root->r.r_active == FALSE );
4681 
4682  r = __kmp_reset_root(gtid, root);
4683  KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
4684  return r;
4685 }
4686 
4687 #if OMP_30_ENABLED
4688 
4689 #if KMP_DEBUG
4690 void __kmp_task_info() {
4691 
4692  kmp_int32 gtid = __kmp_entry_gtid();
4693  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
4694  kmp_info_t *this_thr = __kmp_threads[ gtid ];
4695  kmp_team_t *steam = this_thr -> th.th_serial_team;
4696  kmp_team_t *team = this_thr -> th.th_team;
4697 
4698  __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
4699  gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
4700 }
4701 #endif // KMP_DEBUG
4702 
4703 #endif // OMP_30_ENABLED
4704 
4705 /* TODO optimize with one big memclr, take out what isn't needed,
4706  * split responsility to workers as much as possible, and delay
4707  * initialization of features as much as possible */
4708 static void
4709 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
4710 {
4711  /* this_thr->th.th_info.ds.ds_gtid is setup in kmp_allocate_thread/create_worker
4712  * this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4713 
4714  KMP_DEBUG_ASSERT( this_thr != NULL );
4715  KMP_DEBUG_ASSERT( this_thr -> th.th_serial_team );
4716  KMP_DEBUG_ASSERT( team );
4717  KMP_DEBUG_ASSERT( team -> t.t_threads );
4718  KMP_DEBUG_ASSERT( team -> t.t_dispatch );
4719  KMP_DEBUG_ASSERT( team -> t.t_threads[0] );
4720  KMP_DEBUG_ASSERT( team -> t.t_threads[0] -> th.th_root );
4721 
4722  KMP_MB();
4723 
4724  TCW_SYNC_PTR(this_thr->th.th_team, team);
4725 
4726  this_thr->th.th_info.ds.ds_tid = tid;
4727  this_thr->th.th_set_nproc = 0;
4728 #if OMP_40_ENABLED
4729  this_thr->th.th_set_proc_bind = proc_bind_default;
4730 # if (KMP_OS_WINDOWS || KMP_OS_LINUX)
4731  this_thr->th.th_new_place = this_thr->th.th_current_place;
4732 # endif
4733 #endif
4734  this_thr->th.th_root = team -> t.t_threads[0] -> th.th_root;
4735 
4736  /* setup the thread's cache of the team structure */
4737  this_thr->th.th_team_nproc = team -> t.t_nproc;
4738  this_thr->th.th_team_master = team -> t.t_threads[0];
4739  this_thr->th.th_team_serialized = team -> t.t_serialized;
4740 #if OMP_40_ENABLED
4741  this_thr->th.th_team_microtask = team -> t.t_threads[0] -> th.th_team_microtask;
4742  this_thr->th.th_teams_level = team -> t.t_threads[0] -> th.th_teams_level;
4743  this_thr->th.th_set_nth_teams = team -> t.t_threads[0] -> th.th_set_nth_teams;
4744 #endif /* OMP_40_ENABLED */
4745  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4746 
4747 #if OMP_30_ENABLED
4748  KMP_DEBUG_ASSERT( team -> t.t_implicit_task_taskdata );
4749  this_thr->th.th_task_state = 0;
4750 
4751  KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4752  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4753 
4754  __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4755 
4756  KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4757  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4758  // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4759 #endif // OMP_30_ENABLED
4760 
4761  /* TODO no worksharing in speculative threads */
4762  this_thr -> th.th_dispatch = &team -> t.t_dispatch[ tid ];
4763 
4764  this_thr->th.th_local.this_construct = 0;
4765  this_thr->th.th_local.last_construct = 0;
4766 
4767 #ifdef BUILD_TV
4768  this_thr->th.th_local.tv_data = 0;
4769 #endif
4770 
4771  if ( ! this_thr->th.th_pri_common ) {
4772  this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4773  if ( __kmp_storage_map ) {
4774  __kmp_print_storage_map_gtid(
4775  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4776  sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4777  );
4778  }; // if
4779  this_thr->th.th_pri_head = NULL;
4780  }; // if
4781 
4782  /* Initialize dynamic dispatch */
4783  {
4784  volatile kmp_disp_t *dispatch = this_thr -> th.th_dispatch;
4785  /*
4786  * Use team max_nproc since this will never change for the team.
4787  */
4788  size_t disp_size = sizeof( dispatch_private_info_t ) *
4789  ( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF );
4790  KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4791  KMP_ASSERT( dispatch );
4792  KMP_DEBUG_ASSERT( team -> t.t_dispatch );
4793  KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4794 
4795  dispatch->th_disp_index = 0;
4796 
4797  if( ! dispatch -> th_disp_buffer ) {
4798  dispatch -> th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4799 
4800  if ( __kmp_storage_map ) {
4801  __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4802  &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF ],
4803  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4804  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4805  gtid, team->t.t_id, gtid );
4806  }
4807  } else {
4808  memset( & dispatch -> th_disp_buffer[0], '\0', disp_size );
4809  }
4810 
4811  dispatch -> th_dispatch_pr_current = 0;
4812  dispatch -> th_dispatch_sh_current = 0;
4813 
4814  dispatch -> th_deo_fcn = 0; /* ORDERED */
4815  dispatch -> th_dxo_fcn = 0; /* END ORDERED */
4816  }
4817 
4818  this_thr->th.th_next_pool = NULL;
4819 
4820  KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4821  KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4822 
4823  KMP_MB();
4824 }
4825 
4826 
4827 /* allocate a new thread for the requesting team. this is only called from within a
4828  * forkjoin critical section. we will first try to get an available thread from the
4829  * thread pool. if none is available, we will fork a new one assuming we are able
4830  * to create a new one. this should be assured, as the caller should check on this
4831  * first.
4832  */
4833 kmp_info_t *
4834 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4835 {
4836  kmp_team_t *serial_team;
4837  kmp_info_t *new_thr;
4838  int new_gtid;
4839 
4840  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4841  KMP_DEBUG_ASSERT( root && team );
4842  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4843  KMP_MB();
4844 
4845  /* first, try to get one from the thread pool */
4846  if ( __kmp_thread_pool ) {
4847 
4848  new_thr = (kmp_info_t*)__kmp_thread_pool;
4849  __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4850  if ( new_thr == __kmp_thread_pool_insert_pt ) {
4851  __kmp_thread_pool_insert_pt = NULL;
4852  }
4853  TCW_4(new_thr->th.th_in_pool, FALSE);
4854  //
4855  // Don't touch th_active_in_pool or th_active.
4856  // The worker thread adjusts those flags as it sleeps/awakens.
4857  //
4858 
4859  __kmp_thread_pool_nth--;
4860 
4861  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4862  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4863  KMP_ASSERT( ! new_thr -> th.th_team );
4864  KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4865  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4866 
4867  /* setup the thread structure */
4868  __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4869  KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4870 
4871  TCW_4(__kmp_nth, __kmp_nth + 1);
4872 
4873 #ifdef KMP_ADJUST_BLOCKTIME
4874  /* Adjust blocktime back to zero if necessar y */
4875  /* Middle initialization might not have ocurred yet */
4876  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4877  if ( __kmp_nth > __kmp_avail_proc ) {
4878  __kmp_zero_bt = TRUE;
4879  }
4880  }
4881 #endif /* KMP_ADJUST_BLOCKTIME */
4882 
4883  KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4884  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
4885 
4886  KMP_MB();
4887  return new_thr;
4888  }
4889 
4890 
4891  /* no, well fork a new one */
4892  KMP_ASSERT( __kmp_nth == __kmp_all_nth );
4893  KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
4894 
4895  //
4896  // If this is the first worker thread the RTL is creating, then also
4897  // launch the monitor thread. We try to do this as early as possible.
4898  //
4899  if ( ! TCR_4( __kmp_init_monitor ) ) {
4900  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
4901  if ( ! TCR_4( __kmp_init_monitor ) ) {
4902  KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
4903  TCW_4( __kmp_init_monitor, 1 );
4904  __kmp_create_monitor( & __kmp_monitor );
4905  KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
4906  }
4907  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
4908  }
4909 
4910  KMP_MB();
4911  for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
4912  KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
4913  }
4914 
4915  /* allocate space for it. */
4916  new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4917 
4918  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4919 
4920  if ( __kmp_storage_map ) {
4921  __kmp_print_thread_storage_map( new_thr, new_gtid );
4922  }
4923 
4924  /* add the reserve serialized team, initialized from the team's master thread */
4925  {
4926  #if OMP_30_ENABLED
4927  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
4928  #endif // OMP_30_ENABLED
4929  KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
4930  new_thr -> th.th_serial_team = serial_team =
4931  (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
4932 #if OMP_40_ENABLED
4933  proc_bind_default,
4934 #endif
4935 #if OMP_30_ENABLED
4936  &r_icvs,
4937 #else
4938  team->t.t_set_nproc[0],
4939  team->t.t_set_dynamic[0],
4940  team->t.t_set_nested[0],
4941  team->t.t_set_blocktime[0],
4942  team->t.t_set_bt_intervals[0],
4943  team->t.t_set_bt_set[0],
4944 #endif // OMP_30_ENABLED
4945  0 );
4946  }
4947  KMP_ASSERT ( serial_team );
4948  serial_team -> t.t_threads[0] = new_thr;
4949  KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4950  new_thr ) );
4951 
4952  /* setup the thread structures */
4953  __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
4954 
4955  #if USE_FAST_MEMORY
4956  __kmp_initialize_fast_memory( new_thr );
4957  #endif /* USE_FAST_MEMORY */
4958 
4959  #if KMP_USE_BGET
4960  KMP_DEBUG_ASSERT( new_thr -> th.th_local.bget_data == NULL );
4961  __kmp_initialize_bget( new_thr );
4962  #endif
4963 
4964  __kmp_init_random( new_thr ); // Initialize random number generator
4965 
4966  /* Initialize these only once when thread is grabbed for a team allocation */
4967  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4968  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4969 
4970  new_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_go = KMP_INIT_BARRIER_STATE;
4971  new_thr->th.th_bar[ bs_plain_barrier ].bb.b_go = KMP_INIT_BARRIER_STATE;
4972  #if KMP_FAST_REDUCTION_BARRIER
4973  new_thr->th.th_bar[ bs_reduction_barrier ].bb.b_go = KMP_INIT_BARRIER_STATE;
4974  #endif // KMP_FAST_REDUCTION_BARRIER
4975 
4976  new_thr->th.th_spin_here = FALSE;
4977  new_thr->th.th_next_waiting = 0;
4978 
4979 #if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)
4980  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4981  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4982  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4983  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4984 #endif
4985 
4986  TCW_4(new_thr->th.th_in_pool, FALSE);
4987  new_thr->th.th_active_in_pool = FALSE;
4988  TCW_4(new_thr->th.th_active, TRUE);
4989 
4990  /* adjust the global counters */
4991  __kmp_all_nth ++;
4992  __kmp_nth ++;
4993 
4994  //
4995  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4996  // for low numbers of procs, and method #2 (keyed API call) for higher
4997  // numbers of procs.
4998  //
4999  if ( __kmp_adjust_gtid_mode ) {
5000  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
5001  if ( TCR_4(__kmp_gtid_mode) != 2) {
5002  TCW_4(__kmp_gtid_mode, 2);
5003  }
5004  }
5005  else {
5006  if (TCR_4(__kmp_gtid_mode) != 1 ) {
5007  TCW_4(__kmp_gtid_mode, 1);
5008  }
5009  }
5010  }
5011 
5012 #ifdef KMP_ADJUST_BLOCKTIME
5013  /* Adjust blocktime back to zero if necessary */
5014  /* Middle initialization might not have ocurred yet */
5015  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5016  if ( __kmp_nth > __kmp_avail_proc ) {
5017  __kmp_zero_bt = TRUE;
5018  }
5019  }
5020 #endif /* KMP_ADJUST_BLOCKTIME */
5021 
5022  /* actually fork it and create the new worker thread */
5023  KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
5024  __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
5025  KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
5026 
5027 
5028  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
5029  KMP_MB();
5030  return new_thr;
5031 }
5032 
5033 /*
5034  * reinitialize team for reuse.
5035  *
5036  * The hot team code calls this case at every fork barrier, so EPCC barrier
5037  * test are extremely sensitive to changes in it, esp. writes to the team
5038  * struct, which cause a cache invalidation in all threads.
5039  *
5040  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
5041  */
5042 static void
5043 __kmp_reinitialize_team(
5044  kmp_team_t * team,
5045  int new_nproc,
5046  #if OMP_30_ENABLED
5047  kmp_internal_control_t * new_icvs,
5048  ident_t * loc
5049  #else
5050  int new_set_nproc, int new_set_dynamic, int new_set_nested,
5051  int new_set_blocktime, int new_bt_intervals, int new_bt_set
5052  #endif // OMP_30_ENABLED
5053 ) {
5054  int f;
5055  #if OMP_30_ENABLED
5056  KMP_DEBUG_ASSERT( team && new_nproc && new_icvs );
5057  KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
5058  team->t.t_ident = loc;
5059  #else
5060  KMP_DEBUG_ASSERT( team && new_nproc && new_set_nproc );
5061  #endif // OMP_30_ENABLED
5062 
5063  team->t.t_id = KMP_GEN_TEAM_ID();
5064 
5065 #if KMP_BARRIER_ICV_PULL
5066  //
5067  // Copy the ICV's to the team structure, where all of the worker threads
5068  // can access them and make their own copies after the barrier.
5069  //
5070  copy_icvs( &team->t.t_initial_icvs, new_icvs );
5071 
5072  //
5073  // Set up the master thread's copy of the ICV's. __kmp_fork_call()
5074  // assumes they are already set in the master thread.
5075  // FIXME - change that code to use the team->t.t_initial_icvs copy
5076  // and eliminate this copy.
5077  //
5078  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
5079  copy_icvs( &team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs );
5080  KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n",
5081  0, team->t.t_threads[0], team ) );
5082 
5083 #elif KMP_BARRIER_ICV_PUSH
5084  //
5085  // Set the ICV's in the master thread only.
5086  // They will be propagated by the fork barrier.
5087  //
5088  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
5089  copy_icvs( &team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs );
5090  KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n",
5091  0, team->t.t_threads[0], team ) );
5092 
5093 #else
5094  //
5095  // Copy the icvs to each of the threads. This takes O(nthreads) time.
5096  //
5097  for( f=0 ; f<new_nproc ; f++) {
5098 # if OMP_30_ENABLED
5099  // TODO: GEH - pass in better source location info since usually NULL here
5100  KF_TRACE( 10, ( "__kmp_reinitialize_team1: T#%d this_thread=%p team=%p\n",
5101  f, team->t.t_threads[f], team ) );
5102  __kmp_init_implicit_task( loc, team->t.t_threads[f], team, f, FALSE );
5103  copy_icvs( &team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs );
5104  KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n",
5105  f, team->t.t_threads[f], team ) );
5106 # else
5107  team -> t.t_set_nproc[f] = new_set_nproc;
5108  team -> t.t_set_dynamic[f] = new_set_dynamic;
5109  team -> t.t_set_nested[f] = new_set_nested;
5110  team -> t.t_set_blocktime[f] = new_set_blocktime;
5111  team -> t.t_set_bt_intervals[f] = new_bt_intervals;
5112  team -> t.t_set_bt_set[f] = new_bt_set;
5113 # endif // OMP_30_ENABLED
5114  }
5115 
5116 #endif // KMP_BARRIER_ICV_PUSH || KMP_BARRIER_ICV_PULL
5117 
5118 }
5119 
5120 /* initialize the team data structure
5121  * this assumes the t_threads and t_max_nproc are already set
5122  * also, we don't touch the arguments */
5123 static void
5124 __kmp_initialize_team(
5125  kmp_team_t * team,
5126  int new_nproc,
5127  #if OMP_30_ENABLED
5128  kmp_internal_control_t * new_icvs,
5129  ident_t * loc
5130  #else
5131  int new_set_nproc, int new_set_dynamic, int new_set_nested,
5132  int new_set_blocktime, int new_bt_intervals, int new_bt_set
5133  #endif // OMP_30_ENABLED
5134 ) {
5135  /* verify */
5136  KMP_DEBUG_ASSERT( team );
5137  KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
5138  KMP_DEBUG_ASSERT( team->t.t_threads );
5139  KMP_MB();
5140 
5141  team -> t.t_master_tid = 0; /* not needed */
5142  /* team -> t.t_master_bar; not needed */
5143  team -> t.t_serialized = 0;
5144  team -> t.t_nproc = new_nproc;
5145 
5146  /* team -> t.t_parent = NULL; TODO not needed & would mess up hot team */
5147  team -> t.t_next_pool = NULL;
5148  /* memset( team -> t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
5149 
5150  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
5151  team -> t.t_invoke = NULL; /* not needed */
5152 
5153 #if OMP_30_ENABLED
5154  // TODO???: team -> t.t_max_active_levels = new_max_active_levels;
5155  team -> t.t_sched = new_icvs->sched;
5156 #endif // OMP_30_ENABLED
5157 
5158 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
5159  team -> t.t_fp_control_saved = FALSE; /* not needed */
5160  team -> t.t_x87_fpu_control_word = 0; /* not needed */
5161  team -> t.t_mxcsr = 0; /* not needed */
5162 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
5163 
5164  team -> t.t_construct = 0;
5165  __kmp_init_lock( & team -> t.t_single_lock );
5166 
5167  team -> t.t_ordered .dt.t_value = 0;
5168  team -> t.t_master_active = FALSE;
5169 
5170  memset( & team -> t.t_taskq, '\0', sizeof( kmp_taskq_t ));
5171 
5172 #ifdef KMP_DEBUG
5173  team -> t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
5174 #endif
5175  team -> t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
5176 
5177  team -> t.t_control_stack_top = NULL;
5178 
5179  __kmp_reinitialize_team(
5180  team, new_nproc,
5181  #if OMP_30_ENABLED
5182  new_icvs,
5183  loc
5184  #else
5185  new_set_nproc, new_set_dynamic, new_set_nested,
5186  new_set_blocktime, new_bt_intervals, new_bt_set
5187  #endif // OMP_30_ENABLED
5188  );
5189 
5190  KMP_MB();
5191 }
5192 
#if KMP_OS_LINUX
/*
 * Switch the calling thread to the full affinity mask without updating any
 * runtime data structures.
 *
 * old_mask: if non-NULL, receives the thread's current system affinity mask
 *           before it is replaced, so the caller can restore it later.
 *           A failure to read the current mask is reported as a fatal
 *           ChangeThreadAffMaskError carrying the errno of the failed call.
 *
 * Does nothing when affinity is not available (KMP_AFFINITY_CAPABLE() false).
 */
static void
__kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
{
    if ( !( KMP_AFFINITY_CAPABLE() ) ) {
        return;
    }

    if ( old_mask != NULL ) {
        int rc = __kmp_get_system_affinity( old_mask, TRUE );
        int saved_errno = errno;    /* capture right away, before any other call can clobber it */
        if ( rc != 0 ) {
            __kmp_msg(
                kmp_ms_fatal,
                KMP_MSG( ChangeThreadAffMaskError ),
                KMP_ERR( saved_errno ),
                __kmp_msg_null
            );
        }
    }

    __kmp_set_system_affinity( __kmp_affinity_get_fullMask(), TRUE );
}
#endif
5216 
#if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)

//
// Step to the place that follows `place` inside the partition
// [first_place, last_place].  The partition may wrap around the end of the
// global place list, so stepping past last_place returns to first_place and
// stepping past the last global place (__kmp_affinity_num_masks - 1) returns
// to place 0.
//
static int
__kmp_partition_next_place( int place, int first_place, int last_place )
{
    if ( place == last_place ) {
        return first_place;
    }
    if ( place == __kmp_affinity_num_masks - 1 ) {
        return 0;
    }
    return place + 1;
}

//
// Number of places in the partition [first_place, last_place], taking into
// account that the partition may wrap around the end of the place list.
//
static int
__kmp_partition_place_count( int first_place, int last_place )
{
    if ( first_place <= last_place ) {
        return last_place - first_place + 1;
    }
    return __kmp_affinity_num_masks - first_place + last_place + 1;
}

//
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a thread in their partition.
// The master thread's partition should already include its current binding.
//
// For every thread the routine records th_first_place / th_last_place (the
// thread's own partition) and th_new_place (the place it is to be bound to);
// no binding is performed here.
//
static void
__kmp_partition_places( kmp_team_t *team )
{
    //
    // Copy the master thread's place partition to the team struct
    //
    kmp_info_t *master_th = team->t.t_threads[0];
    KMP_DEBUG_ASSERT( master_th != NULL );
    kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
    int first_place   = master_th->th.th_first_place;
    int last_place    = master_th->th.th_last_place;
    int masters_place = master_th->th.th_current_place;
    team->t.t_first_place = first_place;
    team->t.t_last_place  = last_place;

    KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
                   proc_bind, __kmp_gtid_from_thread( master_th ), team->t.t_id,
                   masters_place, first_place, last_place ) );

    switch ( proc_bind ) {

        case proc_bind_default:
            //
            // Serial teams might have the proc_bind policy set to
            // proc_bind_default.  It doesn't matter, as we don't
            // rebind the master thread for any proc_bind policy.
            //
            KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
            break;

        case proc_bind_master:
        {
            // Every worker shares the master's partition and the master's place.
            int n_th = team->t.t_nproc;
            int f;
            for ( f = 1; f < n_th; f++ ) {
                kmp_info_t *th = team->t.t_threads[f];
                KMP_DEBUG_ASSERT( th != NULL );
                th->th.th_first_place = first_place;
                th->th.th_last_place  = last_place;
                th->th.th_new_place   = masters_place;

                KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
                                __kmp_gtid_from_thread( th ),
                                team->t.t_id, f, masters_place, first_place, last_place ) );
            }
        }
        break;

        case proc_bind_close:
        {
            int n_th     = team->t.t_nproc;
            int n_places = __kmp_partition_place_count( first_place, last_place );
            int f;

            if ( n_th <= n_places ) {
                // Enough places for everyone: workers take the places that
                // follow the master's, one thread per place.
                int place = masters_place;
                for ( f = 1; f < n_th; f++ ) {
                    kmp_info_t *th = team->t.t_threads[f];
                    KMP_DEBUG_ASSERT( th != NULL );

                    place = __kmp_partition_next_place( place, first_place, last_place );
                    th->th.th_first_place = first_place;
                    th->th.th_last_place  = last_place;
                    th->th.th_new_place   = place;

                    KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
                                    __kmp_gtid_from_thread( th ),
                                    team->t.t_id, f, place, first_place, last_place ) );
                }
            }
            else {
                // More threads than places: each place gets per_place threads,
                // and `extra` places (one every `stride` places) get one more.
                int per_place = n_th / n_places;
                int extra     = n_th - ( per_place * n_places );
                int stride    = extra > 0 ? n_places / extra : n_places;
                int filled    = 0;          // threads put on the current place so far
                int place     = masters_place;
                int stride_ct = stride;
                for ( f = 0; f < n_th; f++ ) {
                    kmp_info_t *th = team->t.t_threads[f];
                    KMP_DEBUG_ASSERT( th != NULL );

                    th->th.th_first_place = first_place;
                    th->th.th_last_place  = last_place;
                    th->th.th_new_place   = place;
                    filled++;

                    if ( extra && ( stride_ct == stride ) ) {
                        // This place is due an extra thread.  Stay on it until
                        // per_place + 1 threads have been assigned, then move on.
                        if ( filled == per_place + 1 ) {
                            place = __kmp_partition_next_place( place, first_place, last_place );
                            filled    = 0;
                            stride_ct = 1;
                            extra--;
                        }
                    }
                    else if ( filled == per_place ) {
                        // place full; don't add extra
                        place = __kmp_partition_next_place( place, first_place, last_place );
                        stride_ct++;
                        filled = 0;
                    }

                    KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
                                    __kmp_gtid_from_thread( th ),
                                    team->t.t_id, f, th->th.th_new_place, first_place,
                                    last_place ) );
                }
                KMP_DEBUG_ASSERT( place == masters_place );
            }
        }
        break;

        case proc_bind_spread:
        {
            int n_th     = team->t.t_nproc;
            int n_places = __kmp_partition_place_count( first_place, last_place );
            int f;

            if ( n_th <= n_places ) {
                // Split the partition into n_th consecutive sub-partitions of
                // width places each; `extra` of them (one every `stride`
                // threads) are one place wider.  Each thread is bound to the
                // first place of its own sub-partition.
                int place     = masters_place;
                int width     = n_places / n_th;
                int extra     = n_places - n_th * width;
                int stride    = extra ? n_th / extra : 1;
                int stride_ct = stride;
                int step;
                for ( f = 0; f < n_th; f++ ) {
                    kmp_info_t *th = team->t.t_threads[f];
                    KMP_DEBUG_ASSERT( th != NULL );

                    th->th.th_first_place = place;
                    th->th.th_new_place   = place;

                    // walk to the last place of the base-width sub-partition
                    for ( step = 1; step < width; step++ ) {
                        place = __kmp_partition_next_place( place, first_place, last_place );
                    }
                    // widen this sub-partition by one place if it is due an extra
                    if ( extra && ( stride_ct == stride ) ) {
                        place = __kmp_partition_next_place( place, first_place, last_place );
                        extra--;
                        stride_ct = 0;
                    }
                    th->th.th_last_place = place;
                    stride_ct++;

                    // the next thread's sub-partition starts on the following place
                    place = __kmp_partition_next_place( place, first_place, last_place );

                    KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
                                    __kmp_gtid_from_thread( th ),
                                    team->t.t_id, f, th->th.th_new_place,
                                    th->th.th_first_place, th->th.th_last_place ) );
                }
                KMP_DEBUG_ASSERT( place == masters_place );
            }
            else {
                // More threads than places: same distribution as the
                // oversubscribed "close" case, except that every thread's
                // partition is narrowed to the single place it is assigned.
                int per_place = n_th / n_places;
                int extra     = n_th - ( per_place * n_places );
                int stride    = extra > 0 ? n_places / extra : n_places;
                int filled    = 0;          // threads put on the current place so far
                int place     = masters_place;
                int stride_ct = stride;
                for ( f = 0; f < n_th; f++ ) {
                    kmp_info_t *th = team->t.t_threads[f];
                    KMP_DEBUG_ASSERT( th != NULL );

                    th->th.th_first_place = place;
                    th->th.th_last_place  = place;
                    th->th.th_new_place   = place;
                    filled++;

                    if ( extra && ( stride_ct == stride ) ) {
                        // This place is due an extra thread.  Stay on it until
                        // per_place + 1 threads have been assigned, then move on.
                        if ( filled == per_place + 1 ) {
                            place = __kmp_partition_next_place( place, first_place, last_place );
                            filled    = 0;
                            stride_ct = 1;
                            extra--;
                        }
                    }
                    else if ( filled == per_place ) {
                        // place is full; don't add extra thread
                        place = __kmp_partition_next_place( place, first_place, last_place );
                        stride_ct++;
                        filled = 0;
                    }

                    KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
                                    __kmp_gtid_from_thread( th ),
                                    team->t.t_id, f, th->th.th_new_place,
                                    th->th.th_first_place, th->th.th_last_place) );
                }
                KMP_DEBUG_ASSERT( place == masters_place );
            }
        }
        break;

        default:
            break;
    }

    KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
}

#endif /* OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX) */
5504 
5505 /* allocate a new team data structure to use. take one off of the free pool if available */
5506 kmp_team_t *
5507 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
5508 #if OMP_40_ENABLED
5509  kmp_proc_bind_t new_proc_bind,
5510 #endif
5511 #if OMP_30_ENABLED
5512  kmp_internal_control_t *new_icvs,
5513 #else
5514  int new_set_nproc, int new_set_dynamic, int new_set_nested,
5515  int new_set_blocktime, int new_bt_intervals, int new_bt_set,
5516 #endif
5517  int argc )
5518 {
5519  int f;
5520  kmp_team_t *team;
5521  char *ptr;
5522  size_t size;
5523 
5524  KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
5525  KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
5526  KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
5527  KMP_MB();
5528 
5529  //
5530  // optimization to use a "hot" team for the top level,
5531  // as it is usually the same
5532  //
5533  if ( ! root->r.r_active && new_nproc > 1 ) {
5534 
5535  KMP_DEBUG_ASSERT( new_nproc == max_nproc );
5536 
5537  team = root -> r.r_hot_team;
5538 
5539 #if OMP_30_ENABLED && KMP_DEBUG
5540  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5541  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team = %p before reinit\n",
5542  team -> t.t_task_team ));
5543  }
5544 #endif
5545 
5546  /* has the number of threads changed? */
5547  if( team -> t.t_nproc > new_nproc ) {
5548  KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
5549 
5550 #if KMP_MIC
5551  team -> t.t_size_changed = 1;
5552 #endif
5553 #if OMP_30_ENABLED
5554  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5555  kmp_task_team_t *task_team = team->t.t_task_team;
5556  if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
5557  //
5558  // Signal the worker threads (esp. the extra ones) to stop
5559  // looking for tasks while spin waiting. The task teams
5560  // are reference counted and will be deallocated by the
5561  // last worker thread.
5562  //
5563  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
5564  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
5565  KMP_MB();
5566 
5567  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team %p to NULL\n",
5568  &team->t.t_task_team ) );
5569  team->t.t_task_team = NULL;
5570  }
5571  else {
5572  KMP_DEBUG_ASSERT( task_team == NULL );
5573  }
5574  }
5575 #endif // OMP_30_ENABLED
5576 
5577  /* release the extra threads we don't need any more */
5578  for( f = new_nproc ; f < team->t.t_nproc ; f++ ) {
5579  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5580  __kmp_free_thread( team->t.t_threads[ f ] );
5581  team -> t.t_threads[ f ] = NULL;
5582  }
5583 
5584  team -> t.t_nproc = new_nproc;
5585 #if OMP_30_ENABLED
5586  // TODO???: team -> t.t_max_active_levels = new_max_active_levels;
5587  team -> t.t_sched = new_icvs->sched;
5588 #endif
5589  __kmp_reinitialize_team( team, new_nproc,
5590 #if OMP_30_ENABLED
5591  new_icvs,
5592  root->r.r_uber_thread->th.th_ident
5593 #else
5594  new_set_nproc, new_set_dynamic, new_set_nested,
5595  new_set_blocktime, new_bt_intervals, new_bt_set
5596 #endif
5597  );
5598 
5599 #if OMP_30_ENABLED
5600  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5601  kmp_task_team_t *task_team = team->t.t_task_team;
5602  if ( task_team != NULL ) {
5603  KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
5604  task_team->tt.tt_nproc = new_nproc;
5605  task_team->tt.tt_unfinished_threads = new_nproc;
5606  task_team->tt.tt_ref_ct = new_nproc - 1;
5607  }
5608  }
5609 #endif
5610 
5611  /* update the remaining threads */
5612  for( f = 0 ; f < new_nproc ; f++ ) {
5613  team -> t.t_threads[ f ] -> th.th_team_nproc = team->t.t_nproc;
5614  }
5615 
5616 #if OMP_30_ENABLED
5617  // restore the current task state of the master thread: should be the implicit task
5618  KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
5619  0, team->t.t_threads[0], team ) );
5620 
5621  __kmp_push_current_task_to_thread( team -> t.t_threads[ 0 ], team, 0 );
5622 #endif
5623 
5624 #ifdef KMP_DEBUG
5625  for ( f = 0; f < team->t.t_nproc; f++ ) {
5626  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5627  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5628  }
5629 #endif
5630 
5631 #if OMP_40_ENABLED
5632  team->t.t_proc_bind = new_proc_bind;
5633 # if KMP_OS_WINDOWS || KMP_OS_LINUX
5634  __kmp_partition_places( team );
5635 # endif
5636 #endif
5637 
5638  }
5639  else if ( team -> t.t_nproc < new_nproc ) {
5640 #if KMP_OS_LINUX
5641  kmp_affin_mask_t *old_mask;
5642  if ( KMP_AFFINITY_CAPABLE() ) {
5643  KMP_CPU_ALLOC(old_mask);
5644  }
5645 #endif
5646 
5647  KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
5648 
5649 #if KMP_MIC
5650  team -> t.t_size_changed = 1;
5651 #endif
5652 
5653 
5654  if(team -> t.t_max_nproc < new_nproc) {
5655  /* reallocate larger arrays */
5656  __kmp_reallocate_team_arrays(team, new_nproc);
5657  __kmp_reinitialize_team( team, new_nproc,
5658 #if OMP_30_ENABLED
5659  new_icvs,
5660  NULL // TODO: !!!
5661 #else
5662  new_set_nproc, new_set_dynamic, new_set_nested,
5663  new_set_blocktime, new_bt_intervals, new_bt_set
5664 #endif
5665  );
5666  }
5667 
5668 #if KMP_OS_LINUX
5669  /* Temporarily set full mask for master thread before
5670  creation of workers. The reason is that workers inherit
5671  the affinity from master, so if a lot of workers are
5672  created on the single core quickly, they don't get
5673  a chance to set their own affinity for a long time.
5674  */
5675  __kmp_set_thread_affinity_mask_full_tmp( old_mask );
5676 #endif
5677 
5678  /* allocate new threads for the hot team */
5679  for( f = team->t.t_nproc ; f < new_nproc ; f++ ) {
5680  kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
5681  KMP_DEBUG_ASSERT( new_worker );
5682  team->t.t_threads[ f ] = new_worker;
5683  new_worker->th.th_team_nproc = team->t.t_nproc;
5684 
5685  KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d arrived: join=%u, plain=%u\n",
5686  team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
5687  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5688  team->t.t_bar[bs_plain_barrier].b_arrived ) );
5689 
5690  { // Initialize barrier data for new threads.
5691  int b;
5692  kmp_balign_t * balign = new_worker->th.th_bar;
5693  for ( b = 0; b < bp_last_bar; ++ b ) {
5694  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
5695  }
5696  }
5697  }
5698 
5699 #if KMP_OS_LINUX
5700  if ( KMP_AFFINITY_CAPABLE() ) {
5701  /* Restore initial master thread's affinity mask */
5702  __kmp_set_system_affinity( old_mask, TRUE );
5703  KMP_CPU_FREE(old_mask);
5704  }
5705 #endif
5706 
5707  /* make sure everyone is syncronized */
5708  __kmp_initialize_team( team, new_nproc,
5709 #if OMP_30_ENABLED
5710  new_icvs,
5711  root->r.r_uber_thread->th.th_ident
5712 #else
5713  new_set_nproc, new_set_dynamic, new_set_nested,
5714  new_set_blocktime, new_bt_intervals, new_bt_set
5715 #endif
5716  );
5717 
5718 #if OMP_30_ENABLED
5719  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5720  kmp_task_team_t *task_team = team->t.t_task_team;
5721  if ( task_team != NULL ) {
5722  KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
5723  task_team->tt.tt_nproc = new_nproc;
5724  task_team->tt.tt_unfinished_threads = new_nproc;
5725  task_team->tt.tt_ref_ct = new_nproc - 1;
5726  }
5727  }
5728 #endif
5729 
5730  /* reinitialize the old threads */
5731  for( f = 0 ; f < team->t.t_nproc ; f++ )
5732  __kmp_initialize_info( team->t.t_threads[ f ], team, f,
5733  __kmp_gtid_from_tid( f, team ) );
5734 #ifdef KMP_DEBUG
5735  for ( f = 0; f < team->t.t_nproc; ++ f ) {
5736  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5737  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5738  }
5739 #endif
5740 
5741 #if OMP_40_ENABLED
5742  team->t.t_proc_bind = new_proc_bind;
5743 # if KMP_OS_WINDOWS || KMP_OS_LINUX
5744  __kmp_partition_places( team );
5745 # endif
5746 #endif
5747 
5748  }
5749  else {
5750  KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
5751 #if KMP_MIC
5752  // This case can mean that omp_set_num_threads() was called and the hot team size
5753  // was already reduced, so we check the special flag
5754  if ( team -> t.t_size_changed == -1 ) {
5755  team -> t.t_size_changed = 1;
5756  } else {
5757  team -> t.t_size_changed = 0;
5758  }
5759 #endif
5760 
5761 #if OMP_30_ENABLED
5762  // TODO???: team -> t.t_max_active_levels = new_max_active_levels;
5763  team -> t.t_sched = new_icvs->sched;
5764 #endif
5765 
5766  __kmp_reinitialize_team( team, new_nproc,
5767 #if OMP_30_ENABLED
5768  new_icvs,
5769  root->r.r_uber_thread->th.th_ident
5770 #else
5771  new_set_nproc, new_set_dynamic, new_set_nested,
5772  new_set_blocktime, new_bt_intervals, new_bt_set
5773 #endif
5774  );
5775 
5776 #if OMP_30_ENABLED
5777  KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
5778  0, team->t.t_threads[0], team ) );
5779  __kmp_push_current_task_to_thread( team -> t.t_threads[ 0 ], team, 0 );
5780 #endif
5781 
5782 #if OMP_40_ENABLED
5783 # if (KMP_OS_WINDOWS || KMP_OS_LINUX)
5784  if ( team->t.t_proc_bind == new_proc_bind ) {
5785  KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
5786  team->t.t_id, new_proc_bind, team->t.t_first_place,
5787  team->t.t_last_place ) );
5788  }
5789  else {
5790  team->t.t_proc_bind = new_proc_bind;
5791  __kmp_partition_places( team );
5792  }
5793 # else
5794  if ( team->t.t_proc_bind != new_proc_bind ) {
5795  team->t.t_proc_bind = new_proc_bind;
5796  }
5797 # endif /* (KMP_OS_WINDOWS || KMP_OS_LINUX) */
5798 #endif /* OMP_40_ENABLED */
5799  }
5800 
5801  /* reallocate space for arguments if necessary */
5802  __kmp_alloc_argv_entries( argc, team, TRUE );
5803  team -> t.t_argc = argc;
5804  //
5805  // The hot team re-uses the previous task team,
5806  // if untouched during the previous release->gather phase.
5807  //
5808 
5809  KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5810 
5811 #if OMP_30_ENABLED && KMP_DEBUG
5812  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5813  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team = %p after reinit\n",
5814  team -> t.t_task_team ));
5815  }
5816 #endif
5817 
5818  KMP_MB();
5819 
5820  return team;
5821  }
5822 
5823  /* next, let's try to take one from the team pool */
5824  KMP_MB();
5825  for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5826  {
5827  /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5828  if ( team->t.t_max_nproc >= max_nproc ) {
5829  /* take this team from the team pool */
5830  __kmp_team_pool = team->t.t_next_pool;
5831 
5832  /* setup the team for fresh use */
5833  __kmp_initialize_team( team, new_nproc,
5834 #if OMP_30_ENABLED
5835  new_icvs,
5836  NULL // TODO: !!!
5837 #else
5838  new_set_nproc, new_set_dynamic, new_set_nested,
5839  new_set_blocktime, new_bt_intervals, new_bt_set
5840 #endif
5841  );
5842 
5843 #if OMP_30_ENABLED
5844  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team %p to NULL\n",
5845  &team->t.t_task_team ) );
5846  team -> t.t_task_team = NULL;
5847 #endif
5848 
5849  /* reallocate space for arguments if necessary */
5850  __kmp_alloc_argv_entries( argc, team, TRUE );
5851  team -> t.t_argc = argc;
5852 
5853  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5854  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5855  { // Initialize barrier data.
5856  int b;
5857  for ( b = 0; b < bs_last_barrier; ++ b) {
5858  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5859  }
5860  }
5861 
5862 #if OMP_40_ENABLED
5863  team->t.t_proc_bind = new_proc_bind;
5864 #endif
5865 
5866  KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
5867  KMP_MB();
5868 
5869  return team;
5870  }
5871 
5872  /* reap team if it is too small, then loop back and check the next one */
5873  /* not sure if this is wise, but, will be redone during the hot-teams rewrite. */
5874  /* TODO: Use technique to find the right size hot-team, don't reap them */
5875  team = __kmp_reap_team( team );
5876  __kmp_team_pool = team;
5877  }
5878 
5879  /* nothing available in the pool, no matter, make a new team! */
5880  KMP_MB();
5881  team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
5882 
5883  /* and set it up */
5884  team -> t.t_max_nproc = max_nproc;
5885  /* NOTE well, for some reason allocating one big buffer and dividing it
5886  * up seems to really hurt performance a lot on the P4, so, let's not use
5887  * this... */
5888  __kmp_allocate_team_arrays( team, max_nproc );
5889  __kmp_initialize_team( team, new_nproc,
5890 #if OMP_30_ENABLED
5891  new_icvs,
5892  NULL // TODO: !!!
5893 #else
5894  new_set_nproc, new_set_dynamic, new_set_nested,
5895  new_set_blocktime, new_bt_intervals, new_bt_set
5896 #endif
5897  );
5898 
5899 #if OMP_30_ENABLED
5900  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team %p to NULL\n",
5901  &team->t.t_task_team ) );
5902  team -> t.t_task_team = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5903 #endif
5904 
5905  if ( __kmp_storage_map ) {
5906  __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
5907  }
5908 
5909  /* allocate space for arguments */
5910  __kmp_alloc_argv_entries( argc, team, FALSE );
5911  team -> t.t_argc = argc;
5912 
5913  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5914  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5915  { // Initialize barrier data.
5916  int b;
5917  for ( b = 0; b < bs_last_barrier; ++ b ) {
5918  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5919  }
5920  }
5921 
5922 #if OMP_40_ENABLED
5923  team->t.t_proc_bind = new_proc_bind;
5924 #endif
5925 
5926  KMP_MB();
5927 
5928  KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
5929 
5930  return team;
5931 }
5932 
5933 /* TODO implement hot-teams at all levels */
5934 /* TODO implement lazy thread release on demand (disband request) */
5935 
5936 /* free the team. return it to the team pool. release all the threads
5937  * associated with it */
5938 void
5939 __kmp_free_team( kmp_root_t *root, kmp_team_t *team )
5940 {
5941  int f;
5942  KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
5943 
5944  /* verify state */
5945  KMP_DEBUG_ASSERT( root );
5946  KMP_DEBUG_ASSERT( team );
5947  KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
5948  KMP_DEBUG_ASSERT( team->t.t_threads );
5949 
5950  /* team is done working */
5951  TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
5952  team -> t.t_copyin_counter = 0; // init counter for possible reuse
5953  // Do not reset pointer to parent team to NULL for hot teams.
5954 
5955  /* if we are a nested team, release our threads */
5956  if( team != root->r.r_hot_team ) {
5957 
5958 #if OMP_30_ENABLED
5959  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5960  kmp_task_team_t *task_team = team->t.t_task_team;
5961  if ( task_team != NULL ) {
5962  //
5963  // Signal the worker threads to stop looking for tasks while
5964  // spin waiting. The task teams are reference counted and will
5965  // be deallocated by the last worker thread via the thread's
5966  // pointer to the task team.
5967  //
5968  KA_TRACE( 20, ( "__kmp_free_team: deactivating task_team %p\n",
5969  task_team ) );
5970  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
5971  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
5972  KMP_MB();
5973  team->t.t_task_team = NULL;
5974  }
5975  }
5976 #endif /* OMP_30_ENABLED */
5977 
5978  // Reset pointer to parent team only for non-hot teams.
5979  team -> t.t_parent = NULL;
5980 
5981 
5982  /* free the worker threads */
5983  for ( f = 1; f < team->t.t_nproc; ++ f ) {
5984  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5985  __kmp_free_thread( team->t.t_threads[ f ] );
5986  team->t.t_threads[ f ] = NULL;
5987  }
5988 
5989 
5990  /* put the team back in the team pool */
5991  /* TODO limit size of team pool, call reap_team if pool too large */
5992  team -> t.t_next_pool = (kmp_team_t*) __kmp_team_pool;
5993  __kmp_team_pool = (volatile kmp_team_t*) team;
5994  }
5995 
5996  KMP_MB();
5997 }
5998 
5999 
6000 /* reap the team. destroy it, reclaim all its resources and free its memory */
6001 kmp_team_t *
6002 __kmp_reap_team( kmp_team_t *team )
6003 {
6004  kmp_team_t *next_pool = team -> t.t_next_pool;
6005 
6006  KMP_DEBUG_ASSERT( team );
6007  KMP_DEBUG_ASSERT( team -> t.t_dispatch );
6008  KMP_DEBUG_ASSERT( team -> t.t_disp_buffer );
6009  KMP_DEBUG_ASSERT( team -> t.t_threads );
6010  #if OMP_30_ENABLED
6011  #else
6012  KMP_DEBUG_ASSERT( team -> t.t_set_nproc );
6013  #endif
6014  KMP_DEBUG_ASSERT( team -> t.t_argv );
6015 
6016  /* TODO clean the threads that are a part of this? */
6017 
6018  /* free stuff */
6019 
6020  __kmp_free_team_arrays( team );
6021 #if (KMP_PERF_V106 == KMP_ON)
6022  if ( team -> t.t_argv != &team -> t.t_inline_argv[0] )
6023  __kmp_free( (void*) team -> t.t_argv );
6024 #else
6025  __kmp_free( (void*) team -> t.t_argv );
6026 #endif
6027  __kmp_free( team );
6028 
6029  KMP_MB();
6030  return next_pool;
6031 }
6032 
6033 //
6034 // Free the thread. Don't reap it, just place it on the pool of available
6035 // threads.
6036 //
6037 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
6038 // binding for the affinity mechanism to be useful.
6039 //
6040 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
6041 // However, we want to avoid a potential performance problem by always
6042 // scanning through the list to find the correct point at which to insert
6043 // the thread (potential N**2 behavior). To do this we keep track of the
6044 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
6045 // With single-level parallelism, threads will always be added to the tail
6046 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
6047 // parallelism, all bets are off and we may need to scan through the entire
6048 // free list.
6049 //
6050 // This change also has a potentially large performance benefit, for some
6051 // applications. Previously, as threads were freed from the hot team, they
6052 // would be placed back on the free list in inverse order. If the hot team
6053 // grew back to it's original size, then the freed thread would be placed
6054 // back on the hot team in reverse order. This could cause bad cache
6055 // locality problems on programs where the size of the hot team regularly
6056 // grew and shrunk.
6057 //
6058 // Now, for single-level parallelism, the OMP tid is alway == gtid.
6059 //
6060 void
6061 __kmp_free_thread( kmp_info_t *this_th )
6062 {
6063  int gtid;
6064  kmp_info_t **scan;
6065 
6066  KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
6067  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
6068 
6069  KMP_DEBUG_ASSERT( this_th );
6070 
6071 
6072  /* put thread back on the free pool */
6073  TCW_PTR(this_th->th.th_team, NULL);
6074  TCW_PTR(this_th->th.th_root, NULL);
6075  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
6076 
6077  //
6078  // If the __kmp_thread_pool_insert_pt is already past the new insert
6079  // point, then we need to re-scan the entire list.
6080  //
6081  gtid = this_th->th.th_info.ds.ds_gtid;
6082  if ( __kmp_thread_pool_insert_pt != NULL ) {
6083  KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
6084  if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
6085  __kmp_thread_pool_insert_pt = NULL;
6086  }
6087  }
6088 
6089  //
6090  // Scan down the list to find the place to insert the thread.
6091  // scan is the address of a link in the list, possibly the address of
6092  // __kmp_thread_pool itself.
6093  //
6094  // In the absence of nested parallism, the for loop will have 0 iterations.
6095  //
6096  if ( __kmp_thread_pool_insert_pt != NULL ) {
6097  scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
6098  }
6099  else {
6100  scan = (kmp_info_t **)&__kmp_thread_pool;
6101  }
6102  for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
6103  scan = &( (*scan)->th.th_next_pool ) );
6104 
6105  //
6106  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
6107  // to its address.
6108  //
6109  TCW_PTR(this_th->th.th_next_pool, *scan);
6110  __kmp_thread_pool_insert_pt = *scan = this_th;
6111  KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
6112  || ( this_th->th.th_info.ds.ds_gtid
6113  < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
6114  TCW_4(this_th->th.th_in_pool, TRUE);
6115  __kmp_thread_pool_nth++;
6116 
6117  TCW_4(__kmp_nth, __kmp_nth - 1);
6118 
6119 #ifdef KMP_ADJUST_BLOCKTIME
6120  /* Adjust blocktime back to user setting or default if necessary */
6121  /* Middle initialization might never have ocurred */
6122  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6123  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6124  if ( __kmp_nth <= __kmp_avail_proc ) {
6125  __kmp_zero_bt = FALSE;
6126  }
6127  }
6128 #endif /* KMP_ADJUST_BLOCKTIME */
6129 
6130  KMP_MB();
6131 }
6132 
6133 void
6134 __kmp_join_barrier( int gtid )
6135 {
6136  register kmp_info_t *this_thr = __kmp_threads[ gtid ];
6137  register kmp_team_t *team;
6138  register kmp_uint count;
6139  register kmp_uint nproc;
6140  kmp_info_t *master_thread;
6141  int tid;
6142  #ifdef KMP_DEBUG
6143  int team_id;
6144  #endif /* KMP_DEBUG */
6145 #if USE_ITT_BUILD
6146  void * itt_sync_obj = NULL;
6147  #if USE_ITT_NOTIFY
6148  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) // don't call routine without need
6149  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); // get object created at fork_barrier
6150  #endif
6151 #endif /* USE_ITT_BUILD */
6152 
6153  KMP_MB();
6154 
6155  /* get current info */
6156  team = this_thr -> th.th_team;
6157  /* nproc = team -> t.t_nproc;*/
6158  nproc = this_thr -> th.th_team_nproc;
6159  KMP_DEBUG_ASSERT( nproc == team->t.t_nproc );
6160  tid = __kmp_tid_from_gtid(gtid);
6161  #ifdef KMP_DEBUG
6162  team_id = team -> t.t_id;
6163  #endif /* KMP_DEBUG */
6164  /* master_thread = team -> t.t_threads[0];*/
6165  master_thread = this_thr -> th.th_team_master;
6166  #ifdef KMP_DEBUG
6167  if ( master_thread != team->t.t_threads[0] ) {
6168  __kmp_print_structure();
6169  }
6170  #endif /* KMP_DEBUG */
6171  KMP_DEBUG_ASSERT( master_thread == team->t.t_threads[0] );
6172  KMP_MB();
6173 
6174  /* verify state */
6175  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
6176  KMP_DEBUG_ASSERT( TCR_PTR(this_thr->th.th_team) );
6177  KMP_DEBUG_ASSERT( TCR_PTR(this_thr->th.th_root) );
6178  KMP_DEBUG_ASSERT( this_thr == team -> t.t_threads[tid] );
6179 
6180  KA_TRACE( 10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
6181  gtid, team_id, tid ));
6182 
6183  #if OMP_30_ENABLED
6184  if ( __kmp_tasking_mode == tskm_extra_barrier ) {
6185  __kmp_tasking_barrier( team, this_thr, gtid );
6186 
6187  KA_TRACE( 10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n",
6188  gtid, team_id, tid ));
6189  }
6190  #ifdef KMP_DEBUG
6191  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
6192  KA_TRACE( 20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
6193  __kmp_gtid_from_thread( this_thr ), team_id, team -> t.t_task_team,
6194  this_thr->th.th_task_team ) );
6195  KMP_DEBUG_ASSERT( this_thr->th.th_task_team == team->t.t_task_team );
6196  }
6197  #endif /* KMP_DEBUG */
6198  #endif /* OMP_30_ENABLED */
6199 
6200  //
6201  // Copy the blocktime info to the thread, where __kmp_wait_sleep()
6202  // can access it when the team struct is not guaranteed to exist.
6203  //
6204  // Doing these loads causes a cache miss slows down EPCC parallel by 2x.
6205  // As a workaround, we do not perform the copy if blocktime=infinite,
6206  // since the values are not used by __kmp_wait_sleep() in that case.
6207  //
6208  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
6209  #if OMP_30_ENABLED
6210  this_thr -> th.th_team_bt_intervals = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
6211  this_thr -> th.th_team_bt_set = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
6212  #else
6213  this_thr -> th.th_team_bt_intervals = team -> t.t_set_bt_intervals[tid];
6214  this_thr -> th.th_team_bt_set= team -> t.t_set_bt_set[tid];
6215  #endif // OMP_30_ENABLED
6216  }
6217 
6218  #if KMP_OS_WINDOWS
6219  // AC: wait here until monitor has started. This is a fix for CQ232808.
6220  // The reason is that if the library is loaded/unloaded in a loop with small (parallel)
6221  // work in between, then there is high probability that monitor thread started after
6222  // the library shutdown. At shutdown it is too late to cope with the problem, because
6223  // when the master is in DllMain (process detach) the monitor has no chances to start
6224  // (it is blocked), and master has no means to inform the monitor that the library has gone,
6225  // because all the memory which the monitor can access is going to be released/reset.
6226  //
6227  // The moment before barrier_gather sounds appropriate, because master needs to
6228  // wait for all workers anyway, and we want this to happen as late as possible,
6229  // but before the shutdown which may happen after the barrier.
6230  if( KMP_MASTER_TID( tid ) && TCR_4(__kmp_init_monitor) < 2 ) {
6231  __kmp_wait_sleep( this_thr, (volatile kmp_uint32*)&__kmp_init_monitor, 2, 0
6232  USE_ITT_BUILD_ARG( itt_sync_obj )
6233  );
6234  }
6235  #endif
6236 
6237 #if USE_ITT_BUILD
6238  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
6239  __kmp_itt_barrier_starting( gtid, itt_sync_obj );
6240 #endif /* USE_ITT_BUILD */
6241 
6242  if ( __kmp_barrier_gather_pattern[ bs_forkjoin_barrier ] == bp_linear_bar || __kmp_barrier_gather_branch_bits[ bs_forkjoin_barrier ] == 0 ) {
6243  __kmp_linear_barrier_gather( bs_forkjoin_barrier, this_thr, gtid, tid, NULL
6244  USE_ITT_BUILD_ARG( itt_sync_obj )
6245  );
6246  } else if ( __kmp_barrier_gather_pattern[ bs_forkjoin_barrier ] == bp_tree_bar ) {
6247  __kmp_tree_barrier_gather( bs_forkjoin_barrier, this_thr, gtid, tid, NULL
6248  USE_ITT_BUILD_ARG( itt_sync_obj )
6249  );
6250  } else {
6251  __kmp_hyper_barrier_gather( bs_forkjoin_barrier, this_thr, gtid, tid, NULL
6252  USE_ITT_BUILD_ARG( itt_sync_obj )
6253  );
6254  }; // if
6255 
6256 #if USE_ITT_BUILD
6257  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
6258  __kmp_itt_barrier_middle( gtid, itt_sync_obj );
6259 #endif /* USE_ITT_BUILD */
6260 
6261  //
6262  // From this point on, the team data structure may be deallocated
6263  // at any time by the master thread - it is unsafe to reference it
6264  // in any of the worker threads.
6265  //
6266  // Any per-team data items that need to be referenced before the end
6267  // of the barrier should be moved to the kmp_task_team_t structs.
6268  //
6269 
6270  #if OMP_30_ENABLED
6271  if ( KMP_MASTER_TID( tid ) ) {
6272  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
6273  // Master shouldn't call decrease_load(). // TODO: enable master threads.
6274  // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
6275  __kmp_task_team_wait( this_thr, team
6276  USE_ITT_BUILD_ARG( itt_sync_obj )
6277  );
6278  }
6279  }
6280  #endif /* OMP_30_ENABLED */
6281 
6282  #if KMP_DEBUG
6283  if( KMP_MASTER_TID( tid )) {
6284  KA_TRACE( 15, ( "__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
6285  gtid, team_id, tid, nproc ));
6286  }
6287  #endif /* KMP_DEBUG */
6288 
6289  /* TODO now, mark worker threads as done so they may be disbanded */
6290 
6291  KMP_MB(); /* Flush all pending memory write invalidates. */
6292  KA_TRACE( 10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n",
6293  gtid, team_id, tid ));
6294 }
6295 
6296 
6297 /* TODO release worker threads' fork barriers as we are ready instead of all at once */
6298 
6299 void
6300 __kmp_fork_barrier( int gtid, int tid )
6301 {
6302  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6303  kmp_team_t *team = ( tid == 0 ) ? this_thr -> th.th_team : NULL;
6304 #if USE_ITT_BUILD
6305  void * itt_sync_obj = NULL;
6306 #endif /* USE_ITT_BUILD */
6307 
6308  KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
6309  gtid, ( team != NULL ) ? team->t.t_id : -1, tid ));
6310 
6311  /* th_team pointer only valid for master thread here */
6312  if ( KMP_MASTER_TID( tid ) ) {
6313 
6314 #if USE_ITT_BUILD && USE_ITT_NOTIFY
6315  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
6316  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 1 ); // create itt barrier object
6317  //__kmp_itt_barrier_starting( gtid, itt_sync_obj ); // AC: no need to call prepare right before acquired
6318  __kmp_itt_barrier_middle( gtid, itt_sync_obj ); // call acquired / releasing
6319  }
6320 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
6321 
6322 #ifdef KMP_DEBUG
6323 
6324  register kmp_info_t **other_threads = team -> t.t_threads;
6325  register int i;
6326 
6327  /* verify state */
6328  KMP_MB();
6329 
6330  for( i = 1; i < team -> t.t_nproc ; i++ ) {
6331  KA_TRACE( 500, ( "__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork "
6332  "go == %u.\n",
6333  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
6334  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
6335  other_threads[i]->th.th_bar[ bs_forkjoin_barrier ].bb.b_go ) );
6336 
6337  KMP_DEBUG_ASSERT( ( TCR_4( other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go )
6338  & ~(KMP_BARRIER_SLEEP_STATE) )
6339  == KMP_INIT_BARRIER_STATE );
6340  KMP_DEBUG_ASSERT( other_threads[i]->th.th_team == team );
6341 
6342  }
6343 #endif
6344 
6345 #if OMP_30_ENABLED
6346  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
6347  __kmp_task_team_setup( this_thr, team );
6348  }
6349 #endif /* OMP_30_ENABLED */
6350 
6351  //
6352  // The master thread may have changed its blocktime between the
6353  // join barrier and the fork barrier.
6354  //
6355  // Copy the blocktime info to the thread, where __kmp_wait_sleep()
6356  // can access it when the team struct is not guaranteed to exist.
6357  //
6358  // See the note about the corresponding code in __kmp_join_barrier()
6359  // being performance-critical.
6360  //
6361  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
6362 #if OMP_30_ENABLED
6363  this_thr -> th.th_team_bt_intervals = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
6364  this_thr -> th.th_team_bt_set = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
6365 #else
6366  this_thr -> th.th_team_bt_intervals = team -> t.t_set_bt_intervals[tid];
6367  this_thr -> th.th_team_bt_set= team -> t.t_set_bt_set[tid];
6368 #endif // OMP_30_ENABLED
6369  }
6370  } // master
6371 
6372  if ( __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] == bp_linear_bar || __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] == 0 ) {
6373  __kmp_linear_barrier_release( bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
6374  USE_ITT_BUILD_ARG( itt_sync_obj )
6375  );
6376  } else if ( __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] == bp_tree_bar ) {
6377  __kmp_tree_barrier_release( bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
6378  USE_ITT_BUILD_ARG( itt_sync_obj )
6379  );
6380  } else {
6381  __kmp_hyper_barrier_release( bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
6382  USE_ITT_BUILD_ARG( itt_sync_obj )
6383  );
6384  }; // if
6385 
6386  //
6387  // early exit for reaping threads releasing forkjoin barrier
6388  //
6389  if ( TCR_4(__kmp_global.g.g_done) ) {
6390 
6391 #if OMP_30_ENABLED
6392  if ( this_thr->th.th_task_team != NULL ) {
6393  if ( KMP_MASTER_TID( tid ) ) {
6394  TCW_PTR(this_thr->th.th_task_team, NULL);
6395  }
6396  else {
6397  __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
6398  }
6399  }
6400 #endif /* OMP_30_ENABLED */
6401 
6402 #if USE_ITT_BUILD && USE_ITT_NOTIFY
6403  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
6404  if ( !KMP_MASTER_TID( tid ) ) {
6405  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
6406  if ( itt_sync_obj )
6407  __kmp_itt_barrier_finished( gtid, itt_sync_obj );
6408  }
6409  }
6410 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
6411  KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d is leaving early\n", gtid ));
6412  return;
6413  }
6414 
6415  //
6416  // We can now assume that a valid team structure has been allocated
6417  // by the master and propagated to all worker threads.
6418  //
6419  // The current thread, however, may not be part of the team, so we can't
6420  // blindly assume that the team pointer is non-null.
6421  //
6422  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
6423  KMP_DEBUG_ASSERT( team != NULL );
6424  tid = __kmp_tid_from_gtid( gtid );
6425 
6426 #if OMP_30_ENABLED
6427 
6428 # if KMP_BARRIER_ICV_PULL
6429  //
6430  // FIXME - after __kmp_fork_call() is modified to not look at the
6431  // master thread's implicit task ICV's, remove the ! KMP_MASTER_TID
6432  // restriction from this if condition.
6433  //
6434  if (! KMP_MASTER_TID( tid ) ) {
6435  //
6436  // Copy the initial ICV's from the team struct to the implicit task
6437  // for this tid.
6438  //
6439  __kmp_init_implicit_task( team->t.t_ident, team->t.t_threads[tid],
6440  team, tid, FALSE );
6441  copy_icvs( &team->t.t_implicit_task_taskdata[tid].td_icvs,
6442  &team->t.t_initial_icvs );
6443  }
6444 # endif // KMP_BARRIER_ICV_PULL
6445 
6446  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
6447  __kmp_task_team_sync( this_thr, team );
6448  }
6449 
6450 #endif /* OMP_30_ENABLED */
6451 
6452 #if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)
6453  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
6454  if ( proc_bind == proc_bind_intel ) {
6455 #endif
6456 #if KMP_MIC
6457  //
6458  // Call dynamic affinity settings
6459  //
6460  if( __kmp_affinity_type == affinity_balanced && team->t.t_size_changed ) {
6461  __kmp_balanced_affinity( tid, team->t.t_nproc );
6462  }
6463 #endif
6464 #if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)
6465  }
6466  else if ( ( proc_bind != proc_bind_false )
6467  && ( proc_bind != proc_bind_disabled )) {
6468  if ( this_thr->th.th_new_place == this_thr->th.th_current_place ) {
6469  KA_TRACE( 100, ( "__kmp_fork_barrier: T#%d already in correct place %d\n",
6470  __kmp_gtid_from_thread( this_thr ), this_thr->th.th_current_place ) );
6471  }
6472  else {
6473  __kmp_affinity_set_place( gtid );
6474  }
6475  }
6476 #endif
6477 
6478 #if USE_ITT_BUILD && USE_ITT_NOTIFY
6479  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
6480  if ( !KMP_MASTER_TID( tid ) ) {
6481  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); // get correct barrier object
6482  __kmp_itt_barrier_finished( gtid, itt_sync_obj ); // workers call acquired
6483  } // (prepare called inside barrier_release)
6484  }
6485 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
6486  KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d(%d:%d) is leaving\n",
6487  gtid, team->t.t_id, tid ));
6488 }
6489 
6490 
6491 /* ------------------------------------------------------------------------ */
6492 /* ------------------------------------------------------------------------ */
6493 
6494 void *
6495 __kmp_launch_thread( kmp_info_t *this_thr )
6496 {
6497  int gtid = this_thr->th.th_info.ds.ds_gtid;
6498 /* void *stack_data;*/
6499  kmp_team_t *(*volatile pteam);
6500 
6501  KMP_MB();
6502  KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
6503 
6504  if( __kmp_env_consistency_check ) {
6505  this_thr -> th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak?
6506  }
6507 
6508  /* This is the place where threads wait for work */
6509  while( ! TCR_4(__kmp_global.g.g_done) ) {
6510  KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
6511  KMP_MB();
6512 
6513  /* wait for work to do */
6514  KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
6515 
6516  /* No tid yet since not part of a team */
6517  __kmp_fork_barrier( gtid, KMP_GTID_DNE );
6518 
6519  pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
6520 
6521  /* have we been allocated? */
6522  if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
6523  /* we were just woken up, so run our new task */
6524  if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
6525  int rc;
6526  KA_TRACE( 20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6527  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn ));
6528 
6529 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6530  if ( __kmp_inherit_fp_control && (*pteam)->t.t_fp_control_saved ) {
6531  __kmp_clear_x87_fpu_status_word();
6532  __kmp_load_x87_fpu_control_word( &(*pteam)->t.t_x87_fpu_control_word );
6533  __kmp_load_mxcsr( &(*pteam)->t.t_mxcsr );
6534  }
6535 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6536 
6537  rc = (*pteam) -> t.t_invoke( gtid );
6538  KMP_ASSERT( rc );
6539 
6540  KMP_MB();
6541  KA_TRACE( 20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6542  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn ));
6543  }
6544 
6545  /* join barrier after parallel region */
6546  __kmp_join_barrier( gtid );
6547  }
6548  }
6549  TCR_SYNC_PTR(__kmp_global.g.g_done);
6550 
6551 #if OMP_30_ENABLED
6552  if ( TCR_PTR( this_thr->th.th_task_team ) != NULL ) {
6553  __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
6554  }
6555 #endif /* OMP_30_ENABLED */
6556 
6557  /* run the destructors for the threadprivate data for this thread */
6558  __kmp_common_destroy_gtid( gtid );
6559 
6560  KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
6561  KMP_MB();
6562  return this_thr;
6563 }
6564 
6565 /* ------------------------------------------------------------------------ */
6566 /* ------------------------------------------------------------------------ */
6567 
6568 
6569 
6570 void
6571 __kmp_internal_end_dest( void *specific_gtid )
6572 {
6573  #ifdef __INTEL_COMPILER
6574  #pragma warning( push )
6575  #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits
6576  #endif
6577  // Make sure no significant bits are lost
6578  int gtid = (kmp_intptr_t)specific_gtid - 1;
6579  #ifdef __INTEL_COMPILER
6580  #pragma warning( pop )
6581  #endif
6582 
6583  KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6584  /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6585  * this is because 0 is reserved for the nothing-stored case */
6586 
6587  /* josh: One reason for setting the gtid specific data even when it is being
6588  destroyed by pthread is to allow gtid lookup through thread specific data
6589  (__kmp_gtid_get_specific). Some of the code, especially stat code,
6590  that gets executed in the call to __kmp_internal_end_thread, actually
6591  gets the gtid through the thread specific data. Setting it here seems
6592  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
6593  to run smoothly.
6594  todo: get rid of this after we remove the dependence on
6595  __kmp_gtid_get_specific
6596  */
6597  if(gtid >= 0 && KMP_UBER_GTID(gtid))
6598  __kmp_gtid_set_specific( gtid );
6599  #ifdef KMP_TDATA_GTID
6600  __kmp_gtid = gtid;
6601  #endif
6602  __kmp_internal_end_thread( gtid );
6603 }
6604 
6605 #if KMP_OS_UNIX && GUIDEDLL_EXPORTS
6606 
//
// __kmp_internal_end_dtor -- destructor-attribute hook that forwards to
// __kmp_internal_end_atexit().
//
// 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases
// destructors work perfectly, but in the real libiomp5.so there is no evidence that it is
// ever called. However, the -fini linker option in makefile.mk works fine.
//
__attribute__(( destructor ))
void
__kmp_internal_end_dtor( void )
{
    __kmp_internal_end_atexit();
}
6617 
//
// __kmp_internal_end_fini -- forwards to __kmp_internal_end_atexit().
// NOTE(review): presumably the routine named by the -fini linker option;
// confirm against makefile.mk.
//
void
__kmp_internal_end_fini( void )
{
    __kmp_internal_end_atexit();
}
6623 
6624 #endif
6625 
/*
 * __kmp_internal_end_atexit -- shut the library down via
 * __kmp_internal_end_library( -1 ) and, on Windows, close the console.
 *
 * [Windows] josh: when the atexit handler is called, there may still be more
 * than one thread alive.
 */
void
__kmp_internal_end_atexit( void )
{
    KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
    /* [Windows] historical note.
       NOTE(review): the text below says __kmp_internal_end_thread is called,
       but the code now calls __kmp_internal_end_library( -1 ).

       josh: ideally, we want to completely shutdown the library in this atexit
       handler, but stat code that depends on thread specific data for gtid
       fails because that data becomes unavailable at some point during the
       shutdown, so we call __kmp_internal_end_thread instead.  We should
       eventually remove the dependency on __kmp_get_specific_gtid in the stat
       code and use __kmp_internal_end_library to cleanly shutdown the library.

       TODO: Can some of this comment about GVS be removed?
       I suspect that the offending stat code is executed when the calling
       thread tries to clean up a dead root thread's data structures, resulting
       in GVS code trying to close the GVS structures for that thread, but since
       the stat code uses __kmp_get_specific_gtid to get the gtid with the
       assumption that the calling thread is cleaning up itself instead of
       another thread, it gets confused.  This happens because allowing a thread
       to unregister and cleanup another thread is a recent modification for
       addressing an issue with Maxon Cinema4D.  Based on the current design
       (20050722), a thread may end up trying to unregister another thread only
       if thread death does not trigger the calling of
       __kmp_internal_end_thread.  For Linux* OS, there is the thread specific
       data destructor function to detect thread death.  For Windows dynamic,
       there is DllMain(THREAD_DETACH).  For Windows static, there is nothing.
       Thus, the workaround is applicable only for Windows static stat library.
    */
    __kmp_internal_end_library( -1 );
    #if KMP_OS_WINDOWS
        __kmp_close_console();
    #endif
}
6657 
6658 static void
6659 __kmp_reap_thread(
6660  kmp_info_t * thread,
6661  int is_root
6662 ) {
6663 
6664  // It is assumed __kmp_forkjoin_lock is aquired.
6665 
6666  int gtid;
6667 
6668  KMP_DEBUG_ASSERT( thread != NULL );
6669 
6670  gtid = thread->th.th_info.ds.ds_gtid;
6671 
6672  if ( ! is_root ) {
6673 
6674  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
6675  /* Assume the threads are at the fork barrier here */
6676  KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
6677  /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
6678  __kmp_release(
6679  thread,
6680  &thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go,
6681  kmp_release_fence
6682  );
6683  }; // if
6684 
6685 
6686  // Terminate OS thread.
6687  __kmp_reap_worker( thread );
6688 
6689  //
6690  // The thread was killed asynchronously. If it was actively
6691  // spinning in the in the thread pool, decrement the global count.
6692  //
6693  // There is a small timing hole here - if the worker thread was
6694  // just waking up after sleeping in the pool, had reset it's
6695  // th_active_in_pool flag but not decremented the global counter
6696  // __kmp_thread_pool_active_nth yet, then the global counter
6697  // might not get updated.
6698  //
6699  // Currently, this can only happen as the library is unloaded,
6700  // so there are no harmful side effects.
6701  //
6702  if ( thread->th.th_active_in_pool ) {
6703  thread->th.th_active_in_pool = FALSE;
6704  KMP_TEST_THEN_DEC32(
6705  (kmp_int32 *) &__kmp_thread_pool_active_nth );
6706  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
6707  }
6708 
6709  // Decrement # of [worker] threads in the pool.
6710  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
6711  --__kmp_thread_pool_nth;
6712  }; // if
6713 
6714  // Free the fast memory for tasking
6715  #if USE_FAST_MEMORY
6716  __kmp_free_fast_memory( thread );
6717  #endif /* USE_FAST_MEMORY */
6718 
6719  __kmp_suspend_uninitialize_thread( thread );
6720 
6721  KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
6722  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6723 
6724  -- __kmp_all_nth;
6725  // __kmp_nth was decremented when thread is added to the pool.
6726 
6727 #ifdef KMP_ADJUST_BLOCKTIME
6728  /* Adjust blocktime back to user setting or default if necessary */
6729  /* Middle initialization might never have ocurred */
6730  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6731  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6732  if ( __kmp_nth <= __kmp_avail_proc ) {
6733  __kmp_zero_bt = FALSE;
6734  }
6735  }
6736 #endif /* KMP_ADJUST_BLOCKTIME */
6737 
6738  /* free the memory being used */
6739  if( __kmp_env_consistency_check ) {
6740  if ( thread->th.th_cons ) {
6741  __kmp_free_cons_stack( thread->th.th_cons );
6742  thread->th.th_cons = NULL;
6743  }; // if
6744  }
6745 
6746  if ( thread->th.th_pri_common != NULL ) {
6747  __kmp_free( thread->th.th_pri_common );
6748  thread->th.th_pri_common = NULL;
6749  }; // if
6750 
6751  #if KMP_USE_BGET
6752  if ( thread->th.th_local.bget_data != NULL ) {
6753  __kmp_finalize_bget( thread );
6754  }; // if
6755  #endif
6756 
6757 #if (KMP_OS_WINDOWS || KMP_OS_LINUX)
6758  if ( thread->th.th_affin_mask != NULL ) {
6759  KMP_CPU_FREE( thread->th.th_affin_mask );
6760  thread->th.th_affin_mask = NULL;
6761  }; // if
6762 #endif /* (KMP_OS_WINDOWS || KMP_OS_LINUX) */
6763 
6764  __kmp_reap_team( thread->th.th_serial_team );
6765  thread->th.th_serial_team = NULL;
6766  __kmp_free( thread );
6767 
6768  KMP_MB();
6769 
6770 } // __kmp_reap_thread
6771 
6772 static void
6773 __kmp_internal_end(void)
6774 {
6775  int i;
6776 
6777  /* First, unregister the library */
6778  __kmp_unregister_library();
6779 
6780  #if KMP_OS_WINDOWS
6781  /* In Win static library, we can't tell when a root actually dies, so we
6782  reclaim the data structures for any root threads that have died but not
6783  unregistered themselves, in order to shut down cleanly.
6784  In Win dynamic library we also can't tell when a thread dies.
6785  */
6786  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
6787  #endif
6788 
6789  for( i=0 ; i<__kmp_threads_capacity ; i++ )
6790  if( __kmp_root[i] )
6791  if( __kmp_root[i] -> r.r_active )
6792  break;
6793  KMP_MB(); /* Flush all pending memory write invalidates. */
6794  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6795 
6796  if ( i < __kmp_threads_capacity ) {
6797  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6798  KMP_MB(); /* Flush all pending memory write invalidates. */
6799 
6800  //
6801  // Need to check that monitor was initialized before reaping it.
6802  // If we are called form __kmp_atfork_child (which sets
6803  // __kmp_init_parallel = 0), then __kmp_monitor will appear to
6804  // contain valid data, but it is only valid in the parent process,
6805  // not the child.
6806  //
6807  // One of the possible fixes for CQ138434 / CQ140126
6808  // (used in 20091103_dreamworks patch)
6809  //
6810  // New behavior (201008): instead of keying off of the flag
6811  // __kmp_init_parallel, the monitor thread creation is keyed off
6812  // of the new flag __kmp_init_monitor.
6813  //
6814  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
6815  if ( TCR_4( __kmp_init_monitor ) ) {
6816  __kmp_reap_monitor( & __kmp_monitor );
6817  TCW_4( __kmp_init_monitor, 0 );
6818  }
6819  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
6820  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
6821  } else {
6822  /* TODO move this to cleanup code */
6823  #ifdef KMP_DEBUG
6824  /* make sure that everything has properly ended */
6825  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6826  if( __kmp_root[i] ) {
6827  KMP_ASSERT( ! KMP_UBER_GTID( i ) );
6828  KMP_ASSERT( ! __kmp_root[i] -> r.r_active );
6829  }
6830  }
6831  #endif
6832 
6833  KMP_MB();
6834 
6835  // Reap the worker threads.
6836  // This is valid for now, but be careful if threads are reaped sooner.
6837  while ( __kmp_thread_pool != NULL ) { // Loop thru all the thread in the pool.
6838  // Get the next thread from the pool.
6839  kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
6840  __kmp_thread_pool = thread->th.th_next_pool;
6841  // Reap it.
6842  thread->th.th_next_pool = NULL;
6843  thread->th.th_in_pool = FALSE;
6844  __kmp_reap_thread( thread, 0 );
6845  }; // while
6846  __kmp_thread_pool_insert_pt = NULL;
6847 
6848  // Reap teams.
6849  while ( __kmp_team_pool != NULL ) { // Loop thru all the teams in the pool.
6850  // Get the next team from the pool.
6851  kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
6852  __kmp_team_pool = team->t.t_next_pool;
6853  // Reap it.
6854  team->t.t_next_pool = NULL;
6855  __kmp_reap_team( team );
6856  }; // while
6857 
6858  #if OMP_30_ENABLED
6859  __kmp_reap_task_teams( );
6860  #endif /* OMP_30_ENABLED */
6861 
6862  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
6863  // TBD: Add some checking...
6864  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6865  }
6866 
6867  /* Make sure all threadprivate destructors get run by joining with all worker
6868  threads before resetting this flag */
6869  TCW_SYNC_4(__kmp_init_common, FALSE);
6870 
6871  KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
6872  KMP_MB();
6873 
6874  //
6875  // See note above: One of the possible fixes for CQ138434 / CQ140126
6876  //
6877  // FIXME: push both code fragments down and CSE them?
6878  // push them into __kmp_cleanup() ?
6879  //
6880  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
6881  if ( TCR_4( __kmp_init_monitor ) ) {
6882  __kmp_reap_monitor( & __kmp_monitor );
6883  TCW_4( __kmp_init_monitor, 0 );
6884  }
6885  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
6886  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
6887 
6888  } /* else !__kmp_global.t_active */
6889  TCW_4(__kmp_init_gtid, FALSE);
6890  KMP_MB(); /* Flush all pending memory write invalidates. */
6891 
6892 
6893  __kmp_cleanup();
6894 }
6895 
6896 void
6897 __kmp_internal_end_library( int gtid_req )
6898 {
6899  int i;
6900 
6901  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6902  /* this shouldn't be a race condition because __kmp_internal_end() is the
6903  * only place to clear __kmp_serial_init */
6904  /* we'll check this later too, after we get the lock */
6905  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundaant,
6906  // because the next check will work in any case.
6907  if( __kmp_global.g.g_abort ) {
6908  KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
6909  /* TODO abort? */
6910  return;
6911  }
6912  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6913  KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
6914  return;
6915  }
6916 
6917 
6918  KMP_MB(); /* Flush all pending memory write invalidates. */
6919 
6920  /* find out who we are and what we should do */
6921  {
6922  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
6923  KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req ));
6924  if( gtid == KMP_GTID_SHUTDOWN ) {
6925  KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
6926  return;
6927  } else if( gtid == KMP_GTID_MONITOR ) {
6928  KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
6929  return;
6930  } else if( gtid == KMP_GTID_DNE ) {
6931  KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
6932  /* we don't know who we are, but we may still shutdown the library */
6933  } else if( KMP_UBER_GTID( gtid )) {
6934  /* unregister ourselves as an uber thread. gtid is no longer valid */
6935  if( __kmp_root[gtid] -> r.r_active ) {
6936  __kmp_global.g.g_abort = -1;
6937  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6938  KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
6939  return;
6940  } else {
6941  KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
6942  __kmp_unregister_root_current_thread( gtid );
6943  }
6944  } else {
6945  /* worker threads may call this function through the atexit handler, if they call exit() */
6946  /* For now, skip the usual subsequent processing and just dump the debug buffer.
6947  TODO: do a thorough shutdown instead
6948  */
6949  #ifdef DUMP_DEBUG_ON_EXIT
6950  if ( __kmp_debug_buf )
6951  __kmp_dump_debug_buffer( );
6952  #endif
6953  return;
6954  }
6955  }
6956  /* synchronize the termination process */
6957  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6958 
6959  /* have we already finished */
6960  if( __kmp_global.g.g_abort ) {
6961  KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
6962  /* TODO abort? */
6963  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6964  return;
6965  }
6966  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6967  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6968  return;
6969  }
6970 
6971  /* We need this lock to enforce mutex between this reading of
6972  __kmp_threads_capacity and the writing by __kmp_register_root.
6973  Alternatively, we can use a counter of roots that is
6974  atomically updated by __kmp_get_global_thread_id_reg,
6975  __kmp_do_serial_initialize and __kmp_internal_end_*.
6976  */
6977  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6978 
6979  /* now we can safely conduct the actual termination */
6980  __kmp_internal_end();
6981 
6982  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6983  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6984 
6985  KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
6986 
6987  #ifdef DUMP_DEBUG_ON_EXIT
6988  if ( __kmp_debug_buf )
6989  __kmp_dump_debug_buffer();
6990  #endif
6991 
6992  #if KMP_OS_WINDOWS
6993  __kmp_close_console();
6994  #endif
6995 
6996  __kmp_fini_allocator();
6997 
6998 } // __kmp_internal_end_library
6999 
/*
 * __kmp_internal_end_thread -- called when a thread that may be known to the runtime goes away.
 *
 * gtid_req: global thread id of the caller if it is >= 0; otherwise the id is looked up
 *           with __kmp_gtid_get_specific().
 *
 * Behavior, as implemented below:
 *   - returns immediately if g_abort or g_done is already set, or serial initialization never ran;
 *   - returns for the KMP_GTID_SHUTDOWN, KMP_GTID_MONITOR and KMP_GTID_DNE pseudo ids;
 *   - for an uber (root) thread: if its root is still active, sets g_abort and g_done and returns,
 *     otherwise unregisters the root and continues;
 *   - for any other (worker) thread: drops its task-team reference (OMP_30_ENABLED builds) and returns;
 *   - when GUIDEDLL_EXPORTS is defined the function returns right after that, so the shutdown
 *     code that follows is never reached in such builds;
 *   - otherwise, with __kmp_initz_lock and __kmp_forkjoin_lock held, calls __kmp_internal_end()
 *     unless some gtid in [0, __kmp_threads_capacity) still satisfies KMP_UBER_GTID().
 */
7000 void
7001 __kmp_internal_end_thread( int gtid_req )
7002 {
7003  int i;
7004 
7005  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
7006  /* this shouldn't be a race condition because __kmp_internal_end() is the
7007  * only place to clear __kmp_serial_init */
7008  /* we'll check this later too, after we get the lock */
7009  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
7010  // because the next check will work in any case.
7011  if( __kmp_global.g.g_abort ) {
7012  KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
7013  /* TODO abort? */
7014  return;
7015  }
7016  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
7017  KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
7018  return;
7019  }
7020 
7021  KMP_MB(); /* Flush all pending memory write invalidates. */
7022 
7023  /* find out who we are and what we should do */
7024  {
  /* a non-negative request is trusted as the caller's gtid; otherwise ask thread-specific storage */
7025  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
7026  KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req ));
7027  if( gtid == KMP_GTID_SHUTDOWN ) {
7028  KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
7029  return;
7030  } else if( gtid == KMP_GTID_MONITOR ) {
7031  KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
7032  return;
7033  } else if( gtid == KMP_GTID_DNE ) {
7034  KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
7035  return;
7036  /* we don't know who we are */
7037  } else if( KMP_UBER_GTID( gtid )) {
7038  /* unregister ourselves as an uber thread. gtid is no longer valid */
7039  if( __kmp_root[gtid] -> r.r_active ) {
  /* the root is still active: flag abort and done, and leave without tearing anything down */
7040  __kmp_global.g.g_abort = -1;
7041  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
7042  KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
7043  return;
7044  } else {
7045  KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
7046  __kmp_unregister_root_current_thread( gtid );
7047  }
7048  } else {
7049  /* just a worker thread, let's leave */
7050  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
7051 
7052  #if OMP_30_ENABLED
  /* release this worker's reference to its task team, if it still holds one */
7053  if ( gtid >= 0 ) {
7054  kmp_info_t *this_thr = __kmp_threads[ gtid ];
7055  if (TCR_PTR(this_thr->th.th_task_team) != NULL) {
7056  __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
7057  }
7058  }
7059  #endif /* OMP_30_ENABLED */
7060 
7061  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
7062  return;
7063  }
7064  }
7065  #if defined GUIDEDLL_EXPORTS
7066  // AC: lets not shutdown the Linux* OS dynamic library at the exit of uber thread,
7067  // because we will better shutdown later in the library destructor.
7068  // The reason of this change is performance problem when non-openmp thread
7069  // in a loop forks and joins many openmp threads. We can save a lot of time
7070  // keeping worker threads alive until the program shutdown.
7071  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
7072  // Windows(DPD200287443) that occurs when using critical sections from foreign threads.
7073  KA_TRACE( 10, ("__kmp_internal_end_thread: exiting\n") );
  // Everything after this return is unreachable when GUIDEDLL_EXPORTS is defined.
7074  return;
7075  #endif
7076  /* synchronize the termination process */
7077  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7078 
7079  /* have we already finished */
7080  if( __kmp_global.g.g_abort ) {
7081  KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
7082  /* TODO abort? */
7083  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7084  return;
7085  }
7086  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
7087  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7088  return;
7089  }
7090 
7091  /* We need this lock to enforce mutex between this reading of
7092  __kmp_threads_capacity and the writing by __kmp_register_root.
7093  Alternatively, we can use a counter of roots that is
7094  atomically updated by __kmp_get_global_thread_id_reg,
7095  __kmp_do_serial_initialize and __kmp_internal_end_*.
7096  */
7097 
7098  /* should we finish the run-time? are all siblings done? */
7099  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
7100 
  /* any gtid that still satisfies KMP_UBER_GTID() postpones the shutdown; both locks are
     released (inner first) before returning */
7101  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
7102  if ( KMP_UBER_GTID( i ) ) {
7103  KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
7104  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
7105  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7106  return;
7107  };
7108  }
7109 
7110  /* now we can safely conduct the actual termination */
7111 
7112  __kmp_internal_end();
7113 
7114  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
7115  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7116 
7117  KA_TRACE( 10, ("__kmp_internal_end_thread: exit\n" ) );
7118 
7119  #ifdef DUMP_DEBUG_ON_EXIT
7120  if ( __kmp_debug_buf )
7121  __kmp_dump_debug_buffer();
7122  #endif
7123 } // __kmp_internal_end_thread
7124 
7125 // -------------------------------------------------------------------------------------------------
7126 // Library registration stuff.
7127 
// Registration state of this copy of the library. Both variables are filled in by
// __kmp_register_library_startup() and reset by __kmp_unregister_library().
7128 static long __kmp_registration_flag = 0;
7129  // Random value used to indicate library initialization.
 // Assigned 0xCAFE0000 combined with the low 16 bits of a system time reading; 0 while unregistered.
7130 static char * __kmp_registration_str = NULL;
7131  // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
 // Formatted as "<address of the flag>-<flag value in hex>-<library file name>".
7132 
7133 
/*
    Build the name of the environment variable used to register this copy of the library:
    "__KMP_REGISTERED_LIB_<pid>", where <pid> is the value returned by getpid().

    Returns the string produced by __kmp_str_format(). The callers in this file release it
    with KMP_INTERNAL_FREE().

    On RHEL 3u5 if linked statically, getpid() returns different values in each thread.
    If registration and unregistration go in different threads (omp_misc_other_root_exit.cpp
    test case), the name of registered_lib_env env var can not be found, because the name
    will contain different pid.
*/
static inline
char *
__kmp_reg_status_name( void ) {
    return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
} // __kmp_reg_status_name
7144 
7145 
7146 void
7147 __kmp_register_library_startup(
7148  void
7149 ) {
7150 
7151  char * name = __kmp_reg_status_name(); // Name of the environment variable.
7152  int done = 0;
7153  union {
7154  double dtime;
7155  long ltime;
7156  } time;
7157  #if KMP_OS_WINDOWS
7158  __kmp_initialize_system_tick();
7159  #endif
7160  __kmp_read_system_time( & time.dtime );
7161  __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
7162  __kmp_registration_str =
7163  __kmp_str_format(
7164  "%p-%lx-%s",
7165  & __kmp_registration_flag,
7166  __kmp_registration_flag,
7167  KMP_LIBRARY_FILE
7168  );
7169 
7170  KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
7171 
7172  while ( ! done ) {
7173 
7174  char * value = NULL; // Actual value of the environment variable.
7175 
7176  // Set environment variable, but do not overwrite if it is exist.
7177  __kmp_env_set( name, __kmp_registration_str, 0 );
7178  // Check the variable is written.
7179  value = __kmp_env_get( name );
7180  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
7181 
7182  done = 1; // Ok, environment variable set successfully, exit the loop.
7183 
7184  } else {
7185 
7186  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
7187  // Check whether it alive or dead.
7188  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
7189  char * tail = value;
7190  char * flag_addr_str = NULL;
7191  char * flag_val_str = NULL;
7192  char const * file_name = NULL;
7193  __kmp_str_split( tail, '-', & flag_addr_str, & tail );
7194  __kmp_str_split( tail, '-', & flag_val_str, & tail );
7195  file_name = tail;
7196  if ( tail != NULL ) {
7197  long * flag_addr = 0;
7198  long flag_val = 0;
7199  sscanf( flag_addr_str, "%p", & flag_addr );
7200  sscanf( flag_val_str, "%lx", & flag_val );
7201  if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
7202  // First, check whether environment-encoded address is mapped into addr space.
7203  // If so, dereference it to see if it still has the right value.
7204 
7205  if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
7206  neighbor = 1;
7207  } else {
7208  // If not, then we know the other copy of the library is no longer running.
7209  neighbor = 2;
7210  }; // if
7211  }; // if
7212  }; // if
7213  switch ( neighbor ) {
7214  case 0 : // Cannot parse environment variable -- neighbor status unknown.
7215  // Assume it is the incompatible format of future version of the library.
7216  // Assume the other library is alive.
7217  // WARN( ... ); // TODO: Issue a warning.
7218  file_name = "unknown library";
7219  // Attention! Falling to the next case. That's intentional.
7220  case 1 : { // Neighbor is alive.
7221  // Check it is allowed.
7222  char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
7223  if ( ! __kmp_str_match_true( duplicate_ok ) ) {
7224  // That's not allowed. Issue fatal error.
7225  __kmp_msg(
7226  kmp_ms_fatal,
7227  KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
7228  KMP_HNT( DuplicateLibrary ),
7229  __kmp_msg_null
7230  );
7231  }; // if
7232  KMP_INTERNAL_FREE( duplicate_ok );
7233  __kmp_duplicate_library_ok = 1;
7234  done = 1; // Exit the loop.
7235  } break;
7236  case 2 : { // Neighbor is dead.
7237  // Clear the variable and try to register library again.
7238  __kmp_env_unset( name );
7239  } break;
7240  default : {
7241  KMP_DEBUG_ASSERT( 0 );
7242  } break;
7243  }; // switch
7244 
7245  }; // if
7246  KMP_INTERNAL_FREE( (void *) value );
7247 
7248  }; // while
7249  KMP_INTERNAL_FREE( (void *) name );
7250 
7251 } // func __kmp_register_library_startup
7252 
7253 
7254 void
7255 __kmp_unregister_library( void ) {
7256 
7257  char * name = __kmp_reg_status_name();
7258  char * value = __kmp_env_get( name );
7259 
7260  KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
7261  KMP_DEBUG_ASSERT( __kmp_registration_str != NULL );
7262  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
7263  // Ok, this is our variable. Delete it.
7264  __kmp_env_unset( name );
7265  }; // if
7266 
7267  KMP_INTERNAL_FREE( __kmp_registration_str );
7268  KMP_INTERNAL_FREE( value );
7269  KMP_INTERNAL_FREE( name );
7270 
7271  __kmp_registration_flag = 0;
7272  __kmp_registration_str = NULL;
7273 
7274 } // __kmp_unregister_library
7275 
7276 
7277 // End of Library registration stuff.
7278 // -------------------------------------------------------------------------------------------------
7279 
/*
 * __kmp_do_serial_initialize -- first stage of runtime initialization.
 *
 * In order, the routine: checks the sizes of the fixed-width kmp types, initializes the
 * internal allocator, registers the library (__kmp_register_library_startup), clears
 * g_abort / g_done, initializes the global locks, calls __kmp_runtime_initialize(), assigns
 * built-in defaults to the settings that __kmp_env_initialize() may then override, allocates
 * the __kmp_threads / __kmp_root arrays, registers the calling thread as the initial root,
 * installs the atfork / atexit / signal hooks that apply to the build configuration, and
 * finally sets __kmp_init_serial.
 *
 * It takes no lock itself; __kmp_serial_initialize() calls it with __kmp_initz_lock held.
 */
7280 static void
7281 __kmp_do_serial_initialize( void )
7282 {
7283  int i, gtid;
7284  int size;
7285 
7286  KA_TRACE( 10, ("__kmp_serial_initialize: enter\n" ) );
7287 
 /* sanity checks on the fixed-width integer typedefs (debug builds only) */
7288  KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
7289  KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
7290  KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
7291  KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
7292  KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
7293 
7294  __kmp_validate_locks();
7295 
7296  /* Initialize internal memory allocator */
7297  __kmp_init_allocator();
7298 
7299  /* Register the library startup via an environment variable
7300  and check to see whether another copy of the library is already
7301  registered. */
7302 
7303  __kmp_register_library_startup( );
7304 
7305  /* TODO reinitialization of library */
7306  if( TCR_4(__kmp_global.g.g_done) ) {
7307  KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
7308  }
7309 
 /* start from a clean "not aborted, not done" state */
7310  __kmp_global.g.g_abort = 0;
7311  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7312 
7313  /* initialize the locks */
7314 #if KMP_USE_ADAPTIVE_LOCKS
7315 #if KMP_DEBUG_ADAPTIVE_LOCKS
7316  __kmp_init_speculative_stats();
7317 #endif
7318 #endif
7319  __kmp_init_lock( & __kmp_global_lock );
7320  __kmp_init_queuing_lock( & __kmp_dispatch_lock );
7321  __kmp_init_lock( & __kmp_debug_lock );
7322  __kmp_init_atomic_lock( & __kmp_atomic_lock );
7323  __kmp_init_atomic_lock( & __kmp_atomic_lock_1i );
7324  __kmp_init_atomic_lock( & __kmp_atomic_lock_2i );
7325  __kmp_init_atomic_lock( & __kmp_atomic_lock_4i );
7326  __kmp_init_atomic_lock( & __kmp_atomic_lock_4r );
7327  __kmp_init_atomic_lock( & __kmp_atomic_lock_8i );
7328  __kmp_init_atomic_lock( & __kmp_atomic_lock_8r );
7329  __kmp_init_atomic_lock( & __kmp_atomic_lock_8c );
7330  __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
7331  __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
7332  __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
7333  __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
7334  __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
7335  __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock );
7336  __kmp_init_bootstrap_lock( & __kmp_exit_lock );
7337  __kmp_init_bootstrap_lock( & __kmp_monitor_lock );
7338  __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
7339 
7340  /* conduct initialization and initial setup of configuration */
7341 
7342  __kmp_runtime_initialize();
7343 
7344  // Some global variable initialization moved here from kmp_env_initialize()
7345 #ifdef KMP_DEBUG
7346  kmp_diag = 0;
7347 #endif
7348  __kmp_abort_delay = 0;
7349 
7350  // From __kmp_init_dflt_team_nth()
7351  /* assume the entire machine will be used */
 /* the upper bound is clamped to the range [KMP_MIN_NTH, __kmp_sys_max_nth] */
7352  __kmp_dflt_team_nth_ub = __kmp_xproc;
7353  if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
7354  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7355  }
7356  if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
7357  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7358  }
7359  __kmp_max_nth = __kmp_sys_max_nth;
7360  __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
7361 
7362  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
7363  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7364  __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
7365  __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
7366  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7367  __kmp_library = library_throughput;
7368  // From KMP_SCHEDULE initialization
7369  __kmp_static = kmp_sch_static_balanced;
7370  // AC: do not use analytical here, because it is non-monotonous
7371  //__kmp_guided = kmp_sch_guided_iterative_chunked;
7372  #if OMP_30_ENABLED
7373  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeate assignment
7374  #endif // OMP_30_ENABLED
7375  // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch bit control and barrier method
7376  // control parts
 // The four macros below exist only for the loop that follows and are #undef'ed right after it.
7377  #if KMP_FAST_REDUCTION_BARRIER
7378  #define kmp_reduction_barrier_gather_bb ((int)1)
7379  #define kmp_reduction_barrier_release_bb ((int)1)
7380  #define kmp_reduction_barrier_gather_pat bp_hyper_bar
7381  #define kmp_reduction_barrier_release_pat bp_hyper_bar
7382  #endif // KMP_FAST_REDUCTION_BARRIER
 // Every barrier type gets the default branch bits and pattern; the reduction barrier is then
 // overridden when KMP_FAST_REDUCTION_BARRIER is enabled.
7383  for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
7384  __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
7385  __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
7386  __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
7387  __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
7388  #if KMP_FAST_REDUCTION_BARRIER
7389  if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
7390  __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
7391  __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
7392  __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
7393  __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
7394  }
7395  #endif // KMP_FAST_REDUCTION_BARRIER
7396  }
7397  #if KMP_FAST_REDUCTION_BARRIER
7398  #undef kmp_reduction_barrier_release_pat
7399  #undef kmp_reduction_barrier_gather_pat
7400  #undef kmp_reduction_barrier_release_bb
7401  #undef kmp_reduction_barrier_gather_bb
7402  #endif // KMP_FAST_REDUCTION_BARRIER
7403  #if KMP_MIC
7404  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7405  __kmp_barrier_gather_branch_bits [ 0 ] = 3; // plane gather
7406  __kmp_barrier_release_branch_bits[ 1 ] = 1; // forkjoin release
7407  #endif
7408 
7409  // From KMP_CHECKS initialization
7410 #ifdef KMP_DEBUG
7411  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7412 #else
7413  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7414 #endif
7415 
7416  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7417  __kmp_foreign_tp = TRUE;
7418 
7419  __kmp_global.g.g_dynamic = FALSE;
7420  __kmp_global.g.g_dynamic_mode = dynamic_default;
7421 
 // All defaults above are in place before the environment is parsed.
7422  __kmp_env_initialize( NULL );
7423  // Print all messages in message catalog for testing purposes.
7424  #ifdef KMP_DEBUG
7425  char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
7426  if ( __kmp_str_match_true( val ) ) {
7427  kmp_str_buf_t buffer;
7428  __kmp_str_buf_init( & buffer );
7429  __kmp_i18n_dump_catalog( buffer );
7430  __kmp_printf( "%s", buffer.str );
7431  __kmp_str_buf_free( & buffer );
7432  }; // if
7433  __kmp_env_free( & val );
7434  #endif
7435 
7436  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7437  __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7438 
7439  // omalyshe: This initialisation beats env var setting.
7440  //__kmp_load_balance_interval = 1.0;
7441 
7442  // If the library is shut down properly, both pools must be NULL. Just in case, set them
7443  // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
7444  KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
7445  KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
7446  KMP_DEBUG_ASSERT( __kmp_team_pool == NULL );
7447  __kmp_thread_pool = NULL;
7448  __kmp_thread_pool_insert_pt = NULL;
7449  __kmp_team_pool = NULL;
7450 
7451  /* Allocate all of the variable sized records */
7452  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
7453  /* Since allocation is cache-aligned, just add extra padding at the end */
 /* one allocation holds both arrays: __kmp_threads first, __kmp_root immediately after it */
7454  size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
7455  __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
7456  __kmp_root = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
7457 
7458  /* init thread counts */
7459  KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
7460  KMP_DEBUG_ASSERT( __kmp_nth == 0 ); // something was wrong in termination.
7461  __kmp_all_nth = 0;
7462  __kmp_nth = 0;
7463 
7464  /* setup the uber master thread and hierarchy */
7465  gtid = __kmp_register_root( TRUE );
7466  KA_TRACE( 10, ("__kmp_do_serial_initialize T#%d\n", gtid ));
7467  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
7468  KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
7469 
7470  KMP_MB(); /* Flush all pending memory write invalidates. */
7471 
7472  __kmp_common_initialize();
7473 
7474  #if KMP_OS_UNIX
7475  /* invoke the child fork handler */
7476  __kmp_register_atfork();
7477  #endif
7478 
7479  #if ! defined GUIDEDLL_EXPORTS
7480  {
7481  /* Invoke the exit handler when the program finishes, only for static library.
7482  For dynamic library, we already have _fini and DllMain.
7483  */
7484  int rc = atexit( __kmp_internal_end_atexit );
7485  if ( rc != 0 ) {
7486  __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
7487  }; // if
7488  }
7489  #endif
7490 
7491  #if KMP_HANDLE_SIGNALS
7492  #if KMP_OS_UNIX
7493  /* NOTE: make sure that this is called before the user installs
7494  * their own signal handlers so that the user handlers
7495  * are called first. this way they can return false,
7496  * not call our handler, avoid terminating the library,
7497  * and continue execution where they left off. */
7498  __kmp_install_signals( FALSE );
7499  #endif /* KMP_OS_UNIX */
7500  #if KMP_OS_WINDOWS
7501  __kmp_install_signals( TRUE );
7502  #endif /* KMP_OS_WINDOWS */
7503  #endif
7504 
7505  /* we have finished the serial initialization */
7506  __kmp_init_counter ++;
7507 
7508  __kmp_init_serial = TRUE;
7509 
7510  if (__kmp_settings) {
7511  __kmp_env_print();
7512  }
7513 
7514  KMP_MB();
7515 
7516  KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
7517 }
7518 
7519 void
7520 __kmp_serial_initialize( void )
7521 {
7522  if ( __kmp_init_serial ) {
7523  return;
7524  }
7525  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7526  if ( __kmp_init_serial ) {
7527  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7528  return;
7529  }
7530  __kmp_do_serial_initialize();
7531  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7532 }
7533 
7534 static void
7535 __kmp_do_middle_initialize( void )
7536 {
7537  int i, j;
7538  int prev_dflt_team_nth;
7539 
7540  if( !__kmp_init_serial ) {
7541  __kmp_do_serial_initialize();
7542  }
7543 
7544  KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
7545 
7546  //
7547  // Save the previous value for the __kmp_dflt_team_nth so that
7548  // we can avoid some reinitialization if it hasn't changed.
7549  //
7550  prev_dflt_team_nth = __kmp_dflt_team_nth;
7551 
7552 #if KMP_OS_WINDOWS || KMP_OS_LINUX
7553  //
7554  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7555  // number of cores on the machine.
7556  //
7557  __kmp_affinity_initialize();
7558 
7559  //
7560  // Run through the __kmp_threads array and set the affinity mask
7561  // for each root thread that is currently registered with the RTL.
7562  //
7563  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
7564  if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
7565  __kmp_affinity_set_init_mask( i, TRUE );
7566  }
7567  }
7568 #endif /* KMP_OS_WINDOWS || KMP_OS_LINUX */
7569 
7570  KMP_ASSERT( __kmp_xproc > 0 );
7571  if ( __kmp_avail_proc == 0 ) {
7572  __kmp_avail_proc = __kmp_xproc;
7573  }
7574 
7575  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
7576  j = 0;
7577  while ( __kmp_nested_nth.used && ! __kmp_nested_nth.nth[ j ] ) {
7578  __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
7579  j++;
7580  }
7581 
7582  if ( __kmp_dflt_team_nth == 0 ) {
7583 #ifdef KMP_DFLT_NTH_CORES
7584  //
7585  // Default #threads = #cores
7586  //
7587  __kmp_dflt_team_nth = __kmp_ncores;
7588  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
7589  __kmp_dflt_team_nth ) );
7590 #else
7591  //
7592  // Default #threads = #available OS procs
7593  //
7594  __kmp_dflt_team_nth = __kmp_avail_proc;
7595  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
7596  __kmp_dflt_team_nth ) );
7597 #endif /* KMP_DFLT_NTH_CORES */
7598  }
7599 
7600  if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
7601  __kmp_dflt_team_nth = KMP_MIN_NTH;
7602  }
7603  if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
7604  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7605  }
7606 
7607  //
7608  // There's no harm in continuing if the following check fails,
7609  // but it indicates an error in the previous logic.
7610  //
7611  KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
7612 
7613  if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
7614  //
7615  // Run through the __kmp_threads array and set the num threads icv
7616  // for each root thread that is currently registered with the RTL
7617  // (which has not already explicitly set its nthreads-var with a
7618  // call to omp_set_num_threads()).
7619  //
7620  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
7621  kmp_info_t *thread = __kmp_threads[ i ];
7622  if ( thread == NULL ) continue;
7623 #if OMP_30_ENABLED
7624  if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
7625 #else
7626  if ( thread->th.th_team->t.t_set_nproc[ thread->th.th_info.ds.ds_tid ] != 0 ) continue;
7627 #endif /* OMP_30_ENABLED */
7628 
7629  set__nproc_p( __kmp_threads[ i ], __kmp_dflt_team_nth );
7630  }
7631  }
7632  KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7633  __kmp_dflt_team_nth) );
7634 
7635 #ifdef KMP_ADJUST_BLOCKTIME
7636  /* Adjust blocktime to zero if necessary */
7637  /* now that __kmp_avail_proc is set */
7638  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
7639  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
7640  if ( __kmp_nth > __kmp_avail_proc ) {
7641  __kmp_zero_bt = TRUE;
7642  }
7643  }
7644 #endif /* KMP_ADJUST_BLOCKTIME */
7645 
7646  /* we have finished middle initialization */
7647  TCW_SYNC_4(__kmp_init_middle, TRUE);
7648 
7649  KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
7650 }
7651 
7652 void
7653 __kmp_middle_initialize( void )
7654 {
7655  if ( __kmp_init_middle ) {
7656  return;
7657  }
7658  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7659  if ( __kmp_init_middle ) {
7660  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7661  return;
7662  }
7663  __kmp_do_middle_initialize();
7664  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7665 }
7666 
7667 void
7668 __kmp_parallel_initialize( void )
7669 {
7670  int gtid = __kmp_entry_gtid(); // this might be a new root
7671 
7672  /* syncronize parallel initialization (for sibling) */
7673  if( TCR_4(__kmp_init_parallel) ) return;
7674  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7675  if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
7676 
7677  /* TODO reinitialization after we have already shut down */
7678  if( TCR_4(__kmp_global.g.g_done) ) {
7679  KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
7680  __kmp_infinite_loop();
7681  }
7682 
7683  /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
7684  would cause a deadlock. So we call __kmp_do_serial_initialize directly.
7685  */
7686  if( !__kmp_init_middle ) {
7687  __kmp_do_middle_initialize();
7688  }
7689 
7690  /* begin initialization */
7691  KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
7692  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
7693 
7694 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7695  //
7696  // Save the FP control regs.
7697  // Worker threads will set theirs to these values at thread startup.
7698  //
7699  __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
7700  __kmp_store_mxcsr( &__kmp_init_mxcsr );
7701  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7702 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7703 
7704 #if KMP_OS_UNIX
7705 # if KMP_HANDLE_SIGNALS
7706  /* must be after __kmp_serial_initialize */
7707  __kmp_install_signals( TRUE );
7708 # endif
7709 #endif
7710 
7711  __kmp_suspend_initialize();
7712 
7713 # if defined(USE_LOAD_BALANCE)
7714  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
7715  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7716  }
7717 #else
7718  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
7719  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7720  }
7721 #endif
7722 
7723  if ( __kmp_version ) {
7724  __kmp_print_version_2();
7725  }
7726 
7727  /* we have finished parallel initialization */
7728  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7729 
7730  KMP_MB();
7731  KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
7732 
7733  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7734 }
7735 
7736 
7737 /* ------------------------------------------------------------------------ */
7738 
7739 void
7740 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
7741  kmp_team_t *team )
7742 {
7743  kmp_disp_t *dispatch;
7744 
7745  KMP_MB();
7746 
7747  /* none of the threads have encountered any constructs, yet. */
7748  this_thr->th.th_local.this_construct = 0;
7749  this_thr->th.th_local.last_construct = 0;
7750 #if KMP_CACHE_MANAGE
7751  KMP_CACHE_PREFETCH( &this_thr -> th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
7752 #endif /* KMP_CACHE_MANAGE */
7753  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7754  KMP_DEBUG_ASSERT( dispatch );
7755  KMP_DEBUG_ASSERT( team -> t.t_dispatch );
7756  //KMP_DEBUG_ASSERT( this_thr -> th.th_dispatch == &team -> t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
7757 
7758  dispatch -> th_disp_index = 0; /* reset the dispatch buffer counter */
7759 
7760  if( __kmp_env_consistency_check )
7761  __kmp_push_parallel( gtid, team->t.t_ident );
7762 
7763  KMP_MB(); /* Flush all pending memory write invalidates. */
7764 }
7765 
7766 void
7767 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
7768  kmp_team_t *team )
7769 {
7770  if( __kmp_env_consistency_check )
7771  __kmp_pop_parallel( gtid, team->t.t_ident );
7772 }
7773 
7774 int
7775 __kmp_invoke_task_func( int gtid )
7776 {
7777  int rc;
7778  int tid = __kmp_tid_from_gtid( gtid );
7779  kmp_info_t *this_thr = __kmp_threads[ gtid ];
7780  kmp_team_t *team = this_thr -> th.th_team;
7781 
7782  __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
7783 #if USE_ITT_BUILD
7784  if ( __itt_stack_caller_create_ptr ) {
7785  __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
7786  }
7787 #endif /* USE_ITT_BUILD */
7788  rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
7789  gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv );
7790 
7791 #if USE_ITT_BUILD
7792  if ( __itt_stack_caller_create_ptr ) {
7793  __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
7794  }
7795 #endif /* USE_ITT_BUILD */
7796  __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
7797 
7798  return rc;
7799 }
7800 
7801 #if OMP_40_ENABLED
7802 void
7803 __kmp_teams_master( microtask_t microtask, int gtid )
7804 {
7805  // This routine is called by all master threads in teams construct
7806  kmp_info_t *this_thr = __kmp_threads[ gtid ];
7807  kmp_team_t *team = this_thr -> th.th_team;
7808  ident_t *loc = team->t.t_ident;
7809 
7810 #if KMP_DEBUG
7811  int tid = __kmp_tid_from_gtid( gtid );
7812  KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
7813  gtid, tid, microtask) );
7814 #endif
7815 
7816  // Launch league of teams now, but not let workers execute
7817  // (they hang on fork barrier until next parallel)
7818  this_thr->th.th_set_nproc = this_thr->th.th_set_nth_teams;
7819  __kmp_fork_call( loc, gtid, TRUE,
7820  team->t.t_argc,
7821  microtask,
7822  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
7823  NULL );
7824  __kmp_join_call( loc, gtid, 1 ); // AC: last parameter "1" eliminates join barrier which won't work because
7825  // worker threads are in a fork barrier waiting for more parallel regions
7826 }
7827 
7828 int
7829 __kmp_invoke_teams_master( int gtid )
7830 {
7831  #if KMP_DEBUG
7832  if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
7833  KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
7834  #endif
7835 
7836  __kmp_teams_master( (microtask_t)__kmp_threads[gtid]->th.th_team_microtask, gtid );
7837 
7838  return 1;
7839 }
7840 #endif /* OMP_40_ENABLED */
7841 
7842 /* this sets the requested number of threads for the next parallel region
7843  * encountered by this team */
7844 /* since this should be enclosed in the forkjoin critical section it
7845  * should avoid race conditions with assymmetrical nested parallelism */
7846 
7847 void
7848 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
7849 {
7850  kmp_info_t *thr = __kmp_threads[gtid];
7851 
7852  if( num_threads > 0 )
7853  thr -> th.th_set_nproc = num_threads;
7854 }
7855 
7856 #if OMP_40_ENABLED
7857 
7858 /* this sets the requested number of teams for the teams region and/or
7859  * the number of threads for the next parallel region encountered */
7860 void
7861 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
7862 {
7863  kmp_info_t *thr = __kmp_threads[gtid];
7864  // The number of teams is the number of threads in the outer "parallel"
7865  if( num_teams > 0 ) {
7866  thr -> th.th_set_nproc = num_teams;
7867  } else {
7868  thr -> th.th_set_nproc = 1; // AC: default number of teams is 1;
7869  // TODO: should it be __kmp_ncores ?
7870  }
7871  // The number of threads is for inner parallel regions
7872  if( num_threads > 0 ) {
7873  thr -> th.th_set_nth_teams = num_threads;
7874  } else {
7875  if( !TCR_4(__kmp_init_middle) )
7876  __kmp_middle_initialize();
7877  thr -> th.th_set_nth_teams = __kmp_avail_proc / thr -> th.th_set_nproc;
7878  }
7879 }
7880 
7881 
7882 //
7883 // Set the proc_bind var to use in the following parallel region.
7884 //
7885 void
7886 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
7887 {
7888  kmp_info_t *thr = __kmp_threads[gtid];
7889  thr -> th.th_set_proc_bind = proc_bind;
7890 }
7891 
7892 #endif /* OMP_40_ENABLED */
7893 
7894 /* Launch the worker threads into the microtask. */
7895 
7896 void
7897 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
7898 {
7899  kmp_info_t *this_thr = __kmp_threads[gtid];
7900 
7901 #ifdef KMP_DEBUG
7902  int f;
7903 #endif /* KMP_DEBUG */
7904 
7905  KMP_DEBUG_ASSERT( team );
7906  KMP_DEBUG_ASSERT( this_thr -> th.th_team == team );
7907  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7908  KMP_MB(); /* Flush all pending memory write invalidates. */
7909 
7910  team -> t.t_construct = 0; /* no single directives seen yet */
7911  team -> t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
7912 
7913  /* Reset the identifiers on the dispatch buffer */
7914  KMP_DEBUG_ASSERT( team -> t.t_disp_buffer );
7915  if ( team->t.t_max_nproc > 1 ) {
7916  int i;
7917  for (i = 0; i < KMP_MAX_DISP_BUF; ++i)
7918  team -> t.t_disp_buffer[ i ].buffer_index = i;
7919  } else {
7920  team -> t.t_disp_buffer[ 0 ].buffer_index = 0;
7921  }
7922 
7923  KMP_MB(); /* Flush all pending memory write invalidates. */
7924  KMP_ASSERT( this_thr -> th.th_team == team );
7925 
7926 #ifdef KMP_DEBUG
7927  for( f=0 ; f<team->t.t_nproc ; f++ ) {
7928  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
7929  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
7930  }
7931 #endif /* KMP_DEBUG */
7932 
7933  /* release the worker threads so they may begin working */
7934  __kmp_fork_barrier( gtid, 0 );
7935 }
7936 
7937 
7938 void
7939 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
7940 {
7941  kmp_info_t *this_thr = __kmp_threads[gtid];
7942 
7943  KMP_DEBUG_ASSERT( team );
7944  KMP_DEBUG_ASSERT( this_thr -> th.th_team == team );
7945  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7946  KMP_MB(); /* Flush all pending memory write invalidates. */
7947 
7948  /* Join barrier after fork */
7949 
7950 #ifdef KMP_DEBUG
7951  if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
7952  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
7953  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
7954  gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
7955  __kmp_print_structure();
7956  }
7957  KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
7958  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
7959 #endif /* KMP_DEBUG */
7960 
7961  __kmp_join_barrier( gtid ); /* wait for everyone */
7962 
7963  KMP_MB(); /* Flush all pending memory write invalidates. */
7964  KMP_ASSERT( this_thr -> th.th_team == team );
7965 }
7966 
7967 
7968 /* ------------------------------------------------------------------------ */
7969 /* ------------------------------------------------------------------------ */
7970 
7971 #ifdef USE_LOAD_BALANCE
7972 
7973 //
7974 // Return the worker threads actively spinning in the hot team, if we
7975 // are at the outermost level of parallelism. Otherwise, return 0.
7976 //
7977 static int
7978 __kmp_active_hot_team_nproc( kmp_root_t *root )
7979 {
7980  int i;
7981  int retval;
7982  kmp_team_t *hot_team;
7983 
7984  if ( root->r.r_active ) {
7985  return 0;
7986  }
7987  hot_team = root->r.r_hot_team;
7988  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
7989  return hot_team->t.t_nproc - 1; // Don't count master thread
7990  }
7991 
7992  //
7993  // Skip the master thread - it is accounted for elsewhere.
7994  //
7995  retval = 0;
7996  for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
7997  if ( hot_team->t.t_threads[i]->th.th_active ) {
7998  retval++;
7999  }
8000  }
8001  return retval;
8002 }
8003 
8004 //
8005 // Perform an automatic adjustment to the number of
8006 // threads used by the next parallel region.
8007 //
8008 static int
8009 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
8010 {
8011  int retval;
8012  int pool_active;
8013  int hot_team_active;
8014  int team_curr_active;
8015  int system_active;
8016 
8017  KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
8018  root, set_nproc ) );
8019  KMP_DEBUG_ASSERT( root );
8020  #if OMP_30_ENABLED
8021  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
8022  #else
8023  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_set_dynamic[0] == TRUE );
8024  #endif
8025  KMP_DEBUG_ASSERT( set_nproc > 1 );
8026 
8027  if ( set_nproc == 1) {
8028  KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
8029  return 1;
8030  }
8031 
8032  //
8033  // Threads that are active in the thread pool, active in the hot team
8034  // for this particular root (if we are at the outer par level), and
8035  // the currently executing thread (to become the master) are available
8036  // to add to the new team, but are currently contributing to the system
8037  // load, and must be accounted for.
8038  //
8039  pool_active = TCR_4(__kmp_thread_pool_active_nth);
8040  hot_team_active = __kmp_active_hot_team_nproc( root );
8041  team_curr_active = pool_active + hot_team_active + 1;
8042 
8043  //
8044  // Check the system load.
8045  //
8046  system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
8047  KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
8048  system_active, pool_active, hot_team_active ) );
8049 
8050  if ( system_active < 0 ) {
8051  //
8052  // There was an error reading the necessary info from /proc,
8053  // so use the thread limit algorithm instead. Once we set
8054  // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
8055  // we shouldn't wind up getting back here.
8056  //
8057  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8058  KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
8059 
8060  //
8061  // Make this call behave like the thread limit algorithm.
8062  //
8063  retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
8064  : root->r.r_hot_team->t.t_nproc);
8065  if ( retval > set_nproc ) {
8066  retval = set_nproc;
8067  }
8068  if ( retval < KMP_MIN_NTH ) {
8069  retval = KMP_MIN_NTH;
8070  }
8071 
8072  KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
8073  return retval;
8074  }
8075 
8076  //
8077  // There is a slight delay in the load balance algorithm in detecting
8078  // new running procs. The real system load at this instant should be
8079  // at least as large as the #active omp thread that are available to
8080  // add to the team.
8081  //
8082  if ( system_active < team_curr_active ) {
8083  system_active = team_curr_active;
8084  }
8085  retval = __kmp_avail_proc - system_active + team_curr_active;
8086  if ( retval > set_nproc ) {
8087  retval = set_nproc;
8088  }
8089  if ( retval < KMP_MIN_NTH ) {
8090  retval = KMP_MIN_NTH;
8091  }
8092 
8093  KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
8094  return retval;
8095 } // __kmp_load_balance_nproc()
8096 
8097 #endif /* USE_LOAD_BALANCE */
8098 
8099 
8100 /* ------------------------------------------------------------------------ */
8101 /* ------------------------------------------------------------------------ */
8102 
8103 /* NOTE: this is called with the __kmp_init_lock held */
8104 void
8105 __kmp_cleanup( void )
8106 {
8107  int f;
8108 
8109  KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
8110 
8111  if (TCR_4(__kmp_init_parallel)) {
8112 #if KMP_HANDLE_SIGNALS
8113  __kmp_remove_signals();
8114 #endif
8115  TCW_4(__kmp_init_parallel, FALSE);
8116  }
8117 
8118  if (TCR_4(__kmp_init_middle)) {
8119 #if KMP_OS_WINDOWS || KMP_OS_LINUX
8120  __kmp_affinity_uninitialize();
8121 #endif /* KMP_OS_WINDOWS || KMP_OS_LINUX */
8122  TCW_4(__kmp_init_middle, FALSE);
8123  }
8124 
8125  KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
8126 
8127  if (__kmp_init_serial) {
8128 
8129  __kmp_runtime_destroy();
8130 
8131  __kmp_init_serial = FALSE;
8132  }
8133 
8134  for ( f = 0; f < __kmp_threads_capacity; f++ ) {
8135  if ( __kmp_root[ f ] != NULL ) {
8136  __kmp_free( __kmp_root[ f ] );
8137  __kmp_root[ f ] = NULL;
8138  }
8139  }
8140  __kmp_free( __kmp_threads );
8141  // __kmp_threads and __kmp_root were allocated at once, as single block, so there is no need in
8142  // freeing __kmp_root.
8143  __kmp_threads = NULL;
8144  __kmp_root = NULL;
8145  __kmp_threads_capacity = 0;
8146 
8147  __kmp_cleanup_user_locks();
8148 
8149  #if KMP_OS_LINUX || KMP_OS_WINDOWS
8150  KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
8151  __kmp_cpuinfo_file = NULL;
8152  #endif /* KMP_OS_LINUX || KMP_OS_WINDOWS */
8153 
8154  #if KMP_USE_ADAPTIVE_LOCKS
8155  #if KMP_DEBUG_ADAPTIVE_LOCKS
8156  __kmp_print_speculative_stats();
8157  #endif
8158  #endif
8159  KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
8160  __kmp_nested_nth.nth = NULL;
8161  __kmp_nested_nth.size = 0;
8162  __kmp_nested_nth.used = 0;
8163 
8164  __kmp_i18n_catclose();
8165 
8166  KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
8167 }
8168 
8169 /* ------------------------------------------------------------------------ */
8170 /* ------------------------------------------------------------------------ */
8171 
8172 int
8173 __kmp_ignore_mppbeg( void )
8174 {
8175  char *env;
8176 
8177  if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
8178  if (__kmp_str_match_false( env ))
8179  return FALSE;
8180  }
8181  // By default __kmpc_begin() is no-op.
8182  return TRUE;
8183 }
8184 
8185 int
8186 __kmp_ignore_mppend( void )
8187 {
8188  char *env;
8189 
8190  if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
8191  if (__kmp_str_match_false( env ))
8192  return FALSE;
8193  }
8194  // By default __kmpc_end() is no-op.
8195  return TRUE;
8196 }
8197 
8198 void
8199 __kmp_internal_begin( void )
8200 {
8201  int gtid;
8202  kmp_root_t *root;
8203 
8204  /* this is a very important step as it will register new sibling threads
8205  * and assign these new uber threads a new gtid */
8206  gtid = __kmp_entry_gtid();
8207  root = __kmp_threads[ gtid ] -> th.th_root;
8208  KMP_ASSERT( KMP_UBER_GTID( gtid ));
8209 
8210  if( root->r.r_begin ) return;
8211  __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
8212  if( root->r.r_begin ) {
8213  __kmp_release_lock( & root->r.r_begin_lock, gtid );
8214  return;
8215  }
8216 
8217  root -> r.r_begin = TRUE;
8218 
8219  __kmp_release_lock( & root->r.r_begin_lock, gtid );
8220 }
8221 
8222 
8223 /* ------------------------------------------------------------------------ */
8224 /* ------------------------------------------------------------------------ */
8225 
8226 void
8227 __kmp_user_set_library (enum library_type arg)
8228 {
8229  int gtid;
8230  kmp_root_t *root;
8231  kmp_info_t *thread;
8232 
8233  /* first, make sure we are initialized so we can get our gtid */
8234 
8235  gtid = __kmp_entry_gtid();
8236  thread = __kmp_threads[ gtid ];
8237 
8238  root = thread -> th.th_root;
8239 
8240  KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
8241  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
8242  KMP_WARNING( SetLibraryIncorrectCall );
8243  return;
8244  }
8245 
8246  switch ( arg ) {
8247  case library_serial :
8248  thread -> th.th_set_nproc = 0;
8249  set__nproc_p( thread, 1 );
8250  break;
8251  case library_turnaround :
8252  thread -> th.th_set_nproc = 0;
8253  set__nproc_p( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
8254  break;
8255  case library_throughput :
8256  thread -> th.th_set_nproc = 0;
8257  set__nproc_p( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
8258  break;
8259  default:
8260  KMP_FATAL( UnknownLibraryType, arg );
8261  }
8262 
8263  __kmp_aux_set_library ( arg );
8264 }
8265 
8266 void
8267 __kmp_aux_set_stacksize( size_t arg )
8268 {
8269  if (! __kmp_init_serial)
8270  __kmp_serial_initialize();
8271 
8272 #if KMP_OS_DARWIN
8273  if (arg & (0x1000 - 1)) {
8274  arg &= ~(0x1000 - 1);
8275  if(arg + 0x1000) /* check for overflow if we round up */
8276  arg += 0x1000;
8277  }
8278 #endif
8279  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
8280 
8281  /* only change the default stacksize before the first parallel region */
8282  if (! TCR_4(__kmp_init_parallel)) {
8283  size_t value = arg; /* argument is in bytes */
8284 
8285  if (value < __kmp_sys_min_stksize )
8286  value = __kmp_sys_min_stksize ;
8287  else if (value > KMP_MAX_STKSIZE)
8288  value = KMP_MAX_STKSIZE;
8289 
8290  __kmp_stksize = value;
8291 
8292  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8293  }
8294 
8295  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
8296 }
8297 
8298 /* set the behaviour of the runtime library */
8299 /* TODO this can cause some odd behaviour with sibling parallelism... */
8300 void
8301 __kmp_aux_set_library (enum library_type arg)
8302 {
8303  __kmp_library = arg;
8304 
8305  switch ( __kmp_library ) {
8306  case library_serial :
8307  {
8308  KMP_INFORM( LibraryIsSerial );
8309  (void) __kmp_change_library( TRUE );
8310  }
8311  break;
8312  case library_turnaround :
8313  (void) __kmp_change_library( TRUE );
8314  break;
8315  case library_throughput :
8316  (void) __kmp_change_library( FALSE );
8317  break;
8318  default:
8319  KMP_FATAL( UnknownLibraryType, arg );
8320  }
8321 }
8322 
8323 /* ------------------------------------------------------------------------ */
8324 /* ------------------------------------------------------------------------ */
8325 
8326 void
8327 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
8328 {
8329  int blocktime = arg; /* argument is in milliseconds */
8330  int bt_intervals;
8331  int bt_set;
8332 
8333  __kmp_save_internal_controls( thread );
8334 
8335  /* Normalize and set blocktime for the teams */
8336  if (blocktime < KMP_MIN_BLOCKTIME)
8337  blocktime = KMP_MIN_BLOCKTIME;
8338  else if (blocktime > KMP_MAX_BLOCKTIME)
8339  blocktime = KMP_MAX_BLOCKTIME;
8340 
8341  set__blocktime_team( thread -> th.th_team, tid, blocktime );
8342  set__blocktime_team( thread -> th.th_serial_team, 0, blocktime );
8343 
8344  /* Calculate and set blocktime intervals for the teams */
8345  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8346 
8347  set__bt_intervals_team( thread -> th.th_team, tid, bt_intervals );
8348  set__bt_intervals_team( thread -> th.th_serial_team, 0, bt_intervals );
8349 
8350  /* Set whether blocktime has been set to "TRUE" */
8351  bt_set = TRUE;
8352 
8353  set__bt_set_team( thread -> th.th_team, tid, bt_set );
8354  set__bt_set_team( thread -> th.th_serial_team, 0, bt_set );
8355  KF_TRACE(10, ( "kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, bt_intervals=%d, monitor_updates=%d\n",
8356  __kmp_gtid_from_tid(tid, thread->th.th_team),
8357  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups ) );
8358 }
8359 
8360 void
8361 __kmp_aux_set_defaults(
8362  char const * str,
8363  int len
8364 ) {
8365  if ( ! __kmp_init_serial ) {
8366  __kmp_serial_initialize();
8367  };
8368  __kmp_env_initialize( str );
8369 
8370  if (__kmp_settings) {
8371  __kmp_env_print();
8372  }
8373 } // __kmp_aux_set_defaults
8374 
8375 /* ------------------------------------------------------------------------ */
8376 
8377 /*
8378  * internal fast reduction routines
8379  */
8380 
8381 // implementation rev. 0.4
8382 // AT: determine CPU, and always use 'critical method' if non-Intel
8383 // AT: test loc != NULL
8384 // AT: what to return if lck == NULL
8385 // AT: tune the cut-off point for atomic reduce method
8386 // AT: tune what to return depending on the CPU and platform configuration
8387 // AT: tune what to return depending on team size
8388 // AT: move this function out to kmp_csupport.c
8389 PACKED_REDUCTION_METHOD_T
8390 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
8391  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8392  kmp_critical_name *lck )
8393 {
8394 
8395  // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
8396  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
8397  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
8398  // Finally, it's up to OpenMP RTL to make a decision on which method to select among generated by PAROPT.
8399 
8400  PACKED_REDUCTION_METHOD_T retval;
8401 
8402  int team_size;
8403 
8404  KMP_DEBUG_ASSERT( loc ); // it would be nice to test ( loc != 0 )
8405  KMP_DEBUG_ASSERT( lck ); // it would be nice to test ( lck != 0 )
8406 
8407  #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
8408  #define FAST_REDUCTION_TREE_METHOD_GENERATED ( ( reduce_data ) && ( reduce_func ) )
8409 
8410  retval = critical_reduce_block;
8411 
8412  team_size = __kmp_get_team_num_threads( global_tid ); // another choice of getting a team size ( with 1 dynamic deference ) is slower
8413 
8414  if( team_size == 1 ) {
8415 
8416  retval = empty_reduce_block;
8417 
8418  } else {
8419 
8420  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8421  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8422 
8423  #if KMP_ARCH_X86_64
8424 
8425  #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_DARWIN
8426  #if KMP_MIC
8427  #define REDUCTION_TEAMSIZE_CUTOFF 8
8428  #else // KMP_MIC
8429  #define REDUCTION_TEAMSIZE_CUTOFF 4
8430  #endif // KMP_MIC
8431  if( tree_available ) {
8432  if( team_size <= REDUCTION_TEAMSIZE_CUTOFF ) {
8433  if ( atomic_available ) {
8434  retval = atomic_reduce_block;
8435  }
8436  } else {
8437  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8438  }
8439  } else if ( atomic_available ) {
8440  retval = atomic_reduce_block;
8441  }
8442  #else
8443  #error "Unknown or unsupported OS"
8444  #endif // KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_DARWIN
8445 
8446  #elif KMP_ARCH_X86
8447 
8448  #if KMP_OS_LINUX || KMP_OS_WINDOWS
8449 
8450  // similar to win_32
8451  // 4x1x2 fxqlin04, the 'linear,linear' barrier
8452 
8453  // similar to lin_32
8454  // 4x1x2 fxqwin04, the 'linear,linear' barrier
8455 
8456  // actual measurement shows that the critical section method is better if team_size <= 8;
8457  // what happenes when team_size > 8 ? ( no machine to test )
8458 
8459  // TO DO: need to run a 32-bit code on Intel(R) 64
8460  // TO DO: test the 'hyper,hyper,1,1' barrier
8461 
8462  // basic tuning
8463 
8464  if( atomic_available ) {
8465  if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
8466  retval = atomic_reduce_block;
8467  }
8468  } // otherwise: use critical section
8469 
8470  #elif KMP_OS_DARWIN
8471 
8472 
8473  if( atomic_available && ( num_vars <= 3 ) ) {
8474  retval = atomic_reduce_block;
8475  } else if( tree_available ) {
8476  if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
8477  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8478  }
8479  } // otherwise: use critical section
8480 
8481  #else
8482  #error "Unknown or unsupported OS"
8483  #endif
8484 
8485  #else
8486  #error "Unknown or unsupported architecture"
8487  #endif
8488 
8489  }
8490 
8491  //AT: TO DO: critical block method not implemented by PAROPT
8492  //if( retval == __kmp_critical_reduce_block ) {
8493  // if( lck == NULL ) { // critical block method not implemented by PAROPT
8494  // }
8495  //}
8496 
8497  // tune what to return depending on the CPU and platform configuration
8498  // (sometimes tree method is slower than critical)
8499 
8500  // probably tune what to return depending on team size
8501 
8502 
8503  // KMP_FORCE_REDUCTION
8504 
8505  if( __kmp_force_reduction_method != reduction_method_not_defined ) {
8506 
8507  PACKED_REDUCTION_METHOD_T forced_retval;
8508 
8509  int atomic_available, tree_available;
8510 
8511  switch( ( forced_retval = __kmp_force_reduction_method ) )
8512  {
8513  case critical_reduce_block:
8514  KMP_ASSERT( lck ); // lck should be != 0
8515  if( team_size <= 1 ) {
8516  forced_retval = empty_reduce_block;
8517  }
8518  break;
8519 
8520  case atomic_reduce_block:
8521  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8522  KMP_ASSERT( atomic_available ); // atomic_available should be != 0
8523  break;
8524 
8525  case tree_reduce_block:
8526  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8527  KMP_ASSERT( tree_available ); // tree_available should be != 0
8528  #if KMP_FAST_REDUCTION_BARRIER
8529  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8530  #endif
8531  break;
8532 
8533  default:
8534  KMP_ASSERT( 0 ); // "unsupported method specified"
8535  }
8536 
8537  retval = forced_retval;
8538  }
8539 
8540  KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
8541 
8542  #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8543  #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8544 
8545  return ( retval );
8546 }
8547 
8548 // this function is for testing set/get/determine reduce method
8549 kmp_int32
8550 __kmp_get_reduce_method( void ) {
8551  return ( ( __kmp_entry_thread() -> th.th_local.packed_reduction_method ) >> 8 );
8552 }
8553 
8554 /* ------------------------------------------------------------------------ */