Intel® OpenMP* Runtime Library
 All Classes Functions Variables Typedefs Enumerations Enumerator Groups Pages
kmp_affinity.cpp
1 /*
2  * kmp_affinity.cpp -- affinity management
3  * $Revision: 42274 $
4  * $Date: 2013-04-12 15:25:11 -0500 (Fri, 12 Apr 2013) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2013 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "kmp.h"
38 #include "kmp_i18n.h"
39 #include "kmp_io.h"
40 #include "kmp_str.h"
41 
42 
43 #if KMP_OS_WINDOWS || KMP_OS_LINUX
44 
45 //
46 // Print the affinity mask to the character array in a pretty format.
47 //
48 char *
49 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
50 {
51  KMP_ASSERT(buf_len >= 40);
52  char *scan = buf;
53  char *end = buf + buf_len - 1;
54 
55  //
56  // Find first element / check for empty set.
57  //
58  int i;
59  for (i = 0; i < KMP_CPU_SETSIZE; i++) {
60  if (KMP_CPU_ISSET(i, mask)) {
61  break;
62  }
63  }
64  if (i == KMP_CPU_SETSIZE) {
65  sprintf(scan, "{<empty>}");
66  while (*scan != '\0') scan++;
67  KMP_ASSERT(scan <= end);
68  return buf;
69  }
70 
71  sprintf(scan, "{%d", i);
72  while (*scan != '\0') scan++;
73  i++;
74  for (; i < KMP_CPU_SETSIZE; i++) {
75  if (! KMP_CPU_ISSET(i, mask)) {
76  continue;
77  }
78 
79  //
80  // Check for buffer overflow. A string of the form ",<n>" will have
81  // at most 10 characters, plus we want to leave room to print ",...}"
82  // if the set is too large to print for a total of 15 characters.
83  // We already left room for '\0' in setting end.
84  //
85  if (end - scan < 15) {
86  break;
87  }
88  sprintf(scan, ",%-d", i);
89  while (*scan != '\0') scan++;
90  }
91  if (i < KMP_CPU_SETSIZE) {
92  sprintf(scan, ",...");
93  while (*scan != '\0') scan++;
94  }
95  sprintf(scan, "}");
96  while (*scan != '\0') scan++;
97  KMP_ASSERT(scan <= end);
98  return buf;
99 }
100 
101 
102 void
103 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
104 {
105  KMP_CPU_ZERO(mask);
106 
107 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
108 
109  if (__kmp_num_proc_groups > 1) {
110  int group;
111  struct GROUP_AFFINITY ga;
112  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
113  for (group = 0; group < __kmp_num_proc_groups; group++) {
114  int i;
115  int num = __kmp_GetActiveProcessorCount(group);
116  for (i = 0; i < num; i++) {
117  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
118  }
119  }
120  }
121  else
122 
123 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
124 
125  {
126  int proc;
127  for (proc = 0; proc < __kmp_xproc; proc++) {
128  KMP_CPU_SET(proc, mask);
129  }
130  }
131 }
132 
133 
134 //
135 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
136 // functions.
137 //
138 // The icc codegen emits sections with extremely long names, of the form
139 // ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug
140 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
141 // some sort of memory corruption or table overflow that is triggered by
142 // these long strings. I checked the latest version of the linker -
143 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
144 // fixed.
145 //
146 // Unfortunately, my attempts to reproduce it in a smaller example have
147 // failed - I'm not sure what the prospects are of getting it fixed
148 // properly - but we need a reproducer smaller than all of libiomp.
149 //
150 // Work around the problem by avoiding inline constructors in such builds.
151 // We do this for all platforms, not just Linux* OS - non-inline functions are
152 // more debuggable and provide better coverage info than inline functions.
153 // Use inline functions in shipping libs, for performance.
154 //
155 
156 # if !defined(KMP_DEBUG) && !defined(COVER)
157 
158 class Address {
159 public:
160  static const unsigned maxDepth = 32;
161  unsigned labels[maxDepth];
162  unsigned childNums[maxDepth];
163  unsigned depth;
164  unsigned leader;
165  Address(unsigned _depth)
166  : depth(_depth), leader(FALSE) {
167  }
168  Address &operator=(const Address &b) {
169  depth = b.depth;
170  for (unsigned i = 0; i < depth; i++) {
171  labels[i] = b.labels[i];
172  childNums[i] = b.childNums[i];
173  }
174  leader = FALSE;
175  return *this;
176  }
177  bool operator==(const Address &b) const {
178  if (depth != b.depth)
179  return false;
180  for (unsigned i = 0; i < depth; i++)
181  if(labels[i] != b.labels[i])
182  return false;
183  return true;
184  }
185  bool isClose(const Address &b, int level) const {
186  if (depth != b.depth)
187  return false;
188  if (level >= depth)
189  return true;
190  for (unsigned i = 0; i < (depth - level); i++)
191  if(labels[i] != b.labels[i])
192  return false;
193  return true;
194  }
195  bool operator!=(const Address &b) const {
196  return !operator==(b);
197  }
198 };
199 
200 class AddrUnsPair {
201 public:
202  Address first;
203  unsigned second;
204  AddrUnsPair(Address _first, unsigned _second)
205  : first(_first), second(_second) {
206  }
207  AddrUnsPair &operator=(const AddrUnsPair &b)
208  {
209  first = b.first;
210  second = b.second;
211  return *this;
212  }
213 };
214 
215 # else
216 
//
// Position of one entry in the machine topology tree.  labels[] holds the
// label seen at each of the first `depth` levels and childNums[] the
// matching ordinal child numbers.  Member functions are defined out of
// line in this build configuration.
//
class Address {
public:
    static const unsigned maxDepth = 32;

    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;

    // Construction and assignment.
    Address(unsigned _depth);
    Address &operator=(const Address &b);

    // Comparisons (all based on depth and labels[]).
    bool operator==(const Address &b) const;
    bool operator!=(const Address &b) const;
    bool isClose(const Address &b, int level) const;
};
230 
231 Address::Address(unsigned _depth)
232 {
233  depth = _depth;
234  leader = FALSE;
235 }
236 
237 Address &Address::operator=(const Address &b) {
238  depth = b.depth;
239  for (unsigned i = 0; i < depth; i++) {
240  labels[i] = b.labels[i];
241  childNums[i] = b.childNums[i];
242  }
243  leader = FALSE;
244  return *this;
245 }
246 
247 bool Address::operator==(const Address &b) const {
248  if (depth != b.depth)
249  return false;
250  for (unsigned i = 0; i < depth; i++)
251  if(labels[i] != b.labels[i])
252  return false;
253  return true;
254 }
255 
256 bool Address::isClose(const Address &b, int level) const {
257  if (depth != b.depth)
258  return false;
259  if (level >= depth)
260  return true;
261  for (unsigned i = 0; i < (depth - level); i++)
262  if(labels[i] != b.labels[i])
263  return false;
264  return true;
265 }
266 
267 bool Address::operator!=(const Address &b) const {
268  return !operator==(b);
269 }
270 
271 class AddrUnsPair {
272 public:
273  Address first;
274  unsigned second;
275  AddrUnsPair(Address _first, unsigned _second);
276  AddrUnsPair &operator=(const AddrUnsPair &b);
277 };
278 
279 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
280  : first(_first), second(_second)
281 {
282 }
283 
284 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
285 {
286  first = b.first;
287  second = b.second;
288  return *this;
289 }
290 
291 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
292 
293 
294 static int
295 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
296 {
297  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
298  ->first);
299  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
300  ->first);
301  unsigned depth = aa->depth;
302  unsigned i;
303  KMP_DEBUG_ASSERT(depth == bb->depth);
304  for (i = 0; i < depth; i++) {
305  if (aa->labels[i] < bb->labels[i]) return -1;
306  if (aa->labels[i] > bb->labels[i]) return 1;
307  }
308  return 0;
309 }
310 
311 
312 static int
313 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
314 {
315  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
316  ->first);
317  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
318  ->first);
319  unsigned depth = aa->depth;
320  unsigned i;
321  KMP_DEBUG_ASSERT(depth == bb->depth);
322  KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
323  for (i = 0; i < __kmp_affinity_compact; i++) {
324  int j = depth - i - 1;
325  if (aa->childNums[j] < bb->childNums[j]) return -1;
326  if (aa->childNums[j] > bb->childNums[j]) return 1;
327  }
328  for (; i < depth; i++) {
329  int j = i - __kmp_affinity_compact;
330  if (aa->childNums[j] < bb->childNums[j]) return -1;
331  if (aa->childNums[j] > bb->childNums[j]) return 1;
332  }
333  return 0;
334 }
335 
336 
337 //
338 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
339 // called to renumber the labels from [0..n] and place them into the child_num
340 // vector of the address object. This is done in case the labels used for
341 // the children at one node of the heirarchy differ from those used for
342 // another node at the same level. Example: suppose the machine has 2 nodes
343 // with 2 packages each. The first node contains packages 601 and 602, and
344 // second node contains packages 603 and 604. If we try to sort the table
345 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
346 // because we are paying attention to the labels themselves, not the ordinal
347 // child numbers. By using the child numbers in the sort, the result is
348 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
349 //
350 static void
351 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
352  int numAddrs)
353 {
354  KMP_DEBUG_ASSERT(numAddrs > 0);
355  int depth = address2os->first.depth;
356  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
357  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
358  * sizeof(unsigned));
359  int labCt;
360  for (labCt = 0; labCt < depth; labCt++) {
361  address2os[0].first.childNums[labCt] = counts[labCt] = 0;
362  lastLabel[labCt] = address2os[0].first.labels[labCt];
363  }
364  int i;
365  for (i = 1; i < numAddrs; i++) {
366  for (labCt = 0; labCt < depth; labCt++) {
367  if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
368  int labCt2;
369  for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
370  counts[labCt2] = 0;
371  lastLabel[labCt2] = address2os[i].first.labels[labCt2];
372  }
373  counts[labCt]++;
374  lastLabel[labCt] = address2os[i].first.labels[labCt];
375  break;
376  }
377  }
378  for (labCt = 0; labCt < depth; labCt++) {
379  address2os[i].first.childNums[labCt] = counts[labCt];
380  }
381  for (; labCt < Address::maxDepth; labCt++) {
382  address2os[i].first.childNums[labCt] = 0;
383  }
384  }
385 }
386 
387 
388 //
389 // All of the __kmp_affinity_create_*_map() routines should set
390 // __kmp_affinity_masks to a vector of affinity mask objects of length
391 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
392 // return the number of levels in the machine topology tree (zero if
393 // __kmp_affinity_type == affinity_none).
394 //
395 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
396 // to the affinity mask for the initialization thread. They need to save and
397 // restore the mask, and it could be needed later, so saving it is just an
398 // optimization to avoid calling kmp_get_system_affinity() again.
399 //
400 static kmp_affin_mask_t *fullMask = NULL;
401 
402 kmp_affin_mask_t *
403 __kmp_affinity_get_fullMask() { return fullMask; }
404 
405 
406 static int nCoresPerPkg, nPackages;
407 int __kmp_nThreadsPerCore;
408 
409 //
410 // __kmp_affinity_uniform_topology() doesn't work when called from
411 // places which support arbitrarily many levels in the machine topology
412 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
413 // __kmp_affinity_create_x2apicid_map().
414 //
415 inline static bool
416 __kmp_affinity_uniform_topology()
417 {
418  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
419 }
420 
421 
422 //
423 // Print out the detailed machine topology map, i.e. the physical locations
424 // of each OS proc.
425 //
426 static void
427 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
428  int pkgLevel, int coreLevel, int threadLevel)
429 {
430  int proc;
431 
432  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
433  for (proc = 0; proc < len; proc++) {
434  int level;
435  kmp_str_buf_t buf;
436  __kmp_str_buf_init(&buf);
437  for (level = 0; level < depth; level++) {
438  if (level == threadLevel) {
439  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
440  }
441  else if (level == coreLevel) {
442  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
443  }
444  else if (level == pkgLevel) {
445  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
446  }
447  else if (level > pkgLevel) {
448  __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
449  level - pkgLevel - 1);
450  }
451  else {
452  __kmp_str_buf_print(&buf, "L%d ", level);
453  }
454  __kmp_str_buf_print(&buf, "%d ",
455  address2os[proc].first.labels[level]);
456  }
457  KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
458  buf.str);
459  __kmp_str_buf_free(&buf);
460  }
461 }
462 
463 
464 //
465 // If we don't know how to retrieve the machine's processor topology, or
466 // encounter an error in doing so, this routine is called to form a "flat"
467 // mapping of os thread id's <-> processor id's.
468 //
469 static int
470 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
471  kmp_i18n_id_t *const msg_id)
472 {
473  *address2os = NULL;
474  *msg_id = kmp_i18n_null;
475 
476  //
477  // Even if __kmp_affinity_type == affinity_none, this routine might still
478  // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
479  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
480  //
481  if (! KMP_AFFINITY_CAPABLE()) {
482  KMP_ASSERT(__kmp_affinity_type == affinity_none);
483  __kmp_ncores = nPackages = __kmp_xproc;
484  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
485  __kmp_ht_enabled = FALSE;
486  if (__kmp_affinity_verbose) {
487  KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
488  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
489  KMP_INFORM(Uniform, "KMP_AFFINITY");
490  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
491  __kmp_nThreadsPerCore, __kmp_ncores);
492  }
493  return 0;
494  }
495 
496  //
497  // When affinity is off, this routine will still be called to set
498  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
499  // nCoresPerPkg, & nPackages. Make sure all these vars are set
500  // correctly, and return now if affinity is not enabled.
501  //
502  __kmp_ncores = nPackages = __kmp_avail_proc;
503  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
504  __kmp_ht_enabled = FALSE;
505  if (__kmp_affinity_verbose) {
506  char buf[KMP_AFFIN_MASK_PRINT_LEN];
507  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
508 
509  KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
510  if (__kmp_affinity_respect_mask) {
511  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
512  } else {
513  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
514  }
515  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
516  KMP_INFORM(Uniform, "KMP_AFFINITY");
517  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
518  __kmp_nThreadsPerCore, __kmp_ncores);
519  }
520  if (__kmp_affinity_type == affinity_none) {
521  return 0;
522  }
523 
524  //
525  // Contruct the data structure to be returned.
526  //
527  *address2os = (AddrUnsPair*)
528  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
529  int avail_ct = 0;
530  int i;
531  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
532  //
533  // Skip this proc if it is not included in the machine model.
534  //
535  if (! KMP_CPU_ISSET(i, fullMask)) {
536  continue;
537  }
538 
539  Address addr(1);
540  addr.labels[0] = i;
541  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
542  }
543  if (__kmp_affinity_verbose) {
544  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
545  }
546 
547  if (__kmp_affinity_gran_levels < 0) {
548  //
549  // Only the package level is modeled in the machine topology map,
550  // so the #levels of granularity is either 0 or 1.
551  //
552  if (__kmp_affinity_gran > affinity_gran_package) {
553  __kmp_affinity_gran_levels = 1;
554  }
555  else {
556  __kmp_affinity_gran_levels = 0;
557  }
558  }
559  return 1;
560 }
561 
562 
563 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
564 
565 //
566 // If multiple Windows* OS processor groups exist, we can create a 2-level
567 // topology map with the groups at level 0 and the individual procs at
568 // level 1.
569 //
570 // This facilitates letting the threads float among all procs in a group,
571 // if granularity=group (the default when there are multiple groups).
572 //
573 static int
574 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
575  kmp_i18n_id_t *const msg_id)
576 {
577  *address2os = NULL;
578  *msg_id = kmp_i18n_null;
579 
580  //
581  // If we don't have multiple processor groups, return now.
582  // The flat mapping will be used.
583  //
584  if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
585  // FIXME set *msg_id
586  return -1;
587  }
588 
589  //
590  // Contruct the data structure to be returned.
591  //
592  *address2os = (AddrUnsPair*)
593  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
594  int avail_ct = 0;
595  int i;
596  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
597  //
598  // Skip this proc if it is not included in the machine model.
599  //
600  if (! KMP_CPU_ISSET(i, fullMask)) {
601  continue;
602  }
603 
604  Address addr(2);
605  addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
606  addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
607  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
608 
609  if (__kmp_affinity_verbose) {
610  KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
611  addr.labels[1]);
612  }
613  }
614 
615  if (__kmp_affinity_gran_levels < 0) {
616  if (__kmp_affinity_gran == affinity_gran_group) {
617  __kmp_affinity_gran_levels = 1;
618  }
619  else if ((__kmp_affinity_gran == affinity_gran_fine)
620  || (__kmp_affinity_gran == affinity_gran_thread)) {
621  __kmp_affinity_gran_levels = 0;
622  }
623  else {
624  const char *gran_str = NULL;
625  if (__kmp_affinity_gran == affinity_gran_core) {
626  gran_str = "core";
627  }
628  else if (__kmp_affinity_gran == affinity_gran_package) {
629  gran_str = "package";
630  }
631  else if (__kmp_affinity_gran == affinity_gran_node) {
632  gran_str = "node";
633  }
634  else {
635  KMP_ASSERT(0);
636  }
637 
638  // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
639  __kmp_affinity_gran_levels = 0;
640  }
641  }
642  return 2;
643 }
644 
645 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
646 
647 
648 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
649 
//
// Return the number of bits needed to encode `count` distinct values,
// i.e. the smallest r such that 2^r >= count.  Any count <= 1 yields 0.
//
// The result is computed as the bit length of (count - 1) in unsigned
// arithmetic, so no value is ever shifted past the width of int; the
// routine terminates for every count, including values above 2^30.
//
static int
__kmp_cpuid_mask_width(int count) {
    if (count <= 1) {
        return 0;
    }

    unsigned remaining = (unsigned)count - 1u;
    int r = 0;
    while (remaining != 0) {
        remaining >>= 1;
        ++r;
    }
    return r;
}
658 
659 
//
// Per-OS-proc record used while building the Apic Id based topology map.
//
class apicThreadInfo {
public:
    // Gathered for each OS proc.
    unsigned osId;              // argument given to __kmp_affinity_bind_thread
    unsigned apicId;            // read from cpuid after binding
    unsigned maxCoresPerPkg;    // read from cpuid after binding
    unsigned maxThreadsPerPkg;  // read from cpuid after binding

    // Inferred from the values above.
    unsigned pkgId;
    unsigned coreId;
    unsigned threadId;
};
670 
671 
672 static int
673 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
674 {
675  const apicThreadInfo *aa = (const apicThreadInfo *)a;
676  const apicThreadInfo *bb = (const apicThreadInfo *)b;
677  if (aa->osId < bb->osId) return -1;
678  if (aa->osId > bb->osId) return 1;
679  return 0;
680 }
681 
682 
683 static int
684 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
685 {
686  const apicThreadInfo *aa = (const apicThreadInfo *)a;
687  const apicThreadInfo *bb = (const apicThreadInfo *)b;
688  if (aa->pkgId < bb->pkgId) return -1;
689  if (aa->pkgId > bb->pkgId) return 1;
690  if (aa->coreId < bb->coreId) return -1;
691  if (aa->coreId > bb->coreId) return 1;
692  if (aa->threadId < bb->threadId) return -1;
693  if (aa->threadId > bb->threadId) return 1;
694  return 0;
695 }
696 
697 
698 //
699 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
700 // an algorithm which cycles through the available os threads, setting
701 // the current thread's affinity mask to that thread, and then retrieves
702 // the Apic Id for each thread context using the cpuid instruction.
703 //
704 static int
705 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
706  kmp_i18n_id_t *const msg_id)
707 {
708  int rc;
709  *address2os = NULL;
710  *msg_id = kmp_i18n_null;
711 
712 # if KMP_MIC
713  {
714  // The code below will use cpuid(4).
715  // Check if cpuid(4) is supported.
716  // FIXME? - this really doesn't need to be specific to MIC.
717  kmp_cpuid buf;
718  __kmp_x86_cpuid(0, 0, &buf);
719  if (buf.eax < 4) {
720  *msg_id = kmp_i18n_str_NoLeaf4Support;
721  return -1;
722  }
723  }
724 # endif // KMP_MIC
725 
726  //
727  // Even if __kmp_affinity_type == affinity_none, this routine is still
728  // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
729  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
730  //
731  // The algorithm used starts by setting the affinity to each available
732  // thread and retrieving info from the cpuid instruction, so if we are not
733  // capable of calling __kmp_get_system_affinity()/__kmp_set_system_affinity(),
734  // then we need to do something else.
735  //
736  if (! KMP_AFFINITY_CAPABLE()) {
737  //
738  // Hack to try and infer the machine topology using only the data
739  // available from cpuid on the current thread, and __kmp_xproc.
740  //
741  KMP_ASSERT(__kmp_affinity_type == affinity_none);
742 
743  //
744  // Get an upper bound on the number of threads per package using
745  // cpuid(1).
746  //
747  // On some OS/chip combinations where HT is supported by the chip
748  // but is disabled, this value will be 2 on a single core chip.
749  // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
750  //
751  kmp_cpuid buf;
752  __kmp_x86_cpuid(1, 0, &buf);
753  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
754  if (maxThreadsPerPkg == 0) {
755  maxThreadsPerPkg = 1;
756  }
757 
758  //
759  // The num cores per pkg comes from cpuid(4).
760  // 1 must be added to the encoded value.
761  //
762  // The author of cpu_count.cpp treated this only an upper bound
763  // on the number of cores, but I haven't seen any cases where it
764  // was greater than the actual number of cores, so we will treat
765  // it as exact in this block of code.
766  //
767  // First, we need to check if cpuid(4) is supported on this chip.
768  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
769  // has the value n or greater.
770  //
771  __kmp_x86_cpuid(0, 0, &buf);
772  if (buf.eax >= 4) {
773  __kmp_x86_cpuid(4, 0, &buf);
774  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
775  }
776  else {
777  nCoresPerPkg = 1;
778  }
779 
780  //
781  // There is no way to reliably tell if HT is enabled without issuing
782  // the cpuid instruction from every thread, and correlating the cpuid
783  // info, so if the machine is not affinity capable, we assume that HT
784  // is off. We have seen quite a few machines where maxThreadsPerPkg
785  // is 2, yet the machine does not support HT.
786  //
787  // - Older OSes are usually found on machines with older chips, which
788  // do not support HT.
789  //
790  // - The performance penalty for mistakenly identifying a machine as
791  // HT when it isn't (which results in blocktime being incorrectly set
792  // to 0) is greater than the penalty for mistakenly identifying
793  // a machine as being 1 thread/core when it is really HT enabled
794  // (which results in blocktime being incorrectly set to a positive
795  // value).
796  //
797  __kmp_ncores = __kmp_xproc;
798  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
799  __kmp_nThreadsPerCore = 1;
800  __kmp_ht_enabled = FALSE;
801  if (__kmp_affinity_verbose) {
802  KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
803  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
804  if (__kmp_affinity_uniform_topology()) {
805  KMP_INFORM(Uniform, "KMP_AFFINITY");
806  } else {
807  KMP_INFORM(NonUniform, "KMP_AFFINITY");
808  }
809  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
810  __kmp_nThreadsPerCore, __kmp_ncores);
811  }
812  return 0;
813  }
814 
815  //
816  //
817  // From here on, we can assume that it is safe to call
818  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
819  // even if __kmp_affinity_type = affinity_none.
820  //
821 
822  //
823  // Save the affinity mask for the current thread.
824  //
825  kmp_affin_mask_t *oldMask;
826  KMP_CPU_ALLOC(oldMask);
827  KMP_ASSERT(oldMask != NULL);
828  __kmp_get_system_affinity(oldMask, TRUE);
829 
830  //
831  // Run through each of the available contexts, binding the current thread
832  // to it, and obtaining the pertinent information using the cpuid instr.
833  //
834  // The relevant information is:
835  //
836  // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
837  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
838  //
839  // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
840  // value of this field determines the width of the core# + thread#
841  // fields in the Apic Id. It is also an upper bound on the number
842  // of threads per package, but it has been verified that situations
843  // happen where it is not exact. In particular, on certain OS/chip
844  // combinations where Intel(R) Hyper-Threading Technology is supported
845  // by the chip but has
846  // been disabled, the value of this field will be 2 (for a single core
847  // chip). On other OS/chip combinations supporting
848  // Intel(R) Hyper-Threading Technology, the value of
849  // this field will be 1 when Intel(R) Hyper-Threading Technology is
850  // disabled and 2 when it is enabled.
851  //
852  // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
853  // value of this field (+1) determines the width of the core# field in
854  // the Apic Id. The comments in "cpucount.cpp" say that this value is
855  // an upper bound, but the IA-32 architecture manual says that it is
856  // exactly the number of cores per package, and I haven't seen any
857  // case where it wasn't.
858  //
859  // From this information, deduce the package Id, core Id, and thread Id,
860  // and set the corresponding fields in the apicThreadInfo struct.
861  //
862  unsigned i;
863  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
864  __kmp_avail_proc * sizeof(apicThreadInfo));
865  unsigned nApics = 0;
866  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
867  //
868  // Skip this proc if it is not included in the machine model.
869  //
870  if (! KMP_CPU_ISSET(i, fullMask)) {
871  continue;
872  }
873  KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
874 
875  __kmp_affinity_bind_thread(i);
876  threadInfo[nApics].osId = i;
877 
878  //
879  // The apic id and max threads per pkg come from cpuid(1).
880  //
881  kmp_cpuid buf;
882  __kmp_x86_cpuid(1, 0, &buf);
883  if (! (buf.edx >> 9) & 1) {
884  __kmp_set_system_affinity(oldMask, TRUE);
885  __kmp_free(threadInfo);
886  KMP_CPU_FREE(oldMask);
887  *msg_id = kmp_i18n_str_ApicNotPresent;
888  return -1;
889  }
890  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
891  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
892  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
893  threadInfo[nApics].maxThreadsPerPkg = 1;
894  }
895 
896  //
897  // Max cores per pkg comes from cpuid(4).
898  // 1 must be added to the encoded value.
899  //
900  // First, we need to check if cpuid(4) is supported on this chip.
901  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
902  // has the value n or greater.
903  //
904  __kmp_x86_cpuid(0, 0, &buf);
905  if (buf.eax >= 4) {
906  __kmp_x86_cpuid(4, 0, &buf);
907  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
908  }
909  else {
910  threadInfo[nApics].maxCoresPerPkg = 1;
911  }
912 
913  //
914  // Infer the pkgId / coreId / threadId using only the info
915  // obtained locally.
916  //
917  int widthCT = __kmp_cpuid_mask_width(
918  threadInfo[nApics].maxThreadsPerPkg);
919  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
920 
921  int widthC = __kmp_cpuid_mask_width(
922  threadInfo[nApics].maxCoresPerPkg);
923  int widthT = widthCT - widthC;
924  if (widthT < 0) {
925  //
926  // I've never seen this one happen, but I suppose it could, if
927  // the cpuid instruction on a chip was really screwed up.
928  // Make sure to restore the affinity mask before the tail call.
929  //
930  __kmp_set_system_affinity(oldMask, TRUE);
931  __kmp_free(threadInfo);
932  KMP_CPU_FREE(oldMask);
933  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
934  return -1;
935  }
936 
937  int maskC = (1 << widthC) - 1;
938  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
939  &maskC;
940 
941  int maskT = (1 << widthT) - 1;
942  threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
943 
944  nApics++;
945  }
946 
947  //
948  // We've collected all the info we need.
949  // Restore the old affinity mask for this thread.
950  //
951  __kmp_set_system_affinity(oldMask, TRUE);
952 
953  //
954  // If there's only one thread context to bind to, form an Address object
955  // with depth 1 and return immediately (or, if affinity is off, set
956  // address2os to NULL and return).
957  //
958  // If it is configured to omit the package level when there is only a
959  // single package, the logic at the end of this routine won't work if
960  // there is only a single thread - it would try to form an Address
961  // object with depth 0.
962  //
963  KMP_ASSERT(nApics > 0);
964  if (nApics == 1) {
965  __kmp_ncores = nPackages = 1;
966  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
967  __kmp_ht_enabled = FALSE;
968  if (__kmp_affinity_verbose) {
969  char buf[KMP_AFFIN_MASK_PRINT_LEN];
970  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
971 
972  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
973  if (__kmp_affinity_respect_mask) {
974  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
975  } else {
976  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
977  }
978  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
979  KMP_INFORM(Uniform, "KMP_AFFINITY");
980  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
981  __kmp_nThreadsPerCore, __kmp_ncores);
982  }
983 
984  if (__kmp_affinity_type == affinity_none) {
985  __kmp_free(threadInfo);
986  KMP_CPU_FREE(oldMask);
987  return 0;
988  }
989 
990  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
991  Address addr(1);
992  addr.labels[0] = threadInfo[0].pkgId;
993  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
994 
995  if (__kmp_affinity_gran_levels < 0) {
996  __kmp_affinity_gran_levels = 0;
997  }
998 
999  if (__kmp_affinity_verbose) {
1000  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1001  }
1002 
1003  __kmp_free(threadInfo);
1004  KMP_CPU_FREE(oldMask);
1005  return 1;
1006  }
1007 
1008  //
1009  // Sort the threadInfo table by physical Id.
1010  //
1011  qsort(threadInfo, nApics, sizeof(*threadInfo),
1012  __kmp_affinity_cmp_apicThreadInfo_phys_id);
1013 
1014  //
1015  // The table is now sorted by pkgId / coreId / threadId, but we really
1016  // don't know the radix of any of the fields. pkgId's may be sparsely
1017  // assigned among the chips on a system. Although coreId's are usually
1018  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1019  // [0..threadsPerCore-1], we don't want to make any such assumptions.
1020  //
1021  // For that matter, we don't know what coresPerPkg and threadsPerCore
1022  // (or the total # packages) are at this point - we want to determine
1023  // that now. We only have an upper bound on the first two figures.
1024  //
1025  // We also perform a consistency check at this point: the values returned
1026  // by the cpuid instruction for any thread bound to a given package had
1027  // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1028  //
1029  nPackages = 1;
1030  nCoresPerPkg = 1;
1031  __kmp_nThreadsPerCore = 1;
1032  unsigned nCores = 1;
1033 
1034  unsigned pkgCt = 1; // to determine radii
1035  unsigned lastPkgId = threadInfo[0].pkgId;
1036  unsigned coreCt = 1;
1037  unsigned lastCoreId = threadInfo[0].coreId;
1038  unsigned threadCt = 1;
1039  unsigned lastThreadId = threadInfo[0].threadId;
1040 
1041  // intra-pkg consist checks
1042  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1043  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1044 
1045  for (i = 1; i < nApics; i++) {
1046  if (threadInfo[i].pkgId != lastPkgId) {
1047  nCores++;
1048  pkgCt++;
1049  lastPkgId = threadInfo[i].pkgId;
1050  if (coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1051  coreCt = 1;
1052  lastCoreId = threadInfo[i].coreId;
1053  if (threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1054  threadCt = 1;
1055  lastThreadId = threadInfo[i].threadId;
1056 
1057  //
1058  // This is a different package, so go on to the next iteration
1059  // without doing any consistency checks. Reset the consistency
1060  // check vars, though.
1061  //
1062  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1063  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1064  continue;
1065  }
1066 
1067  if (threadInfo[i].coreId != lastCoreId) {
1068  nCores++;
1069  coreCt++;
1070  lastCoreId = threadInfo[i].coreId;
1071  if (threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1072  threadCt = 1;
1073  lastThreadId = threadInfo[i].threadId;
1074  }
1075  else if (threadInfo[i].threadId != lastThreadId) {
1076  threadCt++;
1077  lastThreadId = threadInfo[i].threadId;
1078  }
1079  else {
1080  __kmp_free(threadInfo);
1081  KMP_CPU_FREE(oldMask);
1082  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1083  return -1;
1084  }
1085 
1086  //
1087  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1088  // fields agree between all the threads bounds to a given package.
1089  //
1090  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1091  || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1092  __kmp_free(threadInfo);
1093  KMP_CPU_FREE(oldMask);
1094  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1095  return -1;
1096  }
1097  }
1098  nPackages = pkgCt;
1099  if (coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1100  if (threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1101 
1102  //
1103  // When affinity is off, this routine will still be called to set
1104  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1105  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1106  // correctly, and return now if affinity is not enabled.
1107  //
1108  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1109  __kmp_ncores = nCores;
1110  if (__kmp_affinity_verbose) {
1111  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1112  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1113 
1114  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1115  if (__kmp_affinity_respect_mask) {
1116  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1117  } else {
1118  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1119  }
1120  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1121  if (__kmp_affinity_uniform_topology()) {
1122  KMP_INFORM(Uniform, "KMP_AFFINITY");
1123  } else {
1124  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1125  }
1126  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1127  __kmp_nThreadsPerCore, __kmp_ncores);
1128 
1129  }
1130 
1131  if (__kmp_affinity_type == affinity_none) {
1132  __kmp_free(threadInfo);
1133  KMP_CPU_FREE(oldMask);
1134  return 0;
1135  }
1136 
1137  //
1138  // Now that we've determined the number of packages, the number of cores
1139  // per package, and the number of threads per core, we can construct the
1140  // data structure that is to be returned.
1141  //
1142  int pkgLevel = 0;
1143  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1144  int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1145  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1146 
1147  KMP_ASSERT(depth > 0);
1148  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1149 
1150  for (i = 0; i < nApics; ++i) {
1151  Address addr(depth);
1152  unsigned os = threadInfo[i].osId;
1153  int d = 0;
1154 
1155  if (pkgLevel >= 0) {
1156  addr.labels[d++] = threadInfo[i].pkgId;
1157  }
1158  if (coreLevel >= 0) {
1159  addr.labels[d++] = threadInfo[i].coreId;
1160  }
1161  if (threadLevel >= 0) {
1162  addr.labels[d++] = threadInfo[i].threadId;
1163  }
1164  (*address2os)[i] = AddrUnsPair(addr, os);
1165  }
1166 
1167  if (__kmp_affinity_gran_levels < 0) {
1168  //
1169  // Set the granularity level based on what levels are modeled
1170  // in the machine topology map.
1171  //
1172  __kmp_affinity_gran_levels = 0;
1173  if ((threadLevel >= 0)
1174  && (__kmp_affinity_gran > affinity_gran_thread)) {
1175  __kmp_affinity_gran_levels++;
1176  }
1177  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1178  __kmp_affinity_gran_levels++;
1179  }
1180  if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1181  __kmp_affinity_gran_levels++;
1182  }
1183  }
1184 
1185  if (__kmp_affinity_verbose) {
1186  __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1187  coreLevel, threadLevel);
1188  }
1189 
1190  __kmp_free(threadInfo);
1191  KMP_CPU_FREE(oldMask);
1192  return depth;
1193 }
1194 
1195 
1196 //
1197 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1198 // architectures support a newer interface for specifying the x2APIC Ids,
1199 // based on cpuid leaf 11.
1200 //
1201 static int
1202 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1203  kmp_i18n_id_t *const msg_id)
1204 {
1205  kmp_cpuid buf;
1206 
1207  *address2os = NULL;
1208  *msg_id = kmp_i18n_null;
1209 
1210  //
1211  // Check to see if cpuid leaf 11 is supported.
1212  //
1213  __kmp_x86_cpuid(0, 0, &buf);
1214  if (buf.eax < 11) {
1215  *msg_id = kmp_i18n_str_NoLeaf11Support;
1216  return -1;
1217  }
1218  __kmp_x86_cpuid(11, 0, &buf);
1219  if (buf.ebx == 0) {
1220  *msg_id = kmp_i18n_str_NoLeaf11Support;
1221  return -1;
1222  }
1223 
1224  //
1225  // Find the number of levels in the machine topology. While we're at it,
1226  // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
1227  // try to get more accurate values later by explicitly counting them,
1228  // but get reasonable defaults now, in case we return early.
1229  //
1230  int level;
1231  int threadLevel = -1;
1232  int coreLevel = -1;
1233  int pkgLevel = -1;
1234  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1235 
1236  for (level = 0;; level++) {
1237  if (level > 31) {
1238  //
1239  // FIXME: Hack for DPD200163180
1240  //
1241  // If level is big then something went wrong -> exiting
1242  //
1243  // There could actually be 32 valid levels in the machine topology,
1244  // but so far, the only machine we have seen which does not exit
1245  // this loop before iteration 32 has fubar x2APIC settings.
1246  //
1247  // For now, just reject this case based upon loop trip count.
1248  //
1249  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1250  return -1;
1251  }
1252  __kmp_x86_cpuid(11, level, &buf);
1253  if (buf.ebx == 0) {
1254  if (pkgLevel < 0) {
1255  //
1256  // Will infer nPackages from __kmp_xproc
1257  //
1258  pkgLevel = level;
1259  level++;
1260  }
1261  break;
1262  }
1263  int kind = (buf.ecx >> 8) & 0xff;
1264  if (kind == 1) {
1265  //
1266  // SMT level
1267  //
1268  threadLevel = level;
1269  coreLevel = -1;
1270  pkgLevel = -1;
1271  __kmp_nThreadsPerCore = buf.ebx & 0xff;
1272  if (__kmp_nThreadsPerCore == 0) {
1273  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1274  return -1;
1275  }
1276  }
1277  else if (kind == 2) {
1278  //
1279  // core level
1280  //
1281  coreLevel = level;
1282  pkgLevel = -1;
1283  nCoresPerPkg = buf.ebx & 0xff;
1284  if (nCoresPerPkg == 0) {
1285  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1286  return -1;
1287  }
1288  }
1289  else {
1290  if (level <= 0) {
1291  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1292  return -1;
1293  }
1294  if (pkgLevel >= 0) {
1295  continue;
1296  }
1297  pkgLevel = level;
1298  nPackages = buf.ebx & 0xff;
1299  if (nPackages == 0) {
1300  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1301  return -1;
1302  }
1303  }
1304  }
1305  int depth = level;
1306 
1307  //
1308  // In the above loop, "level" was counted from the finest level (usually
1309  // thread) to the coarsest. The caller expects that we will place the
1310  // labels in (*address2os)[].first.labels[] in the inverse order, so
1311  // we need to invert the vars saying which level means what.
1312  //
1313  if (threadLevel >= 0) {
1314  threadLevel = depth - threadLevel - 1;
1315  }
1316  if (coreLevel >= 0) {
1317  coreLevel = depth - coreLevel - 1;
1318  }
1319  KMP_DEBUG_ASSERT(pkgLevel >= 0);
1320  pkgLevel = depth - pkgLevel - 1;
1321 
1322  //
1323  // The algorithm used starts by setting the affinity to each available
1324  // thread and retrieving info from the cpuid instruction, so if we are not
1325  // capable of calling __kmp_affinity_get_map()/__kmp_affinity_get_map(),
1326  // then we need to do something else - use the defaults that we calculated
1327  // from issuing cpuid without binding to each proc.
1328  //
1329  if (! KMP_AFFINITY_CAPABLE())
1330  {
1331  //
1332  // Hack to try and infer the machine topology using only the data
1333  // available from cpuid on the current thread, and __kmp_xproc.
1334  //
1335  KMP_ASSERT(__kmp_affinity_type == affinity_none);
1336 
1337  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1338  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1339  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1340  if (__kmp_affinity_verbose) {
1341  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1342  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1343  if (__kmp_affinity_uniform_topology()) {
1344  KMP_INFORM(Uniform, "KMP_AFFINITY");
1345  } else {
1346  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1347  }
1348  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1349  __kmp_nThreadsPerCore, __kmp_ncores);
1350  }
1351  return 0;
1352  }
1353 
1354  //
1355  //
1356  // From here on, we can assume that it is safe to call
1357  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1358  // even if __kmp_affinity_type = affinity_none.
1359  //
1360 
1361  //
1362  // Save the affinity mask for the current thread.
1363  //
1364  kmp_affin_mask_t *oldMask;
1365  KMP_CPU_ALLOC(oldMask);
1366  __kmp_get_system_affinity(oldMask, TRUE);
1367 
1368  //
1369  // Allocate the data structure to be returned.
1370  //
1371  AddrUnsPair *retval = (AddrUnsPair *)
1372  __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1373 
1374  //
1375  // Run through each of the available contexts, binding the current thread
1376  // to it, and obtaining the pertinent information using the cpuid instr.
1377  //
1378  int proc;
1379  int nApics = 0;
1380  for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1381  //
1382  // Skip this proc if it is not included in the machine model.
1383  //
1384  if (! KMP_CPU_ISSET(proc, fullMask)) {
1385  continue;
1386  }
1387  KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1388 
1389  __kmp_affinity_bind_thread(proc);
1390 
1391  //
1392  // Extrach the labels for each level in the machine topology map
1393  // from the Apic ID.
1394  //
1395  Address addr(depth);
1396  int prev_shift = 0;
1397 
1398  for (level = 0; level < depth; level++) {
1399  __kmp_x86_cpuid(11, level, &buf);
1400  unsigned apicId = buf.edx;
1401  if (buf.ebx == 0) {
1402  if (level != depth - 1) {
1403  KMP_CPU_FREE(oldMask);
1404  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1405  return -1;
1406  }
1407  addr.labels[depth - level - 1] = apicId >> prev_shift;
1408  level++;
1409  break;
1410  }
1411  int shift = buf.eax & 0x1f;
1412  int mask = (1 << shift) - 1;
1413  addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1414  prev_shift = shift;
1415  }
1416  if (level != depth) {
1417  KMP_CPU_FREE(oldMask);
1418  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1419  return -1;
1420  }
1421 
1422  retval[nApics] = AddrUnsPair(addr, proc);
1423  nApics++;
1424  }
1425 
1426  //
1427  // We've collected all the info we need.
1428  // Restore the old affinity mask for this thread.
1429  //
1430  __kmp_set_system_affinity(oldMask, TRUE);
1431 
1432  //
1433  // If there's only one thread context to bind to, return now.
1434  //
1435  KMP_ASSERT(nApics > 0);
1436  if (nApics == 1) {
1437  __kmp_ncores = nPackages = 1;
1438  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1439  __kmp_ht_enabled = FALSE;
1440  if (__kmp_affinity_verbose) {
1441  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1442  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1443 
1444  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1445  if (__kmp_affinity_respect_mask) {
1446  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1447  } else {
1448  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1449  }
1450  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1451  KMP_INFORM(Uniform, "KMP_AFFINITY");
1452  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1453  __kmp_nThreadsPerCore, __kmp_ncores);
1454  }
1455 
1456  if (__kmp_affinity_type == affinity_none) {
1457  __kmp_free(retval);
1458  KMP_CPU_FREE(oldMask);
1459  return 0;
1460  }
1461 
1462  //
1463  // Form an Address object which only includes the package level.
1464  //
1465  Address addr(1);
1466  addr.labels[0] = retval[0].first.labels[pkgLevel];
1467  retval[0].first = addr;
1468 
1469  if (__kmp_affinity_gran_levels < 0) {
1470  __kmp_affinity_gran_levels = 0;
1471  }
1472 
1473  if (__kmp_affinity_verbose) {
1474  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1475  }
1476 
1477  *address2os = retval;
1478  KMP_CPU_FREE(oldMask);
1479  return 1;
1480  }
1481 
1482  //
1483  // Sort the table by physical Id.
1484  //
1485  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1486 
1487  //
1488  // Find the radix at each of the levels.
1489  //
1490  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1491  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1492  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1493  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1494  for (level = 0; level < depth; level++) {
1495  totals[level] = 1;
1496  maxCt[level] = 1;
1497  counts[level] = 1;
1498  last[level] = retval[0].first.labels[level];
1499  }
1500 
1501  //
1502  // From here on, the iteration variable "level" runs from the finest
1503  // level to the coarsest, i.e. we iterate forward through
1504  // (*address2os)[].first.labels[] - in the previous loops, we iterated
1505  // backwards.
1506  //
1507  for (proc = 1; proc < nApics; proc++) {
1508  int level;
1509  for (level = 0; level < depth; level++) {
1510  if (retval[proc].first.labels[level] != last[level]) {
1511  unsigned j;
1512  for (j = level + 1; j < depth; j++) {
1513  totals[j]++;
1514  counts[j] = 1;
1515  // The line below causes printing incorrect topology information
1516  // in case the max value for some level (maxCt[level]) is encountered earlier than
1517  // some less value while going through the array.
1518  // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2
1519  // whereas it must be 4.
1520  // TODO!!! Check if it can be commented safely
1521  //maxCt[j] = 1;
1522  last[j] = retval[proc].first.labels[j];
1523  }
1524  totals[level]++;
1525  counts[level]++;
1526  if (counts[level] > maxCt[level]) {
1527  maxCt[level] = counts[level];
1528  }
1529  last[level] = retval[proc].first.labels[level];
1530  break;
1531  }
1532  else if (level == depth - 1) {
1533  __kmp_free(last);
1534  __kmp_free(maxCt);
1535  __kmp_free(counts);
1536  __kmp_free(totals);
1537  __kmp_free(retval);
1538  KMP_CPU_FREE(oldMask);
1539  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1540  return -1;
1541  }
1542  }
1543  }
1544 
1545  //
1546  // When affinity is off, this routine will still be called to set
1547  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
1548  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1549  // correctly, and return if affinity is not enabled.
1550  //
1551  if (threadLevel >= 0) {
1552  __kmp_nThreadsPerCore = maxCt[threadLevel];
1553  }
1554  else {
1555  __kmp_nThreadsPerCore = 1;
1556  }
1557  __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
1558 
1559  nPackages = totals[pkgLevel];
1560 
1561  if (coreLevel >= 0) {
1562  __kmp_ncores = totals[coreLevel];
1563  nCoresPerPkg = maxCt[coreLevel];
1564  }
1565  else {
1566  __kmp_ncores = nPackages;
1567  nCoresPerPkg = 1;
1568  }
1569 
1570  //
1571  // Check to see if the machine topology is uniform
1572  //
1573  unsigned prod = maxCt[0];
1574  for (level = 1; level < depth; level++) {
1575  prod *= maxCt[level];
1576  }
1577  bool uniform = (prod == totals[level - 1]);
1578 
1579  //
1580  // Print the machine topology summary.
1581  //
1582  if (__kmp_affinity_verbose) {
1583  char mask[KMP_AFFIN_MASK_PRINT_LEN];
1584  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1585 
1586  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1587  if (__kmp_affinity_respect_mask) {
1588  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1589  } else {
1590  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1591  }
1592  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1593  if (uniform) {
1594  KMP_INFORM(Uniform, "KMP_AFFINITY");
1595  } else {
1596  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1597  }
1598 
1599  kmp_str_buf_t buf;
1600  __kmp_str_buf_init(&buf);
1601 
1602  __kmp_str_buf_print(&buf, "%d", totals[0]);
1603  for (level = 1; level <= pkgLevel; level++) {
1604  __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1605  }
1606  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1607  __kmp_nThreadsPerCore, __kmp_ncores);
1608 
1609  __kmp_str_buf_free(&buf);
1610  }
1611 
1612  if (__kmp_affinity_type == affinity_none) {
1613  __kmp_free(last);
1614  __kmp_free(maxCt);
1615  __kmp_free(counts);
1616  __kmp_free(totals);
1617  __kmp_free(retval);
1618  KMP_CPU_FREE(oldMask);
1619  return 0;
1620  }
1621 
1622  //
1623  // Find any levels with radiix 1, and remove them from the map
1624  // (except for the package level).
1625  //
1626  int new_depth = 0;
1627  for (level = 0; level < depth; level++) {
1628  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1629  continue;
1630  }
1631  new_depth++;
1632  }
1633 
1634  //
1635  // If we are removing any levels, allocate a new vector to return,
1636  // and copy the relevant information to it.
1637  //
1638  if (new_depth != depth) {
1639  AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1640  sizeof(AddrUnsPair) * nApics);
1641  for (proc = 0; proc < nApics; proc++) {
1642  Address addr(new_depth);
1643  new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1644  }
1645  int new_level = 0;
1646  for (level = 0; level < depth; level++) {
1647  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1648  if (level == threadLevel) {
1649  threadLevel = -1;
1650  }
1651  else if ((threadLevel >= 0) && (level < threadLevel)) {
1652  threadLevel--;
1653  }
1654  if (level == coreLevel) {
1655  coreLevel = -1;
1656  }
1657  else if ((coreLevel >= 0) && (level < coreLevel)) {
1658  coreLevel--;
1659  }
1660  if (level < pkgLevel) {
1661  pkgLevel--;
1662  }
1663  continue;
1664  }
1665  for (proc = 0; proc < nApics; proc++) {
1666  new_retval[proc].first.labels[new_level]
1667  = retval[proc].first.labels[level];
1668  }
1669  new_level++;
1670  }
1671 
1672  __kmp_free(retval);
1673  retval = new_retval;
1674  depth = new_depth;
1675  }
1676 
1677  if (__kmp_affinity_gran_levels < 0) {
1678  //
1679  // Set the granularity level based on what levels are modeled
1680  // in the machine topology map.
1681  //
1682  __kmp_affinity_gran_levels = 0;
1683  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1684  __kmp_affinity_gran_levels++;
1685  }
1686  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1687  __kmp_affinity_gran_levels++;
1688  }
1689  if (__kmp_affinity_gran > affinity_gran_package) {
1690  __kmp_affinity_gran_levels++;
1691  }
1692  }
1693 
1694  if (__kmp_affinity_verbose) {
1695  __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1696  coreLevel, threadLevel);
1697  }
1698 
1699  __kmp_free(last);
1700  __kmp_free(maxCt);
1701  __kmp_free(counts);
1702  __kmp_free(totals);
1703  KMP_CPU_FREE(oldMask);
1704  *address2os = retval;
1705  return depth;
1706 }
1707 
1708 
1709 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1710 
1711 
//
// Slot numbers of the fields inside one ProcCpuInfo record (an array of
// unsigned values).  Slots from nodeIdIndex upward hold node_<n> ids.
//
#define osIdIndex 0
#define threadIdIndex 1
#define coreIdIndex 2
#define pkgIdIndex 3
#define nodeIdIndex 4

typedef unsigned *ProcCpuInfo;

// Highest slot number in use; starts out at the package id slot.
static unsigned maxIndex = pkgIdIndex;


//
// qsort-style comparator: orders two records by their osIdIndex slot only.
// Returns -1, 0 or 1.
//
// NOTE(review): a and b are read here as the unsigned arrays themselves,
// whereas __kmp_affinity_cmp_ProcCpuInfo_phys_id reads them as pointers to
// such arrays - confirm against the caller which element layout is sorted.
//
static int
__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
{
    const unsigned lhs = ((const unsigned *)a)[osIdIndex];
    const unsigned rhs = ((const unsigned *)b)[osIdIndex];
    return (lhs > rhs) - (lhs < rhs);
}


//
// qsort-style comparator: a and b each point at a pointer to a record.
// Slots are compared from maxIndex down to osIdIndex, and the first slot
// that differs decides the order.  Returns -1, 0 or 1.
//
static int
__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
{
    const unsigned *lhs = *((const unsigned **)a);
    const unsigned *rhs = *((const unsigned **)b);
    unsigned slot = maxIndex;
    do {
        if (lhs[slot] != rhs[slot]) {
            return (lhs[slot] < rhs[slot]) ? -1 : 1;
        }
    } while (slot-- != osIdIndex);   // slot osIdIndex is the last one examined
    return 0;
}
1746 
1747 
1748 //
1749 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1750 // affinity map.
1751 //
1752 static int
1753 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1754  kmp_i18n_id_t *const msg_id, FILE *f)
1755 {
1756  *address2os = NULL;
1757  *msg_id = kmp_i18n_null;
1758 
1759  //
1760  // Scan of the file, and count the number of "processor" (osId) fields,
1761  // and find the higest value of <n> for a node_<n> field.
1762  //
1763  char buf[256];
1764  unsigned num_records = 0;
1765  while (! feof(f)) {
1766  buf[sizeof(buf) - 1] = 1;
1767  if (! fgets(buf, sizeof(buf), f)) {
1768  //
1769  // Read errors presumably because of EOF
1770  //
1771  break;
1772  }
1773 
1774  char s1[] = "processor";
1775  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1776  num_records++;
1777  continue;
1778  }
1779 
1780  //
1781  // FIXME - this will match "node_<n> <garbage>"
1782  //
1783  int level;
1784  if (sscanf(buf, "node_%d id", &level) == 1) {
1785  if (nodeIdIndex + level >= maxIndex) {
1786  maxIndex = nodeIdIndex + level;
1787  }
1788  continue;
1789  }
1790  }
1791 
1792  //
1793  // Check for empty file / no valid processor records, or too many.
1794  // The number of records can't exceed the number of valid bits in the
1795  // affinity mask.
1796  //
1797  if (num_records == 0) {
1798  *line = 0;
1799  *msg_id = kmp_i18n_str_NoProcRecords;
1800  return -1;
1801  }
1802  if (num_records > __kmp_xproc) {
1803  *line = 0;
1804  *msg_id = kmp_i18n_str_TooManyProcRecords;
1805  return -1;
1806  }
1807 
1808  //
1809  // Set the file pointer back to the begginning, so that we can scan the
1810  // file again, this time performing a full parse of the data.
1811  // Allocate a vector of ProcCpuInfo object, where we will place the data.
1812  // Adding an extra element at the end allows us to remove a lot of extra
1813  // checks for termination conditions.
1814  //
1815  if (fseek(f, 0, SEEK_SET) != 0) {
1816  *line = 0;
1817  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1818  return -1;
1819  }
1820 
1821  //
1822  // Allocate the array of records to store the proc info in. The dummy
1823  // element at the end makes the logic in filling them out easier to code.
1824  //
1825  unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1826  * sizeof(unsigned *));
1827  unsigned i;
1828  for (i = 0; i <= num_records; i++) {
1829  threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1830  * sizeof(unsigned));
1831  }
1832 
1833 #define CLEANUP_THREAD_INFO \
1834  for (i = 0; i <= num_records; i++) { \
1835  __kmp_free(threadInfo[i]); \
1836  } \
1837  __kmp_free(threadInfo);
1838 
1839  //
1840  // A value of UINT_MAX means that we didn't find the field
1841  //
1842  unsigned __index;
1843 
1844 #define INIT_PROC_INFO(p) \
1845  for (__index = 0; __index <= maxIndex; __index++) { \
1846  (p)[__index] = UINT_MAX; \
1847  }
1848 
1849  for (i = 0; i <= num_records; i++) {
1850  INIT_PROC_INFO(threadInfo[i]);
1851  }
1852 
1853  unsigned num_avail = 0;
1854  *line = 0;
1855  while (! feof(f)) {
1856  //
1857  // Create an inner scoping level, so that all the goto targets at the
1858  // end of the loop appear in an outer scoping level. This avoids
1859  // warnings about jumping past an initialization to a target in the
1860  // same block.
1861  //
1862  {
1863  buf[sizeof(buf) - 1] = 1;
1864  bool long_line = false;
1865  if (! fgets(buf, sizeof(buf), f)) {
1866  //
1867  // Read errors presumably because of EOF
1868  //
1869  // If there is valid data in threadInfo[num_avail], then fake
1870  // a blank line in ensure that the last address gets parsed.
1871  //
1872  bool valid = false;
1873  for (i = 0; i <= maxIndex; i++) {
1874  if (threadInfo[num_avail][i] != UINT_MAX) {
1875  valid = true;
1876  }
1877  }
1878  if (! valid) {
1879  break;
1880  }
1881  buf[0] = 0;
1882  } else if (!buf[sizeof(buf) - 1]) {
1883  //
1884  // The line is longer than the buffer. Set a flag and don't
1885  // emit an error if we were going to ignore the line, anyway.
1886  //
1887  long_line = true;
1888 
1889 #define CHECK_LINE \
1890  if (long_line) { \
1891  CLEANUP_THREAD_INFO; \
1892  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
1893  return -1; \
1894  }
1895  }
1896  (*line)++;
1897 
1898  char s1[] = "processor";
1899  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1900  CHECK_LINE;
1901  char *p = strchr(buf + sizeof(s1) - 1, ':');
1902  unsigned val;
1903  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1904  if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
1905  threadInfo[num_avail][osIdIndex] = val;
1906  continue;
1907  }
1908  char s2[] = "physical id";
1909  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
1910  CHECK_LINE;
1911  char *p = strchr(buf + sizeof(s2) - 1, ':');
1912  unsigned val;
1913  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1914  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
1915  threadInfo[num_avail][pkgIdIndex] = val;
1916  continue;
1917  }
1918  char s3[] = "core id";
1919  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
1920  CHECK_LINE;
1921  char *p = strchr(buf + sizeof(s3) - 1, ':');
1922  unsigned val;
1923  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1924  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
1925  threadInfo[num_avail][coreIdIndex] = val;
1926  continue;
1927  }
1928  char s4[] = "thread id";
1929  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
1930  CHECK_LINE;
1931  char *p = strchr(buf + sizeof(s4) - 1, ':');
1932  unsigned val;
1933  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1934  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
1935  threadInfo[num_avail][threadIdIndex] = val;
1936  continue;
1937  }
1938  int level;
1939  if (sscanf(buf, "node_%d id", &level) == 1) {
1940  CHECK_LINE;
1941  char *p = strchr(buf + sizeof(s4) - 1, ':');
1942  unsigned val;
1943  if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val;
1944  KMP_ASSERT(nodeIdIndex + level <= maxIndex);
1945  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
1946  threadInfo[num_avail][nodeIdIndex + level] = val;
1947  continue;
1948  }
1949 
1950  //
1951  // We didn't recognize the leading token on the line.
1952  // There are lots of leading tokens that we don't recognize -
1953  // if the line isn't empty, go on to the next line.
1954  //
1955  if ((*buf != 0) && (*buf != '\n')) {
1956  //
1957  // If the line is longer than the buffer, read characters
1958  // until we find a newline.
1959  //
1960  if (long_line) {
1961  int ch;
1962  while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
1963  }
1964  continue;
1965  }
1966 
1967  //
1968  // A newline has signalled the end of the processor record.
1969  // Check that there aren't too many procs specified.
1970  //
1971  if (num_avail == __kmp_xproc) {
1972  CLEANUP_THREAD_INFO;
1973  *msg_id = kmp_i18n_str_TooManyEntries;
1974  return -1;
1975  }
1976 
1977  //
1978  // Check for missing fields. The osId field must be there, and we
1979  // currently require that the physical id field is specified, also.
1980  //
1981  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
1982  CLEANUP_THREAD_INFO;
1983  *msg_id = kmp_i18n_str_MissingProcField;
1984  return -1;
1985  }
1986  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
1987  CLEANUP_THREAD_INFO;
1988  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
1989  return -1;
1990  }
1991 
1992  //
1993  // Skip this proc if it is not included in the machine model.
1994  //
1995  if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
1996  INIT_PROC_INFO(threadInfo[num_avail]);
1997  continue;
1998  }
1999 
2000  //
2001  // We have a successful parse of this proc's info.
2002  // Increment the counter, and prepare for the next proc.
2003  //
2004  num_avail++;
2005  KMP_ASSERT(num_avail <= num_records);
2006  INIT_PROC_INFO(threadInfo[num_avail]);
2007  }
2008  continue;
2009 
2010  no_val:
2011  CLEANUP_THREAD_INFO;
2012  *msg_id = kmp_i18n_str_MissingValCpuinfo;
2013  return -1;
2014 
2015  dup_field:
2016  CLEANUP_THREAD_INFO;
2017  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2018  return -1;
2019  }
2020  *line = 0;
2021 
2022 # if KMP_MIC && REDUCE_TEAM_SIZE
2023  unsigned teamSize = 0;
2024 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2025 
2026  // check for num_records == __kmp_xproc ???
2027 
2028  //
2029  // If there's only one thread context to bind to, form an Address object
2030  // with depth 1 and return immediately (or, if affinity is off, set
2031  // address2os to NULL and return).
2032  //
2033  // If it is configured to omit the package level when there is only a
2034  // single package, the logic at the end of this routine won't work if
2035  // there is only a single thread - it would try to form an Address
2036  // object with depth 0.
2037  //
2038  KMP_ASSERT(num_avail > 0);
2039  KMP_ASSERT(num_avail <= num_records);
2040  if (num_avail == 1) {
2041  __kmp_ncores = 1;
2042  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2043  __kmp_ht_enabled = FALSE;
2044  if (__kmp_affinity_verbose) {
2045  if (! KMP_AFFINITY_CAPABLE()) {
2046  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2047  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2048  KMP_INFORM(Uniform, "KMP_AFFINITY");
2049  }
2050  else {
2051  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2052  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2053  fullMask);
2054  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2055  if (__kmp_affinity_respect_mask) {
2056  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2057  } else {
2058  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2059  }
2060  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2061  KMP_INFORM(Uniform, "KMP_AFFINITY");
2062  }
2063  int index;
2064  kmp_str_buf_t buf;
2065  __kmp_str_buf_init(&buf);
2066  __kmp_str_buf_print(&buf, "1");
2067  for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2068  __kmp_str_buf_print(&buf, " x 1");
2069  }
2070  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2071  __kmp_str_buf_free(&buf);
2072  }
2073 
2074  if (__kmp_affinity_type == affinity_none) {
2075  CLEANUP_THREAD_INFO;
2076  return 0;
2077  }
2078 
2079  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2080  Address addr(1);
2081  addr.labels[0] = threadInfo[0][pkgIdIndex];
2082  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2083 
2084  if (__kmp_affinity_gran_levels < 0) {
2085  __kmp_affinity_gran_levels = 0;
2086  }
2087 
2088  if (__kmp_affinity_verbose) {
2089  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2090  }
2091 
2092  CLEANUP_THREAD_INFO;
2093  return 1;
2094  }
2095 
2096  //
2097  // Sort the threadInfo table by physical Id.
2098  //
2099  qsort(threadInfo, num_avail, sizeof(*threadInfo),
2100  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2101 
2102  //
2103  // The table is now sorted by pkgId / coreId / threadId, but we really
2104  // don't know the radix of any of the fields. pkgId's may be sparsely
2105  // assigned among the chips on a system. Although coreId's are usually
2106  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2107  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2108  //
2109  // For that matter, we don't know what coresPerPkg and threadsPerCore
2110  // (or the total # packages) are at this point - we want to determine
2111  // that now. We only have an upper bound on the first two figures.
2112  //
2113  unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2114  * sizeof(unsigned));
2115  unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2116  * sizeof(unsigned));
2117  unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2118  * sizeof(unsigned));
2119  unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2120  * sizeof(unsigned));
2121 
2122  bool assign_thread_ids = false;
2123  int threadIdCt;
2124  int index;
2125 
2126  restart_radix_check:
2127  threadIdCt = 0;
2128 
2129  //
2130  // Initialize the counter arrays with data from threadInfo[0].
2131  //
2132  if (assign_thread_ids) {
2133  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2134  threadInfo[0][threadIdIndex] = threadIdCt++;
2135  }
2136  else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2137  threadIdCt = threadInfo[0][threadIdIndex] + 1;
2138  }
2139  }
2140  for (index = 0; index <= maxIndex; index++) {
2141  counts[index] = 1;
2142  maxCt[index] = 1;
2143  totals[index] = 1;
2144  lastId[index] = threadInfo[0][index];;
2145  }
2146 
2147  //
2148  // Run through the rest of the OS procs.
2149  //
2150  for (i = 1; i < num_avail; i++) {
2151  //
2152  // Find the most significant index whose id differs
2153  // from the id for the previous OS proc.
2154  //
2155  for (index = maxIndex; index >= threadIdIndex; index--) {
2156  if (assign_thread_ids && (index == threadIdIndex)) {
2157  //
2158  // Auto-assign the thread id field if it wasn't specified.
2159  //
2160  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2161  threadInfo[i][threadIdIndex] = threadIdCt++;
2162  }
2163 
2164  //
2165  // Apparently the thread id field was specified for some
2166  // entries and not others. Start the thread id counter
2167  // off at the next higher thread id.
2168  //
2169  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2170  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2171  }
2172  }
2173  if (threadInfo[i][index] != lastId[index]) {
2174  //
2175  // Run through all indices which are less significant,
2176  // and reset the counts to 1.
2177  //
2178  // At all levels up to and including index, we need to
2179  // increment the totals and record the last id.
2180  //
2181  int index2;
2182  for (index2 = threadIdIndex; index2 < index; index2++) {
2183  totals[index2]++;
2184  if (counts[index2] > maxCt[index2]) {
2185  maxCt[index2] = counts[index2];
2186  }
2187  counts[index2] = 1;
2188  lastId[index2] = threadInfo[i][index2];
2189  }
2190  counts[index]++;
2191  totals[index]++;
2192  lastId[index] = threadInfo[i][index];
2193 
2194  if (assign_thread_ids && (index > threadIdIndex)) {
2195 
2196 # if KMP_MIC && REDUCE_TEAM_SIZE
2197  //
2198  // The default team size is the total #threads in the machine
2199  // minus 1 thread for every core that has 3 or more threads.
2200  //
2201  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2202 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2203 
2204  //
2205  // Restart the thread counter, as we are on a new core.
2206  //
2207  threadIdCt = 0;
2208 
2209  //
2210  // Auto-assign the thread id field if it wasn't specified.
2211  //
2212  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2213  threadInfo[i][threadIdIndex] = threadIdCt++;
2214  }
2215 
2216  //
2217  // Apparently the thread id field was specified for some
2218  // entries and not others. Start the thread id counter
2219  // off at the next higher thread id.
2220  //
2221  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2222  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2223  }
2224  }
2225  break;
2226  }
2227  }
2228  if (index < threadIdIndex) {
2229  //
2230  // If thread ids were specified, it is an error if they are not
2231  // unique. Also, check that we haven't already restarted the
2232  // loop (to be safe - shouldn't need to).
2233  //
2234  if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2235  || assign_thread_ids) {
2236  __kmp_free(lastId);
2237  __kmp_free(totals);
2238  __kmp_free(maxCt);
2239  __kmp_free(counts);
2240  CLEANUP_THREAD_INFO;
2241  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2242  return -1;
2243  }
2244 
2245  //
2246  // If the thread ids were not specified and we see entries
2247  // that are duplicates, start the loop over and
2248  // assign the thread ids manually.
2249  //
2250  assign_thread_ids = true;
2251  goto restart_radix_check;
2252  }
2253  }
2254 
2255 # if KMP_MIC && REDUCE_TEAM_SIZE
2256  //
2257  // The default team size is the total #threads in the machine
2258  // minus 1 thread for every core that has 3 or more threads.
2259  //
2260  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2261 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2262 
2263  for (index = threadIdIndex; index <= maxIndex; index++) {
2264  if (counts[index] > maxCt[index]) {
2265  maxCt[index] = counts[index];
2266  }
2267  }
2268 
2269  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2270  nCoresPerPkg = maxCt[coreIdIndex];
2271  nPackages = totals[pkgIdIndex];
2272 
2273  //
2274  // Check to see if the machine topology is uniform
2275  //
2276  unsigned prod = totals[maxIndex];
2277  for (index = threadIdIndex; index < maxIndex; index++) {
2278  prod *= maxCt[index];
2279  }
2280  bool uniform = (prod == totals[threadIdIndex]);
2281 
2282  //
2283  // When affinity is off, this routine will still be called to set
2284  // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
2285  // nCoresPerPkg, & nPackages. Make sure all these vars are set
2286  // correctly, and return now if affinity is not enabled.
2287  //
2288  __kmp_ht_enabled = (maxCt[threadIdIndex] > 1); // threads per core > 1
2289  __kmp_ncores = totals[coreIdIndex];
2290 
2291  if (__kmp_affinity_verbose) {
2292  if (! KMP_AFFINITY_CAPABLE()) {
2293  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2294  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2295  if (uniform) {
2296  KMP_INFORM(Uniform, "KMP_AFFINITY");
2297  } else {
2298  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2299  }
2300  }
2301  else {
2302  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2303  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2304  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2305  if (__kmp_affinity_respect_mask) {
2306  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2307  } else {
2308  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2309  }
2310  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2311  if (uniform) {
2312  KMP_INFORM(Uniform, "KMP_AFFINITY");
2313  } else {
2314  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2315  }
2316  }
2317  kmp_str_buf_t buf;
2318  __kmp_str_buf_init(&buf);
2319 
2320  __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2321  for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2322  __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2323  }
2324  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2325  maxCt[threadIdIndex], __kmp_ncores);
2326 
2327  __kmp_str_buf_free(&buf);
2328  }
2329 
2330 # if KMP_MIC && REDUCE_TEAM_SIZE
2331  //
2332  // Set the default team size.
2333  //
2334  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2335  __kmp_dflt_team_nth = teamSize;
2336  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2337  __kmp_dflt_team_nth));
2338  }
2339 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2340 
2341  if (__kmp_affinity_type == affinity_none) {
2342  __kmp_free(lastId);
2343  __kmp_free(totals);
2344  __kmp_free(maxCt);
2345  __kmp_free(counts);
2346  CLEANUP_THREAD_INFO;
2347  return 0;
2348  }
2349 
2350  //
2351  // Count the number of levels which have more nodes at that level than
2352  // at the parent's level (with there being an implicit root node of
2353  // the top level). This is equivalent to saying that there is at least
2354  // one node at this level which has a sibling. These levels are in the
2355  // map, and the package level is always in the map.
2356  //
2357  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2358  int level = 0;
2359  for (index = threadIdIndex; index < maxIndex; index++) {
2360  KMP_ASSERT(totals[index] >= totals[index + 1]);
2361  inMap[index] = (totals[index] > totals[index + 1]);
2362  }
2363  inMap[maxIndex] = (totals[maxIndex] > 1);
2364  inMap[pkgIdIndex] = true;
2365 
2366  int depth = 0;
2367  for (index = threadIdIndex; index <= maxIndex; index++) {
2368  if (inMap[index]) {
2369  depth++;
2370  }
2371  }
2372  KMP_ASSERT(depth > 0);
2373 
2374  //
2375  // Construct the data structure that is to be returned.
2376  //
2377  *address2os = (AddrUnsPair*)
2378  __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2379  int pkgLevel = -1;
2380  int coreLevel = -1;
2381  int threadLevel = -1;
2382 
2383  for (i = 0; i < num_avail; ++i) {
2384  Address addr(depth);
2385  unsigned os = threadInfo[i][osIdIndex];
2386  int src_index;
2387  int dst_index = 0;
2388 
2389  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2390  if (! inMap[src_index]) {
2391  continue;
2392  }
2393  addr.labels[dst_index] = threadInfo[i][src_index];
2394  if (src_index == pkgIdIndex) {
2395  pkgLevel = dst_index;
2396  }
2397  else if (src_index == coreIdIndex) {
2398  coreLevel = dst_index;
2399  }
2400  else if (src_index == threadIdIndex) {
2401  threadLevel = dst_index;
2402  }
2403  dst_index++;
2404  }
2405  (*address2os)[i] = AddrUnsPair(addr, os);
2406  }
2407 
2408  if (__kmp_affinity_gran_levels < 0) {
2409  //
2410  // Set the granularity level based on what levels are modeled
2411  // in the machine topology map.
2412  //
2413  int src_index;
2414  __kmp_affinity_gran_levels = 0;
2415  for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2416  if (! inMap[src_index]) {
2417  continue;
2418  }
2419  switch (src_index) {
2420  case threadIdIndex:
2421  if (__kmp_affinity_gran > affinity_gran_thread) {
2422  __kmp_affinity_gran_levels++;
2423  }
2424 
2425  break;
2426  case coreIdIndex:
2427  if (__kmp_affinity_gran > affinity_gran_core) {
2428  __kmp_affinity_gran_levels++;
2429  }
2430  break;
2431 
2432  case pkgIdIndex:
2433  if (__kmp_affinity_gran > affinity_gran_package) {
2434  __kmp_affinity_gran_levels++;
2435  }
2436  break;
2437  }
2438  }
2439  }
2440 
2441  if (__kmp_affinity_verbose) {
2442  __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2443  coreLevel, threadLevel);
2444  }
2445 
2446  __kmp_free(inMap);
2447  __kmp_free(lastId);
2448  __kmp_free(totals);
2449  __kmp_free(maxCt);
2450  __kmp_free(counts);
2451  CLEANUP_THREAD_INFO;
2452  return depth;
2453 }
2454 
2455 
2456 //
2457 // Create and return a table of affinity masks, indexed by OS thread ID.
2458 // This routine handles OR'ing together all the affinity masks of threads
2459 // that are sufficiently close, if granularity > fine.
2460 //
2461 static kmp_affin_mask_t *
2462 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2463  AddrUnsPair *address2os, unsigned numAddrs)
2464 {
2465  //
2466  // First form a table of affinity masks in order of OS thread id.
2467  //
2468  unsigned depth;
2469  unsigned maxOsId;
2470  unsigned i;
2471 
2472  KMP_ASSERT(numAddrs > 0);
2473  depth = address2os[0].first.depth;
2474 
2475  maxOsId = 0;
2476  for (i = 0; i < numAddrs; i++) {
2477  unsigned osId = address2os[i].second;
2478  if (osId > maxOsId) {
2479  maxOsId = osId;
2480  }
2481  }
2482  kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2483  (maxOsId + 1) * __kmp_affin_mask_size);
2484 
2485  //
2486  // Sort the address2os table according to physical order. Doing so
2487  // will put all threads on the same core/package/node in consecutive
2488  // locations.
2489  //
2490  qsort(address2os, numAddrs, sizeof(*address2os),
2491  __kmp_affinity_cmp_Address_labels);
2492 
2493  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2494  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2495  KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2496  }
2497  if (__kmp_affinity_gran_levels >= depth) {
2498  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2499  && (__kmp_affinity_type != affinity_none))) {
2500  KMP_WARNING(AffThreadsMayMigrate);
2501  }
2502  }
2503 
2504  //
2505  // Run through the table, forming the masks for all threads on each
2506  // core. Threads on the same core will have identical "Address"
2507  // objects, not considering the last level, which must be the thread
2508  // id. All threads on a core will appear consecutively.
2509  //
2510  unsigned unique = 0;
2511  unsigned j = 0; // index of 1st thread on core
2512  unsigned leader = 0;
2513  Address *leaderAddr = &(address2os[0].first);
2514  kmp_affin_mask_t *sum
2515  = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
2516  KMP_CPU_ZERO(sum);
2517  KMP_CPU_SET(address2os[0].second, sum);
2518  for (i = 1; i < numAddrs; i++) {
2519  //
2520  // If this thread is sufficiently close to the leader (withing the
2521  // granularity setting), then set the bit for this os thread in the
2522  // affinity mask for this group, and go on to the next thread.
2523  //
2524  if (leaderAddr->isClose(address2os[i].first,
2525  __kmp_affinity_gran_levels)) {
2526  KMP_CPU_SET(address2os[i].second, sum);
2527  continue;
2528  }
2529 
2530  //
2531  // For every thread in this group, copy the mask to the thread's
2532  // entry in the osId2Mask table. Mark the first address as a
2533  // leader.
2534  //
2535  for (; j < i; j++) {
2536  unsigned osId = address2os[j].second;
2537  KMP_DEBUG_ASSERT(osId <= maxOsId);
2538  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2539  KMP_CPU_COPY(mask, sum);
2540  address2os[j].first.leader = (j == leader);
2541  }
2542  unique++;
2543 
2544  //
2545  // Start a new mask.
2546  //
2547  leader = i;
2548  leaderAddr = &(address2os[i].first);
2549  KMP_CPU_ZERO(sum);
2550  KMP_CPU_SET(address2os[i].second, sum);
2551  }
2552 
2553  //
2554  // For every thread in last group, copy the mask to the thread's
2555  // entry in the osId2Mask table.
2556  //
2557  for (; j < i; j++) {
2558  unsigned osId = address2os[j].second;
2559  KMP_DEBUG_ASSERT(osId <= maxOsId);
2560  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2561  KMP_CPU_COPY(mask, sum);
2562  address2os[j].first.leader = (j == leader);
2563  }
2564  unique++;
2565 
2566  *maxIndex = maxOsId;
2567  *numUnique = unique;
2568  return osId2Mask;
2569 }
2570 
2571 
2572 //
2573 // Stuff for the affinity proclist parsers. It's easier to declare these vars
2574 // as file-static than to try and pass them through the calling sequence of
2575 // the recursive-descent OMP_PLACES parser.
2576 //
2577 static kmp_affin_mask_t *newMasks;
2578 static int numNewMasks;
2579 static int nextNewMask;
2580 
//
// Append a copy of _mask to the newMasks vector. When the vector is already
// full, its capacity (numNewMasks) is doubled first via KMP_INTERNAL_REALLOC.
// Expands to a brace-enclosed block that updates the file-static parser state.
//
#define ADD_MASK(_mask) \
    { \
        if (numNewMasks <= nextNewMask) { \
            numNewMasks = numNewMasks * 2; \
            newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
              numNewMasks * __kmp_affin_mask_size); \
        } \
        KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
        nextNewMask += 1; \
    }
2591 
//
// Append the mask stored for OS proc _osId in the _osId2Mask table to the
// newMasks vector (see ADD_MASK).
//
// The proc id is rejected when it is larger than _maxOsId or when its own bit
// is not set in its _osId2Mask entry; in that case nothing is appended and,
// if affinity messages are enabled, an AffIgnoreInvalidProcID warning is
// issued instead.
//
// Every argument is parenthesized in the expansion so that expressions can be
// passed safely. NOTE: _osId is still evaluated more than once, so it must
// not have side effects.
//
#define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
    { \
        if (((_osId) > (_maxOsId)) || \
          (! KMP_CPU_ISSET((_osId), \
          KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
              && (__kmp_affinity_type != affinity_none))) { \
                KMP_WARNING(AffIgnoreInvalidProcID, (_osId)); \
            } \
        } \
        else { \
            ADD_MASK(KMP_CPU_INDEX((_osId2Mask), (_osId))); \
        } \
    }
2605 
2606 
2607 //
2608 // Re-parse the proclist (for the explicit affinity type), and form the list
2609 // of affinity newMasks indexed by gtid.
2610 //
2611 static void
2612 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2613  unsigned int *out_numMasks, const char *proclist,
2614  kmp_affin_mask_t *osId2Mask, int maxOsId)
2615 {
2616  const char *scan = proclist;
2617  const char *next = proclist;
2618 
2619  //
2620  // We use malloc() for the temporary mask vector,
2621  // so that we can use realloc() to extend it.
2622  //
2623  numNewMasks = 2;
2624  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2625  * __kmp_affin_mask_size);
2626  nextNewMask = 0;
2627  kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2628  __kmp_affin_mask_size);
2629  int setSize = 0;
2630 
2631  for (;;) {
2632  int start, end, stride;
2633 
2634  SKIP_WS(scan);
2635  next = scan;
2636  if (*next == '\0') {
2637  break;
2638  }
2639 
2640  if (*next == '{') {
2641  int num;
2642  setSize = 0;
2643  next++; // skip '{'
2644  SKIP_WS(next);
2645  scan = next;
2646 
2647  //
2648  // Read the first integer in the set.
2649  //
2650  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2651  "bad proclist");
2652  SKIP_DIGITS(next);
2653  num = __kmp_str_to_int(scan, *next);
2654  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2655 
2656  //
2657  // Copy the mask for that osId to the sum (union) mask.
2658  //
2659  if ((num > maxOsId) ||
2660  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2661  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2662  && (__kmp_affinity_type != affinity_none))) {
2663  KMP_WARNING(AffIgnoreInvalidProcID, num);
2664  }
2665  KMP_CPU_ZERO(sumMask);
2666  }
2667  else {
2668  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2669  setSize = 1;
2670  }
2671 
2672  for (;;) {
2673  //
2674  // Check for end of set.
2675  //
2676  SKIP_WS(next);
2677  if (*next == '}') {
2678  next++; // skip '}'
2679  break;
2680  }
2681 
2682  //
2683  // Skip optional comma.
2684  //
2685  if (*next == ',') {
2686  next++;
2687  }
2688  SKIP_WS(next);
2689 
2690  //
2691  // Read the next integer in the set.
2692  //
2693  scan = next;
2694  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2695  "bad explicit proc list");
2696 
2697  SKIP_DIGITS(next);
2698  num = __kmp_str_to_int(scan, *next);
2699  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2700 
2701  //
2702  // Add the mask for that osId to the sum mask.
2703  //
2704  if ((num > maxOsId) ||
2705  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2706  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2707  && (__kmp_affinity_type != affinity_none))) {
2708  KMP_WARNING(AffIgnoreInvalidProcID, num);
2709  }
2710  }
2711  else {
2712  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2713  setSize++;
2714  }
2715  }
2716  if (setSize > 0) {
2717  ADD_MASK(sumMask);
2718  }
2719 
2720  SKIP_WS(next);
2721  if (*next == ',') {
2722  next++;
2723  }
2724  scan = next;
2725  continue;
2726  }
2727 
2728  //
2729  // Read the first integer.
2730  //
2731  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2732  SKIP_DIGITS(next);
2733  start = __kmp_str_to_int(scan, *next);
2734  KMP_ASSERT2(start >= 0, "bad explicit proc list");
2735  SKIP_WS(next);
2736 
2737  //
2738  // If this isn't a range, then add a mask to the list and go on.
2739  //
2740  if (*next != '-') {
2741  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2742 
2743  //
2744  // Skip optional comma.
2745  //
2746  if (*next == ',') {
2747  next++;
2748  }
2749  scan = next;
2750  continue;
2751  }
2752 
2753  //
2754  // This is a range. Skip over the '-' and read in the 2nd int.
2755  //
2756  next++; // skip '-'
2757  SKIP_WS(next);
2758  scan = next;
2759  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2760  SKIP_DIGITS(next);
2761  end = __kmp_str_to_int(scan, *next);
2762  KMP_ASSERT2(end >= 0, "bad explicit proc list");
2763 
2764  //
2765  // Check for a stride parameter
2766  //
2767  stride = 1;
2768  SKIP_WS(next);
2769  if (*next == ':') {
2770  //
2771  // A stride is specified. Skip over the ':" and read the 3rd int.
2772  //
2773  int sign = +1;
2774  next++; // skip ':'
2775  SKIP_WS(next);
2776  scan = next;
2777  if (*next == '-') {
2778  sign = -1;
2779  next++;
2780  SKIP_WS(next);
2781  scan = next;
2782  }
2783  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2784  "bad explicit proc list");
2785  SKIP_DIGITS(next);
2786  stride = __kmp_str_to_int(scan, *next);
2787  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2788  stride *= sign;
2789  }
2790 
2791  //
2792  // Do some range checks.
2793  //
2794  KMP_ASSERT2(stride != 0, "bad explicit proc list");
2795  if (stride > 0) {
2796  KMP_ASSERT2(start <= end, "bad explicit proc list");
2797  }
2798  else {
2799  KMP_ASSERT2(start >= end, "bad explicit proc list");
2800  }
2801  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2802 
2803  //
2804  // Add the mask for each OS proc # to the list.
2805  //
2806  if (stride > 0) {
2807  do {
2808  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2809  start += stride;
2810  } while (start <= end);
2811  }
2812  else {
2813  do {
2814  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2815  start += stride;
2816  } while (start >= end);
2817  }
2818 
2819  //
2820  // Skip optional comma.
2821  //
2822  SKIP_WS(next);
2823  if (*next == ',') {
2824  next++;
2825  }
2826  scan = next;
2827  }
2828 
2829  *out_numMasks = nextNewMask;
2830  if (nextNewMask == 0) {
2831  *out_masks = NULL;
2832  KMP_INTERNAL_FREE(newMasks);
2833  return;
2834  }
2835  *out_masks
2836  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2837  memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2838  __kmp_free(sumMask);
2839  KMP_INTERNAL_FREE(newMasks);
2840 }
2841 
2842 
2843 # if OMP_40_ENABLED
2844 
2845 /*-----------------------------------------------------------------------------
2846 
2847 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2848 places. Again, here is the grammar:
2849 
2850 place_list := place
2851 place_list := place , place_list
2852 place := num
2853 place := place : num
2854 place := place : num : signed
2855 place := { subplace_list }
2856 place := ! place // (lowest priority)
2857 subplace_list := subplace
2858 subplace_list := subplace , subplace_list
2859 subplace := num
2860 subplace := num : num
2861 subplace := num : num : signed
2862 signed := num
2863 signed := + signed
2864 signed := - signed
2865 
2866 -----------------------------------------------------------------------------*/
2867 
2868 static void
2869 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2870  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2871 {
2872  const char *next;
2873 
2874  for (;;) {
2875  int start, count, stride, i;
2876 
2877  //
2878  // Read in the starting proc id
2879  //
2880  SKIP_WS(*scan);
2881  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2882  "bad explicit places list");
2883  next = *scan;
2884  SKIP_DIGITS(next);
2885  start = __kmp_str_to_int(*scan, *next);
2886  KMP_ASSERT(start >= 0);
2887  *scan = next;
2888 
2889  //
2890  // valid follow sets are ',' ':' and '}'
2891  //
2892  SKIP_WS(*scan);
2893  if (**scan == '}' || **scan == ',') {
2894  if ((start > maxOsId) ||
2895  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2896  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2897  && (__kmp_affinity_type != affinity_none))) {
2898  KMP_WARNING(AffIgnoreInvalidProcID, start);
2899  }
2900  }
2901  else {
2902  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2903  (*setSize)++;
2904  }
2905  if (**scan == '}') {
2906  break;
2907  }
2908  (*scan)++; // skip ','
2909  continue;
2910  }
2911  KMP_ASSERT2(**scan == ':', "bad explicit places list");
2912  (*scan)++; // skip ':'
2913 
2914  //
2915  // Read count parameter
2916  //
2917  SKIP_WS(*scan);
2918  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2919  "bad explicit places list");
2920  next = *scan;
2921  SKIP_DIGITS(next);
2922  count = __kmp_str_to_int(*scan, *next);
2923  KMP_ASSERT(count >= 0);
2924  *scan = next;
2925 
2926  //
2927  // valid follow sets are ',' ':' and '}'
2928  //
2929  SKIP_WS(*scan);
2930  if (**scan == '}' || **scan == ',') {
2931  for (i = 0; i < count; i++) {
2932  if ((start > maxOsId) ||
2933  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2934  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2935  && (__kmp_affinity_type != affinity_none))) {
2936  KMP_WARNING(AffIgnoreInvalidProcID, start);
2937  }
2938  break; // don't proliferate warnings for large count
2939  }
2940  else {
2941  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2942  start++;
2943  (*setSize)++;
2944  }
2945  }
2946  if (**scan == '}') {
2947  break;
2948  }
2949  (*scan)++; // skip ','
2950  continue;
2951  }
2952  KMP_ASSERT2(**scan == ':', "bad explicit places list");
2953  (*scan)++; // skip ':'
2954 
2955  //
2956  // Read stride parameter
2957  //
2958  int sign = +1;
2959  for (;;) {
2960  SKIP_WS(*scan);
2961  if (**scan == '+') {
2962  (*scan)++; // skip '+'
2963  continue;
2964  }
2965  if (**scan == '-') {
2966  sign *= -1;
2967  (*scan)++; // skip '-'
2968  continue;
2969  }
2970  break;
2971  }
2972  SKIP_WS(*scan);
2973  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2974  "bad explicit places list");
2975  next = *scan;
2976  SKIP_DIGITS(next);
2977  stride = __kmp_str_to_int(*scan, *next);
2978  KMP_ASSERT(stride >= 0);
2979  *scan = next;
2980  stride *= sign;
2981 
2982  //
2983  // valid follow sets are ',' and '}'
2984  //
2985  SKIP_WS(*scan);
2986  if (**scan == '}' || **scan == ',') {
2987  for (i = 0; i < count; i++) {
2988  if ((start > maxOsId) ||
2989  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2990  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2991  && (__kmp_affinity_type != affinity_none))) {
2992  KMP_WARNING(AffIgnoreInvalidProcID, start);
2993  }
2994  break; // don't proliferate warnings for large count
2995  }
2996  else {
2997  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2998  start += stride;
2999  (*setSize)++;
3000  }
3001  }
3002  if (**scan == '}') {
3003  break;
3004  }
3005  (*scan)++; // skip ','
3006  continue;
3007  }
3008 
3009  KMP_ASSERT2(0, "bad explicit places list");
3010  }
3011 }
3012 
3013 
3014 static void
3015 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3016  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3017 {
3018  const char *next;
3019 
3020  //
3021  // valid follow sets are '{' '!' and num
3022  //
3023  SKIP_WS(*scan);
3024  if (**scan == '{') {
3025  (*scan)++; // skip '{'
3026  __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3027  setSize);
3028  KMP_ASSERT2(**scan == '}', "bad explicit places list");
3029  (*scan)++; // skip '}'
3030  }
3031  else if (**scan == '!') {
3032  __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3033  KMP_CPU_COMPLEMENT(tempMask);
3034  (*scan)++; // skip '!'
3035  }
3036  else if ((**scan >= '0') && (**scan <= '9')) {
3037  next = *scan;
3038  SKIP_DIGITS(next);
3039  int num = __kmp_str_to_int(*scan, *next);
3040  KMP_ASSERT(num >= 0);
3041  if ((num > maxOsId) ||
3042  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3043  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3044  && (__kmp_affinity_type != affinity_none))) {
3045  KMP_WARNING(AffIgnoreInvalidProcID, num);
3046  }
3047  }
3048  else {
3049  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3050  (*setSize)++;
3051  }
3052  *scan = next; // skip num
3053  }
3054  else {
3055  KMP_ASSERT2(0, "bad explicit places list");
3056  }
3057 }
3058 
3059 
3060 static void
3061 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3062  unsigned int *out_numMasks, const char *placelist,
3063  kmp_affin_mask_t *osId2Mask, int maxOsId)
3064 {
3065  const char *scan = placelist;
3066  const char *next = placelist;
3067 
3068  numNewMasks = 2;
3069  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3070  * __kmp_affin_mask_size);
3071  nextNewMask = 0;
3072 
3073  kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3074  __kmp_affin_mask_size);
3075  KMP_CPU_ZERO(tempMask);
3076  int setSize = 0;
3077 
3078  for (;;) {
3079  int start, count, stride;
3080 
3081  __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3082 
3083  //
3084  // valid follow sets are ',' ':' and EOL
3085  //
3086  SKIP_WS(scan);
3087  if (*scan == '\0' || *scan == ',') {
3088  if (setSize > 0) {
3089  ADD_MASK(tempMask);
3090  }
3091  KMP_CPU_ZERO(tempMask);
3092  setSize = 0;
3093  if (*scan == '\0') {
3094  break;
3095  }
3096  scan++; // skip ','
3097  continue;
3098  }
3099 
3100  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3101  scan++; // skip ':'
3102 
3103  //
3104  // Read count parameter
3105  //
3106  SKIP_WS(scan);
3107  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3108  "bad explicit places list");
3109  next = scan;
3110  SKIP_DIGITS(next);
3111  count = __kmp_str_to_int(scan, *next);
3112  KMP_ASSERT(count >= 0);
3113  scan = next;
3114 
3115  //
3116  // valid follow sets are ',' ':' and EOL
3117  //
3118  SKIP_WS(scan);
3119  if (*scan == '\0' || *scan == ',') {
3120  int i;
3121  for (i = 0; i < count; i++) {
3122  int j;
3123  if (setSize == 0) {
3124  break;
3125  }
3126  ADD_MASK(tempMask);
3127  setSize = 0;
3128  for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j > 0; j--) {
3129  //
3130  // Use a temp var in case macro is changed to evaluate
3131  // args multiple times.
3132  //
3133  if (KMP_CPU_ISSET(j - stride, tempMask)) {
3134  KMP_CPU_SET(j, tempMask);
3135  setSize++;
3136  }
3137  else {
3138  KMP_CPU_CLR(j, tempMask);
3139  }
3140  }
3141  for (; j >= 0; j--) {
3142  KMP_CPU_CLR(j, tempMask);
3143  }
3144  }
3145  KMP_CPU_ZERO(tempMask);
3146  setSize = 0;
3147 
3148  if (*scan == '\0') {
3149  break;
3150  }
3151  scan++; // skip ','
3152  continue;
3153  }
3154 
3155  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3156  scan++; // skip ':'
3157 
3158  //
3159  // Read stride parameter
3160  //
3161  int sign = +1;
3162  for (;;) {
3163  SKIP_WS(scan);
3164  if (*scan == '+') {
3165  scan++; // skip '+'
3166  continue;
3167  }
3168  if (*scan == '-') {
3169  sign *= -1;
3170  scan++; // skip '-'
3171  continue;
3172  }
3173  break;
3174  }
3175  SKIP_WS(scan);
3176  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3177  "bad explicit places list");
3178  next = scan;
3179  SKIP_DIGITS(next);
3180  stride = __kmp_str_to_int(scan, *next);
3181  KMP_DEBUG_ASSERT(stride >= 0);
3182  scan = next;
3183  stride *= sign;
3184 
3185  if (stride > 0) {
3186  int i;
3187  for (i = 0; i < count; i++) {
3188  int j;
3189  if (setSize == 0) {
3190  break;
3191  }
3192  ADD_MASK(tempMask);
3193  setSize = 0;
3194  for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3195  if (KMP_CPU_ISSET(j - stride, tempMask)) {
3196  KMP_CPU_SET(j, tempMask);
3197  setSize++;
3198  }
3199  else {
3200  KMP_CPU_CLR(j, tempMask);
3201  }
3202  }
3203  for (; j >= 0; j--) {
3204  KMP_CPU_CLR(j, tempMask);
3205  }
3206  }
3207  }
3208  else {
3209  int i;
3210  for (i = 0; i < count; i++) {
3211  int j;
3212  if (setSize == 0) {
3213  break;
3214  }
3215  ADD_MASK(tempMask);
3216  setSize = 0;
3217  for (j = 0; j < (__kmp_affin_mask_size * CHAR_BIT) + stride;
3218  j++) {
3219  if (KMP_CPU_ISSET(j - stride, tempMask)) {
3220  KMP_CPU_SET(j, tempMask);
3221  setSize++;
3222  }
3223  else {
3224  KMP_CPU_CLR(j, tempMask);
3225  }
3226  }
3227  for (; j < __kmp_affin_mask_size * CHAR_BIT; j++) {
3228  KMP_CPU_CLR(j, tempMask);
3229  }
3230  }
3231  }
3232  KMP_CPU_ZERO(tempMask);
3233  setSize = 0;
3234 
3235  //
3236  // valid follow sets are ',' and EOL
3237  //
3238  SKIP_WS(scan);
3239  if (*scan == '\0') {
3240  break;
3241  }
3242  if (*scan == ',') {
3243  scan++; // skip ','
3244  continue;
3245  }
3246 
3247  KMP_ASSERT2(0, "bad explicit places list");
3248  }
3249 
3250  *out_numMasks = nextNewMask;
3251  if (nextNewMask == 0) {
3252  *out_masks = NULL;
3253  KMP_INTERNAL_FREE(newMasks);
3254  return;
3255  }
3256  *out_masks
3257  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3258  memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3259  __kmp_free(tempMask);
3260  KMP_INTERNAL_FREE(newMasks);
3261 }
3262 
3263 # endif /* OMP_40_ENABLED */
3264 
3265 #undef ADD_MASK
3266 #undef ADD_MASK_OSID
3267 
3268 
3269 # if KMP_MIC
3270 
3271 static void
3272 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3273 {
3274  if ( __kmp_place_num_cores == 0 ) {
3275  if ( __kmp_place_num_threads_per_core == 0 ) {
3276  return; // no cores limiting actions requested, exit
3277  }
3278  __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3279  }
3280  if ( !__kmp_affinity_uniform_topology() || depth != 3 ) {
3281  KMP_WARNING( AffThrPlaceUnsupported );
3282  return; // don't support non-uniform topology or not-3-level architecture
3283  }
3284  if ( __kmp_place_num_threads_per_core == 0 ) {
3285  __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3286  }
3287  if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3288  KMP_WARNING( AffThrPlaceManyCores );
3289  return;
3290  }
3291 
3292  AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3293  nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3294  int i, j, k, n_old = 0, n_new = 0;
3295  for ( i = 0; i < nPackages; ++i ) {
3296  for ( j = 0; j < nCoresPerPkg; ++j ) {
3297  if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3298  n_old += __kmp_nThreadsPerCore; // skip not-requested core
3299  } else {
3300  for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3301  if ( k < __kmp_place_num_threads_per_core ) {
3302  newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' data to new location
3303  n_new++;
3304  }
3305  n_old++;
3306  }
3307  }
3308  }
3309  }
3310  nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3311  __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3312  __kmp_avail_proc = n_new; // correct avail_proc
3313  __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3314 
3315  __kmp_free( *pAddr );
3316  *pAddr = newAddr; // replace old topology with new one
3317 }
3318 
3319 # endif /* KMP_MIC */
3320 
3321 
3322 static AddrUnsPair *address2os = NULL; // topology table filled in by the map-creation calls in __kmp_aux_affinity_initialize(); released in __kmp_affinity_uninitialize()
3323 static int * procarr = NULL; // allocated for affinity_balanced on a non-uniform topology: slot (core * threads-per-core + thread) holds an OS proc id, -1 if unused
3324 static int __kmp_aff_depth = 0; // topology depth saved by the affinity_balanced non-uniform case "for further usage"
3325 
3326 static void
3327 __kmp_aux_affinity_initialize(void)
3328 {
3329  if (__kmp_affinity_masks != NULL) {
3330  KMP_ASSERT(fullMask != NULL);
3331  return;
3332  }
3333 
3334  //
3335  // Create the "full" mask - this defines all of the processors that we
3336  // consider to be in the machine model. If respect is set, then it is
3337  // the initialization thread's affinity mask. Otherwise, it is all
3338  // processors that we know about on the machine.
3339  //
3340  if (fullMask == NULL) {
3341  fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3342  }
3343  if (KMP_AFFINITY_CAPABLE()) {
3344  if (__kmp_affinity_respect_mask) {
3345  __kmp_get_system_affinity(fullMask, TRUE);
3346 
3347  //
3348  // Count the number of available processors.
3349  //
3350  unsigned i;
3351  __kmp_avail_proc = 0;
3352  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3353  if (! KMP_CPU_ISSET(i, fullMask)) {
3354  continue;
3355  }
3356  __kmp_avail_proc++;
3357  }
3358  if (__kmp_avail_proc > __kmp_xproc) {
3359  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3360  && (__kmp_affinity_type != affinity_none))) {
3361  KMP_WARNING(ErrorInitializeAffinity);
3362  }
3363  __kmp_affinity_type = affinity_none;
3364  __kmp_affin_mask_size = 0;
3365  return;
3366  }
3367  }
3368  else {
3369  __kmp_affinity_entire_machine_mask(fullMask);
3370  __kmp_avail_proc = __kmp_xproc;
3371  }
3372  }
3373 
3374  int depth = -1;
3375  kmp_i18n_id_t msg_id = kmp_i18n_null;
3376 
3377  //
3378  // For backward compatiblity, setting KMP_CPUINFO_FILE =>
3379  // KMP_TOPOLOGY_METHOD=cpuinfo
3380  //
3381  if ((__kmp_cpuinfo_file != NULL) &&
3382  (__kmp_affinity_top_method == affinity_top_method_all)) {
3383  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3384  }
3385 
3386  if (__kmp_affinity_top_method == affinity_top_method_all) {
3387  //
3388  // In the default code path, errors are not fatal - we just try using
3389  // another method. We only emit a warning message if affinity is on,
3390  // or the verbose flag is set, an the nowarnings flag was not set.
3391  //
3392  const char *file_name = NULL;
3393  int line = 0;
3394 
3395 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3396 
3397  if (__kmp_affinity_verbose) {
3398  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3399  }
3400 
3401  file_name = NULL;
3402  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3403  if (depth == 0) {
3404  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3405  KMP_ASSERT(address2os == NULL);
3406  return;
3407  }
3408 
3409  if (depth < 0) {
3410  if ((msg_id != kmp_i18n_null)
3411  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3412  && (__kmp_affinity_type != affinity_none)))) {
3413 # if KMP_MIC
3414  if (__kmp_affinity_verbose) {
3415  KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3416  KMP_I18N_STR(DecodingLegacyAPIC));
3417  }
3418 # else
3419  KMP_WARNING(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3420  KMP_I18N_STR(DecodingLegacyAPIC));
3421 # endif
3422  }
3423 
3424  file_name = NULL;
3425  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3426  if (depth == 0) {
3427  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3428  KMP_ASSERT(address2os == NULL);
3429  return;
3430  }
3431  }
3432 
3433 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3434 
3435 # if KMP_OS_LINUX
3436 
3437  if (depth < 0) {
3438  if ((msg_id != kmp_i18n_null)
3439  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3440  && (__kmp_affinity_type != affinity_none)))) {
3441 # if KMP_MIC
3442  if (__kmp_affinity_verbose) {
3443  KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3444  }
3445 # else
3446  KMP_WARNING(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3447 # endif
3448  }
3449  else if (__kmp_affinity_verbose) {
3450  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3451  }
3452 
3453  FILE *f = fopen("/proc/cpuinfo", "r");
3454  if (f == NULL) {
3455  msg_id = kmp_i18n_str_CantOpenCpuinfo;
3456  }
3457  else {
3458  file_name = "/proc/cpuinfo";
3459  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3460  fclose(f);
3461  if (depth == 0) {
3462  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3463  KMP_ASSERT(address2os == NULL);
3464  return;
3465  }
3466  }
3467  }
3468 
3469 # endif /* KMP_OS_LINUX */
3470 
3471  if (depth < 0) {
3472  if (msg_id != kmp_i18n_null
3473  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3474  && (__kmp_affinity_type != affinity_none)))) {
3475  if (file_name == NULL) {
3476  KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3477  }
3478  else if (line == 0) {
3479  KMP_WARNING(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3480  }
3481  else {
3482  KMP_WARNING(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3483  }
3484  }
3485 
3486  file_name = "";
3487  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3488  if (depth == 0) {
3489  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3490  KMP_ASSERT(address2os == NULL);
3491  return;
3492  }
3493  KMP_ASSERT(depth > 0);
3494  KMP_ASSERT(address2os != NULL);
3495  }
3496  }
3497 
3498  //
3499  // If the user has specified that a paricular topology discovery method
3500  // is to be used, then we abort if that method fails. The exception is
3501  // group affinity, which might have been implicitly set.
3502  //
3503 
3504 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3505 
3506  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3507  if (__kmp_affinity_verbose) {
3508  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3509  KMP_I18N_STR(Decodingx2APIC));
3510  }
3511 
3512  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3513  if (depth == 0) {
3514  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3515  KMP_ASSERT(address2os == NULL);
3516  return;
3517  }
3518 
3519  if (depth < 0) {
3520  KMP_ASSERT(msg_id != kmp_i18n_null);
3521  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3522  }
3523  }
3524  else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3525  if (__kmp_affinity_verbose) {
3526  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3527  KMP_I18N_STR(DecodingLegacyAPIC));
3528  }
3529 
3530  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3531  if (depth == 0) {
3532  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3533  KMP_ASSERT(address2os == NULL);
3534  return;
3535  }
3536 
3537  if (depth < 0) {
3538  KMP_ASSERT(msg_id != kmp_i18n_null);
3539  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3540  }
3541  }
3542 
3543 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3544 
3545  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3546  const char *filename;
3547  if (__kmp_cpuinfo_file != NULL) {
3548  filename = __kmp_cpuinfo_file;
3549  }
3550  else {
3551  filename = "/proc/cpuinfo";
3552  }
3553 
3554  if (__kmp_affinity_verbose) {
3555  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3556  }
3557 
3558  FILE *f = fopen(filename, "r");
3559  if (f == NULL) {
3560  int code = errno;
3561  if (__kmp_cpuinfo_file != NULL) {
3562  __kmp_msg(
3563  kmp_ms_fatal,
3564  KMP_MSG(CantOpenFileForReading, filename),
3565  KMP_ERR(code),
3566  KMP_HNT(NameComesFrom_CPUINFO_FILE),
3567  __kmp_msg_null
3568  );
3569  }
3570  else {
3571  __kmp_msg(
3572  kmp_ms_fatal,
3573  KMP_MSG(CantOpenFileForReading, filename),
3574  KMP_ERR(code),
3575  __kmp_msg_null
3576  );
3577  }
3578  }
3579  int line = 0;
3580  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3581  fclose(f);
3582  if (depth < 0) {
3583  KMP_ASSERT(msg_id != kmp_i18n_null);
3584  if (line > 0) {
3585  KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3586  }
3587  else {
3588  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3589  }
3590  }
3591  if (__kmp_affinity_type == affinity_none) {
3592  KMP_ASSERT(depth == 0);
3593  KMP_ASSERT(address2os == NULL);
3594  return;
3595  }
3596  }
3597 
3598 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3599 
3600  else if (__kmp_affinity_top_method == affinity_top_method_group) {
3601  if (__kmp_affinity_verbose) {
3602  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3603  }
3604 
3605  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3606  KMP_ASSERT(depth != 0);
3607 
3608  if (depth < 0) {
3609  if ((msg_id != kmp_i18n_null)
3610  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3611  && (__kmp_affinity_type != affinity_none)))) {
3612  KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3613  }
3614 
3615  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3616  if (depth == 0) {
3617  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3618  KMP_ASSERT(address2os == NULL);
3619  return;
3620  }
3621  // should not fail
3622  KMP_ASSERT(depth > 0);
3623  KMP_ASSERT(address2os != NULL);
3624  }
3625  }
3626 
3627 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
3628 
3629  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3630  if (__kmp_affinity_verbose) {
3631  KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3632  }
3633 
3634  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3635  if (depth == 0) {
3636  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3637  KMP_ASSERT(address2os == NULL);
3638  return;
3639  }
3640  // should not fail
3641  KMP_ASSERT(depth > 0);
3642  KMP_ASSERT(address2os != NULL);
3643  }
3644 
3645  if (address2os == NULL) {
3646  if (KMP_AFFINITY_CAPABLE()
3647  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3648  && (__kmp_affinity_type != affinity_none)))) {
3649  KMP_WARNING(ErrorInitializeAffinity);
3650  }
3651  __kmp_affinity_type = affinity_none;
3652  __kmp_affin_mask_size = 0;
3653  return;
3654  }
3655 
3656 # if KMP_MIC
3657  __kmp_apply_thread_places(&address2os, depth);
3658 # endif
3659 
3660  //
3661  // Create the table of masks, indexed by thread Id.
3662  //
3663  unsigned maxIndex;
3664  unsigned numUnique;
3665  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3666  address2os, __kmp_avail_proc);
3667  if (__kmp_affinity_gran_levels == 0) {
3668  KMP_DEBUG_ASSERT(numUnique == __kmp_avail_proc);
3669  }
3670 
3671  //
3672  // Set the childNums vector in all Address objects. This must be done
3673  // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3674  // which takes into account the setting of __kmp_affinity_compact.
3675  //
3676  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3677 
3678  switch (__kmp_affinity_type) {
3679 
3680  case affinity_explicit:
3681  KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3682 # if OMP_40_ENABLED
3683  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3684 # endif
3685  {
3686  __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3687  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3688  maxIndex);
3689  }
3690 # if OMP_40_ENABLED
3691  else {
3692  __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3693  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3694  maxIndex);
3695  }
3696 # endif
3697  if (__kmp_affinity_num_masks == 0) {
3698  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3699  && (__kmp_affinity_type != affinity_none))) {
3700  KMP_WARNING(AffNoValidProcID);
3701  }
3702  __kmp_affinity_type = affinity_none;
3703  return;
3704  }
3705  break;
3706 
3707  //
3708  // The other affinity types rely on sorting the Addresses according
3709  // to some permutation of the machine topology tree. Set
3710  // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3711  // then jump to a common code fragment to do the sort and create
3712  // the array of affinity masks.
3713  //
3714 
3715  case affinity_logical:
3716  __kmp_affinity_compact = 0;
3717  if (__kmp_affinity_offset) {
3718  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3719  % __kmp_avail_proc;
3720  }
3721  goto sortAddresses;
3722 
3723  case affinity_physical:
3724  if (__kmp_nThreadsPerCore > 1) {
3725  __kmp_affinity_compact = 1;
3726  if (__kmp_affinity_compact >= depth) {
3727  __kmp_affinity_compact = 0;
3728  }
3729  } else {
3730  __kmp_affinity_compact = 0;
3731  }
3732  if (__kmp_affinity_offset) {
3733  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3734  % __kmp_avail_proc;
3735  }
3736  goto sortAddresses;
3737 
3738  case affinity_scatter:
3739  if (__kmp_affinity_compact >= depth) {
3740  __kmp_affinity_compact = 0;
3741  }
3742  else {
3743  __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3744  }
3745  goto sortAddresses;
3746 
3747  case affinity_compact:
3748  if (__kmp_affinity_compact >= depth) {
3749  __kmp_affinity_compact = depth - 1;
3750  }
3751  goto sortAddresses;
3752 
3753 # if KMP_MIC
3754  case affinity_balanced:
3755  // Balanced works only for the case of a single package and uniform topology
3756  if( nPackages > 1 ) {
3757  if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3758  KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3759  }
3760  __kmp_affinity_type = affinity_none;
3761  return;
3762  } else if( __kmp_affinity_uniform_topology() ) {
3763  break;
3764  } else { // Non-uniform topology
3765 
3766  // Save the depth for further usage
3767  __kmp_aff_depth = depth;
3768 
3769  // Number of hyper threads per core in HT machine
3770  int nth_per_core = __kmp_nThreadsPerCore;
3771 
3772  int core_level;
3773  if( nth_per_core > 1 ) {
3774  core_level = depth - 2;
3775  } else {
3776  core_level = depth - 1;
3777  }
3778  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3779  int nproc = nth_per_core * ncores;
3780 
3781  procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3782  for( int i = 0; i < nproc; i++ ) {
3783  procarr[ i ] = -1;
3784  }
3785 
3786  int k = 0;
3787  for( int i = 0; i < __kmp_avail_proc; i++ ) {
3788  int proc = address2os[ i ].second;
3789  // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3790  // If there is only one thread per core then depth == 2: level 0 - package,
3791  // level 1 - core.
3792  int level = depth - 1;
3793 
3794  // __kmp_nth_per_core == 1
3795  int thread = 0;
3796  int core = address2os[ i ].first.labels[ level ];
3797  // If the thread level exists, that is we have more than one thread context per core
3798  if( nth_per_core > 1 ) {
3799  thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3800  core = address2os[ i ].first.labels[ level - 1 ];
3801  }
3802  k = core * nth_per_core + thread;
3803  procarr[ k ] = proc;
3804  k++;
3805  }
3806 
3807  break;
3808  }
3809 # endif
3810 
3811  sortAddresses:
3812  //
3813  // Allocate the gtid->affinity mask table.
3814  //
3815  if (__kmp_affinity_dups) {
3816  __kmp_affinity_num_masks = __kmp_avail_proc;
3817  }
3818  else {
3819  __kmp_affinity_num_masks = numUnique;
3820  }
3821 
3822 # if OMP_40_ENABLED
3823  if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3824  && ( __kmp_affinity_num_places > 0 )
3825  && ( __kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3826  __kmp_affinity_num_masks = __kmp_affinity_num_places;
3827  }
3828 # endif
3829 
3830  __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3831  __kmp_affinity_num_masks * __kmp_affin_mask_size);
3832 
3833  //
3834  // Sort the address2os table according to the current setting of
3835  // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3836  //
3837  qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3838  __kmp_affinity_cmp_Address_child_num);
3839  {
3840  unsigned i;
3841  unsigned j;
3842  for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3843  if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3844  continue;
3845  }
3846  unsigned osId = address2os[i].second;
3847  kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3848  kmp_affin_mask_t *dest
3849  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3850  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3851  KMP_CPU_COPY(dest, src);
3852  if (++j >= __kmp_affinity_num_masks) {
3853  break;
3854  }
3855  }
3856  KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3857  }
3858  break;
3859 
3860  default:
3861  KMP_ASSERT2(0, "Unexpected affinity setting");
3862  }
3863 
3864  __kmp_free(osId2Mask);
3865 }
3866 
3867 
3868 void
3869 __kmp_affinity_initialize(void)
3870 {
3871  //
3872  // Much of the code above was written assumming that if a machine was not
3873  // affinity capable, then __kmp_affinity_type == affinity_none. We now
3874  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3875  //
3876  // There are too many checks for __kmp_affinity_type == affinity_none
3877  // in this code. Instead of trying to change them all, check if
3878  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
3879  // affinity_none, call the real initialization routine, then restore
3880  // __kmp_affinity_type to affinity_disabled.
3881  //
3882  int disabled = (__kmp_affinity_type == affinity_disabled);
3883  if (! KMP_AFFINITY_CAPABLE()) {
3884  KMP_ASSERT(disabled);
3885  }
3886  if (disabled) {
3887  __kmp_affinity_type = affinity_none;
3888  }
3889  __kmp_aux_affinity_initialize();
3890  if (disabled) {
3891  __kmp_affinity_type = affinity_disabled;
3892  }
3893 }
3894 
3895 
3896 void
3897 __kmp_affinity_uninitialize(void)
3898 {
3899  if (__kmp_affinity_masks != NULL) {
3900  __kmp_free(__kmp_affinity_masks);
3901  __kmp_affinity_masks = NULL;
3902  }
3903  if (fullMask != NULL) {
3904  KMP_CPU_FREE(fullMask);
3905  fullMask = NULL;
3906  }
3907  __kmp_affinity_num_masks = 0;
3908 # if OMP_40_ENABLED
3909  __kmp_affinity_num_places = 0;
3910 # endif
3911  if (__kmp_affinity_proclist != NULL) {
3912  __kmp_free(__kmp_affinity_proclist);
3913  __kmp_affinity_proclist = NULL;
3914  }
3915  if( address2os != NULL ) {
3916  __kmp_free( address2os );
3917  address2os = NULL;
3918  }
3919  if( procarr != NULL ) {
3920  __kmp_free( procarr );
3921  procarr = NULL;
3922  }
3923 }
3924 
3925 
3926 void
3927 __kmp_affinity_set_init_mask(int gtid, int isa_root)
3928 {
3929  if (! KMP_AFFINITY_CAPABLE()) {
3930  return;
3931  }
3932 
3933  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3934  if (th->th.th_affin_mask == NULL) {
3935  KMP_CPU_ALLOC(th->th.th_affin_mask);
3936  }
3937  else {
3938  KMP_CPU_ZERO(th->th.th_affin_mask);
3939  }
3940 
3941  //
3942  // Copy the thread mask to the kmp_info_t strucuture.
3943  // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
3944  // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
3945  // is set, then the full mask is the same as the mask of the initialization
3946  // thread.
3947  //
3948  kmp_affin_mask_t *mask;
3949  int i;
3950 
3951 # if OMP_40_ENABLED
3952  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3953 # endif
3954  {
3955  if ((__kmp_affinity_type == affinity_none)
3956 # if KMP_MIC
3957  || (__kmp_affinity_type == affinity_balanced)
3958 # endif
3959  ) {
3960 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3961  if (__kmp_num_proc_groups > 1) {
3962  return;
3963  }
3964 # endif
3965  KMP_ASSERT(fullMask != NULL);
3966  i = -1;
3967  mask = fullMask;
3968  }
3969  else {
3970  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
3971  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3972  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3973  }
3974  }
3975 # if OMP_40_ENABLED
3976  else {
3977  if ((! isa_root)
3978  || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
3979 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
3980  if (__kmp_num_proc_groups > 1) {
3981  return;
3982  }
3983 # endif
3984  KMP_ASSERT(fullMask != NULL);
3985  i = KMP_PLACE_ALL;
3986  mask = fullMask;
3987  }
3988  else {
3989  //
3990  // int i = some hash function or just a counter that doesn't
3991  // always start at 0. Use gtid for now.
3992  //
3993  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
3994  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
3995  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
3996  }
3997  }
3998 # endif
3999 
4000 # if OMP_40_ENABLED
4001  th->th.th_current_place = i;
4002  if (isa_root) {
4003  th->th.th_new_place = i;
4004  th->th.th_first_place = 0;
4005  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4006  }
4007 
4008  if (i == KMP_PLACE_ALL) {
4009  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4010  gtid));
4011  }
4012  else {
4013  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4014  gtid, i));
4015  }
4016 # else
4017  if (i == -1) {
4018  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4019  gtid));
4020  }
4021  else {
4022  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4023  gtid, i));
4024  }
4025 # endif /* OMP_40_ENABLED */
4026 
4027  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4028 
4029  if (__kmp_affinity_verbose) {
4030  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4031  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4032  th->th.th_affin_mask);
4033  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", gtid, buf);
4034  }
4035 
4036 # if KMP_OS_WINDOWS
4037  //
4038  // On Windows* OS, the process affinity mask might have changed.
4039  // If the user didn't request affinity and this call fails,
4040  // just continue silently. See CQ171393.
4041  //
4042  if ( __kmp_affinity_type == affinity_none ) {
4043  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4044  }
4045  else
4046 # endif
4047  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4048 }
4049 
4050 
4051 # if OMP_40_ENABLED
4052 
4053 void
4054 __kmp_affinity_set_place(int gtid)
4055 {
4056  int retval;
4057 
4058  if (! KMP_AFFINITY_CAPABLE()) {
4059  return;
4060  }
4061 
4062  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4063 
4064  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4065  gtid, th->th.th_new_place, th->th.th_current_place));
4066 
4067  //
4068  // Check that the new place is withing this thread's partition.
4069  //
4070  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4071  KMP_DEBUG_ASSERT(th->th.th_new_place >= 0);
4072  KMP_DEBUG_ASSERT(th->th.th_new_place <= __kmp_affinity_num_masks);
4073  if (th->th.th_first_place <= th->th.th_last_place) {
4074  KMP_DEBUG_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4075  && (th->th.th_new_place <= th->th.th_last_place));
4076  }
4077  else {
4078  KMP_DEBUG_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4079  || (th->th.th_new_place >= th->th.th_last_place));
4080  }
4081 
4082  //
4083  // Copy the thread mask to the kmp_info_t strucuture,
4084  // and set this thread's affinity.
4085  //
4086  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4087  th->th.th_new_place);
4088  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4089  th->th.th_current_place = th->th.th_new_place;
4090 
4091  if (__kmp_affinity_verbose) {
4092  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4093  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4094  th->th.th_affin_mask);
4095  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", gtid, buf);
4096  }
4097  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4098 }
4099 
4100 # endif /* OMP_40_ENABLED */
4101 
4102 
4103 int
4104 __kmp_aux_set_affinity(void **mask)
4105 {
4106  int gtid;
4107  kmp_info_t *th;
4108  int retval;
4109 
4110  if (! KMP_AFFINITY_CAPABLE()) {
4111  return -1;
4112  }
4113 
4114  gtid = __kmp_entry_gtid();
4115  KA_TRACE(1000, ;{
4116  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4117  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4118  (kmp_affin_mask_t *)(*mask));
4119  __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4120  gtid, buf);
4121  });
4122 
4123  if (__kmp_env_consistency_check) {
4124  if ((mask == NULL) || (*mask == NULL)) {
4125  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4126  }
4127  else {
4128  int proc;
4129  int num_procs = 0;
4130 
4131  for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4132  if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4133  continue;
4134  }
4135  num_procs++;
4136  if (! KMP_CPU_ISSET(proc, fullMask)) {
4137  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4138  break;
4139  }
4140  }
4141  if (num_procs == 0) {
4142  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4143  }
4144 
4145 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64
4146  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4147  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4148  }
4149 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */
4150 
4151  }
4152  }
4153 
4154  th = __kmp_threads[gtid];
4155  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4156  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4157  if (retval == 0) {
4158  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4159  }
4160 
4161 # if OMP_40_ENABLED
4162  th->th.th_current_place = KMP_PLACE_UNDEFINED;
4163  th->th.th_new_place = KMP_PLACE_UNDEFINED;
4164  th->th.th_first_place = 0;
4165  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4166 # endif
4167 
4168  return retval;
4169 }
4170 
4171 
4172 int
4173 __kmp_aux_get_affinity(void **mask)
4174 {
4175  int gtid;
4176  int retval;
4177  kmp_info_t *th;
4178 
4179  if (! KMP_AFFINITY_CAPABLE()) {
4180  return -1;
4181  }
4182 
4183  gtid = __kmp_entry_gtid();
4184  th = __kmp_threads[gtid];
4185  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4186 
4187  KA_TRACE(1000, ;{
4188  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4189  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4190  th->th.th_affin_mask);
4191  __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4192  });
4193 
4194  if (__kmp_env_consistency_check) {
4195  if ((mask == NULL) || (*mask == NULL)) {
4196  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4197  }
4198  }
4199 
4200 # if !KMP_OS_WINDOWS
4201 
4202  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4203  KA_TRACE(1000, ;{
4204  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4205  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4206  (kmp_affin_mask_t *)(*mask));
4207  __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4208  });
4209  return retval;
4210 
4211 # else
4212 
4213  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4214  return 0;
4215 
4216 # endif /* KMP_OS_WINDOWS */
4217 
4218 }
4219 
4220 
4221 int
4222 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4223 {
4224  int retval;
4225 
4226  if (! KMP_AFFINITY_CAPABLE()) {
4227  return -1;
4228  }
4229 
4230  KA_TRACE(1000, ;{
4231  int gtid = __kmp_entry_gtid();
4232  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4233  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4234  (kmp_affin_mask_t *)(*mask));
4235  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4236  proc, gtid, buf);
4237  });
4238 
4239  if (__kmp_env_consistency_check) {
4240  if ((mask == NULL) || (*mask == NULL)) {
4241  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4242  }
4243  }
4244 
4245  if ((proc < 0) || (proc >= KMP_CPU_SETSIZE)) {
4246  return -1;
4247  }
4248  if (! KMP_CPU_ISSET(proc, fullMask)) {
4249  return -2;
4250  }
4251 
4252  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4253  return 0;
4254 }
4255 
4256 
4257 int
4258 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4259 {
4260  int retval;
4261 
4262  if (! KMP_AFFINITY_CAPABLE()) {
4263  return -1;
4264  }
4265 
4266  KA_TRACE(1000, ;{
4267  int gtid = __kmp_entry_gtid();
4268  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4269  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4270  (kmp_affin_mask_t *)(*mask));
4271  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4272  proc, gtid, buf);
4273  });
4274 
4275  if (__kmp_env_consistency_check) {
4276  if ((mask == NULL) || (*mask == NULL)) {
4277  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4278  }
4279  }
4280 
4281  if ((proc < 0) || (proc >= KMP_CPU_SETSIZE)) {
4282  return -1;
4283  }
4284  if (! KMP_CPU_ISSET(proc, fullMask)) {
4285  return -2;
4286  }
4287 
4288  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4289  return 0;
4290 }
4291 
4292 
4293 int
4294 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4295 {
4296  int retval;
4297 
4298  if (! KMP_AFFINITY_CAPABLE()) {
4299  return -1;
4300  }
4301 
4302  KA_TRACE(1000, ;{
4303  int gtid = __kmp_entry_gtid();
4304  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4305  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4306  (kmp_affin_mask_t *)(*mask));
4307  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4308  proc, gtid, buf);
4309  });
4310 
4311  if (__kmp_env_consistency_check) {
4312  if ((mask == NULL) || (*mask == NULL)) {
4313  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4314  }
4315  }
4316 
4317  if ((proc < 0) || (proc >= KMP_CPU_SETSIZE)) {
4318  return 0;
4319  }
4320  if (! KMP_CPU_ISSET(proc, fullMask)) {
4321  return 0;
4322  }
4323 
4324  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4325 }
4326 
4327 # if KMP_MIC
4328 
4329 // Dynamic affinity settings - Affinity balanced
4330 void __kmp_balanced_affinity( int tid, int nthreads )
4331 {
4332  if( __kmp_affinity_uniform_topology() ) {
4333  int coreID;
4334  int threadID;
4335  // Number of hyper threads per core in HT machine
4336  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4337  // Number of cores
4338  int ncores = __kmp_ncores;
4339  // How many threads will be bound to each core
4340  int chunk = nthreads / ncores;
4341  // How many cores will have an additional thread bound to it - "big cores"
4342  int big_cores = nthreads % ncores;
4343  // Number of threads on the big cores
4344  int big_nth = ( chunk + 1 ) * big_cores;
4345  if( tid < big_nth ) {
4346  coreID = tid / (chunk + 1 );
4347  threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4348  } else { //tid >= big_nth
4349  coreID = ( tid - big_cores ) / chunk;
4350  threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4351  }
4352 
4353  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4354  "Illegal set affinity operation when not capable");
4355 
4356  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4357  KMP_CPU_ZERO(mask);
4358 
4359  // Granularity == thread
4360  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4361  int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4362  KMP_CPU_SET( osID, mask);
4363  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4364  for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4365  int osID;
4366  osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4367  KMP_CPU_SET( osID, mask);
4368  }
4369  }
4370  if (__kmp_affinity_verbose) {
4371  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4372  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4373  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
4374  }
4375  __kmp_set_system_affinity( mask, TRUE );
4376  } else { // Non-uniform topology
4377 
4378  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
4379  KMP_CPU_ZERO(mask);
4380 
4381  // Number of hyper threads per core in HT machine
4382  int nth_per_core = __kmp_nThreadsPerCore;
4383  int core_level;
4384  if( nth_per_core > 1 ) {
4385  core_level = __kmp_aff_depth - 2;
4386  } else {
4387  core_level = __kmp_aff_depth - 1;
4388  }
4389 
4390  // Number of cores - maximum value; it does not count trail cores with 0 processors
4391  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4392 
4393  // For performance gain consider the special case nthreads == __kmp_avail_proc
4394  if( nthreads == __kmp_avail_proc ) {
4395  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4396  int osID = address2os[ tid ].second;
4397  KMP_CPU_SET( osID, mask);
4398  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4399  int coreID = address2os[ tid ].first.labels[ core_level ];
4400  // We'll count found osIDs for the current core; they can be not more than nth_per_core;
4401  // since the address2os is sortied we can break when cnt==nth_per_core
4402  int cnt = 0;
4403  for( int i = 0; i < __kmp_avail_proc; i++ ) {
4404  int osID = address2os[ i ].second;
4405  int core = address2os[ i ].first.labels[ core_level ];
4406  if( core == coreID ) {
4407  KMP_CPU_SET( osID, mask);
4408  cnt++;
4409  if( cnt == nth_per_core ) {
4410  break;
4411  }
4412  }
4413  }
4414  }
4415  } else if( nthreads <= __kmp_ncores ) {
4416 
4417  int core = 0;
4418  for( int i = 0; i < ncores; i++ ) {
4419  // Check if this core from procarr[] is in the mask
4420  int in_mask = 0;
4421  for( int j = 0; j < nth_per_core; j++ ) {
4422  if( procarr[ i * nth_per_core + j ] != - 1 ) {
4423  in_mask = 1;
4424  break;
4425  }
4426  }
4427  if( in_mask ) {
4428  if( tid == core ) {
4429  for( int j = 0; j < nth_per_core; j++ ) {
4430  int osID = procarr[ i * nth_per_core + j ];
4431  if( osID != -1 ) {
4432  KMP_CPU_SET( osID, mask );
4433  // For granularity=thread it is enough to set the first available osID for this core
4434  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4435  break;
4436  }
4437  }
4438  }
4439  break;
4440  } else {
4441  core++;
4442  }
4443  }
4444  }
4445 
4446  } else { // nthreads > __kmp_ncores
4447 
4448  // Array to save the number of processors at each core
4449  int nproc_at_core[ ncores ];
4450  // Array to save the number of cores with "x" available processors;
4451  int ncores_with_x_procs[ nth_per_core + 1 ];
4452  // Array to save the number of cores with # procs from x to nth_per_core
4453  int ncores_with_x_to_max_procs[ nth_per_core + 1 ];
4454 
4455  for( int i = 0; i <= nth_per_core; i++ ) {
4456  ncores_with_x_procs[ i ] = 0;
4457  ncores_with_x_to_max_procs[ i ] = 0;
4458  }
4459 
4460  for( int i = 0; i < ncores; i++ ) {
4461  int cnt = 0;
4462  for( int j = 0; j < nth_per_core; j++ ) {
4463  if( procarr[ i * nth_per_core + j ] != -1 ) {
4464  cnt++;
4465  }
4466  }
4467  nproc_at_core[ i ] = cnt;
4468  ncores_with_x_procs[ cnt ]++;
4469  }
4470 
4471  for( int i = 0; i <= nth_per_core; i++ ) {
4472  for( int j = i; j <= nth_per_core; j++ ) {
4473  ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4474  }
4475  }
4476 
4477  // Max number of processors
4478  int nproc = nth_per_core * ncores;
4479  // An array to keep number of threads per each context
4480  int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4481  for( int i = 0; i < nproc; i++ ) {
4482  newarr[ i ] = 0;
4483  }
4484 
4485  int nth = nthreads;
4486  int flag = 0;
4487  while( nth > 0 ) {
4488  for( int j = 1; j <= nth_per_core; j++ ) {
4489  int cnt = ncores_with_x_to_max_procs[ j ];
4490  for( int i = 0; i < ncores; i++ ) {
4491  // Skip the core with 0 processors
4492  if( nproc_at_core[ i ] == 0 ) {
4493  continue;
4494  }
4495  for( int k = 0; k < nth_per_core; k++ ) {
4496  if( procarr[ i * nth_per_core + k ] != -1 ) {
4497  if( newarr[ i * nth_per_core + k ] == 0 ) {
4498  newarr[ i * nth_per_core + k ] = 1;
4499  cnt--;
4500  nth--;
4501  break;
4502  } else {
4503  if( flag != 0 ) {
4504  newarr[ i * nth_per_core + k ] ++;
4505  cnt--;
4506  nth--;
4507  break;
4508  }
4509  }
4510  }
4511  }
4512  if( cnt == 0 || nth == 0 ) {
4513  break;
4514  }
4515  }
4516  if( nth == 0 ) {
4517  break;
4518  }
4519  }
4520  flag = 1;
4521  }
4522  int sum = 0;
4523  for( int i = 0; i < nproc; i++ ) {
4524  sum += newarr[ i ];
4525  if( sum > tid ) {
4526  // Granularity == thread
4527  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4528  int osID = procarr[ i ];
4529  KMP_CPU_SET( osID, mask);
4530  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4531  int coreID = i / nth_per_core;
4532  for( int ii = 0; ii < nth_per_core; ii++ ) {
4533  int osID = procarr[ coreID * nth_per_core + ii ];
4534  if( osID != -1 ) {
4535  KMP_CPU_SET( osID, mask);
4536  }
4537  }
4538  }
4539  break;
4540  }
4541  }
4542  __kmp_free( newarr );
4543  }
4544 
4545  if (__kmp_affinity_verbose) {
4546  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4547  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4548  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
4549  }
4550  __kmp_set_system_affinity( mask, TRUE );
4551  }
4552 }
4553 
4554 # endif /* KMP_MIC */
4555 
4556 #elif KMP_OS_DARWIN
4557  // affinity not supported
4558 #else
4559  #error "Unknown or unsupported OS"
4560 #endif // KMP_OS_WINDOWS || KMP_OS_LINUX
4561