contrib/openmp/runtime/src/kmp_affinity.h

   1 /*
   2  * kmp_affinity.h -- header for affinity management
   3  */
   4
   5 //===----------------------------------------------------------------------===//
   6 //
   7 //                     The LLVM Compiler Infrastructure
   8 //
   9 // This file is dual licensed under the MIT and the University of Illinois Open
  10 // Source Licenses. See LICENSE.txt for details.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #ifndef KMP_AFFINITY_H
  15 #define KMP_AFFINITY_H
  16
  17 #include "kmp.h"
  18 #include "kmp_os.h"
  19
  20 #if KMP_AFFINITY_SUPPORTED
  21 #if KMP_USE_HWLOC
  22 class KMPHwlocAffinity : public KMPAffinity {
  23 public:
  24   class Mask : public KMPAffinity::Mask {
  25     hwloc_cpuset_t mask;
  26
  27   public:
  28     Mask() {
  29       mask = hwloc_bitmap_alloc();
  30       this->zero();
  31     }
  32     ~Mask() { hwloc_bitmap_free(mask); }
  33     void set(int i) override { hwloc_bitmap_set(mask, i); }
  34     bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
  35     void clear(int i) override { hwloc_bitmap_clr(mask, i); }
  36     void zero() override { hwloc_bitmap_zero(mask); }
  37     void copy(const KMPAffinity::Mask *src) override {
  38       const Mask *convert = static_cast<const Mask *>(src);
  39       hwloc_bitmap_copy(mask, convert->mask);
  40     }
  41     void bitwise_and(const KMPAffinity::Mask *rhs) override {
  42       const Mask *convert = static_cast<const Mask *>(rhs);
  43       hwloc_bitmap_and(mask, mask, convert->mask);
  44     }
  45     void bitwise_or(const KMPAffinity::Mask *rhs) override {
  46       const Mask *convert = static_cast<const Mask *>(rhs);
  47       hwloc_bitmap_or(mask, mask, convert->mask);
  48     }
  49     void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
  50     int begin() const override { return hwloc_bitmap_first(mask); }
  51     int end() const override { return -1; }
  52     int next(int previous) const override {
  53       return hwloc_bitmap_next(mask, previous);
  54     }
  55     int get_system_affinity(bool abort_on_error) override {
  56       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
  57                   "Illegal get affinity operation when not capable");
  58       int retval =
  59           hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
  60       if (retval >= 0) {
  61         return 0;
  62       }
  63       int error = errno;
  64       if (abort_on_error) {
  65         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
  66       }
  67       return error;
  68     }
  69     int set_system_affinity(bool abort_on_error) const override {
  70       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
  71                   "Illegal get affinity operation when not capable");
  72       int retval =
  73           hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
  74       if (retval >= 0) {
  75         return 0;
  76       }
  77       int error = errno;
  78       if (abort_on_error) {
  79         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
  80       }
  81       return error;
  82     }
  83     int get_proc_group() const override {
  84       int group = -1;
  85 #if KMP_OS_WINDOWS
  86       if (__kmp_num_proc_groups == 1) {
  87         return 1;
  88       }
  89       for (int i = 0; i < __kmp_num_proc_groups; i++) {
  90         // On windows, the long type is always 32 bits
  91         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
  92         unsigned long second_32_bits =
  93             hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
  94         if (first_32_bits == 0 && second_32_bits == 0) {
  95           continue;
  96         }
  97         if (group >= 0) {
  98           return -1;
  99         }
 100         group = i;
 101       }
 102 #endif /* KMP_OS_WINDOWS */
 103       return group;
 104     }
 105   };
 106   void determine_capable(const char *var) override {
 107     const hwloc_topology_support *topology_support;
 108     if (__kmp_hwloc_topology == NULL) {
 109       if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
 110         __kmp_hwloc_error = TRUE;
 111         if (__kmp_affinity_verbose)
 112           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
 113       }
 114       if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
 115         __kmp_hwloc_error = TRUE;
 116         if (__kmp_affinity_verbose)
 117           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
 118       }
 119     }
 120     topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
 121     // Is the system capable of setting/getting this thread's affinity?
 122     // Also, is topology discovery possible? (pu indicates ability to discover
 123     // processing units). And finally, were there no errors when calling any
 124     // hwloc_* API functions?
 125     if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
 126         topology_support->cpubind->get_thisthread_cpubind &&
 127         topology_support->discovery->pu && !__kmp_hwloc_error) {
 128       // enables affinity according to KMP_AFFINITY_CAPABLE() macro
 129       KMP_AFFINITY_ENABLE(TRUE);
 130     } else {
 131       // indicate that hwloc didn't work and disable affinity
 132       __kmp_hwloc_error = TRUE;
 133       KMP_AFFINITY_DISABLE();
 134     }
 135   }
 136   void bind_thread(int which) override {
 137     KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
 138                 "Illegal set affinity operation when not capable");
 139     KMPAffinity::Mask *mask;
 140     KMP_CPU_ALLOC_ON_STACK(mask);
 141     KMP_CPU_ZERO(mask);
 142     KMP_CPU_SET(which, mask);
 143     __kmp_set_system_affinity(mask, TRUE);
 144     KMP_CPU_FREE_FROM_STACK(mask);
 145   }
 146   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
 147   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
 148   KMPAffinity::Mask *allocate_mask_array(int num) override {
 149     return new Mask[num];
 150   }
 151   void deallocate_mask_array(KMPAffinity::Mask *array) override {
 152     Mask *hwloc_array = static_cast<Mask *>(array);
 153     delete[] hwloc_array;
 154   }
 155   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
 156                                       int index) override {
 157     Mask *hwloc_array = static_cast<Mask *>(array);
 158     return &(hwloc_array[index]);
 159   }
 160   api_type get_api_type() const override { return HWLOC; }
 161 };
 162 #endif /* KMP_USE_HWLOC */
 163
 164 #if KMP_OS_LINUX
 165 /* On some of the older OS's that we build on, these constants aren't present
 166    in <asm/unistd.h> #included from <sys.syscall.h>. They must be the same on
 167    all systems of the same arch where they are defined, and they cannot change.
 168    stone forever. */
 169 #include <sys/syscall.h>
 170 #if KMP_ARCH_X86 || KMP_ARCH_ARM
 171 #ifndef __NR_sched_setaffinity
 172 #define __NR_sched_setaffinity 241
 173 #elif __NR_sched_setaffinity != 241
 174 #error Wrong code for setaffinity system call.
 175 #endif /* __NR_sched_setaffinity */
 176 #ifndef __NR_sched_getaffinity
 177 #define __NR_sched_getaffinity 242
 178 #elif __NR_sched_getaffinity != 242
 179 #error Wrong code for getaffinity system call.
 180 #endif /* __NR_sched_getaffinity */
 181 #elif KMP_ARCH_AARCH64
 182 #ifndef __NR_sched_setaffinity
 183 #define __NR_sched_setaffinity 122
 184 #elif __NR_sched_setaffinity != 122
 185 #error Wrong code for setaffinity system call.
 186 #endif /* __NR_sched_setaffinity */
 187 #ifndef __NR_sched_getaffinity
 188 #define __NR_sched_getaffinity 123
 189 #elif __NR_sched_getaffinity != 123
 190 #error Wrong code for getaffinity system call.
 191 #endif /* __NR_sched_getaffinity */
 192 #elif KMP_ARCH_X86_64
 193 #ifndef __NR_sched_setaffinity
 194 #define __NR_sched_setaffinity 203
 195 #elif __NR_sched_setaffinity != 203
 196 #error Wrong code for setaffinity system call.
 197 #endif /* __NR_sched_setaffinity */
 198 #ifndef __NR_sched_getaffinity
 199 #define __NR_sched_getaffinity 204
 200 #elif __NR_sched_getaffinity != 204
 201 #error Wrong code for getaffinity system call.
 202 #endif /* __NR_sched_getaffinity */
 203 #elif KMP_ARCH_PPC64
 204 #ifndef __NR_sched_setaffinity
 205 #define __NR_sched_setaffinity 222
 206 #elif __NR_sched_setaffinity != 222
 207 #error Wrong code for setaffinity system call.
 208 #endif /* __NR_sched_setaffinity */
 209 #ifndef __NR_sched_getaffinity
 210 #define __NR_sched_getaffinity 223
 211 #elif __NR_sched_getaffinity != 223
 212 #error Wrong code for getaffinity system call.
 213 #endif /* __NR_sched_getaffinity */
 214 #elif KMP_ARCH_MIPS
 215 #ifndef __NR_sched_setaffinity
 216 #define __NR_sched_setaffinity 4239
 217 #elif __NR_sched_setaffinity != 4239
 218 #error Wrong code for setaffinity system call.
 219 #endif /* __NR_sched_setaffinity */
 220 #ifndef __NR_sched_getaffinity
 221 #define __NR_sched_getaffinity 4240
 222 #elif __NR_sched_getaffinity != 4240
 223 #error Wrong code for getaffinity system call.
 224 #endif /* __NR_sched_getaffinity */
 225 #elif KMP_ARCH_MIPS64
 226 #ifndef __NR_sched_setaffinity
 227 #define __NR_sched_setaffinity 5195
 228 #elif __NR_sched_setaffinity != 5195
 229 #error Wrong code for setaffinity system call.
 230 #endif /* __NR_sched_setaffinity */
 231 #ifndef __NR_sched_getaffinity
 232 #define __NR_sched_getaffinity 5196
 233 #elif __NR_sched_getaffinity != 5196
 234 #error Wrong code for getaffinity system call.
 235 #endif /* __NR_sched_getaffinity */
 236 #error Unknown or unsupported architecture
 237 #endif /* KMP_ARCH_* */
 238 class KMPNativeAffinity : public KMPAffinity {
 239   class Mask : public KMPAffinity::Mask {
 240     typedef unsigned char mask_t;
 241     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
 242
 243   public:
 244     mask_t *mask;
 245     Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
 246     ~Mask() {
 247       if (mask)
 248         __kmp_free(mask);
 249     }
 250     void set(int i) override {
 251       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
 252     }
 253     bool is_set(int i) const override {
 254       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
 255     }
 256     void clear(int i) override {
 257       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
 258     }
 259     void zero() override {
 260       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
 261         mask[i] = 0;
 262     }
 263     void copy(const KMPAffinity::Mask *src) override {
 264       const Mask *convert = static_cast<const Mask *>(src);
 265       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
 266         mask[i] = convert->mask[i];
 267     }
 268     void bitwise_and(const KMPAffinity::Mask *rhs) override {
 269       const Mask *convert = static_cast<const Mask *>(rhs);
 270       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
 271         mask[i] &= convert->mask[i];
 272     }
 273     void bitwise_or(const KMPAffinity::Mask *rhs) override {
 274       const Mask *convert = static_cast<const Mask *>(rhs);
 275       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
 276         mask[i] |= convert->mask[i];
 277     }
 278     void bitwise_not() override {
 279       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
 280         mask[i] = ~(mask[i]);
 281     }
 282     int begin() const override {
 283       int retval = 0;
 284       while (retval < end() && !is_set(retval))
 285         ++retval;
 286       return retval;
 287     }
 288     int end() const override { return __kmp_affin_mask_size * BITS_PER_MASK_T; }
 289     int next(int previous) const override {
 290       int retval = previous + 1;
 291       while (retval < end() && !is_set(retval))
 292         ++retval;
 293       return retval;
 294     }
 295     int get_system_affinity(bool abort_on_error) override {
 296       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
 297                   "Illegal get affinity operation when not capable");
 298       int retval =
 299           syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
 300       if (retval >= 0) {
 301         return 0;
 302       }
 303       int error = errno;
 304       if (abort_on_error) {
 305         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
 306       }
 307       return error;
 308     }
 309     int set_system_affinity(bool abort_on_error) const override {
 310       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
 311                   "Illegal get affinity operation when not capable");
 312       int retval =
 313           syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
 314       if (retval >= 0) {
 315         return 0;
 316       }
 317       int error = errno;
 318       if (abort_on_error) {
 319         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
 320       }
 321       return error;
 322     }
 323   };
 324   void determine_capable(const char *env_var) override {
 325     __kmp_affinity_determine_capable(env_var);
 326   }
 327   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
 328   KMPAffinity::Mask *allocate_mask() override {
 329     KMPNativeAffinity::Mask *retval = new Mask();
 330     return retval;
 331   }
 332   void deallocate_mask(KMPAffinity::Mask *m) override {
 333     KMPNativeAffinity::Mask *native_mask =
 334         static_cast<KMPNativeAffinity::Mask *>(m);
 335     delete native_mask;
 336   }
 337   KMPAffinity::Mask *allocate_mask_array(int num) override {
 338     return new Mask[num];
 339   }
 340   void deallocate_mask_array(KMPAffinity::Mask *array) override {
 341     Mask *linux_array = static_cast<Mask *>(array);
 342     delete[] linux_array;
 343   }
 344   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
 345                                       int index) override {
 346     Mask *linux_array = static_cast<Mask *>(array);
 347     return &(linux_array[index]);
 348   }
 349   api_type get_api_type() const override { return NATIVE_OS; }
 350 };
 351 #endif /* KMP_OS_LINUX */
 352
 353 #if KMP_OS_WINDOWS
 354 class KMPNativeAffinity : public KMPAffinity {
 355   class Mask : public KMPAffinity::Mask {
 356     typedef ULONG_PTR mask_t;
 357     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
 358     mask_t *mask;
 359
 360   public:
 361     Mask() {
 362       mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
 363     }
 364     ~Mask() {
 365       if (mask)
 366         __kmp_free(mask);
 367     }
 368     void set(int i) override {
 369       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
 370     }
 371     bool is_set(int i) const override {
 372       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
 373     }
 374     void clear(int i) override {
 375       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
 376     }
 377     void zero() override {
 378       for (int i = 0; i < __kmp_num_proc_groups; ++i)
 379         mask[i] = 0;
 380     }
 381     void copy(const KMPAffinity::Mask *src) override {
 382       const Mask *convert = static_cast<const Mask *>(src);
 383       for (int i = 0; i < __kmp_num_proc_groups; ++i)
 384         mask[i] = convert->mask[i];
 385     }
 386     void bitwise_and(const KMPAffinity::Mask *rhs) override {
 387       const Mask *convert = static_cast<const Mask *>(rhs);
 388       for (int i = 0; i < __kmp_num_proc_groups; ++i)
 389         mask[i] &= convert->mask[i];
 390     }
 391     void bitwise_or(const KMPAffinity::Mask *rhs) override {
 392       const Mask *convert = static_cast<const Mask *>(rhs);
 393       for (int i = 0; i < __kmp_num_proc_groups; ++i)
 394         mask[i] |= convert->mask[i];
 395     }
 396     void bitwise_not() override {
 397       for (int i = 0; i < __kmp_num_proc_groups; ++i)
 398         mask[i] = ~(mask[i]);
 399     }
 400     int begin() const override {
 401       int retval = 0;
 402       while (retval < end() && !is_set(retval))
 403         ++retval;
 404       return retval;
 405     }
 406     int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
 407     int next(int previous) const override {
 408       int retval = previous + 1;
 409       while (retval < end() && !is_set(retval))
 410         ++retval;
 411       return retval;
 412     }
 413     int set_system_affinity(bool abort_on_error) const override {
 414       if (__kmp_num_proc_groups > 1) {
 415         // Check for a valid mask.
 416         GROUP_AFFINITY ga;
 417         int group = get_proc_group();
 418         if (group < 0) {
 419           if (abort_on_error) {
 420             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
 421           }
 422           return -1;
 423         }
 424         // Transform the bit vector into a GROUP_AFFINITY struct
 425         // and make the system call to set affinity.
 426         ga.Group = group;
 427         ga.Mask = mask[group];
 428         ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
 429
 430         KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
 431         if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
 432           DWORD error = GetLastError();
 433           if (abort_on_error) {
 434             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
 435                         __kmp_msg_null);
 436           }
 437           return error;
 438         }
 439       } else {
 440         if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
 441           DWORD error = GetLastError();
 442           if (abort_on_error) {
 443             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
 444                         __kmp_msg_null);
 445           }
 446           return error;
 447         }
 448       }
 449       return 0;
 450     }
 451     int get_system_affinity(bool abort_on_error) override {
 452       if (__kmp_num_proc_groups > 1) {
 453         this->zero();
 454         GROUP_AFFINITY ga;
 455         KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
 456         if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
 457           DWORD error = GetLastError();
 458           if (abort_on_error) {
 459             __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
 460                         KMP_ERR(error), __kmp_msg_null);
 461           }
 462           return error;
 463         }
 464         if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
 465             (ga.Mask == 0)) {
 466           return -1;
 467         }
 468         mask[ga.Group] = ga.Mask;
 469       } else {
 470         mask_t newMask, sysMask, retval;
 471         if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
 472           DWORD error = GetLastError();
 473           if (abort_on_error) {
 474             __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
 475                         KMP_ERR(error), __kmp_msg_null);
 476           }
 477           return error;
 478         }
 479         retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
 480         if (!retval) {
 481           DWORD error = GetLastError();
 482           if (abort_on_error) {
 483             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
 484                         KMP_ERR(error), __kmp_msg_null);
 485           }
 486           return error;
 487         }
 488         newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
 489         if (!newMask) {
 490           DWORD error = GetLastError();
 491           if (abort_on_error) {
 492             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
 493                         KMP_ERR(error), __kmp_msg_null);
 494           }
 495         }
 496         *mask = retval;
 497       }
 498       return 0;
 499     }
 500     int get_proc_group() const override {
 501       int group = -1;
 502       if (__kmp_num_proc_groups == 1) {
 503         return 1;
 504       }
 505       for (int i = 0; i < __kmp_num_proc_groups; i++) {
 506         if (mask[i] == 0)
 507           continue;
 508         if (group >= 0)
 509           return -1;
 510         group = i;
 511       }
 512       return group;
 513     }
 514   };
 515   void determine_capable(const char *env_var) override {
 516     __kmp_affinity_determine_capable(env_var);
 517   }
 518   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
 519   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
 520   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
 521   KMPAffinity::Mask *allocate_mask_array(int num) override {
 522     return new Mask[num];
 523   }
 524   void deallocate_mask_array(KMPAffinity::Mask *array) override {
 525     Mask *windows_array = static_cast<Mask *>(array);
 526     delete[] windows_array;
 527   }
 528   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
 529                                       int index) override {
 530     Mask *windows_array = static_cast<Mask *>(array);
 531     return &(windows_array[index]);
 532   }
 533   api_type get_api_type() const override { return NATIVE_OS; }
 534 };
 535 #endif /* KMP_OS_WINDOWS */
 536 #endif /* KMP_AFFINITY_SUPPORTED */
 537
 538 class Address {
 539 public:
 540   static const unsigned maxDepth = 32;
 541   unsigned labels[maxDepth];
 542   unsigned childNums[maxDepth];
 543   unsigned depth;
 544   unsigned leader;
 545   Address(unsigned _depth) : depth(_depth), leader(FALSE) {}
 546   Address &operator=(const Address &b) {
 547     depth = b.depth;
 548     for (unsigned i = 0; i < depth; i++) {
 549       labels[i] = b.labels[i];
 550       childNums[i] = b.childNums[i];
 551     }
 552     leader = FALSE;
 553     return *this;
 554   }
 555   bool operator==(const Address &b) const {
 556     if (depth != b.depth)
 557       return false;
 558     for (unsigned i = 0; i < depth; i++)
 559       if (labels[i] != b.labels[i])
 560         return false;
 561     return true;
 562   }
 563   bool isClose(const Address &b, int level) const {
 564     if (depth != b.depth)
 565       return false;
 566     if ((unsigned)level >= depth)
 567       return true;
 568     for (unsigned i = 0; i < (depth - level); i++)
 569       if (labels[i] != b.labels[i])
 570         return false;
 571     return true;
 572   }
 573   bool operator!=(const Address &b) const { return !operator==(b); }
 574   void print() const {
 575     unsigned i;
 576     printf("Depth: %u --- ", depth);
 577     for (i = 0; i < depth; i++) {
 578       printf("%u ", labels[i]);
 579     }
 580   }
 581 };
 582
 583 class AddrUnsPair {
 584 public:
 585   Address first;
 586   unsigned second;
 587   AddrUnsPair(Address _first, unsigned _second)
 588       : first(_first), second(_second) {}
 589   AddrUnsPair &operator=(const AddrUnsPair &b) {
 590     first = b.first;
 591     second = b.second;
 592     return *this;
 593   }
 594   void print() const {
 595     printf("first = ");
 596     first.print();
 597     printf(" --- second = %u", second);
 598   }
 599   bool operator==(const AddrUnsPair &b) const {
 600     if (first != b.first)
 601       return false;
 602     if (second != b.second)
 603       return false;
 604     return true;
 605   }
 606   bool operator!=(const AddrUnsPair &b) const { return !operator==(b); }
 607 };
 608
 609 static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) {
 610   const Address *aa = &(((const AddrUnsPair *)a)->first);
 611   const Address *bb = &(((const AddrUnsPair *)b)->first);
 612   unsigned depth = aa->depth;
 613   unsigned i;
 614   KMP_DEBUG_ASSERT(depth == bb->depth);
 615   for (i = 0; i < depth; i++) {
 616     if (aa->labels[i] < bb->labels[i])
 617       return -1;
 618     if (aa->labels[i] > bb->labels[i])
 619       return 1;
 620   }
 621   return 0;
 622 }
 623
 624 /* A structure for holding machine-specific hierarchy info to be computed once
 625    at init. This structure represents a mapping of threads to the actual machine
 626    hierarchy, or to our best guess at what the hierarchy might be, for the
 627    purpose of performing an efficient barrier. In the worst case, when there is
 628    no machine hierarchy information, it produces a tree suitable for a barrier,
 629    similar to the tree used in the hyper barrier. */
 630 class hierarchy_info {
 631 public:
 632   /* Good default values for number of leaves and branching factor, given no
 633      affinity information. Behaves a bit like hyper barrier. */
 634   static const kmp_uint32 maxLeaves = 4;
 635   static const kmp_uint32 minBranch = 4;
 636   /** Number of levels in the hierarchy. Typical levels are threads/core,
 637       cores/package or socket, packages/node, nodes/machine, etc. We don't want
 638       to get specific with nomenclature. When the machine is oversubscribed we
 639       add levels to duplicate the hierarchy, doubling the thread capacity of the
 640       hierarchy each time we add a level. */
 641   kmp_uint32 maxLevels;
 642
 643   /** This is specifically the depth of the machine configuration hierarchy, in
 644       terms of the number of levels along the longest path from root to any
 645       leaf. It corresponds to the number of entries in numPerLevel if we exclude
 646       all but one trailing 1. */
 647   kmp_uint32 depth;
 648   kmp_uint32 base_num_threads;
 649   enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
 650   volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
 651   // 2=initialization in progress
 652   volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
 653
 654   /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
 655       the parent of a node at level i has. For example, if we have a machine
 656       with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
 657       {2, 4, 4, 1, 1}. All empty levels are set to 1. */
 658   kmp_uint32 *numPerLevel;
 659   kmp_uint32 *skipPerLevel;
 660
 661   void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
 662     int hier_depth = adr2os[0].first.depth;
 663     int level = 0;
 664     for (int i = hier_depth - 1; i >= 0; --i) {
 665       int max = -1;
 666       for (int j = 0; j < num_addrs; ++j) {
 667         int next = adr2os[j].first.childNums[i];
 668         if (next > max)
 669           max = next;
 670       }
 671       numPerLevel[level] = max + 1;
 672       ++level;
 673     }
 674   }
 675
 676   hierarchy_info()
 677       : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
 678
 679   void fini() {
 680     if (!uninitialized && numPerLevel) {
 681       __kmp_free(numPerLevel);
 682       numPerLevel = NULL;
 683       uninitialized = not_initialized;
 684     }
 685   }
 686
 687   void init(AddrUnsPair *adr2os, int num_addrs) {
 688     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
 689         &uninitialized, not_initialized, initializing);
 690     if (bool_result == 0) { // Wait for initialization
 691       while (TCR_1(uninitialized) != initialized)
 692         KMP_CPU_PAUSE();
 693       return;
 694     }
 695     KMP_DEBUG_ASSERT(bool_result == 1);
 696
 697     /* Added explicit initialization of the data fields here to prevent usage of
 698        dirty value observed when static library is re-initialized multiple times
 699        (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
 700        OpenMP). */
 701     depth = 1;
 702     resizing = 0;
 703     maxLevels = 7;
 704     numPerLevel =
 705         (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
 706     skipPerLevel = &(numPerLevel[maxLevels]);
 707     for (kmp_uint32 i = 0; i < maxLevels;
 708          ++i) { // init numPerLevel[*] to 1 item per level
 709       numPerLevel[i] = 1;
 710       skipPerLevel[i] = 1;
 711     }
 712
 713     // Sort table by physical ID
 714     if (adr2os) {
 715       qsort(adr2os, num_addrs, sizeof(*adr2os),
 716             __kmp_affinity_cmp_Address_labels);
 717       deriveLevels(adr2os, num_addrs);
 718     } else {
 719       numPerLevel[0] = maxLeaves;
 720       numPerLevel[1] = num_addrs / maxLeaves;
 721       if (num_addrs % maxLeaves)
 722         numPerLevel[1]++;
 723     }
 724
 725     base_num_threads = num_addrs;
 726     for (int i = maxLevels - 1; i >= 0;
 727          --i) // count non-empty levels to get depth
 728       if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
 729         depth++;
 730
 731     kmp_uint32 branch = minBranch;
 732     if (numPerLevel[0] == 1)
 733       branch = num_addrs / maxLeaves;
 734     if (branch < minBranch)
 735       branch = minBranch;
 736     for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
 737       while (numPerLevel[d] > branch ||
 738              (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
 739         if (numPerLevel[d] & 1)
 740           numPerLevel[d]++;
 741         numPerLevel[d] = numPerLevel[d] >> 1;
 742         if (numPerLevel[d + 1] == 1)
 743           depth++;
 744         numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
 745       }
 746       if (numPerLevel[0] == 1) {
 747         branch = branch >> 1;
 748         if (branch < 4)
 749           branch = minBranch;
 750       }
 751     }
 752
 753     for (kmp_uint32 i = 1; i < depth; ++i)
 754       skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
 755     // Fill in hierarchy in the case of oversubscription
 756     for (kmp_uint32 i = depth; i < maxLevels; ++i)
 757       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
 758
 759     uninitialized = initialized; // One writer
 760   }
 761
 762   // Resize the hierarchy if nproc changes to something larger than before
 763   void resize(kmp_uint32 nproc) {
 764     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
 765     while (bool_result == 0) { // someone else is trying to resize
 766       KMP_CPU_PAUSE();
 767       if (nproc <= base_num_threads) // happy with other thread's resize
 768         return;
 769       else // try to resize
 770         bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
 771     }
 772     KMP_DEBUG_ASSERT(bool_result != 0);
 773     if (nproc <= base_num_threads)
 774       return; // happy with other thread's resize
 775
 776     // Calculate new maxLevels
 777     kmp_uint32 old_sz = skipPerLevel[depth - 1];
 778     kmp_uint32 incs = 0, old_maxLevels = maxLevels;
 779     // First see if old maxLevels is enough to contain new size
 780     for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
 781       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
 782       numPerLevel[i - 1] *= 2;
 783       old_sz *= 2;
 784       depth++;
 785     }
 786     if (nproc > old_sz) { // Not enough space, need to expand hierarchy
 787       while (nproc > old_sz) {
 788         old_sz *= 2;
 789         incs++;
 790         depth++;
 791       }
 792       maxLevels += incs;
 793
 794       // Resize arrays
 795       kmp_uint32 *old_numPerLevel = numPerLevel;
 796       kmp_uint32 *old_skipPerLevel = skipPerLevel;
 797       numPerLevel = skipPerLevel = NULL;
 798       numPerLevel =
 799           (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
 800       skipPerLevel = &(numPerLevel[maxLevels]);
 801
 802       // Copy old elements from old arrays
 803       for (kmp_uint32 i = 0; i < old_maxLevels;
 804            ++i) { // init numPerLevel[*] to 1 item per level
 805         numPerLevel[i] = old_numPerLevel[i];
 806         skipPerLevel[i] = old_skipPerLevel[i];
 807       }
 808
 809       // Init new elements in arrays to 1
 810       for (kmp_uint32 i = old_maxLevels; i < maxLevels;
 811            ++i) { // init numPerLevel[*] to 1 item per level
 812         numPerLevel[i] = 1;
 813         skipPerLevel[i] = 1;
 814       }
 815
 816       // Free old arrays
 817       __kmp_free(old_numPerLevel);
 818     }
 819
 820     // Fill in oversubscription levels of hierarchy
 821     for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
 822       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
 823
 824     base_num_threads = nproc;
 825     resizing = 0; // One writer
 826   }
 827 };
 828 #endif // KMP_AFFINITY_H