contrib/llvm-project/openmp/runtime/src/kmp_affinity.h

   1 /*
   2  * kmp_affinity.h -- header for affinity management
   3  */
   4
   5 //===----------------------------------------------------------------------===//
   6 //
   7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   8 // See https://llvm.org/LICENSE.txt for license information.
   9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #ifndef KMP_AFFINITY_H
  14 #define KMP_AFFINITY_H
  15
  16 #include "kmp.h"
  17 #include "kmp_os.h"
  18
  19 #if KMP_AFFINITY_SUPPORTED
  20 #if KMP_USE_HWLOC
  21 class KMPHwlocAffinity : public KMPAffinity {
  22 public:
  23   class Mask : public KMPAffinity::Mask {
  24     hwloc_cpuset_t mask;
  25
  26   public:
  27     Mask() {
  28       mask = hwloc_bitmap_alloc();
  29       this->zero();
  30     }
  31     ~Mask() { hwloc_bitmap_free(mask); }
  32     void set(int i) override { hwloc_bitmap_set(mask, i); }
  33     bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
  34     void clear(int i) override { hwloc_bitmap_clr(mask, i); }
  35     void zero() override { hwloc_bitmap_zero(mask); }
  36     void copy(const KMPAffinity::Mask *src) override {
  37       const Mask *convert = static_cast<const Mask *>(src);
  38       hwloc_bitmap_copy(mask, convert->mask);
  39     }
  40     void bitwise_and(const KMPAffinity::Mask *rhs) override {
  41       const Mask *convert = static_cast<const Mask *>(rhs);
  42       hwloc_bitmap_and(mask, mask, convert->mask);
  43     }
  44     void bitwise_or(const KMPAffinity::Mask *rhs) override {
  45       const Mask *convert = static_cast<const Mask *>(rhs);
  46       hwloc_bitmap_or(mask, mask, convert->mask);
  47     }
  48     void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
  49     int begin() const override { return hwloc_bitmap_first(mask); }
  50     int end() const override { return -1; }
  51     int next(int previous) const override {
  52       return hwloc_bitmap_next(mask, previous);
  53     }
  54     int get_system_affinity(bool abort_on_error) override {
  55       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
  56                   "Illegal get affinity operation when not capable");
  57       int retval =
  58           hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
  59       if (retval >= 0) {
  60         return 0;
  61       }
  62       int error = errno;
  63       if (abort_on_error) {
  64         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
  65       }
  66       return error;
  67     }
  68     int set_system_affinity(bool abort_on_error) const override {
  69       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
  70                   "Illegal get affinity operation when not capable");
  71       int retval =
  72           hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
  73       if (retval >= 0) {
  74         return 0;
  75       }
  76       int error = errno;
  77       if (abort_on_error) {
  78         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
  79       }
  80       return error;
  81     }
  82     int get_proc_group() const override {
  83       int group = -1;
  84 #if KMP_OS_WINDOWS
  85       if (__kmp_num_proc_groups == 1) {
  86         return 1;
  87       }
  88       for (int i = 0; i < __kmp_num_proc_groups; i++) {
  89         // On windows, the long type is always 32 bits
  90         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
  91         unsigned long second_32_bits =
  92             hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
  93         if (first_32_bits == 0 && second_32_bits == 0) {
  94           continue;
  95         }
  96         if (group >= 0) {
  97           return -1;
  98         }
  99         group = i;
 100       }
 101 #endif /* KMP_OS_WINDOWS */
 102       return group;
 103     }
 104   };
 105   void determine_capable(const char *var) override {
 106     const hwloc_topology_support *topology_support;
 107     if (__kmp_hwloc_topology == NULL) {
 108       if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
 109         __kmp_hwloc_error = TRUE;
 110         if (__kmp_affinity_verbose)
 111           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
 112       }
 113       if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
 114         __kmp_hwloc_error = TRUE;
 115         if (__kmp_affinity_verbose)
 116           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
 117       }
 118     }
 119     topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
 120     // Is the system capable of setting/getting this thread's affinity?
 121     // Also, is topology discovery possible? (pu indicates ability to discover
 122     // processing units). And finally, were there no errors when calling any
 123     // hwloc_* API functions?
 124     if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
 125         topology_support->cpubind->get_thisthread_cpubind &&
 126         topology_support->discovery->pu && !__kmp_hwloc_error) {
 127       // enables affinity according to KMP_AFFINITY_CAPABLE() macro
 128       KMP_AFFINITY_ENABLE(TRUE);
 129     } else {
 130       // indicate that hwloc didn't work and disable affinity
 131       __kmp_hwloc_error = TRUE;
 132       KMP_AFFINITY_DISABLE();
 133     }
 134   }
 135   void bind_thread(int which) override {
 136     KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
 137                 "Illegal set affinity operation when not capable");
 138     KMPAffinity::Mask *mask;
 139     KMP_CPU_ALLOC_ON_STACK(mask);
 140     KMP_CPU_ZERO(mask);
 141     KMP_CPU_SET(which, mask);
 142     __kmp_set_system_affinity(mask, TRUE);
 143     KMP_CPU_FREE_FROM_STACK(mask);
 144   }
 145   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
 146   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
 147   KMPAffinity::Mask *allocate_mask_array(int num) override {
 148     return new Mask[num];
 149   }
 150   void deallocate_mask_array(KMPAffinity::Mask *array) override {
 151     Mask *hwloc_array = static_cast<Mask *>(array);
 152     delete[] hwloc_array;
 153   }
 154   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
 155                                       int index) override {
 156     Mask *hwloc_array = static_cast<Mask *>(array);
 157     return &(hwloc_array[index]);
 158   }
 159   api_type get_api_type() const override { return HWLOC; }
 160 };
 161 #endif /* KMP_USE_HWLOC */
 162
 163 #if KMP_OS_LINUX
 164 /* On some of the older OS's that we build on, these constants aren't present
 165    in <asm/unistd.h> #included from <sys.syscall.h>. They must be the same on
 166    all systems of the same arch where they are defined, and they cannot change.
 167    stone forever. */
 168 #include <sys/syscall.h>
 169 #if KMP_ARCH_X86 || KMP_ARCH_ARM
 170 #ifndef __NR_sched_setaffinity
 171 #define __NR_sched_setaffinity 241
 172 #elif __NR_sched_setaffinity != 241
 173 #error Wrong code for setaffinity system call.
 174 #endif /* __NR_sched_setaffinity */
 175 #ifndef __NR_sched_getaffinity
 176 #define __NR_sched_getaffinity 242
 177 #elif __NR_sched_getaffinity != 242
 178 #error Wrong code for getaffinity system call.
 179 #endif /* __NR_sched_getaffinity */
 180 #elif KMP_ARCH_AARCH64
 181 #ifndef __NR_sched_setaffinity
 182 #define __NR_sched_setaffinity 122
 183 #elif __NR_sched_setaffinity != 122
 184 #error Wrong code for setaffinity system call.
 185 #endif /* __NR_sched_setaffinity */
 186 #ifndef __NR_sched_getaffinity
 187 #define __NR_sched_getaffinity 123
 188 #elif __NR_sched_getaffinity != 123
 189 #error Wrong code for getaffinity system call.
 190 #endif /* __NR_sched_getaffinity */
 191 #elif KMP_ARCH_X86_64
 192 #ifndef __NR_sched_setaffinity
 193 #define __NR_sched_setaffinity 203
 194 #elif __NR_sched_setaffinity != 203
 195 #error Wrong code for setaffinity system call.
 196 #endif /* __NR_sched_setaffinity */
 197 #ifndef __NR_sched_getaffinity
 198 #define __NR_sched_getaffinity 204
 199 #elif __NR_sched_getaffinity != 204
 200 #error Wrong code for getaffinity system call.
 201 #endif /* __NR_sched_getaffinity */
 202 #elif KMP_ARCH_PPC64
 203 #ifndef __NR_sched_setaffinity
 204 #define __NR_sched_setaffinity 222
 205 #elif __NR_sched_setaffinity != 222
 206 #error Wrong code for setaffinity system call.
 207 #endif /* __NR_sched_setaffinity */
 208 #ifndef __NR_sched_getaffinity
 209 #define __NR_sched_getaffinity 223
 210 #elif __NR_sched_getaffinity != 223
 211 #error Wrong code for getaffinity system call.
 212 #endif /* __NR_sched_getaffinity */
 213 #elif KMP_ARCH_MIPS
 214 #ifndef __NR_sched_setaffinity
 215 #define __NR_sched_setaffinity 4239
 216 #elif __NR_sched_setaffinity != 4239
 217 #error Wrong code for setaffinity system call.
 218 #endif /* __NR_sched_setaffinity */
 219 #ifndef __NR_sched_getaffinity
 220 #define __NR_sched_getaffinity 4240
 221 #elif __NR_sched_getaffinity != 4240
 222 #error Wrong code for getaffinity system call.
 223 #endif /* __NR_sched_getaffinity */
 224 #elif KMP_ARCH_MIPS64
 225 #ifndef __NR_sched_setaffinity
 226 #define __NR_sched_setaffinity 5195
 227 #elif __NR_sched_setaffinity != 5195
 228 #error Wrong code for setaffinity system call.
 229 #endif /* __NR_sched_setaffinity */
 230 #ifndef __NR_sched_getaffinity
 231 #define __NR_sched_getaffinity 5196
 232 #elif __NR_sched_getaffinity != 5196
 233 #error Wrong code for getaffinity system call.
 234 #endif /* __NR_sched_getaffinity */
 235 #error Unknown or unsupported architecture
 236 #endif /* KMP_ARCH_* */
 237 class KMPNativeAffinity : public KMPAffinity {
 238   class Mask : public KMPAffinity::Mask {
 239     typedef unsigned char mask_t;
 240     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
 241
 242   public:
 243     mask_t *mask;
 244     Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
 245     ~Mask() {
 246       if (mask)
 247         __kmp_free(mask);
 248     }
 249     void set(int i) override {
 250       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
 251     }
 252     bool is_set(int i) const override {
 253       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
 254     }
 255     void clear(int i) override {
 256       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
 257     }
 258     void zero() override {
 259       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
 260         mask[i] = 0;
 261     }
 262     void copy(const KMPAffinity::Mask *src) override {
 263       const Mask *convert = static_cast<const Mask *>(src);
 264       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
 265         mask[i] = convert->mask[i];
 266     }
 267     void bitwise_and(const KMPAffinity::Mask *rhs) override {
 268       const Mask *convert = static_cast<const Mask *>(rhs);
 269       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
 270         mask[i] &= convert->mask[i];
 271     }
 272     void bitwise_or(const KMPAffinity::Mask *rhs) override {
 273       const Mask *convert = static_cast<const Mask *>(rhs);
 274       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
 275         mask[i] |= convert->mask[i];
 276     }
 277     void bitwise_not() override {
 278       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
 279         mask[i] = ~(mask[i]);
 280     }
 281     int begin() const override {
 282       int retval = 0;
 283       while (retval < end() && !is_set(retval))
 284         ++retval;
 285       return retval;
 286     }
 287     int end() const override { return __kmp_affin_mask_size * BITS_PER_MASK_T; }
 288     int next(int previous) const override {
 289       int retval = previous + 1;
 290       while (retval < end() && !is_set(retval))
 291         ++retval;
 292       return retval;
 293     }
 294     int get_system_affinity(bool abort_on_error) override {
 295       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
 296                   "Illegal get affinity operation when not capable");
 297       int retval =
 298           syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
 299       if (retval >= 0) {
 300         return 0;
 301       }
 302       int error = errno;
 303       if (abort_on_error) {
 304         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
 305       }
 306       return error;
 307     }
 308     int set_system_affinity(bool abort_on_error) const override {
 309       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
 310                   "Illegal get affinity operation when not capable");
 311       int retval =
 312           syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
 313       if (retval >= 0) {
 314         return 0;
 315       }
 316       int error = errno;
 317       if (abort_on_error) {
 318         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
 319       }
 320       return error;
 321     }
 322   };
 323   void determine_capable(const char *env_var) override {
 324     __kmp_affinity_determine_capable(env_var);
 325   }
 326   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
 327   KMPAffinity::Mask *allocate_mask() override {
 328     KMPNativeAffinity::Mask *retval = new Mask();
 329     return retval;
 330   }
 331   void deallocate_mask(KMPAffinity::Mask *m) override {
 332     KMPNativeAffinity::Mask *native_mask =
 333         static_cast<KMPNativeAffinity::Mask *>(m);
 334     delete native_mask;
 335   }
 336   KMPAffinity::Mask *allocate_mask_array(int num) override {
 337     return new Mask[num];
 338   }
 339   void deallocate_mask_array(KMPAffinity::Mask *array) override {
 340     Mask *linux_array = static_cast<Mask *>(array);
 341     delete[] linux_array;
 342   }
 343   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
 344                                       int index) override {
 345     Mask *linux_array = static_cast<Mask *>(array);
 346     return &(linux_array[index]);
 347   }
 348   api_type get_api_type() const override { return NATIVE_OS; }
 349 };
 350 #endif /* KMP_OS_LINUX */
 351
 352 #if KMP_OS_WINDOWS
 353 class KMPNativeAffinity : public KMPAffinity {
 354   class Mask : public KMPAffinity::Mask {
 355     typedef ULONG_PTR mask_t;
 356     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
 357     mask_t *mask;
 358
 359   public:
 360     Mask() {
 361       mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
 362     }
 363     ~Mask() {
 364       if (mask)
 365         __kmp_free(mask);
 366     }
 367     void set(int i) override {
 368       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
 369     }
 370     bool is_set(int i) const override {
 371       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
 372     }
 373     void clear(int i) override {
 374       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
 375     }
 376     void zero() override {
 377       for (int i = 0; i < __kmp_num_proc_groups; ++i)
 378         mask[i] = 0;
 379     }
 380     void copy(const KMPAffinity::Mask *src) override {
 381       const Mask *convert = static_cast<const Mask *>(src);
 382       for (int i = 0; i < __kmp_num_proc_groups; ++i)
 383         mask[i] = convert->mask[i];
 384     }
 385     void bitwise_and(const KMPAffinity::Mask *rhs) override {
 386       const Mask *convert = static_cast<const Mask *>(rhs);
 387       for (int i = 0; i < __kmp_num_proc_groups; ++i)
 388         mask[i] &= convert->mask[i];
 389     }
 390     void bitwise_or(const KMPAffinity::Mask *rhs) override {
 391       const Mask *convert = static_cast<const Mask *>(rhs);
 392       for (int i = 0; i < __kmp_num_proc_groups; ++i)
 393         mask[i] |= convert->mask[i];
 394     }
 395     void bitwise_not() override {
 396       for (int i = 0; i < __kmp_num_proc_groups; ++i)
 397         mask[i] = ~(mask[i]);
 398     }
 399     int begin() const override {
 400       int retval = 0;
 401       while (retval < end() && !is_set(retval))
 402         ++retval;
 403       return retval;
 404     }
 405     int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
 406     int next(int previous) const override {
 407       int retval = previous + 1;
 408       while (retval < end() && !is_set(retval))
 409         ++retval;
 410       return retval;
 411     }
 412     int set_system_affinity(bool abort_on_error) const override {
 413       if (__kmp_num_proc_groups > 1) {
 414         // Check for a valid mask.
 415         GROUP_AFFINITY ga;
 416         int group = get_proc_group();
 417         if (group < 0) {
 418           if (abort_on_error) {
 419             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
 420           }
 421           return -1;
 422         }
 423         // Transform the bit vector into a GROUP_AFFINITY struct
 424         // and make the system call to set affinity.
 425         ga.Group = group;
 426         ga.Mask = mask[group];
 427         ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
 428
 429         KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
 430         if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
 431           DWORD error = GetLastError();
 432           if (abort_on_error) {
 433             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
 434                         __kmp_msg_null);
 435           }
 436           return error;
 437         }
 438       } else {
 439         if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
 440           DWORD error = GetLastError();
 441           if (abort_on_error) {
 442             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
 443                         __kmp_msg_null);
 444           }
 445           return error;
 446         }
 447       }
 448       return 0;
 449     }
 450     int get_system_affinity(bool abort_on_error) override {
 451       if (__kmp_num_proc_groups > 1) {
 452         this->zero();
 453         GROUP_AFFINITY ga;
 454         KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
 455         if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
 456           DWORD error = GetLastError();
 457           if (abort_on_error) {
 458             __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
 459                         KMP_ERR(error), __kmp_msg_null);
 460           }
 461           return error;
 462         }
 463         if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
 464             (ga.Mask == 0)) {
 465           return -1;
 466         }
 467         mask[ga.Group] = ga.Mask;
 468       } else {
 469         mask_t newMask, sysMask, retval;
 470         if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
 471           DWORD error = GetLastError();
 472           if (abort_on_error) {
 473             __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
 474                         KMP_ERR(error), __kmp_msg_null);
 475           }
 476           return error;
 477         }
 478         retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
 479         if (!retval) {
 480           DWORD error = GetLastError();
 481           if (abort_on_error) {
 482             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
 483                         KMP_ERR(error), __kmp_msg_null);
 484           }
 485           return error;
 486         }
 487         newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
 488         if (!newMask) {
 489           DWORD error = GetLastError();
 490           if (abort_on_error) {
 491             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
 492                         KMP_ERR(error), __kmp_msg_null);
 493           }
 494         }
 495         *mask = retval;
 496       }
 497       return 0;
 498     }
 499     int get_proc_group() const override {
 500       int group = -1;
 501       if (__kmp_num_proc_groups == 1) {
 502         return 1;
 503       }
 504       for (int i = 0; i < __kmp_num_proc_groups; i++) {
 505         if (mask[i] == 0)
 506           continue;
 507         if (group >= 0)
 508           return -1;
 509         group = i;
 510       }
 511       return group;
 512     }
 513   };
 514   void determine_capable(const char *env_var) override {
 515     __kmp_affinity_determine_capable(env_var);
 516   }
 517   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
 518   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
 519   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
 520   KMPAffinity::Mask *allocate_mask_array(int num) override {
 521     return new Mask[num];
 522   }
 523   void deallocate_mask_array(KMPAffinity::Mask *array) override {
 524     Mask *windows_array = static_cast<Mask *>(array);
 525     delete[] windows_array;
 526   }
 527   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
 528                                       int index) override {
 529     Mask *windows_array = static_cast<Mask *>(array);
 530     return &(windows_array[index]);
 531   }
 532   api_type get_api_type() const override { return NATIVE_OS; }
 533 };
 534 #endif /* KMP_OS_WINDOWS */
 535 #endif /* KMP_AFFINITY_SUPPORTED */
 536
 537 class Address {
 538 public:
 539   static const unsigned maxDepth = 32;
 540   unsigned labels[maxDepth];
 541   unsigned childNums[maxDepth];
 542   unsigned depth;
 543   unsigned leader;
 544   Address(unsigned _depth) : depth(_depth), leader(FALSE) {}
 545   Address &operator=(const Address &b) {
 546     depth = b.depth;
 547     for (unsigned i = 0; i < depth; i++) {
 548       labels[i] = b.labels[i];
 549       childNums[i] = b.childNums[i];
 550     }
 551     leader = FALSE;
 552     return *this;
 553   }
 554   bool operator==(const Address &b) const {
 555     if (depth != b.depth)
 556       return false;
 557     for (unsigned i = 0; i < depth; i++)
 558       if (labels[i] != b.labels[i])
 559         return false;
 560     return true;
 561   }
 562   bool isClose(const Address &b, int level) const {
 563     if (depth != b.depth)
 564       return false;
 565     if ((unsigned)level >= depth)
 566       return true;
 567     for (unsigned i = 0; i < (depth - level); i++)
 568       if (labels[i] != b.labels[i])
 569         return false;
 570     return true;
 571   }
 572   bool operator!=(const Address &b) const { return !operator==(b); }
 573   void print() const {
 574     unsigned i;
 575     printf("Depth: %u --- ", depth);
 576     for (i = 0; i < depth; i++) {
 577       printf("%u ", labels[i]);
 578     }
 579   }
 580 };
 581
 582 class AddrUnsPair {
 583 public:
 584   Address first;
 585   unsigned second;
 586   AddrUnsPair(Address _first, unsigned _second)
 587       : first(_first), second(_second) {}
 588   AddrUnsPair &operator=(const AddrUnsPair &b) {
 589     first = b.first;
 590     second = b.second;
 591     return *this;
 592   }
 593   void print() const {
 594     printf("first = ");
 595     first.print();
 596     printf(" --- second = %u", second);
 597   }
 598   bool operator==(const AddrUnsPair &b) const {
 599     if (first != b.first)
 600       return false;
 601     if (second != b.second)
 602       return false;
 603     return true;
 604   }
 605   bool operator!=(const AddrUnsPair &b) const { return !operator==(b); }
 606 };
 607
 608 static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) {
 609   const Address *aa = &(((const AddrUnsPair *)a)->first);
 610   const Address *bb = &(((const AddrUnsPair *)b)->first);
 611   unsigned depth = aa->depth;
 612   unsigned i;
 613   KMP_DEBUG_ASSERT(depth == bb->depth);
 614   for (i = 0; i < depth; i++) {
 615     if (aa->labels[i] < bb->labels[i])
 616       return -1;
 617     if (aa->labels[i] > bb->labels[i])
 618       return 1;
 619   }
 620   return 0;
 621 }
 622
 623 /* A structure for holding machine-specific hierarchy info to be computed once
 624    at init. This structure represents a mapping of threads to the actual machine
 625    hierarchy, or to our best guess at what the hierarchy might be, for the
 626    purpose of performing an efficient barrier. In the worst case, when there is
 627    no machine hierarchy information, it produces a tree suitable for a barrier,
 628    similar to the tree used in the hyper barrier. */
 629 class hierarchy_info {
 630 public:
 631   /* Good default values for number of leaves and branching factor, given no
 632      affinity information. Behaves a bit like hyper barrier. */
 633   static const kmp_uint32 maxLeaves = 4;
 634   static const kmp_uint32 minBranch = 4;
 635   /** Number of levels in the hierarchy. Typical levels are threads/core,
 636       cores/package or socket, packages/node, nodes/machine, etc. We don't want
 637       to get specific with nomenclature. When the machine is oversubscribed we
 638       add levels to duplicate the hierarchy, doubling the thread capacity of the
 639       hierarchy each time we add a level. */
 640   kmp_uint32 maxLevels;
 641
 642   /** This is specifically the depth of the machine configuration hierarchy, in
 643       terms of the number of levels along the longest path from root to any
 644       leaf. It corresponds to the number of entries in numPerLevel if we exclude
 645       all but one trailing 1. */
 646   kmp_uint32 depth;
 647   kmp_uint32 base_num_threads;
 648   enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
 649   volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
 650   // 2=initialization in progress
 651   volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
 652
 653   /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
 654       the parent of a node at level i has. For example, if we have a machine
 655       with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
 656       {2, 4, 4, 1, 1}. All empty levels are set to 1. */
 657   kmp_uint32 *numPerLevel;
 658   kmp_uint32 *skipPerLevel;
 659
 660   void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
 661     int hier_depth = adr2os[0].first.depth;
 662     int level = 0;
 663     for (int i = hier_depth - 1; i >= 0; --i) {
 664       int max = -1;
 665       for (int j = 0; j < num_addrs; ++j) {
 666         int next = adr2os[j].first.childNums[i];
 667         if (next > max)
 668           max = next;
 669       }
 670       numPerLevel[level] = max + 1;
 671       ++level;
 672     }
 673   }
 674
 675   hierarchy_info()
 676       : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
 677
 678   void fini() {
 679     if (!uninitialized && numPerLevel) {
 680       __kmp_free(numPerLevel);
 681       numPerLevel = NULL;
 682       uninitialized = not_initialized;
 683     }
 684   }
 685
 686   void init(AddrUnsPair *adr2os, int num_addrs) {
 687     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
 688         &uninitialized, not_initialized, initializing);
 689     if (bool_result == 0) { // Wait for initialization
 690       while (TCR_1(uninitialized) != initialized)
 691         KMP_CPU_PAUSE();
 692       return;
 693     }
 694     KMP_DEBUG_ASSERT(bool_result == 1);
 695
 696     /* Added explicit initialization of the data fields here to prevent usage of
 697        dirty value observed when static library is re-initialized multiple times
 698        (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
 699        OpenMP). */
 700     depth = 1;
 701     resizing = 0;
 702     maxLevels = 7;
 703     numPerLevel =
 704         (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
 705     skipPerLevel = &(numPerLevel[maxLevels]);
 706     for (kmp_uint32 i = 0; i < maxLevels;
 707          ++i) { // init numPerLevel[*] to 1 item per level
 708       numPerLevel[i] = 1;
 709       skipPerLevel[i] = 1;
 710     }
 711
 712     // Sort table by physical ID
 713     if (adr2os) {
 714       qsort(adr2os, num_addrs, sizeof(*adr2os),
 715             __kmp_affinity_cmp_Address_labels);
 716       deriveLevels(adr2os, num_addrs);
 717     } else {
 718       numPerLevel[0] = maxLeaves;
 719       numPerLevel[1] = num_addrs / maxLeaves;
 720       if (num_addrs % maxLeaves)
 721         numPerLevel[1]++;
 722     }
 723
 724     base_num_threads = num_addrs;
 725     for (int i = maxLevels - 1; i >= 0;
 726          --i) // count non-empty levels to get depth
 727       if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
 728         depth++;
 729
 730     kmp_uint32 branch = minBranch;
 731     if (numPerLevel[0] == 1)
 732       branch = num_addrs / maxLeaves;
 733     if (branch < minBranch)
 734       branch = minBranch;
 735     for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
 736       while (numPerLevel[d] > branch ||
 737              (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
 738         if (numPerLevel[d] & 1)
 739           numPerLevel[d]++;
 740         numPerLevel[d] = numPerLevel[d] >> 1;
 741         if (numPerLevel[d + 1] == 1)
 742           depth++;
 743         numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
 744       }
 745       if (numPerLevel[0] == 1) {
 746         branch = branch >> 1;
 747         if (branch < 4)
 748           branch = minBranch;
 749       }
 750     }
 751
 752     for (kmp_uint32 i = 1; i < depth; ++i)
 753       skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
 754     // Fill in hierarchy in the case of oversubscription
 755     for (kmp_uint32 i = depth; i < maxLevels; ++i)
 756       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
 757
 758     uninitialized = initialized; // One writer
 759   }
 760
 761   // Resize the hierarchy if nproc changes to something larger than before
 762   void resize(kmp_uint32 nproc) {
 763     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
 764     while (bool_result == 0) { // someone else is trying to resize
 765       KMP_CPU_PAUSE();
 766       if (nproc <= base_num_threads) // happy with other thread's resize
 767         return;
 768       else // try to resize
 769         bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
 770     }
 771     KMP_DEBUG_ASSERT(bool_result != 0);
 772     if (nproc <= base_num_threads)
 773       return; // happy with other thread's resize
 774
 775     // Calculate new maxLevels
 776     kmp_uint32 old_sz = skipPerLevel[depth - 1];
 777     kmp_uint32 incs = 0, old_maxLevels = maxLevels;
 778     // First see if old maxLevels is enough to contain new size
 779     for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
 780       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
 781       numPerLevel[i - 1] *= 2;
 782       old_sz *= 2;
 783       depth++;
 784     }
 785     if (nproc > old_sz) { // Not enough space, need to expand hierarchy
 786       while (nproc > old_sz) {
 787         old_sz *= 2;
 788         incs++;
 789         depth++;
 790       }
 791       maxLevels += incs;
 792
 793       // Resize arrays
 794       kmp_uint32 *old_numPerLevel = numPerLevel;
 795       kmp_uint32 *old_skipPerLevel = skipPerLevel;
 796       numPerLevel = skipPerLevel = NULL;
 797       numPerLevel =
 798           (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
 799       skipPerLevel = &(numPerLevel[maxLevels]);
 800
 801       // Copy old elements from old arrays
 802       for (kmp_uint32 i = 0; i < old_maxLevels;
 803            ++i) { // init numPerLevel[*] to 1 item per level
 804         numPerLevel[i] = old_numPerLevel[i];
 805         skipPerLevel[i] = old_skipPerLevel[i];
 806       }
 807
 808       // Init new elements in arrays to 1
 809       for (kmp_uint32 i = old_maxLevels; i < maxLevels;
 810            ++i) { // init numPerLevel[*] to 1 item per level
 811         numPerLevel[i] = 1;
 812         skipPerLevel[i] = 1;
 813       }
 814
 815       // Free old arrays
 816       __kmp_free(old_numPerLevel);
 817     }
 818
 819     // Fill in oversubscription levels of hierarchy
 820     for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
 821       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
 822
 823     base_num_threads = nproc;
 824     resizing = 0; // One writer
 825   }
 826 };
 827 #endif // KMP_AFFINITY_H