1 //===- lld/Core/Parallel.h - Parallel utilities ---------------------------===//
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 #ifndef LLD_CORE_PARALLEL_H
11 #define LLD_CORE_PARALLEL_H
13 #include "lld/Core/Instrumentation.h"
14 #include "lld/Core/LLVM.h"
15 #include "llvm/Support/MathExtras.h"
16 #include "llvm/Support/thread.h"
20 #include <condition_variable>
24 #if defined(_MSC_VER) && LLVM_ENABLE_THREADS
30 /// \brief Allows one or more threads to wait on a potentially unknown number of
33 /// A latch starts at \p count. inc() increments this, and dec() decrements it.
34 /// All calls to sync() will block while the count is not 0.
36 /// Calling dec() on a Latch with a count of 0 has undefined behavior.
// NOTE(review): this extract is missing interleaved lines of the Latch class
// (the `class Latch {` header, the _count member declaration, and parts of the
// inc()/dec()/sync() bodies are not visible). Comments below describe only
// what the visible lines establish — confirm against the full header.
39 mutable std::mutex _condMut;               // guards the counter; mutable so const members (sync) can lock
40 mutable std::condition_variable _cond;     // notified when the counter may have reached 0
// Start the latch at `count` (default 0); explicit to forbid implicit int->Latch.
43 explicit Latch(uint32_t count = 0) : _count(count) {}
// Presumably the start of inc(): lock before mutating the shared counter — TODO confirm.
47 std::unique_lock<std::mutex> lock(_condMut);
// Presumably the start of dec(): same locking discipline — TODO confirm.
52 std::unique_lock<std::mutex> lock(_condMut);
// sync(): the unique_lock is required by the condition-variable wait below.
58 std::unique_lock<std::mutex> lock(_condMut);
59 _cond.wait(lock, [&] {                     // block until the predicate (count is 0, per the class comment) holds
65 // Classes in this namespace are implementation details of this header.
68 /// \brief An abstract class that takes closures and runs them asynchronously.
// NOTE(review): the `class Executor {` header line is missing from this extract.
71 virtual ~Executor() = default;                    // virtual dtor: instances are deleted through Executor*
72 virtual void add(std::function<void()> func) = 0; // enqueue func for (possibly asynchronous) execution
75 #if !defined(LLVM_ENABLE_THREADS) || LLVM_ENABLE_THREADS == 0
// No-threads fallback: an Executor for builds without LLVM_ENABLE_THREADS.
76 class SyncExecutor : public Executor {
// NOTE(review): the body is missing from this extract; add() presumably just
// invokes func() synchronously — confirm against the full header.
78 virtual void add(std::function<void()> func) {
// Process-wide default executor, lazily constructed on first use.
83 inline Executor *getDefaultExecutor() {
84 static SyncExecutor exec;   // function-local static: thread-safe one-time init
87 #elif defined(_MSC_VER)
88 /// \brief An Executor that runs tasks via ConcRT.
89 class ConcRTExecutor : public Executor {
// Adapter that carries a std::function through ConcRT's C-style
// void(void *) task callback.
91 Taskish(std::function<void()> task) : _task(task) {}
93 std::function<void()> _task;
// Trampoline handed to the scheduler; receives the Taskish allocated in add().
95 static void run(void *p) {
96 Taskish *self = static_cast<Taskish *>(p);
// NOTE(review): the line invoking self->_task is not visible in this extract.
// The storage came from concurrency::Alloc (see add() below), so it is
// released with the matching concurrency::Free.
98 concurrency::Free(self);
103 virtual void add(std::function<void()> func) {
// Placement-new the Taskish into ConcRT-allocated storage and hand it to the
// scheduler together with the run() trampoline above.
104 Concurrency::CurrentScheduler::ScheduleTask(Taskish::run,
105 new (concurrency::Alloc(sizeof(Taskish))) Taskish(func));
// Process-wide default executor, lazily constructed on first use.
109 inline Executor *getDefaultExecutor() {
110 static ConcRTExecutor exec;
114 /// \brief An implementation of an Executor that runs closures on a thread pool
// NOTE(review): many interleaved lines of this class (the work() loop header,
// loop bodies, several members) are missing from this extract; comments are
// limited to what the visible lines show.
116 class ThreadPoolExecutor : public Executor {
// Default pool size: one worker per hardware thread.
118 explicit ThreadPoolExecutor(unsigned threadCount =
119 std::thread::hardware_concurrency())
120 : _stop(false), _done(threadCount) {
121 // Spawn all but one of the threads in another thread as spawning threads
// can take a while. (NOTE(review): sentence completed; the original comment
// was cut off in this extract.)
123 std::thread([&, threadCount] {
// Loop runs threadCount - 1 times — "all but one", per the comment above.
124 for (size_t i = 1; i < threadCount; ++i) {
133 ~ThreadPoolExecutor() override {
// Dtor takes the lock — presumably to set _stop and wake the workers; the
// following lines are not visible here.
134 std::unique_lock<std::mutex> lock(_mutex);
141 void add(std::function<void()> f) override {
142 std::unique_lock<std::mutex> lock(_mutex);   // protect _workStack while enqueueing
// Worker loop body (the enclosing function's header is not visible):
151 std::unique_lock<std::mutex> lock(_mutex);
152 _cond.wait(lock, [&] {                       // sleep until shutdown or work arrives
153 return _stop || !_workStack.empty();
157 auto task = _workStack.top();                // LIFO: take the most recently pushed task
165 std::atomic<bool> _stop;                     // set once to tell workers to exit
166 std::stack<std::function<void()>> _workStack; // pending tasks (LIFO order)
168 std::condition_variable _cond;               // paired with _mutex (whose declaration is not visible here)
// Process-wide default executor, lazily constructed on first use.
172 inline Executor *getDefaultExecutor() {
173 static ThreadPoolExecutor exec;
178 } // namespace internal
180 /// \brief Allows launching a number of tasks and waiting for them to finish
181 /// either explicitly via sync() or implicitly on destruction.
// NOTE(review): the `class TaskGroup {` header and the _latch member
// declaration are missing from this extract.
// Schedule f on the default executor; per the class comment, completion is
// tracked so sync() can wait for it.
186 void spawn(std::function<void()> f) {
// f is captured by copy so the closure owns it after spawn() returns; the
// default &-capture presumably refers to this TaskGroup — confirm.
188 internal::getDefaultExecutor()->add([&, f] {
// Block until every spawned task has finished (latch count back to 0).
194 void sync() const { _latch.sync(); }
197 #if !defined(LLVM_ENABLE_THREADS) || LLVM_ENABLE_THREADS == 0
// Serial fallback: with threading disabled, parallel_sort is just std::sort.
// NOTE(review): the function signature lines (return type / name) are missing
// from this extract for both overloads below.
198 template <class RandomAccessIterator, class Comp>
200 RandomAccessIterator start, RandomAccessIterator end,
201 const Comp &comp = std::less<
202 typename std::iterator_traits<RandomAccessIterator>::value_type>()) {
203 std::sort(start, end, comp);
205 #elif defined(_MSC_VER)
206 // Use ppl parallel_sort on Windows.
207 template <class RandomAccessIterator, class Comp>
209 RandomAccessIterator start, RandomAccessIterator end,
210 const Comp &comp = std::less<
211 typename std::iterator_traits<RandomAccessIterator>::value_type>()) {
212 concurrency::parallel_sort(start, end, comp);
/// Ranges shorter than this are sorted sequentially by parallel_quick_sort;
/// spawning tasks for tiny sub-ranges costs more than it saves.
/// constexpr (C++11, already used by this file's feature level) makes it a
/// true compile-time constant; also drops the stray "216 " extraction junk
/// that prefixed the original line.
constexpr ptrdiff_t minParallelSize = 1024;
218 /// \brief Inclusive median.
// Returns an iterator to the median of *start, *mid and *(end - 1) under
// comp — used as the quicksort pivot below.
219 template <class RandomAccessIterator, class Comp>
220 RandomAccessIterator medianOf3(RandomAccessIterator start,
221 RandomAccessIterator end, const Comp &comp) {
222 RandomAccessIterator mid = start + (std::distance(start, end) / 2);
// Branch on the pairwise ordering of the three candidates; each leaf of the
// nested ternary yields the middle element.
// NOTE(review): the remaining ternary arms and the closing brace (original
// lines 225, 227+) are missing from this extract.
223 return comp(*start, *(end - 1))
224 ? (comp(*mid, *(end - 1)) ? (comp(*start, *mid) ? mid : start)
226 : (comp(*mid, *start) ? (comp(*(end - 1), *mid) ? mid : end - 1)
// Recursive parallel quicksort: partitions around a median-of-3 pivot and
// spawns one half as a task on `tg`; `depth` bounds the recursion before
// degrading to sequential std::sort.
230 template <class RandomAccessIterator, class Comp>
231 void parallel_quick_sort(RandomAccessIterator start, RandomAccessIterator end,
232 const Comp &comp, TaskGroup &tg, size_t depth) {
233 // Do a sequential sort for small inputs.
234 if (std::distance(start, end) < detail::minParallelSize || depth == 0) {
235 std::sort(start, end, comp);
// NOTE(review): the early `return` after the sequential sort (and the
// closing brace) are not visible in this extract — confirm.
240 auto pivot = medianOf3(start, end, comp);
241 // Move pivot to end.
242 std::swap(*(end - 1), *pivot);
// Partition everything ordered before the pivot to the left; the pivot value
// itself is parked at end - 1 during the scan, hence the end - 1 range.
243 pivot = std::partition(start, end - 1, [&comp, end](decltype(*start) v) {
244 return comp(v, *(end - 1));
246 // Move pivot to middle of partition.
247 std::swap(*pivot, *(end - 1));
// Recurse on both halves; the left half runs as a spawned task. Iterators
// are captured by value, comp and tg by reference.
250 tg.spawn([=, &comp, &tg] {
251 parallel_quick_sort(start, pivot, comp, tg, depth - 1);
253 parallel_quick_sort(pivot + 1, end, comp, tg, depth - 1);
// Thread-pool implementation of parallel_sort: delegates to
// detail::parallel_quick_sort with a recursion-depth cap of log2(n) + 1.
// NOTE(review): the signature line, the TaskGroup declaration (`tg`), and the
// closing braces are missing from this extract.
257 template <class RandomAccessIterator, class Comp>
259 RandomAccessIterator start, RandomAccessIterator end,
260 const Comp &comp = std::less<
261 typename std::iterator_traits<RandomAccessIterator>::value_type>()) {
263 detail::parallel_quick_sort(start, end, comp, tg,
264 llvm::Log2_64(std::distance(start, end)) + 1);
// Convenience overload for raw pointer ranges using std::less<T>.
268 template <class T> void parallel_sort(T *start, T *end) {
269 parallel_sort(start, end, std::less<T>());
272 #if !defined(LLVM_ENABLE_THREADS) || LLVM_ENABLE_THREADS == 0
// Serial fallbacks when threading is disabled: run Fn over the range inline.
273 template <class IterTy, class FuncTy>
274 void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) {
275 std::for_each(Begin, End, Fn);
// Serial parallel_for: invoke Fn on each index in [Begin, End).
278 template <class IndexTy, class FuncTy>
279 void parallel_for(IndexTy Begin, IndexTy End, FuncTy Fn) {
280 for (IndexTy I = Begin; I != End; ++I)
283 #elif defined(_MSC_VER)
284 // Use ppl parallel_for_each on Windows.
285 template <class IterTy, class FuncTy>
286 void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) {
287 concurrency::parallel_for_each(Begin, End, Fn);
// PPL also provides the index-range variant directly.
290 template <class IndexTy, class FuncTy>
291 void parallel_for(IndexTy Begin, IndexTy End, FuncTy Fn) {
292 concurrency::parallel_for(Begin, End, Fn);
295 template <class IterTy, class FuncTy>
296 void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) {
297 // TaskGroup has a relatively high overhead, so we want to reduce
298 // the number of spawn() calls. We'll create up to 1024 tasks here.
299 // (Note that 1024 is an arbitrary number. This code probably needs
300 // improving to take the number of available cores into account.)
301 ptrdiff_t TaskSize = std::distance(Begin, End) / 1024;
// NOTE(review): the lines clamping TaskSize to at least 1 and declaring the
// TaskGroup (`Tg`) are missing from this extract.
// Spawn one task per TaskSize-sized chunk.
306 while (TaskSize <= std::distance(Begin, End)) {
307 Tg.spawn([=, &Fn] { std::for_each(Begin, Begin + TaskSize, Fn); });
// NOTE(review): the advance of Begin inside the loop is not visible here.
// Final task covers the remaining tail shorter than TaskSize.
310 Tg.spawn([=, &Fn] { std::for_each(Begin, End, Fn); });
// Same chunking strategy over an integer index range.
313 template <class IndexTy, class FuncTy>
314 void parallel_for(IndexTy Begin, IndexTy End, FuncTy Fn) {
315 ptrdiff_t TaskSize = (End - Begin) / 1024;
// NOTE(review): the TaskSize clamp, TaskGroup declaration, and the
// declaration of the loop cursor `I` are missing from this extract.
321 for (; I < End; I += TaskSize) {
// Each spawned task applies Fn to one chunk of indices.
323 for (IndexTy J = I, E = I + TaskSize; J != E; ++J)
// Tail loop: remaining indices past the last full chunk.
329 for (IndexTy J = I; J < End; ++J)
334 } // end namespace lld
336 #endif // LLD_CORE_PARALLEL_H