KASKADE 7 development version
threading.hh
1/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2/* */
3/* This file is part of the library KASKADE 7 */
4/* https://www.zib.de/research/projects/kaskade7-finite-element-toolbox */
5/* */
6/* Copyright (C) 2012-2019 Zuse Institute Berlin */
7/* */
8/* KASKADE 7 is distributed under the terms of the ZIB Academic License. */
9/* see $KASKADE/academic.txt */
10/* */
11/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
12
13#ifndef THREADING_HH
14#define THREADING_HH
15
16#include <fstream>
17#include <functional>
18#include <future>
19#include <map>
20#include <mutex>
21#include <queue>
22#include <utility>
23#include <vector>
24
25#include <boost/thread/condition_variable.hpp>
26#include <boost/thread/locks.hpp>
27#include <boost/thread/mutex.hpp>
28#include <boost/thread/thread.hpp>
29
30#include "utilities/kalloc.hh"
31#include "utilities/timing.hh"
32
33namespace Kaskade
34{
39 extern std::mutex DuneQuadratureRulesMutex;
40
45 extern boost::mutex refElementMutex;
46
47 //---------------------------------------------------------------------------
48
60 void equalWeightRanges(std::vector<size_t>& x, size_t n);
61
74 template <class BlockIndex, class Index>
75 Index uniformWeightRangeStart(BlockIndex i, BlockIndex n, Index m)
76 {
77 assert(i>=0 && i<=n && n>0);
78 return (i*m)/n;
79 }
80
90 template <class Index>
91 Index uniformWeightRange(Index j, Index n, Index m)
92 {
93 assert(j>=0 && j<m && n>0 && m>0);
94
95 // We're looking for i such that floor(i*m/n) <= j < floor((i+1)*m/n), i.e. index j has
96 // to be in the half-open range given by the partitioning points of the range. Now this
97 // implies i*m/n-1 <= j < (i+1)*m/n and also j*n/m-1 < i <= (j+1)*n/m. As i is integer,
98 // this implies floor(j*n/m) <= i <= floor((j+1)*n/m). Typically, n/m is small, in which
99 // case this closed interval is likely to contain just one natural number - the result.
100 Index low = (j*n)/m; // floor is implied by integer arithmetic
101 Index high = ((j+1)*n)/m; // floor is implied by integer arithmetic
102
103 if (low==high)
104 {
105 assert(uniformWeightRangeStart(low,n,m)<=j && j<uniformWeightRangeStart(low+1,n,m));
106 return low;
107 }
108
109 // The implied inequality chains above are not sharp estimates, therefore the interval
110 // [low,high] can contain several natural numbers. This is always the case if n>m, i.e., we have
111 // more ranges than entries and several ranges are empty. But it can also happen if j*n/m is
112 // just below the next integral number. Now we have to walk through all natural numbers in
113 // the interval to find the correct range. TODO: is there a direct way?
114 Index i = low;
115 while (i<high && !(uniformWeightRangeStart(i,n,m)<=j && j<uniformWeightRangeStart(i+1,n,m)))
116 ++i;
117
118 assert(uniformWeightRangeStart(i,n,m)<=j && j<uniformWeightRangeStart(i+1,n,m));
119 return i;
120 }
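A small worked example may help (a sketch, not part of threading.hh): partitioning m = 10 entries into n = 3 ranges yields the partitioning points 0, 3, 6, 10, i.e. the ranges [0,3), [3,6), [6,10), and entry 7 then belongs to range 2.

// usage sketch -- not part of threading.hh
size_t begin = uniformWeightRangeStart(1,3,10); // == 3: start of range 1
size_t end   = uniformWeightRangeStart(2,3,10); // == 6: one past the end of range 1
size_t range = uniformWeightRange(7,3,10);      // == 2: entry 7 lies in [6,10)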
121
122 //---------------------------------------------------------------------------
123
130 template <class T>
131 class ConcurrentQueue {
132 public:
136 ConcurrentQueue()
137 : runningWorkers(0)
138 {
139 }
140
146 ConcurrentQueue(ConcurrentQueue&& q)
147 {
148 boost::lock_guard<boost::mutex> lock(q.mutex);
149 queue = std::move(q.queue);
150 }
151
157 ConcurrentQueue& operator=(ConcurrentQueue const& q)
158 {
159 boost::lock_guard<boost::mutex> lock(q.mutex);
160 queue = q.queue;
161 return *this;
162 }
163
164 bool empty() const
165 {
166 // we need a lock here because someone may add / remove tasks right now
167 boost::lock_guard<boost::mutex> lock(mutex);
168 return queue.empty();
169 }
170
174 size_t size() const
175 {
176 // we need a lock here because someone may add / remove tasks right now
177 boost::lock_guard<boost::mutex> lock(mutex);
178 return queue.size();
179 }
180
184 void push_back(T&& t)
185 {
186 {
187 boost::lock_guard<boost::mutex> lock(mutex);
188 queue.push(std::move(t));
189 } // release lock before waking up consumers
190 filled.notify_one();
191 }
192
198 T pop_front()
199 {
200 boost::unique_lock<boost::mutex> lock(mutex);
201
202 // Wait for data to become available.
203 while (queue.empty())
204 filled.wait(lock);
205
206 // extract and remove data. When the move construction throws an exception, the queue
207 // remains unmodified
208 T t = std::move(queue.front());
209 queue.pop();
210 return t;
211 }
212
219 int running(int n)
220 {
221 boost::lock_guard<boost::mutex> lock(mutex);
222 runningWorkers += n;
223 return runningWorkers;
224 }
225
230 int running() const
231 {
232 return runningWorkers;
233 }
234
235 private:
236 std::queue<T> queue;
237 mutable boost::mutex mutex; // has to be mutable because of empty/size
238 boost::condition_variable filled;
239 int runningWorkers; // number of worker threads running on this queue
240 };
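A minimal producer/consumer sketch (not part of the header): the worker blocks in pop_front() until an element arrives, and push_back() stores the value and wakes it up.

// usage sketch -- not part of threading.hh
ConcurrentQueue<int> jobs;
boost::thread worker([&jobs]
{
  int j = jobs.pop_front();   // blocks until an element is available
  // ... process j ...
});
jobs.push_back(42);           // stores the value and notifies the waiting worker
worker.join();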
241
242 //----------------------------------------------------------------------------
243
248 typedef std::packaged_task<void()> Task;
249
254 typedef std::future<void> Ticket;
255
256
257
258 //----------------------------------------------------------------------------
259
292 class NumaThreadPool
293 {
294 public:
307 static NumaThreadPool& instance(int maxThreads = std::numeric_limits<int>::max());
316 int nodes() const
317 {
318 return nNode;
319 }
320
327 int cpus() const
328 {
329 return nCpu;
330 }
331
335 int runningOnGlobalQueue() const
336 {
337 return globalQueue.running();
338 }
339
343 int cpus(int node) const
344 {
345 return cpuByNode[node].size();
346 }
347
351 int maxCpusOnNode() const
352 {
353 return maxCpusPerNode;
354 }
355
362 bool isSequential() const
363 {
364 return sequential;
365 }
366
386 Ticket run(Task&& task);
387
399 Ticket runOnNode(int node, Task&& task);
400
413 Kalloc& allocator(int node);
414
421 void* allocate(size_t n, int node);
422
426 void deallocate(void* p, size_t n, int node);
427
431 size_t alignment(int node) const;
432
441 void reserve(size_t n, size_t k, int node);
442
447 private:
448
449 // private constructor to be called by instance()
450 NumaThreadPool(int maxThreads);
452
453 Ticket runOnQueue(ConcurrentQueue<Task>& queue, Task&& task);
454
455
456 int nCpu, nNode, maxCpusPerNode;
457 std::vector<ConcurrentQueue<Task>> nodeQueue; // task queues for passing to worker threads on nodes
458 ConcurrentQueue<Task> globalQueue;
459 boost::thread_group threads; // worker threads
460 std::vector<int> nodeByCpu; // NUMA topology info
461 std::vector<std::vector<int>> cpuByNode; // NUMA topology info
462 std::map<void*,std::pair<size_t,int>> memBlocks; // NUMA memory allocations
463 std::vector<Kalloc> nodeMemory; // NUMA memory management
464 bool sequential; // if true, execute all tasks immediately
465 };
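A usage sketch (not part of the header): work is wrapped into Task objects, submitted to the pool, and awaited via the returned Ticket; get() rethrows any exception thrown inside the task.

// usage sketch -- not part of threading.hh
NumaThreadPool& pool = NumaThreadPool::instance();

Ticket any   = pool.run(Task([] { /* work on any CPU */ }));
Ticket node0 = pool.runOnNode(0, Task([] { /* work close to the memory of node 0 */ }));

any.get();    // wait for completion, rethrowing exceptions from the task
node0.get();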
466
467 //-----------------------------------------------------------------------------------------------------
468
488 template <class Func>
489 void parallelFor(Func const& f, int maxTasks = std::numeric_limits<int>::max())
490 {
491 NumaThreadPool& pool = NumaThreadPool::instance();
492
493 // If tasks are to be executed sequentially, we shortcut the task pool -- not so much to avoid overhead
494 // as to avoid task packaging, which loses call stack info if exceptions are thrown.
495 if (pool.isSequential())
496 {
497 f(0,1);
498 return;
499 }
500
501 int nTasks = std::min(4*pool.cpus(),maxTasks);
502
503 std::vector<Ticket> tickets(nTasks);
504 for (int i=0; i<nTasks; ++i)
505 tickets[i] = pool.run(Task([i,&f,nTasks] { f(i,nTasks); }));
506
507 for (auto& ticket: tickets) // wait for the tasks to be completed
508 ticket.get(); // we use get() instead of wait() to rethrow any exceptions
509 }
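A sketch of a typical call (not part of the header): the functor receives its task index i and the total number of tasks n and selects its slice of the data with uniformWeightRangeStart. The vector xs is purely illustrative.

// usage sketch -- not part of threading.hh
std::vector<double> xs(100000, 1.0);
parallelFor([&xs](int i, int n)
{
  size_t begin = uniformWeightRangeStart(i,  n, xs.size());
  size_t end   = uniformWeightRangeStart(i+1,n, xs.size());
  for (size_t k=begin; k<end; ++k)
    xs[k] *= 2;
});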
510
530 template <class Func>
531 void parallelFor(size_t first, size_t last, Func const& f, size_t nTasks=std::numeric_limits<size_t>::max())
532 {
533 assert(last>=first);
534 NumaThreadPool& pool = NumaThreadPool::instance();
535 nTasks = std::min(last-first,std::min(nTasks,size_t(pool.cpus())));
536
537 std::vector<Ticket> tickets(nTasks);
538 std::mutex mutex;
539
540 TaskTiming tt(nTasks+2);
541
542 tt.start(nTasks);
543 for (size_t i=0; i<nTasks; ++i)
544 {
545 tickets[i] = pool.run(Task([&f,&first,last,&mutex,i,&tt]
546 {
547 tt.start(i);
548 while (true)
549 {
550 std::unique_lock<std::mutex> lock(mutex);
551 size_t myPos = first;
552 ++first;
553 lock.unlock();
554
555 if (myPos>=last)
556 {
557 tt.stop(i);
558 return;
559 }
560
561 f(myPos);
562 }
563 }));
564
565 // Waking up threads appears to be quite time-consuming. If we have many threads in the pool and not so much work,
566 // the last threads are woken up when all the work has already been done. Then tasks are created without benefit.
567 // In comparison, locking on a mutex appears to be blazingly fast, so we check whether there is still work to be
568 // done, and if not, leave the loop. This should be particularly beneficial in hyperthreaded systems.
569 std::lock_guard<std::mutex> lock(mutex);
570 if (first>=last)
571 {
572 tickets.resize(i+1);
573 break;
574 }
575 }
576 tt.stop(nTasks);
577
578 tt.start(nTasks+1);
579 for (auto& ticket: tickets) // wait for the tasks to be completed
580 ticket.get(); // we use get() instead of wait() to rethrow any exceptions
581 tt.stop(nTasks+1);
582
583 std::ofstream out("timing.gnu");
584 out << tt;
585 }
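A corresponding sketch for the index-based overload (not part of the header): f is called once for every index in [first,last), with the indices handed out dynamically to the worker tasks. The vector ys is purely illustrative.

// usage sketch -- not part of threading.hh
std::vector<double> ys(1000);
parallelFor(0, ys.size(), [&ys](size_t k)
{
  ys[k] = 2.0*k;
});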
586
587
603 template <class Func>
604 void parallelForNodes(Func const& f, int maxTasks = std::numeric_limits<int>::max())
605 {
606 NumaThreadPool& pool = NumaThreadPool::instance();
607 int nTasks = std::min(pool.nodes(),maxTasks);
608 std::vector<Ticket> tickets(nTasks);
609 for (int i=0; i<nTasks; ++i)
610 tickets[i] = pool.runOnNode(i,Task([i,&f,&nTasks] { f(i,nTasks); }));
611 for (auto& ticket: tickets) // wait for the tasks to be completed
612 ticket.get(); // we use get() instead of wait() to rethrow any exceptions
613 }
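A sketch (not part of the header): one task per NUMA node, here used to allocate and initialize node-local buffers through the pool's NUMA-aware allocation interface. The buffer size and fill value are arbitrary illustration choices.

// usage sketch -- not part of threading.hh
NumaThreadPool& pool = NumaThreadPool::instance();
std::vector<double*> buffer(pool.nodes());

parallelForNodes([&](int node, int /* nNodes */)
{
  buffer[node] = static_cast<double*>(pool.allocate(1024*sizeof(double), node));
  for (size_t k=0; k<1024; ++k)
    buffer[node][k] = 0.0;      // first touch happens on the owning node
});

for (int node=0; node<pool.nodes(); ++node)
  pool.deallocate(buffer[node], 1024*sizeof(double), node);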
614
615 //----------------------------------------------------------------------------
616
620 namespace ThreadingDetail
621 {
622 class NumaAllocatorBase
623 {
624 public:
625 NumaAllocatorBase(int node);
626
627 size_t max_size() const;
628
629 void* allocate(size_t n);
630
631 void deallocate(void* p, size_t n);
632
633 int node() const
634 {
635 return nod;
636 }
637
638 private:
639 int nod;
640 Kalloc* allocator;
641 };
642 }
651 template <class T>
652 class NumaAllocator
653 {
654 public:
655 typedef T value_type;
656 typedef T* pointer;
657 typedef T const* const_pointer;
658 typedef T& reference;
659 typedef T const& const_reference;
660 typedef std::size_t size_type;
661 typedef std::ptrdiff_t difference_type;
662
663 // make sure that on copying, the target copy has its data in the same
664 // NUMA memory region as the source by default -- this improves data locality.
665 typedef std::true_type propagate_on_container_copy_assignment;
666 typedef std::true_type propagate_on_container_move_assignment;
667 typedef std::true_type propagate_on_container_swap;
668
669 template <class U>
670 struct rebind
671 {
672 typedef NumaAllocator<U> other;
673 };
674
681 NumaAllocator(int node): alloc(node)
682 {
683 }
684
688 int node() const
689 {
690 return alloc.node();
691 }
692
693 pointer address(reference x) const
694 {
695 return &x;
696 }
697
698 const_pointer address(const_reference x) const
699 {
700 return &x;
701 }
702
703 size_type max_size() const
704 {
705 return alloc.max_size() / sizeof(T);
706 }
707
716 pointer allocate(size_type n, std::allocator<void>::const_pointer /* hint */ = 0)
717 {
718 if (n>0)
719 return static_cast<pointer>(alloc.allocate(n*sizeof(T)));
720 else
721 return nullptr;
722 }
723
724 void deallocate(pointer p, size_type n)
725 {
726 if (p)
727 alloc.deallocate(static_cast<void*>(p),n*sizeof(T));
728 }
729
730 template< class U, class... Args >
731 void construct(U* p, Args&&... args)
732 {
733 ::new((void*)p) U(std::forward<Args>(args)...);
734 }
735
736 template <class U>
737 void destroy(U* p)
738 {
739 p->~U();
740 }
741
747 template <class U>
748 bool operator==(NumaAllocator<U> const& other) const
749 {
750 return node()==other.node();
751 }
752
753 template <class U>
754 bool operator!=(NumaAllocator<U> const& other) const
755 {
756 return !(node() == other.node());
757 }
758
759 private:
760 ThreadingDetail::NumaAllocatorBase alloc;
761 };
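A sketch (not part of the header): a std::vector whose elements live in the memory of NUMA node 0. The node index 0 is just an example.

// usage sketch -- not part of threading.hh
NumaAllocator<double> alloc(0);                       // allocate on NUMA node 0
std::vector<double, NumaAllocator<double>> v(alloc);  // vector with node-local storage
v.resize(1<<20, 3.14);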
762
763 //----------------------------------------------------------------------------
764
771 class Mutex
772 {
773 public:
779 Mutex() = default;
780
786 Mutex(Mutex const& m) {}
787
793 Mutex& operator=(Mutex const& m) { return *this; }
794
798 boost::mutex& get() { return mutex; }
799
800 private:
801 boost::mutex mutex;
802 };
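A sketch (not part of the header): a copyable class whose state is protected by a Kaskade::Mutex. Each copy gets its own, freshly constructed mutex, which is exactly the copy semantics implemented above.

// usage sketch -- not part of threading.hh
class Counter
{
public:
  void increment()
  {
    boost::lock_guard<boost::mutex> lock(mtx.get());
    ++count;
  }

private:
  int count = 0;
  Mutex mtx;    // copyable, in contrast to a plain boost::mutex
};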
803
804 //-------------------------------------------------------------------------------------
805
817 void runInBackground(std::function<void()>& f);
818}
819
820#endif
A concurrent fifo queue.
Definition: threading.hh:131
size_t size() const
Returns the number of tasks waiting.
Definition: threading.hh:174
ConcurrentQueue()
Constructs an empty queue.
Definition: threading.hh:136
int running(int n)
Change the number of running worker threads.
Definition: threading.hh:219
void push_back(T &&t)
Stores an element at the end of the queue.
Definition: threading.hh:184
ConcurrentQueue & operator=(ConcurrentQueue const &q)
Assignment.
Definition: threading.hh:157
T pop_front()
Retrieves the foremost element.
Definition: threading.hh:198
ConcurrentQueue(ConcurrentQueue &&q)
Moves a queue.
Definition: threading.hh:146
int running() const
Get the number of running worker threads.
Definition: threading.hh:230
A simple memory manager for NUMA systems.
Definition: kalloc.hh:40
A utility class implementing appropriate copy semantics for boost mutexes.
Definition: threading.hh:772
boost::mutex & get()
provides access to the mutex to perform the locking.
Definition: threading.hh:798
Mutex & operator=(Mutex const &m)
Assignment.
Definition: threading.hh:793
Mutex(Mutex const &m)
Copy constructor.
Definition: threading.hh:786
Mutex()=default
Default constructor.
An STL allocator that uses memory of a specific NUMA node only.
Definition: threading.hh:653
void construct(U *p, Args &&... args)
Definition: threading.hh:731
void deallocate(pointer p, size_type n)
Definition: threading.hh:724
NumaAllocator(int node)
Construct an allocator for allocating on the given NUMA node.
Definition: threading.hh:681
const_pointer address(const_reference x) const
Definition: threading.hh:698
pointer address(reference x) const
Definition: threading.hh:693
T const & const_reference
Definition: threading.hh:659
std::true_type propagate_on_container_copy_assignment
Definition: threading.hh:665
bool operator==(NumaAllocator< U > const &other) const
comparison for equality
Definition: threading.hh:748
bool operator!=(NumaAllocator< U > const &other) const
Definition: threading.hh:754
int node() const
Reports the node on which we allocate.
Definition: threading.hh:688
std::true_type propagate_on_container_swap
Definition: threading.hh:667
std::size_t size_type
Definition: threading.hh:660
std::true_type propagate_on_container_move_assignment
Definition: threading.hh:666
size_type max_size() const
Definition: threading.hh:703
std::ptrdiff_t difference_type
Definition: threading.hh:661
pointer allocate(size_type n, std::allocator< void >::const_pointer=0)
Allocates the requested amount of memory.
Definition: threading.hh:716
Implementation of thread pools suitable for parallelization of (more or less) memory-bound algorithms...
Definition: threading.hh:293
int runningOnGlobalQueue() const
Reports how many worker threads are running to work on the global task queue.
Definition: threading.hh:335
static NumaThreadPool & instance(int maxThreads=std::numeric_limits< int >::max())
Returns a globally unique thread pool instance.
int nodes() const
Reports the number of NUMA nodes (i.e., memory interfaces/CPU sockets)
Definition: threading.hh:316
size_t alignment(int node) const
Reports the alignment size of allocator at given NUMA node.
void * allocate(size_t n, int node)
Allocates memory on a specific node.
int maxCpusOnNode() const
Reports the maximal number of CPUs on one node.
Definition: threading.hh:351
Ticket runOnNode(int node, Task &&task)
Schedules a task to be executed on a CPU belonging to the given NUMA node.
int cpus() const
Reports the total number of CPUs (usually a multiple of nodes).
Definition: threading.hh:327
void deallocate(void *p, size_t n, int node)
frees a chunk of memory previously allocated
void reserve(size_t n, size_t k, int node)
Tells the allocator to prepare for subsequent allocation of several memory blocks of same size.
bool isSequential() const
Returns true if tasks are executed sequentially. Sequential execution can be enforced by calling Numa...
Definition: threading.hh:362
int cpus(int node) const
Reports the number of CPUs on the given node (usually the same for all nodes).
Definition: threading.hh:343
Ticket run(Task &&task)
Schedules a task to be executed on an arbitrary CPU.
Kalloc & allocator(int node)
Returns the allocator used for the given node.
A class that gathers data on task timing and provides gnuplot visualization.
Definition: timing.hh:291
void start(int task)
defines the start of the given task.
void stop(int task)
defines the end of the given task.
Index uniformWeightRangeStart(BlockIndex i, BlockIndex n, Index m)
Computes partitioning points of ranges for uniform weight distributions.
Definition: threading.hh:75
std::packaged_task< void()> Task
Abstract interface for tasks to be scheduled for concurrent execution.
Definition: threading.hh:248
void runInBackground(std::function< void()> &f)
Executes a function in a child process.
std::future< void > Ticket
Abstract waitable job ticket for submitted tasks.
Definition: threading.hh:254
void parallelForNodes(Func const &f, int maxTasks=std::numeric_limits< int >::max())
A parallel for loop that executes the given functor in parallel on different NUMA nodes.
Definition: threading.hh:604
Index uniformWeightRange(Index j, Index n, Index m)
Computes the range in which an index is to be found when partitioned for uniform weights.
Definition: threading.hh:91
void equalWeightRanges(std::vector< size_t > &x, size_t n)
Computes partitioning points such that the sum of weights in each partition is roughly the same.
boost::mutex refElementMutex
A global lock for the Dune::GenericReferenceElement singletons, which are not thread-safe.
void parallelFor(Func const &f, int maxTasks=std::numeric_limits< int >::max())
A parallel for loop that executes the given functor in parallel on different CPUs.
Definition: threading.hh:489
std::mutex DuneQuadratureRulesMutex
A global lock for the Dune::QuadratureRules factory, which is not thread-safe as of 2015-01-01.
NumaAllocator< U > other
Definition: threading.hh:672