KASKADE 7 development version
threadedMatrix.hh
1/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2/* */
3/* This file is part of the library KASKADE 7 */
4/* https://www.zib.de/research/projects/kaskade7-finite-element-toolbox */
5/* */
6/* Copyright (C) 2012-2023 Zuse Institute Berlin */
7/* */
8/* KASKADE 7 is distributed under the terms of the ZIB Academic License. */
9/* see $KASKADE/academic.txt */
10/* */
11/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
12
13#ifndef THREADED_MATRIX
14#define THREADED_MATRIX
15
16#include <cmath>
17#include <iostream>
18#include <boost/timer/timer.hpp>
19
20#include "dune/common/config.h"
21#include <dune/istl/bcrsmatrix.hh>
22#include <dune/istl/operators.hh>
23
24#include "fem/firstless.hh"
25#include "fem/fixdune.hh"
30
31
32
33namespace Kaskade {
37 // forward declarations
38 template <class Entry, class Index>
39 class NumaBCRSMatrix;
40
41 template <class Scalar, class Index>
42 class MatrixAsTriplet;
43
44 template <class Target, class Source, class RowIndices, class ColIndices>
45 Target submatrix(Source const& A, RowIndices const& ri, ColIndices const& ci);
46
47
48
49 namespace ThreadedMatrixDetail
50 {
51 // Computes for each row the number of entries in that row. The count is stored in the
52 // vector rowCount, which has to be big enough.
53 template <class Entry, class Allocator, class Index>
54 void getRowCount(Dune::BCRSMatrix<Entry,Allocator> const& matrix, bool isTransposed, std::vector<Index>& rowCount)
55 {
56 for (auto ri=matrix.begin(); ri!=matrix.end(); ++ri)
57 if (isTransposed)
58 for (auto ci=ri->begin(); ci!=ri->end(); ++ci)
59 ++rowCount[ci.index()];
60 else
61 rowCount[ri.index()] = ri->size();
62 }
63
64 template <class Entry, class Index2, class Index>
65 void getRowCount(NumaBCRSMatrix<Entry,Index2> const& matrix, bool isTransposed, std::vector<Index>& rowCount)
66 {
67 for (auto ri=matrix.begin(); ri!=matrix.end(); ++ri)
68 if (isTransposed)
69 for (auto ci=ri->begin(); ci!=ri->end(); ++ci)
70 ++rowCount[ci.index()];
71 else
72 rowCount[ri.index()] = ri->size();
73 }
74
75 template <class Entry, class Index, class Index2>
76 void getRowCount(MatrixAsTriplet<Entry,Index2> const& matrix, bool isTransposed, std::vector<Index>& rowCount)
77 {
78 auto const& indices = isTransposed? matrix.cidx: matrix.ridx;
79 for (auto r: indices)
80 ++rowCount[r];
81 }
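// Editorial usage sketch (not part of the library source): counting the entries per row of the
// transpose of a Dune matrix before allocating CRS storage; the matrix A is a hypothetical object.
//
//   Dune::BCRSMatrix<Dune::FieldMatrix<double,1,1>> A;            // assembled elsewhere
//   std::vector<size_t> rowCount(A.M(),0);                        // transposed: one counter per column of A
//   ThreadedMatrixDetail::getRowCount(A,/*isTransposed=*/true,rowCount);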
82
83
84 // Class with partial specialization for copying from a Dune ISTL sparse matrix into ThreadedMatrix Chunks.
85 // Supported modes of copying are (i) one-to-one direct copy (ii) transposed copy (iii) source is symmetric
86 // (superdiagonal part is never accessed).
87 template <class Entry, class Matrix, bool symmetric, bool transposed, class Index>
88 struct CopyMatrixToChunk
89 {
90 // The simple case: not symmetric, not transposed
91 static void init(Index first, Index last, Matrix const& matrix, std::vector<size_t,NumaAllocator<size_t>>& colStart,
92 std::vector<Index,NumaAllocator<Index>>& cols, std::vector<Entry,NumaAllocator<Entry>>& values,
93 std::vector<Index> const& /* nRowEntries */)
94 {
95 for (Index i=0; i<last-first; ++i)
96 {
97 Index j=0;
98 for (auto ci=matrix[first+i].begin(); ci!=matrix[first+i].end(); ++ci, ++j)
99 {
100 values[colStart[i]+j] = *ci;
101 cols[colStart[i]+j] = ci.index();
102 }
103 }
104 }
105 };
106
107 template <class Entry, class Matrix, bool transposed, class Index>
108 struct CopyMatrixToChunk<Entry, Matrix,true,transposed,Index>
109 {
110 // The hard case: symmetric with only lower triangular part to be accessed
111 static void init(Index first, Index last, Matrix const& matrix, std::vector<size_t,NumaAllocator<size_t>>& colStart,
112 std::vector<Index,NumaAllocator<Index>>& cols, std::vector<Entry,NumaAllocator<Entry>>& values,
113 std::vector<Index> const& nRowEntries)
114 {
115 // Symmetric case is a mixture of regular and transposed. First we insert the regular entries,
116 // subsequently the transposed ones.
117
118 // Start with regular entries (including the diagonal)
119 std::vector<Index> entriesInRow(last-first,0);
120 for (Index i=0; i<last-first; ++i)
121 {
122 Index j=0;
123 for (auto ci=matrix[first+i].begin(); ci!=matrix[first+i].end(); ++ci, ++j)
124 {
125 values[colStart[i]+j] = *ci;
126 cols[colStart[i]+j] = ci.index();
127 }
128 entriesInRow[i] = matrix[first+i].size();
129 assert(entriesInRow[i]<=nRowEntries[first+i]);
130 }
131
132 // Do the transposed entries (excluding the diagonal). Note that as we only want entries with
133 // column index >= first+1, we can start (for the transposed part) at row first+1
134 for (Index r=first+1; r!=matrix.N(); ++r)
135 for (auto c=matrix[r].begin(); c!=matrix[r].end(); ++c)
136 {
137 // get transposed indices
138 Index ridx = c.index();
139 if (ridx>=first && ridx<last)
140 {
141 Index cidx = r;
142
143 if (cidx > ridx) // take only superdiagonal entries
144 {
145 assert(ridx-first<colStart.size() && ridx-first < entriesInRow.size());
146 Index pos = colStart[ridx-first] + entriesInRow[ridx-first];
147 assert(pos<colStart[ridx-first+1]);
148 cols[pos] = cidx;
149 values[pos] = transpose(*c);
150 ++entriesInRow[ridx-first];
151 assert(entriesInRow[ridx-first]<=nRowEntries[ridx]);
152 }
153 }
154 }
155 }
156 };
157
158
159 template <class Entry, class Matrix, class Index>
160 struct CopyMatrixToChunk<Entry, Matrix,false,true,Index>
161 {
162 // The case of transposed, nonsymmetric supplied matrices.
163 static void init(Index first, Index last, Matrix const& matrix, std::vector<size_t,NumaAllocator<size_t>>& colStart,
164 std::vector<Index,NumaAllocator<Index>>& cols, std::vector<Entry,NumaAllocator<Entry>>& values,
165 std::vector<Index> const& nRowEntries)
166 {
167 // Essentially we have to exchange row and column indices. If we go through the matrix
168 // row by row, the entries of the transposed matrix to be stored in our CRS chunk come
169 // with random row indices but increasing column indices. Hence we can push the entries
170 // at the back of each row. We only have to remember where the current end of each row is.
171 std::vector<Index> entriesInRow(last-first,0);
172
173 for (auto r=matrix.begin(); r!=matrix.end(); ++r)
174 for (auto c=r->begin(); c!=r->end(); ++c)
175 {
176 // get transposed indices
177 Index ridx = c.index();
178 if (ridx>=first && ridx<last)
179 {
180 Index cidx = r.index();
181
182 assert(ridx-first<colStart.size() && ridx-first < entriesInRow.size());
183 Index pos = colStart[ridx-first] + entriesInRow[ridx-first];
184 assert(pos<colStart[ridx-first+1]);
185 cols[pos] = cidx;
186 values[pos] = transpose(*c);
187 ++entriesInRow[ridx-first];
188 assert(entriesInRow[ridx-first]<=nRowEntries[ridx]);
189 }
190 }
191 }
192 };
193
194 //-------------------------------------------------------------------------
195
196 // Class for copying matrix entries. If the entry is to be transposed during
197 // copying (e.g. because only the lower triangular part of the source block matrix
198 // has been stored), the matrix dimensions may change due to transposition
199 // (if they are not equal). This case is covered here - we know the entry is
200 // to be transposed.
201 template <class To, class From, class scalarsMatch = std::false_type>
202 struct MatrixEntry
203 {
204 static void copy(From const& from, To& to, bool isTransposed)
205 {
206 assert(isTransposed);
207 to = transpose(from);
208 }
209 };
210
211 // This specialization is for the case of different floating point types as scalar entries in the matrix blocks.
212 // Currently this is only implemented for square-shaped, non-transposed blocks.
213 template <class To, class From>
214 struct MatrixEntry<To, From, typename std::is_same<typename To::field_type, typename From::field_type>::type>
215 {
216 static void copy(From const& from, To& to, bool isTransposed)
217 {
218 assert(!isTransposed);
219 static_assert(To::rows == From::rows && To::cols == From::cols, "Mismatch of dimensions.");
220 for(int i=0;i<To::rows;++i)
221 for(int j=0;j<To::cols;++j)
222 to[i][j] = static_cast<typename To::field_type>(from[i][j]);
223 }
224 };
225
226 // The case of matching dimensions of source and target is covered here.
227 // Both cases can occur, transposition or not. Partial specialization of
228 // this handler class is necessary due to the runtime switch between
229 // transposed or not - this dynamic switch is only legal in C++ if the
230 // dimensions are the same in both cases.
231 template <class Entry>
232 struct MatrixEntry<Entry,Entry>
233 {
234 static void copy(Entry const& from, Entry& to, bool isTransposed)
235 {
236 if (isTransposed && from.rows > 1 && from.cols > 1 && from.rows != from.cols)
237 {
238 std::cerr << "Transposing rectangular Dune::FieldMatrix not implemented!" << std::endl
239 << "#rows=" << from.rows << ", #cols=" << from.cols << std::endl;
240 exit(1);
241 }
242 if (isTransposed && from.rows == from.cols)
243// to = transpose(from);
244// the following transposing code also covers the case that
245// the memory locations of from and to are the same
246 {
247 for (int i=0;i<from.rows;i++)
248 {
249 to[i][i]=from[i][i];
250 for (int j=i+1;j<from.cols;j++)
251 {
252 typename Entry::field_type s=from[j][i];
253 to[j][i]=from[i][j];
254 to[i][j]=s;
255 }
256 }
257 }
258 else
259 to = from;
260 }
261 };
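// Editorial illustration (not part of the library source): copying a square block with transposition
// via the MatrixEntry handler above.
//
//   Dune::FieldMatrix<double,2,2> from, to;
//   from[0][0]=1; from[0][1]=2; from[1][0]=3; from[1][1]=4;
//   MatrixEntry<Dune::FieldMatrix<double,2,2>,Dune::FieldMatrix<double,2,2>>::copy(from,to,/*isTransposed=*/true);
//   // to now holds the transpose [[1,3],[2,4]] of from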
262
263 //-------------------------------------------------------------------------
264
269 template <class Index=size_t>
270 class CRSChunkPatternInfo
271 {
272 public:
281 CRSChunkPatternInfo(Index first_, Index last_, Index cols_, bool symmetric, int node)
282 : firstRow(first_), lastRow(last_), cols(cols_), numaNode(node), symm(symmetric)
283 {}
284
288 Index first() const { return firstRow; }
289
293 Index last() const { return lastRow; }
294
298 Index columns() const { return cols; }
299
303 int node() const { return numaNode; }
304
308 bool symmetric() const { return symm; }
309
310 protected:
311 Index firstRow, lastRow; // first and one behind last row
312 Index cols; // number of columns
313
314 private:
315 int numaNode;
316 bool symm;
317 };
318
319 //-------------------------------------------------------------------------
320
333 template <class Index=size_t>
334 class CRSChunkPatternCreator: public CRSChunkPatternInfo<Index>
335 {
336 public:
340 typedef std::vector<Index> IndexArray; // column indices are indexed by user-provided Index type
341 // We could use a Numa allocator here, but testing as of 2014-03-12 revealed that the performance is
342 // *much* worse with Numa allocators, even though the Numa allocator is actually faster in
343 // synthetic tests of a similar structure. The reason for this disparity is so far unknown.
344
345
354 CRSChunkPatternCreator(Index firstRow, Index lastRow, Index ncols, bool symmetric, int node)
355 : CRSChunkPatternInfo<Index>(firstRow,lastRow,ncols,symmetric,node),
356 cols(lastRow-firstRow)
357 {
358 }
359
366 void reserve(Index nnzPerRow=8)
367 {
368 // perform allocation
369 for (auto& r: cols)
370 r.reserve(nnzPerRow);
371 }
372
376 void clear()
377 {
378 for (auto& r: cols)
379 {
380 r.clear();
381 r.shrink_to_fit();
382 }
383 }
384
400 template <class IterRow, class IterCol>
401 void addElements(IterRow fromRow, IterRow const toRow,
402 IterCol const fromCol, IterCol const toCol, bool colIsSorted=false)
403 {
404 // sort column indices if needed, and remove duplicates (set_union below does NOT remove duplicates...)
405 sortedCols.clear();
406 sortedCols.insert(end(sortedCols),fromCol,toCol);
407 if (!colIsSorted)
408 std::sort(begin(sortedCols),end(sortedCols));
409 sortedCols.erase(std::unique(begin(sortedCols),end(sortedCols)),end(sortedCols));
410
411 // For each row, merge the specified column indices into the column indices that are already present.
412 for ( ; fromRow != toRow; ++fromRow) // step through all affected rows
413 if (this->first() <= *fromRow && *fromRow < this->last()) // but treat only those that actually lie in our chunk
414 {
415 IndexArray& c = cols[*fromRow-this->first()];
416 auto top = this->symmetric()?
417 std::upper_bound(begin(sortedCols),end(sortedCols),*fromRow):
418 end(sortedCols); // omit superdiagonal elements if symmetric
419 tmp.resize(c.size() + std::distance(begin(sortedCols),top)); // reserve space for the union
420 tmp.erase(std::set_union(begin(c),end(c),begin(sortedCols),top, // compute the union
421 begin(tmp)),end(tmp)); // and cut the container at the proper end
422 std::swap(tmp,c); // write the sorted range into its target
423 }
424 // Note that instead of a temporary buffer tmp we could append the new column indices and use
425 // std::inplace_merge. This, however, uses a temporary buffer of its own behind the scenes, one that is held
426 // by the STL library implementation somewhere. It is highly probable that this memory is *not* located
427 // on our NUMA node, but frequently accessed from all nodes. Hence, false sharing is bound to occur with
428 // inplace_merge even if it performs proper locking.
429 }
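// Editorial usage sketch (not part of the library source): merging the couplings of one element
// (rows 3,4 with columns 3,4,7) into a chunk creator named 'creator' (hypothetical); rows outside
// [first(),last()) are silently ignored.
//
//   std::vector<size_t> rowIdx = {3,4};
//   std::vector<size_t> colIdx = {3,4,7};
//   creator.addElements(rowIdx.begin(),rowIdx.end(),colIdx.begin(),colIdx.end(),/*colIsSorted=*/true);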
430
434 void addAllElements(Index columns)
435 {
436 for (Index i=0; i<cols.size(); ++i)
437 {
438 cols[i].resize(this->symmetric()? i+this->first()+1: columns);
439 std::iota(begin(cols[i]),end(cols[i]),static_cast<Index>(0));
440 }
441 }
442
447 IndexArray const& row(Index i) const { return cols[i-this->first()]; }
448
452 size_t nonzeroes() const
453 {
454 size_t nnz = 0;
455 for (auto const& r: cols)
456 nnz += r.size();
457 return nnz;
458 }
459
470 size_t balanceForward(size_t const covered, size_t const nnz, int chunks, std::vector<IndexArray>& moveRows);
471
482 size_t balanceBackward(size_t covered, size_t const nnz, int chunks, std::vector<IndexArray>& moveRows);
483
484 private:
485 std::vector<IndexArray> cols; // For each row a std::vector of column indices
486 IndexArray tmp; // some temporary buffer
487 IndexArray sortedCols; // some temporary buffer for sorted column indices
488 };
489
490 //-------------------------------------------------------------------------
491
497 template <class Index=size_t>
498 class CRSChunkPattern: public CRSChunkPatternInfo<Index>
499 {
500 public:
501
508
524 template <class Expanded, class Condensed, class Matrix>
525 CRSChunkPattern(Index first, Index last, Expanded const& eIndices, Condensed const& cIndices,
526 Matrix const& mat, int node)
527 : CRSChunkPatternInfo<Index>(first,last,eIndices.size(),false,node)
528 , colStarts(last-first+1,0,NumaAllocator<size_t>(node)), cols(NumaAllocator<Index>(node))
529 {
530 cols.reserve(last-first); // prevent frequent reallocation (heuristic)
531
532 for (Index i=first; i<last; ++i) // scan all our rows
533 {
534 colStarts[i-first] = cols.size(); // note where this row starts
535
536 auto const& row = mat[eIndices[i]]; // get a handle to the column indices in that row
537
538 auto cend = row.end();
539 for (auto c = row.begin(); c!=cend; ++c) // step through all column entries in that row
540 {
541 Index ci = cIndices[c.index()]; // extract the condensed index
542 if (ci<eIndices.size()) // if that is in the condensed matrix range...
543 cols.push_back(ci); // ...include it
544 }
545 }
546
547 colStarts.back() = cols.size(); // sentinel
548
549 // investigate sparsity
550 nnzPerRow = this->last()==this->first()? 0 : nonzeroes() / (this->last()-this->first());
551 }
552
565 template <class Matrix>
566 CRSChunkPattern(Matrix const& matrix, bool isSymmetric, bool isTransposed,
567 Index firstRow, Index lastRow, bool symmetric, int node)
569 , colStarts(NumaAllocator<size_t>(node)), cols(NumaAllocator<Index>(node))
570 {
571 if (isSymmetric) // Make sure that transposed is only flagged if
572 isTransposed = false; // it's not symmetric.
573
574 // Compute start of row indices. Step through the supplied matrix and count all elements that
575 // fall in one of our rows.
576 std::vector<size_t,NumaAllocator<size_t>> rowCount(this->last()-this->first(),0,NumaAllocator<size_t>(node));
577
578 // Compute the row range of the supplied matrix we have to scan.
579 Index fromRow = this->first(), toRow = this->last();
580 if (isSymmetric && !symmetric) // If the supplied matrix is symmetrically stored but we are not, we have to
581 toRow = matrix.N(); // scan the later rows for the elements that appear in later columns here.
582 if (isTransposed) // If the supplied matrix is transposed, we have to scan all the rows
583 fromRow = 0, toRow = matrix.N(); // in order to cover all the columns in our range.
584
585 for (Index ri=fromRow; ri<toRow; ++ri)
586 {
587 auto cend = matrix[ri].end();
588 for (auto ci=matrix[ri].begin(); ci!=cend; ++ci)
589 {
590 Index r = ri, c = ci.index();
591 if (isTransposed) // supplied matrix is transposed, swap row and column indices
592 std::swap(r,c); // to get the row/col indices in our world
593 if (this->first()<=r && r<this->last() && (!symmetric || c<=r)) // yep, this element is in our chunk
594 ++rowCount[r-this->first()]; //
595 if (isSymmetric && !symmetric && c<r && this->first()<=c && c<this->last()) // if supplied matrix is stored symmetrically but we are not
596 ++rowCount[c-this->first()]; // copy truly subdiagonal entries to the upper triangle
597 }
598 }
599
600 // compute the partial sums of number of elements per row.
601 colStarts.resize(rowCount.size()+1,0);
602 std::partial_sum(rowCount.begin(),rowCount.end(),colStarts.begin()+1);
603
604 // Get space for all nonzeros
605 cols.resize(colStarts.back());
606
607 // Copy the column indices of elements.
608 for (Index ri=fromRow; ri<toRow; ++ri)
609 {
610 auto cend = matrix[ri].end();
611 for (auto ci=matrix[ri].begin(); ci!=cend; ++ci)
612 {
613 Index r = ri, c = ci.index();
614 if (isTransposed) // supplied matrix is transposed, swap row and column indices
615 std::swap(r,c); // to get the row/col indices in our world
616 if (this->first()<=r && r<this->last() && (!symmetric || c<=r)) // yep, this element is in our chunk
617 {
618 cols[colStarts[r-this->first()]+rowCount[r-this->first()]-1] = c; // enter the column index
619 --rowCount[r-this->first()]; // one less element to come
620 }
621 if (isSymmetric && !symmetric && c<r && this->first()<=c && c<this->last()) // if supplied matrix is stored symmetrically but we are not
622 {
623 cols[colStarts[c-this->first()]+rowCount[c-this->first()]-1] = r; // copy truly subdiagonal entries to the upper triangle
624 --rowCount[c-this->first()]; // one less element to come
625 }
626 }
627 }
628
629 // sort the column indices in each row
630 for (Index ri=0; ri<rowCount.size(); ++ri) {
631 assert(rowCount[ri] == 0);
632 std::sort(cols.begin()+colStarts[ri],cols.begin()+colStarts[ri+1]);
633 }
634
635 // investigate sparsity
636 nnzPerRow = this->last()==this->first()? 0 : nonzeroes() / (this->last()-this->first());
637 }
638
649 template <class Scalar, class Index2>
650 CRSChunkPattern(MatrixAsTriplet<Scalar,Index2> const& matrix, bool isSymmetric, bool isTransposed, // TODO: fuse with constructor above
651 Index firstRow, Index lastRow, bool symmetric, int node)
653 , colStarts(NumaAllocator<size_t>(node)), cols(NumaAllocator<Index>(node))
654 {
655 if (isSymmetric) // Make sure that transposed is only flagged if
656 isTransposed = false; // it's not symmetric.
657
658 // Compute start of row indices. Step through the supplied matrix and count all elements that
659 // fall in one of our rows.
660 std::vector<size_t,NumaAllocator<size_t>> rowCount(this->last()-this->first(),0,NumaAllocator<size_t>(node));
661
662 for (size_t i=0; i<matrix.ridx.size(); ++i)
663 {
664 Index r = matrix.ridx[i], c = matrix.cidx[i];
665 if (isTransposed) // supplied matrix is transposed, swap row and column indices
666 std::swap(r,c); // to get the row/col indices in our world
667 if (this->first()<=r && r<this->last() && (!symmetric || c<=r)) // yep, this element is in our chunk
668 ++rowCount[r-this->first()]; //
669 if (isSymmetric && !symmetric && c<r && this->first()<=c && c<this->last()) // if supplied matrix is stored symmetrically but we are not
670 ++rowCount[c-this->first()]; // copy truly subdiagonal entries to the upper triangle
671 }
672
673 // compute the partial sums of number of elements per row.
674 colStarts.resize(rowCount.size()+1,0);
675 std::partial_sum(rowCount.begin(),rowCount.end(),colStarts.begin()+1);
676
677 // Get space for all nonzeros
678 cols.resize(colStarts.back());
679
680 // Copy the column indices of elements.
681 for (size_t i=0; i<matrix.ridx.size(); ++i)
682 {
683 Index r = matrix.ridx[i], c = matrix.cidx[i];
684 if (isTransposed) // supplied matrix is transposed, swap row and column indices
685 std::swap(r,c); // to get the row/col indices in our world
686 if (this->first()<=r && r<this->last() && (!symmetric || c<=r)) // yep, this element is in our chunk
687 {
688 cols[colStarts[r-this->first()]+rowCount[r-this->first()]-1] = c; // enter the column index
689 --rowCount[r-this->first()]; // one less element to come
690 }
691 if (isSymmetric && !symmetric && c<r && this->first()<=c && c<this->last()) // if supplied matrix is stored symmetrically but we are not
692 {
693 cols[colStarts[c-this->first()]+rowCount[c-this->first()]-1] = r; // copy truly subdiagonal entries to the upper triangle
694 --rowCount[c-this->first()]; // one less element to come
695 }
696 }
697
698 // sort the column indices in each row
699 for (Index ri=0; ri<rowCount.size(); ++ri) // TODO: do this in parallel on our node?
700 {
701 assert(rowCount[ri] == 0);
702 auto first = cols.begin()+colStarts[ri];
703 auto last = cols.begin()+colStarts[ri+1];
704 std::sort(first,last); // sort in ascending order
705 last = std::unique(first,last); // duplicate entries may come from triplet matrices, remove
706 rowCount[ri] = last-first; // now we have the correct number of entries.
707 }
708
709 // Merging duplicate column indices in a row may have reduced the number of entries in that row,
710 // hence there may be a gap to the start of the next row. Compactify the column index array now.
711 for (Index ri=1; ri<rowCount.size(); ++ri)
712 {
713 Index newStart = colStarts[ri-1] + rowCount[ri-1];
714 if (newStart != colStarts[ri]) // entries need to be shifted
715 {
716 std::copy(cols.begin()+colStarts[ri],cols.begin()+colStarts[ri]+rowCount[ri],cols.begin()+newStart);
717 colStarts[ri] = newStart;
718 }
719 }
720 if (!rowCount.empty()) // set the sentinel to point just behind the last entry...
721 colStarts.back() = colStarts[rowCount.size()-1]+rowCount.back();
722 cols.erase(cols.begin()+colStarts.back(),cols.end()); // ...and drop the rubbish left over in the tail
723
724
725 // investigate sparsity
726 nnzPerRow = this->last()==this->first()? 0 : nonzeroes() / (this->last()-this->first());
727 }
728
729 // returns the column index of entry at position idx
730 Index col(size_t idx) const { return cols[idx]; }
731
736 size_t colStart(Index row) const { return colStarts[row]; }
737
742 typename std::vector<Index,NumaAllocator<Index>>::const_iterator colStartIterator(Index row) const
743 {
744 return cols.begin()+colStart(row);
745 }
746
755 size_t position(Index r, Index c) const
756 {
757 auto b = cols.begin();
758 r -= this->first();
759 assert(0 <= r && r+1 < colStarts.size());
760 auto p = std::lower_bound(b+colStarts[r],b+colStarts[r+1],c); // col indices are sorted in each row
761 if (p==b+colStarts[r+1] || *p!=c) // return sentinel in case the entry is not here
762 return std::numeric_limits<size_t>::max(); // sentinel (reconstructed): entry not present in this row
763 return p-b; // return offset
764 }
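// Editorial reading aid (not part of the library source): with first()==10, colStarts=={0,2,5},
// and cols=={1,3, 0,2,4}, the call position(11,2) searches the sorted range cols[2..5) and
// returns 3, while position(11,5) returns the sentinel because column 5 is absent from row 11.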
765
769 bool exists(Index r, Index c) const
770 {
771 return position(r,c) < storage(); // true iff position() did not return the sentinel (reconstructed)
772 }
773
779 size_t storage() const { return cols.size(); }
780
786 size_t nonzeroes() const;
787
791 Index nonzeroesPerRow() const { return nnzPerRow; }
792
793 private:
794
795 // Raw data storage. The allocators used are NUMA allocators, guaranteeing local memory access.
796 std::vector<size_t,NumaAllocator<size_t>> colStarts; // memory positions are indexed by size_t (since there may be MANY)
797 std::vector<Index,NumaAllocator<Index>> cols; // column indices are indexed by user-provided Index type
798 Index nnzPerRow; // average number of nonzeroes per row (rounded)
799
800 };
801
802
803 //-------------------------------------------------------------------------
804
809 template <class Entry, class Index>
811 {
812 public:
813 typedef typename std::vector<Index,NumaAllocator<Index>>::const_iterator ColIterator;
814 typedef typename std::vector<Entry,NumaAllocator<Entry>>::const_iterator ValueIterator;
816
817 // typedefs required by std::iterator_traits
818 using value_type = Entry;
819 using difference_type = std::ptrdiff_t;
820 using pointer = Entry const*;
821 using reference = Entry const&;
822 using iterator_category = std::random_access_iterator_tag;
823
825
826 Index index() const { return *col; }
827
832 Entry const& operator*() const { return *val; }
833 Entry const* operator->() const { return &*val; }
842 void operator++() { ++col; ++val; }
843 void operator--() { --col; --val; }
844 void operator+=(Index i) { col += i; val += i; }
845 void operator-=(Index i) { col -= i; val -= i; }
850 bool operator==(Self const& it) const { return val==it.val; }
851 bool operator!=(Self const& it) const { return ! (*this==it); }
852 difference_type operator-(Self const& it) const { return val-it.val; }
853
854 protected:
857 };
858
859 template <class Entry, class Index>
861 {
862 public:
863 typedef typename std::vector<Index,NumaAllocator<Index>>::const_iterator ColIterator;
864 typedef typename std::vector<Entry,NumaAllocator<Entry>>::iterator ValueIterator;
865
867
872 Entry& operator*() const { return const_cast<Entry&>(*this->val); }
873 Entry* operator->() const { return &(**this); }
877 };
878
879
880
881 //-------------------------------------------------------------------------
882
883 template <class Arguments, class Operation>
885 {
886 public:
888 {
889 };
890
891 iterator begin(size_t) const
892 {
893 }
894
895 iterator end(size_t) const
896 {
897 }
898 };
899
900 template <class Arguments, class Operation>
902 {
903 public:
905 {
906 }
907 };
908
909 //-------------------------------------------------------------------------
910
921 template <class Entry, class Index=size_t>
922 class CRSChunk
923 {
925
926 public:
928// typedef typename GetScalar<Entry>::type Scalar;
929
937 CRSChunk(int node)
938 : values(NumaAllocator<Entry>(node))
939 {
940 }
941
945 CRSChunk(std::shared_ptr<CRSChunkPattern<Index>> const& pattern_, Entry const& init)
946 : pat(pattern_), scatterMutex(4), values(pat->nonzeroes(),init,NumaAllocator<Entry>(pat->node()))
947 {
948 }
949
962 template <class Matrix>
963 CRSChunk(std::shared_ptr<CRSChunkPattern<Index>> const& pattern_, Matrix const& matrix, bool isSymmetric, bool isTransposed)
964 : CRSChunk(pattern_,Entry(0))
965 {
966 typedef typename Matrix::block_type SuppliedEntry;
967
968 Index first = pat->first(), last = pat->last();
969
970 if (isSymmetric) // Make sure that transposed is only flagged if
971 isTransposed = false; // it's not symmetric.
972
973 // Compute the row range of the supplied matrix we have to scan.
974 Index fromRow = first, toRow = last;
975 if (isSymmetric && !pat->symmetric()) // If the supplied matrix is symmetrically stored but we are not, we have to
976 toRow = matrix.N(); // scan the later rows for the elements that appear in later columns here.
977 if (isTransposed) // If the supplied matrix is transposed, we have to scan all the rows
978 fromRow = 0, toRow = matrix.N(); // in order to cover all the columns in our range.
979
980 // Copy the column indices of elements.
981 for (Index ri=fromRow; ri<toRow; ++ri)
982 {
983 auto cend = matrix[ri].end();
984 for (auto ci=matrix[ri].begin(); ci!=cend; ++ci)
985 {
986 Index r = ri, c = ci.index();
987 if (isTransposed) // supplied matrix is transposed, swap row and column indices
988 std::swap(r,c); // to get the row/col indices in our world
989 if (first<=r && r<last && (!pat->symmetric() || c<=r)) // yep, this element is in our chunk
990 {
991 assert(pat->position(r,c) < values.size());
992 MatrixEntry<Entry,SuppliedEntry>::copy(*ci,values[pat->position(r,c)],isTransposed); // copy, transpose if needed
993 }
994 if (isSymmetric && !pat->symmetric() && c<r && first<=c && c<last) // if supplied matrix is stored symmetrically but we are not..
995 MatrixEntry<Entry,SuppliedEntry>::copy(*ci,values[pat->position(c,r)],true); // copy, transpose
996 }
997 }
998 }
999
1011 template <class Scalar, class Index2>
1012 CRSChunk(std::shared_ptr<CRSChunkPattern<Index>> const& pattern_, MatrixAsTriplet<Scalar,Index2> const& matrix, bool isSymmetric, bool isTransposed)
1013 : CRSChunk(pattern_,Entry(0))
1014 {
1015 Index first = pat->first(), last = pat->last();
1016
1017 if (isSymmetric) // Make sure that transposed is only flagged if
1018 isTransposed = false; // it's not symmetric.
1019
1020
1021 // Copy the column indices of elements.
1022 for (size_t i=0; i<matrix.ridx.size(); ++i)
1023 {
1024 Index r = matrix.ridx[i], c = matrix.cidx[i];
1025 if (isTransposed) // supplied matrix is transposed, swap row and column indices
1026 std::swap(r,c); // to get the row/col indices in our world
1027 if (first<=r && r<last && (!pat->symmetric() || c<=r)) // yep, this element is in our chunk
1028 {
1029 assert(pat->position(r,c) < values.size());
1030 values[pat->position(r,c)] = matrix.data[i]; // copy (no transpose as triplet entries are scalar)
1031 }
1032 if (isSymmetric && !pat->symmetric() && c<r && first<=c && c<last) // if supplied matrix is stored symmetrically but we are not..
1033 values[pat->position(c,r)] = matrix.data[i]; // copy (no transpose as triplet entries are scalar)
1034 }
1035 }
1036
1050 template <class Matrix>
1051 CRSChunk(Matrix const& matrix, bool isSymmetric, bool isTransposed, Index firstRow, Index lastRow, bool symmetric, int node)
1052 : CRSChunk(std::make_shared<CRSChunkPattern<Index>>(matrix,isSymmetric,isTransposed,firstRow,lastRow,symmetric,node),
1053 matrix,isSymmetric,isTransposed)
1054 {}
1055
1068 template <class Expanded, class Condensed, class Matrix>
1069 CRSChunk(std::shared_ptr<CRSChunkPattern<Index>> const& pattern_,
1070 Expanded const& eIndices, Condensed const& cIndices, Matrix const& matrix)
1071 : CRSChunk(pattern_,Entry(0))
1072 {
1073 Index first = pat->first(), last = pat->last();
1074 for (Index r=first; r<last; ++r)
1075 {
1076 auto row = matrix[eIndices[r]];
1077 auto p = begin(values) + pat->colStart(r-first);
1078 for (auto c=row.begin(); c!=row.end(); ++c)
1079 if (cIndices[c.index()]<eIndices.size())
1080 {
1081 *p = *c;
1082 assert(!std::isnan(*p));
1083 ++p;
1084 }
1085
1086 assert(p==begin(values)+pat->colStart(r-first+1));
1087 }
1088 }
1089
1093 Self& operator=(Self const& c) = default;
1094
1098 template <class Value>
1099 Self& operator=(Value const& a)
1100 {
1101 std::fill(begin(values),end(values),a);
1102 return *this;
1103 }
1104
1105
1109 template <class Arguments, class Operation>
1111 {
1112 Index first = pat->first();
1113 for (Index r=0; r<pat->last()-first; ++r)
1114 {
1115 auto eend = e.end(r);
1116 auto ci = pat->colStartIterator(r);
1117 auto cend = pat->colStartIterator(r+1);
1118 auto vi = begin(values)+pat->colStart(r);
1119 for (auto ei = e.begin(r); ei != eend; ++ei) // step through the expression row
1120 {
1121 auto pos = std::find(ci,cend,ei.index()); // find our entry
1122 assert(pos != cend); // our pattern must be a superset of the expression pattern
1123 vi += pos-ci; // advance to found entry
1124 ci = pos;
1125 *vi = *ei; // assign
1126 }
1127 }
1128 }
1129
1134 template <class Domain, class Range>
1135 Scalar apply(Scalar a, Domain const& x, Range& y, bool initialize) const
1136 {
1137 Scalar dp = 0;
1138 auto const& p = *pat;
1139 Index first = p.first();
1140 Index last = p.last();
1141
1142 // Perform matrix vector multiplication. Step through all rows in our chunk...
1143 // On Numa nodes with several cores, one may think that using multiple threads
1144 // on this node could improve the performance. Alas, this appears not to be the
1145 // case as of 2014-05 (both on an aged AMD Opteron with 32 cores/8 nodes and on a newer
1146 // Intel Xeon with 32 cores/4 nodes). It seems that a single thread saturates the memory channel(s).
1147 // Hence we stay with the sequential version here.
1148 for (Index row=first; row<last; ++row)
1149 {
1150 // and compute the scalar product of (sparse) row and vector
1151 typename Range::block_type z(0);
1152
1153 size_t jEnd = p.colStart(row+1-first);
1154 for (size_t j=p.colStart(row-first); j<jEnd; ++j)
1155 if (Entry::rows==1 && Entry::cols==1)
1156 z.axpy(values[j][0][0],x[p.col(j)]); // this is necessary as otherwise scalar matrix entries and
1157 else // vector-valued rhs entries do not work together
1158 values[j].umv(x[p.col(j)],z);
1159
1160 // scale result as requested
1161 z *= a;
1162 if (initialize) y[row] = z;
1163 else y[row] += z;
1164
1165 // accumulate duality product
1166 if (Entry::rows==Entry::cols && row<x.size())
1167 dp += y[row]*x[row];
1168 }
1169
1170 return dp;
1171 }
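// Editorial note (not part of the library source): for a==1 and initialize==true the value
// returned above is this chunk's contribution to the duality product <Ax,x> = sum_row (Ax)_row * x_row;
// summing the per-chunk results yields x^T A x for square matrices.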
1172
1177 template <class Domain, class Range>
1178 void applyTransposed(Scalar a, Domain const& x, Range& y) const
1179 {
1180 auto const& p = *pat;
1181 Index first = p.first(); // first row in our chunk
1182 Index last = p.last(); // last row in our chunk
1183
1184 assert(x.size()>=last);
1185
1186 // We compute y_i = a sum_j A_ji^T x_j, with j in the outer loop
1187 for (Index j=first; j<last; ++j) // looping over the rows of A, i.e. the columns of A^T
1188 {
1189 size_t jEnd = p.colStart(j+1-first);
1190 for (size_t k=p.colStart(j-first); k<jEnd; ++k) // looping over the columns in the row
1191 {
1192 auto i = p.col(k); // the column index in A, i.e. the row index in A^T
1193 if (Entry::rows==1 && Entry::cols==1)
1194 y[i] += a*values[k][0][0] * x[j]; // this is necessary as otherwise scalar matrix entries and
1195 else // vector-valued rhs entries do not work together
1196 values[k].usmtv(a,x[j],y[i]); // y_i += a * (A^T)_ij x_j = a * (A_ji)^T x_j
1197 }
1198 }
1199 }
1200
1209 template <class Domain, class Range>
1210 void gatherMirrored(Scalar a, Domain const& x, Range& y, CRSChunk<Entry,Index> const& block, bool subdiagonal, bool initToZero) const
1211 {
1212 auto const& p = *block.pat;
1213 Index firstRow = p.first();
1214 Index lastRow = p.last();
1215 Index firstCol = pat->first();
1216 Index lastCol = pat->last();
1217
1218 assert(lastRow<=y.size());
1219
1220 if (initToZero)
1221 for (Index row=firstRow; row<lastRow; ++row)
1222 y[row] = 0;
1223
1224 for (Index row=firstRow; row<lastRow; ++row)
1225 {
1226 auto const& xrow = x[row];
1227
1228 auto cend = p.colStartIterator(row+1-firstRow);
1229 auto ci = firstCol==0? // We consider all elements with column index in our
1230 p.colStartIterator(row-firstRow): // row chunk. Compute the starting point by binary search
1231 std::lower_bound(p.colStartIterator(row-firstRow),cend,firstCol); // (or use the shortcut if we know we start from the beginning).
1232 auto vi = block.values.begin() + (ci-p.colStartIterator(0));
1233 if (subdiagonal)
1234 for ( ; ci!=cend && *ci<lastCol && *ci<row; ++ci, ++vi) // Then step through the row until we hit the upper bound,
1235 vi->usmtv(a,xrow,y[*ci]); // which is either the upper row range end or the diagonal,
1236 else // depending on the matrix symmetry (encoded in subdiagonal).
1237 for ( ; ci!=cend && *ci<lastCol; ++ci, ++vi)
1238 vi->usmtv(a,xrow,y[*ci]);
1239 }
1240 }
1241
1245 CRSChunkPattern<Index> const& pattern() const { return *pat; }
1246
1251 auto valStartIterator(Index row)
1252 {
1253 return values.begin()+pat->colStart(row);
1254 }
1255
1259 template <class LMIterator>
1260 void scatter(LMIterator first, LMIterator last)
1261 {
1262 Index firstRow = pat->first();
1263 Index lastRow = pat->last();
1264
1265 if (first==last || firstRow==lastRow) // empty data or empty chunk
1266 return; // -> nothing to do
1267
1268 Index nnzPerRow = pat->nonzeroesPerRow();
1269
1270 // scatter into uniform ranges
1271 Index n = scatterMutex.size();
1272 for (Index i=0; i<n; ++i)
1273 {
1274 #ifndef KASKADE_SEQUENTIAL
1275 boost::lock_guard<boost::mutex> lock(scatterMutex[i].get());
1276 #endif
1277
1278 Index rowRangeStart = uniformWeightRangeStart(i, n,lastRow-firstRow)+firstRow;
1279 Index rowRangeEnd = uniformWeightRangeStart(i+1,n,lastRow-firstRow)+firstRow;
1280
1281 if (nnzPerRow == pat->columns()) // completely dense
1282 scatterRowRange<0>(first,last,firstRow,rowRangeStart,rowRangeEnd);
1283 else if (nnzPerRow > 10*(first->cidx().size())) // many more entries than we scatter right now
1284 scatterRowRange<1>(first,last,firstRow,rowRangeStart,rowRangeEnd);
1285 else // rather few entries per row
1286 scatterRowRange<2>(first,last,firstRow,rowRangeStart,rowRangeEnd);
1287 }
1288 }
1289
1295 template <class F, class EntryB, class IndexB>
1296 void entrywiseOp(F const& f, NumaBCRSMatrix<EntryB,IndexB> const& B)
1297 {
1298 auto const& p = *pat;
1299 Index first = p.first(); // first row index
1300 Index last = p.last(); // end row index
1301
1302 for (Index row=first; row<last; ++row)
1303 {
1304 auto const& Br = B[row]; // get the other matrix row
1305 size_t jEnd = p.colStart(row+1-first); // step through all entries in this row
1306 auto bri = Br.begin();
1307 auto brend = Br.end();
1308 for (size_t j=p.colStart(row-first); j<jEnd; ++j)
1309 {
1310 while (bri!= brend && bri.index()<p.col(j)) // advance other row pointer until we find our entry (but do not overshoot)
1311 ++bri;
1312 if (bri!=brend && bri.index()==p.col(j)) // if the column indices coincide, apply the functor
1313 values[j] = f(values[j],*bri);
1314 }
1315 }
1316 }
1317
1318 private:
1319 std::shared_ptr<CRSChunkPattern<Index>> pat;
1320 std::vector<Mutex> scatterMutex;
1321
1322 // Raw data storage. The allocators used are NUMA allocators to guarantee local memory access.
1323 std::vector<Entry,NumaAllocator<Entry>> values; // matrix entries
1324
1325 // scatters given local matrices into the provided row range. The expected sparsity is
1326 // encoded as static template parameter in order to allow an efficient if-statement in
1327 // the innermost loop
1328 template <int sparsity, class LMIterator>
1329 void scatterRowRange(LMIterator first, LMIterator last, Index startRow, Index firstRow, Index lastRow)
1330 {
1331 // simple-minded scatter implementation
1332 for ( ; first!=last; ++first) // step through all local matrices
1333 {
1334 for (auto rgl: first->ridx()) // step through all global rows affected by this matrix
1335 {
1336 Index rg = rgl.first;
1337 if (rg >= lastRow) // global row indices in rgl.first are sorted ascendingly,
1338 break; // if we pass our last row, we can stop with this local matrix
1339 if (firstRow<=rg)
1340 scatterRow<sparsity>(*first,rg-startRow,rgl.second);
1341 }
1342 }
1343 }
1344
1345 template <int sparsity, class LM, class LIndex>
1346 void scatterRow(LM const& a, Index rg, LIndex rl)
1347 {
1348 auto cbegin = pat->colStartIterator(rg);
1349 auto cend = pat->colStartIterator(rg+1);
1350 auto vbegin = valStartIterator(rg);
1351
1352 if (LM::lumped) // only diagonal, i.e. column index is row index both in global and local matrix
1353 {
1354 auto pos = std::find(cbegin,cend,rg+pat->first());
1355 assert(pos != cend);
1356 vbegin[pos-cbegin] += a(rl,rl);
1357 }
1358 else
1359 {
1360 for (auto cgl: a.cidx())
1361 {
1362 auto pos = sparsity==0? std::lower_bound(cbegin,cend,cgl.first) // find entry in row with global column number cgl.first
1363 : sparsity==1? std::lower_bound(cbegin,cend,cgl.first) // TODO: specialized find for dense matrices.
1364 : std::find(cbegin,cend,cgl.first);
1365
1366 if (pos==cend)
1367 return; // reached end of row - skip the rest (there may be more, e.g. if we are symmetric)
1368
1369 vbegin += pos-cbegin; // move on to the found entry
1370 cbegin = pos;
1371
1372 *vbegin += a(rl,cgl.second); // add up appropriate entry from local matrix
1373 }
1374 }
1375 }
1376 };
1377
1378 //---------------------------------------------------------------------------------------------
1379
1380 template <class Entry, class Index>
1381 class NumaBCRSRow
1382 {
1383 public:
1384 typedef Index size_type;
1385 typedef Entry block_type;
1386
1389
1394
1398 Index const* getindexptr() const { return &*cBegin; }
1399
1403 block_type const* getptr() const { return &*vBegin; }
1404 block_type* getptr() { return &*vBegin; }
1405
1411
1414
1420 ConstIterator find(Index c) const
1421 {
1422 auto ci = std::lower_bound(cBegin,cEnd,c);
1423 if (ci==cEnd || *ci!=c)
1424 return end();
1425 return ConstIterator(ci,vBegin+(ci-cBegin));
1426 }
1427
1433 Iterator find(Index c)
1434 {
1435 auto ci = std::lower_bound(cBegin,cEnd,c);
1436 if (ci==cEnd || *ci!=c)
1437 return end();
1438 return Iterator(ci,vBegin+(ci-cBegin));
1439 }
1440
1451 Entry const& operator[](Index c) const
1452 {
1453 static Entry const zero(0);
1454
1455 auto ci = std::lower_bound(cBegin,cEnd,c);
1456 if (ci==cEnd || *ci!=c)
1457 return zero;
1458
1459 return vBegin[ci-cBegin];
1460 }
1461
1462
1469 Entry& operator[](Index c)
1470 {
1471 auto ci = std::lower_bound(cBegin,cEnd,c);
1472 if (ci==cEnd || *ci!=c)
1473 throw LookupException("write access to nonexistent matrix entry",__FILE__,__LINE__);
1474
1475 return vBegin[ci-cBegin];
1476 }
1477
1478
1479 protected:
1480 typedef typename std::vector<Index,NumaAllocator<Index>>::const_iterator ColIterator;
1481 typedef typename std::vector<Entry,NumaAllocator<Entry>>::iterator ValueIterator;
1482
1483 ColIterator cBegin, cEnd; // iterators giving the range of column indices
1484 ValueIterator vBegin, vEnd; // iterators giving the range of values
1485
1486 // Extracts the row begin and end iterators at the current position (if it is a valid position)
1487 void update(NumaBCRSMatrix<Entry,Index>& matrix, Index row, int chunk)
1488 {
1489 if (0<=row && row<matrix.N()) // we have a valid row in a valid chunk in front of us
1490 {
1491 auto& c = matrix.chunk(chunk);
1492 auto const& p = c.pattern();
1493 assert(p.first()<=row && row<p.last()); // check that the row is contained in the chunk
1494 Index first = p.first();
1495 cBegin = p.colStartIterator(row-first);
1496 cEnd = p.colStartIterator(row-first+1);
1497 vBegin = c.valStartIterator(row-first);
1498 vEnd = c.valStartIterator(row-first+1);
1499 }
1500 }
1501 };
1502
1503 // -----------------------------------------------------------------------------------
1504
1505 template <class Entry, class Index>
1506 class NumaBCRSMatrixConstIterator: public NumaBCRSRow<Entry,Index> // TODO: derive from const row type!
1507 {
1510 using value_type = Row;
1511
1512 public:
1513 NumaBCRSMatrixConstIterator(NumaBCRSMatrix<Entry,Index> const& matrix_, Index row_)
1514 : matrix(const_cast<NumaBCRSMatrix<Entry,Index>*>(&matrix_)), row(row_), chunk(getChunk(row_))
1515 {
1516 assert(row>=0 && row<=matrix->N());
1517 this->update(*matrix,row,chunk);
1518 }
1519
1521 void operator++()
1522 {
1523 ++row; // next row
1524 if (row==matrix->N()) // we've reached the end
1525 return; // -> skip anything else
1526 while (matrix->getPattern()->rowStart()[chunk+1]==row) // oops, we tripped into the next chunk...
1527 ++chunk; // which may be empty (zero rows), then go on
1528 this->update(*matrix,row,chunk);
1529 }
1530
1531 void operator--()
1532 {
1533 --row;
1534 while (matrix->getPattern()->rowStart()[chunk] > row) // oops, we stepped before our chunk
1535 --chunk; // previous one may be empty, then go on
1536 this->update(*matrix,row,chunk);
1537 }
1538
1539 void operator+=(Index inc)
1540 {
1541 row += inc;
1542 if (row > matrix->N())
1543 {
1544 row = matrix->N(); // we're behind the end - move to end
1545 return; // such that we compare equal to end.
1546 }
1547 chunk = getChunk(row);
1548 this->update(*matrix,row,chunk);
1549 }
1550
1551 bool operator==(Self const& it) const { return matrix==it.matrix && row==it.row; }
1552 bool operator!=(Self const& it) const { return !(*this==it); }
1554
1555 Index index() const { return row; }
1556
1558
1565 Row const& operator*() const { return *this; }
1566 Row const* operator->() const { return this; }
1568
1569 private:
1570 NumaBCRSMatrix<Entry,Index>* matrix;
1571 Index row;
1572 int chunk;
1573
1574 int getChunk(Index r) const { return matrix->getPattern()->chunk(r); }
1575 };
1576
1577
1578
1579 template <class Entry, class Index>
1581 {
1584
1585 public:
1587 : NumaBCRSMatrixConstIterator<Entry,Index>(matrix_,row_) {}
1588
1589
1590 Row& operator*() { return *this; }
1591 Row* operator->() { return this; }
1592 };
1593
1594
1595
1596
1597 } // end of namespace ThreadedMatrixDetail
1602 //---------------------------------------------------------------------------
1603 //---------------------------------------------------------------------------
1604
1615 template <class Index=size_t>
1616 class NumaCRSPatternCreator
1617 {
1619
1620 public:
1652 NumaCRSPatternCreator(Index rows_, Index cols, bool symmetric=false, int nnzPerRow=0)
1653 : rows(rows_), columns(cols)
1654 , sym(symmetric && rows==columns)
1655 {
1656 int nodes = static_cast<Index>(NumaThreadPool::instance().nodes());
1657 creators.reserve(nodes);
1658
1659 for (int i=0; i<nodes; ++i)
1660 if (sym)
1661 creators.push_back(ChunkCreator(i*rows/nodes,(i+1)*rows/nodes,cols,sym,i)); // TODO: more balanced
1662 else
1663 creators.push_back(ChunkCreator(i*rows/nodes,(i+1)*rows/nodes,cols,sym,i));
1664
1665 if (nnzPerRow > 0)
1666 parallelForNodes([this,nnzPerRow](int i, int n)
1667 {
1668 this->creators[i].reserve(nnzPerRow);
1669 },nodes);
1670 }
1671
1672 // The default destructor does not parallelize the release of memory on the NUMA chunks. Thus we
1673 // define our own destructor.
1674 ~NumaCRSPatternCreator()
1675 {
1676 int nodes = creators.size(); // (reconstructed) number of NUMA chunks to release in parallel
1677 parallelForNodes([this](int i, int n)
1678 {
1679 this->creators[i].clear();
1680 },nodes);
1681 }
1682
1688 void addElement(Index row, Index col)
1689 {
1690 addElements(&row,&row+1,&col,&col+1,true);
1691 }
1692
1701 void addDenseBlock(Index fromRow, Index toRow, Index fromCol, Index toCol)
1702 {
1703 assert(fromRow>=0 && toRow<=rows && fromRow<=toRow);
1704 assert(fromCol>=0 && toCol<=columns && fromCol<=toCol);
1705
1706 for (int r=fromRow; r<toRow; ++r) // TODO: this is probably inefficient and not
1707 for (int c=fromCol; c<toCol; ++c) // NUMA-parallelized. Consider a more
1708 addElement(r,c); // efficient implementation
1709 }
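// Editorial usage sketch (not part of the library source): a 100x100 pattern with a tridiagonal
// band plus a dense 4x4 coupling block in the upper left corner.
//
//   NumaCRSPatternCreator<size_t> creator(100,100,/*symmetric=*/false,/*nnzPerRow=*/3);
//   for (size_t i=0; i<100; ++i)
//   {
//     creator.addElement(i,i);
//     if (i+1<100) { creator.addElement(i,i+1); creator.addElement(i+1,i); }
//   }
//   creator.addDenseBlock(0,4,0,4);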
1710
1725 template <class IterRow, class IterCol>
1726 void addElements(IterRow const fromRow, IterRow const toRow, IterCol const fromCol, IterCol const toCol, bool colIsSorted=false)
1727 {
1728 assert(fromCol==toCol || *std::max_element(fromCol,toCol) < columns);
1729 assert(fromRow==toRow || *std::max_element(fromRow,toRow) < rows);
1730 for (auto& c: creators)
1731 c.addElements(fromRow,toRow,fromCol,toCol,colIsSorted);
1732 }
1733
1748 template <class RowRangeSequence, class ColRangeSequence>
1749 void addElements(RowRangeSequence const& rrs, ColRangeSequence const& crs, bool colsAreSorted=false)
1750 {
1751 assert(rrs.size() == crs.size());
1752
1753 for (auto c=std::begin(crs); c!=std::end(crs); ++c)
1754 assert((*c).empty() || (*std::max_element(std::begin(*c),std::end(*c)) < columns)); // check that all column indices are ok
1755
1756 parallelForNodes([&rrs,&crs,colsAreSorted,this](int i, int nodes) {
1757 auto c = std::begin(crs);
1758 for (auto r=std::begin(rrs); r!=std::end(rrs); ++r, ++c)
1759 this->creators[i].addElements(std::begin(*r),std::end(*r),std::begin(*c),std::end(*c),colsAreSorted);
1760 },nodes());
1761 }
1762
1770 void addAllElements()
1771 {
1772 parallelForNodes([this](int i, int nodes) {
1773 this->creators[i].addAllElements(this->columns);
1774 }, nodes());
1775 }
1776
1780 void addDiagonal()
1781 {
1782 for (Index i=0; i<std::min(rows,columns); ++i)
1783 addElement(i,i);
1784 }
1785
1791 size_t nonzeroes() const
1792 {
1793 size_t nnz = 0;
1794 for (auto const& c: creators)
1795 nnz += c.nonzeroes();
1796 return nnz;
1797 }
1798
1802 Index cols() const { return columns; }
1803
1811 void balance();
1812
1818 int nodes() const { return creators.size(); }
1819
1824 ChunkCreator const& creator(int node) const { return creators[node]; }
1825
1831 bool isSymmetric() const { return sym; }
1832
1833 private:
1834 Index rows, columns;
1835 std::vector<ChunkCreator> creators;
1836 bool sym;
1837 };
1838
1839 //---------------------------------------------------------------------------
1840
1846 template <class Index=size_t>
1847 class NumaCRSPattern
1848 {
1850
1851 public:
1852
1858 {}
1859
1863 NumaCRSPattern(NumaCRSPatternCreator<Index> const& creator)
1864 : patterns(creator.nodes()), sym(creator.isSymmetric()), rowSt(creator.nodes()+1), cols(creator.cols())
1865 {
1866 parallelForNodes([this,&creator](int i, int n)
1867 {
1868 this->patterns[i] = std::make_shared<ChunkPattern>(creator.creator(i));
1869 this->rowSt[i] = this->patterns[i]->first();
1870 },creator.nodes());
1871 rowSt[patterns.size()] = patterns.back()->last();
1872 }
1873
1887 template <class Expanded, class Condensed, class Matrix>
1888 NumaCRSPattern(Expanded const& eIndices, Condensed const& cIndices, Matrix const& mat)
1889 : patterns(NumaThreadPool::instance().nodes()), sym(false), rowSt(eIndices.size(),0), cols(eIndices.size())
1890 {
1891 assert(mat.M()==mat.N()); // works only for quadratic matrices
1892 assert(eIndices.size() <= cIndices.size());
1893
1894 for (Index r=0; r<eIndices.size(); ++r) // extract the number of entries in each
1895 { // row of the condensed matrix
1896 auto row = mat[eIndices[r]];
1897 for (auto ci=row.begin(); ci!=row.end(); ++ci)
1898 if (cIndices[ci.index()] < eIndices.size()) // only count those entries that fall in our col range
1899 ++rowSt[r];
1900 }
1901
1902 equalWeightRanges(rowSt,patterns.size()); // balance the number of entries in each chunk
1903
1904 for (int i=0; i<patterns.size(); ++i) // create patterns
1905 patterns[i] = std::make_shared<ChunkPattern>(rowSt[i],rowSt[i+1],eIndices,cIndices,mat,i);
1906 }
1907
1922 template <class Matrix>
1923 NumaCRSPattern(Matrix const& matrix, bool isSymmetric, bool isTransposed, bool symmetric)
1924 : sym(symmetric), cols(isTransposed? matrix.N(): matrix.M())
1925 {
1926 assert(isSymmetric || !symmetric);
1927
1928 // Compute an equilibrated distribution of rows to chunks (attempt as many chunks as NUMA nodes).
1929 Index rows = isTransposed? matrix.M(): matrix.N();
1930 std::vector<size_t> rowCount(rows,0);
1931 ThreadedMatrixDetail::getRowCount(matrix,isTransposed,rowCount);
1932 equalWeightRanges(rowCount,NumaThreadPool::instance().nodes()); // (reconstructed) turn per-row counts into balanced chunk boundary rows
1933
1934 // Create chunk patterns
1935 patterns.reserve(rowCount.size()-1);
1936 for (int i=0; i<rowCount.size()-1; ++i)
1937 patterns.push_back(std::make_shared<ChunkPattern>(matrix,isSymmetric,isTransposed,
1938 rowCount[i],rowCount[i+1],symmetric,i));
1939
1940 rowSt.resize(patterns.size()+1);
1941 for (int i=0; i<patterns.size(); ++i)
1942 rowSt[i] = patterns[i]->first();
1943 rowSt[patterns.size()] = patterns.back()->last();
1944 }
1945
1949 Index N() const { return rowSt.back(); }
1950
1954 Index M() const { return cols; }
1955
1961 size_t storage() const
1962 {
1963 size_t nnz = 0;
1964 for (auto const& p: patterns)
1965 nnz += p->storage();
1966 return nnz;
1967 }
1968
1972 size_t nonzeroes() const
1973 {
1974 size_t nnz = 0;
1975 for (auto const& p: patterns)
1976 nnz += p->nonzeroes();
1977 return nnz;
1978 }
1979
1983 int nodes() const { return patterns.size(); }
1984
1988 std::shared_ptr<ChunkPattern> pattern(int i) const { return patterns[i]; }
1989
1995 bool isSymmetric() const { return sym; }
1996
2003 int chunk(Index row) const
2004 {
2005 assert(0<=row && row<=N());
2006 auto it = std::upper_bound(rowSt.begin(),rowSt.end(),row); // use upper bound here, with lower_bound rows 0 and 1 would end up
2007 int c = it - rowSt.begin() - 1; // in different chunks...
2008 assert(0<=c && c<=patterns.size());
2009 assert(c==patterns.size() || (patterns[c]->first() <= row && row < patterns[c]->last()));
2010 return c;
2011 }
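// Editorial reading aid (not part of the library source): with rowStart()=={0,50,100} (two chunks
// of 50 rows each), chunk(0) and chunk(49) return 0, chunk(50) and chunk(99) return 1, and
// chunk(N())==chunk(100) returns 2, i.e. one past the last chunk.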
2012
2019 std::vector<Index> const& rowStart() const { return rowSt; }
2020
2024 bool exists(Index r, Index c) const
2025 {
2026 return patterns[chunk(r)]->exists(r,c);
2027 }
2028
2029 private:
2030 std::vector<std::shared_ptr<ChunkPattern>> patterns;
2031 bool sym;
2032 std::vector<Index> rowSt; // row start indices
2033 Index cols; // how many columns the pattern has
2034 };
2035
2036 template <class Index, class Index2>
2038 {
2039 assert(pa.N()==pb.N() && pa.M()==pb.M());
2040 Index nnzPerRow = (pa.nonzeroes()+pb.nonzeroes())/pa.N();
2041 NumaCRSPatternCreator<Index> creator(pa.N(),pa.M(),pa.isSymmetric()&&pb.isSymmetric(),nnzPerRow);
2042
2043 for (int i=0; i<pa.nodes(); ++i)
2044 {
2045 bool symmetrize = pa.isSymmetric() && !pb.isSymmetric();
2046 auto const& chunkPattern = *pa.pattern(i);
2047 Index first = chunkPattern.first();
2048 for (Index row=first; row<chunkPattern.last(); ++row)
2049 for (auto it=chunkPattern.colStartIterator(row-first); it!=chunkPattern.colStartIterator(row+1-first); ++it)
2050 {
2051 creator.addElement(row,*it);
2052 if (symmetrize)
2053 creator.addElement(*it,row);
2054 }
2055 }
2056
2057 for (int i=0; i<pb.nodes(); ++i)
2058 {
2059 bool symmetrize = pb.isSymmetric() && !pa.isSymmetric();
2060 auto const& chunkPattern = *pb.pattern(i);
2061 Index2 first = chunkPattern.first();
2062 for (Index2 row=first; row<chunkPattern.last(); ++row)
2063 for (auto it=chunkPattern.colStartIterator(row-first); it!=chunkPattern.colStartIterator(row+1-first); ++it)
2064 {
2065 creator.addElement(static_cast<Index>(row),*it);
2066 if (symmetrize)
2067 creator.addElement(*it,static_cast<Index>(row));
2068 }
2069 }
2070
2071 return NumaCRSPattern<Index>(creator);
2072 }
2073
2074 //---------------------------------------------------------------------------
2075
2113 template <class Entry, class Index=size_t>
2114 class NumaBCRSMatrix
2115 {
2118
2119 public:
2121 typedef Scalar field_type; // for compatibility with Dune
2122 typedef Entry block_type;
2123 using value_type = Entry; // or should this be row_type?
2124 using size_type = Index; // compatibility with Dune::BCRSMatrix
2125
2133
2135
2138
2141
2142
2151 NumaBCRSMatrix()
2152 : NumaBCRSMatrix(std::make_shared<NumaCRSPattern<Index>>())
2153 {}
2154
2158 NumaBCRSMatrix(Self const& A) = default; // TODO: do this in parallel
2159
2163 NumaBCRSMatrix(Self&& A) = default; // move should be efficient without parallelization
2164
2173 NumaBCRSMatrix(NumaCRSPatternCreator<Index> const& creator, Entry const& init=Entry(0))
2174 : NumaBCRSMatrix(std::make_shared<NumaCRSPattern<Index>>(creator),init)
2175 {}
2176
2183 NumaBCRSMatrix(std::shared_ptr<NumaCRSPattern<Index>> const& pattern_, Entry const& init=Entry(0))
2184 : pattern(pattern_)
2185 {
2186 for (int i=0; i<pattern->nodes(); ++i)
2187 chunks.push_back(Chunk(i));
2188
2189 parallelForNodes([this,&init] (int i, int n)
2190 {
2191 this->chunks[i] = std::move(Chunk(this->pattern->pattern(i),init));
2192 },chunks.size());
2193
2194 dps.resize(chunks.size());
2195 }
2196
2202 template <class Expanded, class Condensed, class Matrix>
2203 NumaBCRSMatrix(Expanded const& eIndices, Condensed const& cIndices, Matrix const& mat)
2204 : pattern(std::make_shared<NumaCRSPattern<Index>>(eIndices,cIndices,mat)) // TODO: move this to private, and have a convenience constructor creating cIndices under the hood
2205 {
2206 chunks.reserve(pattern->nodes());
2207 for (int i=0; i<pattern->nodes(); ++i)
2208 chunks.push_back(Chunk(pattern->pattern(i),eIndices,cIndices,mat)); // TODO: do this in parallel
2209 dps.resize(chunks.size());
2210 }
2211
2212 private:
2213
2214 // Creates a vector cIndices that for each index in the source matrix tells
2215 // the index in the extracted submatrix (or a too large sentinel value if the
2216 // index is not contained in the submatrix). I.e. result[eIndices[i]]==i
2217 // for 0 <= i < eIndices.size(), and result[j]>=eIndices.size() otherwise.
2218 template <class Expanded>
2219 static std::vector<Index> createCondensed(Expanded const& eIndices, Index n)
2220 {
2221 std::vector<Index> cIndices(n,eIndices.size());
2222 for (Index i=0; i<eIndices.size(); ++i)
2223 cIndices[eIndices[i]] = i;
2224 return cIndices;
2225 }
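// Editorial reading aid (not part of the library source): for eIndices=={2,5,7} and n==9 this
// produces cIndices=={3,3,0,3,3,1,3,2,3}; entries equal to eIndices.size()==3 mark source indices
// that do not belong to the extracted submatrix.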
2226
2227 public:
2228
2241 template <class Expanded, class Matrix>
2242 NumaBCRSMatrix(Expanded const& eIndices, Matrix const& mat)
2243 : NumaBCRSMatrix(eIndices,createCondensed(eIndices,mat.N()),mat)
2244 { }
2245
2246
2257 template <class Matrix>
2258 NumaBCRSMatrix(std::shared_ptr<NumaCRSPattern<Index>> const& pattern_, Matrix const& matrix,
2259 bool isSymmetric, bool isTransposed)
2260 : pattern(pattern_)
2261 {
2262 chunks.reserve(pattern->nodes());
2263 for (int i=0; i<pattern->nodes(); ++i)
2264 chunks.push_back(Chunk(pattern->pattern(i),matrix,isSymmetric,isTransposed)); // TODO: do this in parallel
2265 dps.resize(chunks.size());
2266 }
2267
2280 template <class OtherEntry>
2281 NumaBCRSMatrix(NumaBCRSMatrix<OtherEntry,Index> const& matrix)
2282 : NumaBCRSMatrix(matrix.getPattern(),matrix,matrix.getPattern()->isSymmetric(),false)
2283 { }
2284
2301 template <class Matrix>
2302 NumaBCRSMatrix(Matrix const& matrix,
2303 bool isSymmetric, bool isTransposed=false, bool symmetric=false)
2304 : NumaBCRSMatrix(std::make_shared<NumaCRSPattern<Index>>(matrix,isSymmetric,isTransposed,symmetric),
2305 matrix,isSymmetric,isTransposed)
2306 {
2307 assert(matrix.N()==matrix.M() || !(symmetric||isSymmetric)); // only go into symmetric mode if the matrix is square
2308 assert(Entry::rows==Entry::cols || !(symmetric||isSymmetric)); // symmetry only if the entries are square
2309
2310 // Make sure symmetry is handled correctly (even if not in debug mode). TODO: shall we take this as a feature instead of complaining via assert?
2311 symmetric &= Entry::rows==Entry::cols && matrix.N()==matrix.M();
2312 isSymmetric &= Entry::rows==Entry::cols && matrix.N()==matrix.M();
2313 }
2314
2333 NumaBCRSMatrix(NumaBCRSMatrix<Entry,Index> const& matrix,
2334 bool isSymmetric, bool isTransposed=false, bool symmetric=false)
2335 : NumaBCRSMatrix(isTransposed==false && isSymmetric==symmetric? matrix.pattern // just copy
2336 : std::make_shared<NumaCRSPattern<Index>>(matrix,isSymmetric,isTransposed,symmetric),
2337 matrix,isSymmetric,isTransposed)
2338 {
2339 assert(matrix.N()==matrix.M() || !(symmetric||isSymmetric)); // only go into symmetric mode if the matrix is square
2340 assert(Entry::rows==Entry::cols || !(symmetric||isSymmetric)); // symmetry only if the entries are square
2341 assert(isTransposed? (matrix.N()==M() && matrix.M()==N()): (matrix.N()==N() && matrix.M()==M()));
2342 assert(!isTransposed || Entry::rows==Entry::cols);
2343 }
2344
2345
2347
2359 Self& operator=(Self const& mat) = default;
2360
2366 Self& operator=(Self&& mat) = default;
2367
2371 Self& operator=(Entry const& a)
2372 {
2373 if (chunks.size()>0)
2374 parallelForNodes([this,&a] (int i, int n)
2375 {
2376 this->chunks[i] = a;
2377 },chunks.size());
2378
2379 return *this;
2380 }
2381
2385 Self& operator=(typename Entry::field_type const& a)
2386 {
2387 return *this = Entry(a);
2388 }
2389
2393 template <class Arguments, class Operation>
2394 Self& operator=(ThreadedMatrixDetail::NumaBCRSMatrixExpression<Arguments,Operation> const& e)
2395 {
2396 if (chunks.size()>0)
2397 parallelForNodes([this,&e] (int i, int n)
2398 {
2399 this->chunks[i] = e[i];
2400 },chunks.size());
2401
2402 return *this;
2403 }
2404
2410 template <class EntryB, class IndexB>
2411 Self& operator+=(NumaBCRSMatrix<EntryB,IndexB> const& B)
2412 {
2413 entrywiseOp([](Entry const& a, Entry const& b) { return a+b; }, B);
2414 return *this;
2415 }
2416
2422 template <class EntryB, class IndexB>
2423 Self& operator-=(NumaBCRSMatrix<EntryB,IndexB> const& B)
2424 {
2425 entrywiseOp([](Entry const& a, Entry const& b) { return a-b; }, B);
2426 return *this;
2427 }
2428
2434 template <class Factor>
2435 Self& operator*=(Factor a)
2436 {
2437 entrywiseOp([=](Entry const& e, Entry const&) { return a*e; }, *this);
2438 return *this;
2439 }
2440
2442
2468 iterator begin() { return iterator(*this,0); }
2469 const_iterator begin() const { return const_iterator(*this,0); }
2470
2474 iterator end() { return iterator(*this,N()); }
2475 const_iterator end() const { return const_iterator(*this,N()); }
2476
2482 row_type operator[](Index r) { return iterator(*this,r); }
2483 const_row_type operator[](Index r) const { return const_iterator(*this,r); }
2484
2486
2491 template <class RowIndices, class ColIndices>
2492 Self operator()(RowIndices const& ri, ColIndices const& ci) const
2493 {
2494 return submatrix<Self>(*this,ri,ci);
2495 };
2496
2509 Index N() const { return pattern->N(); }
2510
2514 Index M() const { return pattern->M(); }
2515
2522 size_t nonzeroes() const { return pattern->nonzeroes(); }
2523
2527 std::shared_ptr<NumaCRSPattern<Index>> getPattern() const { return pattern; }
2528
2532 Chunk& chunk(int i) { return chunks[i]; }
2533
2537 bool exists(Index r, Index c) const
2538 {
2539 return pattern->exists(r,c);
2540 }
2541
2546 field_type frobenius_norm() const
2547 {
2548 return sqrt(frobenius_norm2());
2549 }
2550
2555 field_type frobenius_norm2() const
2556 {
2557 field_type sum = 0;
2558 for (auto ri=begin(); ri!=end(); ++ri)
2559 for (auto ci=ri->begin(); ci!=ri->end(); ++ci)
2560 sum += ci->frobenius_norm2();
2561 return sum; // squared Frobenius norm; frobenius_norm() takes the square root
2562 }
2563
2565
2576 template <class X, class Y>
2577 field_type mv(X const& x, Y& y) const { return doMv(1.0,x,y,true); }
2578
2587 template <class X, class Y>
2588 void mtv(X const& x, Y& y) const { doMtv(1.0,x,y,true); }
2589
2593 template <class X, class Y>
2594 void mmv(X const& x, Y& y) const { doMv(-1.0,x,y,true); }
2595
2599 template <class X, class Y>
2600 field_type smv(field_type const& a, X const& x, Y& y) const { return doMv(a,x,y,true); }
2601
2608 template <class X, class Y>
2609 void smtv(field_type const& a, X const& x, Y& y) const { doMtv(a,x,y,true); }
2610
2614 template <class X, class Y>
2615 field_type umv(X const& x, Y& y) const { return doMv(1.0,x,y,false); }
2616
2623 template <class X, class Y>
2624 void umtv(X const& x, Y& y) const { doMtv(1.0,x,y,false); }
2625
2629 template <class X, class Y>
2630 field_type usmv(field_type const& a, X const& x, Y& y) const { return doMv(a,x,y,false); }
2631
2638 template <class X, class Y>
2639 void usmtv(field_type const& a, X const& x, Y& y) const { doMtv(a,x,y,false); }
2640
2642
2672 template <class LMIterator>
2673 void scatter(LMIterator first, LMIterator last)
2674 {
2675 for (auto& c: chunks)
2676 c.scatter(first,last);
2677 }
2678
2703 template <class RowIndices, class ColIndices, class BinaryOp=std::plus<Entry>>
2704 void scatter(DynamicMatrix<Entry> const& B, RowIndices const& rows, ColIndices const& cols,
2705 BinaryOp const& op = BinaryOp())
2706 {
2707 // TODO: implement this in the chunks for performance
2708 for (int r=0; r<rows.size(); ++r)
2709 for (int c=0; c<cols.size(); ++c)
2710 {
2711 auto ci = (*this)[rows[r]].find(cols[c]);
2712 assert(ci!=(*this)[rows[r]].end());
2713 *ci = op(*ci,B[r][c]);
2714 }
2715 }
2716
2718
2719
2720 private:
2721 std::shared_ptr<NumaCRSPattern<Index>> pattern;
2722 std::vector<Chunk> chunks;
2723 mutable std::vector<field_type> dps; // could be local to doMv, but we don't like frequent reallocations
2724
2725
2726 // Perform matrix-vector multiplication y = (initToZero? 0: y) + a A x and return a x^T Ax
2727 template <class X, class Y>
2728 Scalar doMv(Scalar const& a, X const& x, Y& y, bool initToZero) const
2729 {
2730 assert(y.size()==N());
2731 assert(x.size()==M());
2732
2733 if (chunks.size()>1)
2734 {
2735 parallelForNodes([this,a,&x,&y,initToZero] (int i, int n)
2736 {
2737 if (pattern->isSymmetric()) // if it's symmetric, we have to add the transpose of the other blocks
2738 for (int k=i; k<n; ++k)
2739 this->chunks[i].gatherMirrored(a,x,y,this->chunks[k],true,initToZero&&k==i);
2740 this->dps[i] = this->chunks[i].apply(a,x,y,initToZero&&!pattern->isSymmetric());
2741 },chunks.size());
2742
2743 return std::accumulate(dps.begin(),dps.end(),0.0);
2744 }
2745 else
2746 {
2747 if (pattern->isSymmetric())
2748 chunks[0].gatherMirrored(a,x,y,chunks[0],true,initToZero);
2749 return chunks[0].apply(a,x,y,initToZero&&!pattern->isSymmetric());
2750 }
2751 }
2752
2753 // Perform transposed matrix-vector multiplication y = (initToZero? 0: y) + a A^T x
2754 template <class X, class Y>
2755 void doMtv(Scalar const& a, X const& x, Y& y, bool initToZero) const
2756 {
2757 assert(y.size()==M());
2758 assert(x.size()==N());
2759
2760 if (pattern->isSymmetric()) // matrix is symmetric, hence we can multiply with A instead of A^T
2761 doMv(a,x,y,initToZero);
2762 else
2763 {
2764 if (initToZero)
2765 y = 0;
2766 for (auto const& chunk: chunks) // WARNING: applyTransposed is not thread-safe due to global unstructured scattering
2767 chunk.applyTransposed(a,x,y); // into y. It is possible to parallelize this, in a window-shingled fashion, but is
2768 } // it worth the implementation effort?
2769 }
2770
2771 // performs elementwise operations on two matrices (most useful for axpy-like operations):
2772 // a_ij = f(a_ij,b_ij)
2773 // This affects exactly the entries present in *both* matrices. Others are unchanged. Matrices have to have the same shape.
2774 template <class F, class EntryB, class IndexB>
2775 void entrywiseOp(F const& f, NumaBCRSMatrix<EntryB,IndexB> const& B)
2776 {
2777 assert(N()==B.N() && M()==B.M());
2778 parallelForNodes([this,&f,&B](int i, int n)
2779 {
2780 this->chunks[i].entrywiseOp(f,B);
2781 },chunks.size());
2782 }
2783 };
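// A minimal usage sketch for the class above (illustration only, not part of the original header;
// the names creator, A, x, y are hypothetical, and Dune::BlockVector requires dune/istl/bvector.hh):
//
//   NumaCRSPatternCreator<size_t> creator(4,4);             // 4x4 non-symmetric pattern
//   creator.addDiagonal();
//   for (size_t i=0; i+1<4; ++i)
//   {
//     creator.addElement(i,i+1);                            // first superdiagonal
//     creator.addElement(i+1,i);                            // first subdiagonal
//   }
//   NumaBCRSMatrix<Dune::FieldMatrix<double,1,1>,size_t> A(creator);
//   A = 1.0;                                                // assign 1 to every stored entry
//   Dune::BlockVector<Dune::FieldVector<double,1>> x(4), y(4);
//   x = 1.0;
//   A.mv(x,y);                                              // y = A*x, returns x^T A x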
2784
2785 // ------------------------------------------------------------------------------------------
2786
2794 template <class Entry, class Index>
2795 std::ostream& operator <<(std::ostream& out, NumaBCRSMatrix<Entry,Index> const& A)
2796 {
2797 for (auto ri=A.begin(); ri!=A.end(); ++ri)
2798 for (auto ci=ri->begin(); ci!=ri->end(); ++ci)
2799 out << ri.index() << ' ' << ci.index() << ' ' << *ci << std::endl;
2800 return out;
2801 }
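// Usage sketch (illustrative; A is any NumaBCRSMatrix): writes the matrix in triplet form,
// one "row column value" line per stored entry.
//
//   std::cout << A;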
2802
2803 // ----------------------------------------------------------------------------------------------
2804
2813 template <class SparseMatrix>
2814 DynamicMatrix<typename SparseMatrix::block_type> full(SparseMatrix const& A)
2815 {
2816 DynamicMatrix<typename SparseMatrix::block_type> B(A.N(),A.M());
2817 for (auto ri=A.begin(); ri!=A.end(); ++ri)
2818 for (auto ci=ri->begin(); ci!=ri->end(); ++ci)
2819 B[ri.index()][ci.index()] = *ci;
2820 return B;
2821 }
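// Usage sketch (illustrative): densify a small sparse matrix, e.g. for debugging output;
// only entries present in the sparsity pattern of A are written to the dense matrix.
//
//   auto Adense = full(A);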
2822
2840 template <class SparseMatrix, class RowRange, class ColRange>
2841 DynamicMatrix<typename SparseMatrix::block_type> full(SparseMatrix const& A, RowRange const& rows, ColRange const& cols)
2842 {
2843 using std::begin;
2844 using std::end;
2845 using Index = typename ColRange::value_type;
2846
2847 // First we prepare a lookup structure for efficient determination where
2848 // a sparse matrix entry should end up.
2849 std::vector<std::pair<Index,Index>> cidx;
2850 cidx.reserve(cols.size());
2851
2852 Index i=0;
2853 for (auto c: cols)
2854 {
2855 cidx.push_back(std::make_pair(c,i));
2856 ++i;
2857 }
2858 std::sort(begin(cidx),end(cidx),FirstLess());
2859
2860 // Now create the matrix and extract the entries.
2861 DynamicMatrix<typename SparseMatrix::block_type> B(rows.size(),cols.size());
2862
2863 for (size_t r=0; r<rows.size(); ++r)
2864 for (auto ci=A[rows[r]].begin(); ci!=A[rows[r]].end(); ++ci)
2865 {
2866 auto it = std::lower_bound(begin(cidx),end(cidx),
2867 std::make_pair(ci.index(),ci.index()),FirstLess());
2868 if (it!=end(cidx) && it->first==ci.index()) // this is an entry from our column range
2869 B[r][it->second] = *ci;
2870 }
2871
2872 return B;
2873 }
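// Usage sketch (illustrative): extract rows 0..1 and columns 2..3 of A as a dense 2x2 block.
//
//   std::vector<size_t> rows{0,1}, cols{2,3};
//   auto B = full(A,rows,cols);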
2874
2875 // ----------------------------------------------------------------------------------------------
2876
2884 template <class Entry, class Index, class Index2>
2885 NumaBCRSMatrix<Entry,Index> operator+(NumaBCRSMatrix<Entry,Index> const& A, NumaBCRSMatrix<Entry,Index2> const& B)
2886 {
2887 auto pat = std::make_shared<NumaCRSPattern<Index>>(*A.getPattern()+*B.getPattern());
2888 NumaBCRSMatrix<Entry,Index> AB(pat);
2889 assert(!A.getPattern()->isSymmetric() && !B.getPattern()->isSymmetric());
2890
2891 // TODO: parallelize
2892 for (Index row=0; row<AB.N(); ++row)
2893 {
2894 auto ABrow = AB[row];
2895 for (auto ci=A[row].begin(); ci!=A[row].end(); ++ci)
2896 ABrow[ci.index()] += *ci;
2897 for (auto ci=B[row].begin(); ci!=B[row].end(); ++ci)
2898 ABrow[static_cast<Index>(ci.index())] += *ci;
2899 }
2900 return AB;
2901 }
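// Usage sketch (illustrative): add two non-symmetric matrices of equal size; the pattern of the
// result is the union of both patterns.
//
//   auto C = A + B;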
2902
2903 // ----------------------------------------------------------------------------------------------
2904
2905
2917 template <class Scalar, int n, class Index=size_t>
2918 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,n,n>,Index> sparseUnitMatrix(Index N)
2919 {
2920 NumaCRSPatternCreator<Index> creator(N,N,false,1);
2921 for (Index i=0; i<N; ++i)
2922 creator.addElement(i,i);
2923 return NumaBCRSMatrix<Dune::FieldMatrix<Scalar,n,n>,Index>(creator,unitMatrix<Scalar,n>());
2924 }
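// Usage sketch (illustrative): an identity matrix with 100 diagonal blocks of size 2x2.
//
//   auto I = sparseUnitMatrix<double,2>(size_t(100));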
2925
2926
2927 // ----------------------------------------------------------------------------------------------
2928
2929
2941 template <class Scalar, int n, int m, class Index=size_t>
2942 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,n,m>,Index> sparseZeroMatrix(Index N, Index M)
2943 {
2944 NumaCRSPatternCreator<Index> creator(N,M,false,1);
2945 return NumaBCRSMatrix<Dune::FieldMatrix<Scalar,n,m>,Index>(creator);
2946 }
2947
2948 // ----------------------------------------------------------------------------------------------
2949
2955 template <class Entry, class Index>
2956 auto transpose(NumaBCRSMatrix<Entry,Index> const& A)
2957 {
2959 return At;
2960 }
2961
2962 // ----------------------------------------------------------------------------------------------
2963
2964
2978 template <class Target, class Source, class RowIndices, class ColIndices>
2979 Target eraseRowsNCols(Source const& A, RowIndices const& ri, ColIndices const& ci)
2980 {
2981 // rows and columns to keep
2982 std::vector<size_t> rows(A.N()), cols(A.M());
2983 std::iota(rows.begin(),rows.end(),0); // vector of rows to keep
2984 std::iota(cols.begin(),cols.end(),0); // vector of columns to keep
2985
2986 if(!ri.empty())
2987 for(std::size_t i=0; i<ri.size(); ++i)
2988 rows.erase(rows.begin() + ri[i] - i); // erase the corresponding row indices (assumes ri is sorted ascending; earlier erasures shift later positions by one)
2989
2990 if(!ci.empty())
2991 for(std::size_t i=0; i<ci.size(); ++i)
2992 cols.erase(cols.begin() + ci[i] - i); // erase the corresponding column indices (assumes ci is sorted ascending)
2993
2994 return submatrix(A,rows,cols);
2995 }
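// Usage sketch (illustrative; Mat names an assumed concrete matrix type): drop the first row
// and the last column of A.
//
//   using Mat = NumaBCRSMatrix<Dune::FieldMatrix<double,1,1>,size_t>;
//   std::vector<size_t> badRows{0}, badCols{A.M()-1};
//   Mat Ared = eraseRowsNCols<Mat>(A,badRows,badCols);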
2996
3009 template <class Target, class Source, class RowIndices>
3010 Target eraseRows(Source const& A, RowIndices const& ri)
3011 {
3012 // empty column vector (no columns to delete)
3013 std::vector<size_t> cols;
3014 return eraseRowsNCols(A,ri,cols);
3015 }
3016
3029 template <class Target, class Source, class ColIndices>
3030 Target eraseCols(Source const& A, ColIndices const& ci)
3031 {
3032 // empty row vector (no rows to delete)
3033 std::vector<size_t> rows;
3034 return eraseRowsNCols(A,rows,ci);
3035 }
3036
3037
3048 template <class Source>
3049 std::vector<size_t> nonZeroColumns(Source const& A)
3050 {
3051 // flag vector:
3052 // 1 = non zero column
3053 // 0 = zero column
3054 std::vector<size_t> nzFlags(A.M(),0);
3055 for(auto r=A.begin(); r!=A.end(); ++r)
3056 for(auto c=r->begin(); c!= r->end(); ++c)
3057 nzFlags.at(c.index()) = 1;
3058
3059 // build column index vector
3060 std::vector<size_t> nzCols;
3061 for(int i=0; i<nzFlags.size(); ++i)
3062 if(nzFlags.at(i) == 1) nzCols.push_back(i);
3063
3064 return nzCols;
3065 }
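// Usage sketch (illustrative; Mat names an assumed concrete matrix type): keep only the
// structurally nonzero columns of A.
//
//   using Mat = NumaBCRSMatrix<Dune::FieldMatrix<double,1,1>,size_t>;
//   auto keepCols = nonZeroColumns(A);
//   std::vector<size_t> allRows(A.N());
//   std::iota(allRows.begin(),allRows.end(),0);
//   Mat Acompact = submatrix<Mat>(A,allRows,keepCols);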
3066
3067
3085 template<int row2, int col2, int row1, int col1, class Scalar, class Index>
3086 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,row2,col2>,Index>
3087 reshapeBlocks(NumaBCRSMatrix<Dune::FieldMatrix<Scalar,row1,col1>,Index> const& A)
3088 {
3089 static_assert(row1 % row2 == 0);
3090 static_assert(col1 % col2 == 0);
3091 int const qN = row1/row2;
3092 int const qM = col1/col2;
3093
3094 Index N = A.N()*qN;
3095 Index M = A.M()*qM;
3096 NumaCRSPatternCreator<Index> creator(N, M, A.getPattern()->isSymmetric());
3097
3098 for(auto row=A.begin(); row!=A.end(); ++row)
3099 for(auto col=row->begin(); col!=row->end(); ++col) //loop over non-zero elements in A
3100 for(int i=0; i<qN; ++i)
3101 for(int j=0; j<qM; ++j)
3102 creator.addElement(row.index()*qN + i, col.index()*qM + j); //create qN*qM non-zero entries in B
3103
3104
3105 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,row2,col2>, Index> B(creator); //empty matrix with sparsity pattern
3106
3107 for(auto row=A.begin(); row!=A.end(); ++row)
3108 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in A
3109 {
3110 Dune::FieldMatrix<Scalar,row1,col1> mat = *col; // get block entry of A
3111 for(int k=0; k<row1; ++k)
3112 for(int l=0; l<col1; ++l)
3113 B[row.index()*qN + k/row2][col.index()*qM + l/col2][k%row2][l%col2] = mat[k][l]; // add data at corresponding entries of B
3114 }
3115
3116 return B;
3117 }
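// Usage sketch (illustrative; A22 is assumed to be a NumaBCRSMatrix<Dune::FieldMatrix<double,2,2>,size_t>):
// split the 2x2 blocks into scalar 1x1 blocks; the number of scalar rows and columns is unchanged.
//
//   auto Ascalar = reshapeBlocks<1,1>(A22);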
3118
3119
3120
3133 template<int blockrows, int blockcols, class Scalar, class Index>
3134 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index>
3135 horzcat(NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index> const& A,
3136 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index> const& B)
3137 {
3138 assert(A.N() == B.N());
3139 Index cols = (A.M() + B.M());
3140 Index rows = A.N();
3141
3142 NumaCRSPatternCreator<Index> creator(rows, cols, false);
3143
3144 //loop over non-zero elements in A
3145 for(auto row=A.begin(); row!=A.end(); ++row)
3146 for(auto col=row->begin(); col!=row->end(); ++col)
3147 creator.addElement(row.index(), col.index());
3148 //loop over non-zero elements in B
3149 for(auto row=B.begin(); row!=B.end(); ++row)
3150 for(auto col=row->begin(); col!=row->end(); ++col)
3151 creator.addElement(row.index(), col.index()+A.M()); // append columns on the right
3152
3153
3154 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index> result(creator); //empty matrix with sparsity pattern
3155
3156 for(auto row=A.begin(); row!=A.end(); ++row)
3157 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in A
3158 result[row.index()][col.index()] = *col;
3159
3160 for(auto row=B.begin(); row!=B.end(); ++row)
3161 for(auto col=row->begin(); col!=row->end(); ++col)
3162 result[row.index()][col.index()+A.M()] = *col;
3163
3164
3165 return result;
3166 }
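// Usage sketch (illustrative; A and B have the same block type and A.N()==B.N()): build [A B].
//
//   auto AB = horzcat(A,B); // AB.N()==A.N(), AB.M()==A.M()+B.M()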
3167
3180 template<int blockrows, int blockcols, class Scalar, class Index>
3181 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index>
3182 vertcat(NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index> const& A,
3183 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index> const& B)
3184 {
3185 assert(A.M() == B.M());
3186 Index cols = A.M();
3187 Index rows = (A.N()+B.N());
3188 NumaCRSPatternCreator<Index> creator(rows, cols, false);
3189
3190 //loop over non-zero elements in A
3191 for(auto row=A.begin(); row!=A.end(); ++row)
3192 for(auto col=row->begin(); col!=row->end(); ++col)
3193 creator.addElement(row.index(), col.index());
3194 //loop over non-zero elements in B
3195 for(auto row=B.begin(); row!=B.end(); ++row)
3196 for(auto col=row->begin(); col!=row->end(); ++col)
3197 creator.addElement(row.index()+A.N(), col.index()); // append rows at the bottom
3198
3199
3200 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>, Index> result(creator); //empty matrix with sparsity pattern
3201
3202 for(auto row=A.begin(); row!=A.end(); ++row)
3203 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in A
3204 result[row.index()][col.index()] = *col;
3205
3206 for(auto row=B.begin(); row!=B.end(); ++row)
3207 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in B
3208 result[row.index()+A.N()][col.index()] = *col;
3209
3210 return result;
3211 }
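// Usage sketch (illustrative; A and B have the same block type and A.M()==B.M()): stack A on top of B.
//
//   auto AoverB = vertcat(A,B); // AoverB.N()==A.N()+B.N(), AoverB.M()==A.M()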
3212
3213
3227 template<int blockrows, int blockcols, class Index=size_t>
3228 NumaBCRSMatrix<Dune::FieldMatrix<double,blockrows,blockcols>,Index>
3229 diagcat(NumaBCRSMatrix<Dune::FieldMatrix<double,blockrows,blockcols>,Index> const& A, NumaBCRSMatrix<Dune::FieldMatrix<double,blockrows,blockcols>,Index> const& B)
3230 {
3231 Index cols = (A.M()+B.M());
3232 Index rows = (A.N()+B.N());
3233 NumaCRSPatternCreator<Index> creator(rows, cols, false);
3234
3235 //loop over non-zero elements in A
3236 for(auto row=A.begin(); row!=A.end(); ++row)
3237 for(auto col=row->begin(); col!=row->end(); ++col)
3238 creator.addElement(row.index(), col.index());
3239 //loop over non-zero elements in B
3240 for(auto row=B.begin(); row!=B.end(); ++row)
3241 for(auto col=row->begin(); col!=row->end(); ++col)
3242 creator.addElement(row.index()+A.N(), col.index()+A.M()); // shift B into the lower right diagonal block
3243
3244
3245 NumaBCRSMatrix<Dune::FieldMatrix<double,blockrows,blockcols>, Index> result(creator); //empty matrix with sparsity pattern
3246
3247 for(auto row=A.begin(); row!=A.end(); ++row)
3248 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in A
3249 {
3250 auto mat = *col;
3251 for(int k=0; k < blockrows; ++k)
3252 for(int l= 0; l < blockcols; ++l)
3253 result[row.index()][col.index()][k][l] =mat[k][l];
3254 }
3255
3256 for(auto row=B.begin(); row!=B.end(); ++row)
3257 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in B
3258 {
3259 auto mat = *col;
3260 for(int k=0; k < blockrows; ++k)
3261 for(int l= 0; l < blockcols; ++l)
3262 result[row.index()+A.N()][col.index()+A.M()][k][l] = mat[k][l];
3263 }
3264
3265
3266 return result;
3267 }
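// Usage sketch (illustrative; note that diagcat is restricted to double-valued blocks): block-diagonal
// composition of A and B; the off-diagonal blocks remain structurally empty.
//
//   auto D = diagcat(A,B); // D.N()==A.N()+B.N(), D.M()==A.M()+B.M()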
3268
3269} // end namespace Kaskade
3270
3271#endif
An exception that can be thrown whenever a key lookup fails.
std::vector< SparseIndexInt > ridx
row indices
Definition: triplet.hh:669
std::vector< SparseIndexInt > cidx
column indices
Definition: triplet.hh:671
std::vector< Scalar > data
data
Definition: triplet.hh:673
An STL allocator that uses memory of a specific NUMA node only.
Definition: threading.hh:653
A NUMA-aware compressed row storage matrix adhering mostly to the Dune ISTL interface (to complete....
iterator end()
returns an iterator pointing one past the last row
Self & operator-=(NumaBCRSMatrix< EntryB, IndexB > const &B)
Subtracts a sparse matrix from this one.
NumaBCRSMatrix(Expanded const &eIndices, Condensed const &cIndices, Matrix const &mat)
Deprecated. Use simpler version without cIndices instead.
const_iterator const_row_type
void mmv(X const &x, Y &y) const
Matrix-vector multiplication y = -Ax.
NumaBCRSMatrix(NumaBCRSMatrix< OtherEntry, Index > const &matrix)
Constructor copying a given matrix.
NumaBCRSMatrix(Matrix const &matrix, bool isSymmetric, bool isTransposed=false, bool symmetric=false)
Constructor.
Self & operator=(Self &&mat)=default
Move assignment.
Chunk & chunk(int i)
Obtains a reference to the given chunk.
Self & operator=(ThreadedMatrixDetail::NumaBCRSMatrixExpression< Arguments, Operation > const &e)
NOT YET IMPLEMENTED Assigns the given Numa matrix expression.
Self & operator+=(NumaBCRSMatrix< EntryB, IndexB > const &B)
Adds a sparse matrix to this one.
size_t nonzeroes() const
Returns the number of structurally nonzero elements.
NumaBCRSMatrix(Self &&A)=default
Move constructor.
Self & operator*=(Factor a)
Multiplication with a "scalar".
NumaBCRSMatrix(std::shared_ptr< NumaCRSPattern< Index > > const &pattern_, Entry const &init=Entry(0))
Constructor creating a matrix from a given sparsity pattern.
std::shared_ptr< NumaCRSPattern< Index > > getPattern() const
Returns a pointer to the sparsity pattern.
Self & operator=(typename Entry::field_type const &a)
Assigns the given scalar value to each entry.
NumaBCRSMatrix(NumaCRSPatternCreator< Index > const &creator, Entry const &init=Entry(0))
Constructor creating a matrix from a given sparsity pattern creator.
bool exists(Index r, Index c) const
returns true if (r,c) is structurally nonzero
void scatter(DynamicMatrix< Entry > const &B, RowIndices const &rows, ColIndices const &cols, BinaryOp const &op=BinaryOp())
Scatters the given submatrix into the matrix.
field_type smv(field_type const &a, X const &x, Y &y) const
Matrix-vector multiplication y = a A x.
void scatter(LMIterator first, LMIterator last)
Scatters given sub-matrices into the matrix by adding up their entries.
const_iterator end() const
NumaBCRSMatrix()
Constructs an empty 0x0 matrix.
NumaBCRSMatrix(Self const &A)=default
Copy constructor.
NumaBCRSMatrix(std::shared_ptr< NumaCRSPattern< Index > > const &pattern_, Matrix const &matrix, bool isSymmetric, bool isTransposed)
Constructor copying a given matrix.
ThreadedMatrixDetail::NumaBCRSMatrixIterator< Entry, Index > iterator
iterator type stepping through the rows
Self & operator=(Entry const &a)
Assigns the given value to each entry.
NumaBCRSMatrix(Expanded const &eIndices, Matrix const &mat)
Indexed submatrix constructor.
iterator begin()
returns an iterator to the first row
field_type usmv(field_type const &a, X const &x, Y &y) const
Matrix-vector multiplication y += a A x and subsequent computation of x^T A x if A is square.
row_type operator[](Index r)
Subscript operator allowing random access to rows.
const_row_type operator[](Index r) const
void umtv(X const &x, Y &y) const
Matrix-vector multiplication y += A^T x.
const_iterator begin() const
typename iterator::Iterator ColIterator
column iterator stepping through the entries of a row
field_type frobenius_norm() const
Computes the Frobenius norm ||A||_F.
ThreadedMatrixDetail::NumaBCRSMatrixConstIterator< Entry, Index > const_iterator
NumaBCRSMatrix(NumaBCRSMatrix< Entry, Index > const &matrix, bool isSymmetric, bool isTransposed=false, bool symmetric=false)
Constructor.
Self & operator=(Self const &mat)=default
Copy assignment.
void smtv(field_type const &a, X const &x, Y &y) const
Matrix-vector multiplication y = a A^T x.
void usmtv(field_type const &a, X const &x, Y &y) const
Matrix-vector multiplication y += a A^T x.
field_type umv(X const &x, Y &y) const
Matrix-vector multiplication y += A x and subsequent computation of x^T A x if A is square.
Self operator()(RowIndices const &ri, ColIndices const &ci) const
Index M() const
The number of columns.
void mtv(X const &x, Y &y) const
Matrix-vector multiplication y = A^T x.
typename iterator::ConstIterator ConstColIterator
column iterator stepping through the const entries of a row
field_type frobenius_norm2() const
Computes the square of the Frobenius norm, ||A||_F^2.
Index N() const
The number of rows.
ScalarType< Entry > Scalar
field_type mv(X const &x, Y &y) const
Matrix-vector multiplication y = A x with computation of x^T A x if A is square.
A NUMA-aware creator for matrix sparsity patterns.
Index cols() const
The number of columns.
void addElements(IterRow const fromRow, IterRow const toRow, IterCol const fromCol, IterCol const toCol, bool colIsSorted=false)
Enters entries into the sparsity pattern.
ChunkCreator const & creator(int node) const
Returns the chunk creator.
void balance()
Redistributes the rows to the NUMA chunks in order to have the same number of entries in each chunk.
void addElement(Index row, Index col)
Enters a single entry into the sparsity pattern.
int nodes() const
Returns the number of NUMA nodes/chunks used.
void addElements(RowRangeSequence const &rrs, ColRangeSequence const &crs, bool colsAreSorted=false)
Enters elements into the sparsity pattern.
size_t nonzeroes() const
The number of structurally nonzero elements.
void addAllElements()
Enters all possible elements (defining a dense matrix).
bool isSymmetric() const
Returns the symmetry status of the pattern.
NumaCRSPatternCreator(Index rows_, Index cols, bool symmetric=false, int nnzPerRow=0)
Constructs a rows times cols matrix sparsity structure creator.
void addDiagonal()
Enters the diagonal elements.
void addDenseBlock(Index fromRow, Index toRow, Index fromCol, Index toCol)
Enters a contiguous dense block into the sparsity pattern.
A NUMA-aware compressed row storage sparsity pattern.
std::shared_ptr< ChunkPattern > pattern(int i) const
Returns the individual patterns.
int nodes() const
Returns the number of NUMA nodes/chunks used.
int chunk(Index row) const
Returns the number of the chunk containing the given row.
Index N() const
The number of rows.
NumaCRSPattern()
Constructs an empty 0x0 pattern.
NumaCRSPattern(NumaCRSPatternCreator< Index > const &creator)
Constructor creating a sparsity pattern from the given creator.
NumaCRSPattern(Expanded const &eIndices, Condensed const &cIndices, Matrix const &mat)
Constructor.
bool isSymmetric() const
Returns the symmetry status of the pattern.
Index M() const
The number of columns.
std::vector< Index > const & rowStart() const
Returns the limiting row indices between the chunks.
size_t nonzeroes() const
Returns the number of structurally nonzero entries.
size_t storage() const
Returns the number of stored entries.
bool exists(Index r, Index c) const
queries whether an entry is present in the sparsity pattern
NumaCRSPattern(Matrix const &matrix, bool isSymmetric, bool isTransposed, bool symmetric)
Constructor extracting the sparsity pattern of a given matrix (usually a Dune::BCRSMatrix).
Implementation of thread pools suitable for parallelization of (more or less) memory-bound algorithms...
Definition: threading.hh:293
static NumaThreadPool & instance(int maxThreads=std::numeric_limits< int >::max())
Returns a globally unique thread pool instance.
int nodes() const
Reports the number of NUMA nodes (i.e., memory interfaces/CPU sockets)
Definition: threading.hh:316
This class stores a couple of compressed row storage rows in memory allocated locally on a NUMA node.
CRSChunk(std::shared_ptr< CRSChunkPattern< Index > > const &pattern_, MatrixAsTriplet< Scalar, Index2 > const &matrix, bool isSymmetric, bool isTransposed)
Constructor.
CRSChunk(int node)
Constructor initializing an empty Chunk.
CRSChunkPattern< Index > const & pattern() const
Returns the sparsity pattern of this chunk.
void scatter(LMIterator first, LMIterator last)
Scatters given sub-matrices into the chunk by adding up their entries.
CRSChunk(std::shared_ptr< CRSChunkPattern< Index > > const &pattern_, Entry const &init)
Constructor.
auto valStartIterator(Index row)
Returns an iterator to the start of the values for the given local row index.
void applyTransposed(Scalar a, Domain const &x, Range &y) const
Transposed matrix-vector multiplication y = a A^T x or y = y + a A^T x.
Scalar apply(Scalar a, Domain const &x, Range &y, bool initialize) const
Matrix-vector multiplication y = a A x or y = y + a A x and computation of x^T A x.
CRSChunk(std::shared_ptr< CRSChunkPattern< Index > > const &pattern_, Matrix const &matrix, bool isSymmetric, bool isTransposed)
Constructor.
Self & operator=(Self const &c)=default
Copy assignment.
void gatherMirrored(Scalar a, Domain const &x, Range &y, CRSChunk< Entry, Index > const &block, bool subdiagonal, bool initToZero) const
Matrix-vector multiplication of transposed parts.
void entrywiseOp(F const &f, NumaBCRSMatrix< EntryB, IndexB > const &B)
Performs entry-wise operations on our own matrix and some other.
CRSChunk(std::shared_ptr< CRSChunkPattern< Index > > const &pattern_, Expanded const &eIndices, Condensed const &cIndices, Matrix const &matrix)
Constructor.
Self & operator=(Value const &a)
Assigns the given value to each entry.
CRSChunk(Matrix const &matrix, bool isSymmetric, bool isTransposed, Index firstRow, Index lastRow, bool symmetric, int node)
Constructor.
Self & operator=(ThreadedMatrixDetail::NumaBCRSMatrixExpressionChunk< Arguments, Operation > const &e)
Assigns the given matrix expression componentwise.
typename EntryTraits< Entry >::field_type Scalar
A class supporting two-stage construction of sparsity patterns of NUMA matrix chunks.
size_t nonzeroes() const
Returns the number of stored entries (structurally nonzero elements).
size_t balanceBackward(size_t covered, size_t const nnz, int chunks, std::vector< IndexArray > &moveRows)
Moves rows in and out of the chunk in order to equilibrate the number of nonzeroes.
CRSChunkPatternCreator(Index firstRow, Index lastRow, Index ncols, bool symmetric, int node)
Constructor.
void addAllElements(Index columns)
Enters all possible elements (defining a dense chunk).
void addElements(IterRow fromRow, IterRow const toRow, IterCol const fromCol, IterCol const toCol, bool colIsSorted=false)
Enters elements into the sparsity pattern.
IndexArray const & row(Index i) const
Returns the sorted and unique column indices of elements in row i.
void clear()
Clears the complete data, handing back the memory.
size_t balanceForward(size_t const covered, size_t const nnz, int chunks, std::vector< IndexArray > &moveRows)
Moves rows in and out of the chunk in order to equilibrate the number of nonzeroes.
std::vector< Index > IndexArray
An STL container holding a sequence of indices.
void reserve(Index nnzPerRow=8)
Reserves a certain amount of entries per row.
This class maintains the sparsity structure of a couple of matrix rows (a NUMA matrix chunk).
bool exists(Index r, Index c) const
returns true if the entry with global row and column indices is present in the sparsity pattern
size_t colStart(Index row) const
returns the index from which on the entries in given local row are stored
size_t storage() const
Returns the number of stored entries.
CRSChunkPattern(CRSChunkPatternCreator< Index > const &creator)
Constructor.
CRSChunkPattern(Index first, Index last, Expanded const &eIndices, Condensed const &cIndices, Matrix const &mat, int node)
Constructor.
Index nonzeroesPerRow() const
Returns the average number of nonzeroes per row.
size_t position(Index r, Index c) const
returns the position of an element with given global row and column indices
CRSChunkPattern(Matrix const &matrix, bool isSymmetric, bool isTransposed, Index firstRow, Index lastRow, bool symmetric, int node)
Constructor.
CRSChunkPattern(MatrixAsTriplet< Scalar, Index2 > const &matrix, bool isSymmetric, bool isTransposed, Index firstRow, Index lastRow, bool symmetric, int node)
Constructor.
size_t nonzeroes() const
Returns the number of structurally nonzero entries.
std::vector< Index, NumaAllocator< Index > >::const_iterator colStartIterator(Index row) const
an iterator pointing to the start of the column index for the given local row
A base class representing basic meta information about sparsity patterns of NUMA matrix chunks.
Index columns() const
number of columns in the matrix
int node() const
the NUMA node on which to allocate the memory
Index last() const
end of the half-open row range
CRSChunkPatternInfo(Index first_, Index last_, Index cols_, bool symmetric, int node)
Constructor.
Index first() const
start of the covered row range
bool symmetric() const
if true, only the lower triangular part is stored
NumaBCRSMatrixConstIterator(NumaBCRSMatrix< Entry, Index > const &matrix_, Index row_)
Row const & operator*() const
Dereferencing yields the row.
An iterator stepping through all entries in a row.
NumaBCRSMatrixConstRowIterator(ColIterator col_, ValueIterator val_)
std::vector< Entry, NumaAllocator< Entry > >::const_iterator ValueIterator
std::vector< Index, NumaAllocator< Index > >::const_iterator ColIterator
NumaBCRSMatrixExpressionChunk< Arguments, Operation > const & operator[](int i) const
NumaBCRSMatrixIterator(NumaBCRSMatrix< Entry, Index > &matrix_, Index row_)
std::vector< Entry, NumaAllocator< Entry > >::iterator ValueIterator
NumaBCRSMatrixRowIterator(ColIterator col_, ValueIterator val_)
std::vector< Index, NumaAllocator< Index > >::const_iterator ColIterator
Entry const & operator[](Index c) const
Random read access to row entries by global column index.
NumaBCRSMatrixRowIterator< Entry, Index > Iterator
void update(NumaBCRSMatrix< Entry, Index > &matrix, Index row, int chunk)
Entry & operator[](Index c)
Random write access to row entries by global column index.
Iterator find(Index c)
Looks up a particular entry specified by column index.
ConstIterator begin() const
Start of row entries.
size_type size() const
Returns the number of entries in the current row.
Index const * getindexptr() const
Low-level access to array of (sorted) column indices in this row.
block_type const * getptr() const
Low-level access to array of values in this row.
std::vector< Index, NumaAllocator< Index > >::const_iterator ColIterator
NumaBCRSMatrixConstRowIterator< Entry, Index > ConstIterator
std::vector< Entry, NumaAllocator< Entry > >::iterator ValueIterator
ConstIterator find(Index c) const
Looks up a particular entry specified by column index.
This file contains various utility functions that augment the basic functionality of Dune.
NumaBCRSMatrix< Dune::FieldMatrix< Scalar, row2, col2 >, Index > reshapeBlocks(NumaBCRSMatrix< Dune::FieldMatrix< Scalar, row1, col1 >, Index > const &A)
reshapes the NumaBCRSMatrix entry block structure
NumaBCRSMatrix< Dune::FieldMatrix< Scalar, n, n >, Index > sparseUnitMatrix(Index N)
creates a unit matrix in NUMA block compressed row storage format
NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > vertcat(NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > const &A, NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > const &B)
concatenate two matrices vertically
auto transpose(NumaBCRSMatrix< Entry, Index > const &A)
Creates the transposed sparse matrix A^T.
DynamicMatrix< typename SparseMatrix::block_type > full(SparseMatrix const &A)
Converts a sparse NumaBCRSMatrix to a dense matrix.
Dune::FieldVector< T, n > max(Dune::FieldVector< T, n > x, Dune::FieldVector< T, n > const &y)
Componentwise maximum.
Definition: fixdune.hh:110
Target eraseCols(Source const &A, ColIndices const &ci)
"deletes" columns by extracting a copy of the matrix keeping only the non-deleted columns.
NumaBCRSMatrix< Dune::FieldMatrix< Scalar, n, m >, Index > sparseZeroMatrix(Index N, Index M)
creates a zero matrix in NUMA block compressed row storage format
DynamicMatrix< typename SparseMatrix::block_type > full(SparseMatrix const &A, RowRange const &rows, ColRange const &cols)
Converts a subrange of a sparse matrix to a dense matrix.
Target eraseRows(Source const &A, RowIndices const &ri)
"deletes" rows by extracting a copy of the matrix keeping only the non-deleted rows.
NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > horzcat(NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > const &A, NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > const &B)
concatenate two matrices horizontally
Target eraseRowsNCols(Source const &A, RowIndices const &ri, ColIndices const &ci)
"deletes" rows/columns by extracting a copy of the matrix keeping the non-deleted rows/columns.
std::vector< size_t > nonZeroColumns(Source const &A)
returns all indices of nonzero columns.
NumaBCRSMatrix< Dune::FieldMatrix< double, blockrows, blockcols >, Index > diagcat(NumaBCRSMatrix< Dune::FieldMatrix< double, blockrows, blockcols >, Index > const &A, NumaBCRSMatrix< Dune::FieldMatrix< double, blockrows, blockcols >, Index > const &B)
concatenate two matrices diagonally; the resulting matrix is zero off the block diagonal
NumaBCRSMatrix< Entry, Index > operator+(NumaBCRSMatrix< Entry, Index > const &A, NumaBCRSMatrix< Entry, Index2 > const &B)
Matrix addition A + B. The sparsity patterns of both matrices can be different. The size of the matrices h...
Dune::FieldVector< T, n > min(Dune::FieldVector< T, n > x, Dune::FieldVector< T, n > const &y)
Componentwise minimum.
Definition: fixdune.hh:122
Index uniformWeightRangeStart(BlockIndex i, BlockIndex n, Index m)
Computes partitioning points of ranges for uniform weight distributions.
Definition: threading.hh:75
void parallelForNodes(Func const &f, int maxTasks=std::numeric_limits< int >::max())
A parallel for loop that executes the given functor in parallel on different NUMA nodes.
Definition: threading.hh:604
void equalWeightRanges(std::vector< size_t > &x, size_t n)
Computes partitioning points such that the sum of weights in each partition is roughly the same.
Scalar distance(Point< Scalar, dim > const &first, Point< Scalar, dim > const &second)
void getRowCount(Dune::BCRSMatrix< Entry, Allocator > const &matrix, bool isTransposed, std::vector< Index > &rowCount)
std::ostream & operator<<(std::ostream &s, std::vector< Scalar > const &vec)
Definition: dune_bridge.hh:47
typename GetScalar< Type >::type ScalarType
Extracts the scalar field type from linear algebra data types.
T transpose(T x)
NumaCRSPattern< Index > operator+(NumaCRSPattern< Index > const &pa, NumaCRSPattern< Index2 > const &pb)
Target submatrix(Source const &A, RowIndices const &ri, ColIndices const &ci)
Entry field_type
Definition: scalar.hh:77
A comparator functor that supports sorting std::pair by their first component.
Definition: firstless.hh:22
static void init(Index first, Index last, Matrix const &matrix, std::vector< size_t, NumaAllocator< size_t > > &colStart, std::vector< Index, NumaAllocator< Index > > &cols, std::vector< Entry, NumaAllocator< Entry > > &values, std::vector< Index > const &nRowEntries)
static void init(Index first, Index last, Matrix const &matrix, std::vector< size_t, NumaAllocator< size_t > > &colStart, std::vector< Index, NumaAllocator< Index > > &cols, std::vector< Entry, NumaAllocator< Entry > > &values, std::vector< Index > const &nRowEntries)
static void init(Index first, Index last, Matrix const &matrix, std::vector< size_t, NumaAllocator< size_t > > &colStart, std::vector< Index, NumaAllocator< Index > > &cols, std::vector< Entry, NumaAllocator< Entry > > &values, std::vector< Index > const &)
static void copy(Entry const &from, Entry &to, bool isTransposed)
static void copy(From const &from, To &to, bool isTransposed)
static void copy(From const &from, To &to, bool isTransposed)