kaskade7/html/conjugation_8hh_source.html

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/*                                                                           */

/*  This file is part of the library KASKADE 7                               */

/*  https://www.zib.de/research/projects/kaskade7-finite-element-toolbox     */

/*                                                                           */

/*  Copyright (C) 2002-2020 Zuse Institute Berlin                            */

/*                                                                           */

/*  KASKADE 7 is distributed under the terms of the ZIB Academic License.    */

/*    see $KASKADE/academic.txt                                              */

/*                                                                           */

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


#ifndef CONJUGATION_HH

#define CONJUGATION_HH


#include <cassert>

#include <memory>


#include <boost/timer/timer.hpp>


#include "dune/istl/bcrsmatrix.hh"

#include "dune/istl/matrixindexset.hh"


#include "linalg/localMatrices.hh"

#include "linalg/threadedMatrix.hh"


#include "utilities/timing.hh"


namespace Kaskade

{

  /**
   * \brief Creates the sparsity pattern of the conjugation product \f$ C = P^T A P \f$.
   *
   * Since \f$ C_{ij} = \sum_{k,l} P_{ki} P_{lj} A_{kl} \f$, every stored entry \f$ A_{kl} \f$
   * touches all \f$ C_{ij} \f$ for which \f$ P_{ki} \f$ and \f$ P_{lj} \f$ are stored. The
   * routine therefore walks over the stored entries of A and flags all index combinations
   * (i,j) taken from rows k and l of P.
   *
   * \param P matrix with scalar (1x1) entries; must have as many rows as A (checked by assert)
   * \param A the central matrix; must be square (checked by assert)
   * \param onlyLowerTriangle if true, every index pair is entered as (max(i,j),min(i,j)),
   *                          i.e. entries above the diagonal are mirrored below it
   * \return the index set of dimension P.M() x P.M()
   */
  template <class Scalar, class Entry>
  std::unique_ptr<Dune::MatrixIndexSet> conjugationPattern(Dune::BCRSMatrix<Dune::FieldMatrix<Scalar,1,1> > const& P,
                                                           Dune::BCRSMatrix<Entry> const& A,
                                                           bool onlyLowerTriangle = false)
  {
    assert(A.N()==A.M());
    assert(A.N()==P.N());

    auto pattern = std::make_unique<Dune::MatrixIndexSet>(P.M(),P.M());

    // Visit each stored entry A_{kl} ...
    for (int row=0; row<A.N(); ++row)
    {
      auto const& aRow = A[row];
      auto const& pRow = P[row];

      for (auto a=aRow.begin(); a!=aRow.end(); ++a)
      {
        int const col = a.index();
        auto const& pCol = P[col];

        // ... and flag every combination of column indices from rows k and l of P.
        for (auto pi=pRow.begin(); pi!=pRow.end(); ++pi)
          for (auto pj=pCol.begin(); pj!=pCol.end(); ++pj)
          {
            auto const i = pi.index();
            auto const j = pj.index();

            if (onlyLowerTriangle && i<j)
              pattern->add(j,i);          // mirror into the lower triangle
            else
              pattern->add(i,j);
          }
      }
    }

    // A possible alternative: C_{ij} is structurally nonzero iff there are l,k with stored
    // P_{li}, A_{lk}, and P_{kj}. The column indices J of row i could thus be gathered by
    //   (i)   collecting all l with P_{li} stored -> L  [needs column access to P, i.e. its transpose pattern]
    //   (ii)  collecting all k with A_{lk} stored for some l in L -> K  [sort and remove duplicates]
    //   (iii) collecting all j with P_{kj} stored for some k in K -> J
    // Pros: write accesses are separated (easy to parallelize) and less scattered.
    // Cons: more complex implementation, and the transpose pattern of P is required.
    return pattern;
  }


  /**
   * \brief Creates the conjugation product \f$ P^T A P \f$ of NUMA-aware sparse matrices.
   *
   * The routine works in three timed phases (sections "conjugation pattern", "matrix creation",
   * and "conjugation scatter" of the default Timings instance):
   * -# the sparsity pattern of the result is assembled sequentially,
   * -# the result matrix is created from that pattern,
   * -# the values \f$ P_{ki}^T A_{kl} P_{lj} \f$ are computed in parallel over row ranges of A
   *    and handed to LocalMatrices objects attached to the result matrix.
   *
   * \param P matrix with as many rows as A (checked by assert)
   * \param A square matrix (checked by assert) with square entries (checked by static_assert)
   * \param onlyLowerTriangle forwarded to the pattern creator; for subdiagonal entries (k>l) of A the
   *                          transposed index combinations are entered into the pattern as well.
   *                          NOTE: the scatter phase calls abort() as soon as it meets such a
   *                          subdiagonal entry while this flag is set, i.e. the implicit entry
   *                          A_lk is not treated yet.
   * \param createDiagonal if true, the diagonal elements are entered into the sparsity pattern
   * \return a NumaBCRSMatrix with index type IndexP and square entries of size entrySize (see below),
   *         built on a P.M() x P.M() pattern
   */
  template <class IndexP, class EntryP, class IndexA, class EntryA>
  auto conjugation(NumaBCRSMatrix<EntryP,IndexP> const& P, NumaBCRSMatrix<EntryA,IndexA> const& A,
                   bool onlyLowerTriangle = false, bool createDiagonal=false)
  {
    assert(A.N()==A.M());
    static_assert(EntryA::rows==EntryA::cols,"central matrix entries in conjugation have to be square");
    assert(A.N()==P.N());

    Timings& timer = Timings::instance();

    // First create the sparsity pattern
    timer.start("conjugation pattern");
    NumaCRSPatternCreator<IndexP> creator(P.M(),P.M(),onlyLowerTriangle);

    // If C = P^T A P, we have that C_{ij} = \sum_{k,l} P_{ki} P_{lj} A_{kl}. Hence, the entry A_{kl} contributes to
    // all C_{ij} for which there are nonzero entries P_{ki} and P_{lj} in the rows k and l of P. Thus we can simply
    // run through all nonzeros A_{kl} of A, look up the column indices i,j of rows k and l of P, and flag C_{ij}
    // as nonzero.

    { // just a new scope (keeps this index-only helper and its buffers separate from the ones used for the scatter below)
      // helper routine for extracting all column indices of a row; the output vector is cleared first
      auto getColumnIndices = [] (auto const& row, std::vector<IndexP>& ci)
      {
        ci.clear();
        for (auto i=row.begin(); i!=row.end(); ++i)       // indices i for which Pki != 0
          ci.push_back(i.index());
      };
      std::vector<IndexP> is, js;                           // buffers reused for all rows

      // Step through all entries of A
      for (IndexA k=0; k<A.N(); ++k)
      {
        getColumnIndices(P[k],is);                          // indices i for which Pki != 0

        auto row = A[k];
        for (auto ca=row.begin(); ca!=row.end(); ++ca)
        {
          IndexA const l = ca.index();
          getColumnIndices(P[l],js);                        // indices j for which Plj != 0

          // add all combinations i,j
          creator.addElements(std::begin(is),std::end(is),std::begin(js),std::end(js));
          if (onlyLowerTriangle && k>l)                     // subdiagonal entry (k,l) of A -> entry (l,k) must be treated implicitly:
            // add all combinations (j,i)
            creator.addElements(std::begin(js),std::end(js),std::begin(is),std::end(is));
        }
      }
    }

    if (createDiagonal)
      creator.addDiagonal();
    timer.stop("conjugation pattern");

    // An alternative way of computing the sparsity pattern would be to use that C_{ij} is structurally nonzero
    // exactly if there are l and k with nonzero P_{li}, A_{lk}, and P_{kj}.
    // Hence we can obtain the column index set J of row i directly by the following steps:
    // (i) find all l with P_{li} nonzero -> L  [requires to access columns of P - compute the transpose patterns once]
    // (ii) find all k with A_{lk} nonzero for some l in L -> K  [probably sorting K and removing doubled entries would
    //      be a good idea here]
    // (iii) find all j with P_{kj} nonzero for some k in K -> J
    // Compared to the above implementation this would have the following (dis)advantages
    // + easy to do in parallel (since write operations are separated)
    // + fewer scattered write accesses to memory
    // - more complex implementation
    // - requires the transpose pattern of P

    // Create the sparse matrix. First define the resulting entry size: If P has scalar entries, this is just
    // the (possibly matrix-valued) size of entries in A. Otherwise, we take the triple product of P^T A P entries.
    // Note that rows/cols are enum - need to cast to int here.
    constexpr int entrySize = (EntryP::rows==1 && EntryP::cols==1)? (int)EntryA::rows: (int)EntryP::cols;
    using Entry = Dune::FieldMatrix<typename EntryA::value_type,entrySize,entrySize>;

    timer.start("matrix creation");
    NumaBCRSMatrix<Entry,IndexP> pap(creator);              // NOTE(review): presumably zero-initialized by this constructor - confirm
    timer.stop("matrix creation");

    // Fill the sparse matrix PAP. This is done as before by stepping through all Akl entries and scatter
    // Pki*Plj*Akl into PAPij.
    //
    // An alternative way of computing P^TAP would be a gather operation with inverted loop order:
    // Cij = sum_kl Pki*Plj*Akl. This requires P^T for efficient determination of required kl indices.
    // While the transpose construction is efficient, the gather implementation is not (tested 2016-01-17),
    // presumably because A is larger than PAP and the scattered accesses have a worse locality.
    // Sequential performance was more than 10-fold slower than the scatter implementation below, so
    // we stick to the scatter.
    //
    // A second alternative is to create a triplet matrix first. This appears to be a factor 3 slower in
    // sequential implementation, and incurs a high memory footprint as several entries are duplicate.

    // helper routine copying all entry values of a row into vi (cleared first)
    auto getEntryValues = [] (auto const& row, std::vector<EntryP>& vi)
    {
      vi.clear();
      for (auto i=row.begin(); i!=row.end(); ++i)       // values Pki of all stored entries of the row
        vi.push_back(*i);
    };

    auto getColumnIndices = [] (auto const& row, auto& ci)     // computes (global,local) column index pairs of row k of P
    {
      ci.clear();
      int idx = 0;                                             // local index: position of the entry within the row
      for (auto i=row.begin(); i!=row.end(); ++i, ++idx)       // indices i for which Pki != 0
        ci.push_back(std::make_pair(i.index(),idx));
    };

    timer.start("conjugation scatter");
    parallelFor([&](size_t block, size_t nBlocks)
    {
      // Each task processes the row range [rowStart,rowEnd) of A.
      size_t rowStart = uniformWeightRangeStart(block,nBlocks,A.N());
      size_t rowEnd   = uniformWeightRangeStart(block+1,nBlocks,A.N());
      std::vector<EntryP> pk, pl;                         // entry values of rows k and l of P
      using SortedIndices = std::vector<std::pair<IndexP,int>>;
      SortedIndices is, js;                               // (global,local) column indices of rows k and l of P
      // Task-local collection of local matrices associated with pap.
      // NOTE(review): presumably the collected local matrices are scattered into pap by LocalMatrices
      // itself (e.g. on destruction) - confirm against localMatrices.hh.
      LocalMatrices<Entry,false,SortedIndices,SortedIndices> localMatrices(pap);
      for (IndexA k=rowStart; k<rowEnd; ++k)
      {
        auto Pk = P[k];
        getColumnIndices(Pk,is);                          // indices i for which Pki != 0
        getEntryValues(Pk,pk);

        auto row = A[k];
        for (auto ca=row.begin(); ca!=row.end(); ++ca)
        {
          IndexA const l = ca.index();                    // column index of Akl
          auto Pl = P[l];
          getColumnIndices(Pl,js);                        // indices j for which Plj != 0
          getEntryValues(Pl,pl);

          // For each entry Akl of A we have to create one local matrix.
          localMatrices.push_back(is,js);

          auto Akl = *ca;
          for (int i=0; i<is.size(); ++i)                 // just scatter Akl to all affected PAP entries
          {
            auto pki = normalForm(pk[i]);
            auto Pki_Akl = transpose(pki) * Akl;          // loop-invariant left factor Pki^T * Akl
            for (int j=0; j<pl.size(); ++j)
            {
              auto plj = normalForm(pl[j]);
              localMatrices.back()(i,j) = Pki_Akl * plj;
            }
          }

          if (onlyLowerTriangle && k>l)
          {
            // treat Alk entry
            // Not implemented: the implicit entry Alk of a lower-triangle-only A would have to be
            // scattered here as well. Until then the program is terminated.
            abort();
          }
        }
      }
    },8*NumaThreadPool::instance().cpus());               // second argument is parallelFor's maxTasks: 8 per CPU

    timer.stop("conjugation scatter");

    return pap;
  }


  /**
   * \brief Computes the triple sparse matrix product \f$ P^T A P \f$ and adds it to C.
   *
   * For every stored entry \f$ A_{kl} \f$ the contribution \f$ P_{ki} P_{lj} A_{kl} \f$ is added
   * (via axpy) to \f$ C_{ij} \f$. C is not cleared beforehand, so the product is accumulated on top
   * of whatever C contains on entry.
   *
   * \param C the target matrix of dimension P.M() x P.M() (checked by assert). All entries written to
   *          are accessed as C[i][j]; presumably the sparsity pattern has to be provided up front,
   *          e.g. by conjugationPattern - confirm.
   * \param P matrix with scalar (1x1) entries and as many rows as A (checked by assert)
   * \param A the central matrix; must be square (checked by assert)
   * \param onlyLowerTriangle if true, only entries \f$ C_{ij} \f$ with \f$ i \ge j \f$ are updated, and each
   *          subdiagonal entry \f$ A_{kl} \f$ (k>l) additionally contributes on behalf of the
   *          mirrored entry \f$ A_{lk} \f$.
   *          NOTE(review): the mirrored contribution uses the value of \f$ A_{kl} \f$ as is (no
   *          transposition) - confirm this is intended for matrix-valued Entry types.
   */
  template <class Scalar, class Entry>
  void conjugation(Dune::BCRSMatrix<Entry>& C,
                   Dune::BCRSMatrix<Dune::FieldMatrix<Scalar,1,1> > const& P,
                   Dune::BCRSMatrix<Entry> const& A,
                   bool onlyLowerTriangle = false )
  {
    assert(A.N()==A.M());
    assert(A.N()==P.N());
    assert(C.N()==P.M());
    assert(C.M()==P.M());

    // Visit each stored entry A_{kl}
    for (int row=0; row<A.N(); ++row)
    {
      auto const& aRow = A[row];
      auto const& pRow = P[row];

      for (auto a=aRow.begin(); a!=aRow.end(); ++a)
      {
        int const col = a.index();
        auto const& pCol = P[col];

        // Direct contribution of A_{kl}: all combinations of entries from rows k and l of P.
        // In lower triangle mode, targets above the diagonal are skipped.
        for (auto pk=pRow.begin(); pk!=pRow.end(); ++pk)
          for (auto pl=pCol.begin(); pl!=pCol.end(); ++pl)
          {
            bool const wanted = !onlyLowerTriangle || pk.index() >= pl.index();
            if (wanted)
              C[pk.index()][pl.index()].axpy((*pl) * (*pk),(*a));
          }

        // Mirrored contribution standing in for the entry A_{lk}, which is not visited in its own right.
        if (onlyLowerTriangle && row>col)
        {
          for (auto pl=pCol.begin(); pl!=pCol.end(); ++pl)
            for (auto pk=pRow.begin(); pk!=pRow.end(); ++pk)
              if (pl.index() >= pk.index())
                C[pl.index()][pk.index()].axpy((*pl) * (*pk),(*a));
        }
      }
    }
  }

}

#endif


Dune::FieldMatrix
Definition: errorDistribution.hh:30

Kaskade::LocalMatrices
A structure for holding a sequence of several local matrices to be filled sequentially and to be scat...
Definition: localMatrices.hh:180

Kaskade::LocalMatrices::back
value_type & back()
A reference to the last pushed local matrix.
Definition: localMatrices.hh:254

Kaskade::LocalMatrices::push_back
void push_back(SortedRowIdx const &ridx, SortedColIdx const &cidx)
Appends another (zero-initialized) local matrix.
Definition: localMatrices.hh:227

Kaskade::NumaBCRSMatrix
A NUMA-aware compressed row storage matrix adhering mostly to the Dune ISTL interface (to complete....
Definition: threadedMatrix.hh:2115

Kaskade::NumaBCRSMatrix::M
Index M() const
The number of columns.
Definition: threadedMatrix.hh:2514

Kaskade::NumaBCRSMatrix::N
Index N() const
The number of rows.
Definition: threadedMatrix.hh:2509

Kaskade::NumaCRSPatternCreator
A NUMA-aware creator for matrix sparsity patterns.
Definition: threadedMatrix.hh:1617

Kaskade::NumaCRSPatternCreator::addElements
void addElements(IterRow const fromRow, IterRow const toRow, IterCol const fromCol, IterCol const toCol, bool colIsSorted=false)
Enters entries into the sparsity pattern.
Definition: threadedMatrix.hh:1726

Kaskade::NumaCRSPatternCreator::addDiagonal
void addDiagonal()
Enters the diagonal elements.
Definition: threadedMatrix.hh:1780

Kaskade::NumaThreadPool::instance
static NumaThreadPool & instance(int maxThreads=std::numeric_limits< int >::max())
Returns a globally unique thread pool instance.

Kaskade::NumaThreadPool::cpus
int cpus() const
Reports the total number of CPUs (usually a multiple of nodes).
Definition: threading.hh:327

Kaskade::Timings
Supports gathering and reporting execution times information for nested program parts.
Definition: timing.hh:64

Kaskade::Timings::instance
static Timings & instance()
Returns a reference to a single default instance.

Kaskade::Timings::stop
void stop(std::string const &name)
Stops the timing of given section.

Kaskade::Timings::start
struct Times const * start(std::string const &name)
Starts or continues the timing of given section.

Kaskade::NumaBCRSMatrix::conjugation
auto conjugation(NumaBCRSMatrix< EntryP, IndexP > const &P, NumaBCRSMatrix< EntryA, IndexA > const &A, bool onlyLowerTriangle=false, bool createDiagonal=false)
Creates the conjugation product $P^T A P$.
Definition: conjugation.hh:123

Kaskade::uniformWeightRangeStart
Index uniformWeightRangeStart(BlockIndex i, BlockIndex n, Index m)
Computes partitioning points of ranges for uniform weight distributions.
Definition: threading.hh:75

Kaskade::parallelFor
void parallelFor(Func const &f, int maxTasks=std::numeric_limits< int >::max())
A parallel for loop that executes the given functor in parallel on different CPUs.
Definition: threading.hh:489

localMatrices.hh

Dune::normalForm
auto normalForm(Dune::FieldMatrix< T, n, m > const &A)
Definition: fixdune.hh:289

Kaskade
Definition: abstract_interface.hh:15

Kaskade::conjugation
void conjugation(Dune::BCRSMatrix< Entry > &C, Dune::BCRSMatrix< Dune::FieldMatrix< Scalar, 1, 1 > > const &P, Dune::BCRSMatrix< Entry > const &A, bool onlyLowerTriangle=false)
Computes the triple sparse matrix product $P^T A P$.
Definition: conjugation.hh:286

Kaskade::transpose
T transpose(T x)
Definition: dynamicMatrix.hh:750

Kaskade::conjugationPattern
std::unique_ptr< Dune::MatrixIndexSet > conjugationPattern(Dune::BCRSMatrix< Dune::FieldMatrix< Scalar, 1, 1 > > const &P, Dune::BCRSMatrix< Entry > const &A, bool onlyLowerTriangle=false)
Creates the sparsity pattern of $P^T A P$.
Definition: conjugation.hh:43

threadedMatrix.hh

timing.hh