KASKADE 7 development version
domainDecompositionPreconditioner.hh
Go to the documentation of this file.
1/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2/* */
3/* This file is part of the library KASKADE 7 */
4/* https://www.zib.de/research/projects/kaskade7-finite-element-toolbox */
5/* */
6/* Copyright (C) 2002-2020 Zuse Institute Berlin */
7/* */
8/* KASKADE 7 is distributed under the terms of the ZIB Academic License. */
9/* see $KASKADE/academic.txt */
10/* */
11/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
12
13#ifndef LINALG_PATCHDOMAINDECOMPOSITIONPPRECONDITIONER_HH
14#define LINALG_PATCHDOMAINDECOMPOSITIONPPRECONDITIONER_HH
15
16#include <vector>
17
18#include <boost/timer/timer.hpp>
19
20#include <dune/common/dynvector.hh>
21#include "dune/istl/preconditioner.hh"
22
27#include "fem/supports.hh"
28#include "utilities/scalar.hh"
30#include "utilities/timing.hh"
31
32
33// #define INEXACTSTORAGE
34// #include <random> // for mixed precision *testing*
35// extern double epsilon;
36
37
38
39namespace Kaskade
40{
41 // ----------------------------------------------------------------------------------------------
42 // ----------------------------------------------------------------------------------------------
43
49 {
50 public:
51 InverseLocalMatrixStorageBase(std::vector<size_t> const& ap)
52 : dofs(ap)
53 {
54 }
55
59 std::vector<size_t> const& indices() const
60 {
61 return dofs;
62 }
63
69 size_t storage() const
70 {
71 return storageSize;
72 }
73
80 size_t flops() const
81 {
82 return computeFlops;
83 }
84
85 protected:
88
89 template <int m, class LocalMatrix, class Real>
90 void regularize(LocalMatrix& aloc, Real r) const
91 {
92 if (r > 0)
93 {
94 Real Amax = 0;
95 for (int i=0; i<aloc.N(); ++i)
96 Amax = std::max(static_cast<Real>(std::sqrt(frobenius_norm2(aloc[i][i]))),Amax);
97 for (int i=0; i<aloc.N(); ++i)
98 aloc[i][i] += Amax/r * unitMatrix<Real,m>();
99 }
100 }
101
102 private:
103 std::vector<size_t> dofs;
104 };
105
106 // ----------------------------------------------------------------------------------------------
107
113 template <class Scalar>
115
121 template <int m, class Scalar=float>
123 {
125
126 public:
127
128 using field_type = Scalar;
129
130 template <class T, class Index>
131 DenseInverseLocalMatrixStorage(std::vector<size_t> const& ap,
133 field_type regularizeForCondition, DenseInverseStorageTag<Scalar>)
135 {
136 alocinv = full(A,indices(),indices());
137
138#ifdef INEXACTSTORAGE
139 using Real = typename ScalarTraits<Scalar>::Real;
140 Real anorm = two_norm(alocinv);
141#endif
142
143 regularize<m>(alocinv,regularizeForCondition);
144
145 invertSpd(alocinv); // invert the local block
146
147#ifdef INEXACTSTORAGE
148 DynamicMatrix<Dune::FieldMatrix<Scalar,m,m>> delta(alocinv.N(),alocinv.M());
149
150 std::random_device rd;
151 std::mt19937 gen(rd());
152 std::uniform_real_distribution<> dis(-1.0,1.0);
153 for (int i=0; i<alocinv.N(); ++i)
154 for (int j=0; j<alocinv.M(); ++j)
155 for (int k=0; k<m; ++k)
156 for (int l=0; l<m; ++l)
157 delta[i][j][k][l] = dis(gen);
158 Real dnorm = two_norm(delta);
159
160 Real epsilon = ::epsilon;
161 delta *= epsilon / anorm / dnorm;
162 alocinv += delta;
163#endif
164
165 storageSize = sizeof(StorageEntry)*alocinv.N()*alocinv.M() + sizeof(size_t)*indices().size();
166 auto n = alocinv.N() * m;
167 computeFlops = n*(2*n-1); // in every row n multiplications and n-1 additions
168 }
169
170
171 template <class Vector>
172 void mv(Vector const& x, Vector& y) const
173 {
174 alocinv.mv(x,y);
175 }
176
177
178 private:
180 };
181
182 // ----------------------------------------------------------------------------------------------
183
200 std::array<std::vector<size_t>,3> nestedDissection(std::vector<std::pair<size_t,size_t>> const& edges,
201 std::vector<size_t> const& edgeStart);
202
208 template <class StorageScalar>
210 {
211 public:
212 NestedDissection() = default;
213
217 template <class SparseMatrix>
218 NestedDissection(SparseMatrix const& A, size_t skipEntries, size_t skipDimension);
219
227 template <class FactorizationScalar>
229
230 template <class T>
231 void solve(T& y) const;
232
233 size_t storageSize() const;
234 size_t flops() const;
235
239 size_t size()
240 {
241 return sz;
242 }
243
244 private:
245 template <class FactorizationScalar>
246 friend class NestedDissection;
247
248 std::vector<size_t> v1, v2, s;
249 std::unique_ptr<NestedDissection<StorageScalar>> l1FactorND, l2FactorND;
251 std::vector<StorageScalar> sFactor;
252 size_t sz;
253
261 template <int m>
262 void trisolve(DynamicMatrix<StorageScalar>& y, char trans) const;
263 };
264
272 template <class StorageScalar, class FactorizationScalar>
274 {
275 public:
276
277 NestedDissectionStorageTag(size_t skipEntries_=100, size_t skipDimension_=32)
278 : skipEntries(skipEntries_), skipDimension(skipDimension_)
279 {}
280
285
290 };
291
297 template <int m, class StorageScalar, class FactorizationScalar>
299 {
300 public:
301 using field_type = StorageScalar;
302
303 template <class T, class Index>
304 NestedDissectionCholeskyLocalMatrixStorage(std::vector<size_t> const& ap,
306 FactorizationScalar regularizeForCondition,
309 {
310 using LocalMatrix = Dune::BCRSMatrix<Dune::FieldMatrix<T,m,m>>;
311
312 LocalMatrix aloc = submatrix<LocalMatrix>(A,indices(),indices());
313
314 regularize<m>(aloc,regularizeForCondition);
315
316 choleskyFactor = NestedDissection<FactorizationScalar>(aloc,storageTag.skipEntries,
317 storageTag.skipDimension); // factorize the local block and compress
318
319 storageSize = choleskyFactor.storageSize();
320 computeFlops = choleskyFactor.flops();
321 }
322
323
327 template <class Vector>
328 void mv(Vector const& x, Vector& y) const
329 {
330 y = x;
331 choleskyFactor.solve(y);
332 }
333
334 private:
335 NestedDissection<StorageScalar> choleskyFactor;
336 };
337
338
339 // ----------------------------------------------------------------------------------------------
340
341 namespace DomainDecompositionPreconditionerDetail
342 {
343 template <class Scalar>
344 std::vector<Scalar> choleskyFactor(DynamicMatrix<Scalar> const& A);
345
346 template <class Scalar>
347 void choleskySolve(int n, int nrhs, std::vector<Scalar> const& L, Scalar* b);
348 }
349
356 template <class Scalar>
358
364 template <int m, class Scalar=float>
366 {
368
369 public:
370
371 using field_type = Scalar;
372
373 template <class T, class Index>
374 DenseCholeskyLocalMatrixStorage(std::vector<size_t> const& ap,
376 field_type regularizeForCondition, DenseCholeskyStorageTag<Scalar>)
378 {
380 n = aloc.N();
381
382 regularize<1>(aloc,regularizeForCondition);
383
384 choleskyFactor = DomainDecompositionPreconditionerDetail::choleskyFactor(aloc); // factorize the local block
385 storageSize = sizeof(Scalar)*choleskyFactor.size() + sizeof(size_t)*indices().size();
386 computeFlops = 2*n*n;
387
388#ifdef INEXACTSTORAGE
389 std::random_device rd;
390 std::mt19937 gen(rd());
391 std::uniform_real_distribution<> dis(-1.0,0.0);
392
393 DynamicMatrix<Scalar> L(n,n), delta(n,n);
394 L = 0;
395 delta = 0;
396 auto lpos = begin(choleskyFactor);
397 for (int j=0; j<n; ++j)
398 for (int i=j; i<n; ++i)
399 {
400 L[i][j] = *lpos;
401 ++lpos;
402 delta[i][j] = dis(gen) / (i-j+1);
403 }
404
405 invert(L);
406
407 Scalar epsilon = ::epsilon;
408 delta *= epsilon/two_norm(delta)/two_norm(L);
409
410 lpos = begin(choleskyFactor);
411 for (int j=0; j<n; ++j)
412 for (int i=j; i<n; ++i)
413 {
414 *lpos += delta[i][j];
415 ++lpos;
416 }
417
418#endif
419 }
420
421
422 template <class Vector>
423 void mv(Vector const& x, Vector& y) const
424 {
425 y = x;
427 }
428
429
430 private:
431 std::vector<Scalar> choleskyFactor; // L in LAPACK packed storage
432 int n; // size of L (with scalar entries, not blocks of size m)
433 };
434
435
436
437 // ----------------------------------------------------------------------------------------------
438 // ----------------------------------------------------------------------------------------------
439
449 template <class StorageTag, int components>
451 {
457 using type = void;
458 };
459
460 template <class Scalar, int components>
462 {
464 };
465
466 template <class Scalar, int components>
468 {
470 };
471
472 template <class StorageScalar, class FactorizationScalar, int components>
473 struct PatchDomainDecompositionPreconditionerTraits<NestedDissectionStorageTag<StorageScalar,FactorizationScalar>,components>
474 {
476 };
477
493 template <class Space, int m, class StorageTag,
494 class SparseMatrixIndex = std::size_t>
495 class PatchDomainDecompositionPreconditioner: public SymmetricPreconditioner<typename Space::template Element<m>::type::StorageType,
496 typename Space::template Element<m>::type::StorageType>
497 {
499
500 public:
501 using domain_type = typename Space::template Element<m>::type::StorageType;
503
519 template <class Matrix>
520 PatchDomainDecompositionPreconditioner(Space const& space, Matrix const& A,
521 typename Matrix::field_type regularizeForCondition = -1.0,
522 StorageTag storageTag = StorageTag(),
523 int nTasks = 0)
524 {
525 assert(A.N()>0);
526 assert(A.N()==A.M());
527 Timings& timer = Timings::instance();
528 timer.start("make Schwarz preconditioner");
529
530 // for each patch around a vertex, store the global dof indices
531 std::vector<std::vector<size_t>> patchDofs(space.gridView().size(space.gridView().dimension));
532
533 auto patches = computePatches(space.gridView());
534 assert(patches.size()==patchDofs.size());
535
536 int nThreads = nTasks<=0? 4*NumaThreadPool::instance().cpus()
537 : std::min(nTasks,(int)patches.size());
538 inverseLocalMatrices.resize(nThreads);
539
540 TaskTiming taskTiming(nThreads);
541
542 // Compute the inverse of each block's local stiffness matrix in parallel.
543 parallelFor([&patches,&A,&space,regularizeForCondition,storageTag,&taskTiming,this](size_t j, size_t s)
544 {
545 taskTiming.start(j);
546 // For all patches (in our parallel loop range)
547 for (size_t i=uniformWeightRangeStart(j,s,patches.size()); i<uniformWeightRangeStart(j+1,s,patches.size()); ++i)
548 {
549 auto const& ap = algebraicPatch(space,patches[i]);
550 // todo: (catch possible exception, store by std::exception_ptr (which needs to be passed from main thread and locked), then rethrow in main thread
551 try
552 {
553 InverseLocalMatrix aloc = InverseLocalMatrix(ap,A,regularizeForCondition,storageTag);
554 this->inverseLocalMatrices[j].push_back(std::move(aloc));
555 }
556 catch (DetailedException const& ex)
557 {
558 std::cout << "exception in inverting block " << i << " in thread " << j << " of " << s << ":\n";
559 std::cout << ex.what() << "\n";
560 }
561 }
562 taskTiming.stop(j);
563 },nThreads);
564
565 timer.stop("make Schwarz preconditioner");
566
567std::cerr << "Schwarz preconditioner storage size: " << storage()/(1024*1024) << " MB\n"
568 << " flop count: " << flops()/(1024*1024) << " M\n";
569
570 std::ofstream f("out.gnu");
571 f << taskTiming;
572 }
573
574 virtual typename Space::Scalar applyDp(domain_type& x, range_type const& y)
575 {
576 apply(x,y);
577 return x*y;
578 }
579
585 virtual void apply(domain_type& x, range_type const& y)
586 {
587 using StorageScalar = typename InverseLocalMatrix::field_type;
589
590 // TODO: consider scattering into local copies of global vector? Maybe dependent on the actual
591 // local matrix size? Or range locking?
592 // But for 3D order 3&4 locking appears not to be a performance bottleneck on
593 // a quadcore (as of 2018-11).
594
595 ScopedTimingSection timer("Schwarz preconditioner apply");
596
597 std::mutex rhsAccess;
598 parallelFor([&rhsAccess,&x,&y,this](int j, int s)
599 {
600 for (auto const& alocinv: this->inverseLocalMatrices[j])
601 {
602 auto const& indices = alocinv.indices();
603 int const n = indices.size();
604
605 LocalVector xloc(n);
606 LocalVector yloc(n);
607
608 for (int i=0; i<n; ++i) // gather relevant vector entries from x
609 yloc[i] = y[indices[i]];
610
611 alocinv.mv(yloc,xloc); // perform local multiplication
612
613 std::lock_guard<std::mutex> lock(rhsAccess);
614 for (int i=0; i<n; ++i) // scatter local result to global vector y
615 x[indices[i]] += xloc[i];
616 }
617 },inverseLocalMatrices.size());
618 }
619
620 virtual bool requiresInitializedInput() const
621 {
622 return true;
623 }
624
630 size_t storage() const
631 {
632 size_t s = 0;
633 for (auto const& as: inverseLocalMatrices)
634 for (auto const& a: as)
635 s += a.storage();
636 return s;
637 }
638
642 size_t flops() const
643 {
644 size_t f = 0;
645 for (auto const& as: inverseLocalMatrices)
646 {
647 for (auto const& a: as)
648 f += a.flops();
649 }
650 return f;
651 }
652
653 private:
654 std::vector<std::vector<InverseLocalMatrix>> inverseLocalMatrices;
655 };
656}
657
658#endif
An inverse local matrix representation that simply stores a dense Cholesky factor.
DenseCholeskyLocalMatrixStorage(std::vector< size_t > const &ap, NumaBCRSMatrix< Dune::FieldMatrix< T, m, m >, Index > const &A, field_type regularizeForCondition, DenseCholeskyStorageTag< Scalar >)
A parametrized tag type for selecting storage of inverses by dense Cholesky factors with floating poi...
An inverse local matrix representation that simply stores the dense inverse.
DenseInverseLocalMatrixStorage(std::vector< size_t > const &ap, NumaBCRSMatrix< Dune::FieldMatrix< T, m, m >, Index > const &A, field_type regularizeForCondition, DenseInverseStorageTag< Scalar >)
A parametrized tag type for selecting dense storage of inverses with floating point representation of...
A wrapper class for conveniently providing exceptions with context information.
A LAPACK-compatible dense matrix class with shape specified at runtime.
A base class providing common functionality for storage of inverse local matrices.
void regularize(LocalMatrix &aloc, Real r) const
size_t flops() const
Reports the number of floating point operations per preconditioner application.
size_t storage() const
Reports the storage size (in bytes) of the inverse block matrix.
InverseLocalMatrixStorageBase(std::vector< size_t > const &ap)
std::vector< size_t > const & indices() const
Returns the global degrees of freedom defining the submatrix treated as a block here.
Providing a matrix or array interface to LAPACK-ordered entries.
Providing a vector interface for contiguously stored entries.
Definition: localVectors.hh:31
size_t size() const
The number of entries stored.
Definition: localVectors.hh:47
An inverse local matrix representation that stores a sparse Cholesky factor using nested dissection.
NestedDissectionCholeskyLocalMatrixStorage(std::vector< size_t > const &ap, NumaBCRSMatrix< Dune::FieldMatrix< T, m, m >, Index > const &A, FactorizationScalar regularizeForCondition, NestedDissectionStorageTag< StorageScalar, FactorizationScalar > storageTag)
a nested dissection factorization for symmetric positive definite sparse matrices
size_t storageSize() const
size_t size()
reports the total number of separators in this nested dissection
NestedDissection< StorageScalar > & operator=(NestedDissection< FactorizationScalar > &&nd)
move assignment from different storage scalar type
void solve(T &y) const
NestedDissection(SparseMatrix const &A, size_t skipEntries, size_t skipDimension)
Constructor.
A parametrized tag type for selecting storage of inverses by nested dissection with floating point re...
NestedDissectionStorageTag(size_t skipEntries_=100, size_t skipDimension_=32)
size_t skipDimension
omit further dissection if the size of the system is below this number
size_t skipEntries
omit further dissection if the size of the zero-block (in terms of entries) is below this number
A NUMA-aware compressed row storage matrix adhering mostly to the Dune ISTL interface (to complete....
static NumaThreadPool & instance(int maxThreads=std::numeric_limits< int >::max())
Returns a globally unique thread pool instance.
int cpus() const
Reports the total number of CPUs (usually a multiple of nodes).
Definition: threading.hh:327
An additive overlapping domain decomposition type preconditioner for higher order finite elements app...
virtual void apply(domain_type &x, range_type const &y)
Applys the preconditioner as .
virtual Space::Scalar applyDp(domain_type &x, range_type const &y)
size_t storage() const
Reports the storage size (in bytes) of the preconditioner.
typename Space::template Element< m >::type::StorageType domain_type
PatchDomainDecompositionPreconditioner(Space const &space, Matrix const &A, typename Matrix::field_type regularizeForCondition=-1.0, StorageTag storageTag=StorageTag(), int nTasks=0)
Constructor.
virtual bool requiresInitializedInput() const
Returns true if the target vector x has to be initialized to zero before calling apply or applyDp.
size_t flops() const
reports the number of floating point operations when applying the preconditioner
Scalar Real
The real type on which the scalar field is based.
Definition: scalar.hh:49
A scope guard object that automatically closes a timing section on destruction.
Definition: timing.hh:181
Interface for symmetric preconditioners.
A class that gathers data on task timing and provides gnuplot visualization.
Definition: timing.hh:291
void start(int task)
defines the start of given task.
void stop(int task)
defines the start of given task.
Supports gathering and reporting execution times information for nested program parts.
Definition: timing.hh:64
static Timings & instance()
Returns a reference to a single default instance.
struct Times const * start(std::string const &name)
Starts or continues the timing of given section.
auto computePatches(GridView const &gridview)
Computes the patches around vertices of the grid view.
Definition: supports.hh:165
Dune::FieldVector< T, n > max(Dune::FieldVector< T, n > x, Dune::FieldVector< T, n > const &y)
Componentwise maximum.
Definition: fixdune.hh:110
Dune::FieldVector< T, n > min(Dune::FieldVector< T, n > x, Dune::FieldVector< T, n > const &y)
Componentwise minimum.
Definition: fixdune.hh:122
Index uniformWeightRangeStart(BlockIndex i, BlockIndex n, Index m)
Computes partitioning points of ranges for uniform weight distributions.
Definition: threading.hh:75
void parallelFor(Func const &f, int maxTasks=std::numeric_limits< int >::max())
A parallel for loop that executes the given functor in parallel on different CPUs.
Definition: threading.hh:489
void choleskySolve(int n, int nrhs, std::vector< Scalar > const &L, Scalar *b)
std::vector< Scalar > choleskyFactor(DynamicMatrix< Scalar > const &A)
DynamicMatrix< Scalar > flatMatrix(DynamicMatrix< Scalar > const &A)
Dune::FieldVector< Scalar, dim > Vector
std::array< std::vector< size_t >, 3 > nestedDissection(std::vector< std::pair< size_t, size_t > > const &edges, std::vector< size_t > const &edgeStart)
Computes a nested dissection of the given graph.
EntryTraits< Entry >::real_type frobenius_norm2(Entry const &x)
Definition: scalar.hh:111
Provides the actual type of local matrix storage based on a tag and the number of components.
void type
The actual type used for storing local matrices.