KASKADE 7 development version
conjugation.hh
Go to the documentation of this file.
1/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2/* */
3/* This file is part of the library KASKADE 7 */
4/* https://www.zib.de/research/projects/kaskade7-finite-element-toolbox */
5/* */
6/* Copyright (C) 2002-2020 Zuse Institute Berlin */
7/* */
8/* KASKADE 7 is distributed under the terms of the ZIB Academic License. */
9/* see $KASKADE/academic.txt */
10/* */
11/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
12
18#ifndef CONJUGATION_HH
19#define CONJUGATION_HH
20
21#include <cassert>
22#include <memory>
23
24#include <boost/timer/timer.hpp>
25
26#include "dune/istl/bcrsmatrix.hh"
27#include "dune/istl/matrixindexset.hh"
28
31
32#include "utilities/timing.hh"
33
34namespace Kaskade
35{
42 template <class Scalar, class Entry>
43 std::unique_ptr<Dune::MatrixIndexSet> conjugationPattern(Dune::BCRSMatrix<Dune::FieldMatrix<Scalar,1,1> > const& P,
44 Dune::BCRSMatrix<Entry> const& A ,
45 bool onlyLowerTriangle = false)
46 {
47 assert(A.N()==A.M());
48 assert(A.N()==P.N());
49
50 typedef Dune::BCRSMatrix<Entry> MatA;
51 typedef Dune::BCRSMatrix<Dune::FieldMatrix<Scalar,1,1> > MatP;
52
53 // If C = P^T A P, we have that C_{ij} = \sum_{k,l} P_{ki} P_{lj} A_{kl}. Hence, the entry A_{kl} contributes to
54 // all C_{ij} for which there are nonzero entries P_{ki} and P_{lj} in the rows k and l of P. Thus we can simply
55 // run through all nonzeros A_{kl} of A, look up the column indices i,j of rows k and l of P, and flag C_{ij}
56 // as nonzero.
57
58 std::unique_ptr<Dune::MatrixIndexSet> nzC(new Dune::MatrixIndexSet(P.M(),P.M()));
59
60
61 if(onlyLowerTriangle == false)
62 {
63 // Step through all entries of A
64 for (int k=0; k<A.N(); ++k)
65 for (typename MatA::ConstColIterator ca=A[k].begin(); ca!=A[k].end(); ++ca)
66 {
67 int const l = ca.index();
68 // Step through all entries of rows k and l of P and add entry
69 for (typename MatP::ConstColIterator cpk=P[k].begin(); cpk!=P[k].end(); ++cpk)
70 for (typename MatP::ConstColIterator cpl=P[l].begin(); cpl!=P[l].end(); ++cpl)
71 nzC->add(cpk.index(),cpl.index());
72 }
73 }
74 else
75 {
76 // Step through all entries of A
77 for (int k=0; k<A.N(); ++k)
78 for (typename MatA::ConstColIterator ca=A[k].begin(); ca!=A[k].end(); ++ca)
79 {
80 int const l = ca.index();
81 // Step through all entries of rows k and l of P and add entry
82 for (typename MatP::ConstColIterator cpk=P[k].begin(); cpk!=P[k].end(); ++cpk)
83 for (typename MatP::ConstColIterator cpl=P[l].begin(); cpl!=P[l].end(); ++cpl)
84 {
85 if( cpk.index() >= cpl.index() )
86 nzC->add(cpk.index(),cpl.index());
87 else
88 nzC->add(cpl.index(),cpk.index());
89 }
90 }
91 }
92
93 return nzC;
94
95 // An alternative way of computing the sparsity pattern would be to use that the nonzero entries j in column i
96 // of C are exactly those for which there is k with (nonzero P_{jk} and there is l with (nonzero P_{il} and A_{lk})).
97 // Hence we can obtain the column index set J directly by the following steps:
98 // (i) find all l with P_{li} nonzero -> L [requires to access columns of P - compute the transpose patterns once]
99 // (ii) find all k with A_{lk} nonzero for some l in L -> K [probably sorting K and removing doubled entries would be a good idea here]
100 // (iii) find all j with P_{kj} nonzero for some k in K -> J
101 // Compared to the above implementation this would have the following (dis)advantages
102 // + easy to do in parallel (since write operations are separated)
103 // + fewer scattered write accesses to memory
104 // - more complex implementation
105 // - requires the transpose pattern of P
106 }
107
122 template <class IndexP, class EntryP, class IndexA, class EntryA>
124 bool onlyLowerTriangle = false, bool createDiagonal=false)
125 {
126 assert(A.N()==A.M());
127 static_assert(EntryA::rows==EntryA::cols,"central matrix entries in conjugation have to be square");
128 assert(A.N()==P.N());
129
130 Timings& timer = Timings::instance();
131
132
133 // First create the sparsity pattern
134 timer.start("conjugation pattern");
135 NumaCRSPatternCreator<IndexP> creator(P.M(),P.M(),onlyLowerTriangle);
136
137 // If C = P^T A P, we have that C_{ij} = \sum_{k,l} P_{ki} P_{lj} A_{kl}. Hence, the entry A_{kl} contributes to
138 // all C_{ij} for which there are nonzero entries P_{ki} and P_{lj} in the rows k and l of P. Thus we can simply
139 // run through all nonzeros A_{kl} of A, look up the column indices i,j of rows k and l of P, and flag C_{ij}
140 // as nonzero.
141
142 { // just a new scope
143 // helper routine for extracting all column indices of a row
144 auto getColumnIndices = [] (auto const& row, std::vector<IndexP>& ci)
145 {
146 ci.clear();
147 for (auto i=row.begin(); i!=row.end(); ++i) // indices i for which Pki != 0
148 ci.push_back(i.index());
149 };
150 std::vector<IndexP> is, js;
151
152 // Step through all entries of A
153 for (IndexA k=0; k<A.N(); ++k)
154 {
155 getColumnIndices(P[k],is); // indices i for which Pki != 0
156
157 auto row = A[k];
158 for (auto ca=row.begin(); ca!=row.end(); ++ca)
159 {
160 IndexA const l = ca.index();
161 getColumnIndices(P[l],js); // indices j for which Plj != 0
162
163 // add all combinations i,j
164 creator.addElements(std::begin(is),std::end(is),std::begin(js),std::end(js));
165 if (onlyLowerTriangle && k>l) // subdiagonal entry (k,l) of A -> entry (l,k) must be treated implicitly:
166 // add all combinations (j,i)
167 creator.addElements(std::begin(js),std::end(js),std::begin(is),std::end(is));
168 }
169 }
170 }
171
172 if (createDiagonal)
173 creator.addDiagonal();
174 timer.stop("conjugation pattern");
175
176 // An alternative way of computing the sparsity pattern would be to use that the nonzero entries j in column i
177 // of C are exactly those for which there is k with (nonzero P_{jk} and there is l with (nonzero P_{il} and A_{lk})).
178 // Hence we can obtain the column index set J directly by the following steps:
179 // (i) find all l with P_{li} nonzero -> L [requires to access columns of P - compute the transpose patterns once]
180 // (ii) find all k with A_{lk} nonzero for some l in L -> K [probably sorting K and removing doubled entries would
181 // be a good idea here]
182 // (iii) find all j with P_{kj} nonzero for some k in K -> J
183 // Compared to the above implementation this would have the following (dis)advantages
184 // + easy to do in parallel (since write operations are separated)
185 // + fewer scattered write accesses to memory
186 // - more complex implementation
187 // - requires the transpose pattern of P
188
189 // Create the sparse matrix. First define the resulting entry size: If P has scalar entries, this is just
190 // the (possibly matrix-valued) size of entries in A. Otherwise, we take the triple product of P^T A P entries.
191 // Note that rows/cols are enum - need to cast to int here.
192 constexpr int entrySize = (EntryP::rows==1 && EntryP::cols==1)? (int)EntryA::rows: (int)EntryP::cols;
194
195 timer.start("matrix creation");
196 NumaBCRSMatrix<Entry,IndexP> pap(creator);
197 timer.stop("matrix creation");
198
199 // Fill the sparse matrix PAP. This is done as before by stepping through all Akl entries and scatter
200 // Pki*Plj*Akl into PAPij.
201 //
202 // An alternative way of computing P^TAP would be a gather operation with inverted loop order:
203 // Cij = sum_kl Pki*Plj*Akl. This requires P^T for efficient determination of required kl indices.
204 // While the transpose construction is efficient, the gather implementation ist not (tested 2016-01-17),
205 // presumably because A is larger than PAP and the scattered accesses have a worse locality.
206 // Sequential performance was more than 10-fold slower than the scatter implementation below, so
207 // we stick to the scatter.
208 //
209 // A second alternative is to create a triplet matrix first. This appears to be a factor 3 slower in
210 // sequential implementation, and incurs a high memory footprint as several entries are duplicate.
211
212 auto getEntryValues = [] (auto const& row, std::vector<EntryP>& vi)
213 {
214 vi.clear();
215 for (auto i=row.begin(); i!=row.end(); ++i) // indices i for which Pki != 0
216 vi.push_back(*i);
217 };
218
219 auto getColumnIndices = [] (auto const& row, auto& ci) // computes (global,local) column index pairs of row k of P
220 {
221 ci.clear();
222 int idx = 0;
223 for (auto i=row.begin(); i!=row.end(); ++i, ++idx) // indices i for which Pki != 0
224 ci.push_back(std::make_pair(i.index(),idx));
225 };
226
227 timer.start("conjugation scatter");
228 parallelFor([&](size_t block, size_t nBlocks)
229 {
230 size_t rowStart = uniformWeightRangeStart(block,nBlocks,A.N());
231 size_t rowEnd = uniformWeightRangeStart(block+1,nBlocks,A.N());
232 std::vector<EntryP> pk, pl;
233 using SortedIndices = std::vector<std::pair<IndexP,int>>;
234 SortedIndices is, js;
236 for (IndexA k=rowStart; k<rowEnd; ++k)
237 {
238 auto Pk = P[k];
239 getColumnIndices(Pk,is); // indices i for which Pki != 0
240 getEntryValues(Pk,pk);
241
242 auto row = A[k];
243 for (auto ca=row.begin(); ca!=row.end(); ++ca)
244 {
245 IndexA const l = ca.index(); // column index of Akl
246 auto Pl = P[l];
247 getColumnIndices(Pl,js); // indices j for which Plj != 0
248 getEntryValues(Pl,pl);
249
250 // For each entry Akl of A we have to create one local matrix.
251 localMatrices.push_back(is,js);
252
253 auto Akl = *ca;
254 for (int i=0; i<is.size(); ++i) // just scatter Akl to all affected PAP entries
255 {
256 auto pki = normalForm(pk[i]);
257 auto Pki_Akl = transpose(pki) * Akl;
258 for (int j=0; j<pl.size(); ++j)
259 {
260 auto plj = normalForm(pl[j]);
261 localMatrices.back()(i,j) = Pki_Akl * plj;
262 }
263 }
264
265 if (onlyLowerTriangle && k>l)
266 {
267 // treat Alk entry
268 abort();
269 }
270 }
271 }
273
274 timer.stop("conjugation scatter");
275
276 return pap;
277 }
278
285 template <class Scalar, class Entry>
286 void conjugation(Dune::BCRSMatrix<Entry>& C,
287 Dune::BCRSMatrix<Dune::FieldMatrix<Scalar,1,1> > const& P,
288 Dune::BCRSMatrix<Entry> const& A,
289 bool onlyLowerTriangle = false )
290 {
291 assert(A.N()==A.M());
292 assert(A.N()==P.N());
293 assert(C.N()==P.M());
294 assert(C.M()==P.M());
295
296 typedef Dune::BCRSMatrix<Entry> MatA;
297 typedef Dune::BCRSMatrix<Dune::FieldMatrix<Scalar,1,1> > MatP;
298
299 if(onlyLowerTriangle == false )
300 {
301 // Step through all entries of A
302 for (int k=0; k<A.N(); ++k)
303 for (typename MatA::ConstColIterator ca=A[k].begin(); ca!=A[k].end(); ++ca)
304 {
305 int const l = ca.index();
306 // Step through all entries of rows k and l of P and add entry
307 for (typename MatP::ConstColIterator cpk=P[k].begin(); cpk!=P[k].end(); ++cpk)
308 for (typename MatP::ConstColIterator cpl=P[l].begin(); cpl!=P[l].end(); ++cpl)
309 C[cpk.index()][cpl.index()].axpy((*cpl) * (*cpk),(*ca));
310 }
311 }
312 else
313 {
314 // Step through all entries of A
315 for (int k=0; k<A.N(); ++k)
316 for (typename MatA::ConstColIterator ca=A[k].begin(); ca!=A[k].end(); ++ca)
317 {
318 int const l = ca.index();
319 for (typename MatP::ConstColIterator cpk=P[k].begin(); cpk!=P[k].end(); ++cpk)
320 for (typename MatP::ConstColIterator cpl=P[l].begin(); cpl!=P[l].end(); ++cpl)
321 {
322 if( cpk.index() >= cpl.index() )
323 C[cpk.index()][cpl.index()].axpy((*cpl) * (*cpk), (*ca) );
324 }
325 if(k>l)
326 {
327 for (typename MatP::ConstColIterator cpl=P[l].begin(); cpl!=P[l].end(); ++cpl)
328 for (typename MatP::ConstColIterator cpk=P[k].begin(); cpk!=P[k].end(); ++cpk)
329 if( cpl.index() >= cpk.index() )
330 C[cpl.index()][cpk.index()].axpy((*cpl) * (*cpk) , (*ca) );
331 }
332 }
333 }
334 }
335}
336#endif
337
338
A structure for holding a sequence of several local matrices to be filled sequentially and to be scat...
value_type & back()
A reference to the last pushed local matrix.
void push_back(SortedRowIdx const &ridx, SortedColIdx const &cidx)
Appends another (zero-initialized) local matrix.
A NUMA-aware compressed row storage matrix adhering mostly to the Dune ISTL interface (to complete....
Index M() const
The number of columns.
Index N() const
The number of rows.
A NUMA-aware creator for matrix sparsity patterns.
void addElements(IterRow const fromRow, IterRow const toRow, IterCol const fromCol, IterCol const toCol, bool colIsSorted=false)
Enters entries into the sparsity pattern.
void addDiagonal()
Enters the diagonal elements.
static NumaThreadPool & instance(int maxThreads=std::numeric_limits< int >::max())
Returns a globally unique thread pool instance.
int cpus() const
Reports the total number of CPUs (usually a multiple of nodes).
Definition: threading.hh:327
Supports gathering and reporting execution times information for nested program parts.
Definition: timing.hh:64
static Timings & instance()
Returns a reference to a single default instance.
void stop(std::string const &name)
Stops the timing of given section.
struct Times const * start(std::string const &name)
Starts or continues the timing of given section.
auto conjugation(NumaBCRSMatrix< EntryP, IndexP > const &P, NumaBCRSMatrix< EntryA, IndexA > const &A, bool onlyLowerTriangle=false, bool createDiagonal=false)
Creates the conjugation product .
Definition: conjugation.hh:123
Index uniformWeightRangeStart(BlockIndex i, BlockIndex n, Index m)
Computes partitioning points of ranges for uniform weight distributions.
Definition: threading.hh:75
void parallelFor(Func const &f, int maxTasks=std::numeric_limits< int >::max())
A parallel for loop that executes the given functor in parallel on different CPUs.
Definition: threading.hh:489
auto normalForm(Dune::FieldMatrix< T, n, m > const &A)
Definition: fixdune.hh:289
void conjugation(Dune::BCRSMatrix< Entry > &C, Dune::BCRSMatrix< Dune::FieldMatrix< Scalar, 1, 1 > > const &P, Dune::BCRSMatrix< Entry > const &A, bool onlyLowerTriangle=false)
Computes the triple sparse matrix product .
Definition: conjugation.hh:286
T transpose(T x)
std::unique_ptr< Dune::MatrixIndexSet > conjugationPattern(Dune::BCRSMatrix< Dune::FieldMatrix< Scalar, 1, 1 > > const &P, Dune::BCRSMatrix< Entry > const &A, bool onlyLowerTriangle=false)
Creates the sparsity pattern of .
Definition: conjugation.hh:43