KASKADE 7 development version
threadedMatrix.hh
1/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2/* */
3/* This file is part of the library KASKADE 7 */
4/* https://www.zib.de/research/projects/kaskade7-finite-element-toolbox */
5/* */
6/* Copyright (C) 2012-2023 Zuse Institute Berlin */
7/* */
8/* KASKADE 7 is distributed under the terms of the ZIB Academic License. */
9/* see $KASKADE/academic.txt */
10/* */
11/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
12
13#ifndef THREADED_MATRIX
14#define THREADED_MATRIX
15
16#include <cmath>
17#include <iostream>
18#include <boost/timer/timer.hpp>
19
20#include "dune/common/config.h"
21#include <dune/istl/bcrsmatrix.hh>
22#include <dune/istl/operators.hh>
23
24#include "fem/firstless.hh"
25#include "fem/fixdune.hh"
30
31
32
33namespace Kaskade {
37 // forward declarations
38 template <class Entry, class Index>
39 class NumaBCRSMatrix;
40
41 template <class Scalar, class Index>
42 class MatrixAsTriplet;
43
44 template <class Target, class Source, class RowIndices, class ColIndices>
45 Target submatrix(Source const& A, RowIndices const& ri, ColIndices const& ci);
46
47
48
49 namespace ThreadedMatrixDetail
50 {
51 // Computes for each row the number of entries in that row. The count is stored in the
52 // vector rowCount, which has to be big enough.
53 template <class Entry, class Allocator, class Index>
54 void getRowCount(Dune::BCRSMatrix<Entry,Allocator> const& matrix, bool isTransposed, std::vector<Index>& rowCount)
55 {
56 for (auto ri=matrix.begin(); ri!=matrix.end(); ++ri)
57 if (isTransposed)
58 for (auto ci=ri->begin(); ci!=ri->end(); ++ci)
59 ++rowCount[ci.index()];
60 else
61 rowCount[ri.index()] = ri->size();
62 }
63
64 template <class Entry, class Index2, class Index>
65 void getRowCount(NumaBCRSMatrix<Entry,Index2> const& matrix, bool isTransposed, std::vector<Index>& rowCount)
66 {
67 for (auto ri=matrix.begin(); ri!=matrix.end(); ++ri)
68 if (isTransposed)
69 for (auto ci=ri->begin(); ci!=ri->end(); ++ci)
70 ++rowCount[ci.index()];
71 else
72 rowCount[ri.index()] = ri->size();
73 }
74
75 template <class Entry, class Index, class Index2>
76 void getRowCount(MatrixAsTriplet<Entry,Index2> const& matrix, bool isTransposed, std::vector<Index>& rowCount)
77 {
78 auto const& indices = isTransposed? matrix.cidx: matrix.ridx;
79 for (auto r: indices)
80 ++rowCount[r];
81 }
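// Editorial usage sketch (not part of the library source): counting the entries per row of the
// transpose of a Dune matrix before allocating CRS storage; the matrix A is a hypothetical object.
//
//   Dune::BCRSMatrix<Dune::FieldMatrix<double,1,1>> A;            // assembled elsewhere
//   std::vector<size_t> rowCount(A.M(),0);                        // transposed: one counter per column of A
//   ThreadedMatrixDetail::getRowCount(A,/*isTransposed=*/true,rowCount);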
82
83
84 // Class with partial specialization for copying from a Dune ISTL sparse matrix into ThreadedMatrix Chunks.
85 // Supported modes of copying are (i) one-to-one direct copy (ii) transposed copy (iii) source is symmetric
86 // (superdiagonal part is never accessed).
87 template <class Entry, class Matrix, bool symmetric, bool transposed, class Index>
88 struct CopyMatrixToChunk
89 {
90 // The simple case: not symmetric, not transposed
91 static void init(Index first, Index last, Matrix const& matrix, std::vector<size_t,NumaAllocator<size_t>>& colStart,
92 std::vector<Index,NumaAllocator<Index>>& cols, std::vector<Entry,NumaAllocator<Entry>>& values,
93 std::vector<Index> const& /* nRowEntries */)
94 {
95 for (Index i=0; i<last-first; ++i)
96 {
97 Index j=0;
98 for (auto ci=matrix[first+i].begin(); ci!=matrix[first+i].end(); ++ci, ++j)
99 {
100 values[colStart[i]+j] = *ci;
101 cols[colStart[i]+j] = ci.index();
102 }
103 }
104 }
105 };
106
107 template <class Entry, class Matrix, bool transposed, class Index>
108 struct CopyMatrixToChunk<Entry, Matrix,true,transposed,Index>
109 {
110 // The hard case: symmetric with only lower triangular part to be accessed
111 static void init(Index first, Index last, Matrix const& matrix, std::vector<size_t,NumaAllocator<size_t>>& colStart,
112 std::vector<Index,NumaAllocator<Index>>& cols, std::vector<Entry,NumaAllocator<Entry>>& values,
113 std::vector<Index> const& nRowEntries)
114 {
115 // Symmetric case is a mixture of regular and transposed. First we insert the regular entries,
116 // subsequently the transposed ones.
117
118 // Start with regular entries (including the diagonal)
119 std::vector<Index> entriesInRow(last-first,0);
120 for (Index i=0; i<last-first; ++i)
121 {
122 Index j=0;
123 for (auto ci=matrix[first+i].begin(); ci!=matrix[first+i].end(); ++ci, ++j)
124 {
125 values[colStart[i]+j] = *ci;
126 cols[colStart[i]+j] = ci.index();
127 }
128 entriesInRow[i] = matrix[first+i].size();
129 assert(entriesInRow[i]<=nRowEntries[first+i]);
130 }
131
132 // Do the transposed entries (excluding the diagonal). Note that as we only want entries with
133 // column index >= first+1, we can start (for the transposed part) at row first+1
134 for (Index r=first+1; r!=matrix.N(); ++r)
135 for (auto c=matrix[r].begin(); c!=matrix[r].end(); ++c)
136 {
137 // get transposed indices
138 Index ridx = c.index();
139 if (ridx>=first && ridx<last)
140 {
141 Index cidx = r;
142
143 if (cidx > ridx) // take only superdiagonal entries
144 {
145 assert(ridx-first<colStart.size() && ridx-first < entriesInRow.size());
146 Index pos = colStart[ridx-first] + entriesInRow[ridx-first];
147 assert(pos<colStart[ridx-first+1]);
148 cols[pos] = cidx;
149 values[pos] = transpose(*c);
150 ++entriesInRow[ridx-first];
151 assert(entriesInRow[ridx-first]<=nRowEntries[ridx]);
152 }
153 }
154 }
155 }
156 };
157
158
159 template <class Entry, class Matrix, class Index>
160 struct CopyMatrixToChunk<Entry, Matrix,false,true,Index>
161 {
162 // The case of transposed, nonsymmetric supplied matrices.
163 static void init(Index first, Index last, Matrix const& matrix, std::vector<size_t,NumaAllocator<size_t>>& colStart,
164 std::vector<Index,NumaAllocator<Index>>& cols, std::vector<Entry,NumaAllocator<Entry>>& values,
165 std::vector<Index> const& nRowEntries)
166 {
167 // Essentially we have to exchange row and column indices. If we go through the matrix
168 // row by row, the entries of the transposed matrix to be stored in our CRS chunk come
169 // with random row indices but increasing column indices. Hence we can push the entries
170 // at the back of each row. We only have to remember where the current end of each row is.
171 std::vector<Index> entriesInRow(last-first,0);
172
173 for (auto r=matrix.begin(); r!=matrix.end(); ++r)
174 for (auto c=r->begin(); c!=r->end(); ++c)
175 {
176 // get transposed indices
177 Index ridx = c.index();
178 if (ridx>=first && ridx<last)
179 {
180 Index cidx = r.index();
181
182 assert(ridx-first<colStart.size() && ridx-first < entriesInRow.size());
183 Index pos = colStart[ridx-first] + entriesInRow[ridx-first];
184 assert(pos<colStart[ridx-first+1]);
185 cols[pos] = cidx;
186 values[pos] = transpose(*c);
187 ++entriesInRow[ridx-first];
188 assert(entriesInRow[ridx-first]<=nRowEntries[ridx]);
189 }
190 }
191 }
192 };
193
194 //-------------------------------------------------------------------------
195
196 // Class for copying matrix entries. If the entry is to be transposed during
197 // copying (e.g. because only the lower triangular part of the source block matrix
198 // has been stored), the matrix dimensions may change due to transposition
199 // (if they are not equal). This case is covered here - we know the entry is
200 // to be transposed.
201 template <class To, class From, class scalarsMatch = std::false_type>
202 struct MatrixEntry
203 {
204 static void copy(From const& from, To& to, bool isTransposed)
205 {
206 assert(isTransposed);
207 to = transpose(from);
208 }
209 };
210
211 // This specialization is for the case of different floating point types as scalar entries in the matrix blocks.
212 // Currently this is only implemented for square-shaped, non-transposed blocks.
213 template <class To, class From>
214 struct MatrixEntry<To, From, typename std::is_same<typename To::field_type, typename From::field_type>::type>
215 {
216 static void copy(From const& from, To& to, bool isTransposed)
217 {
218 assert(!isTransposed);
219 static_assert(To::rows == From::rows && To::cols == From::cols, "Mismatch of dimensions.");
220 for(int i=0;i<To::rows;++i)
221 for(int j=0;j<To::cols;++j)
222 to[i][j] = static_cast<typename To::field_type>(from[i][j]);
223 }
224 };
225
226 // The case of matching dimensions of source and target is covered here.
227 // Both cases can occur, transposition or not. Partial specialization of
228 // this handler class is necessary due to the runtime switch between
229 // transposed or not - this dynamic switch is only legal in C++ if the
230 // dimensions are the same in both cases.
231 template <class Entry>
232 struct MatrixEntry<Entry,Entry>
233 {
234 static void copy(Entry const& from, Entry& to, bool isTransposed)
235 {
236 if (isTransposed && from.rows > 1 && from.cols > 1 && from.rows != from.cols)
237 {
238 std::cerr << "Transposing rectangular Dune::FieldMatrix not implemented!" << std::endl
239 << "#rows=" << from.rows << ", #cols=" << from.cols << std::endl;
240 exit(1);
241 }
242 if (isTransposed && from.rows == from.cols)
243// to = transpose(from);
244// the following transposing code also covers the case that
245// the memory locations of from and to are the same
246 {
247 for (int i=0;i<from.rows;i++)
248 {
249 to[i][i]=from[i][i];
250 for (int j=i+1;j<from.cols;j++)
251 {
252 typename Entry::field_type s=from[j][i];
253 to[j][i]=from[i][j];
254 to[i][j]=s;
255 }
256 }
257 }
258 else
259 to = from;
260 }
261 };
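// Editorial illustration (not part of the library source): copying a square block with transposition
// via the MatrixEntry handler above.
//
//   Dune::FieldMatrix<double,2,2> from, to;
//   from[0][0]=1; from[0][1]=2; from[1][0]=3; from[1][1]=4;
//   MatrixEntry<Dune::FieldMatrix<double,2,2>,Dune::FieldMatrix<double,2,2>>::copy(from,to,/*isTransposed=*/true);
//   // to now holds the transpose [[1,3],[2,4]] of from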
262
263 //-------------------------------------------------------------------------
264
269 template <class Index=size_t>
270 class CRSChunkPatternInfo
271 {
272 public:
281 CRSChunkPatternInfo(Index first_, Index last_, Index cols_, bool symmetric, int node)
282 : firstRow(first_), lastRow(last_), cols(cols_), numaNode(node), symm(symmetric)
283 {}
284
288 Index first() const { return firstRow; }
289
293 Index last() const { return lastRow; }
294
298 Index columns() const { return cols; }
299
303 int node() const { return numaNode; }
304
308 bool symmetric() const { return symm; }
309
310 protected:
311 Index firstRow, lastRow; // first and one behind last row
312 Index cols; // number of columns
313
314 private:
315 int numaNode;
316 bool symm;
317 };
318
319 //-------------------------------------------------------------------------
320
333 template <class Index=size_t>
334 class CRSChunkPatternCreator: public CRSChunkPatternInfo<Index>
335 {
336 public:
340 typedef std::vector<Index> IndexArray; // column indices are indexed by user-provided Index type
341 // We could use a Numa allocator here, but testing as of 2014-03-12 revealed that the performance is
342 // *much* worse with Numa allocators, even though the Numa allocator is actually faster in
343 // synthetic tests of a similar structure. The reason for this disparity is so far unknown.
344
345
354 CRSChunkPatternCreator(Index firstRow, Index lastRow, Index ncols, bool symmetric, int node)
355 : CRSChunkPatternInfo<Index>(firstRow,lastRow,ncols,symmetric,node),
356 cols(lastRow-firstRow)
357 {
358 }
359
366 void reserve(Index nnzPerRow=8)
367 {
368 // perform allocation
369 for (auto& r: cols)
370 r.reserve(nnzPerRow);
371 }
372
376 void clear()
377 {
378 for (auto& r: cols)
379 {
380 r.clear();
381 r.shrink_to_fit();
382 }
383 }
384
400 template <class IterRow, class IterCol>
401 void addElements(IterRow fromRow, IterRow const toRow,
402 IterCol const fromCol, IterCol const toCol, bool colIsSorted=false)
403 {
404 // sort column indices if needed, and remove duplicates (set_union below does NOT remove duplicates...)
405 sortedCols.clear();
406 sortedCols.insert(end(sortedCols),fromCol,toCol);
407 if (!colIsSorted)
408 std::sort(begin(sortedCols),end(sortedCols));
409 sortedCols.erase(std::unique(begin(sortedCols),end(sortedCols)),end(sortedCols));
410
411 // For each row, merge the specified column indices into the column indices that are already present.
412 for ( ; fromRow != toRow; ++fromRow) // step through all affected rows
413 if (this->first() <= *fromRow && *fromRow < this->last()) // but treat only those that actually lie in our chunk
414 {
415 IndexArray& c = cols[*fromRow-this->first()];
416 auto top = this->symmetric()?
417 std::upper_bound(begin(sortedCols),end(sortedCols),*fromRow):
418 end(sortedCols); // omit superdiagonal elements if symmetric
419 tmp.resize(c.size() + std::distance(begin(sortedCols),top)); // reserve space for the union
420 tmp.erase(std::set_union(begin(c),end(c),begin(sortedCols),top, // compute the union
421 begin(tmp)),end(tmp)); // and cut the container at the proper end
422 std::swap(tmp,c); // write the sorted range into its target
423 }
424 // Note that instead of a temporary buffer tmp we could append the new column indices and use
425 // std::inplace_merge. This, however, uses a temporary buffer of its own behind the scenes, one that is held
426 // by the STL library implementation somewhere. It is highly probable that this memory is *not* located
427 // on our NUMA node, but frequently accessed from all nodes. Hence, false sharing is bound to occur with
428 // inplace_merge even if it performs proper locking.
429 }
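// Editorial usage sketch (not part of the library source): merging the couplings of one element
// (rows 3,4 with columns 3,4,7) into a chunk creator named 'creator' (hypothetical); rows outside
// [first(),last()) are silently ignored.
//
//   std::vector<size_t> rowIdx = {3,4};
//   std::vector<size_t> colIdx = {3,4,7};
//   creator.addElements(rowIdx.begin(),rowIdx.end(),colIdx.begin(),colIdx.end(),/*colIsSorted=*/true);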
430
434 void addAllElements(Index columns)
435 {
436 for (Index i=0; i<cols.size(); ++i)
437 {
438 cols[i].resize(this->symmetric()? i+this->first()+1: columns);
439 std::iota(begin(cols[i]),end(cols[i]),static_cast<Index>(0));
440 }
441 }
442
447 IndexArray const& row(Index i) const { return cols[i-this->first()]; }
448
452 size_t nonzeroes() const
453 {
454 size_t nnz = 0;
455 for (auto const& r: cols)
456 nnz += r.size();
457 return nnz;
458 }
459
470 size_t balanceForward(size_t const covered, size_t const nnz, int chunks, std::vector<IndexArray>& moveRows);
471
482 size_t balanceBackward(size_t covered, size_t const nnz, int chunks, std::vector<IndexArray>& moveRows);
483
484 private:
485 std::vector<IndexArray> cols; // For each row a std::vector of column indices
486 IndexArray tmp; // some temporary buffer
487 IndexArray sortedCols; // some temporary buffer for sorted column indices
488 };
489
490 //-------------------------------------------------------------------------
491
497 template <class Index=size_t>
498 class CRSChunkPattern: public CRSChunkPatternInfo<Index>
499 {
500 public:
501
508
524 template <class Expanded, class Condensed, class Matrix>
525 CRSChunkPattern(Index first, Index last, Expanded const& eIndices, Condensed const& cIndices,
526 Matrix const& mat, int node)
527 : CRSChunkPatternInfo<Index>(first,last,eIndices.size(),false,node)
528 , colStarts(last-first+1,0,NumaAllocator<size_t>(node)), cols(NumaAllocator<Index>(node))
529 {
530 cols.reserve(last-first); // prevent frequent reallocation (heuristic)
531
532 for (Index i=first; i<last; ++i) // scan all our rows
533 {
534 colStarts[i-first] = cols.size(); // note where this row starts
535
536 auto const& row = mat[eIndices[i]]; // get a handle to the column indices in that row
537
538 auto cend = row.end();
539 for (auto c = row.begin(); c!=cend; ++c) // step through all column entries in that row
540 {
541 Index ci = cIndices[c.index()]; // extract the condensed index
542 if (ci<eIndices.size()) // if that is in the condensed matrix range...
543 cols.push_back(ci); // ...include it
544 }
545 }
546
547 colStarts.back() = cols.size(); // sentinel
548
549 // investigate sparsity
550 nnzPerRow = this->last()==this->first()? 0 : nonzeroes() / (this->last()-this->first());
551 }
552
565 template <class Matrix>
566 CRSChunkPattern(Matrix const& matrix, bool isSymmetric, bool isTransposed,
567 Index firstRow, Index lastRow, bool symmetric, int node)
569 , colStarts(NumaAllocator<size_t>(node)), cols(NumaAllocator<Index>(node))
570 {
571 if (isSymmetric) // Make sure that transposed is only flagged if
572 isTransposed = false; // it's not symmetric.
573
574 // Compute start of row indices. Step through the supplied matrix and count all elements that
575 // fall in one of our rows.
576 std::vector<size_t,NumaAllocator<size_t>> rowCount(this->last()-this->first(),0,NumaAllocator<size_t>(node));
577
578 // Compute the row range of the supplied matrix we have to scan.
579 Index fromRow = this->first(), toRow = this->last();
580 if (isSymmetric && !symmetric) // If the supplied matrix is symmetrically stored but we are not, we have to
581 toRow = matrix.N(); // scan the later rows for the elements that appear in later columns here.
582 if (isTransposed) // If the supplied matrix is transposed, we have to scan all the rows
583 fromRow = 0, toRow = matrix.N(); // in order to cover all the columns in our range.
584
585 for (Index ri=fromRow; ri<toRow; ++ri)
586 {
587 auto cend = matrix[ri].end();
588 for (auto ci=matrix[ri].begin(); ci!=cend; ++ci)
589 {
590 Index r = ri, c = ci.index();
591 if (isTransposed) // supplied matrix is transposed, swap row and column indices
592 std::swap(r,c); // to get the row/col indices in our world
593 if (this->first()<=r && r<this->last() && (!symmetric || c<=r)) // yep, this element is in our chunk
594 ++rowCount[r-this->first()]; //
595 if (isSymmetric && !symmetric && c<r && this->first()<=c && c<this->last()) // if supplied matrix is stored symmetrically but we are not
596 ++rowCount[c-this->first()]; // copy truly subdiagonal entries to the upper triangle
597 }
598 }
599
600 // compute the partial sums of number of elements per row.
601 colStarts.resize(rowCount.size()+1,0);
602 std::partial_sum(rowCount.begin(),rowCount.end(),colStarts.begin()+1);
603
604 // Get space for all nonzeros
605 cols.resize(colStarts.back());
606
607 // Copy the column indices of elements.
608 for (Index ri=fromRow; ri<toRow; ++ri)
609 {
610 auto cend = matrix[ri].end();
611 for (auto ci=matrix[ri].begin(); ci!=cend; ++ci)
612 {
613 Index r = ri, c = ci.index();
614 if (isTransposed) // supplied matrix is transposed, swap row and column indices
615 std::swap(r,c); // to get the row/col indices in our world
616 if (this->first()<=r && r<this->last() && (!symmetric || c<=r)) // yep, this element is in our chunk
617 {
618 cols[colStarts[r-this->first()]+rowCount[r-this->first()]-1] = c; // enter the column index
619 --rowCount[r-this->first()]; // one less element to come
620 }
621 if (isSymmetric && !symmetric && c<r && this->first()<=c && c<this->last()) // if supplied matrix is stored symmetrically but we are not
622 {
623 cols[colStarts[c-this->first()]+rowCount[c-this->first()]-1] = r; // copy truly subdiagonal entries to the upper triangle
624 --rowCount[c-this->first()]; // one less element to come
625 }
626 }
627 }
628
629 // sort the column indices in each row
630 for (Index ri=0; ri<rowCount.size(); ++ri) {
631 assert(rowCount[ri] == 0);
632 std::sort(cols.begin()+colStarts[ri],cols.begin()+colStarts[ri+1]);
633 }
634
635 // investigate sparsity
636 nnzPerRow = this->last()==this->first()? 0 : nonzeroes() / (this->last()-this->first());
637 }
638
649 template <class Scalar, class Index2>
650 CRSChunkPattern(MatrixAsTriplet<Scalar,Index2> const& matrix, bool isSymmetric, bool isTransposed, // TODO: fuse with constructor above
651 Index firstRow, Index lastRow, bool symmetric, int node)
653 , colStarts(NumaAllocator<size_t>(node)), cols(NumaAllocator<Index>(node))
654 {
655 if (isSymmetric) // Make sure that transposed is only flagged if
656 isTransposed = false; // it's not symmetric.
657
658 // Compute start of row indices. Step through the supplied matrix and count all elements that
659 // fall in one of our rows.
660 std::vector<size_t,NumaAllocator<size_t>> rowCount(this->last()-this->first(),0,NumaAllocator<size_t>(node));
661
662 for (size_t i=0; i<matrix.ridx.size(); ++i)
663 {
664 Index r = matrix.ridx[i], c = matrix.cidx[i];
665 if (isTransposed) // supplied matrix is transposed, swap row and column indices
666 std::swap(r,c); // to get the row/col indices in our world
667 if (this->first()<=r && r<this->last() && (!symmetric || c<=r)) // yep, this element is in our chunk
668 ++rowCount[r-this->first()]; //
669 if (isSymmetric && !symmetric && c<r && this->first()<=c && c<this->last()) // if supplied matrix is stored symmetrically but we are not
670 ++rowCount[c-this->first()]; // copy truly subdiagonal entries to the upper triangle
671 }
672
673 // compute the partial sums of number of elements per row.
674 colStarts.resize(rowCount.size()+1,0);
675 std::partial_sum(rowCount.begin(),rowCount.end(),colStarts.begin()+1);
676
677 // Get space for all nonzeros
678 cols.resize(colStarts.back());
679
680 // Copy the column indices of elements.
681 for (size_t i=0; i<matrix.ridx.size(); ++i)
682 {
683 Index r = matrix.ridx[i], c = matrix.cidx[i];
684 if (isTransposed) // supplied matrix is transposed, swap row and column indices
685 std::swap(r,c); // to get the row/col indices in our world
686 if (this->first()<=r && r<this->last() && (!symmetric || c<=r)) // yep, this element is in our chunk
687 {
688 cols[colStarts[r-this->first()]+rowCount[r-this->first()]-1] = c; // enter the column index
689 --rowCount[r-this->first()]; // one less element to come
690 }
691 if (isSymmetric && !symmetric && c<r && this->first()<=c && c<this->last()) // if supplied matrix is stored symmetrically but we are not
692 {
693 cols[colStarts[c-this->first()]+rowCount[c-this->first()]-1] = r; // copy truly subdiagonal entries to the upper triangle
694 --rowCount[c-this->first()]; // one less element to come
695 }
696 }
697
698 // sort the column indices in each row
699 for (Index ri=0; ri<rowCount.size(); ++ri) // TODO: do this in parallel on our node?
700 {
701 assert(rowCount[ri] == 0);
702 auto first = cols.begin()+colStarts[ri];
703 auto last = cols.begin()+colStarts[ri+1];
704 std::sort(first,last); // sort in ascending order
705 last = std::unique(first,last); // duplicate entries may come from triplet matrices, remove
706 rowCount[ri] = last-first; // now we have the correct number of entries.
707 }
708
709 // Merging duplicate column indices in a row may have reduced the number of entries in that row,
710 // hence there may be a gap to the start of the next row. Compactify the column index array now.
711 for (Index ri=1; ri<rowCount.size(); ++ri)
712 {
713 Index newStart = colStarts[ri-1] + rowCount[ri-1];
714 if (newStart != colStarts[ri]) // entries need to be shifted
715 {
716 std::copy(cols.begin()+colStarts[ri],cols.begin()+colStarts[ri]+rowCount[ri],cols.begin()+newStart);
717 colStarts[ri] = newStart;
718 }
719 }
720 if (!rowCount.empty()) // set the sentinel to point just behind the last entry...
721 colStarts.back() = colStarts[rowCount.size()-1]+rowCount.back();
722 cols.erase(cols.begin()+colStarts.back(),cols.end()); // ...and drop the rubbish left over in the tail
723
724
725 // investigate sparsity
726 nnzPerRow = this->last()==this->first()? 0 : nonzeroes() / (this->last()-this->first());
727 }
728
729 // returns the column index of entry at position idx
730 Index col(size_t idx) const { return cols[idx]; }
731
736 size_t colStart(Index row) const { return colStarts[row]; }
737
742 typename std::vector<Index,NumaAllocator<Index>>::const_iterator colStartIterator(Index row) const
743 {
744 return cols.begin()+colStart(row);
745 }
746
755 size_t position(Index r, Index c) const
756 {
757 auto b = cols.begin();
758 r -= this->first();
759 assert(0 <= r && r+1 < colStarts.size());
760 auto p = std::lower_bound(b+colStarts[r],b+colStarts[r+1],c); // col indices are sorted in each row
761 if (p==b+colStarts[r+1] || *p!=c) // return sentinel in case the entry is not here
762 return std::numeric_limits<size_t>::max(); // sentinel (reconstructed): entry not present in this row
763 return p-b; // return offset
764 }
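// Editorial reading aid (not part of the library source): with first()==10, colStarts=={0,2,5},
// and cols=={1,3, 0,2,4}, the call position(11,2) searches the sorted range cols[2..5) and
// returns 3, while position(11,5) returns the sentinel because column 5 is absent from row 11.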
765
769 bool exists(Index r, Index c) const
770 {
771 return position(r,c) < storage(); // true iff position() did not return the sentinel (reconstructed)
772 }
773
779 size_t storage() const { return cols.size(); }
780
786 size_t nonzeroes() const;
787
791 Index nonzeroesPerRow() const { return nnzPerRow; }
792
793 private:
794
795 // Raw data storage. The allocators used are NUMA allocators, guaranteeing local memory access.
796 std::vector<size_t,NumaAllocator<size_t>> colStarts; // memory positions are indexed by size_t (since there may be MANY)
797 std::vector<Index,NumaAllocator<Index>> cols; // column indices are indexed by user-provided Index type
798 Index nnzPerRow; // average number of nonzeroes per row (rounded)
799
800 };
801
802
803 //-------------------------------------------------------------------------
804
809 template <class Entry, class Index>
811 {
812 public:
813 typedef typename std::vector<Index,NumaAllocator<Index>>::const_iterator ColIterator;
814 typedef typename std::vector<Entry,NumaAllocator<Entry>>::const_iterator ValueIterator;
816
817 // typedefs required by std::iterator_traits
818 using value_type = Entry;
819 using difference_type = std::ptrdiff_t;
820 using pointer = Entry const*;
821 using reference = Entry const&;
822 using iterator_category = std::random_access_iterator_tag;
823
825
826 Index index() const { return *col; }
827
832 Entry const& operator*() const { return *val; }
833 Entry const* operator->() const { return &*val; }
842 void operator++() { ++col; ++val; }
843 void operator--() { --col; --val; }
844 void operator+=(Index i) { col += i; val += i; }
845 void operator-=(Index i) { col -= i; val -= i; }
850 bool operator==(Self const& it) const { return val==it.val; }
851 bool operator!=(Self const& it) const { return ! (*this==it); }
852 difference_type operator-(Self const& it) const { return val-it.val; }
853
854 protected:
857 };
858
859 template <class Entry, class Index>
861 {
862 public:
863 typedef typename std::vector<Index,NumaAllocator<Index>>::const_iterator ColIterator;
864 typedef typename std::vector<Entry,NumaAllocator<Entry>>::iterator ValueIterator;
865
867
872 Entry& operator*() const { return const_cast<Entry&>(*this->val); }
873 Entry* operator->() const { return &(**this); }
877 };
878
879
880
881 //-------------------------------------------------------------------------
882
883 template <class Arguments, class Operation>
885 {
886 public:
888 {
889 };
890
891 iterator begin(size_t) const
892 {
893 }
894
895 iterator end(size_t) const
896 {
897 }
898 };
899
900 template <class Arguments, class Operation>
902 {
903 public:
905 {
906 }
907 };
908
909 //-------------------------------------------------------------------------
910
921 template <class Entry, class Index=size_t>
922 class CRSChunk
923 {
925
926 public:
928// typedef typename GetScalar<Entry>::type Scalar;
929
937 CRSChunk(int node)
938 : values(NumaAllocator<Entry>(node))
939 {
940 }
941
945 CRSChunk(std::shared_ptr<CRSChunkPattern<Index>> const& pattern_, Entry const& init)
946 : pat(pattern_), scatterMutex(4), values(pat->nonzeroes(),init,NumaAllocator<Entry>(pat->node()))
947 {
948 }
949
962 template <class Matrix>
963 CRSChunk(std::shared_ptr<CRSChunkPattern<Index>> const& pattern_, Matrix const& matrix, bool isSymmetric, bool isTransposed)
964 : CRSChunk(pattern_,Entry(0))
965 {
966 typedef typename Matrix::block_type SuppliedEntry;
967
968 Index first = pat->first(), last = pat->last();
969
970 if (isSymmetric) // Make sure that transposed is only flagged if
971 isTransposed = false; // it's not symmetric.
972
973 // Compute the row range of the supplied matrix we have to scan.
974 Index fromRow = first, toRow = last;
975 if (isSymmetric && !pat->symmetric()) // If the supplied matrix is symmetrically stored but we are not, we have to
976 toRow = matrix.N(); // scan the later rows for the elements that appear in later columns here.
977 if (isTransposed) // If the supplied matrix is transposed, we have to scan all the rows
978 fromRow = 0, toRow = matrix.N(); // in order to cover all the columns in our range.
979
980 // Copy the column indices of elements.
981 for (Index ri=fromRow; ri<toRow; ++ri)
982 {
983 auto cend = matrix[ri].end();
984 for (auto ci=matrix[ri].begin(); ci!=cend; ++ci)
985 {
986 Index r = ri, c = ci.index();
987 if (isTransposed) // supplied matrix is transposed, swap row and column indices
988 std::swap(r,c); // to get the row/col indices in our world
989 if (first<=r && r<last && (!pat->symmetric() || c<=r)) // yep, this element is in our chunk
990 {
991 assert(pat->position(r,c) < values.size());
992 MatrixEntry<Entry,SuppliedEntry>::copy(*ci,values[pat->position(r,c)],isTransposed); // copy, transpose if needed
993 }
994 if (isSymmetric && !pat->symmetric() && c<r && first<=c && c<last) // if supplied matrix is stored symmetrically but we are not..
995 MatrixEntry<Entry,SuppliedEntry>::copy(*ci,values[pat->position(c,r)],true); // copy, transpose
996 }
997 }
998 }
999
1011 template <class Scalar, class Index2>
1012 CRSChunk(std::shared_ptr<CRSChunkPattern<Index>> const& pattern_, MatrixAsTriplet<Scalar,Index2> const& matrix, bool isSymmetric, bool isTransposed)
1013 : CRSChunk(pattern_,Entry(0))
1014 {
1015 Index first = pat->first(), last = pat->last();
1016
1017 if (isSymmetric) // Make sure that transposed is only flagged if
1018 isTransposed = false; // it's not symmetric.
1019
1020
1021 // Copy the column indices of elements.
1022 for (size_t i=0; i<matrix.ridx.size(); ++i)
1023 {
1024 Index r = matrix.ridx[i], c = matrix.cidx[i];
1025 if (isTransposed) // supplied matrix is transposed, swap row and column indices
1026 std::swap(r,c); // to get the row/col indices in our world
1027 if (first<=r && r<last && (!pat->symmetric() || c<=r)) // yep, this element is in our chunk
1028 {
1029 assert(pat->position(r,c) < values.size());
1030 values[pat->position(r,c)] = matrix.data[i]; // copy (no transpose as triplet entries are scalar)
1031 }
1032 if (isSymmetric && !pat->symmetric() && c<r && first<=c && c<last) // if supplied matrix is stored symmetrically but we are not..
1033 values[pat->position(c,r)] = matrix.data[i]; // copy (no transpose as triplet entries are scalar)
1034 }
1035 }
1036
1050 template <class Matrix>
1051 CRSChunk(Matrix const& matrix, bool isSymmetric, bool isTransposed, Index firstRow, Index lastRow, bool symmetric, int node)
1052 : CRSChunk(std::make_shared<CRSChunkPattern<Index>>(matrix,isSymmetric,isTransposed,firstRow,lastRow,symmetric,node),
1053 matrix,isSymmetric,isTransposed)
1054 {}
1055
1068 template <class Expanded, class Condensed, class Matrix>
1069 CRSChunk(std::shared_ptr<CRSChunkPattern<Index>> const& pattern_,
1070 Expanded const& eIndices, Condensed const& cIndices, Matrix const& matrix)
1071 : CRSChunk(pattern_,Entry(0))
1072 {
1073 Index first = pat->first(), last = pat->last();
1074 for (Index r=first; r<last; ++r)
1075 {
1076 auto row = matrix[eIndices[r]];
1077 auto p = begin(values) + pat->colStart(r-first);
1078 for (auto c=row.begin(); c!=row.end(); ++c)
1079 if (cIndices[c.index()]<eIndices.size())
1080 {
1081 *p = *c;
1082 assert(!std::isnan(*p));
1083 ++p;
1084 }
1085
1086 assert(p==begin(values)+pat->colStart(r-first+1));
1087 }
1088 }
1089
1093 Self& operator=(Self const& c) = default;
1094
1098 template <class Value>
1099 Self& operator=(Value const& a)
1100 {
1101 std::fill(begin(values),end(values),a);
1102 return *this;
1103 }
1104
1105
1109 template <class Arguments, class Operation>
1111 {
1112 Index first = pat->first();
1113 for (Index r=0; r<pat->last()-first; ++r)
1114 {
1115 auto eend = e.end(r);
1116 auto ci = pat->colStartIterator(r);
1117 auto cend = pat->colStartIterator(r+1);
1118 auto vi = begin(values)+pat->colStart(r);
1119 for (auto ei = e.begin(r); ei != eend; ++ei) // step through the expression row
1120 {
1121 auto pos = std::find(ci,cend,ei.index()); // find our entry
1122 assert(pos != cend); // our pattern must be a superset of the expression pattern
1123 vi += pos-ci; // advance to found entry
1124 ci = pos;
1125 *vi = *ei; // assign
1126 }
1127 }
1128 }
1129
1134 template <class Domain, class Range>
1135 Scalar apply(Scalar a, Domain const& x, Range& y, bool initialize) const
1136 {
1137 Scalar dp = 0;
1138 auto const& p = *pat;
1139 Index first = p.first();
1140 Index last = p.last();
1141
1142 // Perform matrix vector multiplication. Step through all rows in our chunk...
1143 // On Numa nodes with several cores, one may think that using multiple threads
1144 // on this node could improve the performance. Alas, this appears not to be the
1145 // case as of 2014-05 (both on an aged AMD Opteron with 32 cores/8 nodes and on a newer
1146 // Intel Xeon with 32 cores/4 nodes). It seems that a single thread saturates the memory channel(s).
1147 // Hence we stay with the sequential version here.
1148 for (Index row=first; row<last; ++row)
1149 {
1150 // and compute the scalar product of (sparse) row and vector
1151 typename Range::block_type z(0);
1152
1153 size_t jEnd = p.colStart(row+1-first);
1154 for (size_t j=p.colStart(row-first); j<jEnd; ++j)
1155 if (Entry::rows==1 && Entry::cols==1)
1156 z.axpy(values[j][0][0],x[p.col(j)]); // this is necessary as otherwise scalar matrix entries and
1157 else // vector-valued rhs entries do not work together
1158 values[j].umv(x[p.col(j)],z);
1159
1160 // scale result as requested
1161 z *= a;
1162 if (initialize) y[row] = z;
1163 else y[row] += z;
1164
1165 // accumulate duality product
1166 if (Entry::rows==Entry::cols && row<x.size())
1167 dp += y[row]*x[row];
1168 }
1169
1170 return dp;
1171 }
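// Editorial note (not part of the library source): for a==1 and initialize==true the value
// returned above is this chunk's contribution to the duality product <Ax,x> = sum_row (Ax)_row * x_row;
// summing the per-chunk results yields x^T A x for square matrices.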
1172
1177 template <class Domain, class Range>
1178 void applyTransposed(Scalar a, Domain const& x, Range& y) const
1179 {
1180 auto const& p = *pat;
1181 Index first = p.first(); // first row in our chunk
1182 Index last = p.last(); // last row in our chunk
1183
1184 assert(x.size()>=last);
1185
1186 // We compute y_i = a sum_j A_ji^T x_j, with j in the outer loop
1187 for (Index j=first; j<last; ++j) // looping over the rows of A, i.e. the columns of A^T
1188 {
1189 size_t jEnd = p.colStart(j+1-first);
1190 for (size_t k=p.colStart(j-first); k<jEnd; ++k) // looping over the columns in the row
1191 {
1192 auto i = p.col(k); // the column index in A, i.e. the row index in A^T
1193 if (Entry::rows==1 && Entry::cols==1)
1194 y[i] += a*values[k][0][0] * x[j]; // this is necessary as otherwise scalar matrix entries and
1195 else // vector-valued rhs entries do not work together
1196 values[k].usmtv(a,x[j],y[i]); // y_i += a * (A^T)_ij x_j = a * (A_ji)^T x_j
1197 }
1198 }
1199 }
1200
1209 template <class Domain, class Range>
1210 void gatherMirrored(Scalar a, Domain const& x, Range& y, CRSChunk<Entry,Index> const& block, bool subdiagonal, bool initToZero) const
1211 {
1212 auto const& p = *block.pat;
1213 Index firstRow = p.first();
1214 Index lastRow = p.last();
1215 Index firstCol = pat->first();
1216 Index lastCol = pat->last();
1217
1218 assert(lastRow<=y.size());
1219
1220 if (initToZero)
1221 for (Index row=firstRow; row<lastRow; ++row)
1222 y[row] = 0;
1223
1224 for (Index row=firstRow; row<lastRow; ++row)
1225 {
1226 auto const& xrow = x[row];
1227
1228 auto cend = p.colStartIterator(row+1-firstRow);
1229 auto ci = firstCol==0? // We consider all elements with column index in our
1230 p.colStartIterator(row-firstRow): // row chunk. Compute the starting point by binary search
1231 std::lower_bound(p.colStartIterator(row-firstRow),cend,firstCol); // (or use the shortcut if we know we start from the beginning).
1232 auto vi = block.values.begin() + (ci-p.colStartIterator(0));
1233 if (subdiagonal)
1234 for ( ; ci!=cend && *ci<lastCol && *ci<row; ++ci, ++vi) // Then step through the row until we hit the upper bound,
1235 vi->usmtv(a,xrow,y[*ci]); // which is either the upper row range end or the diagonal,
1236 else // depending on the matrix symmetry (encoded in subdiagonal).
1237 for ( ; ci!=cend && *ci<lastCol; ++ci, ++vi)
1238 vi->usmtv(a,xrow,y[*ci]);
1239 }
1240 }
1241
1245 CRSChunkPattern<Index> const& pattern() const { return *pat; }
1246
1251 auto valStartIterator(Index row)
1252 {
1253 return values.begin()+pat->colStart(row);
1254 }
1255
1259 template <class LMIterator>
1260 void scatter(LMIterator first, LMIterator last)
1261 {
1262 Index firstRow = pat->first();
1263 Index lastRow = pat->last();
1264
1265 if (first==last || firstRow==lastRow) // empty data or empty chunk
1266 return; // -> nothing to do
1267
1268 Index nnzPerRow = pat->nonzeroesPerRow();
1269
1270 // scatter into uniform ranges
1271 Index n = scatterMutex.size();
1272 for (Index i=0; i<n; ++i)
1273 {
1274 #ifndef KASKADE_SEQUENTIAL
1275 boost::lock_guard<boost::mutex> lock(scatterMutex[i].get());
1276 #endif
1277
1278 Index rowRangeStart = uniformWeightRangeStart(i, n,lastRow-firstRow)+firstRow;
1279 Index rowRangeEnd = uniformWeightRangeStart(i+1,n,lastRow-firstRow)+firstRow;
1280
1281 if (nnzPerRow == pat->columns()) // completely dense
1282 scatterRowRange<0>(first,last,firstRow,rowRangeStart,rowRangeEnd);
1283 else if (nnzPerRow > 10*(first->cidx().size())) // many more entries than we scatter right now
1284 scatterRowRange<1>(first,last,firstRow,rowRangeStart,rowRangeEnd);
1285 else // rather few entries per row
1286 scatterRowRange<2>(first,last,firstRow,rowRangeStart,rowRangeEnd);
1287 }
1288 }
1289
1295 template <class F, class EntryB, class IndexB>
1296 void entrywiseOp(F const& f, NumaBCRSMatrix<EntryB,IndexB> const& B)
1297 {
1298 auto const& p = *pat;
1299 Index first = p.first(); // first row index
1300 Index last = p.last(); // end row index
1301
1302 for (Index row=first; row<last; ++row)
1303 {
1304 auto const& Br = B[row]; // get the other matrix row
1305 size_t jEnd = p.colStart(row+1-first); // step through all entries in this row
1306 auto bri = Br.begin();
1307 auto brend = Br.end();
1308 for (size_t j=p.colStart(row-first); j<jEnd; ++j)
1309 {
1310 while (bri!= brend && bri.index()<p.col(j)) // advance other row pointer until we find our entry (but do not overshoot)
1311 ++bri;
1312 if (bri!=brend && bri.index()==p.col(j)) // if the column indices coincide, apply the functor
1313 values[j] = f(values[j],*bri);
1314 }
1315 }
1316 }
1317
1318 private:
1319 std::shared_ptr<CRSChunkPattern<Index>> pat;
1320 std::vector<Mutex> scatterMutex;
1321
1322 // Raw data storage. The allocators used are NUMA allocators to guarantee local memory access.
1323 std::vector<Entry,NumaAllocator<Entry>> values; // matrix entries
1324
1325 // scatters given local matrices into the provided row range. The expected sparsity is
1326 // encoded as static template parameter in order to allow an efficient if-statement in
1327 // the innermost loop
1328 template <int sparsity, class LMIterator>
1329 void scatterRowRange(LMIterator first, LMIterator last, Index startRow, Index firstRow, Index lastRow)
1330 {
1331 // simple-minded scatter implementation
1332 for ( ; first!=last; ++first) // step through all local matrices
1333 {
1334 for (auto rgl: first->ridx()) // step through all global rows affected by this matrix
1335 {
1336 Index rg = rgl.first;
1337 if (rg >= lastRow) // global row indices in rgl.first are sorted ascendingly,
1338 break; // if we pass our last row, we can stop with this local matrix
1339 if (firstRow<=rg)
1340 scatterRow<sparsity>(*first,rg-startRow,rgl.second);
1341 }
1342 }
1343 }
1344
1345 template <int sparsity, class LM, class LIndex>
1346 void scatterRow(LM const& a, Index rg, LIndex rl)
1347 {
1348 auto cbegin = pat->colStartIterator(rg);
1349 auto cend = pat->colStartIterator(rg+1);
1350 auto vbegin = valStartIterator(rg);
1351
1352 if (LM::lumped) // only diagonal, i.e. column index is row index both in global and local matrix
1353 {
1354 auto pos = std::find(cbegin,cend,rg+pat->first());
1355 assert(pos != cend);
1356 vbegin[pos-cbegin] += a(rl,rl);
1357 }
1358 else
1359 {
1360 for (auto cgl: a.cidx())
1361 {
1362 auto pos = sparsity==0? std::lower_bound(cbegin,cend,cgl.first) // find entry in row with global column number cgl.first
1363 : sparsity==1? std::lower_bound(cbegin,cend,cgl.first) // TODO: specialized find for dense matrices.
1364 : std::find(cbegin,cend,cgl.first);
1365
1366 if (pos==cend)
1367 return; // reached end of row - skip the rest (there may be more, e.g. if we are symmetric)
1368
1369 vbegin += pos-cbegin; // move on to the found entry
1370 cbegin = pos;
1371
1372 *vbegin += a(rl,cgl.second); // add up appropriate entry from local matrix
1373 }
1374 }
1375 }
1376 };
1377
1378 //---------------------------------------------------------------------------------------------
1379
1380 template <class Entry, class Index>
1381 class NumaBCRSRow
1382 {
1383 public:
1384 typedef Index size_type;
1385 typedef Entry block_type;
1386
1389
1394
1398 Index const* getindexptr() const { return &*cBegin; }
1399
1403 block_type const* getptr() const { return &*vBegin; }
1404 block_type* getptr() { return &*vBegin; }
1405
1411
1414
1420 ConstIterator find(Index c) const
1421 {
1422 auto ci = std::lower_bound(cBegin,cEnd,c);
1423 if (ci==cEnd || *ci!=c)
1424 return end();
1425 return ConstIterator(ci,vBegin+(ci-cBegin));
1426 }
1427
1433 Iterator find(Index c)
1434 {
1435 auto ci = std::lower_bound(cBegin,cEnd,c);
1436 if (ci==cEnd || *ci!=c)
1437 return end();
1438 return Iterator(ci,vBegin+(ci-cBegin));
1439 }
1440
1451 Entry const& operator[](Index c) const
1452 {
1453 static Entry const zero(0);
1454
1455 auto ci = std::lower_bound(cBegin,cEnd,c);
1456 if (ci==cEnd || *ci!=c)
1457 return zero;
1458
1459 return vBegin[ci-cBegin];
1460 }
1461
1462
1469 Entry& operator[](Index c)
1470 {
1471 auto ci = std::lower_bound(cBegin,cEnd,c);
1472 if (ci==cEnd || *ci!=c)
1473 throw LookupException("write access to nonexistent matrix entry",__FILE__,__LINE__);
1474
1475 return vBegin[ci-cBegin];
1476 }
1477
1478
1479 protected:
1480 typedef typename std::vector<Index,NumaAllocator<Index>>::const_iterator ColIterator;
1481 typedef typename std::vector<Entry,NumaAllocator<Entry>>::iterator ValueIterator;
1482
1483 ColIterator cBegin, cEnd; // iterators giving the range of column indices
1484 ValueIterator vBegin, vEnd; // iterators giving the range of values
1485
1486 // Extracts the row begin and end iterators at the current position (if it is a valid position)
1487 void update(NumaBCRSMatrix<Entry,Index>& matrix, Index row, int chunk)
1488 {
1489 if (0<=row && row<matrix.N()) // we have a valid row in a valid chunk in front of us
1490 {
1491 auto& c = matrix.chunk(chunk);
1492 auto const& p = c.pattern();
1493 assert(p.first()<=row && row<p.last()); // check that the row is contained in the chunk
1494 Index first = p.first();
1495 cBegin = p.colStartIterator(row-first);
1496 cEnd = p.colStartIterator(row-first+1);
1497 vBegin = c.valStartIterator(row-first);
1498 vEnd = c.valStartIterator(row-first+1);
1499 }
1500 }
1501 };
1502
1503 // -----------------------------------------------------------------------------------
1504
1505 template <class Entry, class Index>
1506 class NumaBCRSMatrixConstIterator: public NumaBCRSRow<Entry,Index> // TODO: derive from const row type!
1507 {
1510 using value_type = Row;
1511
1512 public:
1513 NumaBCRSMatrixConstIterator(NumaBCRSMatrix<Entry,Index> const& matrix_, Index row_)
1514 : matrix(const_cast<NumaBCRSMatrix<Entry,Index>*>(&matrix_)), row(row_), chunk(getChunk(row_))
1515 {
1516 assert(row>=0 && row<=matrix->N());
1517 this->update(*matrix,row,chunk);
1518 }
1519
1521 void operator++()
1522 {
1523 ++row; // next row
1524 if (row==matrix->N()) // we've reached the end
1525 return; // -> skip anything else
1526 while (matrix->getPattern()->rowStart()[chunk+1]==row) // oops, we tripped into the next chunk...
1527 ++chunk; // which may be empty (zero rows), then go on
1528 this->update(*matrix,row,chunk);
1529 }
1530
1531 void operator--()
1532 {
1533 --row;
1534 while (matrix->getPattern()->rowStart()[chunk] > row) // oops, we stepped before our chunk
1535 --chunk; // previous one may be empty, then go on
1536 this->update(*matrix,row,chunk);
1537 }
1538
1539 void operator+=(Index inc)
1540 {
1541 row += inc;
1542 if (row > matrix->N())
1543 {
1544 row = matrix->N(); // we're behind the end - move to end
1545 return; // such that we compare equal to end.
1546 }
1547 chunk = getChunk(row);
1548 this->update(*matrix,row,chunk);
1549 }
1550
1551 bool operator==(Self const& it) const { return matrix==it.matrix && row==it.row; }
1552 bool operator!=(Self const& it) const { return !(*this==it); }
1554
1555 Index index() const { return row; }
1556
1558
1565 Row const& operator*() const { return *this; }
1566 Row const* operator->() const { return this; }
1568
1569 private:
1570 NumaBCRSMatrix<Entry,Index>* matrix;
1571 Index row;
1572 int chunk;
1573
1574 int getChunk(Index r) const { return matrix->getPattern()->chunk(r); }
1575 };
1576
1577
1578
1579 template <class Entry, class Index>
1581 {
1584
1585 public:
1587 : NumaBCRSMatrixConstIterator<Entry,Index>(matrix_,row_) {}
1588
1589
1590 Row& operator*() { return *this; }
1591 Row* operator->() { return this; }
1592 };
1593
1594
1595
1596
1597 } // end of namespace ThreadedMatrixDetail
1602 //---------------------------------------------------------------------------
1603 //---------------------------------------------------------------------------
1604
1615 template <class Index=size_t>
1616 class NumaCRSPatternCreator
1617 {
1619
1620 public:
1652 NumaCRSPatternCreator(Index rows_, Index cols, bool symmetric=false, int nnzPerRow=0)
1653 : rows(rows_), columns(cols)
1654 , sym(symmetric && rows==columns)
1655 {
1656 int nodes = static_cast<Index>(NumaThreadPool::instance().nodes());
1657 creators.reserve(nodes);
1658
1659 for (int i=0; i<nodes; ++i)
1660 if (sym)
1661 creators.push_back(ChunkCreator(i*rows/nodes,(i+1)*rows/nodes,cols,sym,i)); // TODO: more balanced
1662 else
1663 creators.push_back(ChunkCreator(i*rows/nodes,(i+1)*rows/nodes,cols,sym,i));
1664
1665 if (nnzPerRow > 0)
1666 parallelForNodes([this,nnzPerRow](int i, int n)
1667 {
1668 this->creators[i].reserve(nnzPerRow);
1669 },nodes);
1670 }
1671
1672 // The default destructor does not parallelize the release of memory on the NUMA chunks. Thus we
1673 // define our own destructor.
1674 ~NumaCRSPatternCreator()
1675 {
1676 int nodes = creators.size(); // (reconstructed) number of NUMA chunks to release in parallel
1677 parallelForNodes([this](int i, int n)
1678 {
1679 this->creators[i].clear();
1680 },nodes);
1681 }
1682
1688 void addElement(Index row, Index col)
1689 {
1690 addElements(&row,&row+1,&col,&col+1,true);
1691 }
1692
1701 void addDenseBlock(Index fromRow, Index toRow, Index fromCol, Index toCol)
1702 {
1703 assert(fromRow>=0 && toRow<=rows && fromRow<=toRow);
1704 assert(fromCol>=0 && toCol<=columns && fromCol<=toCol);
1705
1706 for (int r=fromRow; r<toRow; ++r) // TODO: this is probably inefficient and not
1707 for (int c=fromCol; c<toCol; ++c) // NUMA-parallelized. Consider a more
1708 addElement(r,c); // efficient implementation
1709 }
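// Editorial usage sketch (not part of the library source): a 100x100 pattern with a tridiagonal
// band plus a dense 4x4 coupling block in the upper left corner.
//
//   NumaCRSPatternCreator<size_t> creator(100,100,/*symmetric=*/false,/*nnzPerRow=*/3);
//   for (size_t i=0; i<100; ++i)
//   {
//     creator.addElement(i,i);
//     if (i+1<100) { creator.addElement(i,i+1); creator.addElement(i+1,i); }
//   }
//   creator.addDenseBlock(0,4,0,4);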
1710
1725 template <class IterRow, class IterCol>
1726 void addElements(IterRow const fromRow, IterRow const toRow, IterCol const fromCol, IterCol const toCol, bool colIsSorted=false)
1727 {
1728 assert(fromCol==toCol || *std::max_element(fromCol,toCol) < columns);
1729 assert(fromRow==toRow || *std::max_element(fromRow,toRow) < rows);
1730 for (auto& c: creators)
1731 c.addElements(fromRow,toRow,fromCol,toCol,colIsSorted);
1732 }
1733
1748 template <class RowRangeSequence, class ColRangeSequence>
1749 void addElements(RowRangeSequence const& rrs, ColRangeSequence const& crs, bool colsAreSorted=false)
1750 {
1751 assert(rrs.size() == crs.size());
1752
1753 for (auto c=std::begin(crs); c!=std::end(crs); ++c)
1754 assert((*c).empty() || (*std::max_element(std::begin(*c),std::end(*c)) < columns)); // check that all column indices are ok
1755
1756 parallelForNodes([&rrs,&crs,colsAreSorted,this](int i, int nodes) {
1757 auto c = std::begin(crs);
1758 for (auto r=std::begin(rrs); r!=std::end(rrs); ++r, ++c)
1759 this->creators[i].addElements(std::begin(*r),std::end(*r),std::begin(*c),std::end(*c),colsAreSorted);
1760 },nodes());
1761 }
1762
1770 void addAllElements()
1771 {
1772 parallelForNodes([this](int i, int nodes) {
1773 this->creators[i].addAllElements(this->columns);
1774 }, nodes());
1775 }
1776
1780 void addDiagonal()
1781 {
1782 for (Index i=0; i<std::min(rows,columns); ++i)
1783 addElement(i,i);
1784 }
1785
1791 size_t nonzeroes() const
1792 {
1793 size_t nnz = 0;
1794 for (auto const& c: creators)
1795 nnz += c.nonzeroes();
1796 return nnz;
1797 }
1798
1802 Index cols() const { return columns; }
1803
1811 void balance();
1812
1818 int nodes() const { return creators.size(); }
1819
1824 ChunkCreator const& creator(int node) const { return creators[node]; }
1825
1831 bool isSymmetric() const { return sym; }
1832
1833 private:
1834 Index rows, columns;
1835 std::vector<ChunkCreator> creators;
1836 bool sym;
1837 };
1838
1839 //---------------------------------------------------------------------------
1840
1846 template <class Index=size_t>
1847 class NumaCRSPattern
1848 {
1850
1851 public:
1852
1858 {}
1859
1863 NumaCRSPattern(NumaCRSPatternCreator<Index> const& creator)
1864 : patterns(creator.nodes()), sym(creator.isSymmetric()), rowSt(creator.nodes()+1), cols(creator.cols())
1865 {
1866 parallelForNodes([this,&creator](int i, int n)
1867 {
1868 this->patterns[i] = std::make_shared<ChunkPattern>(creator.creator(i));
1869 this->rowSt[i] = this->patterns[i]->first();
1870 },creator.nodes());
1871 rowSt[patterns.size()] = patterns.back()->last();
1872 }
1873
1887 template <class Expanded, class Condensed, class Matrix>
1888 NumaCRSPattern(Expanded const& eIndices, Condensed const& cIndices, Matrix const& mat)
1889 : patterns(NumaThreadPool::instance().nodes()), sym(false), rowSt(eIndices.size(),0), cols(eIndices.size())
1890 {
1891 assert(mat.M()==mat.N()); // works only for quadratic matrices
1892 assert(eIndices.size() <= cIndices.size());
1893
1894 for (Index r=0; r<eIndices.size(); ++r) // extract the number of entries in each
1895 { // row of the condensed matrix
1896 auto row = mat[eIndices[r]];
1897 for (auto ci=row.begin(); ci!=row.end(); ++ci)
1898 if (cIndices[ci.index()] < eIndices.size()) // only count those entries that fall in our col range
1899 ++rowSt[r];
1900 }
1901
1902 equalWeightRanges(rowSt,patterns.size()); // balance the number of entries in each chunk
1903
1904 for (int i=0; i<patterns.size(); ++i) // create patterns
1905 patterns[i] = std::make_shared<ChunkPattern>(rowSt[i],rowSt[i+1],eIndices,cIndices,mat,i);
1906 }
1907
1922 template <class Matrix>
1923 NumaCRSPattern(Matrix const& matrix, bool isSymmetric, bool isTransposed, bool symmetric)
1924 : sym(symmetric), cols(isTransposed? matrix.N(): matrix.M())
1925 {
1926 assert(isSymmetric || !symmetric);
1927
1928 // Compute an equilibrated distribution of rows to chunks (attempt as many chunks as NUMA nodes).
1929 Index rows = isTransposed? matrix.M(): matrix.N();
1930 std::vector<size_t> rowCount(rows,0);
1931 ThreadedMatrixDetail::getRowCount(matrix,isTransposed,rowCount);
1932 equalWeightRanges(rowCount,NumaThreadPool::instance().nodes()); // (reconstructed) turn per-row counts into balanced chunk boundary rows
1933
1934 // Create chunk patterns
1935 patterns.reserve(rowCount.size()-1);
1936 for (int i=0; i<rowCount.size()-1; ++i)
1937 patterns.push_back(std::make_shared<ChunkPattern>(matrix,isSymmetric,isTransposed,
1938 rowCount[i],rowCount[i+1],symmetric,i));
1939
1940 rowSt.resize(patterns.size()+1);
1941 for (int i=0; i<patterns.size(); ++i)
1942 rowSt[i] = patterns[i]->first();
1943 rowSt[patterns.size()] = patterns.back()->last();
1944 }
1945
1949 Index N() const { return rowSt.back(); }
1950
1954 Index M() const { return cols; }
1955
1961 size_t storage() const
1962 {
1963 size_t nnz = 0;
1964 for (auto const& p: patterns)
1965 nnz += p->storage();
1966 return nnz;
1967 }
1968
1972 size_t nonzeroes() const
1973 {
1974 size_t nnz = 0;
1975 for (auto const& p: patterns)
1976 nnz += p->nonzeroes();
1977 return nnz;
1978 }
1979
1983 int nodes() const { return patterns.size(); }
1984
1988 std::shared_ptr<ChunkPattern> pattern(int i) const { return patterns[i]; }
1989
1995 bool isSymmetric() const { return sym; }
1996
2003 int chunk(Index row) const
2004 {
2005 assert(0<=row && row<=N());
2006 auto it = std::upper_bound(rowSt.begin(),rowSt.end(),row); // use upper bound here, with lower_bound rows 0 and 1 would end up
2007 int c = it - rowSt.begin() - 1; // in different chunks...
2008 assert(0<=c && c<=patterns.size());
2009 assert(c==patterns.size() || (patterns[c]->first() <= row && row < patterns[c]->last()));
2010 return c;
2011 }
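// Editorial reading aid (not part of the library source): with rowStart()=={0,50,100} (two chunks
// of 50 rows each), chunk(0) and chunk(49) return 0, chunk(50) and chunk(99) return 1, and
// chunk(N())==chunk(100) returns 2, i.e. one past the last chunk.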
2012
2019 std::vector<Index> const& rowStart() const { return rowSt; }
2020
2024 bool exists(Index r, Index c) const
2025 {
2026 return patterns[chunk(r)]->exists(r,c);
2027 }
2028
2029 private:
2030 std::vector<std::shared_ptr<ChunkPattern>> patterns;
2031 bool sym;
2032 std::vector<Index> rowSt; // row start indices
2033 Index cols; // how many columns the pattern has
2034 };
2035
2036 template <class Index, class Index2>
2038 {
2039 assert(pa.N()==pb.N() && pa.M()==pb.M());
2040 Index nnzPerRow = (pa.nonzeroes()+pb.nonzeroes())/pa.N();
2041 NumaCRSPatternCreator<Index> creator(pa.N(),pa.M(),pa.isSymmetric()&&pb.isSymmetric(),nnzPerRow);
2042
2043 for (int i=0; i<pa.nodes(); ++i)
2044 {
2045 bool symmetrize = pa.isSymmetric() && !pb.isSymmetric();
2046 auto const& chunkPattern = *pa.pattern(i);
2047 Index first = chunkPattern.first();
2048 for (Index row=first; row<chunkPattern.last(); ++row)
2049 for (auto it=chunkPattern.colStartIterator(row-first); it!=chunkPattern.colStartIterator(row+1-first); ++it)
2050 {
2051 creator.addElement(row,*it);
2052 if (symmetrize)
2053 creator.addElement(*it,row);
2054 }
2055 }
2056
2057 for (int i=0; i<pb.nodes(); ++i)
2058 {
2059 bool symmetrize = pb.isSymmetric() && !pa.isSymmetric();
2060 auto const& chunkPattern = *pb.pattern(i);
2061 Index2 first = chunkPattern.first();
2062 for (Index2 row=first; row<chunkPattern.last(); ++row)
2063 for (auto it=chunkPattern.colStartIterator(row-first); it!=chunkPattern.colStartIterator(row+1-first); ++it)
2064 {
2065 creator.addElement(static_cast<Index>(row),*it);
2066 if (symmetrize)
2067 creator.addElement(*it,static_cast<Index>(row));
2068 }
2069 }
2070
2071 return NumaCRSPattern<Index>(creator);
2072 }
2073
2074 //---------------------------------------------------------------------------
2075
2113 template <class Entry, class Index=size_t>
2114 class NumaBCRSMatrix
2115 {
2118
2119 public:
2121 typedef Scalar field_type; // for compatibility with Dune
2122 typedef Entry block_type;
2123 using value_type = Entry; // or should this be row_type?
2124 using size_type = Index; // compatibility with Dune::BCRSMatrix
2125
2133
2135
2138
2141
2142
2151 NumaBCRSMatrix()
2152 : NumaBCRSMatrix(std::make_shared<NumaCRSPattern<Index>>())
2153 {}
2154
2158 NumaBCRSMatrix(Self const& A) = default; // TODO: do this in parallel
2159
2163 NumaBCRSMatrix(Self&& A) = default; // move should be efficient without parallelization
2164
2173 NumaBCRSMatrix(NumaCRSPatternCreator<Index> const& creator, Entry const& init=Entry(0))
2174 : NumaBCRSMatrix(std::make_shared<NumaCRSPattern<Index>>(creator),init)
2175 {}
2176
2183 NumaBCRSMatrix(std::shared_ptr<NumaCRSPattern<Index>> const& pattern_, Entry const& init=Entry(0))
2184 : pattern(pattern_)
2185 {
2186 for (int i=0; i<pattern->nodes(); ++i)
2187 chunks.push_back(Chunk(i));
2188
2189 parallelForNodes([this,&init] (int i, int n)
2190 {
2191 this->chunks[i] = std::move(Chunk(this->pattern->pattern(i),init));
2192 },chunks.size());
2193
2194 dps.resize(chunks.size());
2195 }
2196
2202 template <class Expanded, class Condensed, class Matrix>
2203 NumaBCRSMatrix(Expanded const& eIndices, Condensed const& cIndices, Matrix const& mat)
2204 : pattern(std::make_shared<NumaCRSPattern<Index>>(eIndices,cIndices,mat)) // TODO: move this to private, and have a convenience constructor creating cIndices under the hood
2205 {
2206 chunks.reserve(pattern->nodes());
2207 for (int i=0; i<pattern->nodes(); ++i)
2208 chunks.push_back(Chunk(pattern->pattern(i),eIndices,cIndices,mat)); // TODO: do this in parallel
2209 dps.resize(chunks.size());
2210 }
2211
2212 private:
2213
2214 // Creates a vector cIndices that for each index in the source matrix tells
2215 // the index in the extracted submatrix (or a too large sentinel value if the
2216 // index is not contained in the submatrix). I.e. result[eIndices[i]]==i
2217 // for 0 <= i < eIndices.size(), and result[j]>=eIndices.size() otherwise.
2218 template <class Expanded>
2219 static std::vector<Index> createCondensed(Expanded const& eIndices, Index n)
2220 {
2221 std::vector<Index> cIndices(n,eIndices.size());
2222 for (Index i=0; i<eIndices.size(); ++i)
2223 cIndices[eIndices[i]] = i;
2224 return cIndices;
2225 }
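// Editorial reading aid (not part of the library source): for eIndices=={2,5,7} and n==9 this
// produces cIndices=={3,3,0,3,3,1,3,2,3}; entries equal to eIndices.size()==3 mark source indices
// that do not belong to the extracted submatrix.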
2226
2227 public:
2228
2241 template <class Expanded, class Matrix>
2242 NumaBCRSMatrix(Expanded const& eIndices, Matrix const& mat)
2243 : NumaBCRSMatrix(eIndices,createCondensed(eIndices,mat.N()),mat)
2244 { }
2245
2246
2257 template <class Matrix>
2258 NumaBCRSMatrix(std::shared_ptr<NumaCRSPattern<Index>> const& pattern_, Matrix const& matrix,
2259 bool isSymmetric, bool isTransposed)
2260 : pattern(pattern_)
2261 {
2262 chunks.reserve(pattern->nodes());
2263 for (int i=0; i<pattern->nodes(); ++i)
2264 chunks.push_back(Chunk(pattern->pattern(i),matrix,isSymmetric,isTransposed)); // TODO: do this in parallel
2265 dps.resize(chunks.size());
2266 }
2267
2280 template <class OtherEntry>
2281 NumaBCRSMatrix(NumaBCRSMatrix<OtherEntry,Index> const& matrix)
2282 : NumaBCRSMatrix(matrix.getPattern(),matrix,matrix.getPattern()->isSymmetric(),false)
2283 { }
2284
2301 template <class Matrix>
2302 NumaBCRSMatrix(Matrix const& matrix,
2303 bool isSymmetric, bool isTransposed=false, bool symmetric=false)
2304 : NumaBCRSMatrix(std::make_shared<NumaCRSPattern<Index>>(matrix,isSymmetric,isTransposed,symmetric),
2305 matrix,isSymmetric,isTransposed)
2306 {
2307 assert(matrix.N()==matrix.M() || !(symmetric||isSymmetric)); // only go into symmetric mode if the matrix is square
2308 assert(Entry::rows==Entry::cols || !(symmetric||isSymmetric)); // symmetry only if the entries are square
2309
2310 // Make sure symmetry is handled correctly (even if not in debug mode). TODO: shall we take this as a feature instead of complaining via assert?
2311 symmetric &= Entry::rows==Entry::cols && matrix.N()==matrix.M();
2312 isSymmetric &= Entry::rows==Entry::cols && matrix.N()==matrix.M();
2313 }
2314
2333 NumaBCRSMatrix(NumaBCRSMatrix<Entry,Index> const& matrix,
2334 bool isSymmetric, bool isTransposed=false, bool symmetric=false)
2335 : NumaBCRSMatrix(isTransposed==false && isSymmetric==symmetric? matrix.pattern // just copy
2336 : std::make_shared<NumaCRSPattern<Index>>(matrix,isSymmetric,isTransposed,symmetric),
2337 matrix,isSymmetric,isTransposed)
2338 {
2339 assert(matrix.N()==matrix.M() || !(symmetric||isSymmetric)); // only go into symmetric mode if the matrix is square
2340 assert(Entry::rows==Entry::cols || !(symmetric||isSymmetric)); // symmetry only if the entries are square
2341 assert(isTransposed? (matrix.N()==M() && matrix.M()==N()): (matrix.N()==N() && matrix.M()==M()));
2342 assert(!isTransposed || Entry::rows==Entry::cols);
2343 }
2344
2345
2347
2359 Self& operator=(Self const& mat) = default;
2360
2366 Self& operator=(Self&& mat) = default;
2367
2371 Self& operator=(Entry const& a)
2372 {
2373 if (chunks.size()>0)
2374 parallelForNodes([this,&a] (int i, int n)
2375 {
2376 this->chunks[i] = a;
2377 },chunks.size());
2378
2379 return *this;
2380 }
2381
2385 Self& operator=(typename Entry::field_type const& a)
2386 {
2387 return *this = Entry(a);
2388 }
2389
2393 template <class Arguments, class Operation>
2394 Self& operator=(ThreadedMatrixDetail::NumaBCRSMatrixExpression<Arguments,Operation> const& e)
2395 {
2396 if (chunks.size()>0)
2397 parallelForNodes([this,&e] (int i, int n)
2398 {
2399 this->chunks[i] = e[i];
2400 },chunks.size());
2401
2402 return *this;
2403 }
2404
2410 template <class EntryB, class IndexB>
2411 Self& operator+=(NumaBCRSMatrix<EntryB,IndexB> const& B)
2412 {
2413 entrywiseOp([](Entry const& a, Entry const& b) { return a+b; }, B);
2414 return *this;
2415 }
2416
2422 template <class EntryB, class IndexB>
2423 Self& operator-=(NumaBCRSMatrix<EntryB,IndexB> const& B)
2424 {
2425 entrywiseOp([](Entry const& a, Entry const& b) { return a-b; }, B);
2426 return *this;
2427 }
2428
2434 template <class Factor>
2435 Self& operator*=(Factor a)
2436 {
2437 entrywiseOp([=](Entry const& e, Entry const&) { return a*e; }, *this);
2438 return *this;
2439 }
2440
2442
2468 iterator begin() { return iterator(*this,0); }
2469 const_iterator begin() const { return const_iterator(*this,0); }
2470
2474 iterator end() { return iterator(*this,N()); }
2475 const_iterator end() const { return const_iterator(*this,N()); }
2476
2482 row_type operator[](Index r) { return iterator(*this,r); }
2483 const_row_type operator[](Index r) const { return const_iterator(*this,r); }
2484
2486
2491 template <class RowIndices, class ColIndices>
2492 Self operator()(RowIndices const& ri, ColIndices const& ci) const
2493 {
2494 return submatrix<Self>(*this,ri,ci);
2495 };
2496
2509 Index N() const { return pattern->N(); }
2510
2514 Index M() const { return pattern->M(); }
2515
2522 size_t nonzeroes() const { return pattern->nonzeroes(); }
2523
2527 std::shared_ptr<NumaCRSPattern<Index>> getPattern() const { return pattern; }
2528
2532 Chunk& chunk(int i) { return chunks[i]; }
2533
2537 bool exists(Index r, Index c) const
2538 {
2539 return pattern->exists(r,c);
2540 }
2541
2546 field_type frobenius_norm() const
2547 {
2548 return sqrt(frobenius_norm2());
2549 }
2550
2555 field_type frobenius_norm2() const
2556 {
2557 field_type sum = 0;
2558 for (auto ri=begin(); ri!=end(); ++ri)
2559 for (auto ci=ri->begin(); ci!=ri->end(); ++ci)
2560 sum += ci->frobenius_norm2();
2561 return sum; // squared Frobenius norm; frobenius_norm() takes the square root
2562 }
2563
2565
2576 template <class X, class Y>
2577 field_type mv(X const& x, Y& y) const { return doMv(1.0,x,y,true); }
2578
2587 template <class X, class Y>
2588 void mtv(X const& x, Y& y) const { doMtv(1.0,x,y,true); }
2589
2593 template <class X, class Y>
2594 void mmv(X const& x, Y& y) const { doMv(-1.0,x,y,true); }
2595
2599 template <class X, class Y>
2600 field_type smv(field_type const& a, X const& x, Y& y) const { return doMv(a,x,y,true); }
2601
2608 template <class X, class Y>
2609 void smtv(field_type const& a, X const& x, Y& y) const { doMtv(a,x,y,true); }
2610
2614 template <class X, class Y>
2615 field_type umv(X const& x, Y& y) const { return doMv(1.0,x,y,false); }
2616
2623 template <class X, class Y>
2624 void umtv(X const& x, Y& y) const { doMtv(1.0,x,y,false); }
2625
2629 template <class X, class Y>
2630 field_type usmv(field_type const& a, X const& x, Y& y) const { return doMv(a,x,y,false); }
2631
2638 template <class X, class Y>
2639 void usmtv(field_type const& a, X const& x, Y& y) const { doMtv(a,x,y,false); }
2640
2642
2672 template <class LMIterator>
2673 void scatter(LMIterator first, LMIterator last)
2674 {
2675 for (auto& c: chunks)
2676 c.scatter(first,last);
2677 }
2678
2703 template <class RowIndices, class ColIndices, class BinaryOp=std::plus<Entry>>
2704 void scatter(DynamicMatrix<Entry> const& B, RowIndices const& rows, ColIndices const& cols,
2705 BinaryOp const& op = BinaryOp())
2706 {
2707 // TODO: implement this in the chunks for performance
2708 for (int r=0; r<rows.size(); ++r)
2709 for (int c=0; c<cols.size(); ++c)
2710 {
2711 auto ci = (*this)[rows[r]].find(cols[c]);
2712 assert(ci!=(*this)[rows[r]].end());
2713 *ci = op(*ci,B[r][c]);
2714 }
2715 }
2716
2718
2719
2720 private:
2721 std::shared_ptr<NumaCRSPattern<Index>> pattern;
2722 std::vector<Chunk> chunks;
2723 mutable std::vector<field_type> dps; // could be local to doMv, but we don't like frequent reallocations
2724
2725
2726 // Perform matrix-vector multiplication y = (initToZero? 0: y) + a A x and return a x^T Ax
2727 template <class X, class Y>
2728 Scalar doMv(Scalar const& a, X const& x, Y& y, bool initToZero) const
2729 {
2730 assert(y.size()==N());
2731 assert(x.size()==M());
2732
2733 if (chunks.size()>1)
2734 {
2735 parallelForNodes([this,a,&x,&y,initToZero] (int i, int n)
2736 {
2737 if (pattern->isSymmetric()) // if it's symmetric, we have to add the transpose of the other blocks
2738 for (int k=i; k<n; ++k)
2739 this->chunks[i].gatherMirrored(a,x,y,this->chunks[k],true,initToZero&&k==i);
2740 this->dps[i] = this->chunks[i].apply(a,x,y,initToZero&&!pattern->isSymmetric());
2741 },chunks.size());
2742
2743 return std::accumulate(dps.begin(),dps.end(),0.0);
2744 }
2745 else
2746 {
2747 if (pattern->isSymmetric())
2748 chunks[0].gatherMirrored(a,x,y,chunks[0],true,initToZero);
2749 return chunks[0].apply(a,x,y,initToZero&&!pattern->isSymmetric());
2750 }
2751 }
2752
2753 // Perform transposed matrix-vector multiplication y = (initToZero? 0: y) + a A^T x
2754 template <class X, class Y>
2755 void doMtv(Scalar const& a, X const& x, Y& y, bool initToZero) const
2756 {
2757 assert(y.size()==M());
2758 assert(x.size()==N());
2759
2760 if (pattern->isSymmetric()) // matrix is symmetric, hence we can multiply with A instead of A^T
2761 doMv(a,x,y,initToZero);
2762 else
2763 {
2764 if (initToZero)
2765 y = 0;
2766 for (auto const& chunk: chunks) // WARNING: applyTransposed is not thread-safe due to global unstructured scattering
2767 chunk.applyTransposed(a,x,y); // into y. It is possible to parallelize this, in a window-shingled fashion, but is
2768 } // it worth the implementation effort?
2769 }
2770
2771 // performs elementwise operations on two matrices (most useful for axpy-like operations):
2772 // a_ij = f(a_ij,b_ij)
2773 // This affects exactly the entries present in *both* matrices. Others are unchanged. Matrices have to have the same shape.
2774 template <class F, class EntryB, class IndexB>
2775 void entrywiseOp(F const& f, NumaBCRSMatrix<EntryB,IndexB> const& B)
2776 {
2777 assert(N()==B.N() && M()==B.M());
2778 parallelForNodes([this,&f,&B](int i, int n)
2779 {
2780 this->chunks[i].entrywiseOp(f,B);
2781 },chunks.size());
2782 }
2783 };
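// A minimal usage sketch for the class above (illustration only, not part of the original header;
// the names creator, A, x, y are hypothetical, and Dune::BlockVector requires dune/istl/bvector.hh):
//
//   NumaCRSPatternCreator<size_t> creator(4,4);             // 4x4 non-symmetric pattern
//   creator.addDiagonal();
//   for (size_t i=0; i+1<4; ++i)
//   {
//     creator.addElement(i,i+1);                            // first superdiagonal
//     creator.addElement(i+1,i);                            // first subdiagonal
//   }
//   NumaBCRSMatrix<Dune::FieldMatrix<double,1,1>,size_t> A(creator);
//   A = 1.0;                                                // assign 1 to every stored entry
//   Dune::BlockVector<Dune::FieldVector<double,1>> x(4), y(4);
//   x = 1.0;
//   A.mv(x,y);                                              // y = A*x, returns x^T A x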
2784
2785 // ------------------------------------------------------------------------------------------
2786
2794 template <class Entry, class Index>
2795 std::ostream& operator <<(std::ostream& out, NumaBCRSMatrix<Entry,Index> const& A)
2796 {
2797 for (auto ri=A.begin(); ri!=A.end(); ++ri)
2798 for (auto ci=ri->begin(); ci!=ri->end(); ++ci)
2799 out << ri.index() << ' ' << ci.index() << ' ' << *ci << std::endl;
2800 return out;
2801 }
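// Usage sketch (illustrative; A is any NumaBCRSMatrix): writes the matrix in triplet form,
// one "row column value" line per stored entry.
//
//   std::cout << A;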
2802
2803 // ----------------------------------------------------------------------------------------------
2804
2813 template <class SparseMatrix>
2814 DynamicMatrix<typename SparseMatrix::block_type> full(SparseMatrix const& A)
2815 {
2816 DynamicMatrix<typename SparseMatrix::block_type> B(A.N(),A.M());
2817 for (auto ri=A.begin(); ri!=A.end(); ++ri)
2818 for (auto ci=ri->begin(); ci!=ri->end(); ++ci)
2819 B[ri.index()][ci.index()] = *ci;
2820 return B;
2821 }
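// Usage sketch (illustrative): densify a small sparse matrix, e.g. for debugging output;
// only entries present in the sparsity pattern of A are written to the dense matrix.
//
//   auto Adense = full(A);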
2822
2840 template <class SparseMatrix, class RowRange, class ColRange>
2841 DynamicMatrix<typename SparseMatrix::block_type> full(SparseMatrix const& A, RowRange const& rows, ColRange const& cols)
2842 {
2843 using std::begin;
2844 using std::end;
2845 using Index = typename ColRange::value_type;
2846
2847 // First we prepare a lookup structure for efficient determination where
2848 // a sparse matrix entry should end up.
2849 std::vector<std::pair<Index,Index>> cidx;
2850 cidx.reserve(cols.size());
2851
2852 Index i=0;
2853 for (auto c: cols)
2854 {
2855 cidx.push_back(std::make_pair(c,i));
2856 ++i;
2857 }
2858 std::sort(begin(cidx),end(cidx),FirstLess());
2859
2860 // Now create the matrix and extract the entries.
2861 DynamicMatrix<typename SparseMatrix::block_type> B(rows.size(),cols.size());
2862
2863 for (size_t r=0; r<rows.size(); ++r)
2864 for (auto ci=A[rows[r]].begin(); ci!=A[rows[r]].end(); ++ci)
2865 {
2866 auto it = std::lower_bound(begin(cidx),end(cidx),
2867 std::make_pair(ci.index(),ci.index()),FirstLess());
2868 if (it!=end(cidx) && it->first==ci.index()) // this is an entry from our column range
2869 B[r][it->second] = *ci;
2870 }
2871
2872 return B;
2873 }
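// Usage sketch (illustrative): extract rows 0..1 and columns 2..3 of A as a dense 2x2 block.
//
//   std::vector<size_t> rows{0,1}, cols{2,3};
//   auto B = full(A,rows,cols);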
2874
2875 // ----------------------------------------------------------------------------------------------
2876
2884 template <class Entry, class Index, class Index2>
2885 NumaBCRSMatrix<Entry,Index> operator+(NumaBCRSMatrix<Entry,Index> const& A, NumaBCRSMatrix<Entry,Index2> const& B)
2886 {
2887 auto pat = std::make_shared<NumaCRSPattern<Index>>(*A.getPattern()+*B.getPattern());
2888 NumaBCRSMatrix<Entry,Index> AB(pat);
2889 assert(!A.getPattern()->isSymmetric() && !B.getPattern()->isSymmetric());
2890
2891 // TODO: parallelize
2892 for (Index row=0; row<AB.N(); ++row)
2893 {
2894 auto ABrow = AB[row];
2895 for (auto ci=A[row].begin(); ci!=A[row].end(); ++ci)
2896 ABrow[ci.index()] += *ci;
2897 for (auto ci=B[row].begin(); ci!=B[row].end(); ++ci)
2898 ABrow[static_cast<Index>(ci.index())] += *ci;
2899 }
2900 return AB;
2901 }
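// Usage sketch (illustrative): add two non-symmetric matrices of equal size; the pattern of the
// result is the union of both patterns.
//
//   auto C = A + B;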
2902
2903 // ----------------------------------------------------------------------------------------------
2904
2905
2917 template <class Scalar, int n, class Index=size_t>
2918 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,n,n>,Index> sparseUnitMatrix(Index N)
2919 {
2920 NumaCRSPatternCreator<Index> creator(N,N,false,1);
2921 for (Index i=0; i<N; ++i)
2922 creator.addElement(i,i);
2923 return NumaBCRSMatrix<Dune::FieldMatrix<Scalar,n,n>,Index>(creator,unitMatrix<Scalar,n>());
2924 }
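// Usage sketch (illustrative): an identity matrix with 100 diagonal blocks of size 2x2.
//
//   auto I = sparseUnitMatrix<double,2>(size_t(100));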
2925
2926
2927 // ----------------------------------------------------------------------------------------------
2928
2929
2941 template <class Scalar, int n, int m, class Index=size_t>
2942 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,n,m>,Index> sparseZeroMatrix(Index N, Index M)
2943 {
2944 NumaCRSPatternCreator<Index> creator(N,M,false,1);
2945 return NumaBCRSMatrix<Dune::FieldMatrix<Scalar,n,m>,Index>(creator);
2946 }
2947
2948 // ----------------------------------------------------------------------------------------------
2949
2955 template <class Entry, class Index>
2956 auto transpose(NumaBCRSMatrix<Entry,Index> const& A)
2957 {
2959 return At;
2960 }
2961
2962 // ----------------------------------------------------------------------------------------------
2963
2964
2978 template <class Target, class Source, class RowIndices, class ColIndices>
2979 Target eraseRowsNCols(Source const& A, RowIndices const& ri, ColIndices const& ci)
2980 {
2981 // rows and columns to keep
2982 std::vector<size_t> rows(A.N()), cols(A.M());
2983 std::iota(rows.begin(),rows.end(),0); // vector of rows to keep
2984 std::iota(cols.begin(),cols.end(),0); // vector of columns to keep
2985
2986 if(!ri.empty())
2987 for(std::size_t i=0; i<ri.size(); ++i)
2988 rows.erase(rows.begin() + ri[i] - i); // erase the corresponding row indices (assumes ri is sorted ascending; earlier erasures shift later positions by one)
2989
2990 if(!ci.empty())
2991 for(std::size_t i=0; i<ci.size(); ++i)
2992 cols.erase(cols.begin() + ci[i] - i); // erase the corresponding column indices (assumes ci is sorted ascending)
2993
2994 return submatrix(A,rows,cols);
2995 }
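// Usage sketch (illustrative; Mat names an assumed concrete matrix type): drop the first row
// and the last column of A.
//
//   using Mat = NumaBCRSMatrix<Dune::FieldMatrix<double,1,1>,size_t>;
//   std::vector<size_t> badRows{0}, badCols{A.M()-1};
//   Mat Ared = eraseRowsNCols<Mat>(A,badRows,badCols);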
2996
3009 template <class Target, class Source, class RowIndices>
3010 Target eraseRows(Source const& A, RowIndices const& ri)
3011 {
3012 // empty column vector (no columns to delete)
3013 std::vector<size_t> cols;
3014 return eraseRowsNCols(A,ri,cols);
3015 }
3016
3029 template <class Target, class Source, class ColIndices>
3030 Target eraseCols(Source const& A, ColIndices const& ci)
3031 {
3032 // empty row vector (no rows to delete)
3033 std::vector<size_t> rows;
3034 return eraseRowsNCols(A,rows,ci);
3035 }
3036
3037
3048 template <class Source>
3049 std::vector<size_t> nonZeroColumns(Source const& A)
3050 {
3051 // flag vector:
3052 // 1 = non zero column
3053 // 0 = zero column
3054 std::vector<size_t> nzFlags(A.M(),0);
3055 for(auto r=A.begin(); r!=A.end(); ++r)
3056 for(auto c=r->begin(); c!= r->end(); ++c)
3057 nzFlags.at(c.index()) = 1;
3058
3059 // build column index vector
3060 std::vector<size_t> nzCols;
3061 for(int i=0; i<nzFlags.size(); ++i)
3062 if(nzFlags.at(i) == 1) nzCols.push_back(i);
3063
3064 return nzCols;
3065 }
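// Usage sketch (illustrative; Mat names an assumed concrete matrix type): keep only the
// structurally nonzero columns of A.
//
//   using Mat = NumaBCRSMatrix<Dune::FieldMatrix<double,1,1>,size_t>;
//   auto keepCols = nonZeroColumns(A);
//   std::vector<size_t> allRows(A.N());
//   std::iota(allRows.begin(),allRows.end(),0);
//   Mat Acompact = submatrix<Mat>(A,allRows,keepCols);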
3066
3067
3085 template<int row2, int col2, int row1, int col1, class Scalar, class Index>
3086 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,row2,col2>,Index>
3087 reshapeBlocks(NumaBCRSMatrix<Dune::FieldMatrix<Scalar,row1,col1>,Index> const& A)
3088 {
3089 static_assert(row1 % row2 == 0);
3090 static_assert(col1 % col2 == 0);
3091 int const qN = row1/row2;
3092 int const qM = col1/col2;
3093
3094 Index N = A.N()*qN;
3095 Index M = A.M()*qM;
3096 NumaCRSPatternCreator<Index> creator(N, M, A.getPattern()->isSymmetric());
3097
3098 for(auto row=A.begin(); row!=A.end(); ++row)
3099 for(auto col=row->begin(); col!=row->end(); ++col) //loop over non-zero elements in A
3100 for(int i=0; i<qN; ++i)
3101 for(int j=0; j<qM; ++j)
3102 creator.addElement(row.index()*qN + i, col.index()*qM + j); //create qN*qM non-zero entries in B
3103
3104
3105 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,row2,col2>, Index> B(creator); //empty matrix with sparsity pattern
3106
3107 for(auto row=A.begin(); row!=A.end(); ++row)
3108 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in A
3109 {
3110 Dune::FieldMatrix<Scalar,row1,col1> mat = *col; // get block entry of A
3111 for(int k=0; k<row1; ++k)
3112 for(int l=0; l<col1; ++l)
3113 B[row.index()*qN + k/row2][col.index()*qM + l/col2][k%row2][l%col2] = mat[k][l]; // add data at corresponding entries of B
3114 }
3115
3116 return B;
3117 }
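// Usage sketch (illustrative; A22 is assumed to be a NumaBCRSMatrix<Dune::FieldMatrix<double,2,2>,size_t>):
// split the 2x2 blocks into scalar 1x1 blocks; the number of scalar rows and columns is unchanged.
//
//   auto Ascalar = reshapeBlocks<1,1>(A22);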
3118
3119
3120
3133 template<int blockrows, int blockcols, class Scalar, class Index>
3134 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index>
3135 horzcat(NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index> const& A,
3136 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index> const& B)
3137 {
3138 assert(A.N() == B.N());
3139 Index cols = (A.M() + B.M());
3140 Index rows = A.N();
3141
3142 NumaCRSPatternCreator<Index> creator(rows, cols, false);
3143
3144 //loop over non-zero elements in A
3145 for(auto row=A.begin(); row!=A.end(); ++row)
3146 for(auto col=row->begin(); col!=row->end(); ++col)
3147 creator.addElement(row.index(), col.index());
3148 //loop over non-zero elements in B
3149 for(auto row=B.begin(); row!=B.end(); ++row)
3150 for(auto col=row->begin(); col!=row->end(); ++col)
3151 creator.addElement(row.index(), col.index()+A.M()); // append columns on the right
3152
3153
3154 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index> result(creator); //empty matrix with sparsity pattern
3155
3156 for(auto row=A.begin(); row!=A.end(); ++row)
3157 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in A
3158 result[row.index()][col.index()] = *col;
3159
3160 for(auto row=B.begin(); row!=B.end(); ++row)
3161 for(auto col=row->begin(); col!=row->end(); ++col)
3162 result[row.index()][col.index()+A.M()] = *col;
3163
3164
3165 return result;
3166 }
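// Usage sketch (illustrative; A and B have the same block type and A.N()==B.N()): build [A B].
//
//   auto AB = horzcat(A,B); // AB.N()==A.N(), AB.M()==A.M()+B.M()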
3167
3180 template<int blockrows, int blockcols, class Scalar, class Index>
3181 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index>
3182 vertcat(NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index> const& A,
3183 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>,Index> const& B)
3184 {
3185 assert(A.M() == B.M());
3186 Index cols = A.M();
3187 Index rows = (A.N()+B.N());
3188 NumaCRSPatternCreator<Index> creator(rows, cols, false);
3189
3190 //loop over non-zero elements in A
3191 for(auto row=A.begin(); row!=A.end(); ++row)
3192 for(auto col=row->begin(); col!=row->end(); ++col)
3193 creator.addElement(row.index(), col.index());
3194 //loop over non-zero elements in B
3195 for(auto row=B.begin(); row!=B.end(); ++row)
3196 for(auto col=row->begin(); col!=row->end(); ++col)
3197 creator.addElement(row.index()+A.N(), col.index()); // append rows at the bottom
3198
3199
3200 NumaBCRSMatrix<Dune::FieldMatrix<Scalar,blockrows,blockcols>, Index> result(creator); //empty matrix with sparsity pattern
3201
3202 for(auto row=A.begin(); row!=A.end(); ++row)
3203 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in A
3204 result[row.index()][col.index()] = *col;
3205
3206 for(auto row=B.begin(); row!=B.end(); ++row)
3207 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in B
3208 result[row.index()+A.N()][col.index()] = *col;
3209
3210 return result;
3211 }
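// Usage sketch (illustrative; A and B have the same block type and A.M()==B.M()): stack A on top of B.
//
//   auto AoverB = vertcat(A,B); // AoverB.N()==A.N()+B.N(), AoverB.M()==A.M()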
3212
3213
3227 template<int blockrows, int blockcols, class Index=size_t>
3228 NumaBCRSMatrix<Dune::FieldMatrix<double,blockrows,blockcols>,Index>
3229 diagcat(NumaBCRSMatrix<Dune::FieldMatrix<double,blockrows,blockcols>,Index> const& A, NumaBCRSMatrix<Dune::FieldMatrix<double,blockrows,blockcols>,Index> const& B)
3230 {
3231 Index cols = (A.M()+B.M());
3232 Index rows = (A.N()+B.N());
3233 NumaCRSPatternCreator<Index> creator(rows, cols, false);
3234
3235 //loop over non-zero elements in A
3236 for(auto row=A.begin(); row!=A.end(); ++row)
3237 for(auto col=row->begin(); col!=row->end(); ++col)
3238 creator.addElement(row.index(), col.index());
3239 //loop over non-zero elements in B
3240 for(auto row=B.begin(); row!=B.end(); ++row)
3241 for(auto col=row->begin(); col!=row->end(); ++col)
3242 creator.addElement(row.index()+A.N(), col.index()+A.M()); // shift B into the lower right diagonal block
3243
3244
3245 NumaBCRSMatrix<Dune::FieldMatrix<double,blockrows,blockcols>, Index> result(creator); //empty matrix with sparsity pattern
3246
3247 for(auto row=A.begin(); row!=A.end(); ++row)
3248 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in A
3249 {
3250 auto mat = *col;
3251 for(int k=0; k < blockrows; ++k)
3252 for(int l= 0; l < blockcols; ++l)
3253 result[row.index()][col.index()][k][l] =mat[k][l];
3254 }
3255
3256 for(auto row=B.begin(); row!=B.end(); ++row)
3257 for(auto col=row->begin(); col!=row->end(); ++col) // loop over non-zero elements in B
3258 {
3259 auto mat = *col;
3260 for(int k=0; k < blockrows; ++k)
3261 for(int l= 0; l < blockcols; ++l)
3262 result[row.index()+A.N()][col.index()+A.M()][k][l] = mat[k][l];
3263 }
3264
3265
3266 return result;
3267 }
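// Usage sketch (illustrative; note that diagcat is restricted to double-valued blocks): block-diagonal
// composition of A and B; the off-diagonal blocks remain structurally empty.
//
//   auto D = diagcat(A,B); // D.N()==A.N()+B.N(), D.M()==A.M()+B.M()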
3268
3269} // end namespace Kaskade
3270
3271#endif
An exception that can be thrown whenever a key lookup fails.
std::vector< SparseIndexInt > ridx
row indices
Definition: triplet.hh:669
std::vector< SparseIndexInt > cidx
column indices
Definition: triplet.hh:671
std::vector< Scalar > data
data
Definition: triplet.hh:673
An STL allocator that uses memory of a specific NUMA node only.
Definition: threading.hh:653
A NUMA-aware compressed row storage matrix adhering mostly to the Dune ISTL interface (to complete....
iterator end()
returns an iterator pointing one past the last row
Self & operator-=(NumaBCRSMatrix< EntryB, IndexB > const &B)
Subtracts a sparse matrix from this one.
NumaBCRSMatrix(Expanded const &eIndices, Condensed const &cIndices, Matrix const &mat)
Deprecated. Use simpler version without cIndices instead.
const_iterator const_row_type
void mmv(X const &x, Y &y) const
Matrix-vector multiplication y = -Ax.
NumaBCRSMatrix(NumaBCRSMatrix< OtherEntry, Index > const &matrix)
Constructor copying a given matrix.
NumaBCRSMatrix(Matrix const &matrix, bool isSymmetric, bool isTransposed=false, bool symmetric=false)
Constructor.
Self & operator=(Self &&mat)=default
Move assignment.
Chunk & chunk(int i)
Obtains a reference to the given chunk.
Self & operator=(ThreadedMatrixDetail::NumaBCRSMatrixExpression< Arguments, Operation > const &e)
NOT YET IMPLEMENTED Assigns the given Numa matrix expression.
Self & operator+=(NumaBCRSMatrix< EntryB, IndexB > const &B)
Adds a sparse matrix to this one.
size_t nonzeroes() const
Returns the number of structurally nonzero elements.
NumaBCRSMatrix(Self &&A)=default
Move constructor.
Self & operator*=(Factor a)
Multiplication with a "scalar".
NumaBCRSMatrix(std::shared_ptr< NumaCRSPattern< Index > > const &pattern_, Entry const &init=Entry(0))
Constructor creating a matrix from a given sparsity pattern.
std::shared_ptr< NumaCRSPattern< Index > > getPattern() const
Returns a pointer to the sparsity pattern.
Self & operator=(typename Entry::field_type const &a)
Assigns the given scalar value to each entry.
NumaBCRSMatrix(NumaCRSPatternCreator< Index > const &creator, Entry const &init=Entry(0))
Constructor creating a matrix from a given sparsity pattern creator.
bool exists(Index r, Index c) const
returns true if (r,c) is structurally nonzero
void scatter(DynamicMatrix< Entry > const &B, RowIndices const &rows, ColIndices const &cols, BinaryOp const &op=BinaryOp())
Scatters the given submatrix into the matrix.
field_type smv(field_type const &a, X const &x, Y &y) const
Matrix-vector multiplication y = a A x.
void scatter(LMIterator first, LMIterator last)
Scatters given sub-matrices into the matrix by adding up their entries.
const_iterator end() const
NumaBCRSMatrix()
Constructs an empty 0x0 matrix.
NumaBCRSMatrix(Self const &A)=default
Copy constructor.
NumaBCRSMatrix(std::shared_ptr< NumaCRSPattern< Index > > const &pattern_, Matrix const &matrix, bool isSymmetric, bool isTransposed)
Constructor copying a given matrix.
ThreadedMatrixDetail::NumaBCRSMatrixIterator< Entry, Index > iterator
iterator type stepping through the rows
Self & operator=(Entry const &a)
Assigns the given value to each entry.
NumaBCRSMatrix(Expanded const &eIndices, Matrix const &mat)
Indexed submatrix constructor.
iterator begin()
returns an iterator to the first row
field_type usmv(field_type const &a, X const &x, Y &y) const
Matrix-vector multiplication y += a A x and subsequent computation of x^T A x if A is square.
row_type operator[](Index r)
Subscript operator allowing random access to rows.
const_row_type operator[](Index r) const
void umtv(X const &x, Y &y) const
Matrix-vector multiplication y += A^T x.
const_iterator begin() const
typename iterator::Iterator ColIterator
column iterator stepping through the entries of a row
field_type frobenius_norm() const
Computes the Frobenius norm ||A||_F.
ThreadedMatrixDetail::NumaBCRSMatrixConstIterator< Entry, Index > const_iterator
NumaBCRSMatrix(NumaBCRSMatrix< Entry, Index > const &matrix, bool isSymmetric, bool isTransposed=false, bool symmetric=false)
Constructor.
Self & operator=(Self const &mat)=default
Copy assignment.
void smtv(field_type const &a, X const &x, Y &y) const
Matrix-vector multiplication y = a A^T x.
void usmtv(field_type const &a, X const &x, Y &y) const
Matrix-vector multiplication y += a A^T x.
field_type umv(X const &x, Y &y) const
Matrix-vector multiplication y += A x and subsequent computation of x^T A x if A is square.
Self operator()(RowIndices const &ri, ColIndices const &ci) const
Index M() const
The number of columns.
void mtv(X const &x, Y &y) const
Matrix-vector multiplication y = A^T x.
typename iterator::ConstIterator ConstColIterator
column iterator stepping through the const entries of a row
field_type frobenius_norm2() const
Computes the square of the Frobenius norm, ||A||_F^2.
Index N() const
The number of rows.
ScalarType< Entry > Scalar
field_type mv(X const &x, Y &y) const
Matrix-vector multiplication y = A x with computation of x^T A x if A is square.
A NUMA-aware creator for matrix sparsity patterns.
Index cols() const
The number of columns.
void addElements(IterRow const fromRow, IterRow const toRow, IterCol const fromCol, IterCol const toCol, bool colIsSorted=false)
Enters entries into the sparsity pattern.
ChunkCreator const & creator(int node) const
Returns the chunk creator.
void balance()
Redistributes the rows to the NUMA chunks in order to have the same number of entries in each chunk.
void addElement(Index row, Index col)
Enters a single entry into the sparsity pattern.
int nodes() const
Returns the number of NUMA nodes/chunks used.
void addElements(RowRangeSequence const &rrs, ColRangeSequence const &crs, bool colsAreSorted=false)
Enters elements into the sparsity pattern.
size_t nonzeroes() const
The number of structurally nonzero elements.
void addAllElements()
Enters all possible elements (defining a dense matrix).
bool isSymmetric() const
Returns the symmetry status of the pattern.
NumaCRSPatternCreator(Index rows_, Index cols, bool symmetric=false, int nnzPerRow=0)
Constructs a rows times cols matrix sparsity structure creator.
void addDiagonal()
Enters the diagonal elements.
void addDenseBlock(Index fromRow, Index toRow, Index fromCol, Index toCol)
Enters a contiguous dense block into the sparsity pattern.
A NUMA-aware compressed row storage sparsity pattern.
std::shared_ptr< ChunkPattern > pattern(int i) const
Returns the individual patterns.
int nodes() const
Returns the number of NUMA nodes/chunks used.
int chunk(Index row) const
Returns the number of the chunk containing the given row.
Index N() const
The number of rows.
NumaCRSPattern()
Constructs an empty 0x0 pattern.
NumaCRSPattern(NumaCRSPatternCreator< Index > const &creator)
Constructor creating a sparsity pattern from the given creator.
NumaCRSPattern(Expanded const &eIndices, Condensed const &cIndices, Matrix const &mat)
Constructor.
bool isSymmetric() const
Returns the symmetry status of the pattern.
Index M() const
The number of columns.
std::vector< Index > const & rowStart() const
Returns the limiting row indices between the chunks.
size_t nonzeroes() const
Returns the number of structurally nonzero entries.
size_t storage() const
Returns the number of stored entries.
bool exists(Index r, Index c) const
queries whether an entry is present in the sparsity pattern
NumaCRSPattern(Matrix const &matrix, bool isSymmetric, bool isTransposed, bool symmetric)
Constructor extracting the sparsity pattern of a given matrix (usually a Dune::BCRSMatrix).
Implementation of thread pools suitable for parallelization of (more or less) memory-bound algorithms...
Definition: threading.hh:293
static NumaThreadPool & instance(int maxThreads=std::numeric_limits< int >::max())
Returns a globally unique thread pool instance.
int nodes() const
Reports the number of NUMA nodes (i.e., memory interfaces/CPU sockets)
Definition: threading.hh:316
This class stores a couple of compressed row storage rows in memory allocated locally on a NUMA node.
CRSChunk(std::shared_ptr< CRSChunkPattern< Index > > const &pattern_, MatrixAsTriplet< Scalar, Index2 > const &matrix, bool isSymmetric, bool isTransposed)
Constructor.
CRSChunk(int node)
Constructor initializing an empty Chunk.
CRSChunkPattern< Index > const & pattern() const
Returns the sparsity pattern of this chunk.
void scatter(LMIterator first, LMIterator last)
Scatters given sub-matrices into the chunk by adding up their entries.
CRSChunk(std::shared_ptr< CRSChunkPattern< Index > > const &pattern_, Entry const &init)
Constructor.
auto valStartIterator(Index row)
Returns an iterator to the start of the values for the given local row index.
void applyTransposed(Scalar a, Domain const &x, Range &y) const
Transposed matrix-vector multiplication y = a A^T x or y = y + a A^T x.
Scalar apply(Scalar a, Domain const &x, Range &y, bool initialize) const
Matrix-vector multiplication y = a A x or y = y + a A x and computation of x^T A x.
CRSChunk(std::shared_ptr< CRSChunkPattern< Index > > const &pattern_, Matrix const &matrix, bool isSymmetric, bool isTransposed)
Constructor.
Self & operator=(Self const &c)=default
Copy assignment.
void gatherMirrored(Scalar a, Domain const &x, Range &y, CRSChunk< Entry, Index > const &block, bool subdiagonal, bool initToZero) const
Matrix-vector multiplication of transposed parts.
void entrywiseOp(F const &f, NumaBCRSMatrix< EntryB, IndexB > const &B)
Performs entry-wise operations on our own matrix and some other.
CRSChunk(std::shared_ptr< CRSChunkPattern< Index > > const &pattern_, Expanded const &eIndices, Condensed const &cIndices, Matrix const &matrix)
Constructor.
Self & operator=(Value const &a)
Assigns the given value to each entry.
CRSChunk(Matrix const &matrix, bool isSymmetric, bool isTransposed, Index firstRow, Index lastRow, bool symmetric, int node)
Constructor.
Self & operator=(ThreadedMatrixDetail::NumaBCRSMatrixExpressionChunk< Arguments, Operation > const &e)
Assigns the given matrix expression componentwise.
typename EntryTraits< Entry >::field_type Scalar
A class supporting two-stage construction of sparsity patterns of NUMA matrix chunks.
size_t nonzeroes() const
Returns the number of stored entries (structurally nonzero elements).
size_t balanceBackward(size_t covered, size_t const nnz, int chunks, std::vector< IndexArray > &moveRows)
Moves rows in and out of the chunk in order to equilibrate the number of nonzeroes.
CRSChunkPatternCreator(Index firstRow, Index lastRow, Index ncols, bool symmetric, int node)
Constructor.
void addAllElements(Index columns)
Enters all possible elements (defining a dense chunk).
void addElements(IterRow fromRow, IterRow const toRow, IterCol const fromCol, IterCol const toCol, bool colIsSorted=false)
Enters elements into the sparsity pattern.
IndexArray const & row(Index i) const
Returns the sorted and unique column indices of elements in row i.
void clear()
Clears the complete data, handing back the memory.
size_t balanceForward(size_t const covered, size_t const nnz, int chunks, std::vector< IndexArray > &moveRows)
Moves rows in and out of the chunk in order to equilibrate the number of nonzeroes.
std::vector< Index > IndexArray
An STL container holding a sequence of indices.
void reserve(Index nnzPerRow=8)
Reserves a certain amount of entries per row.
This class maintains the sparsity structure of a couple of matrix rows (a NUMA matrix chunk).
bool exists(Index r, Index c) const
returns true if the entry with global row and column indices is present in the sparsity pattern
size_t colStart(Index row) const
returns the index from which on the entries in given local row are stored
size_t storage() const
Returns the number of stored entries.
CRSChunkPattern(CRSChunkPatternCreator< Index > const &creator)
Constructor.
CRSChunkPattern(Index first, Index last, Expanded const &eIndices, Condensed const &cIndices, Matrix const &mat, int node)
Constructor.
Index nonzeroesPerRow() const
Returns the average number of nonzeroes per row.
size_t position(Index r, Index c) const
returns the position of an element with given global row and column indices
CRSChunkPattern(Matrix const &matrix, bool isSymmetric, bool isTransposed, Index firstRow, Index lastRow, bool symmetric, int node)
Constructor.
CRSChunkPattern(MatrixAsTriplet< Scalar, Index2 > const &matrix, bool isSymmetric, bool isTransposed, Index firstRow, Index lastRow, bool symmetric, int node)
Constructor.
size_t nonzeroes() const
Returns the number of structurally nonzero entries.
std::vector< Index, NumaAllocator< Index > >::const_iterator colStartIterator(Index row) const
an iterator pointing to the start of the column index for the given local row
A base class representing basic meta information about sparsity patterns of NUMA matrix chunks.
Index columns() const
number of columns in the matrix
int node() const
the NUMA node on which to allocate the memory
Index last() const
end of the half-open row range
CRSChunkPatternInfo(Index first_, Index last_, Index cols_, bool symmetric, int node)
Constructor.
Index first() const
start of the covered row range
bool symmetric() const
if true, only the lower triangular part is stored
NumaBCRSMatrixConstIterator(NumaBCRSMatrix< Entry, Index > const &matrix_, Index row_)
Row const & operator*() const
Dereferencing yields the row.
An iterator stepping through all entries in a row.
NumaBCRSMatrixConstRowIterator(ColIterator col_, ValueIterator val_)
std::vector< Entry, NumaAllocator< Entry > >::const_iterator ValueIterator
std::vector< Index, NumaAllocator< Index > >::const_iterator ColIterator
NumaBCRSMatrixExpressionChunk< Arguments, Operation > const & operator[](int i) const
NumaBCRSMatrixIterator(NumaBCRSMatrix< Entry, Index > &matrix_, Index row_)
std::vector< Entry, NumaAllocator< Entry > >::iterator ValueIterator
NumaBCRSMatrixRowIterator(ColIterator col_, ValueIterator val_)
std::vector< Index, NumaAllocator< Index > >::const_iterator ColIterator
Entry const & operator[](Index c) const
Random read access to row entries by global column index.
NumaBCRSMatrixRowIterator< Entry, Index > Iterator
void update(NumaBCRSMatrix< Entry, Index > &matrix, Index row, int chunk)
Entry & operator[](Index c)
Random write access to row entries by global column index.
Iterator find(Index c)
Looks up a particular entry specified by column index.
ConstIterator begin() const
Start of row entries.
size_type size() const
Returns the number of entries in the current row.
Index const * getindexptr() const
Low-level access to array of (sorted) column indices in this row.
block_type const * getptr() const
Low-level access to array of values in this row.
std::vector< Index, NumaAllocator< Index > >::const_iterator ColIterator
NumaBCRSMatrixConstRowIterator< Entry, Index > ConstIterator
std::vector< Entry, NumaAllocator< Entry > >::iterator ValueIterator
ConstIterator find(Index c) const
Looks up a particular entry specified by column index.
This file contains various utility functions that augment the basic functionality of Dune.
NumaBCRSMatrix< Dune::FieldMatrix< Scalar, row2, col2 >, Index > reshapeBlocks(NumaBCRSMatrix< Dune::FieldMatrix< Scalar, row1, col1 >, Index > const &A)
reshapes the NumaBCRSMatrix entry block structure
NumaBCRSMatrix< Dune::FieldMatrix< Scalar, n, n >, Index > sparseUnitMatrix(Index N)
creates a unit matrix in NUMA block compressed row storage format
NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > vertcat(NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > const &A, NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > const &B)
concatenate two matrices vertically
auto transpose(NumaBCRSMatrix< Entry, Index > const &A)
Creates the transposed sparse matrix A^T.
DynamicMatrix< typename SparseMatrix::block_type > full(SparseMatrix const &A)
Converts a sparse NumaBCRSMatrix to a dense matrix.
Dune::FieldVector< T, n > max(Dune::FieldVector< T, n > x, Dune::FieldVector< T, n > const &y)
Componentwise maximum.
Definition: fixdune.hh:110
Target eraseCols(Source const &A, ColIndices const &ci)
"deletes" columns by extracting a copy of the matrix keeping only the non-deleted columns.
NumaBCRSMatrix< Dune::FieldMatrix< Scalar, n, m >, Index > sparseZeroMatrix(Index N, Index M)
creates a zero matrix in NUMA block compressed row storage format
DynamicMatrix< typename SparseMatrix::block_type > full(SparseMatrix const &A, RowRange const &rows, ColRange const &cols)
Converts a subrange of a sparse matrix to a dense matrix.
Target eraseRows(Source const &A, RowIndices const &ri)
"deletes" rows by extracting a copy of the matrix keeping only the non-deleted rows.
NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > horzcat(NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > const &A, NumaBCRSMatrix< Dune::FieldMatrix< Scalar, blockrows, blockcols >, Index > const &B)
concatenate two matrices horizontally
Target eraseRowsNCols(Source const &A, RowIndices const &ri, ColIndices const &ci)
"deletes" rows/columns by extracting a copy of the matrix keeping the non-deleted rows/columns.
std::vector< size_t > nonZeroColumns(Source const &A)
returns all indices of nonzero columns.
NumaBCRSMatrix< Dune::FieldMatrix< double, blockrows, blockcols >, Index > diagcat(NumaBCRSMatrix< Dune::FieldMatrix< double, blockrows, blockcols >, Index > const &A, NumaBCRSMatrix< Dune::FieldMatrix< double, blockrows, blockcols >, Index > const &B)
concatenate two matrices diagonally; the resulting matrix is zero off the block diagonal
NumaBCRSMatrix< Entry, Index > operator+(NumaBCRSMatrix< Entry, Index > const &A, NumaBCRSMatrix< Entry, Index2 > const &B)
Matrix addition A + B. The sparsity patterns of both matrices can be different. The size of the matrices h...
Dune::FieldVector< T, n > min(Dune::FieldVector< T, n > x, Dune::FieldVector< T, n > const &y)
Componentwise minimum.
Definition: fixdune.hh:122
Index uniformWeightRangeStart(BlockIndex i, BlockIndex n, Index m)
Computes partitioning points of ranges for uniform weight distributions.
Definition: threading.hh:75
void parallelForNodes(Func const &f, int maxTasks=std::numeric_limits< int >::max())
A parallel for loop that executes the given functor in parallel on different NUMA nodes.
Definition: threading.hh:604
void equalWeightRanges(std::vector< size_t > &x, size_t n)
Computes partitioning points such that the sum of weights in each partition is roughly the same.
Scalar distance(Point< Scalar, dim > const &first, Point< Scalar, dim > const &second)
void getRowCount(Dune::BCRSMatrix< Entry, Allocator > const &matrix, bool isTransposed, std::vector< Index > &rowCount)
std::ostream & operator<<(std::ostream &s, std::vector< Scalar > const &vec)
Definition: dune_bridge.hh:47
typename GetScalar< Type >::type ScalarType
Extracts the scalar field type from linear algebra data types.
T transpose(T x)
NumaCRSPattern< Index > operator+(NumaCRSPattern< Index > const &pa, NumaCRSPattern< Index2 > const &pb)
Target submatrix(Source const &A, RowIndices const &ri, ColIndices const &ci)
Entry field_type
Definition: scalar.hh:77
A comparator functor that supports sorting std::pair by their first component.
Definition: firstless.hh:22
static void init(Index first, Index last, Matrix const &matrix, std::vector< size_t, NumaAllocator< size_t > > &colStart, std::vector< Index, NumaAllocator< Index > > &cols, std::vector< Entry, NumaAllocator< Entry > > &values, std::vector< Index > const &nRowEntries)
static void init(Index first, Index last, Matrix const &matrix, std::vector< size_t, NumaAllocator< size_t > > &colStart, std::vector< Index, NumaAllocator< Index > > &cols, std::vector< Entry, NumaAllocator< Entry > > &values, std::vector< Index > const &nRowEntries)
static void init(Index first, Index last, Matrix const &matrix, std::vector< size_t, NumaAllocator< size_t > > &colStart, std::vector< Index, NumaAllocator< Index > > &cols, std::vector< Entry, NumaAllocator< Entry > > &values, std::vector< Index > const &)
static void copy(Entry const &from, Entry &to, bool isTransposed)
static void copy(From const &from, To &to, bool isTransposed)
static void copy(From const &from, To &to, bool isTransposed)