Ifpack2 Templated Preconditioning Package  Version 1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Ifpack2_Container_def.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_CONTAINER_DEF_HPP
11 #define IFPACK2_CONTAINER_DEF_HPP
12 
14 #include <Teuchos_Time.hpp>
15 
16 namespace Ifpack2 {
17 
18 // Implementation of Ifpack2::Container
19 
20 template <class MatrixType>
23  const Teuchos::Array<Teuchos::Array<LO>>& partitions,
24  bool pointIndexed)
25  : inputMatrix_(matrix)
26  , inputCrsMatrix_(Teuchos::rcp_dynamic_cast<const crs_matrix_type>(inputMatrix_))
27  , inputBlockMatrix_(Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(inputMatrix_))
28  , pointIndexed_(pointIndexed)
29  , IsInitialized_(false)
30  , IsComputed_(false) {
31  using Teuchos::Array;
32  using Teuchos::ArrayView;
33  using Teuchos::Comm;
34  using Teuchos::Ptr;
35  using Teuchos::RCP;
36  NumLocalRows_ = inputMatrix_->getLocalNumRows();
37  NumGlobalRows_ = inputMatrix_->getGlobalNumRows();
38  NumGlobalNonzeros_ = inputMatrix_->getGlobalNumEntries();
39  IsParallel_ = inputMatrix_->getRowMap()->getComm()->getSize() != 1;
41  if (hasBlockCrs_)
42  bcrsBlockSize_ = inputBlockMatrix_->getBlockSize();
43  else
44  bcrsBlockSize_ = 1;
47  else
48  scalarsPerRow_ = 1;
49  setBlockSizes(partitions);
50 // Sanity check the partitions
51 #ifdef HAVE_IFPACK2_DEBUG
52  // Check whether the input set of local row indices is correct.
53  const map_type& rowMap = *inputMatrix_->getRowMap();
54  for (int i = 0; i < numBlocks_; i++) {
56  for (LO j = 0; j < blockSizes_[i]; j++) {
57  LO row = blockRows[j];
58  if (pointIndexed) {
59  // convert the point row to the corresponding block row
60  row /= bcrsBlockSize_;
61  }
63  !rowMap.isNodeLocalElement(row),
64  std::invalid_argument,
65  "Ifpack2::Container: "
66  "On process "
67  << rowMap.getComm()->getRank() << " of "
68  << rowMap.getComm()->getSize() << ", in the given set of local row "
69  "indices blockRows = "
70  << Teuchos::toString(blockRows) << ", the following "
71  "entries is not valid local row index on the calling process: "
72  << row << ".");
73  }
74  }
75 #endif
76 }
77 
78 template <class MatrixType>
81 
82 template <class MatrixType>
84 Container<MatrixType>::getBlockRows(int blockIndex) const {
85  return Teuchos::ArrayView<const LO>(&blockRows_[blockOffsets_[blockIndex]], blockSizes_[blockIndex]);
86 }
87 
88 template <class MatrixType>
90  // First, create a grand total of all the rows in all the blocks
91  // Note: If partitioner allowed overlap, this could be greater than the # of local rows
92  LO totalBlockRows = 0;
93  numBlocks_ = partitions.size();
94  blockSizes_.resize(numBlocks_);
95  blockOffsets_.resize(numBlocks_);
96  maxBlockSize_ = 0;
97  for (int i = 0; i < numBlocks_; i++) {
98  LO rowsInBlock = partitions[i].size();
99  blockSizes_[i] = rowsInBlock;
100  blockOffsets_[i] = totalBlockRows;
101  totalBlockRows += rowsInBlock;
102  maxBlockSize_ = std::max(maxBlockSize_, rowsInBlock * scalarsPerRow_);
103  }
104  blockRows_.resize(totalBlockRows);
105  // set blockRows_: each entry is the partition/block of the row
106  LO iter = 0;
107  for (int i = 0; i < numBlocks_; i++) {
108  for (int j = 0; j < blockSizes_[i]; j++) {
109  blockRows_[iter++] = partitions[i][j];
110  }
111  }
112 }
113 
114 template <class MatrixType>
116  if (Diag_.is_null()) {
117  Diag_ = rcp(new vector_type(inputMatrix_->getDomainMap()));
118  inputMatrix_->getLocalDiagCopy(*Diag_);
119  }
120 }
121 
122 template <class MatrixType>
124  return IsInitialized_;
125 }
126 
127 template <class MatrixType>
129  return IsComputed_;
130 }
131 
132 template <class MatrixType>
134  applyMV(const mv_type& X, mv_type& Y) const {
135  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
136 }
137 
138 template <class MatrixType>
140  weightedApplyMV(const mv_type& X, mv_type& Y, vector_type& W) const {
141  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
142 }
143 
144 template <class MatrixType>
145 std::string Container<MatrixType>::
147  return "Generic";
148 }
149 
150 template <class MatrixType>
151 void Container<MatrixType>::DoGSBlock(ConstHostView X, HostView Y, HostView Y2, HostView Resid,
152  SC dampingFactor, LO i) const {
153  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
154 }
155 
156 template <class MatrixType>
157 void Container<MatrixType>::DoJacobi(ConstHostView X, HostView Y, SC dampingFactor) const {
158  using STS = Teuchos::ScalarTraits<ISC>;
159  const ISC one = STS::one();
160  // use blockRows_ and blockSizes_
161  size_t numVecs = X.extent(1);
162  // Non-overlapping Jacobi
163  for (LO i = 0; i < numBlocks_; i++) {
164  // may happen that a partition is empty
165  if (blockSizes_[i] != 1 || hasBlockCrs_) {
166  if (blockSizes_[i] == 0)
167  continue;
168  apply(X, Y, i, Teuchos::NO_TRANS, dampingFactor, one);
169  } else // singleton, can't access Containers_[i] as it was never filled and may be null.
170  {
171  LO LRID = blockRows_[blockOffsets_[i]];
172  getMatDiag();
173  auto diagView = Diag_->getLocalViewHost(Tpetra::Access::ReadOnly);
174  ISC d = one * (static_cast<ISC>(dampingFactor)) / diagView(LRID, 0);
175  for (size_t nv = 0; nv < numVecs; nv++) {
176  ISC x = X(LRID, nv);
177  Y(LRID, nv) += x * d;
178  }
179  }
180  }
181 }
182 
183 template <class MatrixType>
184 void Container<MatrixType>::DoOverlappingJacobi(ConstHostView X, HostView Y, ConstHostView W, SC dampingFactor, bool nonsymScaling) const {
185  using STS = Teuchos::ScalarTraits<SC>;
186  // Overlapping Jacobi
187  for (LO i = 0; i < numBlocks_; i++) {
188  // may happen that a partition is empty
189  if (blockSizes_[i] == 0)
190  continue;
191  if (blockSizes_[i] != 1) {
192  if (!nonsymScaling)
193  weightedApply(X, Y, W, i, Teuchos::NO_TRANS, dampingFactor, STS::one());
194  else {
195  // A crummy way of doing nonsymmetric scaling. We effectively
196  // first reverse scale x, which later gets scaled inside weightedApply
197  // so the net effect is that x is not scaled.
198  // This was done to keep using weightedApply() that is defined in
199  // many spots in the code.
200  HostView tempo("", X.extent(0), X.extent(1));
201  size_t numVecs = X.extent(1);
202  LO bOffset = blockOffsets_[i];
203  for (LO ii = 0; ii < blockSizes_[i]; ii++) {
204  LO LRID = blockRows_[bOffset++];
205  for (size_t jj = 0; jj < numVecs; jj++) tempo(LRID, jj) = X(LRID, jj) / W(LRID, 0);
206  }
207  weightedApply(tempo, Y, W, i, Teuchos::NO_TRANS, dampingFactor, STS::one());
208  }
209  } else // singleton, can't access Containers_[i] as it was never filled and may be null.
210  {
211  const ISC one = STS::one();
212  size_t numVecs = X.extent(1);
213  LO LRID = blockRows_[blockOffsets_[i]];
214  getMatDiag();
215  auto diagView = Diag_->getLocalViewHost(Tpetra::Access::ReadOnly);
216  ISC recip = one / diagView(LRID, 0);
217  ISC wval = W(LRID, 0);
218  ISC combo = wval * recip;
219  ISC d = combo * (static_cast<ISC>(dampingFactor));
220  for (size_t nv = 0; nv < numVecs; nv++) {
221  ISC x = X(LRID, nv);
222  Y(LRID, nv) = x * d + Y(LRID, nv);
223  }
224  }
225  }
226 }
227 
228 // Do Gauss-Seidel with just block i
229 // This is used 3 times: once in DoGaussSeidel and twice in DoSGS
230 template <class MatrixType, typename LocalScalarType>
232  ConstHostView X, HostView Y, HostView Y2, HostView Resid,
233  SC dampingFactor, LO i) const {
234  using Teuchos::ArrayView;
235  using STS = Teuchos::ScalarTraits<ISC>;
236  size_t numVecs = X.extent(1);
237  const ISC one = STS::one();
238  if (this->blockSizes_[i] == 0)
239  return; // Skip empty partitions
240  if (this->hasBlockCrs_ && !this->pointIndexed_) {
241  // Use efficient blocked version
242  ArrayView<const LO> blockRows = this->getBlockRows(i);
243  const size_t localNumRows = this->blockSizes_[i];
244  using inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
245  using vals_type = typename block_crs_matrix_type::values_host_view_type;
246  for (size_t j = 0; j < localNumRows; j++) {
247  LO row = blockRows[j]; // Containers_[i]->ID (j);
248  vals_type values;
249  inds_type colinds;
250  this->inputBlockMatrix_->getLocalRowView(row, colinds, values);
251  LO numEntries = (LO)colinds.size();
252  for (size_t m = 0; m < numVecs; m++) {
253  for (int localR = 0; localR < this->bcrsBlockSize_; localR++)
254  Resid(row * this->bcrsBlockSize_ + localR, m) = X(row * this->bcrsBlockSize_ + localR, m);
255  for (LO k = 0; k < numEntries; ++k) {
256  const LO col = colinds[k];
257  for (int localR = 0; localR < this->bcrsBlockSize_; localR++) {
258  for (int localC = 0; localC < this->bcrsBlockSize_; localC++) {
259  Resid(row * this->bcrsBlockSize_ + localR, m) -=
260  values[k * this->bcrsBlockSize_ * this->bcrsBlockSize_ + localR + localC * this->bcrsBlockSize_] * Y2(col * this->bcrsBlockSize_ + localC, m);
261  }
262  }
263  }
264  }
265  }
266  // solve with this block
267  //
268  // Note: I'm abusing the ordering information, knowing that X/Y
269  // and Y2 have the same ordering for on-proc unknowns.
270  //
271  // Note: Add flop counts for inverse apply
272  apply(Resid, Y2, i, Teuchos::NO_TRANS, dampingFactor, one);
273  } else if (!this->hasBlockCrs_ && this->blockSizes_[i] == 1) {
274  // singleton, can't access Containers_[i] as it was never filled and may be null.
275  // a singleton calculation (just using matrix diagonal) is exact, all residuals should be zero.
276  LO LRID = this->blockOffsets_[i]; // by definition, a singleton 1 row in block.
277  // Use the KokkosSparse internal matrix for low-overhead values/indices access
278  // But, can only do this if the matrix is accessible directly from host, since it's not a DualView
280  container_exec_space().fence();
281  auto localA = this->inputCrsMatrix_->getLocalMatrixHost();
282  using size_type = typename crs_matrix_type::local_matrix_host_type::size_type;
283  const auto& rowmap = localA.graph.row_map;
284  const auto& entries = localA.graph.entries;
285  const auto& values = localA.values;
286  this->getMatDiag();
287  auto diagView = this->Diag_->getLocalViewHost(Tpetra::Access::ReadOnly);
288  ISC d = (static_cast<ISC>(dampingFactor)) / diagView(LRID, 0);
289  for (size_t m = 0; m < numVecs; m++) {
290  // ISC x = X(LRID, m);
291  ISC r = X(LRID, m);
292  for (size_type k = rowmap(LRID); k < rowmap(LRID + 1); k++) {
293  const LO col = entries(k);
294  r -= values(k) * Y2(col, m);
295  }
296 
297  ISC newy = r * d;
298  Y2(LRID, m) += newy;
299  }
300  } else if (!this->inputCrsMatrix_.is_null() &&
301  std::is_same<typename crs_matrix_type::device_type::memory_space, Kokkos::HostSpace>::value) {
302  // Use the KokkosSparse internal matrix for low-overhead values/indices access
303  // But, can only do this if the matrix is accessible directly from host, since it's not a DualView
305  container_exec_space().fence();
306  auto localA = this->inputCrsMatrix_->getLocalMatrixHost();
307  using size_type = typename crs_matrix_type::local_matrix_host_type::size_type;
308  const auto& rowmap = localA.graph.row_map;
309  const auto& entries = localA.graph.entries;
310  const auto& values = localA.values;
311  ArrayView<const LO> blockRows = this->getBlockRows(i);
312  for (size_t j = 0; j < size_t(blockRows.size()); j++) {
313  const LO row = blockRows[j];
314  for (size_t m = 0; m < numVecs; m++) {
315  ISC r = X(row, m);
316  for (size_type k = rowmap(row); k < rowmap(row + 1); k++) {
317  const LO col = entries(k);
318  r -= values(k) * Y2(col, m);
319  }
320  Resid(row, m) = r;
321  }
322  }
323  // solve with this block
324  //
325  // Note: I'm abusing the ordering information, knowing that X/Y
326  // and Y2 have the same ordering for on-proc unknowns.
327  //
328  // Note: Add flop counts for inverse apply
329  apply(Resid, Y2, i, Teuchos::NO_TRANS, dampingFactor, one);
330  } else {
331  // Either a point-indexed block matrix, or a normal row matrix
332  // that doesn't support getLocalMatrix
333  ArrayView<const LO> blockRows = this->getBlockRows(i);
334  for (size_t j = 0; j < size_t(blockRows.size()); j++) {
335  const LO row = blockRows[j];
336  auto rowView = getInputRowView(row);
337  for (size_t m = 0; m < numVecs; m++) {
338  Resid(row, m) = X(row, m);
339  for (size_t k = 0; k < rowView.size(); ++k) {
340  const LO col = rowView.ind(k);
341  Resid(row, m) -= rowView.val(k) * Y2(col, m);
342  }
343  }
344  }
345  // solve with this block
346  //
347  // Note: I'm abusing the ordering information, knowing that X/Y
348  // and Y2 have the same ordering for on-proc unknowns.
349  //
350  // Note: Add flop counts for inverse apply
351  apply(Resid, Y2, i, Teuchos::NO_TRANS, dampingFactor, one);
352  }
353 }
354 
355 template <class MatrixType>
357  DoGaussSeidel(ConstHostView X, HostView Y, HostView Y2, SC dampingFactor) const {
358  using Teuchos::Array;
359  using Teuchos::ArrayRCP;
360  using Teuchos::ArrayView;
361  using Teuchos::Ptr;
362  using Teuchos::RCP;
363  using Teuchos::rcp;
364  using Teuchos::rcpFromRef;
365  // This function just extracts the diagonal if it hasn't already.
366  getMatDiag();
367  auto numVecs = X.extent(1);
368  // X = RHS, Y = initial guess
369  HostView Resid("", X.extent(0), X.extent(1));
370  for (LO i = 0; i < numBlocks_; i++) {
371  DoGSBlock(X, Y, Y2, Resid, dampingFactor, i);
372  }
373  if (IsParallel_) {
374  auto numMyRows = inputMatrix_->getLocalNumRows();
375  for (size_t m = 0; m < numVecs; ++m) {
376  for (size_t i = 0; i < numMyRows * bcrsBlockSize_; ++i) {
377  Y(i, m) = Y2(i, m);
378  }
379  }
380  }
381 }
382 
383 template <class MatrixType>
384 void Container<MatrixType>::
385  DoSGS(ConstHostView X, HostView Y, HostView Y2, SC dampingFactor) const {
386  // X = RHS, Y = initial guess
387  using Teuchos::Array;
388  using Teuchos::ArrayRCP;
389  using Teuchos::ArrayView;
390  using Teuchos::Ptr;
391  using Teuchos::ptr;
392  using Teuchos::RCP;
393  using Teuchos::rcp;
394  using Teuchos::rcpFromRef;
395  auto numVecs = X.extent(1);
396  HostView Resid("", X.extent(0), X.extent(1));
397  // Forward Sweep
398  for (LO i = 0; i < numBlocks_; i++) {
399  DoGSBlock(X, Y, Y2, Resid, dampingFactor, i);
400  }
401  static_assert(std::is_signed<LO>::value,
402  "Local ordinal must be signed (unsigned breaks reverse iteration to 0)");
403  // Reverse Sweep
404  for (LO i = numBlocks_ - 1; i >= 0; --i) {
405  DoGSBlock(X, Y, Y2, Resid, dampingFactor, i);
406  }
407  if (IsParallel_) {
408  auto numMyRows = inputMatrix_->getLocalNumRows();
409  for (size_t m = 0; m < numVecs; ++m) {
410  for (size_t i = 0; i < numMyRows * bcrsBlockSize_; ++i) {
411  Y(i, m) = Y2(i, m);
412  }
413  }
414  }
415 }
416 
417 template <class MatrixType>
418 void Container<MatrixType>::
419  clearBlocks() {
420  numBlocks_ = 0;
421  blockRows_.clear();
422  blockSizes_.clear();
423  blockOffsets_.clear();
424  Diag_ = Teuchos::null; // Diag_ will be recreated if needed
425 }
426 
427 // Implementation of Ifpack2::ContainerImpl
428 
429 template <class MatrixType, class LocalScalarType>
430 ContainerImpl<MatrixType, LocalScalarType>::
431  ContainerImpl(
433  const Teuchos::Array<Teuchos::Array<LO>>& partitions,
434  bool pointIndexed)
435  : Container<MatrixType>(matrix, partitions, pointIndexed) {}
436 
437 template <class MatrixType, class LocalScalarType>
440 
441 template <class MatrixType, class LocalScalarType>
444 
445 template <class MatrixType, class LocalScalarType>
447  applyInverseJacobi(const mv_type& /* X */, mv_type& /* Y */,
448  SC dampingFactor,
449  bool /* zeroStartingSolution = false */,
450  int /* numSweeps = 1 */) const {
451  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
452 }
453 
454 template <class MatrixType, class LocalScalarType>
456  applyMV(const mv_type& X, mv_type& Y) const {
457  ConstHostView XView = X.getLocalViewHost(Tpetra::Access::ReadOnly);
458  HostView YView = Y.getLocalViewHost(Tpetra::Access::ReadWrite);
459  this->apply(XView, YView, 0);
460 }
461 
462 template <class MatrixType, class LocalScalarType>
464  weightedApplyMV(const mv_type& X,
465  mv_type& Y,
466  vector_type& W) const {
467  ConstHostView XView = X.getLocalViewHost(Tpetra::Access::ReadOnly);
468  HostView YView = Y.getLocalViewHost(Tpetra::Access::ReadWrite);
469  ConstHostView WView = W.getLocalViewHost(Tpetra::Access::ReadOnly);
470  weightedApply(XView, YView, WView, 0);
471 }
472 
473 template <class MatrixType, class LocalScalarType>
476  return "Generic";
477 }
478 
479 template <class MatrixType, class LocalScalarType>
481  solveBlock(ConstHostSubviewLocal X,
482  HostSubviewLocal Y,
483  int blockIndex,
484  Teuchos::ETransp mode,
485  const LSC alpha,
486  const LSC beta) const {
487  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Not implemented.");
488 }
489 
490 template <class MatrixType, class LocalScalarType>
491 typename ContainerImpl<MatrixType, LocalScalarType>::LO
494  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
495  GO GO_INVALID = Teuchos::OrdinalTraits<GO>::invalid();
496  const map_type& globalRowMap = *(this->inputMatrix_->getRowMap());
497  const map_type& globalColMap = *(this->inputMatrix_->getColMap());
498  LO rowLID = row;
499  LO dofOffset = 0;
500  if (this->pointIndexed_) {
501  rowLID = row / this->bcrsBlockSize_;
502  dofOffset = row % this->bcrsBlockSize_;
503  }
504  GO diagGID = globalRowMap.getGlobalElement(rowLID);
506  diagGID == GO_INVALID,
507  std::runtime_error,
508  "Ifpack2::Container::translateRowToCol: "
509  "On process "
510  << this->inputMatrix_->getRowMap()->getComm()->getRank() << ", at least one row index in the set of local "
511  "row indices given to the constructor is not a valid local row index in "
512  "the input matrix's row Map on this process. This should be impossible "
513  "because the constructor checks for this case. Here is the complete set "
514  "of invalid local row indices: "
515  << rowLID << ". "
516  "Please report this bug to the Ifpack2 developers.");
517  // now, can translate diagGID (both a global row AND global col ID) to local column
518  LO colLID = globalColMap.getLocalElement(diagGID);
520  colLID == LO_INVALID,
521  std::runtime_error,
522  "Ifpack2::Container::translateRowToCol: "
523  "On process "
524  << this->inputMatrix_->getRowMap()->getComm()->getRank() << ", "
525  "at least one row index in the set of row indices given to the constructor "
526  "does not have a corresponding column index in the input matrix's column "
527  "Map. This probably means that the column(s) in question is/are empty on "
528  "this process, which would make the submatrix to extract structurally "
529  "singular. The invalid global column index is "
530  << diagGID << ".");
531  // colLID could identify a block column - translate to split column if needed
532  if (this->pointIndexed_)
533  return colLID * this->bcrsBlockSize_ + dofOffset;
534  return colLID;
535 }
536 
537 template <class MatrixType, class LocalScalarType>
539  apply(ConstHostView X,
540  HostView Y,
541  int blockIndex,
542  Teuchos::ETransp mode,
543  SC alpha,
544  SC beta) const {
545  using Teuchos::ArrayView;
546  using Teuchos::as;
547  using Teuchos::RCP;
548  using Teuchos::rcp;
549 
550  // The local operator might have a different Scalar type than
551  // MatrixType. This means that we might have to convert X and Y to
552  // the Tpetra::MultiVector specialization that the local operator
553  // wants. This class' X_ and Y_ internal fields are of the right
554  // type for the local operator, so we can use those as targets.
555 
557 
559  !this->IsComputed_, std::runtime_error,
560  "Ifpack2::Container::apply: "
561  "You must have called the compute() method before you may call apply(). "
562  "You may call the apply() method as many times as you want after calling "
563  "compute() once, but you must have called compute() at least once.");
564 
565  const size_t numVecs = X.extent(1);
566 
567  if (numVecs == 0) {
568  return; // done! nothing to do
569  }
570 
571  // The local operator works on a permuted subset of the local parts
572  // of X and Y. The subset and permutation are defined by the index
573  // array returned by getBlockRows(). If the permutation is trivial
574  // and the subset is exactly equal to the local indices, then we
575  // could use the local parts of X and Y exactly, without needing to
576  // permute. Otherwise, we have to use temporary storage to permute
577  // X and Y. For now, we always use temporary storage.
578  //
579  // Create temporary permuted versions of the input and output.
580  // (Re)allocate X_ and/or Y_ only if necessary. We'll use them to
581  // store the permuted versions of X resp. Y. Note that X_local has
582  // the domain Map of the operator, which may be a permuted subset of
583  // the local Map corresponding to X.getMap(). Similarly, Y_local
584  // has the range Map of the operator, which may be a permuted subset
585  // of the local Map corresponding to Y.getMap(). numRows_ here
586  // gives the number of rows in the row Map of the local Inverse_
587  // operator.
588  //
589  // FIXME (mfh 20 Aug 2013) There might be an implicit assumption
590  // here that the row Map and the range Map of that operator are
591  // the same.
592  //
593  // FIXME (mfh 20 Aug 2013) This "local permutation" functionality
594  // really belongs in Tpetra.
595 
596  if (X_localBlocks_.size() == 0 || X.extent(1) != X_local_.extent(1)) {
597  // need to resize (or create for the first time) the three scratch arrays
598  X_localBlocks_.clear();
599  Y_localBlocks_.clear();
600  X_localBlocks_.reserve(this->numBlocks_);
601  Y_localBlocks_.reserve(this->numBlocks_);
602 
603  X_local_ = HostViewLocal("X_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
604  Y_local_ = HostViewLocal("Y_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
605 
606  // create all X_local and Y_local managed Views at once, are
607  // reused in subsequent apply() calls
608  for (int i = 0; i < this->numBlocks_; i++) {
609  auto blockBounds = std::make_pair(this->blockOffsets_[i] * this->scalarsPerRow_,
610  (this->blockOffsets_[i] + this->blockSizes_[i]) * this->scalarsPerRow_);
611  X_localBlocks_.emplace_back(X_local_, blockBounds, Kokkos::ALL());
612  Y_localBlocks_.emplace_back(Y_local_, blockBounds, Kokkos::ALL());
613  }
614  }
615 
616  const ArrayView<const LO> blockRows = this->getBlockRows(blockIndex);
617 
618  if (this->scalarsPerRow_ == 1)
619  mvgs.gatherViewToView(X_localBlocks_[blockIndex], X, blockRows);
620  else
621  mvgs.gatherViewToViewBlock(X_localBlocks_[blockIndex], X, blockRows, this->scalarsPerRow_);
622 
623  // We must gather the contents of the output multivector Y even on
624  // input to solveBlock(), since the inverse operator might use it as
625  // an initial guess for a linear solve. We have no way of knowing
626  // whether it does or does not.
627 
628  if (this->scalarsPerRow_ == 1)
629  mvgs.gatherViewToView(Y_localBlocks_[blockIndex], Y, blockRows);
630  else
631  mvgs.gatherViewToViewBlock(Y_localBlocks_[blockIndex], Y, blockRows, this->scalarsPerRow_);
632 
633  // Apply the local operator:
634  // Y_local := beta*Y_local + alpha*M^{-1}*X_local
635  this->solveBlock(X_localBlocks_[blockIndex], Y_localBlocks_[blockIndex], blockIndex, mode,
636  LSC(alpha), LSC(beta));
637 
638  // Scatter the permuted subset output vector Y_local back into the
639  // original output multivector Y.
640  if (this->scalarsPerRow_ == 1)
641  mvgs.scatterViewToView(Y, Y_localBlocks_[blockIndex], blockRows);
642  else
643  mvgs.scatterViewToViewBlock(Y, Y_localBlocks_[blockIndex], blockRows, this->scalarsPerRow_);
644 }
645 
646 template <class MatrixType, class LocalScalarType>
648  weightedApply(ConstHostView X,
649  HostView Y,
650  ConstHostView D,
651  int blockIndex,
652  Teuchos::ETransp mode,
653  SC alpha,
654  SC beta) const {
655  using std::endl;
656  using Teuchos::ArrayRCP;
657  using Teuchos::ArrayView;
658  using Teuchos::Ptr;
659  using Teuchos::ptr;
660  using Teuchos::Range1D;
661  using Teuchos::RCP;
662  using Teuchos::rcp;
663  using Teuchos::rcp_const_cast;
664  using STS = Teuchos::ScalarTraits<SC>;
665 
666  // The local operator template parameter might have a different
667  // Scalar type than MatrixType. This means that we might have to
668  // convert X and Y to the Tpetra::MultiVector specialization that
669  // the local operator wants. This class' X_ and Y_ internal fields
670  // are of the right type for the local operator, so we can use those
671  // as targets.
672 
673  const char prefix[] = "Ifpack2::Container::weightedApply: ";
675  !this->IsComputed_, std::runtime_error, prefix << "You must have called the "
676  "compute() method before you may call this method. You may call "
677  "weightedApply() as many times as you want after calling compute() once, "
678  "but you must have called compute() at least once first.");
679 
680  // bmk 7-2019: BlockRelaxation already checked this, but if that changes...
682  this->scalarsPerRow_ > 1, std::logic_error, prefix << "Use of block rows isn't allowed "
683  "in overlapping Jacobi (the only method that uses weightedApply");
684 
685  const size_t numVecs = X.extent(1);
686 
688  X.extent(1) != Y.extent(1), std::runtime_error,
689  prefix << "X and Y have different numbers of vectors (columns). X has "
690  << X.extent(1) << ", but Y has " << Y.extent(1) << ".");
691 
692  if (numVecs == 0) {
693  return; // done! nothing to do
694  }
695 
696  const size_t numRows = this->blockSizes_[blockIndex];
697 
698  // The local operator works on a permuted subset of the local parts
699  // of X and Y. The subset and permutation are defined by the index
700  // array returned by getBlockRows(). If the permutation is trivial
701  // and the subset is exactly equal to the local indices, then we
702  // could use the local parts of X and Y exactly, without needing to
703  // permute. Otherwise, we have to use temporary storage to permute
704  // X and Y. For now, we always use temporary storage.
705  //
706  // Create temporary permuted versions of the input and output.
707  // (Re)allocate X_ and/or Y_ only if necessary. We'll use them to
708  // store the permuted versions of X resp. Y. Note that X_local has
709  // the domain Map of the operator, which may be a permuted subset of
710  // the local Map corresponding to X.getMap(). Similarly, Y_local
711  // has the range Map of the operator, which may be a permuted subset
712  // of the local Map corresponding to Y.getMap(). numRows_ here
713  // gives the number of rows in the row Map of the local operator.
714  //
715  // FIXME (mfh 20 Aug 2013) There might be an implicit assumption
716  // here that the row Map and the range Map of that operator are
717  // the same.
718  //
719  // FIXME (mfh 20 Aug 2013) This "local permutation" functionality
720  // really belongs in Tpetra.
721  if (X_localBlocks_.size() == 0 || X.extent(1) != X_local_.extent(1)) {
722  // need to resize (or create for the first time) the three scratch arrays
723  X_localBlocks_.clear();
724  Y_localBlocks_.clear();
725  X_localBlocks_.reserve(this->numBlocks_);
726  Y_localBlocks_.reserve(this->numBlocks_);
727 
728  X_local_ = HostViewLocal("X_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
729  Y_local_ = HostViewLocal("Y_local", this->blockRows_.size() * this->scalarsPerRow_, numVecs);
730 
731  // create all X_local and Y_local managed Views at once, are
732  // reused in subsequent apply() calls
733  for (int i = 0; i < this->numBlocks_; i++) {
734  auto blockBounds = std::make_pair(this->blockOffsets_[i] * this->scalarsPerRow_,
735  (this->blockOffsets_[i] + this->blockSizes_[i]) * this->scalarsPerRow_);
736  X_localBlocks_.emplace_back(X_local_, blockBounds, Kokkos::ALL());
737  Y_localBlocks_.emplace_back(Y_local_, blockBounds, Kokkos::ALL());
738  }
739  }
740  if (int(weightedApplyScratch_.extent(0)) != 3 * this->maxBlockSize_ ||
741  weightedApplyScratch_.extent(1) != numVecs) {
742  weightedApplyScratch_ = HostViewLocal("weightedApply scratch", 3 * this->maxBlockSize_, numVecs);
743  }
744 
745  ArrayView<const LO> blockRows = this->getBlockRows(blockIndex);
746 
748 
749  // note: BlockCrs w/ weighted Jacobi isn't allowed, so no need to use block gather/scatter
750  mvgs.gatherViewToView(X_localBlocks_[blockIndex], X, blockRows);
751  // We must gather the output multivector Y even on input to
752  // solveBlock(), since the local operator might use it as an initial
753  // guess for a linear solve. We have no way of knowing whether it
754  // does or does not.
755 
756  mvgs.gatherViewToView(Y_localBlocks_[blockIndex], Y, blockRows);
757 
758  // Apply the diagonal scaling D to the input X. It's our choice
759  // whether the result has the original input Map of X, or the
760  // permuted subset Map of X_local. If the latter, we also need to
761  // gather D into the permuted subset Map. We choose the latter, to
762  // save memory and computation. Thus, we do the following:
763  //
764  // 1. Gather D into a temporary vector D_local.
765  // 2. Create a temporary X_scaled to hold diag(D_local) * X_local.
766  // 3. Compute X_scaled := diag(D_loca) * X_local.
767  auto maxBS = this->maxBlockSize_;
768  auto bs = this->blockSizes_[blockIndex] * this->scalarsPerRow_;
769 
770  HostSubviewLocal D_local(weightedApplyScratch_, std::make_pair(0, bs), std::make_pair(0, 1));
771  mvgs.gatherViewToView(D_local, D, blockRows);
772  HostSubviewLocal X_scaled(weightedApplyScratch_, std::make_pair(maxBS, maxBS + bs), Kokkos::ALL());
773  for (size_t j = 0; j < numVecs; j++)
774  for (size_t i = 0; i < numRows; i++)
775  X_scaled(i, j) = X_localBlocks_[blockIndex](i, j) * D_local(i, 0);
776 
777  HostSubviewLocal Y_temp(weightedApplyScratch_, std::make_pair(maxBS * 2, maxBS * 2 + bs), Kokkos::ALL());
778  // Apply the local operator: Y_temp := M^{-1} * X_scaled
779  this->solveBlock(X_scaled, Y_temp, blockIndex, mode, STS::one(), STS::zero());
780  // Y_local := beta * Y_local + alpha * diag(D_local) * Y_temp.
781  //
782  // Note that we still use the permuted subset scaling D_local here,
783  // because Y_temp has the same permuted subset Map. That's good, in
784  // fact, because it's a subset: less data to read and multiply.
785  LISC a = alpha;
786  LISC b = beta;
787  for (size_t j = 0; j < numVecs; j++)
788  for (size_t i = 0; i < numRows; i++)
789  Y_localBlocks_[blockIndex](i, j) = b * Y_localBlocks_[blockIndex](i, j) + a * Y_temp(i, j) * D_local(i, 0);
790 
791  // Copy the permuted subset output vector Y_local into the original
792  // output multivector Y.
793  mvgs.scatterViewToView(Y, Y_localBlocks_[blockIndex], blockRows);
794 }
795 
796 template <class MatrixType, class LocalScalarType>
798  typename ContainerImpl<MatrixType, LocalScalarType>::SC,
799  typename ContainerImpl<MatrixType, LocalScalarType>::LO,
800  typename ContainerImpl<MatrixType, LocalScalarType>::GO,
801  typename ContainerImpl<MatrixType, LocalScalarType>::NO>
803  getInputRowView(LO row) const {
804  typedef typename MatrixType::nonconst_local_inds_host_view_type nonconst_local_inds_host_view_type;
805  typedef typename MatrixType::nonconst_values_host_view_type nonconst_values_host_view_type;
806 
807  typedef typename MatrixType::local_inds_host_view_type local_inds_host_view_type;
808  typedef typename MatrixType::values_host_view_type values_host_view_type;
809  using IST = typename row_matrix_type::impl_scalar_type;
810 
811  if (this->hasBlockCrs_) {
812  using h_inds_type = typename block_crs_matrix_type::local_inds_host_view_type;
813  using h_vals_type = typename block_crs_matrix_type::values_host_view_type;
814  h_inds_type colinds;
815  h_vals_type values;
816  this->inputBlockMatrix_->getLocalRowView(row / this->bcrsBlockSize_, colinds, values);
817  size_t numEntries = colinds.size();
818  // CMS: Can't say I understand what this really does
819  // return StridedRowView(values + row % this->bcrsBlockSize_, colinds, this->bcrsBlockSize_, numEntries * this->bcrsBlockSize_);
820  h_vals_type subvals = Kokkos::subview(values, std::pair<size_t, size_t>(row % this->bcrsBlockSize_, values.size()));
821  return StridedRowView(subvals, colinds, this->bcrsBlockSize_, numEntries * this->bcrsBlockSize_);
822  } else if (!this->inputMatrix_->supportsRowViews()) {
823  size_t maxEntries = this->inputMatrix_->getLocalMaxNumRowEntries();
824  Teuchos::Array<LO> inds(maxEntries);
825  Teuchos::Array<SC> vals(maxEntries);
826  nonconst_local_inds_host_view_type inds_v(inds.data(), maxEntries);
827  nonconst_values_host_view_type vals_v(reinterpret_cast<IST*>(vals.data()), maxEntries);
828  size_t numEntries;
829  this->inputMatrix_->getLocalRowCopy(row, inds_v, vals_v, numEntries);
830  vals.resize(numEntries);
831  inds.resize(numEntries);
832  return StridedRowView(vals, inds);
833  } else {
834  // CMS - This is dangerous and might not work.
835  local_inds_host_view_type colinds;
836  values_host_view_type values;
837  this->inputMatrix_->getLocalRowView(row, colinds, values);
838  return StridedRowView(values, colinds, 1, colinds.size());
839  }
840 }
841 
842 template <class MatrixType, class LocalScalarType>
844  clearBlocks() {
845  X_localBlocks_.clear();
846  Y_localBlocks_.clear();
847  X_local_ = HostViewLocal();
848  Y_local_ = HostViewLocal();
850 }
851 
852 namespace Details {
853 
854 // Implementation of Ifpack2::Details::StridedRowView
855 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
857  StridedRowView(h_vals_type vals_, h_inds_type inds_, int blockSize_, size_t nnz_)
858  : vals(vals_)
859  , inds(inds_)
860  , blockSize(blockSize_)
861  , nnz(nnz_) {}
862 
863 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
866  : vals()
867  , inds()
868  , blockSize(1)
869  , nnz(vals_.size()) {
870  valsCopy.swap(vals_);
871  indsCopy.swap(inds_);
872 }
873 
874 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
876  val(size_t i) const {
877 #ifdef HAVE_IFPACK2_DEBUG
878  TEUCHOS_TEST_FOR_EXCEPTION(i >= nnz, std::runtime_error,
879  "Out-of-bounds access into Ifpack2::Container::StridedRowView");
880 #endif
881  if (vals.size() > 0) {
882  if (blockSize == 1)
883  return vals[i];
884  else
885  return vals[i * blockSize];
886  } else
887  return valsCopy[i];
888 }
889 
890 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
891 LocalOrdinal StridedRowView<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
892  ind(size_t i) const {
893 #ifdef HAVE_IFPACK2_DEBUG
894  TEUCHOS_TEST_FOR_EXCEPTION(i >= nnz, std::runtime_error,
895  "Out-of-bounds access into Ifpack2::Container::StridedRowView");
896 #endif
897  // inds is smaller than vals by a factor of the block size (dofs/node)
898  if (inds.size() > 0) {
899  if (blockSize == 1)
900  return inds[i];
901  else
902  return inds[i / blockSize] * blockSize + i % blockSize;
903  } else
904  return indsCopy[i];
905 }
906 
907 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
908 size_t StridedRowView<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
909  size() const {
910  return nnz;
911 }
912 } // namespace Details
913 
914 } // namespace Ifpack2
915 
916 template <class MatrixType>
917 std::ostream& operator<<(std::ostream& os, const Ifpack2::Container<MatrixType>& obj) {
918  return obj.print(os);
919 }
920 
921 #define IFPACK2_CONTAINER_INSTANT(S, LO, GO, N) \
922  template class Ifpack2::Container<Tpetra::RowMatrix<S, LO, GO, N>>; \
923  template class Ifpack2::ContainerImpl<Tpetra::RowMatrix<S, LO, GO, N>, S>; \
924  template class Ifpack2::Details::StridedRowView<S, LO, GO, N>; \
925  template std::ostream& operator<< <Tpetra::RowMatrix<S, LO, GO, N>>( \
926  std::ostream& os, const Ifpack2::Container<Tpetra::RowMatrix<S, LO, GO, N>>& obj);
927 
928 #endif
virtual void apply(ConstHostView X, HostView Y, int blockIndex, Teuchos::ETransp mode=Teuchos::NO_TRANS, SC alpha=Teuchos::ScalarTraits< SC >::one(), SC beta=Teuchos::ScalarTraits< SC >::zero()) const
Compute Y := alpha * M^{-1} X + beta*Y.
Definition: Ifpack2_Container_def.hpp:539
void weightedApplyMV(const mv_type &X, mv_type &Y, vector_type &W) const
Wrapper for weightedApply with MVs, used in unit tests (never called by BlockRelaxation) ...
Definition: Ifpack2_Container_def.hpp:464
bool isComputed() const
Whether the container has been successfully computed.
Definition: Ifpack2_Container_def.hpp:128
void DoGSBlock(ConstHostView X, HostView Y, HostView Y2, HostView Resid, SC dampingFactor, LO i) const
Do one step of Gauss-Seidel on block i (used by DoGaussSeidel and DoSGS)
Definition: Ifpack2_Container_def.hpp:231
void setBlockSizes(const Teuchos::Array< Teuchos::Array< LO > > &partitions)
Initialize arrays with information about block sizes.
Definition: Ifpack2_Container_def.hpp:89
Teuchos::Array< LO > blockSizes_
Number of rows in each block.
Definition: Ifpack2_Container_decl.hpp:261
typename Kokkos::ArithTraits< SC >::val_type ISC
Internal representation of Scalar in Kokkos::View.
Definition: Ifpack2_Container_decl.hpp:101
StridedRowView(h_vals_type vals_, h_inds_type inds_, int blockSize_, size_t nnz_)
Constructor for row views (preferred)
Definition: Ifpack2_Container_def.hpp:857
virtual void applyMV(const mv_type &X, mv_type &Y) const
Wrapper for apply with MultiVector.
Definition: Ifpack2_Container_def.hpp:134
bool pointIndexed_
(If hasBlockCrs_) Whether the blocks are described using sub-block row indices instead of full block ...
Definition: Ifpack2_Container_decl.hpp:279
int numBlocks_
The number of blocks (partitions) in the container.
Definition: Ifpack2_Container_decl.hpp:257
void swap(Array &x)
The implementation of the numerical features of Container (Jacobi, Gauss-Seidel, SGS). This class allows a custom scalar type (LocalScalarType) to be used for storing blocks and solving the block systems. Hiding this template parameter from the Container interface simplifies the BlockRelaxation and ContainerFactory classes.
Definition: Ifpack2_Container_decl.hpp:307
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
virtual void setParameters(const Teuchos::ParameterList &List)
Set parameters, if any.
Definition: Ifpack2_Container_def.hpp:443
bool isInitialized() const
Whether the container has been successfully initialized.
Definition: Ifpack2_Container_def.hpp:123
LO scalarsPerRow_
Definition: Ifpack2_Container_decl.hpp:282
virtual ~Container()
Destructor.
Definition: Ifpack2_Container_def.hpp:80
bool hasBlockCrs_
Whether the input matrix is a BlockCRS matrix.
Definition: Ifpack2_Container_decl.hpp:275
Details::StridedRowView< SC, LO, GO, NO > getInputRowView(LO row) const
View a row of the input matrix.
Definition: Ifpack2_Container_def.hpp:803
LocalScalarType LSC
The internal representation of LocalScalarType in Kokkos::View.
Definition: Ifpack2_Container_decl.hpp:326
Declaration and definition of the Ifpack2::Details::MultiVectorLocalGatherScatter class...
Container(const Teuchos::RCP< const row_matrix_type > &matrix, const Teuchos::Array< Teuchos::Array< LO > > &partitions, bool pointIndexed)
Constructor.
Definition: Ifpack2_Container_def.hpp:21
int bcrsBlockSize_
If hasBlockCrs_, the number of DOFs per vertex. Otherwise 1.
Definition: Ifpack2_Container_decl.hpp:277
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg)
LO translateRowToCol(LO row)
Definition: Ifpack2_Container_def.hpp:493
Teuchos::RCP< const block_crs_matrix_type > inputBlockMatrix_
The input matrix, dynamic cast to BlockCrsMatrix. May be null.
Definition: Ifpack2_Container_decl.hpp:254
Teuchos::RCP< const row_matrix_type > inputMatrix_
The input matrix to the constructor.
Definition: Ifpack2_Container_decl.hpp:248
virtual void weightedApplyMV(const mv_type &X, mv_type &Y, vector_type &W) const
Wrapper for weightedApply with MultiVector.
Definition: Ifpack2_Container_def.hpp:140
void applyMV(const mv_type &X, mv_type &Y) const
Wrapper for apply with MVs, used in unit tests (never called by BlockRelaxation)
Definition: Ifpack2_Container_def.hpp:456
virtual void applyInverseJacobi(const mv_type &, mv_type &, SC dampingFactor, bool, int) const
Compute Y := (1 - a) Y + a D^{-1} (X - R*Y).
Definition: Ifpack2_Container_def.hpp:447
typename mv_type::dual_view_type::t_host HostView
Definition: Ifpack2_Container_decl.hpp:105
virtual void weightedApply(ConstHostView X, HostView Y, ConstHostView D, int blockIndex, Teuchos::ETransp mode=Teuchos::NO_TRANS, SC alpha=Teuchos::ScalarTraits< SC >::one(), SC beta=Teuchos::ScalarTraits< SC >::zero()) const
Compute Y := alpha * diag(D) * M^{-1} (diag(D) * X) + beta*Y.
Definition: Ifpack2_Container_def.hpp:648
void resize(size_type new_size, const value_type &x=value_type())
virtual ~ContainerImpl()
Destructor.
Definition: Ifpack2_Container_def.hpp:439
bool IsParallel_
Whether the problem is distributed across multiple MPI processes.
Definition: Ifpack2_Container_decl.hpp:267
static std::string getName()
Definition: Ifpack2_Container_def.hpp:475
TypeTo as(const TypeFrom &t)
Interface for creating and solving a set of local linear problems.
Definition: Ifpack2_Container_decl.hpp:79
Teuchos::ArrayView< const LO > getBlockRows(int blockIndex) const
Local indices of the rows of the input matrix that belong to this block.
Definition: Ifpack2_Container_def.hpp:84
Structure for read-only views of general matrix rows.
Definition: Ifpack2_Container_decl.hpp:294
static std::string getName()
Definition: Ifpack2_Container_def.hpp:146
GO NumGlobalRows_
Number of global rows in input matrix.
Definition: Ifpack2_Container_decl.hpp:271
GO NumGlobalNonzeros_
Number of nonzeros in input matrix.
Definition: Ifpack2_Container_decl.hpp:273
LO NumLocalRows_
Number of local rows in input matrix.
Definition: Ifpack2_Container_decl.hpp:269
Implementation detail of Ifpack2::Container subclasses.
Definition: Ifpack2_Details_MultiVectorLocalGatherScatter.hpp:52
virtual void solveBlock(ConstHostSubviewLocal X, HostSubviewLocal Y, int blockIndex, Teuchos::ETransp mode, const LSC alpha, const LSC beta) const
Definition: Ifpack2_Container_def.hpp:481
std::string toString(const T &t)
virtual void DoGSBlock(ConstHostView X, HostView Y, HostView Y2, HostView Resid, SC dampingFactor, LO i) const
Do one step of Gauss-Seidel on block i (used by DoGaussSeidel and DoSGS)
Definition: Ifpack2_Container_def.hpp:151
bool is_null() const