Ifpack2 Templated Preconditioning Package  Version 1.0
Ifpack2_BlockComputeResidualVector.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
11 #define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
12 
13 #include "Ifpack2_BlockHelper.hpp"
14 
15 namespace Ifpack2 {
16 
17 namespace BlockHelperDetails {
18 
22 template <typename MatrixType>
23 struct AmD {
24  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
25  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
26  using size_type_1d_view = typename impl_type::size_type_1d_view;
27  using i64_3d_view = typename impl_type::i64_3d_view;
28  using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
29  // rowptr points to the start of each row of A_colindsub.
30  size_type_1d_view rowptr, rowptr_remote;
31  // Indices into A's rows giving the blocks to extract. rowptr(i) points to
32  // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
33  // where g is A's graph, are the columns AmD uses. If seq_method_, then
34  // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
35  // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
36  // contains the remote ones.
37  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
38  // Precomputed direct offsets to A,x blocks, for owned entries (OverlapTag case) or all entries (AsyncTag case)
39  i64_3d_view A_x_offsets;
40  // Precomputed direct offsets to A,x blocks, for non-owned entries (OverlapTag case). For AsyncTag case this is left empty.
41  i64_3d_view A_x_offsets_remote;
42 
43  // Currently always true.
44  bool is_tpetra_block_crs;
45 
46  // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
47  impl_scalar_type_1d_view_tpetra tpetra_values;
48 
49  AmD() = default;
50  AmD(const AmD &b) = default;
51 };
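// Illustrative sketch (not part of this header): how the rowptr/A_colindsub
// indirection above is typically walked for one local row lr. The graph type and
// the visit callback are placeholders for this example; the residual kernels
// below fuse this walk directly into their loops.
template <typename MatrixType, typename LocalCrsGraphType, typename Visitor>
void visitOwnedBlocksExample(const AmD<MatrixType> &amd,
                             const LocalCrsGraphType &block_graph,
                             const typename ImplType<MatrixType>::local_ordinal_type lr,
                             const int blocksize,
                             Visitor visit) {
  using size_type = typename ImplType<MatrixType>::size_type;
  const size_type A_k0 = block_graph.row_map(lr);  // start of row lr in A's block graph
  for (size_type k = amd.rowptr(lr); k < amd.rowptr(lr + 1); ++k) {
    const size_type j = A_k0 + amd.A_colindsub(k);  // entry index into block_graph.entries
    // The block's column is entries(j); its values start at tpetra_values(j * blocksize^2).
    visit(block_graph.entries(j), &amd.tpetra_values(j * blocksize * blocksize));
  }
}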
52 
53 template <typename MatrixType>
54 struct PartInterface {
55  using local_ordinal_type = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
56  using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
57  using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;
58 
59  PartInterface() = default;
60  PartInterface(const PartInterface &b) = default;
61 
62  // Some terms:
63  // The matrix A is split as A = D + R, where D is the matrix of tridiag
64  // blocks and R is the remainder.
65  // A part is roughly a synonym for a tridiag. The distinction is that a part
66  // is the set of rows belonging to one tridiag and, equivalently, the off-diag
67  // rows in R associated with that tridiag. In contrast, the term tridiag is
68  // used to refer specifically to tridiag data, such as the pointer into the
69  // tridiag data array.
70  // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
71  // tridiag, and partptr points to the beginning of each tridiag. This is the
72  // LID space.
73  // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
74  // by this ordinal. This is the 'index' space.
75  // A flat index is the mathematical index into an array. A pack index
76  // accounts for SIMD packing.
77 
78  // Local row LIDs. Permutation from caller's index space to tridiag index
79  // space.
80  local_ordinal_type_1d_view lclrow;
81  // partptr_ is the pointer array into lclrow_.
82  local_ordinal_type_1d_view partptr; // np+1
83  local_ordinal_type_2d_view partptr_sub;
84  local_ordinal_type_1d_view partptr_schur;
85  // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
86  // is the start of the i'th pack.
87  local_ordinal_type_1d_view packptr; // npack+1
88  local_ordinal_type_1d_view packptr_sub;
89  local_ordinal_type_1d_view packindices_sub;
90  local_ordinal_type_2d_view packindices_schur;
91  // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
92  // an alias of partptr_ in the case of no overlap.
93  local_ordinal_type_1d_view part2rowidx0; // np+1
94  local_ordinal_type_1d_view part2rowidx0_sub;
95  // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
96  // it's the same as part2rowidx0_; if it's > 1, then the value is combined
97  // with i % vector_length to get the location in the packed data.
98  local_ordinal_type_1d_view part2packrowidx0; // np+1
99  local_ordinal_type_2d_view part2packrowidx0_sub;
100  local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
101  // rowidx2part_ maps the row index to the part index.
102  local_ordinal_type_1d_view rowidx2part; // nr
103  local_ordinal_type_1d_view rowidx2part_sub;
104  // True if lcl{row|col} is at most a constant away from row{idx|col}. In
105  // practice, this knowledge is not particularly useful, as packing for batched
106  // processing is done at the same time as the permutation from LID to index
107  // space. But it's easy to detect, so it's recorded in case an optimization
108  // can be made based on it.
109  bool row_contiguous;
110 
111  local_ordinal_type max_partsz;
112  local_ordinal_type max_subpartsz;
113  local_ordinal_type n_subparts_per_part;
114  local_ordinal_type nparts;
115 };
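// Illustrative sketch (not part of this header): the partptr/lclrow mapping
// described above. For a part p, its rows in the tridiag index space are
// partptr(p) .. partptr(p+1)-1, and lclrow translates each of those back to the
// caller's LID space. The callback use_row is a placeholder for this example.
template <typename MatrixType, typename RowFn>
void visitPartRowsExample(const PartInterface<MatrixType> &interf,
                          const typename PartInterface<MatrixType>::local_ordinal_type p,
                          RowFn use_row) {
  using lo_t = typename PartInterface<MatrixType>::local_ordinal_type;
  for (lo_t idx = interf.partptr(p); idx < interf.partptr(p + 1); ++idx) {
    const lo_t lid = interf.lclrow(idx);  // LID of tridiag row index idx
    use_row(idx, lid);
  }
}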
116 
117 // Precompute offsets of each A and x entry to speed up residual.
118 // (Applies for hasBlockCrsMatrix == true and OverlapTag/AsyncTag)
119 // Reading A and x takes up to 4 and 6 levels of indirection, respectively,
120 // but precomputing the offsets reduces this to 2 for both.
121 //
122 // This function allocates and populates these members of AmD:
123 // A_x_offsets, A_x_offsets_remote
124 template <typename MatrixType>
125 void precompute_A_x_offsets(
126  AmD<MatrixType> &amd,
127  const PartInterface<MatrixType> &interf,
128  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_crs_graph_type> &g,
129  const typename ImplType<MatrixType>::local_ordinal_type_1d_view &dm2cm,
130  int blocksize,
131  bool ownedRemoteSeparate) {
132  using impl_type = ImplType<MatrixType>;
133  using i64_3d_view = typename impl_type::i64_3d_view;
134  using size_type = typename impl_type::size_type;
135  using local_ordinal_type = typename impl_type::local_ordinal_type;
136  using execution_space = typename impl_type::execution_space;
137  auto local_graph = g->getLocalGraphDevice();
138  const auto A_block_rowptr = local_graph.row_map;
139  const auto A_colind = local_graph.entries;
140  local_ordinal_type numLocalRows = interf.rowidx2part.extent(0);
141  int blocksize_square = blocksize * blocksize;
142  // Shallow-copy views to avoid capturing the amd and interf objects in lambdas.
143  auto lclrow = interf.lclrow;
144  auto A_colindsub = amd.A_colindsub;
145  auto A_colindsub_remote = amd.A_colindsub_remote;
146  auto rowptr = amd.rowptr;
147  auto rowptr_remote = amd.rowptr_remote;
148  bool is_dm2cm_active = dm2cm.extent(0);
149  if (ownedRemoteSeparate) {
150  // amd.rowptr points to owned entries only, and amd.rowptr_remote points to nonowned.
151  local_ordinal_type maxOwnedEntriesPerRow = 0;
152  local_ordinal_type maxNonownedEntriesPerRow = 0;
153  Kokkos::parallel_reduce(
154  Kokkos::RangePolicy<execution_space>(0, numLocalRows),
155  KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmaxOwned, local_ordinal_type & lmaxNonowned) {
156  const local_ordinal_type lr = lclrow(i);
157  local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowptr(lr);
158  if (rowNumOwned > lmaxOwned)
159  lmaxOwned = rowNumOwned;
160  // rowptr_remote won't be allocated for single-rank problems
161  if (rowptr_remote.extent(0)) {
162  local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowptr_remote(lr);
163  if (rowNumNonowned > lmaxNonowned)
164  lmaxNonowned = rowNumNonowned;
165  } else {
166  lmaxNonowned = 0;
167  }
168  },
169  Kokkos::Max<local_ordinal_type>(maxOwnedEntriesPerRow), Kokkos::Max<local_ordinal_type>(maxNonownedEntriesPerRow));
170  // Allocate the two offsets views now that we know the dimensions
171  // For each one, the middle dimension is 0 for A offsets and 1 for x offsets.
172  // Packing them together in one view improves cache line utilization
173  amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxOwnedEntriesPerRow);
174  amd.A_x_offsets_remote = i64_3d_view("amd.A_x_offsets_remote", numLocalRows, 2, maxNonownedEntriesPerRow);
175  auto A_x_offsets = amd.A_x_offsets;
176  auto A_x_offsets_remote = amd.A_x_offsets_remote;
177  // Now, populate all the offsets. Use ArithTraits<int64_t>::min to mark absent entries.
178  Kokkos::parallel_for(
179  Kokkos::RangePolicy<execution_space>(0, numLocalRows),
180  KOKKOS_LAMBDA(local_ordinal_type i) {
181  const local_ordinal_type lr = lclrow(i);
182  const size_type A_k0 = A_block_rowptr(lr);
183  // Owned entries
184  size_type rowBegin = rowptr(lr);
185  local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowBegin;
186  for (local_ordinal_type entry = 0; entry < maxOwnedEntriesPerRow; entry++) {
187  if (entry < rowNumOwned) {
188  const size_type j = A_k0 + A_colindsub(rowBegin + entry);
189  const local_ordinal_type A_colind_at_j = A_colind(j);
190  const local_ordinal_type loc = is_dm2cm_active ? dm2cm(A_colind_at_j) : A_colind_at_j;
191  A_x_offsets(i, 0, entry) = int64_t(j) * blocksize_square;
192  A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
193  } else {
194  A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
195  A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
196  }
197  }
198  // Nonowned entries
199  if (rowptr_remote.extent(0)) {
200  rowBegin = rowptr_remote(lr);
201  local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowBegin;
202  for (local_ordinal_type entry = 0; entry < maxNonownedEntriesPerRow; entry++) {
203  if (entry < rowNumNonowned) {
204  const size_type j = A_k0 + A_colindsub_remote(rowBegin + entry);
205  const local_ordinal_type A_colind_at_j = A_colind(j);
206  const local_ordinal_type loc = A_colind_at_j - numLocalRows;
207  A_x_offsets_remote(i, 0, entry) = int64_t(j) * blocksize_square;
208  A_x_offsets_remote(i, 1, entry) = int64_t(loc) * blocksize;
209  } else {
210  A_x_offsets_remote(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
211  A_x_offsets_remote(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
212  }
213  }
214  }
215  });
216  } else {
217  // amd.rowptr points to both owned and nonowned entries, so it tells us how many columns (last dim) A_x_offsets should have.
218  local_ordinal_type maxEntriesPerRow = 0;
219  Kokkos::parallel_reduce(
220  Kokkos::RangePolicy<execution_space>(0, numLocalRows),
221  KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmax) {
222  const local_ordinal_type lr = lclrow(i);
223  local_ordinal_type rowNum = rowptr(lr + 1) - rowptr(lr);
224  if (rowNum > lmax)
225  lmax = rowNum;
226  },
227  Kokkos::Max<local_ordinal_type>(maxEntriesPerRow));
228  amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxEntriesPerRow);
229  auto A_x_offsets = amd.A_x_offsets;
230  // Populate A,x offsets. Use ArithTraits<int64_t>::min to mark absent entries.
231  // For x offsets, add a shift blocksize*numLocalRows to represent that it indexes into x_remote instead of x.
232  Kokkos::parallel_for(
233  Kokkos::RangePolicy<execution_space>(0, numLocalRows),
234  KOKKOS_LAMBDA(local_ordinal_type i) {
235  const local_ordinal_type lr = lclrow(i);
236  const size_type A_k0 = A_block_rowptr(lr);
237  // Owned entries
238  size_type rowBegin = rowptr(lr);
239  local_ordinal_type rowOwned = rowptr(lr + 1) - rowBegin;
240  for (local_ordinal_type entry = 0; entry < maxEntriesPerRow; entry++) {
241  if (entry < rowOwned) {
242  const size_type j = A_k0 + A_colindsub(rowBegin + entry);
243  A_x_offsets(i, 0, entry) = j * blocksize_square;
244  const local_ordinal_type A_colind_at_j = A_colind(j);
245  if (A_colind_at_j < numLocalRows) {
246  const local_ordinal_type loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
247  A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
248  } else {
249  A_x_offsets(i, 1, entry) = int64_t(A_colind_at_j) * blocksize;
250  }
251  } else {
252  A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
253  A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
254  }
255  }
256  });
257  }
258 }
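// Illustrative sketch (not part of this header): how the precomputed offsets are
// consumed. Instead of chasing rowptr -> A_colindsub -> row_map/entries (and
// dm2cm) per entry, a kernel reads two int64 offsets and indexes A's values and x
// directly. This sketch covers the owned offsets (A_x_offsets); the combined
// AsyncTag case additionally compares the x offset against blocksize*numLocalRows
// to decide between x and x_remote, as the GPU kernel below does. The x view type
// and the accumulate callback are placeholders for this example.
template <typename MatrixType, typename XViewType, typename Accumulator>
void applyPrecomputedOffsetsExample(const AmD<MatrixType> &amd,
                                    const XViewType &x,
                                    const typename ImplType<MatrixType>::local_ordinal_type rowidx,
                                    const typename ImplType<MatrixType>::local_ordinal_type col,
                                    Accumulator accumulate) {
  const int num_entries = static_cast<int>(amd.A_x_offsets.extent(2));
  for (int entry = 0; entry < num_entries; ++entry) {
    const int64_t A_off = amd.A_x_offsets(rowidx, 0, entry);
    const int64_t x_off = amd.A_x_offsets(rowidx, 1, entry);
    if (A_off == Kokkos::ArithTraits<int64_t>::min()) continue;  // min() marks an absent entry
    // A_off points at the first value of a blocksize x blocksize block of A;
    // x_off points at the first entry of the matching block of x.
    accumulate(amd.tpetra_values.data() + A_off, &x(x_off, col));
  }
}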
259 
263 static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
264  const int team_size) {
265  int total_team_size(0);
266  if (blksize <= 5)
267  total_team_size = 32;
268  else if (blksize <= 9)
269  total_team_size = 32; // 64
270  else if (blksize <= 12)
271  total_team_size = 96;
272  else if (blksize <= 16)
273  total_team_size = 128;
274  else if (blksize <= 20)
275  total_team_size = 160;
276  else
277  total_team_size = 160;
278  return total_team_size / team_size;
279 }
280 
281 static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
282  const int team_size) {
283  int total_team_size(0);
284  if (blksize <= 5)
285  total_team_size = 32;
286  else if (blksize <= 9)
287  total_team_size = 32; // 64
288  else if (blksize <= 12)
289  total_team_size = 96;
290  else if (blksize <= 16)
291  total_team_size = 128;
292  else if (blksize <= 20)
293  total_team_size = 160;
294  else
295  total_team_size = 160;
296  return total_team_size / team_size;
297 }
298 
299 static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
300  const int team_size) {
301  int total_team_size(0);
302  if (blksize <= 5)
303  total_team_size = 32;
304  else if (blksize <= 9)
305  total_team_size = 32; // 64
306  else if (blksize <= 12)
307  total_team_size = 96;
308  else if (blksize <= 16)
309  total_team_size = 128;
310  else if (blksize <= 20)
311  total_team_size = 160;
312  else
313  total_team_size = 160;
314  return total_team_size / team_size;
315 }
316 
317 template <typename T>
318 static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
319  const int team_size) {
320  if (is_cuda<T>::value)
321  return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
322  if (is_hip<T>::value)
323  return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
324  if (is_sycl<T>::value)
325  return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
326  return -1;
327 }
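// Illustrative usage (not part of this header): the recommended vector size is
// paired with a fixed team size when building a team policy, as the run() methods
// below do. A minimal sketch, assuming blocksize, num_rows, and an execution_space
// typedef are in scope:
//
//   const int team_size   = 8;
//   const int vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
//   Kokkos::TeamPolicy<execution_space> policy(num_rows, team_size, vector_size);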
328 
329 template <typename MatrixType>
330 struct ComputeResidualVector {
331  public:
332  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
333  using node_device_type = typename impl_type::node_device_type;
334  using execution_space = typename impl_type::execution_space;
335  using memory_space = typename impl_type::memory_space;
336 
337  using local_ordinal_type = typename impl_type::local_ordinal_type;
338  using size_type = typename impl_type::size_type;
339  using impl_scalar_type = typename impl_type::impl_scalar_type;
340  using magnitude_type = typename impl_type::magnitude_type;
341  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
342  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
344  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
345  using size_type_1d_view = typename impl_type::size_type_1d_view;
346  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
347  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
348  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
349  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
350  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
351  using i64_3d_view = typename impl_type::i64_3d_view;
352  static constexpr int vector_length = impl_type::vector_length;
353 
355  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
356 
357  // enum for max blocksize
358  enum : int { max_blocksize = 32 };
359 
360  private:
361  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
362  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
363  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
364  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
365  Unmanaged<vector_type_3d_view> y_packed;
366  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
367 
368  // AmD information
369  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
370  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
371  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
372 
373  // block crs graph information
374  // for cuda (the Kokkos crs graph uses a size_type different from size_t)
375  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_block_rowptr;
376  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_point_rowptr;
377  const ConstUnmanaged<Kokkos::View<local_ordinal_type *, node_device_type>> A_colind;
378 
379  // blocksize
380  const local_ordinal_type blocksize_requested;
381 
382  // part interface
383  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
384  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
385  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
386  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
387  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
388  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
389 
390  // block offsets
391  const ConstUnmanaged<i64_3d_view> A_x_offsets;
392  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
393 
394  const bool is_dm2cm_active;
395  const bool hasBlockCrsMatrix;
396 
397  public:
398  template <typename LocalCrsGraphType>
399  ComputeResidualVector(const AmD<MatrixType> &amd,
400  const LocalCrsGraphType &block_graph,
401  const LocalCrsGraphType &point_graph,
402  const local_ordinal_type &blocksize_requested_,
403  const PartInterface<MatrixType> &interf,
404  const local_ordinal_type_1d_view &dm2cm_,
405  bool hasBlockCrsMatrix_)
406  : rowptr(amd.rowptr)
407  , rowptr_remote(amd.rowptr_remote)
408  , colindsub(amd.A_colindsub)
409  , colindsub_remote(amd.A_colindsub_remote)
410  , tpetra_values(amd.tpetra_values)
411  , A_block_rowptr(block_graph.row_map)
412  , A_point_rowptr(point_graph.row_map)
413  , A_colind(block_graph.entries)
414  , blocksize_requested(blocksize_requested_)
415  , part2packrowidx0(interf.part2packrowidx0)
416  , part2rowidx0(interf.part2rowidx0)
417  , rowidx2part(interf.rowidx2part)
418  , partptr(interf.partptr)
419  , lclrow(interf.lclrow)
420  , dm2cm(dm2cm_)
421  , A_x_offsets(amd.A_x_offsets)
422  , A_x_offsets_remote(amd.A_x_offsets_remote)
423  , is_dm2cm_active(dm2cm_.span() > 0)
424  , hasBlockCrsMatrix(hasBlockCrsMatrix_) {}
425 
426  inline void
427  SerialDot(const local_ordinal_type &blocksize,
428  const local_ordinal_type &lclRowID,
429  const local_ordinal_type &lclColID,
430  const local_ordinal_type &ii,
431  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
432  const impl_scalar_type *const KOKKOS_RESTRICT xx,
433  /* */ impl_scalar_type *KOKKOS_RESTRICT yy) const {
434  const size_type Aj_c = colindsub_(lclColID);
435  auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
436  impl_scalar_type val = 0;
437 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
438 #pragma ivdep
439 #endif
440 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
441 #pragma unroll
442 #endif
443  for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
444  val += tpetra_values(point_row_offset + k1) * xx[k1];
445  yy[ii] -= val;
446  }
447 
448  inline void
449  SerialGemv(const local_ordinal_type &blocksize,
450  const impl_scalar_type *const KOKKOS_RESTRICT AA,
451  const impl_scalar_type *const KOKKOS_RESTRICT xx,
452  /* */ impl_scalar_type *KOKKOS_RESTRICT yy) const {
453  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
454  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
455  impl_scalar_type val = 0;
456 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
457 #pragma ivdep
458 #endif
459 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
460 #pragma unroll
461 #endif
462  for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
463  val += AA[tlb::getFlatIndex(k0, k1, blocksize)] * xx[k1];
464  yy[k0] -= val;
465  }
466  }
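 // Illustrative note (not part of this header): both helpers subtract one block
 // contribution from the local accumulator, i.e. yy -= A_block * xx. For example,
 // with blocksize = 2, SerialGemv expands to
 //   yy[0] -= AA[flat(0,0)] * xx[0] + AA[flat(0,1)] * xx[1];
 //   yy[1] -= AA[flat(1,0)] * xx[0] + AA[flat(1,1)] * xx[1];
 // where flat() stands for tlb::getFlatIndex over the BlockCrsMatrix little-block
 // layout, while SerialDot covers the point-CRS case one block row (ii) at a time.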
467 
468  template <typename bbViewType, typename yyViewType>
469  KOKKOS_INLINE_FUNCTION void
470  VectorCopy(const member_type &member,
471  const local_ordinal_type &blocksize,
472  const bbViewType &bb,
473  const yyViewType &yy) const {
474  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
475  yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
476  });
477  }
478 
479  template <typename xxViewType, typename yyViewType>
480  KOKKOS_INLINE_FUNCTION void
481  VectorDot(const member_type &member,
482  const local_ordinal_type &blocksize,
483  const local_ordinal_type &lclRowID,
484  const local_ordinal_type &lclColID,
485  const local_ordinal_type &ii,
486  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
487  const xxViewType &xx,
488  const yyViewType &yy) const {
489  const size_type Aj_c = colindsub_(lclColID);
490  auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
491  impl_scalar_type val = 0;
492  Kokkos::parallel_reduce(
493  Kokkos::ThreadVectorRange(member, blocksize),
494  [&](const local_ordinal_type &k1, impl_scalar_type &update) {
495  update += tpetra_values(point_row_offset + k1) * xx(k1);
496  },
497  val);
498  Kokkos::single(Kokkos::PerThread(member),
499  [=]() {
500  Kokkos::atomic_add(&yy(ii), typename yyViewType::const_value_type(-val));
501  });
502  }
503 
504  // BMK: This version coalesces accesses to AA for LayoutRight blocks.
505  template <typename AAViewType, typename xxViewType, typename yyViewType>
506  KOKKOS_INLINE_FUNCTION void
507  VectorGemv(const member_type &member,
508  const local_ordinal_type &blocksize,
509  const AAViewType &AA,
510  const xxViewType &xx,
511  const yyViewType &yy) const {
512  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
513  impl_scalar_type val = 0;
514  Kokkos::parallel_reduce(
515  Kokkos::ThreadVectorRange(member, blocksize),
516  [&](const local_ordinal_type &k1, impl_scalar_type &update) {
517  update += AA(k0, k1) * xx(k1);
518  },
519  val);
520  Kokkos::single(Kokkos::PerThread(member),
521  [=]() {
522  Kokkos::atomic_add(&yy(k0), -val);
523  });
524  }
525  }
526 
527  struct SeqTag {};
528 
529  KOKKOS_INLINE_FUNCTION
530  void
531  operator()(const SeqTag &, const local_ordinal_type &i) const {
532  const local_ordinal_type blocksize = blocksize_requested;
533  const local_ordinal_type blocksize_square = blocksize * blocksize;
534 
535  // constants
536  const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
537  const local_ordinal_type num_vectors = y.extent(1);
538  const local_ordinal_type row = i * blocksize;
539  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
540  // y := b
541  impl_scalar_type *yy = &y(row, col);
542  const impl_scalar_type *const bb = &b(row, col);
543  memcpy(yy, bb, sizeof(impl_scalar_type) * blocksize);
544 
545  // y -= Rx
546  const size_type A_k0 = A_block_rowptr[i];
547  for (size_type k = rowptr[i]; k < rowptr[i + 1]; ++k) {
548  const size_type j = A_k0 + colindsub[k];
549  const impl_scalar_type *const xx = &x(A_colind[j] * blocksize, col);
550  if (hasBlockCrsMatrix) {
551  const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
552  SerialGemv(blocksize, AA, xx, yy);
553  } else {
554  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
555  SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
556  }
557  }
558  }
559  }
560 
561  KOKKOS_INLINE_FUNCTION
562  void
563  operator()(const SeqTag &, const member_type &member) const {
564  // constants
565  const local_ordinal_type blocksize = blocksize_requested;
566  const local_ordinal_type blocksize_square = blocksize * blocksize;
567 
568  const local_ordinal_type lr = member.league_rank();
569  const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
570  const local_ordinal_type num_vectors = y.extent(1);
571 
572  // subview pattern
573  auto bb = Kokkos::subview(b, block_range, 0);
574  auto xx = bb;
575  auto yy = Kokkos::subview(y, block_range, 0);
576  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
577 
578  const local_ordinal_type row = lr * blocksize;
579  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
580  // y := b
581  yy.assign_data(&y(row, col));
582  bb.assign_data(&b(row, col));
583  if (member.team_rank() == 0)
584  VectorCopy(member, blocksize, bb, yy);
585  member.team_barrier();
586 
587  // y -= Rx
588  const size_type A_k0 = A_block_rowptr[lr];
589 
590  if (hasBlockCrsMatrix) {
591  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
592  [&](const local_ordinal_type &k) {
593  const size_type j = A_k0 + colindsub[k];
594  xx.assign_data(&x(A_colind[j] * blocksize, col));
595  A_block_cst.assign_data(&tpetra_values(j * blocksize_square));
596  VectorGemv(member, blocksize, A_block_cst, xx, yy);
597  });
598  } else {
599  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
600  [&](const local_ordinal_type &k) {
601  const size_type j = A_k0 + colindsub[k];
602  xx.assign_data(&x(A_colind[j] * blocksize, col));
603 
604  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
605  VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
606  });
607  }
608  }
609  }
610 
611  // * B: block size for compile-time specialization, or 0 for general case (up to max_blocksize)
612  // * async: true if using async importer. overlap is not used in this case.
613  // Whether a column is owned or nonowned is decided at runtime.
614  // * overlap: true if processing the columns that are not locally owned,
615  // false if processing locally owned columns.
616  // * haveBlockMatrix: true if A is a BlockCrsMatrix, false if it's CrsMatrix.
617  template <int B, bool async, bool overlap, bool haveBlockMatrix>
618  struct GeneralTag {
619  static_assert(!(async && overlap),
620  "ComputeResidualVector: async && overlap is not a valid configuration for GeneralTag");
621  };
622 
623  // Define AsyncTag and OverlapTag in terms of GeneralTag:
624  // P == 0 means only compute on owned columns
625  // P == 1 means only compute on nonowned columns
626  template <int P, int B, bool haveBlockMatrix>
627  using OverlapTag = GeneralTag<B, false, P != 0, haveBlockMatrix>;
628 
629  template <int B, bool haveBlockMatrix>
630  using AsyncTag = GeneralTag<B, true, false, haveBlockMatrix>;
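 // Illustrative note (not part of this header): these tags only select an
 // operator() overload at compile time through the Kokkos policy's work tag.
 // For example, AsyncTag<3, true> requests the async-import BlockCrsMatrix path
 // specialized for blocksize 3, and OverlapTag<1, 0, false> requests the
 // "nonowned columns only" point-CRS path with a runtime blocksize:
 //   Kokkos::TeamPolicy<execution_space, AsyncTag<3, true>> policy(nrows, team_size, vector_size);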
631 
632  // CPU implementation for all cases
633  template <int B, bool async, bool overlap, bool haveBlockMatrix>
634  KOKKOS_INLINE_FUNCTION void
635  operator()(const GeneralTag<B, async, overlap, haveBlockMatrix> &, const local_ordinal_type &rowidx) const {
636  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
637 
638  // constants
639  const local_ordinal_type partidx = rowidx2part(rowidx);
640  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
641  const local_ordinal_type v = partidx % vector_length;
642 
643  const local_ordinal_type num_vectors = y_packed.extent(2);
644  const local_ordinal_type num_local_rows = lclrow.extent(0);
645 
646  // temporary buffer for y flat
647  impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
648 
649  const local_ordinal_type lr = lclrow(rowidx);
650 
651  auto colindsub_used = overlap ? colindsub_remote : colindsub;
652  auto rowptr_used = overlap ? rowptr_remote : rowptr;
653 
654  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
655  if constexpr (overlap) {
656  // y (temporary) := 0
657  memset((void *)yy, 0, sizeof(impl_scalar_type) * blocksize);
658  } else {
659  // y := b
660  const local_ordinal_type row = lr * blocksize;
661  memcpy(yy, &b(row, col), sizeof(impl_scalar_type) * blocksize);
662  }
663 
664  // y -= Rx
665  const size_type A_k0 = A_block_rowptr[lr];
666  for (size_type k = rowptr_used[lr]; k < rowptr_used[lr + 1]; ++k) {
667  const size_type j = A_k0 + colindsub_used[k];
668  const local_ordinal_type A_colind_at_j = A_colind[j];
669  if constexpr (haveBlockMatrix) {
670  const local_ordinal_type blocksize_square = blocksize * blocksize;
671  const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
672  if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
673  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
674  const impl_scalar_type *const xx = &x(loc * blocksize, col);
675  SerialGemv(blocksize, AA, xx, yy);
676  } else {
677  const auto loc = A_colind_at_j - num_local_rows;
678  const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
679  SerialGemv(blocksize, AA, xx_remote, yy);
680  }
681  } else {
682  if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
683  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
684  const impl_scalar_type *const xx = &x(loc * blocksize, col);
685  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
686  SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
687  } else {
688  const auto loc = A_colind_at_j - num_local_rows;
689  const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
690  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
691  SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
692  }
693  }
694  }
695  // move yy to y_packed
696  if constexpr (overlap) {
697  for (local_ordinal_type k = 0; k < blocksize; ++k)
698  y_packed(pri, k, col)[v] += yy[k];
699  } else {
700  for (local_ordinal_type k = 0; k < blocksize; ++k)
701  y_packed(pri, k, col)[v] = yy[k];
702  }
703  }
704  }
705 
706  // GPU implementation for hasBlockCrsMatrix == true
707  template <int B, bool async, bool overlap>
708  KOKKOS_INLINE_FUNCTION void
709  operator()(const GeneralTag<B, async, overlap, true> &, const member_type &member) const {
710  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
711 
712  // constants
713  const local_ordinal_type rowidx = member.league_rank();
714  const local_ordinal_type partidx = rowidx2part(rowidx);
715  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
716  const local_ordinal_type v = partidx % vector_length;
717 
718  const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
719  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
720  const local_ordinal_type num_local_rows = lclrow.extent(0);
721 
722  // subview pattern
723  using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
724  using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
725  subview_1D_right_t bb(nullptr, blocksize);
726  subview_1D_right_t xx(nullptr, blocksize);
727  subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
728  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
729 
730  // Get scratch allocations for a local copy of x (per thread) and the Ax accumulator (per team)
731  impl_scalar_type *local_Ax = reinterpret_cast<impl_scalar_type *>(member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
732  impl_scalar_type *local_x = reinterpret_cast<impl_scalar_type *>(member.thread_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
733 
734  const local_ordinal_type lr = lclrow(rowidx);
735  const local_ordinal_type row = lr * blocksize;
736  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
737  if (col)
738  member.team_barrier();
739  // y -= Rx
740  // Initialize accumulation array
741  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
742  local_Ax[i] = 0;
743  });
744  member.team_barrier();
745 
746  int numEntries;
747  if constexpr (!overlap) {
748  numEntries = A_x_offsets.extent(2);
749  } else {
750  numEntries = A_x_offsets_remote.extent(2);
751  }
752 
753  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numEntries),
754  [&](const int k) {
755  int64_t A_offset = overlap ? A_x_offsets_remote(rowidx, 0, k) : A_x_offsets(rowidx, 0, k);
756  int64_t x_offset = overlap ? A_x_offsets_remote(rowidx, 1, k) : A_x_offsets(rowidx, 1, k);
757  if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
758  A_block_cst.assign_data(tpetra_values.data() + A_offset);
759  // Pull x into local memory
760  if constexpr (async) {
761  size_type remote_cutoff = blocksize * num_local_rows;
762  if (x_offset >= remote_cutoff)
763  xx.assign_data(&x_remote(x_offset - remote_cutoff, col));
764  else
765  xx.assign_data(&x(x_offset, col));
766  } else {
767  if constexpr (!overlap) {
768  xx.assign_data(&x(x_offset, col));
769  } else {
770  xx.assign_data(&x_remote(x_offset, col));
771  }
772  }
773 
774  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
775  local_x[i] = xx(i);
776  });
777 
778  // MatVec op Ax += A*x
779  Kokkos::parallel_for(
780  Kokkos::ThreadVectorRange(member, blocksize),
781  [&](const local_ordinal_type &k0) {
782  impl_scalar_type val = 0;
783  for (int k1 = 0; k1 < blocksize; k1++)
784  val += A_block_cst(k0, k1) * local_x[k1];
785  Kokkos::atomic_add(local_Ax + k0, val);
786  });
787  }
788  });
789  member.team_barrier();
790  // Update y = b - local_Ax
791  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
792  bb.assign_data(&b(row, col));
793  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
794  if (!overlap)
795  yy(i) = bb(i) - local_Ax[i];
796  else
797  yy(i) -= local_Ax[i];
798  });
799  }
800  }
801 
802  // GPU implementation for hasBlockCrsMatrix == false
803  template <int B, bool async, bool overlap>
804  KOKKOS_INLINE_FUNCTION void
805  operator()(const GeneralTag<B, async, overlap, false> &, const member_type &member) const {
806  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
807 
808  // constants
809  const local_ordinal_type rowidx = member.league_rank();
810  const local_ordinal_type partidx = rowidx2part(rowidx);
811  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
812  const local_ordinal_type v = partidx % vector_length;
813 
814  const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
815  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
816  const local_ordinal_type num_local_rows = lclrow.extent(0);
817 
818  // subview pattern
819  using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
820  using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
821  subview_1D_right_t bb(nullptr, blocksize);
822  subview_1D_right_t xx(nullptr, blocksize);
823  subview_1D_right_t xx_remote(nullptr, blocksize);
824  subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
825  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
826  auto colindsub_used = overlap ? colindsub_remote : colindsub;
827  auto rowptr_used = overlap ? rowptr_remote : rowptr;
828 
829  const local_ordinal_type lr = lclrow(rowidx);
830  const local_ordinal_type row = lr * blocksize;
831  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
832  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
833  if (!overlap) {
834  // y := b
835  bb.assign_data(&b(row, col));
836  if (member.team_rank() == 0)
837  VectorCopy(member, blocksize, bb, yy);
838  member.team_barrier();
839  }
840 
841  // y -= Rx
842  const size_type A_k0 = A_block_rowptr[lr];
843  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr + 1]),
844  [&](const local_ordinal_type &k) {
845  const size_type j = A_k0 + colindsub_used[k];
846  const local_ordinal_type A_colind_at_j = A_colind[j];
847  if ((async && A_colind_at_j < num_local_rows) || (!async && !overlap)) {
848  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
849  xx.assign_data(&x(loc * blocksize, col));
850  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
851  VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
852  } else {
853  const auto loc = A_colind_at_j - num_local_rows;
854  xx_remote.assign_data(&x_remote(loc * blocksize, col));
855  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
856  VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
857  }
858  });
859  }
860  }
861 
862  // y = b - Rx; seq method
863  template <typename MultiVectorLocalViewTypeY,
864  typename MultiVectorLocalViewTypeB,
865  typename MultiVectorLocalViewTypeX>
866  void run(const MultiVectorLocalViewTypeY &y_,
867  const MultiVectorLocalViewTypeB &b_,
868  const MultiVectorLocalViewTypeX &x_) {
869  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
870  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);
871 
872  y = y_;
873  b = b_;
874  x = x_;
875  if constexpr (is_device<execution_space>::value) {
876  const local_ordinal_type blocksize = blocksize_requested;
877  const local_ordinal_type team_size = 8;
878  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
879  const Kokkos::TeamPolicy<execution_space, SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
880  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
881  } else {
882  const Kokkos::RangePolicy<execution_space, SeqTag> policy(0, rowptr.extent(0) - 1);
883  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
884  }
885  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
886  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
887  }
888 
889  // y = b - R (x , x_remote)
890  template <typename MultiVectorLocalViewTypeB,
891  typename MultiVectorLocalViewTypeX,
892  typename MultiVectorLocalViewTypeX_Remote>
893  void run(const vector_type_3d_view &y_packed_,
894  const MultiVectorLocalViewTypeB &b_,
895  const MultiVectorLocalViewTypeX &x_,
896  const MultiVectorLocalViewTypeX_Remote &x_remote_) {
897  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
898  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);
899 
900  b = b_;
901  x = x_;
902  x_remote = x_remote_;
903  if constexpr (is_device<execution_space>::value) {
904  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
905  y_packed_.extent(0),
906  y_packed_.extent(1),
907  y_packed_.extent(2),
908  vector_length);
909  } else {
910  y_packed = y_packed_;
911  }
912 
913  if constexpr (is_device<execution_space>::value) {
914  const local_ordinal_type blocksize = blocksize_requested;
915  // local_ordinal_type vl_power_of_two = 1;
916  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
917  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
918  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
919 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
920  { \
921  if (this->hasBlockCrsMatrix) { \
922  const local_ordinal_type team_size = 8; \
923  const local_ordinal_type vector_size = 8; \
924  const size_t shmem_team_size = blocksize * sizeof(btdm_scalar_type); \
925  const size_t shmem_thread_size = blocksize * sizeof(btdm_scalar_type); \
926  Kokkos::TeamPolicy<execution_space, AsyncTag<B, true>> \
927  policy(rowidx2part.extent(0), team_size, vector_size); \
928  policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
929  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>", \
930  policy, *this); \
931  } else { \
932  const local_ordinal_type team_size = 8; \
933  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
934  const Kokkos::TeamPolicy<execution_space, AsyncTag<B, false>> \
935  policy(rowidx2part.extent(0), team_size, vector_size); \
936  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>", \
937  policy, *this); \
938  } \
939  } \
940  break
941  switch (blocksize_requested) {
942  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
943  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
944  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
945  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
946  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
947  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
948  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
949  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
950  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
951  default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
952  }
953 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
954  } else {
955 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
956  { \
957  if (this->hasBlockCrsMatrix) { \
958  const Kokkos::RangePolicy<execution_space, AsyncTag<B, true>> policy(0, rowidx2part.extent(0)); \
959  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>", \
960  policy, *this); \
961  } else { \
962  const Kokkos::RangePolicy<execution_space, AsyncTag<B, false>> policy(0, rowidx2part.extent(0)); \
963  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>", \
964  policy, *this); \
965  } \
966  } \
967  break
968 
969  switch (blocksize_requested) {
970  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
971  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
972  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
973  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
974  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
975  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
976  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
977  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
978  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
979  default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
980  }
981 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
982  }
983  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
984  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
985  }
986 
987  // y = b - R (y , y_remote)
988  template <typename MultiVectorLocalViewTypeB,
989  typename MultiVectorLocalViewTypeX,
990  typename MultiVectorLocalViewTypeX_Remote>
991  void run(const vector_type_3d_view &y_packed_,
992  const MultiVectorLocalViewTypeB &b_,
993  const MultiVectorLocalViewTypeX &x_,
994  const MultiVectorLocalViewTypeX_Remote &x_remote_,
995  const bool compute_owned) {
996  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
997  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);
998 
999  b = b_;
1000  x = x_;
1001  x_remote = x_remote_;
1002  if constexpr (is_device<execution_space>::value) {
1003  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
1004  y_packed_.extent(0),
1005  y_packed_.extent(1),
1006  y_packed_.extent(2),
1007  vector_length);
1008  } else {
1009  y_packed = y_packed_;
1010  }
1011 
1012  if constexpr (is_device<execution_space>::value) {
1013  const local_ordinal_type blocksize = blocksize_requested;
1014  // local_ordinal_type vl_power_of_two = 1;
1015  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
1016  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
1017  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
1018 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
1019  if (this->hasBlockCrsMatrix) { \
1020  const local_ordinal_type team_size = 8; \
1021  const local_ordinal_type vector_size = 8; \
1022  const size_t shmem_team_size = blocksize * sizeof(btdm_scalar_type); \
1023  const size_t shmem_thread_size = blocksize * sizeof(btdm_scalar_type); \
1024  if (compute_owned) { \
1025  Kokkos::TeamPolicy<execution_space, OverlapTag<0, B, true>> \
1026  policy(rowidx2part.extent(0), team_size, vector_size); \
1027  policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
1028  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
1029  } else { \
1030  Kokkos::TeamPolicy<execution_space, OverlapTag<1, B, true>> \
1031  policy(rowidx2part.extent(0), team_size, vector_size); \
1032  policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
1033  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
1034  } \
1035  } else { \
1036  const local_ordinal_type team_size = 8; \
1037  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
1038  if (compute_owned) { \
1039  const Kokkos::TeamPolicy<execution_space, OverlapTag<0, B, false>> \
1040  policy(rowidx2part.extent(0), team_size, vector_size); \
1041  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
1042  } else { \
1043  const Kokkos::TeamPolicy<execution_space, OverlapTag<1, B, false>> \
1044  policy(rowidx2part.extent(0), team_size, vector_size); \
1045  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
1046  } \
1047  } \
1048  break
1049  switch (blocksize_requested) {
1050  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
1051  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
1052  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
1053  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
1054  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
1055  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
1056  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
1057  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
1058  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
1059  default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
1060  }
1061 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
1062  } else {
1063 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
1064  if (this->hasBlockCrsMatrix) { \
1065  if (compute_owned) { \
1066  const Kokkos::RangePolicy<execution_space, OverlapTag<0, B, true>> \
1067  policy(0, rowidx2part.extent(0)); \
1068  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
1069  } else { \
1070  const Kokkos::RangePolicy<execution_space, OverlapTag<1, B, true>> \
1071  policy(0, rowidx2part.extent(0)); \
1072  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
1073  } \
1074  } else { \
1075  if (compute_owned) { \
1076  const Kokkos::RangePolicy<execution_space, OverlapTag<0, B, false>> \
1077  policy(0, rowidx2part.extent(0)); \
1078  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
1079  } else { \
1080  const Kokkos::RangePolicy<execution_space, OverlapTag<1, B, false>> \
1081  policy(0, rowidx2part.extent(0)); \
1082  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
1083  } \
1084  } \
1085  break
1086 
1087  switch (blocksize_requested) {
1088  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
1089  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
1090  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
1091  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
1092  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
1093  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
1094  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
1095  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
1096  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
1097  default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
1098  }
1099 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
1100  }
1101  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
1102  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
1103  }
1104 };
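// Illustrative usage (not part of this header): a minimal sketch of how this
// functor is driven by the block tridiagonal solver. All variables below (amd,
// block_graph, point_graph, blocksize, interf, dm2cm, hasBlockCrs, and the local
// multivector views) are placeholders assumed to be set up by the caller.
//
//   ComputeResidualVector<MatrixType> cr(amd, block_graph, point_graph,
//                                        blocksize, interf, dm2cm, hasBlockCrs);
//   cr.run(y_local, b_local, x_local);                   // SeqTag: y = b - R x
//   cr.run(y_packed, b_local, x_local, x_remote);        // AsyncTag
//   cr.run(y_packed, b_local, x_local, x_remote, true);  // OverlapTag, owned columns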
1105 
1106 } // namespace BlockHelperDetails
1107 
1108 } // namespace Ifpack2
1109 
1110 #endif