#ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"

namespace Ifpack2 {

namespace BlockHelperDetails {
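///
/// AmD holds R, the off-diagonal part of the splitting A = D + R, as index
/// subsets into A's graph plus (optionally) precomputed value/vector offsets.
///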
template <typename MatrixType>
struct AmD {
  using impl_type                       = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type_1d_view      = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view               = typename impl_type::size_type_1d_view;
  using i64_3d_view                     = typename impl_type::i64_3d_view;
  using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;

  // rowptr points to the start of each row of A_colindsub.
  size_type_1d_view rowptr, rowptr_remote;
  // Entries within each row of A to use: A_colindsub holds the owned entries
  // and A_colindsub_remote the off-process ones.
  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
  // Precomputed offsets into the matrix values and x, for owned entries.
  i64_3d_view A_x_offsets;
  // Precomputed offsets for nonowned entries.
  i64_3d_view A_x_offsets_remote;

  bool is_tpetra_block_crs;

  // If is_tpetra_block_crs, this is a view of A's value data.
  impl_scalar_type_1d_view_tpetra tpetra_values;

  AmD()             = default;
  AmD(const AmD &b) = default;
};
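///
/// PartInterface describes how local rows are grouped into parts, sub-parts,
/// and SIMD packs for the block tridiagonal factorization and solve.
///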
template <typename MatrixType>
struct PartInterface {
  using local_ordinal_type         = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
  using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;

  PartInterface()                       = default;
  PartInterface(const PartInterface &b) = default;

  // Local row LIDs; a permutation from the caller's index space to the
  // tridiagonal index space.
  local_ordinal_type_1d_view lclrow;
  // partptr points into lclrow: part i owns rows lclrow(partptr(i) : partptr(i+1)).
  local_ordinal_type_1d_view partptr;
  local_ordinal_type_2d_view partptr_sub;
  local_ordinal_type_1d_view partptr_schur;
  // packptr(i) indexes partptr: partptr(packptr(i)) is the start of pack i.
  local_ordinal_type_1d_view packptr;
  local_ordinal_type_1d_view packptr_sub;
  local_ordinal_type_1d_view packindices_sub;
  local_ordinal_type_2d_view packindices_schur;
  // part2rowidx0(i) is the flat row index of the start of part i.
  local_ordinal_type_1d_view part2rowidx0;
  local_ordinal_type_1d_view part2rowidx0_sub;
  // part2packrowidx0(i) is the packed row index of the start of part i.
  local_ordinal_type_1d_view part2packrowidx0;
  local_ordinal_type_2d_view part2packrowidx0_sub;
  local_ordinal_type part2packrowidx0_back;
  // rowidx2part maps a row index to its part index.
  local_ordinal_type_1d_view rowidx2part;
  local_ordinal_type_1d_view rowidx2part_sub;

  local_ordinal_type max_partsz;
  local_ordinal_type max_subpartsz;
  local_ordinal_type n_subparts_per_part;
  local_ordinal_type nparts;
};
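// Precompute flat offsets into the matrix values and the x vector for every
// block entry used by the residual, so the kernels below can do
//   A_block = tpetra_values.data() + A_x_offsets(i, 0, k);
//   x_block = &x(A_x_offsets(i, 1, k), col);
// instead of chasing rowptr/A_colindsub/A_colind indirections per entry.
// Entries past the end of a row are marked with ArithTraits<int64_t>::min().
// If ownedRemoteSeparate, owned offsets go to amd.A_x_offsets and off-process
// offsets to amd.A_x_offsets_remote; otherwise everything goes to A_x_offsets.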
template <typename MatrixType>
void precompute_A_x_offsets(
    AmD<MatrixType> &amd,
    const PartInterface<MatrixType> &interf,
    const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_crs_graph_type> &g,
    const typename ImplType<MatrixType>::local_ordinal_type_1d_view &dm2cm,
    int blocksize,
    bool ownedRemoteSeparate) {
  using impl_type          = ImplType<MatrixType>;
  using i64_3d_view        = typename impl_type::i64_3d_view;
  using size_type          = typename impl_type::size_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using execution_space    = typename impl_type::execution_space;
  auto local_graph          = g->getLocalGraphDevice();
  const auto A_block_rowptr = local_graph.row_map;
  const auto A_colind       = local_graph.entries;
  local_ordinal_type numLocalRows = interf.rowidx2part.extent(0);
  int blocksize_square            = blocksize * blocksize;
  // Shallow-copy the views so the device lambdas do not capture amd/interf.
  auto lclrow             = interf.lclrow;
  auto A_colindsub        = amd.A_colindsub;
  auto A_colindsub_remote = amd.A_colindsub_remote;
  auto rowptr             = amd.rowptr;
  auto rowptr_remote      = amd.rowptr_remote;
  bool is_dm2cm_active    = dm2cm.extent(0);
  if (ownedRemoteSeparate) {
    // Find the widest row in the owned and nonowned entry lists.
    local_ordinal_type maxOwnedEntriesPerRow    = 0;
    local_ordinal_type maxNonownedEntriesPerRow = 0;
    Kokkos::parallel_reduce(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmaxOwned, local_ordinal_type & lmaxNonowned) {
          const local_ordinal_type lr    = lclrow(i);
          local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowptr(lr);
          if (rowNumOwned > lmaxOwned)
            lmaxOwned = rowNumOwned;
          // rowptr_remote is not allocated for single-rank problems.
          if (rowptr_remote.extent(0)) {
            local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowptr_remote(lr);
            if (rowNumNonowned > lmaxNonowned)
              lmaxNonowned = rowNumNonowned;
          }
        },
        Kokkos::Max<local_ordinal_type>(maxOwnedEntriesPerRow), Kokkos::Max<local_ordinal_type>(maxNonownedEntriesPerRow));
    // In both offset views, index 0 of the middle dimension holds A offsets
    // and index 1 holds x offsets.
    amd.A_x_offsets        = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxOwnedEntriesPerRow);
    amd.A_x_offsets_remote = i64_3d_view("amd.A_x_offsets_remote", numLocalRows, 2, maxNonownedEntriesPerRow);
    auto A_x_offsets        = amd.A_x_offsets;
    auto A_x_offsets_remote = amd.A_x_offsets_remote;
    // Populate the offsets; absent entries are marked with int64_t's min.
    Kokkos::parallel_for(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i) {
          const local_ordinal_type lr = lclrow(i);
          const size_type A_k0        = A_block_rowptr(lr);
          // Owned entries.
          size_type rowBegin             = rowptr(lr);
          local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowBegin;
          for (local_ordinal_type entry = 0; entry < maxOwnedEntriesPerRow; entry++) {
            if (entry < rowNumOwned) {
              const size_type j                      = A_k0 + A_colindsub(rowBegin + entry);
              const local_ordinal_type A_colind_at_j = A_colind(j);
              const local_ordinal_type loc           = is_dm2cm_active ? dm2cm(A_colind_at_j) : A_colind_at_j;
              A_x_offsets(i, 0, entry) = int64_t(j) * blocksize_square;
              A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
            } else {
              A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
              A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
            }
          }
          // Nonowned entries.
          if (rowptr_remote.extent(0)) {
            rowBegin                          = rowptr_remote(lr);
            local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowBegin;
            for (local_ordinal_type entry = 0; entry < maxNonownedEntriesPerRow; entry++) {
              if (entry < rowNumNonowned) {
                const size_type j                      = A_k0 + A_colindsub_remote(rowBegin + entry);
                const local_ordinal_type A_colind_at_j = A_colind(j);
                const local_ordinal_type loc           = A_colind_at_j - numLocalRows;
                A_x_offsets_remote(i, 0, entry) = int64_t(j) * blocksize_square;
                A_x_offsets_remote(i, 1, entry) = int64_t(loc) * blocksize;
              } else {
                A_x_offsets_remote(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
                A_x_offsets_remote(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
              }
            }
          }
        });
  } else {
    // Owned and nonowned entries share one offsets view.
    local_ordinal_type maxEntriesPerRow = 0;
    Kokkos::parallel_reduce(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmax) {
          const local_ordinal_type lr = lclrow(i);
          local_ordinal_type rowNum   = rowptr(lr + 1) - rowptr(lr);
          if (rowNum > lmax)
            lmax = rowNum;
        },
        Kokkos::Max<local_ordinal_type>(maxEntriesPerRow));
    amd.A_x_offsets  = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxEntriesPerRow);
    auto A_x_offsets = amd.A_x_offsets;
    Kokkos::parallel_for(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i) {
          const local_ordinal_type lr = lclrow(i);
          const size_type A_k0        = A_block_rowptr(lr);
          size_type rowBegin          = rowptr(lr);
          local_ordinal_type rowOwned = rowptr(lr + 1) - rowBegin;
          for (local_ordinal_type entry = 0; entry < maxEntriesPerRow; entry++) {
            if (entry < rowOwned) {
              const size_type j        = A_k0 + A_colindsub(rowBegin + entry);
              A_x_offsets(i, 0, entry) = int64_t(j) * blocksize_square;
              const local_ordinal_type A_colind_at_j = A_colind(j);
              if (A_colind_at_j < numLocalRows) {
                const local_ordinal_type loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
              } else {
                // Nonowned column: the kernel detects this by the offset
                // landing past blocksize * numLocalRows.
                A_x_offsets(i, 1, entry) = int64_t(A_colind_at_j) * blocksize;
              }
            } else {
              A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
              A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
            }
          }
        });
  }
}
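// Recommended vector lengths for the device team kernels: pick a total team
// footprint from the block size, then divide by the chosen team size. The
// tables are currently identical across backends, presumably kept separate
// so each can be tuned independently.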
static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if (blksize <= 5)
    total_team_size = 32;
  else if (blksize <= 9)
    total_team_size = 32;
  else if (blksize <= 12)
    total_team_size = 96;
  else if (blksize <= 16)
    total_team_size = 128;
  else if (blksize <= 20)
    total_team_size = 160;
  else
    total_team_size = 160;
  return total_team_size / team_size;
}

static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
                                                                const int team_size) {
  int total_team_size(0);
  if (blksize <= 5)
    total_team_size = 32;
  else if (blksize <= 9)
    total_team_size = 32;
  else if (blksize <= 12)
    total_team_size = 96;
  else if (blksize <= 16)
    total_team_size = 128;
  else if (blksize <= 20)
    total_team_size = 160;
  else
    total_team_size = 160;
  return total_team_size / team_size;
}

static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if (blksize <= 5)
    total_team_size = 32;
  else if (blksize <= 9)
    total_team_size = 32;
  else if (blksize <= 12)
    total_team_size = 96;
  else if (blksize <= 16)
    total_team_size = 128;
  else if (blksize <= 20)
    total_team_size = 160;
  else
    total_team_size = 160;
  return total_team_size / team_size;
}
template <typename T>
static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
                                                             const int team_size) {
  if (is_cuda<T>::value)
    return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
  if (is_hip<T>::value)
    return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
  if (is_sycl<T>::value)
    return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
  return -1;  // fallback; only the device backends above consult this value
}
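///
/// ComputeResidualVector computes y = b - R x. The SeqTag path uses a fully
/// assembled x; the AsyncTag path splits x into owned (x) and communicated
/// (x_remote) parts and processes both in one sweep; the OverlapTag path
/// processes the owned and remote parts in separate phases so the remote
/// update can overlap communication.
///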
template <typename MatrixType>
struct ComputeResidualVector {
  using impl_type        = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space  = typename impl_type::execution_space;
  using memory_space     = typename impl_type::memory_space;

  using local_ordinal_type  = typename impl_type::local_ordinal_type;
  using size_type           = typename impl_type::size_type;
  using impl_scalar_type    = typename impl_type::impl_scalar_type;
  using magnitude_type      = typename impl_type::magnitude_type;
  using btdm_scalar_type    = typename impl_type::btdm_scalar_type;
  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
  // views
  using local_ordinal_type_1d_view      = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view               = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type   = typename impl_type::tpetra_block_access_view_type;
  using impl_scalar_type_1d_view        = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
  using vector_type_3d_view             = typename impl_type::vector_type_3d_view;
  using btdm_scalar_type_4d_view        = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view                     = typename impl_type::i64_3d_view;
  static constexpr int vector_length    = impl_type::vector_length;

  // team policy member type
  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };
  // y is the residual output; b, x, x_remote are the inputs.
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
  Unmanaged<vector_type_3d_view> y_packed;
  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;

  // AmD information
  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // matrix graph information
  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_block_rowptr;
  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_point_rowptr;
  const ConstUnmanaged<Kokkos::View<local_ordinal_type *, node_device_type>> A_colind;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;

  // precomputed offsets for fast residual evaluation
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  const bool is_dm2cm_active;
  const bool hasBlockCrsMatrix;
  template <typename LocalCrsGraphType>
  ComputeResidualVector(const AmD<MatrixType> &amd,
                        const LocalCrsGraphType &block_graph,
                        const LocalCrsGraphType &point_graph,
                        const local_ordinal_type &blocksize_requested_,
                        const PartInterface<MatrixType> &interf,
                        const local_ordinal_type_1d_view &dm2cm_,
                        bool hasBlockCrsMatrix_)
    : rowptr(amd.rowptr)
    , rowptr_remote(amd.rowptr_remote)
    , colindsub(amd.A_colindsub)
    , colindsub_remote(amd.A_colindsub_remote)
    , tpetra_values(amd.tpetra_values)
    , A_block_rowptr(block_graph.row_map)
    , A_point_rowptr(point_graph.row_map)
    , A_colind(block_graph.entries)
    , blocksize_requested(blocksize_requested_)
    , part2packrowidx0(interf.part2packrowidx0)
    , part2rowidx0(interf.part2rowidx0)
    , rowidx2part(interf.rowidx2part)
    , partptr(interf.partptr)
    , lclrow(interf.lclrow)
    , dm2cm(dm2cm_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , is_dm2cm_active(dm2cm_.span() > 0)
    , hasBlockCrsMatrix(hasBlockCrsMatrix_) {}
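  // Serial kernels: subtract one block-row contribution from yy. SerialDot
  // handles the point-CRS case one block row at a time; SerialGemv multiplies
  // a little block stored in Tpetra's flat layout against xx.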
  KOKKOS_INLINE_FUNCTION void
  SerialDot(const local_ordinal_type &blocksize,
            const local_ordinal_type &lclRowID,
            const local_ordinal_type &lclColID,
            const local_ordinal_type &ii,
            const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
            const impl_scalar_type *const KOKKOS_RESTRICT xx,
            impl_scalar_type *KOKKOS_RESTRICT yy) const {
    const size_type Aj_c  = colindsub_(lclColID);
    auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
    impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
    for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
      val += tpetra_values(point_row_offset + k1) * xx[k1];
    yy[ii] -= val;
  }

  KOKKOS_INLINE_FUNCTION void
  SerialGemv(const local_ordinal_type &blocksize,
             const impl_scalar_type *const KOKKOS_RESTRICT AA,
             const impl_scalar_type *const KOKKOS_RESTRICT xx,
             impl_scalar_type *KOKKOS_RESTRICT yy) const {
    using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
    for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
      impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
      for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
        val += AA[tlb::getFlatIndex(k0, k1, blocksize)] * xx[k1];
      yy[k0] -= val;
    }
  }
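  // Team-level kernels: the same operations expressed with ThreadVectorRange
  // so a Kokkos team can cooperate on one block row; the subtraction uses an
  // atomic since several threads may update the same yy.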
  template <typename bbViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION void
  VectorCopy(const member_type &member,
             const local_ordinal_type &blocksize,
             const bbViewType &bb,
             const yyViewType &yy) const {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
      yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
    });
  }
  template <typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION void
  VectorDot(const member_type &member,
            const local_ordinal_type &blocksize,
            const local_ordinal_type &lclRowID,
            const local_ordinal_type &lclColID,
            const local_ordinal_type &ii,
            const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
            const xxViewType &xx,
            const yyViewType &yy) const {
    const size_type Aj_c  = colindsub_(lclColID);
    auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
    impl_scalar_type val = 0;
    Kokkos::parallel_reduce(
        Kokkos::ThreadVectorRange(member, blocksize),
        [&](const local_ordinal_type &k1, impl_scalar_type &update) {
          update += tpetra_values(point_row_offset + k1) * xx(k1);
        },
        val);
    Kokkos::single(Kokkos::PerThread(member),
                   [=]() {
                     Kokkos::atomic_add(&yy(ii), typename yyViewType::const_value_type(-val));
                   });
  }
  template <typename AAViewType, typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION void
  VectorGemv(const member_type &member,
             const local_ordinal_type &blocksize,
             const AAViewType &AA,
             const xxViewType &xx,
             const yyViewType &yy) const {
    for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
      impl_scalar_type val = 0;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type &k1, impl_scalar_type &update) {
            update += AA(k0, k1) * xx(k1);
          },
          val);
      Kokkos::single(Kokkos::PerThread(member),
                     [=]() {
                       Kokkos::atomic_add(&yy(k0), -val);
                     });
    }
  }
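  // ---------------------------------------------------
  // Sequential-method path: y = b - R x with x fully assembled. The range
  // overload below serves host backends; the team overload serves devices.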
  struct SeqTag {};

  KOKKOS_INLINE_FUNCTION
  void
  operator()(const SeqTag &, const local_ordinal_type &i) const {
    const local_ordinal_type blocksize        = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize * blocksize;

    // constants
    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);
    const local_ordinal_type row         = i * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // y := b
      impl_scalar_type *yy             = &y(row, col);
      const impl_scalar_type *const bb = &b(row, col);
      memcpy(yy, bb, sizeof(impl_scalar_type) * blocksize);

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[i];
      for (size_type k = rowptr[i]; k < rowptr[i + 1]; ++k) {
        const size_type j                = A_k0 + colindsub[k];
        const impl_scalar_type *const xx = &x(A_colind[j] * blocksize, col);
        if (hasBlockCrsMatrix) {
          const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
          SerialGemv(blocksize, AA, xx, yy);
        } else {
          for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
            SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
        }
      }
    }
  }
  KOKKOS_INLINE_FUNCTION
  void
  operator()(const SeqTag &, const member_type &member) const {
    // constants
    const local_ordinal_type blocksize        = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize * blocksize;

    const local_ordinal_type lr = member.league_rank();
    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);

    // subview pattern
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto yy = Kokkos::subview(y, block_range, 0);
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // y := b
      yy.assign_data(&y(row, col));
      bb.assign_data(&b(row, col));
      if (member.team_rank() == 0)
        VectorCopy(member, blocksize, bb, yy);
      member.team_barrier();

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];
      if (hasBlockCrsMatrix) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
                             [&](const local_ordinal_type &k) {
                               const size_type j = A_k0 + colindsub[k];
                               xx.assign_data(&x(A_colind[j] * blocksize, col));
                               A_block_cst.assign_data(&tpetra_values(j * blocksize_square));
                               VectorGemv(member, blocksize, A_block_cst, xx, yy);
                             });
      } else {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
                             [&](const local_ordinal_type &k) {
                               const size_type j = A_k0 + colindsub[k];
                               xx.assign_data(&x(A_colind[j] * blocksize, col));
                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
                             });
      }
    }
  }
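  // ---------------------------------------------------
  // General path: y = b - R (x, x_remote). B is the compile-time block size
  // (0 selects the runtime blocksize_requested); async processes owned and
  // remote columns in one sweep, while overlap restricts a sweep to either
  // the owned (OverlapTag<0>) or the remote (OverlapTag<1>) columns.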
  template <int B, bool async, bool overlap, bool haveBlockMatrix>
  struct GeneralTag {
    static_assert(!(async && overlap),
                  "ComputeResidualVector: async && overlap is not a valid configuration for GeneralTag");
  };

  template <int P, int B, bool haveBlockMatrix>
  using OverlapTag = GeneralTag<B, false, P != 0, haveBlockMatrix>;

  template <int B, bool haveBlockMatrix>
  using AsyncTag = GeneralTag<B, true, false, haveBlockMatrix>;
  template <int B, bool async, bool overlap, bool haveBlockMatrix>
  KOKKOS_INLINE_FUNCTION void
  operator()(const GeneralTag<B, async, overlap, haveBlockMatrix> &, const local_ordinal_type &rowidx) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri     = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v       = partidx % vector_length;

    const local_ordinal_type num_vectors    = y_packed.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // temporary buffer for y flat view
    impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};

    const local_ordinal_type lr = lclrow(rowidx);

    auto colindsub_used = overlap ? colindsub_remote : colindsub;
    auto rowptr_used    = overlap ? rowptr_remote : rowptr;

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if constexpr (overlap) {
        // y (temporary) := 0
        memset((void *)yy, 0, sizeof(impl_scalar_type) * blocksize);
      } else {
        // y := b
        const local_ordinal_type row = lr * blocksize;
        memcpy(yy, &b(row, col), sizeof(impl_scalar_type) * blocksize);
      }

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];
      for (size_type k = rowptr_used[lr]; k < rowptr_used[lr + 1]; ++k) {
        const size_type j                      = A_k0 + colindsub_used[k];
        const local_ordinal_type A_colind_at_j = A_colind[j];
        if constexpr (haveBlockMatrix) {
          const local_ordinal_type blocksize_square = blocksize * blocksize;
          const impl_scalar_type *const AA          = &tpetra_values(j * blocksize_square);
          if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
            const auto loc                   = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type *const xx = &x(loc * blocksize, col);
            SerialGemv(blocksize, AA, xx, yy);
          } else {
            const auto loc                          = A_colind_at_j - num_local_rows;
            const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
            SerialGemv(blocksize, AA, xx_remote, yy);
          }
        } else {
          if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
            const auto loc                   = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type *const xx = &x(loc * blocksize, col);
            for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
              SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
          } else {
            const auto loc                          = A_colind_at_j - num_local_rows;
            const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
            for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
              SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
          }
        }
      }

      // move yy to y_packed
      if constexpr (overlap) {
        for (local_ordinal_type k = 0; k < blocksize; ++k)
          y_packed(pri, k, col)[v] += yy[k];
      } else {
        for (local_ordinal_type k = 0; k < blocksize; ++k)
          y_packed(pri, k, col)[v] = yy[k];
      }
    }
  }
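  // Team variant for the BlockCrs case: uses the precomputed A_x_offsets to
  // locate each little block and x segment directly, stages x and the running
  // A*x product in scratch memory, and finishes with y = b - Ax (or y -= Ax
  // in the remote phase).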
  template <int B, bool async, bool overlap>
  KOKKOS_INLINE_FUNCTION void
  operator()(const GeneralTag<B, async, overlap, true> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants
    const local_ordinal_type rowidx  = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri     = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v       = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors    = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    using subview_1D_right_t  = decltype(Kokkos::subview(b, block_range, 0));
    using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
    subview_1D_right_t bb(nullptr, blocksize);
    subview_1D_right_t xx(nullptr, blocksize);
    subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

    // scratch: per-team accumulator for A*x, per-thread copy of the x block
    impl_scalar_type *local_Ax = reinterpret_cast<impl_scalar_type *>(member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type *local_x  = reinterpret_cast<impl_scalar_type *>(member.thread_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));

    const local_ordinal_type lr  = lclrow(rowidx);
    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      member.team_barrier();
      // zero the accumulator
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
        local_Ax[i] = 0;
      });
      member.team_barrier();

      int numEntries;
      if constexpr (!overlap) {
        numEntries = A_x_offsets.extent(2);
      } else {
        numEntries = A_x_offsets_remote.extent(2);
      }

      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numEntries),
                           [&](const int k) {
                             int64_t A_offset = overlap ? A_x_offsets_remote(rowidx, 0, k) : A_x_offsets(rowidx, 0, k);
                             int64_t x_offset = overlap ? A_x_offsets_remote(rowidx, 1, k) : A_x_offsets(rowidx, 1, k);
                             if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
                               A_block_cst.assign_data(tpetra_values.data() + A_offset);
                               // Pull x into scratch; in the async case the offset itself
                               // tells whether the column is owned or remote.
                               if constexpr (async) {
                                 size_type remote_cutoff = blocksize * num_local_rows;
                                 if (x_offset >= remote_cutoff)
                                   xx.assign_data(&x_remote(x_offset - remote_cutoff, col));
                                 else
                                   xx.assign_data(&x(x_offset, col));
                               } else {
                                 if constexpr (!overlap) {
                                   xx.assign_data(&x(x_offset, col));
                                 } else {
                                   xx.assign_data(&x_remote(x_offset, col));
                                 }
                               }
                               Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
                                 local_x[i] = xx(i);
                               });
                               // local_Ax += A_block * local_x
                               Kokkos::parallel_for(
                                   Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type &k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_Ax + k0, val);
                                   });
                             }
                           });
      member.team_barrier();
      // y := b - Ax (owned sweep) or y -= Ax (remote sweep)
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      bb.assign_data(&b(row, col));
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
        if constexpr (!overlap)
          yy(i) = bb(i) - local_Ax[i];
        else
          yy(i) -= local_Ax[i];
      });
    }
  }
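  // Team variant for the point-CRS case: walks rowptr/colindsub as in the
  // serial path but distributes block entries over the team via VectorDot.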
  template <int B, bool async, bool overlap>
  KOKKOS_INLINE_FUNCTION void
  operator()(const GeneralTag<B, async, overlap, false> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants
    const local_ordinal_type rowidx  = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri     = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v       = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors    = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    using subview_1D_right_t  = decltype(Kokkos::subview(b, block_range, 0));
    using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
    subview_1D_right_t bb(nullptr, blocksize);
    subview_1D_right_t xx(nullptr, blocksize);
    subview_1D_right_t xx_remote(nullptr, blocksize);
    subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
    auto A_block_cst    = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
    auto colindsub_used = overlap ? colindsub_remote : colindsub;
    auto rowptr_used    = overlap ? rowptr_remote : rowptr;

    const local_ordinal_type lr  = lclrow(rowidx);
    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      if constexpr (!overlap) {
        // y := b
        bb.assign_data(&b(row, col));
        if (member.team_rank() == 0)
          VectorCopy(member, blocksize, bb, yy);
        member.team_barrier();
      }

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr + 1]),
                           [&](const local_ordinal_type &k) {
                             const size_type j                      = A_k0 + colindsub_used[k];
                             const local_ordinal_type A_colind_at_j = A_colind[j];
                             if ((async && A_colind_at_j < num_local_rows) || (!async && !overlap)) {
                               const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                               xx.assign_data(&x(loc * blocksize, col));
                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
                             } else {
                               const auto loc = A_colind_at_j - num_local_rows;
                               xx_remote.assign_data(&x_remote(loc * blocksize, col));
                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
                             }
                           });
    }
  }
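  // y = b - R x (SeqTag); x is fully assembled, so a single sweep suffices.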
  template <typename MultiVectorLocalViewTypeY,
            typename MultiVectorLocalViewTypeB,
            typename MultiVectorLocalViewTypeX>
  void run(const MultiVectorLocalViewTypeY &y_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);

    y = y_;
    b = b_;
    x = x_;
    if constexpr (is_device<execution_space>::value) {
      const local_ordinal_type blocksize   = blocksize_requested;
      const local_ordinal_type team_size   = 8;
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
      const Kokkos::TeamPolicy<execution_space, SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
    } else {
      const Kokkos::RangePolicy<execution_space, SeqTag> policy(0, rowptr.extent(0) - 1);
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
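  // y = b - R (x, x_remote) (AsyncTag): owned and communicated columns are
  // handled in one sweep, dispatched on a compile-time block size so the
  // common sizes get fully unrolled little-block kernels.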
  template <typename MultiVectorLocalViewTypeB,
            typename MultiVectorLocalViewTypeX,
            typename MultiVectorLocalViewTypeX_Remote>
  void run(const vector_type_3d_view &y_packed_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_,
           const MultiVectorLocalViewTypeX_Remote &x_remote_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);

    b        = b_;
    x        = x_;
    x_remote = x_remote_;
    if constexpr (is_device<execution_space>::value) {
      y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
                                                 y_packed_.extent(0),
                                                 y_packed_.extent(1),
                                                 y_packed_.extent(2),
                                                 vector_length);
    } else {
      y_packed = y_packed_;
    }

    if constexpr (is_device<execution_space>::value) {
      const local_ordinal_type blocksize = blocksize_requested;
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                    \
  {                                                                                                       \
    if (this->hasBlockCrsMatrix) {                                                                        \
      const local_ordinal_type team_size   = 8;                                                           \
      const local_ordinal_type vector_size = 8;                                                           \
      const size_t shmem_team_size         = blocksize * sizeof(btdm_scalar_type);                        \
      const size_t shmem_thread_size       = blocksize * sizeof(btdm_scalar_type);                        \
      Kokkos::TeamPolicy<execution_space, AsyncTag<B, true>>                                              \
          policy(rowidx2part.extent(0), team_size, vector_size);                                          \
      policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>",                                  \
                           policy, *this);                                                                \
    } else {                                                                                              \
      const local_ordinal_type team_size   = 8;                                                           \
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
      const Kokkos::TeamPolicy<execution_space, AsyncTag<B, false>>                                       \
          policy(rowidx2part.extent(0), team_size, vector_size);                                          \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>",                                  \
                           policy, *this);                                                                \
    }                                                                                                     \
  }                                                                                                       \
  break
      switch (blocksize_requested) {
        case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
        case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
        case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
        case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                  \
  {                                                                                                     \
    if (this->hasBlockCrsMatrix) {                                                                      \
      const Kokkos::RangePolicy<execution_space, AsyncTag<B, true>> policy(0, rowidx2part.extent(0));   \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>",                               \
                           policy, *this);                                                              \
    } else {                                                                                            \
      const Kokkos::RangePolicy<execution_space, AsyncTag<B, false>> policy(0, rowidx2part.extent(0));  \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>",                               \
                           policy, *this);                                                              \
    }                                                                                                   \
  }                                                                                                     \
  break
      switch (blocksize_requested) {
        case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
        case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
        case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
        case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
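  // y = b - R (x, x_remote) (OverlapTag): compute_owned selects the phase;
  // phase 0 forms y = b - R_owned x, and phase 1 then applies
  // y -= R_remote x_remote once the halo exchange has finished.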
  template <typename MultiVectorLocalViewTypeB,
            typename MultiVectorLocalViewTypeX,
            typename MultiVectorLocalViewTypeX_Remote>
  void run(const vector_type_3d_view &y_packed_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_,
           const MultiVectorLocalViewTypeX_Remote &x_remote_,
           const bool compute_owned) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);

    b        = b_;
    x        = x_;
    x_remote = x_remote_;
    if constexpr (is_device<execution_space>::value) {
      y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
                                                 y_packed_.extent(0),
                                                 y_packed_.extent(1),
                                                 y_packed_.extent(2),
                                                 vector_length);
    } else {
      y_packed = y_packed_;
    }

    if constexpr (is_device<execution_space>::value) {
      const local_ordinal_type blocksize = blocksize_requested;
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                        \
  {                                                                                                           \
    if (this->hasBlockCrsMatrix) {                                                                            \
      const local_ordinal_type team_size   = 8;                                                               \
      const local_ordinal_type vector_size = 8;                                                               \
      const size_t shmem_team_size         = blocksize * sizeof(btdm_scalar_type);                            \
      const size_t shmem_thread_size       = blocksize * sizeof(btdm_scalar_type);                            \
      if (compute_owned) {                                                                                    \
        Kokkos::TeamPolicy<execution_space, OverlapTag<0, B, true>>                                           \
            policy(rowidx2part.extent(0), team_size, vector_size);                                            \
        policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size));   \
        Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this);              \
      } else {                                                                                                \
        Kokkos::TeamPolicy<execution_space, OverlapTag<1, B, true>>                                           \
            policy(rowidx2part.extent(0), team_size, vector_size);                                            \
        policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size));   \
        Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this);              \
      }                                                                                                       \
    } else {                                                                                                  \
      const local_ordinal_type team_size   = 8;                                                               \
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
      if (compute_owned) {                                                                                    \
        const Kokkos::TeamPolicy<execution_space, OverlapTag<0, B, false>>                                    \
            policy(rowidx2part.extent(0), team_size, vector_size);                                            \
        Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this);              \
      } else {                                                                                                \
        const Kokkos::TeamPolicy<execution_space, OverlapTag<1, B, false>>                                    \
            policy(rowidx2part.extent(0), team_size, vector_size);                                            \
        Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this);              \
      }                                                                                                       \
    }                                                                                                         \
  }                                                                                                           \
  break
      switch (blocksize_requested) {
        case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
        case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
        case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
        case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                              \
  {                                                                                                 \
    if (this->hasBlockCrsMatrix) {                                                                  \
      if (compute_owned) {                                                                          \
        const Kokkos::RangePolicy<execution_space, OverlapTag<0, B, true>>                          \
            policy(0, rowidx2part.extent(0));                                                       \
        Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this);   \
      } else {                                                                                      \
        const Kokkos::RangePolicy<execution_space, OverlapTag<1, B, true>>                          \
            policy(0, rowidx2part.extent(0));                                                       \
        Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this);   \
      }                                                                                             \
    } else {                                                                                        \
      if (compute_owned) {                                                                          \
        const Kokkos::RangePolicy<execution_space, OverlapTag<0, B, false>>                         \
            policy(0, rowidx2part.extent(0));                                                       \
        Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this);   \
      } else {                                                                                      \
        const Kokkos::RangePolicy<execution_space, OverlapTag<1, B, false>>                         \
            policy(0, rowidx2part.extent(0));                                                       \
        Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this);   \
      }                                                                                             \
    }                                                                                               \
  }                                                                                                 \
  break
      switch (blocksize_requested) {
        case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
        case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
        case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
        case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};

}  // namespace BlockHelperDetails

}  // namespace Ifpack2

#endif