#ifndef IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"

namespace Ifpack2::BlockHelperDetails {
// Locate, for each block row, the offset of its diagonal block within the
// flat block-CRS values array. Fails if any row lacks a diagonal entry.
template <typename ExecSpace, typename DiagOffsets, typename Rowptrs,
          typename Entries>
DiagOffsets findDiagOffsets(const Rowptrs& rowptrs, const Entries& entries,
                            size_t nrows, int blocksize) {
  DiagOffsets diag_offsets(do_not_initialize_tag("btdm.diag_offsets"), nrows);
  int err = 0;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<ExecSpace>(0, nrows),
      KOKKOS_LAMBDA(size_t row, int& err) {
        auto rowBegin = rowptrs(row);
        auto rowEnd   = rowptrs(row + 1);
        for (size_t j = rowBegin; j < rowEnd; j++) {
          if (size_t(entries(j)) == row) {
            diag_offsets(row) = j * blocksize * blocksize;
            return;
          }
        }
        // No diagonal entry found in this row.
        err++;
      },
      err);
  TEUCHOS_TEST_FOR_EXCEPT_MSG(
      err, "Ifpack2 BTD: at least one row has no diagonal entry");
  return diag_offsets;
}
// Computes the residual r = b - A*x (owned and remote columns in one sweep),
// applies the inverse block diagonal, and writes the damped update into y.
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_1Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;

  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;

  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;

  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  using member_type =
      typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };
  // Input and output multivectors (b = RHS, x/x_remote = current iterate, y = result).
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // Flat block values of A, and precomputed (A block, x block) offset pairs.
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
  const local_ordinal_type blocksize_requested;
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // Inverse block diagonal, per-row update norms, and damping factor.
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
  const Unmanaged<impl_scalar_type_1d_view> W;
  impl_scalar_type damping_factor;

  ComputeResidualAndSolve_1Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize      = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx         = member.league_rank();
    const local_ordinal_type row            = rowidx * blocksize;
    const local_ordinal_type num_vectors    = b.extent(1);
    const local_ordinal_type num_local_rows = d_inv.extent(0);

    const impl_scalar_type* xx;
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    // Team-shared scratch for the residual and D^{-1}*residual, and
    // per-thread scratch for the x block currently being applied.
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // Initialize the residual with b and clear D^{-1}*residual.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i]      = b(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      // residual -= A_block * x_block for every block in this block row.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Offsets at or beyond the cutoff address the remote (ghosted) x.
              int64_t remote_cutoff = blocksize * num_local_rows;
              if (x_offset >= remote_cutoff)
                xx = &x_remote(x_offset - remote_cutoff, col);
              else
                xx = &x(x_offset, col);

              Kokkos::parallel_for(
                  Kokkos::ThreadVectorRange(member, blocksize),
                  [&](const local_ordinal_type& i) { local_x[i] = xx[i]; });

              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();

      // local_Dinv_residual = D^{-1} * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();

      // Apply the damped update to y and accumulate the squared update norm.
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            impl_scalar_type old_y    = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }
  // Launch the fused kernel: y = x + omega * D^{-1} (b - A*x), and W(i) is the
  // squared update norm for block row i.
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunSinglePass",
        ComputeResidualAndSolve0, execution_space);

    b        = b_;
    x        = x_;
    x_remote = x_remote_;
    y        = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    // Team scratch: local_residual and local_Dinv_residual; thread scratch: local_x.
    const size_t shmem_team_size   = 2 * blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(nrows, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass",
                         policy, *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
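// Usage sketch (not part of the original header). Assuming amd, d_inv, W,
// blocksize, and damping_factor have already been built by the BlockTriDi
// setup, and b, x, x_remote, y are the corresponding multivector views, the
// fused kernel is launched as follows. B = 0 selects the runtime blocksize.
//
//   ComputeResidualAndSolve_1Pass<MatrixType, 0> functor(
//       amd, d_inv, W, blocksize, damping_factor);
//   functor.run(b, x, x_remote, y);  // y = x + omega * D^{-1} (b - A x)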
// Two-pass variant: pass 1 (OwnedTag) computes the locally owned part of the
// residual into y, and pass 2 (NonownedTag) adds the remote part, applies the
// inverse block diagonal, and writes the damped update.
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_2Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;

  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;

  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;

  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  using member_type =
      typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };

  // Tags dispatching the two passes of operator().
  struct OwnedTag {};
  struct NonownedTag {};
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
  const local_ordinal_type blocksize_requested;
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
  const Unmanaged<impl_scalar_type_1d_view> W;
  impl_scalar_type damping_factor;

  ComputeResidualAndSolve_2Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}
  // Pass 1: y = b - A_owned * x (partial residual over locally owned columns).
  KOKKOS_INLINE_FUNCTION
  void operator()(const OwnedTag, const member_type& member) const {
    const local_ordinal_type blocksize   = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx      = member.league_rank();
    const local_ordinal_type row         = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    // Team-shared residual and per-thread x block.
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // Initialize the residual with b.
      Kokkos::parallel_for(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& i) { local_residual[i] = b(row + i, col); });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x(x_offset + i, col);
                                   });
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();

      // Store the partial residual in y; pass 2 completes it with the remote columns.
      if (member.team_rank() == 0) {
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                             [&](const local_ordinal_type& k) {
                               y(row + k, col) = local_residual[k];
                             });
      }
    }
  }
  // Pass 2: finish the residual with the remote columns, apply D^{-1}, and
  // write the damped update and per-row squared norms.
  KOKKOS_INLINE_FUNCTION
  void operator()(const NonownedTag, const member_type& member) const {
    const local_ordinal_type blocksize   = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx      = member.league_rank();
    const local_ordinal_type row         = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // The partial residual from pass 1 is already stored in y.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i]      = y(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets_remote.extent(2);

      // residual -= A_block * x_remote_block for the remote (ghosted) columns.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets_remote(rowidx, 0, k);
            int64_t x_offset = A_x_offsets_remote(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x_remote(x_offset + i, col);
                                   });
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();

      // local_Dinv_residual = D^{-1} * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();

      // Apply the damped update to y and accumulate the squared update norm.
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            impl_scalar_type old_y    = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }
  // Launch pass 1 over the locally owned columns; intended to overlap with the
  // import that fills x_remote for pass 2.
  void run_pass1(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass1",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    x = x_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size   = blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, OwnedTag> policy(nrows, team_size,
                                                         vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
  // Launch pass 2 once x_remote is available.
  void run_pass2(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass2",
        ComputeResidualAndSolve0, execution_space);

    x        = x_;
    x_remote = x_remote_;
    y        = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size   = 2 * blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, NonownedTag> policy(nrows, team_size,
                                                            vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
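// Usage sketch of the two-pass variant (not part of the original header; the
// setup objects and vector views are the same assumptions as in the
// single-pass sketch above). Pass 1 needs only the owned part of x, so it can
// run while the import that fills x_remote is still in flight.
//
//   ComputeResidualAndSolve_2Pass<MatrixType, 0> functor(
//       amd, d_inv, W, blocksize, damping_factor);
//   functor.run_pass1(b, x, y);         // y = b - A_owned * x
//   // ... wait for the import of x_remote to complete ...
//   functor.run_pass2(x, x_remote, y);  // y = x + omega * D^{-1} (y - A_remote * x_remote)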
// Variant for a zero initial guess: the residual is just b, so only
// y = omega * D^{-1} b is computed, along with the per-row update norms.
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_SolveOnly {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;

  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;

  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;

  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  using member_type =
      typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
  const local_ordinal_type blocksize_requested;
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
  const Unmanaged<impl_scalar_type_1d_view> W;
  impl_scalar_type damping_factor;

  ComputeResidualAndSolve_SolveOnly(
      const AmD<MatrixType>& amd,
      const btdm_scalar_type_3d_view& d_inv_,
      const impl_scalar_type_1d_view& W_,
      const local_ordinal_type& blocksize_requested_,
      const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values)
    , blocksize_requested(blocksize_requested_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , d_inv(d_inv_)
    , W(W_)
    , damping_factor(damping_factor_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    // Each thread of the team handles one block row.
    const local_ordinal_type rowidx =
        member.league_rank() * member.team_size() + member.team_rank();
    const local_ordinal_type row         = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    // Per-thread scratch for D^{-1} * b.
    impl_scalar_type* local_Dinv_residual =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return;

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // With a zero initial guess the residual equals b, so apply D^{-1} to b.
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                           [&](const local_ordinal_type& k0) {
                             impl_scalar_type val = 0;
                             for (local_ordinal_type k1 = 0; k1 < blocksize;
                                  k1++)
                               val += d_inv(rowidx, k0, k1) * b(row + k1, col);
                             local_Dinv_residual[k0] = val;
                           });

      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            impl_scalar_type y_update = local_Dinv_residual[k];
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; });
  }
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    // One thread per block row, so round the league size up to cover nrows.
    Kokkos::TeamPolicy<execution_space> policy(
        (nrows + team_size - 1) / team_size, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
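// Usage sketch (not part of the original header; same assumed setup objects as
// above). Used when the initial guess is zero, so no residual computation or
// halo exchange is needed.
//
//   ComputeResidualAndSolve_SolveOnly<MatrixType, 0> functor(
//       amd, d_inv, W, blocksize, damping_factor);
//   functor.run(b, y);  // y = omega * D^{-1} b; W(i) = squared update norm of block row i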
}  // namespace Ifpack2::BlockHelperDetails

#endif